diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,73115 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999178959467966, + "eval_steps": 500, + "global_step": 9134, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010947207093790197, + "grad_norm": 1.1375685723564313, + "learning_rate": 4.99999985212739e-05, + "loss": 0.6712, + "num_input_tokens_seen": 200704, + "step": 1 + }, + { + "epoch": 0.00021894414187580393, + "grad_norm": 1.1605247657233773, + "learning_rate": 4.9999994085095755e-05, + "loss": 0.6784, + "num_input_tokens_seen": 386624, + "step": 2 + }, + { + "epoch": 0.0003284162128137059, + "grad_norm": 1.2436587180373795, + "learning_rate": 4.9999986691466115e-05, + "loss": 0.802, + "num_input_tokens_seen": 592032, + "step": 3 + }, + { + "epoch": 0.00043788828375160787, + "grad_norm": 1.5537743114923308, + "learning_rate": 4.999997634038584e-05, + "loss": 0.6541, + "num_input_tokens_seen": 764736, + "step": 4 + }, + { + "epoch": 0.0005473603546895098, + "grad_norm": 1.415405731398495, + "learning_rate": 4.9999963031856145e-05, + "loss": 0.5063, + "num_input_tokens_seen": 930944, + "step": 5 + }, + { + "epoch": 0.0006568324256274118, + "grad_norm": 1.3973241881739824, + "learning_rate": 4.999994676587863e-05, + "loss": 0.7316, + "num_input_tokens_seen": 1134560, + "step": 6 + }, + { + "epoch": 0.0007663044965653138, + "grad_norm": 1.1401563230213645, + "learning_rate": 4.9999927542455196e-05, + "loss": 0.5013, + "num_input_tokens_seen": 1325856, + "step": 7 + }, + { + "epoch": 0.0008757765675032157, + "grad_norm": 1.2263192099542266, + "learning_rate": 4.9999905361588115e-05, + "loss": 0.4773, + "num_input_tokens_seen": 1524096, + "step": 8 + }, + { + "epoch": 0.0009852486384411177, + "grad_norm": 1.3538059732378338, + "learning_rate": 4.999988022328004e-05, + "loss": 0.6527, + "num_input_tokens_seen": 1688064, + "step": 9 + }, + { + "epoch": 0.0010947207093790197, + "grad_norm": 1.3169036287835785, + "learning_rate": 4.999985212753391e-05, + "loss": 0.6857, + "num_input_tokens_seen": 1861216, + "step": 10 + }, + { + "epoch": 0.0012041927803169217, + "grad_norm": 1.3775049280815796, + "learning_rate": 4.999982107435308e-05, + "loss": 0.729, + "num_input_tokens_seen": 2038176, + "step": 11 + }, + { + "epoch": 0.0013136648512548236, + "grad_norm": 1.3916497044852547, + "learning_rate": 4.99997870637412e-05, + "loss": 0.7044, + "num_input_tokens_seen": 2213344, + "step": 12 + }, + { + "epoch": 0.0014231369221927256, + "grad_norm": 1.0808066037788786, + "learning_rate": 4.99997500957023e-05, + "loss": 0.6114, + "num_input_tokens_seen": 2387392, + "step": 13 + }, + { + "epoch": 0.0015326089931306276, + "grad_norm": 1.446204740035348, + "learning_rate": 4.999971017024076e-05, + "loss": 0.9107, + "num_input_tokens_seen": 2558528, + "step": 14 + }, + { + "epoch": 0.0016420810640685294, + "grad_norm": 1.4002516077649108, + "learning_rate": 4.99996672873613e-05, + "loss": 0.7237, + "num_input_tokens_seen": 2744224, + "step": 15 + }, + { + "epoch": 0.0017515531350064315, + "grad_norm": 1.291097710520378, + "learning_rate": 4.999962144706898e-05, + "loss": 0.6368, + "num_input_tokens_seen": 2939552, + "step": 16 + }, + { + "epoch": 0.0018610252059443335, + "grad_norm": 1.132461914533669, + "learning_rate": 4.999957264936925e-05, + "loss": 0.7459, + "num_input_tokens_seen": 3119200, + "step": 17 + }, + { + "epoch": 0.0019704972768822353, + "grad_norm": 1.1220580947781853, + "learning_rate": 4.999952089426785e-05, + "loss": 0.5144, + "num_input_tokens_seen": 3288096, + "step": 18 + }, + { + "epoch": 0.0020799693478201374, + "grad_norm": 1.255043868479814, + "learning_rate": 4.9999466181770934e-05, + "loss": 0.7758, + "num_input_tokens_seen": 3496864, + "step": 19 + }, + { + "epoch": 0.0021894414187580394, + "grad_norm": 1.1836915935701577, + "learning_rate": 4.999940851188495e-05, + "loss": 0.9462, + "num_input_tokens_seen": 3710560, + "step": 20 + }, + { + "epoch": 0.0022989134896959414, + "grad_norm": 1.374230125948353, + "learning_rate": 4.999934788461673e-05, + "loss": 0.8865, + "num_input_tokens_seen": 3894016, + "step": 21 + }, + { + "epoch": 0.0024083855606338435, + "grad_norm": 1.1307017162100463, + "learning_rate": 4.9999284299973456e-05, + "loss": 0.5197, + "num_input_tokens_seen": 4062688, + "step": 22 + }, + { + "epoch": 0.002517857631571745, + "grad_norm": 1.2849048960035863, + "learning_rate": 4.999921775796263e-05, + "loss": 0.6756, + "num_input_tokens_seen": 4241216, + "step": 23 + }, + { + "epoch": 0.002627329702509647, + "grad_norm": 1.2966354332686816, + "learning_rate": 4.999914825859214e-05, + "loss": 0.6375, + "num_input_tokens_seen": 4415712, + "step": 24 + }, + { + "epoch": 0.002736801773447549, + "grad_norm": 1.294241886339698, + "learning_rate": 4.999907580187019e-05, + "loss": 0.8339, + "num_input_tokens_seen": 4624256, + "step": 25 + }, + { + "epoch": 0.002846273844385451, + "grad_norm": 1.3497145704242808, + "learning_rate": 4.9999000387805375e-05, + "loss": 0.6684, + "num_input_tokens_seen": 4798304, + "step": 26 + }, + { + "epoch": 0.002955745915323353, + "grad_norm": 1.2495057559420786, + "learning_rate": 4.99989220164066e-05, + "loss": 0.6474, + "num_input_tokens_seen": 4979744, + "step": 27 + }, + { + "epoch": 0.0030652179862612552, + "grad_norm": 1.6041937821643815, + "learning_rate": 4.9998840687683135e-05, + "loss": 0.7383, + "num_input_tokens_seen": 5139008, + "step": 28 + }, + { + "epoch": 0.0031746900571991573, + "grad_norm": 1.4431920348169647, + "learning_rate": 4.999875640164461e-05, + "loss": 0.8955, + "num_input_tokens_seen": 5321344, + "step": 29 + }, + { + "epoch": 0.003284162128137059, + "grad_norm": 1.3190368719984544, + "learning_rate": 4.9998669158301e-05, + "loss": 0.737, + "num_input_tokens_seen": 5515552, + "step": 30 + }, + { + "epoch": 0.003393634199074961, + "grad_norm": 1.3916466635372422, + "learning_rate": 4.999857895766261e-05, + "loss": 0.8718, + "num_input_tokens_seen": 5720064, + "step": 31 + }, + { + "epoch": 0.003503106270012863, + "grad_norm": 1.2276217509410152, + "learning_rate": 4.999848579974012e-05, + "loss": 0.851, + "num_input_tokens_seen": 5921440, + "step": 32 + }, + { + "epoch": 0.003612578340950765, + "grad_norm": 1.2148459087978207, + "learning_rate": 4.9998389684544546e-05, + "loss": 0.7081, + "num_input_tokens_seen": 6113184, + "step": 33 + }, + { + "epoch": 0.003722050411888667, + "grad_norm": 1.136058917371372, + "learning_rate": 4.999829061208726e-05, + "loss": 0.5156, + "num_input_tokens_seen": 6283648, + "step": 34 + }, + { + "epoch": 0.003831522482826569, + "grad_norm": 1.1218150407074938, + "learning_rate": 4.999818858237999e-05, + "loss": 0.6898, + "num_input_tokens_seen": 6468896, + "step": 35 + }, + { + "epoch": 0.003940994553764471, + "grad_norm": 1.1919061831319286, + "learning_rate": 4.99980835954348e-05, + "loss": 0.6718, + "num_input_tokens_seen": 6661312, + "step": 36 + }, + { + "epoch": 0.004050466624702373, + "grad_norm": 1.2358926063225693, + "learning_rate": 4.999797565126411e-05, + "loss": 0.6001, + "num_input_tokens_seen": 6839840, + "step": 37 + }, + { + "epoch": 0.004159938695640275, + "grad_norm": 1.0979278337078424, + "learning_rate": 4.999786474988067e-05, + "loss": 0.5177, + "num_input_tokens_seen": 7020832, + "step": 38 + }, + { + "epoch": 0.004269410766578176, + "grad_norm": 1.1982213448326078, + "learning_rate": 4.9997750891297636e-05, + "loss": 0.634, + "num_input_tokens_seen": 7218624, + "step": 39 + }, + { + "epoch": 0.004378882837516079, + "grad_norm": 1.271601752094647, + "learning_rate": 4.9997634075528454e-05, + "loss": 0.6785, + "num_input_tokens_seen": 7394912, + "step": 40 + }, + { + "epoch": 0.00448835490845398, + "grad_norm": 1.329866994718059, + "learning_rate": 4.999751430258695e-05, + "loss": 0.6915, + "num_input_tokens_seen": 7573888, + "step": 41 + }, + { + "epoch": 0.004597826979391883, + "grad_norm": 1.2701181786287075, + "learning_rate": 4.999739157248729e-05, + "loss": 0.6931, + "num_input_tokens_seen": 7770112, + "step": 42 + }, + { + "epoch": 0.0047072990503297845, + "grad_norm": 1.2917071869349803, + "learning_rate": 4.9997265885243993e-05, + "loss": 0.7323, + "num_input_tokens_seen": 7986944, + "step": 43 + }, + { + "epoch": 0.004816771121267687, + "grad_norm": 1.2481899985330958, + "learning_rate": 4.999713724087193e-05, + "loss": 0.8218, + "num_input_tokens_seen": 8189664, + "step": 44 + }, + { + "epoch": 0.0049262431922055885, + "grad_norm": 1.3795275706722696, + "learning_rate": 4.999700563938632e-05, + "loss": 0.9619, + "num_input_tokens_seen": 8383424, + "step": 45 + }, + { + "epoch": 0.00503571526314349, + "grad_norm": 1.2361077809854482, + "learning_rate": 4.999687108080272e-05, + "loss": 0.6285, + "num_input_tokens_seen": 8576960, + "step": 46 + }, + { + "epoch": 0.005145187334081393, + "grad_norm": 1.2048509776299716, + "learning_rate": 4.999673356513707e-05, + "loss": 0.6056, + "num_input_tokens_seen": 8737792, + "step": 47 + }, + { + "epoch": 0.005254659405019294, + "grad_norm": 1.1653587753179582, + "learning_rate": 4.999659309240561e-05, + "loss": 0.6865, + "num_input_tokens_seen": 8913856, + "step": 48 + }, + { + "epoch": 0.005364131475957197, + "grad_norm": 1.1864551665833467, + "learning_rate": 4.9996449662624986e-05, + "loss": 0.6112, + "num_input_tokens_seen": 9081408, + "step": 49 + }, + { + "epoch": 0.005473603546895098, + "grad_norm": 1.2517284364269574, + "learning_rate": 4.999630327581214e-05, + "loss": 0.6682, + "num_input_tokens_seen": 9257472, + "step": 50 + }, + { + "epoch": 0.005583075617833001, + "grad_norm": 1.3397732374137514, + "learning_rate": 4.9996153931984415e-05, + "loss": 0.8365, + "num_input_tokens_seen": 9419648, + "step": 51 + }, + { + "epoch": 0.005692547688770902, + "grad_norm": 1.3276760913416354, + "learning_rate": 4.999600163115945e-05, + "loss": 0.7045, + "num_input_tokens_seen": 9596384, + "step": 52 + }, + { + "epoch": 0.005802019759708804, + "grad_norm": 1.2838270740345719, + "learning_rate": 4.999584637335529e-05, + "loss": 0.7067, + "num_input_tokens_seen": 9769984, + "step": 53 + }, + { + "epoch": 0.005911491830646706, + "grad_norm": 1.3014800339822237, + "learning_rate": 4.9995688158590284e-05, + "loss": 0.7084, + "num_input_tokens_seen": 9985248, + "step": 54 + }, + { + "epoch": 0.006020963901584608, + "grad_norm": 1.3596091671655097, + "learning_rate": 4.9995526986883146e-05, + "loss": 0.7537, + "num_input_tokens_seen": 10184832, + "step": 55 + }, + { + "epoch": 0.0061304359725225105, + "grad_norm": 1.350615903090275, + "learning_rate": 4.999536285825295e-05, + "loss": 0.8001, + "num_input_tokens_seen": 10334240, + "step": 56 + }, + { + "epoch": 0.006239908043460412, + "grad_norm": 1.2279643897724124, + "learning_rate": 4.999519577271912e-05, + "loss": 0.5893, + "num_input_tokens_seen": 10484544, + "step": 57 + }, + { + "epoch": 0.0063493801143983146, + "grad_norm": 1.3462862431740028, + "learning_rate": 4.9995025730301406e-05, + "loss": 0.859, + "num_input_tokens_seen": 10655008, + "step": 58 + }, + { + "epoch": 0.006458852185336216, + "grad_norm": 1.2974966466636175, + "learning_rate": 4.999485273101993e-05, + "loss": 0.8568, + "num_input_tokens_seen": 10842496, + "step": 59 + }, + { + "epoch": 0.006568324256274118, + "grad_norm": 1.233930048844086, + "learning_rate": 4.9994676774895154e-05, + "loss": 0.5817, + "num_input_tokens_seen": 10991456, + "step": 60 + }, + { + "epoch": 0.00667779632721202, + "grad_norm": 1.2547017838934604, + "learning_rate": 4.99944978619479e-05, + "loss": 0.7665, + "num_input_tokens_seen": 11186784, + "step": 61 + }, + { + "epoch": 0.006787268398149922, + "grad_norm": 1.0973100218002094, + "learning_rate": 4.9994315992199335e-05, + "loss": 0.6607, + "num_input_tokens_seen": 11373824, + "step": 62 + }, + { + "epoch": 0.006896740469087824, + "grad_norm": 1.1520029889604417, + "learning_rate": 4.9994131165670965e-05, + "loss": 0.6294, + "num_input_tokens_seen": 11558176, + "step": 63 + }, + { + "epoch": 0.007006212540025726, + "grad_norm": 1.330066249531122, + "learning_rate": 4.999394338238466e-05, + "loss": 0.766, + "num_input_tokens_seen": 11744096, + "step": 64 + }, + { + "epoch": 0.0071156846109636275, + "grad_norm": 1.35972283834916, + "learning_rate": 4.999375264236263e-05, + "loss": 0.9307, + "num_input_tokens_seen": 11934048, + "step": 65 + }, + { + "epoch": 0.00722515668190153, + "grad_norm": 1.2439166815169054, + "learning_rate": 4.999355894562745e-05, + "loss": 0.7371, + "num_input_tokens_seen": 12089728, + "step": 66 + }, + { + "epoch": 0.007334628752839432, + "grad_norm": 1.3260443152765562, + "learning_rate": 4.9993362292202024e-05, + "loss": 0.7666, + "num_input_tokens_seen": 12249888, + "step": 67 + }, + { + "epoch": 0.007444100823777334, + "grad_norm": 1.1537749115640215, + "learning_rate": 4.999316268210962e-05, + "loss": 0.6124, + "num_input_tokens_seen": 12437152, + "step": 68 + }, + { + "epoch": 0.007553572894715236, + "grad_norm": 1.308760948317531, + "learning_rate": 4.999296011537384e-05, + "loss": 0.6846, + "num_input_tokens_seen": 12618592, + "step": 69 + }, + { + "epoch": 0.007663044965653138, + "grad_norm": 1.2632118106930115, + "learning_rate": 4.999275459201866e-05, + "loss": 0.9314, + "num_input_tokens_seen": 12802944, + "step": 70 + }, + { + "epoch": 0.00777251703659104, + "grad_norm": 1.2801486142877243, + "learning_rate": 4.9992546112068394e-05, + "loss": 0.7839, + "num_input_tokens_seen": 12979456, + "step": 71 + }, + { + "epoch": 0.007881989107528941, + "grad_norm": 1.175245549537397, + "learning_rate": 4.9992334675547704e-05, + "loss": 0.696, + "num_input_tokens_seen": 13140064, + "step": 72 + }, + { + "epoch": 0.007991461178466844, + "grad_norm": 1.195755521167767, + "learning_rate": 4.999212028248159e-05, + "loss": 0.598, + "num_input_tokens_seen": 13289920, + "step": 73 + }, + { + "epoch": 0.008100933249404746, + "grad_norm": 1.2464561772422158, + "learning_rate": 4.999190293289543e-05, + "loss": 0.6061, + "num_input_tokens_seen": 13497792, + "step": 74 + }, + { + "epoch": 0.008210405320342647, + "grad_norm": 1.310941497469276, + "learning_rate": 4.999168262681492e-05, + "loss": 0.5776, + "num_input_tokens_seen": 13658176, + "step": 75 + }, + { + "epoch": 0.00831987739128055, + "grad_norm": 1.3134431629117824, + "learning_rate": 4.999145936426614e-05, + "loss": 0.6711, + "num_input_tokens_seen": 13830432, + "step": 76 + }, + { + "epoch": 0.008429349462218452, + "grad_norm": 1.3566804246806279, + "learning_rate": 4.9991233145275495e-05, + "loss": 0.6841, + "num_input_tokens_seen": 14008064, + "step": 77 + }, + { + "epoch": 0.008538821533156353, + "grad_norm": 1.1617955914181757, + "learning_rate": 4.999100396986974e-05, + "loss": 0.5426, + "num_input_tokens_seen": 14177408, + "step": 78 + }, + { + "epoch": 0.008648293604094255, + "grad_norm": 1.4081358238782862, + "learning_rate": 4.999077183807599e-05, + "loss": 0.7276, + "num_input_tokens_seen": 14332416, + "step": 79 + }, + { + "epoch": 0.008757765675032158, + "grad_norm": 1.2956730431695114, + "learning_rate": 4.9990536749921704e-05, + "loss": 0.7968, + "num_input_tokens_seen": 14506912, + "step": 80 + }, + { + "epoch": 0.00886723774597006, + "grad_norm": 1.3432844819414627, + "learning_rate": 4.999029870543469e-05, + "loss": 0.7246, + "num_input_tokens_seen": 14691488, + "step": 81 + }, + { + "epoch": 0.00897670981690796, + "grad_norm": 1.215107830819847, + "learning_rate": 4.999005770464312e-05, + "loss": 0.6807, + "num_input_tokens_seen": 14874272, + "step": 82 + }, + { + "epoch": 0.009086181887845863, + "grad_norm": 1.4219052657375089, + "learning_rate": 4.99898137475755e-05, + "loss": 0.7125, + "num_input_tokens_seen": 15070944, + "step": 83 + }, + { + "epoch": 0.009195653958783766, + "grad_norm": 1.2447303799448415, + "learning_rate": 4.998956683426068e-05, + "loss": 0.8963, + "num_input_tokens_seen": 15264032, + "step": 84 + }, + { + "epoch": 0.009305126029721666, + "grad_norm": 1.236325376968584, + "learning_rate": 4.9989316964727873e-05, + "loss": 0.6492, + "num_input_tokens_seen": 15433824, + "step": 85 + }, + { + "epoch": 0.009414598100659569, + "grad_norm": 1.2932306583358555, + "learning_rate": 4.9989064139006645e-05, + "loss": 0.7585, + "num_input_tokens_seen": 15591296, + "step": 86 + }, + { + "epoch": 0.009524070171597471, + "grad_norm": 1.291843489367351, + "learning_rate": 4.99888083571269e-05, + "loss": 0.7022, + "num_input_tokens_seen": 15773408, + "step": 87 + }, + { + "epoch": 0.009633542242535374, + "grad_norm": 1.2330059233381645, + "learning_rate": 4.99885496191189e-05, + "loss": 0.6068, + "num_input_tokens_seen": 15975904, + "step": 88 + }, + { + "epoch": 0.009743014313473275, + "grad_norm": 1.380008030398127, + "learning_rate": 4.998828792501324e-05, + "loss": 0.7437, + "num_input_tokens_seen": 16134272, + "step": 89 + }, + { + "epoch": 0.009852486384411177, + "grad_norm": 1.342459924337691, + "learning_rate": 4.998802327484089e-05, + "loss": 0.8501, + "num_input_tokens_seen": 16287936, + "step": 90 + }, + { + "epoch": 0.00996195845534908, + "grad_norm": 1.3102632263948444, + "learning_rate": 4.9987755668633165e-05, + "loss": 0.7088, + "num_input_tokens_seen": 16470048, + "step": 91 + }, + { + "epoch": 0.01007143052628698, + "grad_norm": 1.6900652708055774, + "learning_rate": 4.998748510642171e-05, + "loss": 1.2246, + "num_input_tokens_seen": 16663584, + "step": 92 + }, + { + "epoch": 0.010180902597224883, + "grad_norm": 1.1778728346781973, + "learning_rate": 4.998721158823853e-05, + "loss": 0.7137, + "num_input_tokens_seen": 16859808, + "step": 93 + }, + { + "epoch": 0.010290374668162785, + "grad_norm": 1.1839983157423, + "learning_rate": 4.998693511411599e-05, + "loss": 0.8742, + "num_input_tokens_seen": 17044832, + "step": 94 + }, + { + "epoch": 0.010399846739100688, + "grad_norm": 1.3026541209057638, + "learning_rate": 4.998665568408679e-05, + "loss": 0.8652, + "num_input_tokens_seen": 17235456, + "step": 95 + }, + { + "epoch": 0.010509318810038588, + "grad_norm": 1.2607408637313926, + "learning_rate": 4.998637329818399e-05, + "loss": 0.804, + "num_input_tokens_seen": 17433696, + "step": 96 + }, + { + "epoch": 0.010618790880976491, + "grad_norm": 1.172348740506192, + "learning_rate": 4.9986087956441e-05, + "loss": 0.6367, + "num_input_tokens_seen": 17581760, + "step": 97 + }, + { + "epoch": 0.010728262951914393, + "grad_norm": 1.1897495946368453, + "learning_rate": 4.9985799658891563e-05, + "loss": 0.8407, + "num_input_tokens_seen": 17766112, + "step": 98 + }, + { + "epoch": 0.010837735022852294, + "grad_norm": 1.2996089108612177, + "learning_rate": 4.998550840556979e-05, + "loss": 0.9003, + "num_input_tokens_seen": 17971296, + "step": 99 + }, + { + "epoch": 0.010947207093790197, + "grad_norm": 1.0973625468942212, + "learning_rate": 4.998521419651014e-05, + "loss": 0.6371, + "num_input_tokens_seen": 18153632, + "step": 100 + }, + { + "epoch": 0.011056679164728099, + "grad_norm": 1.3438372029242445, + "learning_rate": 4.998491703174742e-05, + "loss": 0.8215, + "num_input_tokens_seen": 18327680, + "step": 101 + }, + { + "epoch": 0.011166151235666001, + "grad_norm": 1.3508748857687496, + "learning_rate": 4.998461691131677e-05, + "loss": 0.6554, + "num_input_tokens_seen": 18492096, + "step": 102 + }, + { + "epoch": 0.011275623306603902, + "grad_norm": 1.304374520229074, + "learning_rate": 4.9984313835253705e-05, + "loss": 0.6907, + "num_input_tokens_seen": 18668160, + "step": 103 + }, + { + "epoch": 0.011385095377541805, + "grad_norm": 1.3466960637657748, + "learning_rate": 4.998400780359408e-05, + "loss": 0.8705, + "num_input_tokens_seen": 18863712, + "step": 104 + }, + { + "epoch": 0.011494567448479707, + "grad_norm": 1.3093877860016294, + "learning_rate": 4.998369881637408e-05, + "loss": 0.8243, + "num_input_tokens_seen": 19085920, + "step": 105 + }, + { + "epoch": 0.011604039519417608, + "grad_norm": 1.2837554811176786, + "learning_rate": 4.9983386873630285e-05, + "loss": 0.7918, + "num_input_tokens_seen": 19289312, + "step": 106 + }, + { + "epoch": 0.01171351159035551, + "grad_norm": 1.2113823350513657, + "learning_rate": 4.998307197539958e-05, + "loss": 0.8296, + "num_input_tokens_seen": 19510400, + "step": 107 + }, + { + "epoch": 0.011822983661293413, + "grad_norm": 1.2047157175442955, + "learning_rate": 4.998275412171921e-05, + "loss": 0.6138, + "num_input_tokens_seen": 19667648, + "step": 108 + }, + { + "epoch": 0.011932455732231315, + "grad_norm": 1.253281149327372, + "learning_rate": 4.99824333126268e-05, + "loss": 0.6533, + "num_input_tokens_seen": 19813248, + "step": 109 + }, + { + "epoch": 0.012041927803169216, + "grad_norm": 1.356347186917736, + "learning_rate": 4.9982109548160274e-05, + "loss": 0.6766, + "num_input_tokens_seen": 20019552, + "step": 110 + }, + { + "epoch": 0.012151399874107119, + "grad_norm": 1.3927892495470626, + "learning_rate": 4.998178282835795e-05, + "loss": 0.8005, + "num_input_tokens_seen": 20190688, + "step": 111 + }, + { + "epoch": 0.012260871945045021, + "grad_norm": 1.1484580524707517, + "learning_rate": 4.998145315325848e-05, + "loss": 0.7124, + "num_input_tokens_seen": 20392736, + "step": 112 + }, + { + "epoch": 0.012370344015982922, + "grad_norm": 1.111208998156478, + "learning_rate": 4.998112052290086e-05, + "loss": 0.7671, + "num_input_tokens_seen": 20599488, + "step": 113 + }, + { + "epoch": 0.012479816086920824, + "grad_norm": 1.2434348315689434, + "learning_rate": 4.9980784937324434e-05, + "loss": 0.7535, + "num_input_tokens_seen": 20764128, + "step": 114 + }, + { + "epoch": 0.012589288157858727, + "grad_norm": 1.3001190424480877, + "learning_rate": 4.9980446396568906e-05, + "loss": 0.6377, + "num_input_tokens_seen": 20911072, + "step": 115 + }, + { + "epoch": 0.012698760228796629, + "grad_norm": 1.3033923270556487, + "learning_rate": 4.998010490067432e-05, + "loss": 0.7537, + "num_input_tokens_seen": 21116480, + "step": 116 + }, + { + "epoch": 0.01280823229973453, + "grad_norm": 1.285282951996608, + "learning_rate": 4.997976044968108e-05, + "loss": 0.8034, + "num_input_tokens_seen": 21263872, + "step": 117 + }, + { + "epoch": 0.012917704370672432, + "grad_norm": 1.2700183110156096, + "learning_rate": 4.997941304362993e-05, + "loss": 0.7566, + "num_input_tokens_seen": 21443744, + "step": 118 + }, + { + "epoch": 0.013027176441610335, + "grad_norm": 1.1933837488560382, + "learning_rate": 4.997906268256197e-05, + "loss": 0.5467, + "num_input_tokens_seen": 21616896, + "step": 119 + }, + { + "epoch": 0.013136648512548236, + "grad_norm": 1.3830137375778893, + "learning_rate": 4.997870936651865e-05, + "loss": 0.8963, + "num_input_tokens_seen": 21807520, + "step": 120 + }, + { + "epoch": 0.013246120583486138, + "grad_norm": 1.257332562168919, + "learning_rate": 4.9978353095541766e-05, + "loss": 0.7902, + "num_input_tokens_seen": 22001728, + "step": 121 + }, + { + "epoch": 0.01335559265442404, + "grad_norm": 1.3204808325367638, + "learning_rate": 4.997799386967345e-05, + "loss": 0.79, + "num_input_tokens_seen": 22203776, + "step": 122 + }, + { + "epoch": 0.013465064725361941, + "grad_norm": 1.3640715049280374, + "learning_rate": 4.9977631688956215e-05, + "loss": 0.7779, + "num_input_tokens_seen": 22382752, + "step": 123 + }, + { + "epoch": 0.013574536796299844, + "grad_norm": 1.4345919177023527, + "learning_rate": 4.99772665534329e-05, + "loss": 0.7406, + "num_input_tokens_seen": 22556576, + "step": 124 + }, + { + "epoch": 0.013684008867237746, + "grad_norm": 1.1456887703070118, + "learning_rate": 4.9976898463146706e-05, + "loss": 0.7324, + "num_input_tokens_seen": 22740256, + "step": 125 + }, + { + "epoch": 0.013793480938175649, + "grad_norm": 1.2628167161160788, + "learning_rate": 4.997652741814116e-05, + "loss": 0.8445, + "num_input_tokens_seen": 22933120, + "step": 126 + }, + { + "epoch": 0.01390295300911355, + "grad_norm": 1.1689941090124547, + "learning_rate": 4.9976153418460184e-05, + "loss": 0.5797, + "num_input_tokens_seen": 23090144, + "step": 127 + }, + { + "epoch": 0.014012425080051452, + "grad_norm": 1.361829974413629, + "learning_rate": 4.997577646414799e-05, + "loss": 0.8427, + "num_input_tokens_seen": 23281888, + "step": 128 + }, + { + "epoch": 0.014121897150989354, + "grad_norm": 1.1775641120578497, + "learning_rate": 4.997539655524919e-05, + "loss": 0.8864, + "num_input_tokens_seen": 23460640, + "step": 129 + }, + { + "epoch": 0.014231369221927255, + "grad_norm": 1.2854167361184734, + "learning_rate": 4.997501369180872e-05, + "loss": 0.8312, + "num_input_tokens_seen": 23639840, + "step": 130 + }, + { + "epoch": 0.014340841292865157, + "grad_norm": 1.267659706347049, + "learning_rate": 4.997462787387188e-05, + "loss": 0.7574, + "num_input_tokens_seen": 23806048, + "step": 131 + }, + { + "epoch": 0.01445031336380306, + "grad_norm": 1.180767651124116, + "learning_rate": 4.997423910148431e-05, + "loss": 0.6549, + "num_input_tokens_seen": 23972928, + "step": 132 + }, + { + "epoch": 0.014559785434740962, + "grad_norm": 1.1825930453544513, + "learning_rate": 4.9973847374691985e-05, + "loss": 0.6969, + "num_input_tokens_seen": 24161984, + "step": 133 + }, + { + "epoch": 0.014669257505678863, + "grad_norm": 1.1945018637456202, + "learning_rate": 4.997345269354127e-05, + "loss": 0.6144, + "num_input_tokens_seen": 24306016, + "step": 134 + }, + { + "epoch": 0.014778729576616766, + "grad_norm": 1.3292269903304006, + "learning_rate": 4.9973055058078835e-05, + "loss": 0.7967, + "num_input_tokens_seen": 24494400, + "step": 135 + }, + { + "epoch": 0.014888201647554668, + "grad_norm": 1.3011517154685959, + "learning_rate": 4.997265446835172e-05, + "loss": 0.8719, + "num_input_tokens_seen": 24669344, + "step": 136 + }, + { + "epoch": 0.014997673718492569, + "grad_norm": 1.274512042375184, + "learning_rate": 4.997225092440733e-05, + "loss": 0.7693, + "num_input_tokens_seen": 24866688, + "step": 137 + }, + { + "epoch": 0.015107145789430471, + "grad_norm": 1.2153761699748957, + "learning_rate": 4.9971844426293395e-05, + "loss": 0.658, + "num_input_tokens_seen": 25045440, + "step": 138 + }, + { + "epoch": 0.015216617860368374, + "grad_norm": 1.5907334596120806, + "learning_rate": 4.9971434974058e-05, + "loss": 0.8443, + "num_input_tokens_seen": 25220384, + "step": 139 + }, + { + "epoch": 0.015326089931306276, + "grad_norm": 1.3242865350979092, + "learning_rate": 4.997102256774959e-05, + "loss": 0.6314, + "num_input_tokens_seen": 25393760, + "step": 140 + }, + { + "epoch": 0.015435562002244177, + "grad_norm": 1.3546339647039225, + "learning_rate": 4.997060720741694e-05, + "loss": 0.7165, + "num_input_tokens_seen": 25558400, + "step": 141 + }, + { + "epoch": 0.01554503407318208, + "grad_norm": 1.2851046288045158, + "learning_rate": 4.9970188893109194e-05, + "loss": 0.7813, + "num_input_tokens_seen": 25745888, + "step": 142 + }, + { + "epoch": 0.015654506144119982, + "grad_norm": 1.3505280345053423, + "learning_rate": 4.996976762487584e-05, + "loss": 0.7194, + "num_input_tokens_seen": 25876928, + "step": 143 + }, + { + "epoch": 0.015763978215057883, + "grad_norm": 1.3367935917746943, + "learning_rate": 4.996934340276671e-05, + "loss": 0.7349, + "num_input_tokens_seen": 26026560, + "step": 144 + }, + { + "epoch": 0.015873450285995787, + "grad_norm": 1.0933933766574906, + "learning_rate": 4.996891622683199e-05, + "loss": 0.8035, + "num_input_tokens_seen": 26232192, + "step": 145 + }, + { + "epoch": 0.015982922356933688, + "grad_norm": 1.1930001525143368, + "learning_rate": 4.99684860971222e-05, + "loss": 0.8288, + "num_input_tokens_seen": 26404448, + "step": 146 + }, + { + "epoch": 0.01609239442787159, + "grad_norm": 1.2073113570144962, + "learning_rate": 4.996805301368825e-05, + "loss": 0.7917, + "num_input_tokens_seen": 26596640, + "step": 147 + }, + { + "epoch": 0.016201866498809493, + "grad_norm": 1.259253393317462, + "learning_rate": 4.9967616976581354e-05, + "loss": 0.9584, + "num_input_tokens_seen": 26787264, + "step": 148 + }, + { + "epoch": 0.016311338569747393, + "grad_norm": 0.9447077558631755, + "learning_rate": 4.99671779858531e-05, + "loss": 0.4423, + "num_input_tokens_seen": 26954816, + "step": 149 + }, + { + "epoch": 0.016420810640685294, + "grad_norm": 1.1899499716781126, + "learning_rate": 4.996673604155542e-05, + "loss": 0.6417, + "num_input_tokens_seen": 27141856, + "step": 150 + }, + { + "epoch": 0.016530282711623198, + "grad_norm": 1.20810560294741, + "learning_rate": 4.9966291143740595e-05, + "loss": 0.8161, + "num_input_tokens_seen": 27349056, + "step": 151 + }, + { + "epoch": 0.0166397547825611, + "grad_norm": 1.1554031879635294, + "learning_rate": 4.996584329246126e-05, + "loss": 0.6427, + "num_input_tokens_seen": 27561408, + "step": 152 + }, + { + "epoch": 0.016749226853499, + "grad_norm": 1.2001107343603206, + "learning_rate": 4.996539248777038e-05, + "loss": 0.8038, + "num_input_tokens_seen": 27762784, + "step": 153 + }, + { + "epoch": 0.016858698924436904, + "grad_norm": 1.2942535378767162, + "learning_rate": 4.99649387297213e-05, + "loss": 0.74, + "num_input_tokens_seen": 27954304, + "step": 154 + }, + { + "epoch": 0.016968170995374805, + "grad_norm": 1.32273128542281, + "learning_rate": 4.996448201836769e-05, + "loss": 0.8268, + "num_input_tokens_seen": 28126112, + "step": 155 + }, + { + "epoch": 0.017077643066312705, + "grad_norm": 1.2041055226009494, + "learning_rate": 4.9964022353763586e-05, + "loss": 0.682, + "num_input_tokens_seen": 28325248, + "step": 156 + }, + { + "epoch": 0.01718711513725061, + "grad_norm": 1.3357101157686495, + "learning_rate": 4.996355973596336e-05, + "loss": 0.6126, + "num_input_tokens_seen": 28487424, + "step": 157 + }, + { + "epoch": 0.01729658720818851, + "grad_norm": 1.1628431552043688, + "learning_rate": 4.996309416502174e-05, + "loss": 0.8077, + "num_input_tokens_seen": 28670880, + "step": 158 + }, + { + "epoch": 0.017406059279126414, + "grad_norm": 1.331740003499244, + "learning_rate": 4.99626256409938e-05, + "loss": 0.6373, + "num_input_tokens_seen": 28836864, + "step": 159 + }, + { + "epoch": 0.017515531350064315, + "grad_norm": 1.2132496571158131, + "learning_rate": 4.996215416393496e-05, + "loss": 0.6675, + "num_input_tokens_seen": 29027040, + "step": 160 + }, + { + "epoch": 0.017625003421002216, + "grad_norm": 1.153445941953066, + "learning_rate": 4.996167973390101e-05, + "loss": 0.6027, + "num_input_tokens_seen": 29220800, + "step": 161 + }, + { + "epoch": 0.01773447549194012, + "grad_norm": 1.2664845220626775, + "learning_rate": 4.996120235094807e-05, + "loss": 0.7717, + "num_input_tokens_seen": 29402240, + "step": 162 + }, + { + "epoch": 0.01784394756287802, + "grad_norm": 1.5897797892683405, + "learning_rate": 4.99607220151326e-05, + "loss": 0.753, + "num_input_tokens_seen": 29604960, + "step": 163 + }, + { + "epoch": 0.01795341963381592, + "grad_norm": 1.1701446473948087, + "learning_rate": 4.9960238726511435e-05, + "loss": 0.6594, + "num_input_tokens_seen": 29785504, + "step": 164 + }, + { + "epoch": 0.018062891704753826, + "grad_norm": 1.2202959770226978, + "learning_rate": 4.995975248514175e-05, + "loss": 0.6007, + "num_input_tokens_seen": 29945664, + "step": 165 + }, + { + "epoch": 0.018172363775691727, + "grad_norm": 1.2282411787297483, + "learning_rate": 4.995926329108106e-05, + "loss": 0.5952, + "num_input_tokens_seen": 30127776, + "step": 166 + }, + { + "epoch": 0.018281835846629627, + "grad_norm": 1.2640172848907905, + "learning_rate": 4.995877114438723e-05, + "loss": 0.6946, + "num_input_tokens_seen": 30298240, + "step": 167 + }, + { + "epoch": 0.01839130791756753, + "grad_norm": 1.2008509261417466, + "learning_rate": 4.995827604511849e-05, + "loss": 0.641, + "num_input_tokens_seen": 30466688, + "step": 168 + }, + { + "epoch": 0.018500779988505432, + "grad_norm": 1.3243884762721467, + "learning_rate": 4.995777799333341e-05, + "loss": 0.8853, + "num_input_tokens_seen": 30675456, + "step": 169 + }, + { + "epoch": 0.018610252059443333, + "grad_norm": 1.4545201812791781, + "learning_rate": 4.99572769890909e-05, + "loss": 0.9814, + "num_input_tokens_seen": 30839872, + "step": 170 + }, + { + "epoch": 0.018719724130381237, + "grad_norm": 1.2242757291016226, + "learning_rate": 4.9956773032450234e-05, + "loss": 0.7272, + "num_input_tokens_seen": 31008320, + "step": 171 + }, + { + "epoch": 0.018829196201319138, + "grad_norm": 1.3488267763221111, + "learning_rate": 4.995626612347103e-05, + "loss": 0.6521, + "num_input_tokens_seen": 31167136, + "step": 172 + }, + { + "epoch": 0.018938668272257042, + "grad_norm": 1.3268669003829234, + "learning_rate": 4.995575626221325e-05, + "loss": 0.5594, + "num_input_tokens_seen": 31368288, + "step": 173 + }, + { + "epoch": 0.019048140343194943, + "grad_norm": 1.3298842951272898, + "learning_rate": 4.995524344873721e-05, + "loss": 0.6103, + "num_input_tokens_seen": 31545248, + "step": 174 + }, + { + "epoch": 0.019157612414132844, + "grad_norm": 1.3688183458239211, + "learning_rate": 4.9954727683103576e-05, + "loss": 0.6036, + "num_input_tokens_seen": 31689504, + "step": 175 + }, + { + "epoch": 0.019267084485070748, + "grad_norm": 1.2367860449231904, + "learning_rate": 4.995420896537336e-05, + "loss": 0.6305, + "num_input_tokens_seen": 31844064, + "step": 176 + }, + { + "epoch": 0.01937655655600865, + "grad_norm": 1.1431578640726583, + "learning_rate": 4.995368729560793e-05, + "loss": 0.8005, + "num_input_tokens_seen": 32049472, + "step": 177 + }, + { + "epoch": 0.01948602862694655, + "grad_norm": 1.3481964797663968, + "learning_rate": 4.9953162673869005e-05, + "loss": 0.835, + "num_input_tokens_seen": 32213216, + "step": 178 + }, + { + "epoch": 0.019595500697884453, + "grad_norm": 1.320128523970803, + "learning_rate": 4.9952635100218623e-05, + "loss": 0.7817, + "num_input_tokens_seen": 32383008, + "step": 179 + }, + { + "epoch": 0.019704972768822354, + "grad_norm": 1.1874281710935837, + "learning_rate": 4.995210457471922e-05, + "loss": 0.7382, + "num_input_tokens_seen": 32601632, + "step": 180 + }, + { + "epoch": 0.019814444839760255, + "grad_norm": 1.1670096040966296, + "learning_rate": 4.995157109743354e-05, + "loss": 0.6084, + "num_input_tokens_seen": 32779712, + "step": 181 + }, + { + "epoch": 0.01992391691069816, + "grad_norm": 1.3274182402582513, + "learning_rate": 4.99510346684247e-05, + "loss": 0.887, + "num_input_tokens_seen": 32956672, + "step": 182 + }, + { + "epoch": 0.02003338898163606, + "grad_norm": 1.182883776755829, + "learning_rate": 4.995049528775616e-05, + "loss": 0.867, + "num_input_tokens_seen": 33140128, + "step": 183 + }, + { + "epoch": 0.02014286105257396, + "grad_norm": 1.3510078218093886, + "learning_rate": 4.994995295549173e-05, + "loss": 0.6597, + "num_input_tokens_seen": 33290208, + "step": 184 + }, + { + "epoch": 0.020252333123511865, + "grad_norm": 1.3725559881912301, + "learning_rate": 4.9949407671695554e-05, + "loss": 1.0128, + "num_input_tokens_seen": 33511520, + "step": 185 + }, + { + "epoch": 0.020361805194449765, + "grad_norm": 1.2080280422530558, + "learning_rate": 4.994885943643215e-05, + "loss": 0.7103, + "num_input_tokens_seen": 33703264, + "step": 186 + }, + { + "epoch": 0.02047127726538767, + "grad_norm": 1.3011150676382275, + "learning_rate": 4.994830824976636e-05, + "loss": 0.8895, + "num_input_tokens_seen": 33855136, + "step": 187 + }, + { + "epoch": 0.02058074933632557, + "grad_norm": 1.266647161576189, + "learning_rate": 4.99477541117634e-05, + "loss": 0.7295, + "num_input_tokens_seen": 34066816, + "step": 188 + }, + { + "epoch": 0.02069022140726347, + "grad_norm": 1.0882706312331067, + "learning_rate": 4.994719702248883e-05, + "loss": 0.5532, + "num_input_tokens_seen": 34269088, + "step": 189 + }, + { + "epoch": 0.020799693478201375, + "grad_norm": 1.6104727399307883, + "learning_rate": 4.9946636982008534e-05, + "loss": 0.7301, + "num_input_tokens_seen": 34457696, + "step": 190 + }, + { + "epoch": 0.020909165549139276, + "grad_norm": 1.3437319466021311, + "learning_rate": 4.994607399038877e-05, + "loss": 0.8629, + "num_input_tokens_seen": 34625920, + "step": 191 + }, + { + "epoch": 0.021018637620077177, + "grad_norm": 1.2116179492571733, + "learning_rate": 4.9945508047696154e-05, + "loss": 0.63, + "num_input_tokens_seen": 34823040, + "step": 192 + }, + { + "epoch": 0.02112810969101508, + "grad_norm": 1.115837081283721, + "learning_rate": 4.9944939153997614e-05, + "loss": 0.6346, + "num_input_tokens_seen": 35002464, + "step": 193 + }, + { + "epoch": 0.021237581761952982, + "grad_norm": 1.229045517118139, + "learning_rate": 4.994436730936046e-05, + "loss": 0.7864, + "num_input_tokens_seen": 35199584, + "step": 194 + }, + { + "epoch": 0.021347053832890883, + "grad_norm": 1.1079801467569736, + "learning_rate": 4.994379251385235e-05, + "loss": 0.6279, + "num_input_tokens_seen": 35414176, + "step": 195 + }, + { + "epoch": 0.021456525903828787, + "grad_norm": 1.2821148535360536, + "learning_rate": 4.9943214767541255e-05, + "loss": 0.7367, + "num_input_tokens_seen": 35598976, + "step": 196 + }, + { + "epoch": 0.021565997974766687, + "grad_norm": 1.2999059349654116, + "learning_rate": 4.994263407049554e-05, + "loss": 0.7586, + "num_input_tokens_seen": 35749504, + "step": 197 + }, + { + "epoch": 0.021675470045704588, + "grad_norm": 1.3176660593763079, + "learning_rate": 4.9942050422783906e-05, + "loss": 0.797, + "num_input_tokens_seen": 35929600, + "step": 198 + }, + { + "epoch": 0.021784942116642492, + "grad_norm": 1.2688547063006441, + "learning_rate": 4.994146382447538e-05, + "loss": 0.8002, + "num_input_tokens_seen": 36103200, + "step": 199 + }, + { + "epoch": 0.021894414187580393, + "grad_norm": 1.3863673234967293, + "learning_rate": 4.994087427563936e-05, + "loss": 0.9701, + "num_input_tokens_seen": 36276128, + "step": 200 + }, + { + "epoch": 0.022003886258518294, + "grad_norm": 1.0883695962267241, + "learning_rate": 4.9940281776345596e-05, + "loss": 0.7224, + "num_input_tokens_seen": 36457344, + "step": 201 + }, + { + "epoch": 0.022113358329456198, + "grad_norm": 1.2334500964197828, + "learning_rate": 4.993968632666417e-05, + "loss": 0.6799, + "num_input_tokens_seen": 36627360, + "step": 202 + }, + { + "epoch": 0.0222228304003941, + "grad_norm": 1.2432949000308366, + "learning_rate": 4.993908792666554e-05, + "loss": 0.7229, + "num_input_tokens_seen": 36761536, + "step": 203 + }, + { + "epoch": 0.022332302471332003, + "grad_norm": 1.2874550709708967, + "learning_rate": 4.9938486576420474e-05, + "loss": 0.8847, + "num_input_tokens_seen": 36932896, + "step": 204 + }, + { + "epoch": 0.022441774542269904, + "grad_norm": 1.2862197047188586, + "learning_rate": 4.993788227600013e-05, + "loss": 0.5768, + "num_input_tokens_seen": 37059904, + "step": 205 + }, + { + "epoch": 0.022551246613207804, + "grad_norm": 1.2603784297891532, + "learning_rate": 4.993727502547598e-05, + "loss": 0.6446, + "num_input_tokens_seen": 37243360, + "step": 206 + }, + { + "epoch": 0.02266071868414571, + "grad_norm": 1.147274804135399, + "learning_rate": 4.9936664824919865e-05, + "loss": 0.5917, + "num_input_tokens_seen": 37434880, + "step": 207 + }, + { + "epoch": 0.02277019075508361, + "grad_norm": 1.2530918031324108, + "learning_rate": 4.993605167440397e-05, + "loss": 0.7557, + "num_input_tokens_seen": 37638272, + "step": 208 + }, + { + "epoch": 0.02287966282602151, + "grad_norm": 1.2693449852088732, + "learning_rate": 4.9935435574000834e-05, + "loss": 0.6493, + "num_input_tokens_seen": 37796192, + "step": 209 + }, + { + "epoch": 0.022989134896959414, + "grad_norm": 1.3096754471395613, + "learning_rate": 4.993481652378334e-05, + "loss": 0.6706, + "num_input_tokens_seen": 37942688, + "step": 210 + }, + { + "epoch": 0.023098606967897315, + "grad_norm": 1.143613323529403, + "learning_rate": 4.9934194523824715e-05, + "loss": 0.7749, + "num_input_tokens_seen": 38133088, + "step": 211 + }, + { + "epoch": 0.023208079038835216, + "grad_norm": 1.1703050906019363, + "learning_rate": 4.993356957419855e-05, + "loss": 0.7671, + "num_input_tokens_seen": 38315648, + "step": 212 + }, + { + "epoch": 0.02331755110977312, + "grad_norm": 1.26672070956805, + "learning_rate": 4.993294167497876e-05, + "loss": 0.7319, + "num_input_tokens_seen": 38489248, + "step": 213 + }, + { + "epoch": 0.02342702318071102, + "grad_norm": 1.1334037553188188, + "learning_rate": 4.993231082623965e-05, + "loss": 0.6718, + "num_input_tokens_seen": 38708992, + "step": 214 + }, + { + "epoch": 0.02353649525164892, + "grad_norm": 1.4361153443694465, + "learning_rate": 4.993167702805581e-05, + "loss": 0.7682, + "num_input_tokens_seen": 38875200, + "step": 215 + }, + { + "epoch": 0.023645967322586826, + "grad_norm": 1.1923143771791125, + "learning_rate": 4.9931040280502255e-05, + "loss": 0.6635, + "num_input_tokens_seen": 39044992, + "step": 216 + }, + { + "epoch": 0.023755439393524726, + "grad_norm": 1.2571120118252266, + "learning_rate": 4.993040058365429e-05, + "loss": 0.652, + "num_input_tokens_seen": 39208512, + "step": 217 + }, + { + "epoch": 0.02386491146446263, + "grad_norm": 1.318959010767799, + "learning_rate": 4.992975793758759e-05, + "loss": 0.8437, + "num_input_tokens_seen": 39402720, + "step": 218 + }, + { + "epoch": 0.02397438353540053, + "grad_norm": 1.1742430094981298, + "learning_rate": 4.9929112342378194e-05, + "loss": 0.9123, + "num_input_tokens_seen": 39589088, + "step": 219 + }, + { + "epoch": 0.024083855606338432, + "grad_norm": 1.1641526667345434, + "learning_rate": 4.9928463798102456e-05, + "loss": 0.8507, + "num_input_tokens_seen": 39780832, + "step": 220 + }, + { + "epoch": 0.024193327677276336, + "grad_norm": 1.2187799303341047, + "learning_rate": 4.992781230483711e-05, + "loss": 0.7906, + "num_input_tokens_seen": 39963840, + "step": 221 + }, + { + "epoch": 0.024302799748214237, + "grad_norm": 1.1645005189297455, + "learning_rate": 4.9927157862659215e-05, + "loss": 0.7648, + "num_input_tokens_seen": 40166784, + "step": 222 + }, + { + "epoch": 0.024412271819152138, + "grad_norm": 1.2273985327836345, + "learning_rate": 4.992650047164621e-05, + "loss": 0.8414, + "num_input_tokens_seen": 40350912, + "step": 223 + }, + { + "epoch": 0.024521743890090042, + "grad_norm": 1.1950098290298563, + "learning_rate": 4.9925840131875845e-05, + "loss": 0.6663, + "num_input_tokens_seen": 40543104, + "step": 224 + }, + { + "epoch": 0.024631215961027943, + "grad_norm": 1.1948555465600792, + "learning_rate": 4.9925176843426236e-05, + "loss": 0.708, + "num_input_tokens_seen": 40715808, + "step": 225 + }, + { + "epoch": 0.024740688031965843, + "grad_norm": 1.235569725826325, + "learning_rate": 4.9924510606375864e-05, + "loss": 0.7908, + "num_input_tokens_seen": 40924800, + "step": 226 + }, + { + "epoch": 0.024850160102903748, + "grad_norm": 1.1981373735643595, + "learning_rate": 4.992384142080353e-05, + "loss": 0.6441, + "num_input_tokens_seen": 41110272, + "step": 227 + }, + { + "epoch": 0.02495963217384165, + "grad_norm": 1.2093527976666871, + "learning_rate": 4.99231692867884e-05, + "loss": 0.722, + "num_input_tokens_seen": 41304480, + "step": 228 + }, + { + "epoch": 0.02506910424477955, + "grad_norm": 1.2989812086712251, + "learning_rate": 4.9922494204409994e-05, + "loss": 0.8136, + "num_input_tokens_seen": 41474720, + "step": 229 + }, + { + "epoch": 0.025178576315717453, + "grad_norm": 1.3890859252190275, + "learning_rate": 4.9921816173748166e-05, + "loss": 0.7715, + "num_input_tokens_seen": 41645408, + "step": 230 + }, + { + "epoch": 0.025288048386655354, + "grad_norm": 1.2863573763174536, + "learning_rate": 4.9921135194883126e-05, + "loss": 0.7798, + "num_input_tokens_seen": 41828416, + "step": 231 + }, + { + "epoch": 0.025397520457593258, + "grad_norm": 1.4082788878230268, + "learning_rate": 4.992045126789543e-05, + "loss": 0.7704, + "num_input_tokens_seen": 41993056, + "step": 232 + }, + { + "epoch": 0.02550699252853116, + "grad_norm": 1.3258896360225183, + "learning_rate": 4.9919764392865994e-05, + "loss": 0.761, + "num_input_tokens_seen": 42166208, + "step": 233 + }, + { + "epoch": 0.02561646459946906, + "grad_norm": 1.237284310164803, + "learning_rate": 4.9919074569876066e-05, + "loss": 0.8806, + "num_input_tokens_seen": 42380576, + "step": 234 + }, + { + "epoch": 0.025725936670406964, + "grad_norm": 1.2264816354713337, + "learning_rate": 4.991838179900726e-05, + "loss": 0.646, + "num_input_tokens_seen": 42570528, + "step": 235 + }, + { + "epoch": 0.025835408741344865, + "grad_norm": 1.2899247493530506, + "learning_rate": 4.991768608034152e-05, + "loss": 0.8508, + "num_input_tokens_seen": 42769216, + "step": 236 + }, + { + "epoch": 0.025944880812282765, + "grad_norm": 1.1602821153381027, + "learning_rate": 4.991698741396115e-05, + "loss": 0.7227, + "num_input_tokens_seen": 42934304, + "step": 237 + }, + { + "epoch": 0.02605435288322067, + "grad_norm": 1.2286032446831927, + "learning_rate": 4.991628579994879e-05, + "loss": 0.5724, + "num_input_tokens_seen": 43116640, + "step": 238 + }, + { + "epoch": 0.02616382495415857, + "grad_norm": 1.2278681809287362, + "learning_rate": 4.9915581238387464e-05, + "loss": 0.7538, + "num_input_tokens_seen": 43296960, + "step": 239 + }, + { + "epoch": 0.02627329702509647, + "grad_norm": 1.3100046323738022, + "learning_rate": 4.991487372936051e-05, + "loss": 0.7266, + "num_input_tokens_seen": 43471456, + "step": 240 + }, + { + "epoch": 0.026382769096034375, + "grad_norm": 1.4795537451096423, + "learning_rate": 4.991416327295162e-05, + "loss": 0.7425, + "num_input_tokens_seen": 43628928, + "step": 241 + }, + { + "epoch": 0.026492241166972276, + "grad_norm": 1.273879553708021, + "learning_rate": 4.9913449869244844e-05, + "loss": 0.7924, + "num_input_tokens_seen": 43827616, + "step": 242 + }, + { + "epoch": 0.026601713237910177, + "grad_norm": 1.2814357901988167, + "learning_rate": 4.991273351832457e-05, + "loss": 0.6683, + "num_input_tokens_seen": 43983520, + "step": 243 + }, + { + "epoch": 0.02671118530884808, + "grad_norm": 1.261838641537991, + "learning_rate": 4.991201422027556e-05, + "loss": 0.6728, + "num_input_tokens_seen": 44170560, + "step": 244 + }, + { + "epoch": 0.02682065737978598, + "grad_norm": 1.2888158388830369, + "learning_rate": 4.991129197518287e-05, + "loss": 0.6634, + "num_input_tokens_seen": 44338112, + "step": 245 + }, + { + "epoch": 0.026930129450723882, + "grad_norm": 1.3455974830829565, + "learning_rate": 4.991056678313197e-05, + "loss": 0.8105, + "num_input_tokens_seen": 44499168, + "step": 246 + }, + { + "epoch": 0.027039601521661787, + "grad_norm": 1.3327221015782305, + "learning_rate": 4.990983864420865e-05, + "loss": 0.8705, + "num_input_tokens_seen": 44714432, + "step": 247 + }, + { + "epoch": 0.027149073592599687, + "grad_norm": 1.1626558445480044, + "learning_rate": 4.990910755849903e-05, + "loss": 0.7122, + "num_input_tokens_seen": 44886240, + "step": 248 + }, + { + "epoch": 0.02725854566353759, + "grad_norm": 1.3426811047800222, + "learning_rate": 4.99083735260896e-05, + "loss": 0.7587, + "num_input_tokens_seen": 45024672, + "step": 249 + }, + { + "epoch": 0.027368017734475492, + "grad_norm": 1.454467153609092, + "learning_rate": 4.990763654706721e-05, + "loss": 0.7149, + "num_input_tokens_seen": 45193792, + "step": 250 + }, + { + "epoch": 0.027477489805413393, + "grad_norm": 1.1596072146914669, + "learning_rate": 4.990689662151903e-05, + "loss": 0.6512, + "num_input_tokens_seen": 45360896, + "step": 251 + }, + { + "epoch": 0.027586961876351297, + "grad_norm": 1.3565740119183756, + "learning_rate": 4.990615374953258e-05, + "loss": 0.6943, + "num_input_tokens_seen": 45533600, + "step": 252 + }, + { + "epoch": 0.027696433947289198, + "grad_norm": 1.1950208520067578, + "learning_rate": 4.990540793119577e-05, + "loss": 0.7342, + "num_input_tokens_seen": 45733632, + "step": 253 + }, + { + "epoch": 0.0278059060182271, + "grad_norm": 1.2785601875331627, + "learning_rate": 4.99046591665968e-05, + "loss": 0.7208, + "num_input_tokens_seen": 45931648, + "step": 254 + }, + { + "epoch": 0.027915378089165003, + "grad_norm": 1.0932308054647146, + "learning_rate": 4.990390745582427e-05, + "loss": 0.641, + "num_input_tokens_seen": 46127200, + "step": 255 + }, + { + "epoch": 0.028024850160102904, + "grad_norm": 1.3353284373226164, + "learning_rate": 4.990315279896709e-05, + "loss": 0.7197, + "num_input_tokens_seen": 46290272, + "step": 256 + }, + { + "epoch": 0.028134322231040804, + "grad_norm": 1.2790069950529865, + "learning_rate": 4.990239519611454e-05, + "loss": 0.7023, + "num_input_tokens_seen": 46454016, + "step": 257 + }, + { + "epoch": 0.02824379430197871, + "grad_norm": 1.3019729516608185, + "learning_rate": 4.990163464735624e-05, + "loss": 0.6438, + "num_input_tokens_seen": 46614400, + "step": 258 + }, + { + "epoch": 0.02835326637291661, + "grad_norm": 1.1552818988093567, + "learning_rate": 4.990087115278218e-05, + "loss": 0.7123, + "num_input_tokens_seen": 46835712, + "step": 259 + }, + { + "epoch": 0.02846273844385451, + "grad_norm": 1.358224579293587, + "learning_rate": 4.9900104712482656e-05, + "loss": 0.678, + "num_input_tokens_seen": 47001024, + "step": 260 + }, + { + "epoch": 0.028572210514792414, + "grad_norm": 1.326768743615148, + "learning_rate": 4.9899335326548346e-05, + "loss": 0.9007, + "num_input_tokens_seen": 47192768, + "step": 261 + }, + { + "epoch": 0.028681682585730315, + "grad_norm": 1.2723063962808452, + "learning_rate": 4.9898562995070264e-05, + "loss": 0.7573, + "num_input_tokens_seen": 47388320, + "step": 262 + }, + { + "epoch": 0.02879115465666822, + "grad_norm": 1.2027914402288304, + "learning_rate": 4.9897787718139774e-05, + "loss": 0.7314, + "num_input_tokens_seen": 47572672, + "step": 263 + }, + { + "epoch": 0.02890062672760612, + "grad_norm": 1.276393665368892, + "learning_rate": 4.98970094958486e-05, + "loss": 0.8009, + "num_input_tokens_seen": 47783904, + "step": 264 + }, + { + "epoch": 0.02901009879854402, + "grad_norm": 1.267165717938441, + "learning_rate": 4.98962283282888e-05, + "loss": 0.6173, + "num_input_tokens_seen": 47981248, + "step": 265 + }, + { + "epoch": 0.029119570869481925, + "grad_norm": 1.123672921207357, + "learning_rate": 4.989544421555278e-05, + "loss": 0.7067, + "num_input_tokens_seen": 48189792, + "step": 266 + }, + { + "epoch": 0.029229042940419826, + "grad_norm": 1.1009516002781823, + "learning_rate": 4.989465715773331e-05, + "loss": 0.724, + "num_input_tokens_seen": 48370560, + "step": 267 + }, + { + "epoch": 0.029338515011357726, + "grad_norm": 1.2609093493943713, + "learning_rate": 4.989386715492347e-05, + "loss": 0.7957, + "num_input_tokens_seen": 48534528, + "step": 268 + }, + { + "epoch": 0.02944798708229563, + "grad_norm": 1.3028893328561522, + "learning_rate": 4.9893074207216745e-05, + "loss": 0.7524, + "num_input_tokens_seen": 48705888, + "step": 269 + }, + { + "epoch": 0.02955745915323353, + "grad_norm": 1.1488976108325306, + "learning_rate": 4.989227831470692e-05, + "loss": 0.5817, + "num_input_tokens_seen": 48887104, + "step": 270 + }, + { + "epoch": 0.029666931224171432, + "grad_norm": 1.1117681024551747, + "learning_rate": 4.989147947748817e-05, + "loss": 0.6459, + "num_input_tokens_seen": 49045472, + "step": 271 + }, + { + "epoch": 0.029776403295109336, + "grad_norm": 1.220128233767854, + "learning_rate": 4.989067769565498e-05, + "loss": 0.7373, + "num_input_tokens_seen": 49247744, + "step": 272 + }, + { + "epoch": 0.029885875366047237, + "grad_norm": 1.255122892814321, + "learning_rate": 4.9889872969302195e-05, + "loss": 0.6751, + "num_input_tokens_seen": 49440160, + "step": 273 + }, + { + "epoch": 0.029995347436985138, + "grad_norm": 1.1315785494690456, + "learning_rate": 4.988906529852502e-05, + "loss": 0.6307, + "num_input_tokens_seen": 49644224, + "step": 274 + }, + { + "epoch": 0.030104819507923042, + "grad_norm": 1.2651994719818682, + "learning_rate": 4.9888254683419e-05, + "loss": 0.8009, + "num_input_tokens_seen": 49827008, + "step": 275 + }, + { + "epoch": 0.030214291578860943, + "grad_norm": 1.4032940208398472, + "learning_rate": 4.988744112408003e-05, + "loss": 0.7892, + "num_input_tokens_seen": 49971264, + "step": 276 + }, + { + "epoch": 0.030323763649798847, + "grad_norm": 1.2247778671654364, + "learning_rate": 4.9886624620604354e-05, + "loss": 0.6269, + "num_input_tokens_seen": 50165920, + "step": 277 + }, + { + "epoch": 0.030433235720736748, + "grad_norm": 1.301488185927137, + "learning_rate": 4.9885805173088563e-05, + "loss": 0.659, + "num_input_tokens_seen": 50359456, + "step": 278 + }, + { + "epoch": 0.030542707791674648, + "grad_norm": 1.3276435428176037, + "learning_rate": 4.988498278162959e-05, + "loss": 0.8028, + "num_input_tokens_seen": 50552320, + "step": 279 + }, + { + "epoch": 0.030652179862612552, + "grad_norm": 1.4044970713225344, + "learning_rate": 4.988415744632472e-05, + "loss": 0.7648, + "num_input_tokens_seen": 50724128, + "step": 280 + }, + { + "epoch": 0.030761651933550453, + "grad_norm": 1.1980381226818484, + "learning_rate": 4.9883329167271595e-05, + "loss": 0.6505, + "num_input_tokens_seen": 50906912, + "step": 281 + }, + { + "epoch": 0.030871124004488354, + "grad_norm": 1.2422600491048768, + "learning_rate": 4.988249794456821e-05, + "loss": 0.8363, + "num_input_tokens_seen": 51079392, + "step": 282 + }, + { + "epoch": 0.030980596075426258, + "grad_norm": 1.341817175477006, + "learning_rate": 4.988166377831288e-05, + "loss": 0.7892, + "num_input_tokens_seen": 51275840, + "step": 283 + }, + { + "epoch": 0.03109006814636416, + "grad_norm": 1.3366942911234634, + "learning_rate": 4.988082666860429e-05, + "loss": 0.7993, + "num_input_tokens_seen": 51464896, + "step": 284 + }, + { + "epoch": 0.03119954021730206, + "grad_norm": 1.4543506826507941, + "learning_rate": 4.9879986615541464e-05, + "loss": 1.0127, + "num_input_tokens_seen": 51674336, + "step": 285 + }, + { + "epoch": 0.031309012288239964, + "grad_norm": 1.169815746454706, + "learning_rate": 4.987914361922379e-05, + "loss": 0.6851, + "num_input_tokens_seen": 51849280, + "step": 286 + }, + { + "epoch": 0.031418484359177865, + "grad_norm": 1.3680953581044277, + "learning_rate": 4.9878297679750986e-05, + "loss": 0.8241, + "num_input_tokens_seen": 52045280, + "step": 287 + }, + { + "epoch": 0.031527956430115765, + "grad_norm": 1.3444737547883776, + "learning_rate": 4.987744879722312e-05, + "loss": 0.7814, + "num_input_tokens_seen": 52237920, + "step": 288 + }, + { + "epoch": 0.031637428501053666, + "grad_norm": 1.286449720634428, + "learning_rate": 4.987659697174063e-05, + "loss": 0.7596, + "num_input_tokens_seen": 52413088, + "step": 289 + }, + { + "epoch": 0.031746900571991574, + "grad_norm": 1.2511854272847438, + "learning_rate": 4.987574220340427e-05, + "loss": 0.817, + "num_input_tokens_seen": 52598336, + "step": 290 + }, + { + "epoch": 0.031856372642929474, + "grad_norm": 1.2419001408850954, + "learning_rate": 4.9874884492315155e-05, + "loss": 0.7345, + "num_input_tokens_seen": 52771936, + "step": 291 + }, + { + "epoch": 0.031965844713867375, + "grad_norm": 1.145282693972077, + "learning_rate": 4.987402383857477e-05, + "loss": 0.6996, + "num_input_tokens_seen": 52943296, + "step": 292 + }, + { + "epoch": 0.032075316784805276, + "grad_norm": 1.433084007505404, + "learning_rate": 4.98731602422849e-05, + "loss": 1.0966, + "num_input_tokens_seen": 53133696, + "step": 293 + }, + { + "epoch": 0.03218478885574318, + "grad_norm": 1.148897247673468, + "learning_rate": 4.9872293703547735e-05, + "loss": 0.8492, + "num_input_tokens_seen": 53339552, + "step": 294 + }, + { + "epoch": 0.03229426092668108, + "grad_norm": 1.2915470570384724, + "learning_rate": 4.987142422246577e-05, + "loss": 0.7442, + "num_input_tokens_seen": 53519200, + "step": 295 + }, + { + "epoch": 0.032403732997618985, + "grad_norm": 1.2387927710639086, + "learning_rate": 4.987055179914186e-05, + "loss": 0.8712, + "num_input_tokens_seen": 53711392, + "step": 296 + }, + { + "epoch": 0.032513205068556886, + "grad_norm": 1.194623761184304, + "learning_rate": 4.9869676433679225e-05, + "loss": 0.7391, + "num_input_tokens_seen": 53878944, + "step": 297 + }, + { + "epoch": 0.032622677139494786, + "grad_norm": 1.5045169531645761, + "learning_rate": 4.98687981261814e-05, + "loss": 0.7223, + "num_input_tokens_seen": 54061504, + "step": 298 + }, + { + "epoch": 0.03273214921043269, + "grad_norm": 1.1700454162683926, + "learning_rate": 4.9867916876752306e-05, + "loss": 0.6751, + "num_input_tokens_seen": 54237120, + "step": 299 + }, + { + "epoch": 0.03284162128137059, + "grad_norm": 1.3449521564638944, + "learning_rate": 4.9867032685496185e-05, + "loss": 0.8674, + "num_input_tokens_seen": 54426400, + "step": 300 + }, + { + "epoch": 0.032951093352308496, + "grad_norm": 1.3031536840122362, + "learning_rate": 4.986614555251763e-05, + "loss": 0.7386, + "num_input_tokens_seen": 54585664, + "step": 301 + }, + { + "epoch": 0.033060565423246396, + "grad_norm": 1.2556425533330426, + "learning_rate": 4.98652554779216e-05, + "loss": 0.871, + "num_input_tokens_seen": 54773600, + "step": 302 + }, + { + "epoch": 0.0331700374941843, + "grad_norm": 1.2368933182757873, + "learning_rate": 4.9864362461813373e-05, + "loss": 0.7653, + "num_input_tokens_seen": 54960416, + "step": 303 + }, + { + "epoch": 0.0332795095651222, + "grad_norm": 1.151554212190988, + "learning_rate": 4.9863466504298604e-05, + "loss": 0.6705, + "num_input_tokens_seen": 55148352, + "step": 304 + }, + { + "epoch": 0.0333889816360601, + "grad_norm": 1.2471428643184332, + "learning_rate": 4.9862567605483277e-05, + "loss": 0.7387, + "num_input_tokens_seen": 55343008, + "step": 305 + }, + { + "epoch": 0.033498453706998, + "grad_norm": 1.1767630802527322, + "learning_rate": 4.986166576547373e-05, + "loss": 0.7375, + "num_input_tokens_seen": 55535200, + "step": 306 + }, + { + "epoch": 0.03360792577793591, + "grad_norm": 1.1443740684717993, + "learning_rate": 4.9860760984376656e-05, + "loss": 0.6705, + "num_input_tokens_seen": 55732768, + "step": 307 + }, + { + "epoch": 0.03371739784887381, + "grad_norm": 1.3209934622915407, + "learning_rate": 4.985985326229907e-05, + "loss": 0.8133, + "num_input_tokens_seen": 55904576, + "step": 308 + }, + { + "epoch": 0.03382686991981171, + "grad_norm": 1.2601774740116845, + "learning_rate": 4.985894259934838e-05, + "loss": 0.7343, + "num_input_tokens_seen": 56068992, + "step": 309 + }, + { + "epoch": 0.03393634199074961, + "grad_norm": 1.176101590238498, + "learning_rate": 4.98580289956323e-05, + "loss": 0.6467, + "num_input_tokens_seen": 56259168, + "step": 310 + }, + { + "epoch": 0.03404581406168751, + "grad_norm": 1.345867263240265, + "learning_rate": 4.985711245125891e-05, + "loss": 0.7879, + "num_input_tokens_seen": 56464576, + "step": 311 + }, + { + "epoch": 0.03415528613262541, + "grad_norm": 1.2606013874647302, + "learning_rate": 4.9856192966336634e-05, + "loss": 0.7653, + "num_input_tokens_seen": 56620256, + "step": 312 + }, + { + "epoch": 0.03426475820356332, + "grad_norm": 1.2462876028588312, + "learning_rate": 4.985527054097425e-05, + "loss": 0.8726, + "num_input_tokens_seen": 56830592, + "step": 313 + }, + { + "epoch": 0.03437423027450122, + "grad_norm": 1.3290626357612387, + "learning_rate": 4.985434517528087e-05, + "loss": 0.7298, + "num_input_tokens_seen": 57016512, + "step": 314 + }, + { + "epoch": 0.03448370234543912, + "grad_norm": 1.2304029174632443, + "learning_rate": 4.985341686936598e-05, + "loss": 0.7489, + "num_input_tokens_seen": 57160096, + "step": 315 + }, + { + "epoch": 0.03459317441637702, + "grad_norm": 1.4706401431515475, + "learning_rate": 4.9852485623339376e-05, + "loss": 0.8126, + "num_input_tokens_seen": 57318016, + "step": 316 + }, + { + "epoch": 0.03470264648731492, + "grad_norm": 1.3214381458789868, + "learning_rate": 4.985155143731124e-05, + "loss": 0.727, + "num_input_tokens_seen": 57530816, + "step": 317 + }, + { + "epoch": 0.03481211855825283, + "grad_norm": 1.3309534360420159, + "learning_rate": 4.985061431139207e-05, + "loss": 0.5959, + "num_input_tokens_seen": 57644608, + "step": 318 + }, + { + "epoch": 0.03492159062919073, + "grad_norm": 1.1780759401503405, + "learning_rate": 4.9849674245692735e-05, + "loss": 0.701, + "num_input_tokens_seen": 57848000, + "step": 319 + }, + { + "epoch": 0.03503106270012863, + "grad_norm": 1.4244923239407739, + "learning_rate": 4.9848731240324444e-05, + "loss": 0.793, + "num_input_tokens_seen": 58049824, + "step": 320 + }, + { + "epoch": 0.03514053477106653, + "grad_norm": 1.3973502082594418, + "learning_rate": 4.984778529539875e-05, + "loss": 0.6787, + "num_input_tokens_seen": 58230816, + "step": 321 + }, + { + "epoch": 0.03525000684200443, + "grad_norm": 1.1945119433342848, + "learning_rate": 4.984683641102755e-05, + "loss": 0.6446, + "num_input_tokens_seen": 58373504, + "step": 322 + }, + { + "epoch": 0.03535947891294233, + "grad_norm": 1.2692012416632752, + "learning_rate": 4.984588458732311e-05, + "loss": 0.8173, + "num_input_tokens_seen": 58537920, + "step": 323 + }, + { + "epoch": 0.03546895098388024, + "grad_norm": 1.2326143963787772, + "learning_rate": 4.984492982439802e-05, + "loss": 0.6818, + "num_input_tokens_seen": 58722496, + "step": 324 + }, + { + "epoch": 0.03557842305481814, + "grad_norm": 1.5308261985522558, + "learning_rate": 4.984397212236522e-05, + "loss": 0.9523, + "num_input_tokens_seen": 58920064, + "step": 325 + }, + { + "epoch": 0.03568789512575604, + "grad_norm": 1.2088151113810222, + "learning_rate": 4.984301148133802e-05, + "loss": 0.8442, + "num_input_tokens_seen": 59106208, + "step": 326 + }, + { + "epoch": 0.03579736719669394, + "grad_norm": 1.211121821988035, + "learning_rate": 4.9842047901430044e-05, + "loss": 0.7644, + "num_input_tokens_seen": 59300864, + "step": 327 + }, + { + "epoch": 0.03590683926763184, + "grad_norm": 1.0882803802596297, + "learning_rate": 4.98410813827553e-05, + "loss": 0.5848, + "num_input_tokens_seen": 59449600, + "step": 328 + }, + { + "epoch": 0.036016311338569744, + "grad_norm": 1.1651982137175938, + "learning_rate": 4.984011192542811e-05, + "loss": 0.8383, + "num_input_tokens_seen": 59623872, + "step": 329 + }, + { + "epoch": 0.03612578340950765, + "grad_norm": 1.1633255437100112, + "learning_rate": 4.983913952956317e-05, + "loss": 0.6117, + "num_input_tokens_seen": 59810240, + "step": 330 + }, + { + "epoch": 0.03623525548044555, + "grad_norm": 1.3228628142602232, + "learning_rate": 4.983816419527551e-05, + "loss": 0.778, + "num_input_tokens_seen": 59987200, + "step": 331 + }, + { + "epoch": 0.03634472755138345, + "grad_norm": 1.2695243952639461, + "learning_rate": 4.983718592268051e-05, + "loss": 0.9366, + "num_input_tokens_seen": 60187680, + "step": 332 + }, + { + "epoch": 0.036454199622321354, + "grad_norm": 1.3638051073109894, + "learning_rate": 4.983620471189389e-05, + "loss": 0.867, + "num_input_tokens_seen": 60373376, + "step": 333 + }, + { + "epoch": 0.036563671693259255, + "grad_norm": 1.0659383229275146, + "learning_rate": 4.9835220563031726e-05, + "loss": 0.6922, + "num_input_tokens_seen": 60568032, + "step": 334 + }, + { + "epoch": 0.03667314376419716, + "grad_norm": 1.1704396931243783, + "learning_rate": 4.9834233476210456e-05, + "loss": 0.7688, + "num_input_tokens_seen": 60769632, + "step": 335 + }, + { + "epoch": 0.03678261583513506, + "grad_norm": 1.2292017635356394, + "learning_rate": 4.9833243451546834e-05, + "loss": 0.7368, + "num_input_tokens_seen": 60954208, + "step": 336 + }, + { + "epoch": 0.036892087906072964, + "grad_norm": 1.355771165691354, + "learning_rate": 4.9832250489157994e-05, + "loss": 0.7211, + "num_input_tokens_seen": 61097120, + "step": 337 + }, + { + "epoch": 0.037001559977010864, + "grad_norm": 1.149985535157496, + "learning_rate": 4.983125458916138e-05, + "loss": 0.7405, + "num_input_tokens_seen": 61266464, + "step": 338 + }, + { + "epoch": 0.037111032047948765, + "grad_norm": 1.465483340186568, + "learning_rate": 4.9830255751674825e-05, + "loss": 0.8812, + "num_input_tokens_seen": 61469856, + "step": 339 + }, + { + "epoch": 0.037220504118886666, + "grad_norm": 1.1616895414168138, + "learning_rate": 4.982925397681648e-05, + "loss": 0.6313, + "num_input_tokens_seen": 61642336, + "step": 340 + }, + { + "epoch": 0.037329976189824574, + "grad_norm": 1.3530234670979289, + "learning_rate": 4.982824926470486e-05, + "loss": 0.6163, + "num_input_tokens_seen": 61794432, + "step": 341 + }, + { + "epoch": 0.037439448260762474, + "grad_norm": 1.2309877300255225, + "learning_rate": 4.982724161545881e-05, + "loss": 0.851, + "num_input_tokens_seen": 61994912, + "step": 342 + }, + { + "epoch": 0.037548920331700375, + "grad_norm": 1.165854335856589, + "learning_rate": 4.982623102919754e-05, + "loss": 0.5694, + "num_input_tokens_seen": 62163584, + "step": 343 + }, + { + "epoch": 0.037658392402638276, + "grad_norm": 1.2877582772458636, + "learning_rate": 4.98252175060406e-05, + "loss": 0.6746, + "num_input_tokens_seen": 62340096, + "step": 344 + }, + { + "epoch": 0.037767864473576176, + "grad_norm": 1.3084733594486284, + "learning_rate": 4.9824201046107885e-05, + "loss": 0.7022, + "num_input_tokens_seen": 62522880, + "step": 345 + }, + { + "epoch": 0.037877336544514084, + "grad_norm": 1.4946208506245262, + "learning_rate": 4.9823181649519645e-05, + "loss": 0.7107, + "num_input_tokens_seen": 62707232, + "step": 346 + }, + { + "epoch": 0.037986808615451985, + "grad_norm": 1.4122702690259603, + "learning_rate": 4.9822159316396465e-05, + "loss": 0.8567, + "num_input_tokens_seen": 62868064, + "step": 347 + }, + { + "epoch": 0.038096280686389886, + "grad_norm": 1.351113525965865, + "learning_rate": 4.9821134046859295e-05, + "loss": 0.6395, + "num_input_tokens_seen": 63034496, + "step": 348 + }, + { + "epoch": 0.038205752757327786, + "grad_norm": 1.3495345584872556, + "learning_rate": 4.9820105841029416e-05, + "loss": 0.7132, + "num_input_tokens_seen": 63191072, + "step": 349 + }, + { + "epoch": 0.03831522482826569, + "grad_norm": 1.2242482700939392, + "learning_rate": 4.9819074699028455e-05, + "loss": 0.6755, + "num_input_tokens_seen": 63329280, + "step": 350 + }, + { + "epoch": 0.03842469689920359, + "grad_norm": 1.1848727325311514, + "learning_rate": 4.981804062097841e-05, + "loss": 0.757, + "num_input_tokens_seen": 63528864, + "step": 351 + }, + { + "epoch": 0.038534168970141496, + "grad_norm": 1.2119747026812357, + "learning_rate": 4.9817003607001614e-05, + "loss": 0.6294, + "num_input_tokens_seen": 63730240, + "step": 352 + }, + { + "epoch": 0.038643641041079396, + "grad_norm": 1.4057230915938506, + "learning_rate": 4.981596365722072e-05, + "loss": 0.9477, + "num_input_tokens_seen": 63911008, + "step": 353 + }, + { + "epoch": 0.0387531131120173, + "grad_norm": 1.2076062064525301, + "learning_rate": 4.981492077175877e-05, + "loss": 0.7056, + "num_input_tokens_seen": 64113952, + "step": 354 + }, + { + "epoch": 0.0388625851829552, + "grad_norm": 1.1555517511172717, + "learning_rate": 4.9813874950739124e-05, + "loss": 0.7086, + "num_input_tokens_seen": 64312192, + "step": 355 + }, + { + "epoch": 0.0389720572538931, + "grad_norm": 1.2747213526647274, + "learning_rate": 4.9812826194285515e-05, + "loss": 0.7446, + "num_input_tokens_seen": 64516032, + "step": 356 + }, + { + "epoch": 0.039081529324831, + "grad_norm": 1.214009283561341, + "learning_rate": 4.9811774502522e-05, + "loss": 0.6315, + "num_input_tokens_seen": 64714272, + "step": 357 + }, + { + "epoch": 0.03919100139576891, + "grad_norm": 1.1722387639355545, + "learning_rate": 4.9810719875573e-05, + "loss": 0.6953, + "num_input_tokens_seen": 64905120, + "step": 358 + }, + { + "epoch": 0.03930047346670681, + "grad_norm": 1.3651850395402259, + "learning_rate": 4.980966231356326e-05, + "loss": 0.9811, + "num_input_tokens_seen": 65097760, + "step": 359 + }, + { + "epoch": 0.03940994553764471, + "grad_norm": 1.389781481766964, + "learning_rate": 4.98086018166179e-05, + "loss": 0.734, + "num_input_tokens_seen": 65270464, + "step": 360 + }, + { + "epoch": 0.03951941760858261, + "grad_norm": 1.2461416301078714, + "learning_rate": 4.980753838486236e-05, + "loss": 0.6851, + "num_input_tokens_seen": 65447872, + "step": 361 + }, + { + "epoch": 0.03962888967952051, + "grad_norm": 1.1248373337870143, + "learning_rate": 4.980647201842247e-05, + "loss": 0.721, + "num_input_tokens_seen": 65661344, + "step": 362 + }, + { + "epoch": 0.03973836175045842, + "grad_norm": 1.4575673771198345, + "learning_rate": 4.980540271742435e-05, + "loss": 0.7708, + "num_input_tokens_seen": 65852192, + "step": 363 + }, + { + "epoch": 0.03984783382139632, + "grad_norm": 1.26735202083492, + "learning_rate": 4.980433048199451e-05, + "loss": 0.5962, + "num_input_tokens_seen": 66034304, + "step": 364 + }, + { + "epoch": 0.03995730589233422, + "grad_norm": 1.2587341343960408, + "learning_rate": 4.98032553122598e-05, + "loss": 0.7858, + "num_input_tokens_seen": 66215296, + "step": 365 + }, + { + "epoch": 0.04006677796327212, + "grad_norm": 1.3417287306715895, + "learning_rate": 4.98021772083474e-05, + "loss": 0.8307, + "num_input_tokens_seen": 66372320, + "step": 366 + }, + { + "epoch": 0.04017625003421002, + "grad_norm": 1.241964356155135, + "learning_rate": 4.980109617038484e-05, + "loss": 0.8013, + "num_input_tokens_seen": 66570560, + "step": 367 + }, + { + "epoch": 0.04028572210514792, + "grad_norm": 1.3731424181757574, + "learning_rate": 4.980001219850002e-05, + "loss": 0.9296, + "num_input_tokens_seen": 66772160, + "step": 368 + }, + { + "epoch": 0.04039519417608583, + "grad_norm": 1.2230850296623865, + "learning_rate": 4.979892529282117e-05, + "loss": 0.7534, + "num_input_tokens_seen": 66947104, + "step": 369 + }, + { + "epoch": 0.04050466624702373, + "grad_norm": 1.2586696855214523, + "learning_rate": 4.979783545347686e-05, + "loss": 0.8313, + "num_input_tokens_seen": 67143328, + "step": 370 + }, + { + "epoch": 0.04061413831796163, + "grad_norm": 1.2508590130814083, + "learning_rate": 4.9796742680596034e-05, + "loss": 0.5908, + "num_input_tokens_seen": 67328576, + "step": 371 + }, + { + "epoch": 0.04072361038889953, + "grad_norm": 1.2909696400542299, + "learning_rate": 4.9795646974307936e-05, + "loss": 0.6283, + "num_input_tokens_seen": 67503968, + "step": 372 + }, + { + "epoch": 0.04083308245983743, + "grad_norm": 1.1881689897688337, + "learning_rate": 4.979454833474221e-05, + "loss": 0.5829, + "num_input_tokens_seen": 67653600, + "step": 373 + }, + { + "epoch": 0.04094255453077534, + "grad_norm": 1.2759572007645803, + "learning_rate": 4.9793446762028816e-05, + "loss": 0.7869, + "num_input_tokens_seen": 67869312, + "step": 374 + }, + { + "epoch": 0.04105202660171324, + "grad_norm": 1.1344336124126786, + "learning_rate": 4.9792342256298064e-05, + "loss": 0.6395, + "num_input_tokens_seen": 68077856, + "step": 375 + }, + { + "epoch": 0.04116149867265114, + "grad_norm": 1.3911581410451872, + "learning_rate": 4.979123481768062e-05, + "loss": 0.9702, + "num_input_tokens_seen": 68269376, + "step": 376 + }, + { + "epoch": 0.04127097074358904, + "grad_norm": 1.2514068316209317, + "learning_rate": 4.979012444630748e-05, + "loss": 0.6403, + "num_input_tokens_seen": 68445664, + "step": 377 + }, + { + "epoch": 0.04138044281452694, + "grad_norm": 1.2438801203765992, + "learning_rate": 4.978901114231003e-05, + "loss": 0.6456, + "num_input_tokens_seen": 68643232, + "step": 378 + }, + { + "epoch": 0.04148991488546484, + "grad_norm": 1.2697865860263406, + "learning_rate": 4.978789490581993e-05, + "loss": 0.589, + "num_input_tokens_seen": 68792640, + "step": 379 + }, + { + "epoch": 0.04159938695640275, + "grad_norm": 1.2117188916676658, + "learning_rate": 4.978677573696926e-05, + "loss": 0.7765, + "num_input_tokens_seen": 68967584, + "step": 380 + }, + { + "epoch": 0.04170885902734065, + "grad_norm": 1.2426002824253441, + "learning_rate": 4.978565363589041e-05, + "loss": 0.7147, + "num_input_tokens_seen": 69158656, + "step": 381 + }, + { + "epoch": 0.04181833109827855, + "grad_norm": 1.2314611883145954, + "learning_rate": 4.97845286027161e-05, + "loss": 0.5852, + "num_input_tokens_seen": 69353984, + "step": 382 + }, + { + "epoch": 0.04192780316921645, + "grad_norm": 1.4894521517666635, + "learning_rate": 4.978340063757945e-05, + "loss": 1.0631, + "num_input_tokens_seen": 69513472, + "step": 383 + }, + { + "epoch": 0.042037275240154354, + "grad_norm": 1.3309451958674974, + "learning_rate": 4.978226974061388e-05, + "loss": 0.8497, + "num_input_tokens_seen": 69715744, + "step": 384 + }, + { + "epoch": 0.042146747311092254, + "grad_norm": 1.1721790682174904, + "learning_rate": 4.978113591195317e-05, + "loss": 0.7276, + "num_input_tokens_seen": 69892704, + "step": 385 + }, + { + "epoch": 0.04225621938203016, + "grad_norm": 1.3413446322259721, + "learning_rate": 4.9779999151731456e-05, + "loss": 0.8139, + "num_input_tokens_seen": 70064288, + "step": 386 + }, + { + "epoch": 0.04236569145296806, + "grad_norm": 1.200805409671764, + "learning_rate": 4.977885946008322e-05, + "loss": 0.6561, + "num_input_tokens_seen": 70246848, + "step": 387 + }, + { + "epoch": 0.042475163523905964, + "grad_norm": 1.1915468727045486, + "learning_rate": 4.977771683714327e-05, + "loss": 0.8013, + "num_input_tokens_seen": 70442176, + "step": 388 + }, + { + "epoch": 0.042584635594843864, + "grad_norm": 1.2839875168119266, + "learning_rate": 4.9776571283046794e-05, + "loss": 0.65, + "num_input_tokens_seen": 70567616, + "step": 389 + }, + { + "epoch": 0.042694107665781765, + "grad_norm": 1.312239212285397, + "learning_rate": 4.977542279792929e-05, + "loss": 0.7558, + "num_input_tokens_seen": 70780192, + "step": 390 + }, + { + "epoch": 0.04280357973671967, + "grad_norm": 1.2041577452138825, + "learning_rate": 4.9774271381926644e-05, + "loss": 0.5578, + "num_input_tokens_seen": 70967680, + "step": 391 + }, + { + "epoch": 0.04291305180765757, + "grad_norm": 1.1542764264924168, + "learning_rate": 4.977311703517504e-05, + "loss": 0.6114, + "num_input_tokens_seen": 71117312, + "step": 392 + }, + { + "epoch": 0.043022523878595474, + "grad_norm": 1.381836411978165, + "learning_rate": 4.977195975781106e-05, + "loss": 0.7356, + "num_input_tokens_seen": 71279712, + "step": 393 + }, + { + "epoch": 0.043131995949533375, + "grad_norm": 1.4904673782919784, + "learning_rate": 4.977079954997159e-05, + "loss": 0.9309, + "num_input_tokens_seen": 71480192, + "step": 394 + }, + { + "epoch": 0.043241468020471276, + "grad_norm": 1.1434260946759527, + "learning_rate": 4.9769636411793894e-05, + "loss": 0.5393, + "num_input_tokens_seen": 71618176, + "step": 395 + }, + { + "epoch": 0.043350940091409176, + "grad_norm": 1.2110931788632529, + "learning_rate": 4.976847034341555e-05, + "loss": 0.6832, + "num_input_tokens_seen": 71796256, + "step": 396 + }, + { + "epoch": 0.043460412162347084, + "grad_norm": 1.4374812581043006, + "learning_rate": 4.976730134497453e-05, + "loss": 0.9092, + "num_input_tokens_seen": 72005472, + "step": 397 + }, + { + "epoch": 0.043569884233284985, + "grad_norm": 1.2451390642226412, + "learning_rate": 4.97661294166091e-05, + "loss": 0.7243, + "num_input_tokens_seen": 72199904, + "step": 398 + }, + { + "epoch": 0.043679356304222886, + "grad_norm": 1.2014433763284218, + "learning_rate": 4.97649545584579e-05, + "loss": 0.808, + "num_input_tokens_seen": 72400384, + "step": 399 + }, + { + "epoch": 0.043788828375160786, + "grad_norm": 1.2762535645886295, + "learning_rate": 4.976377677065992e-05, + "loss": 0.8544, + "num_input_tokens_seen": 72584288, + "step": 400 + }, + { + "epoch": 0.04389830044609869, + "grad_norm": 1.4835513590481024, + "learning_rate": 4.9762596053354496e-05, + "loss": 0.8907, + "num_input_tokens_seen": 72768416, + "step": 401 + }, + { + "epoch": 0.04400777251703659, + "grad_norm": 1.2900772262345568, + "learning_rate": 4.976141240668129e-05, + "loss": 0.7397, + "num_input_tokens_seen": 72907072, + "step": 402 + }, + { + "epoch": 0.044117244587974495, + "grad_norm": 1.200652120145898, + "learning_rate": 4.976022583078033e-05, + "loss": 0.6544, + "num_input_tokens_seen": 73077088, + "step": 403 + }, + { + "epoch": 0.044226716658912396, + "grad_norm": 1.156125139247784, + "learning_rate": 4.975903632579199e-05, + "loss": 0.8024, + "num_input_tokens_seen": 73252032, + "step": 404 + }, + { + "epoch": 0.0443361887298503, + "grad_norm": 1.1928257724742113, + "learning_rate": 4.9757843891856986e-05, + "loss": 0.7861, + "num_input_tokens_seen": 73444672, + "step": 405 + }, + { + "epoch": 0.0444456608007882, + "grad_norm": 1.1931692618167635, + "learning_rate": 4.975664852911638e-05, + "loss": 0.594, + "num_input_tokens_seen": 73597440, + "step": 406 + }, + { + "epoch": 0.0445551328717261, + "grad_norm": 1.2925863939552813, + "learning_rate": 4.9755450237711575e-05, + "loss": 0.8075, + "num_input_tokens_seen": 73773504, + "step": 407 + }, + { + "epoch": 0.044664604942664006, + "grad_norm": 1.4103032508376063, + "learning_rate": 4.975424901778434e-05, + "loss": 0.7298, + "num_input_tokens_seen": 73987200, + "step": 408 + }, + { + "epoch": 0.04477407701360191, + "grad_norm": 1.231666248658711, + "learning_rate": 4.975304486947676e-05, + "loss": 0.6233, + "num_input_tokens_seen": 74141088, + "step": 409 + }, + { + "epoch": 0.04488354908453981, + "grad_norm": 1.2021138289278537, + "learning_rate": 4.975183779293129e-05, + "loss": 0.7581, + "num_input_tokens_seen": 74304608, + "step": 410 + }, + { + "epoch": 0.04499302115547771, + "grad_norm": 1.335698201124187, + "learning_rate": 4.975062778829073e-05, + "loss": 0.7161, + "num_input_tokens_seen": 74489408, + "step": 411 + }, + { + "epoch": 0.04510249322641561, + "grad_norm": 1.2748633517172567, + "learning_rate": 4.9749414855698216e-05, + "loss": 0.6762, + "num_input_tokens_seen": 74692800, + "step": 412 + }, + { + "epoch": 0.04521196529735351, + "grad_norm": 1.2898234577766194, + "learning_rate": 4.974819899529725e-05, + "loss": 0.6548, + "num_input_tokens_seen": 74869984, + "step": 413 + }, + { + "epoch": 0.04532143736829142, + "grad_norm": 1.2797452883312295, + "learning_rate": 4.9746980207231634e-05, + "loss": 0.7919, + "num_input_tokens_seen": 75069792, + "step": 414 + }, + { + "epoch": 0.04543090943922932, + "grad_norm": 1.3125094004703117, + "learning_rate": 4.9745758491645576e-05, + "loss": 0.7385, + "num_input_tokens_seen": 75261536, + "step": 415 + }, + { + "epoch": 0.04554038151016722, + "grad_norm": 1.1094454890678003, + "learning_rate": 4.97445338486836e-05, + "loss": 0.6277, + "num_input_tokens_seen": 75421920, + "step": 416 + }, + { + "epoch": 0.04564985358110512, + "grad_norm": 1.3827649124491792, + "learning_rate": 4.974330627849057e-05, + "loss": 0.9815, + "num_input_tokens_seen": 75626880, + "step": 417 + }, + { + "epoch": 0.04575932565204302, + "grad_norm": 1.149022375116371, + "learning_rate": 4.974207578121171e-05, + "loss": 0.7137, + "num_input_tokens_seen": 75814592, + "step": 418 + }, + { + "epoch": 0.04586879772298093, + "grad_norm": 1.220575383941238, + "learning_rate": 4.974084235699258e-05, + "loss": 0.6356, + "num_input_tokens_seen": 75995584, + "step": 419 + }, + { + "epoch": 0.04597826979391883, + "grad_norm": 1.16018481492349, + "learning_rate": 4.973960600597909e-05, + "loss": 0.6455, + "num_input_tokens_seen": 76195840, + "step": 420 + }, + { + "epoch": 0.04608774186485673, + "grad_norm": 1.1156656530330245, + "learning_rate": 4.973836672831751e-05, + "loss": 0.6132, + "num_input_tokens_seen": 76358912, + "step": 421 + }, + { + "epoch": 0.04619721393579463, + "grad_norm": 1.242815446780603, + "learning_rate": 4.973712452415444e-05, + "loss": 0.7666, + "num_input_tokens_seen": 76554016, + "step": 422 + }, + { + "epoch": 0.04630668600673253, + "grad_norm": 1.3795927678350834, + "learning_rate": 4.9735879393636826e-05, + "loss": 0.7632, + "num_input_tokens_seen": 76725152, + "step": 423 + }, + { + "epoch": 0.04641615807767043, + "grad_norm": 1.1096753533186534, + "learning_rate": 4.9734631336911964e-05, + "loss": 0.5968, + "num_input_tokens_seen": 76925856, + "step": 424 + }, + { + "epoch": 0.04652563014860834, + "grad_norm": 1.3731264007241732, + "learning_rate": 4.97333803541275e-05, + "loss": 0.7795, + "num_input_tokens_seen": 77099680, + "step": 425 + }, + { + "epoch": 0.04663510221954624, + "grad_norm": 1.2315516475111248, + "learning_rate": 4.973212644543143e-05, + "loss": 0.8507, + "num_input_tokens_seen": 77301280, + "step": 426 + }, + { + "epoch": 0.04674457429048414, + "grad_norm": 1.2928011229496035, + "learning_rate": 4.973086961097207e-05, + "loss": 0.6908, + "num_input_tokens_seen": 77466816, + "step": 427 + }, + { + "epoch": 0.04685404636142204, + "grad_norm": 1.3160059289074018, + "learning_rate": 4.972960985089812e-05, + "loss": 0.7843, + "num_input_tokens_seen": 77651392, + "step": 428 + }, + { + "epoch": 0.04696351843235994, + "grad_norm": 1.381453196221142, + "learning_rate": 4.97283471653586e-05, + "loss": 0.741, + "num_input_tokens_seen": 77816256, + "step": 429 + }, + { + "epoch": 0.04707299050329784, + "grad_norm": 1.2848142963212599, + "learning_rate": 4.972708155450288e-05, + "loss": 0.8123, + "num_input_tokens_seen": 77988960, + "step": 430 + }, + { + "epoch": 0.04718246257423575, + "grad_norm": 1.2320781052992047, + "learning_rate": 4.972581301848068e-05, + "loss": 0.7105, + "num_input_tokens_seen": 78155392, + "step": 431 + }, + { + "epoch": 0.04729193464517365, + "grad_norm": 1.1839771216411668, + "learning_rate": 4.972454155744207e-05, + "loss": 0.6943, + "num_input_tokens_seen": 78351840, + "step": 432 + }, + { + "epoch": 0.04740140671611155, + "grad_norm": 1.2871689731999223, + "learning_rate": 4.9723267171537455e-05, + "loss": 0.8766, + "num_input_tokens_seen": 78511328, + "step": 433 + }, + { + "epoch": 0.04751087878704945, + "grad_norm": 1.24918276774943, + "learning_rate": 4.9721989860917605e-05, + "loss": 0.7637, + "num_input_tokens_seen": 78708000, + "step": 434 + }, + { + "epoch": 0.047620350857987354, + "grad_norm": 1.224305378296055, + "learning_rate": 4.9720709625733614e-05, + "loss": 0.6539, + "num_input_tokens_seen": 78898400, + "step": 435 + }, + { + "epoch": 0.04772982292892526, + "grad_norm": 1.140988696915777, + "learning_rate": 4.971942646613693e-05, + "loss": 0.5621, + "num_input_tokens_seen": 79078272, + "step": 436 + }, + { + "epoch": 0.04783929499986316, + "grad_norm": 1.356373465758899, + "learning_rate": 4.971814038227934e-05, + "loss": 0.757, + "num_input_tokens_seen": 79252544, + "step": 437 + }, + { + "epoch": 0.04794876707080106, + "grad_norm": 1.198563387350333, + "learning_rate": 4.971685137431301e-05, + "loss": 0.5888, + "num_input_tokens_seen": 79420096, + "step": 438 + }, + { + "epoch": 0.04805823914173896, + "grad_norm": 1.2042754976575845, + "learning_rate": 4.971555944239041e-05, + "loss": 0.7311, + "num_input_tokens_seen": 79601760, + "step": 439 + }, + { + "epoch": 0.048167711212676864, + "grad_norm": 1.3334662798325192, + "learning_rate": 4.971426458666437e-05, + "loss": 0.8492, + "num_input_tokens_seen": 79765056, + "step": 440 + }, + { + "epoch": 0.048277183283614765, + "grad_norm": 1.3399496231288672, + "learning_rate": 4.9712966807288085e-05, + "loss": 0.6302, + "num_input_tokens_seen": 79953664, + "step": 441 + }, + { + "epoch": 0.04838665535455267, + "grad_norm": 1.2187090157684715, + "learning_rate": 4.971166610441507e-05, + "loss": 0.7277, + "num_input_tokens_seen": 80133536, + "step": 442 + }, + { + "epoch": 0.04849612742549057, + "grad_norm": 1.3320444056065437, + "learning_rate": 4.9710362478199186e-05, + "loss": 0.7773, + "num_input_tokens_seen": 80327296, + "step": 443 + }, + { + "epoch": 0.048605599496428474, + "grad_norm": 1.2876784784254371, + "learning_rate": 4.9709055928794664e-05, + "loss": 1.0182, + "num_input_tokens_seen": 80531136, + "step": 444 + }, + { + "epoch": 0.048715071567366375, + "grad_norm": 1.1137770770000033, + "learning_rate": 4.970774645635606e-05, + "loss": 0.6257, + "num_input_tokens_seen": 80725120, + "step": 445 + }, + { + "epoch": 0.048824543638304276, + "grad_norm": 1.3695275471490918, + "learning_rate": 4.970643406103828e-05, + "loss": 0.9036, + "num_input_tokens_seen": 80884384, + "step": 446 + }, + { + "epoch": 0.048934015709242176, + "grad_norm": 1.2203794004342803, + "learning_rate": 4.970511874299659e-05, + "loss": 0.7362, + "num_input_tokens_seen": 81054624, + "step": 447 + }, + { + "epoch": 0.049043487780180084, + "grad_norm": 1.339326719837054, + "learning_rate": 4.9703800502386574e-05, + "loss": 0.7956, + "num_input_tokens_seen": 81238528, + "step": 448 + }, + { + "epoch": 0.049152959851117985, + "grad_norm": 1.1636567400677655, + "learning_rate": 4.970247933936418e-05, + "loss": 0.5557, + "num_input_tokens_seen": 81419968, + "step": 449 + }, + { + "epoch": 0.049262431922055885, + "grad_norm": 1.2588433253250597, + "learning_rate": 4.970115525408572e-05, + "loss": 0.6252, + "num_input_tokens_seen": 81599392, + "step": 450 + }, + { + "epoch": 0.049371903992993786, + "grad_norm": 1.3163463907187487, + "learning_rate": 4.96998282467078e-05, + "loss": 0.76, + "num_input_tokens_seen": 81751040, + "step": 451 + }, + { + "epoch": 0.04948137606393169, + "grad_norm": 1.4627118275143405, + "learning_rate": 4.969849831738742e-05, + "loss": 0.9865, + "num_input_tokens_seen": 81934944, + "step": 452 + }, + { + "epoch": 0.049590848134869595, + "grad_norm": 1.1828924940666528, + "learning_rate": 4.96971654662819e-05, + "loss": 0.6187, + "num_input_tokens_seen": 82089056, + "step": 453 + }, + { + "epoch": 0.049700320205807495, + "grad_norm": 1.1559826309786794, + "learning_rate": 4.969582969354892e-05, + "loss": 0.6141, + "num_input_tokens_seen": 82282816, + "step": 454 + }, + { + "epoch": 0.049809792276745396, + "grad_norm": 1.2812394862790335, + "learning_rate": 4.96944909993465e-05, + "loss": 0.7181, + "num_input_tokens_seen": 82441408, + "step": 455 + }, + { + "epoch": 0.0499192643476833, + "grad_norm": 1.1770025354972242, + "learning_rate": 4.969314938383301e-05, + "loss": 0.8364, + "num_input_tokens_seen": 82648160, + "step": 456 + }, + { + "epoch": 0.0500287364186212, + "grad_norm": 1.3396659710886956, + "learning_rate": 4.9691804847167146e-05, + "loss": 0.6264, + "num_input_tokens_seen": 82783232, + "step": 457 + }, + { + "epoch": 0.0501382084895591, + "grad_norm": 1.1734990374281211, + "learning_rate": 4.969045738950797e-05, + "loss": 0.5421, + "num_input_tokens_seen": 82939584, + "step": 458 + }, + { + "epoch": 0.050247680560497006, + "grad_norm": 1.2176875839010162, + "learning_rate": 4.968910701101489e-05, + "loss": 0.8048, + "num_input_tokens_seen": 83106464, + "step": 459 + }, + { + "epoch": 0.05035715263143491, + "grad_norm": 1.4112865958186573, + "learning_rate": 4.968775371184764e-05, + "loss": 0.7047, + "num_input_tokens_seen": 83278048, + "step": 460 + }, + { + "epoch": 0.05046662470237281, + "grad_norm": 1.221565564479385, + "learning_rate": 4.968639749216632e-05, + "loss": 0.7417, + "num_input_tokens_seen": 83439776, + "step": 461 + }, + { + "epoch": 0.05057609677331071, + "grad_norm": 1.3664550126951365, + "learning_rate": 4.968503835213138e-05, + "loss": 0.9568, + "num_input_tokens_seen": 83641600, + "step": 462 + }, + { + "epoch": 0.05068556884424861, + "grad_norm": 1.2083154655100108, + "learning_rate": 4.9683676291903594e-05, + "loss": 0.676, + "num_input_tokens_seen": 83836032, + "step": 463 + }, + { + "epoch": 0.050795040915186516, + "grad_norm": 1.214547516745658, + "learning_rate": 4.968231131164408e-05, + "loss": 0.6472, + "num_input_tokens_seen": 84042560, + "step": 464 + }, + { + "epoch": 0.05090451298612442, + "grad_norm": 1.2887102512775095, + "learning_rate": 4.968094341151433e-05, + "loss": 0.7965, + "num_input_tokens_seen": 84230048, + "step": 465 + }, + { + "epoch": 0.05101398505706232, + "grad_norm": 1.1261028606056307, + "learning_rate": 4.967957259167615e-05, + "loss": 0.7549, + "num_input_tokens_seen": 84424480, + "step": 466 + }, + { + "epoch": 0.05112345712800022, + "grad_norm": 1.2550900774237321, + "learning_rate": 4.967819885229171e-05, + "loss": 0.8055, + "num_input_tokens_seen": 84596736, + "step": 467 + }, + { + "epoch": 0.05123292919893812, + "grad_norm": 1.2014939783447853, + "learning_rate": 4.967682219352353e-05, + "loss": 0.789, + "num_input_tokens_seen": 84760256, + "step": 468 + }, + { + "epoch": 0.05134240126987602, + "grad_norm": 1.1361987310746595, + "learning_rate": 4.967544261553445e-05, + "loss": 0.6364, + "num_input_tokens_seen": 84909664, + "step": 469 + }, + { + "epoch": 0.05145187334081393, + "grad_norm": 1.2186495910755282, + "learning_rate": 4.967406011848769e-05, + "loss": 0.715, + "num_input_tokens_seen": 85081920, + "step": 470 + }, + { + "epoch": 0.05156134541175183, + "grad_norm": 1.3481396236696488, + "learning_rate": 4.967267470254678e-05, + "loss": 0.8409, + "num_input_tokens_seen": 85282848, + "step": 471 + }, + { + "epoch": 0.05167081748268973, + "grad_norm": 1.1130553970123127, + "learning_rate": 4.967128636787562e-05, + "loss": 0.6408, + "num_input_tokens_seen": 85435168, + "step": 472 + }, + { + "epoch": 0.05178028955362763, + "grad_norm": 1.2831634097289402, + "learning_rate": 4.9669895114638445e-05, + "loss": 0.7552, + "num_input_tokens_seen": 85615936, + "step": 473 + }, + { + "epoch": 0.05188976162456553, + "grad_norm": 1.156044091113109, + "learning_rate": 4.966850094299984e-05, + "loss": 0.74, + "num_input_tokens_seen": 85806112, + "step": 474 + }, + { + "epoch": 0.05199923369550343, + "grad_norm": 1.2111151288099076, + "learning_rate": 4.966710385312473e-05, + "loss": 0.7766, + "num_input_tokens_seen": 86004352, + "step": 475 + }, + { + "epoch": 0.05210870576644134, + "grad_norm": 1.3218997240572548, + "learning_rate": 4.966570384517839e-05, + "loss": 0.9263, + "num_input_tokens_seen": 86220064, + "step": 476 + }, + { + "epoch": 0.05221817783737924, + "grad_norm": 1.3821811851307186, + "learning_rate": 4.966430091932645e-05, + "loss": 0.8275, + "num_input_tokens_seen": 86406208, + "step": 477 + }, + { + "epoch": 0.05232764990831714, + "grad_norm": 1.1432745482084243, + "learning_rate": 4.9662895075734844e-05, + "loss": 0.5834, + "num_input_tokens_seen": 86605792, + "step": 478 + }, + { + "epoch": 0.05243712197925504, + "grad_norm": 1.0993033435537147, + "learning_rate": 4.9661486314569904e-05, + "loss": 0.7686, + "num_input_tokens_seen": 86778048, + "step": 479 + }, + { + "epoch": 0.05254659405019294, + "grad_norm": 1.1109290722233565, + "learning_rate": 4.966007463599828e-05, + "loss": 0.645, + "num_input_tokens_seen": 86978304, + "step": 480 + }, + { + "epoch": 0.05265606612113085, + "grad_norm": 1.1978019272407063, + "learning_rate": 4.9658660040186967e-05, + "loss": 0.6926, + "num_input_tokens_seen": 87170720, + "step": 481 + }, + { + "epoch": 0.05276553819206875, + "grad_norm": 1.3149854197255881, + "learning_rate": 4.965724252730331e-05, + "loss": 0.8736, + "num_input_tokens_seen": 87327968, + "step": 482 + }, + { + "epoch": 0.05287501026300665, + "grad_norm": 1.2373697193039621, + "learning_rate": 4.9655822097515e-05, + "loss": 0.7194, + "num_input_tokens_seen": 87485888, + "step": 483 + }, + { + "epoch": 0.05298448233394455, + "grad_norm": 1.3246633857823369, + "learning_rate": 4.9654398750990075e-05, + "loss": 0.9113, + "num_input_tokens_seen": 87643808, + "step": 484 + }, + { + "epoch": 0.05309395440488245, + "grad_norm": 1.0467233120923776, + "learning_rate": 4.96529724878969e-05, + "loss": 0.6416, + "num_input_tokens_seen": 87852128, + "step": 485 + }, + { + "epoch": 0.05320342647582035, + "grad_norm": 1.325313869224656, + "learning_rate": 4.9651543308404217e-05, + "loss": 0.707, + "num_input_tokens_seen": 88062912, + "step": 486 + }, + { + "epoch": 0.05331289854675826, + "grad_norm": 1.3522424035136646, + "learning_rate": 4.9650111212681073e-05, + "loss": 0.7899, + "num_input_tokens_seen": 88247936, + "step": 487 + }, + { + "epoch": 0.05342237061769616, + "grad_norm": 1.4227289004265313, + "learning_rate": 4.96486762008969e-05, + "loss": 0.8035, + "num_input_tokens_seen": 88433184, + "step": 488 + }, + { + "epoch": 0.05353184268863406, + "grad_norm": 1.2273310786320994, + "learning_rate": 4.964723827322147e-05, + "loss": 0.6418, + "num_input_tokens_seen": 88586848, + "step": 489 + }, + { + "epoch": 0.05364131475957196, + "grad_norm": 1.1809519881293233, + "learning_rate": 4.9645797429824846e-05, + "loss": 0.6706, + "num_input_tokens_seen": 88767392, + "step": 490 + }, + { + "epoch": 0.053750786830509864, + "grad_norm": 1.4211338333256656, + "learning_rate": 4.964435367087751e-05, + "loss": 0.9752, + "num_input_tokens_seen": 88959808, + "step": 491 + }, + { + "epoch": 0.053860258901447765, + "grad_norm": 1.1989822402987793, + "learning_rate": 4.9642906996550256e-05, + "loss": 0.8196, + "num_input_tokens_seen": 89153568, + "step": 492 + }, + { + "epoch": 0.05396973097238567, + "grad_norm": 1.303699919121861, + "learning_rate": 4.96414574070142e-05, + "loss": 0.8177, + "num_input_tokens_seen": 89306112, + "step": 493 + }, + { + "epoch": 0.05407920304332357, + "grad_norm": 1.2540891519885076, + "learning_rate": 4.964000490244084e-05, + "loss": 0.6267, + "num_input_tokens_seen": 89473440, + "step": 494 + }, + { + "epoch": 0.054188675114261474, + "grad_norm": 1.2534911517366185, + "learning_rate": 4.963854948300201e-05, + "loss": 0.7453, + "num_input_tokens_seen": 89633376, + "step": 495 + }, + { + "epoch": 0.054298147185199375, + "grad_norm": 1.3473382104624632, + "learning_rate": 4.963709114886988e-05, + "loss": 0.7179, + "num_input_tokens_seen": 89796224, + "step": 496 + }, + { + "epoch": 0.054407619256137275, + "grad_norm": 1.3032269554596096, + "learning_rate": 4.9635629900216956e-05, + "loss": 0.707, + "num_input_tokens_seen": 89959296, + "step": 497 + }, + { + "epoch": 0.05451709132707518, + "grad_norm": 1.2329024694222008, + "learning_rate": 4.963416573721611e-05, + "loss": 0.6263, + "num_input_tokens_seen": 90131776, + "step": 498 + }, + { + "epoch": 0.054626563398013084, + "grad_norm": 1.2014511155983865, + "learning_rate": 4.9632698660040556e-05, + "loss": 0.6307, + "num_input_tokens_seen": 90306720, + "step": 499 + }, + { + "epoch": 0.054736035468950985, + "grad_norm": 1.2426576984041653, + "learning_rate": 4.963122866886384e-05, + "loss": 0.7336, + "num_input_tokens_seen": 90500480, + "step": 500 + }, + { + "epoch": 0.054845507539888885, + "grad_norm": 1.2723004934553266, + "learning_rate": 4.9629755763859855e-05, + "loss": 0.8413, + "num_input_tokens_seen": 90666688, + "step": 501 + }, + { + "epoch": 0.054954979610826786, + "grad_norm": 1.3471012907820437, + "learning_rate": 4.9628279945202856e-05, + "loss": 0.8012, + "num_input_tokens_seen": 90863360, + "step": 502 + }, + { + "epoch": 0.05506445168176469, + "grad_norm": 1.2328606986521993, + "learning_rate": 4.962680121306741e-05, + "loss": 0.7848, + "num_input_tokens_seen": 91072128, + "step": 503 + }, + { + "epoch": 0.055173923752702594, + "grad_norm": 1.3750287399756314, + "learning_rate": 4.962531956762847e-05, + "loss": 0.8546, + "num_input_tokens_seen": 91237216, + "step": 504 + }, + { + "epoch": 0.055283395823640495, + "grad_norm": 1.3098890583730216, + "learning_rate": 4.9623835009061294e-05, + "loss": 0.5563, + "num_input_tokens_seen": 91396480, + "step": 505 + }, + { + "epoch": 0.055392867894578396, + "grad_norm": 1.4117447631668554, + "learning_rate": 4.962234753754151e-05, + "loss": 1.0451, + "num_input_tokens_seen": 91609504, + "step": 506 + }, + { + "epoch": 0.0555023399655163, + "grad_norm": 1.2457478201531482, + "learning_rate": 4.962085715324508e-05, + "loss": 0.7494, + "num_input_tokens_seen": 91806400, + "step": 507 + }, + { + "epoch": 0.0556118120364542, + "grad_norm": 1.3290086835688026, + "learning_rate": 4.9619363856348324e-05, + "loss": 0.7988, + "num_input_tokens_seen": 91965216, + "step": 508 + }, + { + "epoch": 0.055721284107392105, + "grad_norm": 1.286109547560318, + "learning_rate": 4.9617867647027876e-05, + "loss": 0.6388, + "num_input_tokens_seen": 92125824, + "step": 509 + }, + { + "epoch": 0.055830756178330006, + "grad_norm": 1.2368408875389982, + "learning_rate": 4.961636852546075e-05, + "loss": 0.6663, + "num_input_tokens_seen": 92264256, + "step": 510 + }, + { + "epoch": 0.055940228249267906, + "grad_norm": 1.1053227843846107, + "learning_rate": 4.961486649182429e-05, + "loss": 0.6275, + "num_input_tokens_seen": 92449952, + "step": 511 + }, + { + "epoch": 0.05604970032020581, + "grad_norm": 1.3391077450171411, + "learning_rate": 4.961336154629618e-05, + "loss": 0.9085, + "num_input_tokens_seen": 92635200, + "step": 512 + }, + { + "epoch": 0.05615917239114371, + "grad_norm": 1.0747433023255157, + "learning_rate": 4.961185368905445e-05, + "loss": 0.6674, + "num_input_tokens_seen": 92854720, + "step": 513 + }, + { + "epoch": 0.05626864446208161, + "grad_norm": 1.2264472761520264, + "learning_rate": 4.9610342920277475e-05, + "loss": 0.8299, + "num_input_tokens_seen": 93066848, + "step": 514 + }, + { + "epoch": 0.056378116533019516, + "grad_norm": 1.2042295661698892, + "learning_rate": 4.960882924014398e-05, + "loss": 0.6689, + "num_input_tokens_seen": 93251648, + "step": 515 + }, + { + "epoch": 0.05648758860395742, + "grad_norm": 1.2064608117927165, + "learning_rate": 4.960731264883304e-05, + "loss": 0.5849, + "num_input_tokens_seen": 93399040, + "step": 516 + }, + { + "epoch": 0.05659706067489532, + "grad_norm": 1.2345367518810426, + "learning_rate": 4.960579314652405e-05, + "loss": 0.689, + "num_input_tokens_seen": 93571968, + "step": 517 + }, + { + "epoch": 0.05670653274583322, + "grad_norm": 1.3525682356680055, + "learning_rate": 4.960427073339676e-05, + "loss": 1.0275, + "num_input_tokens_seen": 93755200, + "step": 518 + }, + { + "epoch": 0.05681600481677112, + "grad_norm": 1.2105201740403702, + "learning_rate": 4.960274540963129e-05, + "loss": 0.7858, + "num_input_tokens_seen": 93911776, + "step": 519 + }, + { + "epoch": 0.05692547688770902, + "grad_norm": 1.1440662051164054, + "learning_rate": 4.9601217175408064e-05, + "loss": 0.6032, + "num_input_tokens_seen": 94054016, + "step": 520 + }, + { + "epoch": 0.05703494895864693, + "grad_norm": 1.293435649183875, + "learning_rate": 4.959968603090788e-05, + "loss": 0.9303, + "num_input_tokens_seen": 94269952, + "step": 521 + }, + { + "epoch": 0.05714442102958483, + "grad_norm": 1.1848379211805589, + "learning_rate": 4.959815197631186e-05, + "loss": 0.7497, + "num_input_tokens_seen": 94468416, + "step": 522 + }, + { + "epoch": 0.05725389310052273, + "grad_norm": 1.4218374394077413, + "learning_rate": 4.9596615011801486e-05, + "loss": 0.5854, + "num_input_tokens_seen": 94661280, + "step": 523 + }, + { + "epoch": 0.05736336517146063, + "grad_norm": 1.3382827518274327, + "learning_rate": 4.959507513755858e-05, + "loss": 0.6438, + "num_input_tokens_seen": 94835776, + "step": 524 + }, + { + "epoch": 0.05747283724239853, + "grad_norm": 1.3646530638294632, + "learning_rate": 4.95935323537653e-05, + "loss": 0.8989, + "num_input_tokens_seen": 95017216, + "step": 525 + }, + { + "epoch": 0.05758230931333644, + "grad_norm": 1.2211975073099546, + "learning_rate": 4.9591986660604164e-05, + "loss": 0.6791, + "num_input_tokens_seen": 95220384, + "step": 526 + }, + { + "epoch": 0.05769178138427434, + "grad_norm": 1.3806699112943515, + "learning_rate": 4.959043805825801e-05, + "loss": 0.7801, + "num_input_tokens_seen": 95403392, + "step": 527 + }, + { + "epoch": 0.05780125345521224, + "grad_norm": 1.3207649042753284, + "learning_rate": 4.958888654691004e-05, + "loss": 0.8164, + "num_input_tokens_seen": 95595808, + "step": 528 + }, + { + "epoch": 0.05791072552615014, + "grad_norm": 1.3293664403441656, + "learning_rate": 4.95873321267438e-05, + "loss": 0.7436, + "num_input_tokens_seen": 95775680, + "step": 529 + }, + { + "epoch": 0.05802019759708804, + "grad_norm": 1.1851265427670206, + "learning_rate": 4.958577479794317e-05, + "loss": 0.6967, + "num_input_tokens_seen": 95961152, + "step": 530 + }, + { + "epoch": 0.05812966966802594, + "grad_norm": 1.162349182518244, + "learning_rate": 4.958421456069239e-05, + "loss": 0.6337, + "num_input_tokens_seen": 96172384, + "step": 531 + }, + { + "epoch": 0.05823914173896385, + "grad_norm": 1.1574289651911693, + "learning_rate": 4.9582651415176026e-05, + "loss": 0.6596, + "num_input_tokens_seen": 96363680, + "step": 532 + }, + { + "epoch": 0.05834861380990175, + "grad_norm": 1.3005525771750652, + "learning_rate": 4.958108536157899e-05, + "loss": 0.664, + "num_input_tokens_seen": 96523392, + "step": 533 + }, + { + "epoch": 0.05845808588083965, + "grad_norm": 1.302405823508044, + "learning_rate": 4.9579516400086545e-05, + "loss": 0.687, + "num_input_tokens_seen": 96688256, + "step": 534 + }, + { + "epoch": 0.05856755795177755, + "grad_norm": 1.2646026249544748, + "learning_rate": 4.9577944530884295e-05, + "loss": 0.6981, + "num_input_tokens_seen": 96857824, + "step": 535 + }, + { + "epoch": 0.05867703002271545, + "grad_norm": 1.263244509472325, + "learning_rate": 4.9576369754158194e-05, + "loss": 0.6295, + "num_input_tokens_seen": 97045312, + "step": 536 + }, + { + "epoch": 0.05878650209365335, + "grad_norm": 1.4742862170478368, + "learning_rate": 4.9574792070094534e-05, + "loss": 0.9087, + "num_input_tokens_seen": 97227200, + "step": 537 + }, + { + "epoch": 0.05889597416459126, + "grad_norm": 1.4720544369572752, + "learning_rate": 4.9573211478879955e-05, + "loss": 0.7898, + "num_input_tokens_seen": 97385792, + "step": 538 + }, + { + "epoch": 0.05900544623552916, + "grad_norm": 1.2699016793556923, + "learning_rate": 4.9571627980701426e-05, + "loss": 0.63, + "num_input_tokens_seen": 97566336, + "step": 539 + }, + { + "epoch": 0.05911491830646706, + "grad_norm": 1.4325391335992232, + "learning_rate": 4.9570041575746285e-05, + "loss": 0.6892, + "num_input_tokens_seen": 97741504, + "step": 540 + }, + { + "epoch": 0.05922439037740496, + "grad_norm": 1.3527222877643184, + "learning_rate": 4.9568452264202194e-05, + "loss": 0.7597, + "num_input_tokens_seen": 97925856, + "step": 541 + }, + { + "epoch": 0.059333862448342864, + "grad_norm": 1.31062378067884, + "learning_rate": 4.9566860046257166e-05, + "loss": 0.756, + "num_input_tokens_seen": 98098560, + "step": 542 + }, + { + "epoch": 0.05944333451928077, + "grad_norm": 1.318625551833145, + "learning_rate": 4.956526492209956e-05, + "loss": 0.6924, + "num_input_tokens_seen": 98290528, + "step": 543 + }, + { + "epoch": 0.05955280659021867, + "grad_norm": 1.2371307226663255, + "learning_rate": 4.956366689191808e-05, + "loss": 0.5497, + "num_input_tokens_seen": 98457408, + "step": 544 + }, + { + "epoch": 0.05966227866115657, + "grad_norm": 1.3231171833973292, + "learning_rate": 4.956206595590176e-05, + "loss": 0.8472, + "num_input_tokens_seen": 98638848, + "step": 545 + }, + { + "epoch": 0.059771750732094474, + "grad_norm": 1.212281545642788, + "learning_rate": 4.9560462114239995e-05, + "loss": 0.7207, + "num_input_tokens_seen": 98807520, + "step": 546 + }, + { + "epoch": 0.059881222803032375, + "grad_norm": 1.1346462272296558, + "learning_rate": 4.9558855367122505e-05, + "loss": 0.6397, + "num_input_tokens_seen": 99027936, + "step": 547 + }, + { + "epoch": 0.059990694873970275, + "grad_norm": 1.1905080943146855, + "learning_rate": 4.9557245714739374e-05, + "loss": 0.5435, + "num_input_tokens_seen": 99191232, + "step": 548 + }, + { + "epoch": 0.06010016694490818, + "grad_norm": 1.2576022238921627, + "learning_rate": 4.955563315728103e-05, + "loss": 0.7598, + "num_input_tokens_seen": 99371552, + "step": 549 + }, + { + "epoch": 0.060209639015846084, + "grad_norm": 1.2995079613236615, + "learning_rate": 4.955401769493822e-05, + "loss": 0.7534, + "num_input_tokens_seen": 99544032, + "step": 550 + }, + { + "epoch": 0.060319111086783984, + "grad_norm": 1.4260527051893648, + "learning_rate": 4.9552399327902054e-05, + "loss": 0.6839, + "num_input_tokens_seen": 99720096, + "step": 551 + }, + { + "epoch": 0.060428583157721885, + "grad_norm": 1.1883042774063424, + "learning_rate": 4.955077805636399e-05, + "loss": 0.5825, + "num_input_tokens_seen": 99881152, + "step": 552 + }, + { + "epoch": 0.060538055228659786, + "grad_norm": 1.2686955444976922, + "learning_rate": 4.954915388051581e-05, + "loss": 0.9622, + "num_input_tokens_seen": 100044672, + "step": 553 + }, + { + "epoch": 0.060647527299597694, + "grad_norm": 1.346623873919876, + "learning_rate": 4.954752680054966e-05, + "loss": 0.6319, + "num_input_tokens_seen": 100227680, + "step": 554 + }, + { + "epoch": 0.060756999370535594, + "grad_norm": 1.165587789233046, + "learning_rate": 4.954589681665801e-05, + "loss": 0.7092, + "num_input_tokens_seen": 100405312, + "step": 555 + }, + { + "epoch": 0.060866471441473495, + "grad_norm": 1.30317714778928, + "learning_rate": 4.95442639290337e-05, + "loss": 0.7141, + "num_input_tokens_seen": 100553600, + "step": 556 + }, + { + "epoch": 0.060975943512411396, + "grad_norm": 1.4728388584561842, + "learning_rate": 4.954262813786988e-05, + "loss": 0.6965, + "num_input_tokens_seen": 100721152, + "step": 557 + }, + { + "epoch": 0.061085415583349296, + "grad_norm": 1.2233418745181017, + "learning_rate": 4.954098944336007e-05, + "loss": 0.7073, + "num_input_tokens_seen": 100899456, + "step": 558 + }, + { + "epoch": 0.0611948876542872, + "grad_norm": 1.0650290514784861, + "learning_rate": 4.953934784569812e-05, + "loss": 0.8067, + "num_input_tokens_seen": 101121440, + "step": 559 + }, + { + "epoch": 0.061304359725225105, + "grad_norm": 1.1240339900100607, + "learning_rate": 4.953770334507824e-05, + "loss": 0.5724, + "num_input_tokens_seen": 101304448, + "step": 560 + }, + { + "epoch": 0.061413831796163006, + "grad_norm": 1.2714864845082587, + "learning_rate": 4.9536055941694956e-05, + "loss": 0.6555, + "num_input_tokens_seen": 101481184, + "step": 561 + }, + { + "epoch": 0.061523303867100906, + "grad_norm": 1.2102645973281874, + "learning_rate": 4.9534405635743165e-05, + "loss": 0.6492, + "num_input_tokens_seen": 101678976, + "step": 562 + }, + { + "epoch": 0.06163277593803881, + "grad_norm": 1.240845152877832, + "learning_rate": 4.953275242741808e-05, + "loss": 0.7577, + "num_input_tokens_seen": 101853472, + "step": 563 + }, + { + "epoch": 0.06174224800897671, + "grad_norm": 1.373306835846376, + "learning_rate": 4.953109631691529e-05, + "loss": 0.7937, + "num_input_tokens_seen": 102000192, + "step": 564 + }, + { + "epoch": 0.06185172007991461, + "grad_norm": 1.3464411025568177, + "learning_rate": 4.952943730443069e-05, + "loss": 0.6746, + "num_input_tokens_seen": 102169984, + "step": 565 + }, + { + "epoch": 0.061961192150852516, + "grad_norm": 1.1813805159868969, + "learning_rate": 4.952777539016056e-05, + "loss": 0.6961, + "num_input_tokens_seen": 102331712, + "step": 566 + }, + { + "epoch": 0.06207066422179042, + "grad_norm": 1.1575424085226345, + "learning_rate": 4.9526110574301485e-05, + "loss": 0.5973, + "num_input_tokens_seen": 102522560, + "step": 567 + }, + { + "epoch": 0.06218013629272832, + "grad_norm": 1.1619561615289118, + "learning_rate": 4.9524442857050424e-05, + "loss": 0.7482, + "num_input_tokens_seen": 102707584, + "step": 568 + }, + { + "epoch": 0.06228960836366622, + "grad_norm": 1.3020662026867909, + "learning_rate": 4.952277223860465e-05, + "loss": 0.9118, + "num_input_tokens_seen": 102924192, + "step": 569 + }, + { + "epoch": 0.06239908043460412, + "grad_norm": 1.3280953347876074, + "learning_rate": 4.9521098719161795e-05, + "loss": 0.8055, + "num_input_tokens_seen": 103119968, + "step": 570 + }, + { + "epoch": 0.06250855250554202, + "grad_norm": 1.347484898385263, + "learning_rate": 4.9519422298919844e-05, + "loss": 0.7654, + "num_input_tokens_seen": 103300736, + "step": 571 + }, + { + "epoch": 0.06261802457647993, + "grad_norm": 1.3129331691378334, + "learning_rate": 4.9517742978077106e-05, + "loss": 0.7273, + "num_input_tokens_seen": 103480832, + "step": 572 + }, + { + "epoch": 0.06272749664741782, + "grad_norm": 1.1931661646092753, + "learning_rate": 4.951606075683224e-05, + "loss": 0.7502, + "num_input_tokens_seen": 103681536, + "step": 573 + }, + { + "epoch": 0.06283696871835573, + "grad_norm": 1.2345933841721484, + "learning_rate": 4.9514375635384255e-05, + "loss": 0.8977, + "num_input_tokens_seen": 103860512, + "step": 574 + }, + { + "epoch": 0.06294644078929364, + "grad_norm": 1.2563752676534932, + "learning_rate": 4.9512687613932505e-05, + "loss": 0.6015, + "num_input_tokens_seen": 104043072, + "step": 575 + }, + { + "epoch": 0.06305591286023153, + "grad_norm": 1.4793820381077714, + "learning_rate": 4.951099669267666e-05, + "loss": 0.6893, + "num_input_tokens_seen": 104190464, + "step": 576 + }, + { + "epoch": 0.06316538493116944, + "grad_norm": 1.2868563645038535, + "learning_rate": 4.950930287181677e-05, + "loss": 0.8904, + "num_input_tokens_seen": 104391616, + "step": 577 + }, + { + "epoch": 0.06327485700210733, + "grad_norm": 1.2652407404288106, + "learning_rate": 4.95076061515532e-05, + "loss": 0.6306, + "num_input_tokens_seen": 104574400, + "step": 578 + }, + { + "epoch": 0.06338432907304524, + "grad_norm": 1.2052388632733992, + "learning_rate": 4.9505906532086676e-05, + "loss": 0.679, + "num_input_tokens_seen": 104740384, + "step": 579 + }, + { + "epoch": 0.06349380114398315, + "grad_norm": 1.2658041838972334, + "learning_rate": 4.950420401361825e-05, + "loss": 0.7566, + "num_input_tokens_seen": 104888448, + "step": 580 + }, + { + "epoch": 0.06360327321492104, + "grad_norm": 1.2305015922886398, + "learning_rate": 4.950249859634934e-05, + "loss": 0.7865, + "num_input_tokens_seen": 105069888, + "step": 581 + }, + { + "epoch": 0.06371274528585895, + "grad_norm": 1.3130773909161704, + "learning_rate": 4.9500790280481684e-05, + "loss": 0.7362, + "num_input_tokens_seen": 105263424, + "step": 582 + }, + { + "epoch": 0.06382221735679684, + "grad_norm": 1.4173851809009426, + "learning_rate": 4.9499079066217374e-05, + "loss": 0.8289, + "num_input_tokens_seen": 105433216, + "step": 583 + }, + { + "epoch": 0.06393168942773475, + "grad_norm": 1.2489114312959668, + "learning_rate": 4.9497364953758854e-05, + "loss": 0.9543, + "num_input_tokens_seen": 105652736, + "step": 584 + }, + { + "epoch": 0.06404116149867266, + "grad_norm": 1.3432026568968705, + "learning_rate": 4.9495647943308877e-05, + "loss": 0.802, + "num_input_tokens_seen": 105832608, + "step": 585 + }, + { + "epoch": 0.06415063356961055, + "grad_norm": 1.292794253280704, + "learning_rate": 4.949392803507058e-05, + "loss": 0.6763, + "num_input_tokens_seen": 106029056, + "step": 586 + }, + { + "epoch": 0.06426010564054846, + "grad_norm": 1.2514864190035697, + "learning_rate": 4.949220522924742e-05, + "loss": 0.6194, + "num_input_tokens_seen": 106190560, + "step": 587 + }, + { + "epoch": 0.06436957771148635, + "grad_norm": 1.2874865984913633, + "learning_rate": 4.949047952604321e-05, + "loss": 0.7022, + "num_input_tokens_seen": 106366624, + "step": 588 + }, + { + "epoch": 0.06447904978242426, + "grad_norm": 1.4054010159687644, + "learning_rate": 4.9488750925662083e-05, + "loss": 0.9235, + "num_input_tokens_seen": 106554560, + "step": 589 + }, + { + "epoch": 0.06458852185336215, + "grad_norm": 1.1791585279437715, + "learning_rate": 4.9487019428308547e-05, + "loss": 0.7205, + "num_input_tokens_seen": 106761984, + "step": 590 + }, + { + "epoch": 0.06469799392430006, + "grad_norm": 1.1778984904407739, + "learning_rate": 4.948528503418741e-05, + "loss": 0.8584, + "num_input_tokens_seen": 106939392, + "step": 591 + }, + { + "epoch": 0.06480746599523797, + "grad_norm": 1.1466248408348334, + "learning_rate": 4.9483547743503874e-05, + "loss": 0.7038, + "num_input_tokens_seen": 107119040, + "step": 592 + }, + { + "epoch": 0.06491693806617586, + "grad_norm": 1.2351484651941531, + "learning_rate": 4.9481807556463435e-05, + "loss": 0.7352, + "num_input_tokens_seen": 107268224, + "step": 593 + }, + { + "epoch": 0.06502641013711377, + "grad_norm": 1.1479865505779772, + "learning_rate": 4.948006447327197e-05, + "loss": 0.6351, + "num_input_tokens_seen": 107481920, + "step": 594 + }, + { + "epoch": 0.06513588220805167, + "grad_norm": 1.0914209363491285, + "learning_rate": 4.947831849413567e-05, + "loss": 0.667, + "num_input_tokens_seen": 107689344, + "step": 595 + }, + { + "epoch": 0.06524535427898957, + "grad_norm": 1.327316037809871, + "learning_rate": 4.947656961926109e-05, + "loss": 0.8247, + "num_input_tokens_seen": 107880640, + "step": 596 + }, + { + "epoch": 0.06535482634992748, + "grad_norm": 1.9169247771355789, + "learning_rate": 4.947481784885511e-05, + "loss": 0.7345, + "num_input_tokens_seen": 108056480, + "step": 597 + }, + { + "epoch": 0.06546429842086537, + "grad_norm": 1.4359147826546896, + "learning_rate": 4.947306318312497e-05, + "loss": 0.7812, + "num_input_tokens_seen": 108231648, + "step": 598 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 1.3018658672567327, + "learning_rate": 4.947130562227824e-05, + "loss": 0.9278, + "num_input_tokens_seen": 108421824, + "step": 599 + }, + { + "epoch": 0.06568324256274118, + "grad_norm": 1.4700120414188804, + "learning_rate": 4.9469545166522836e-05, + "loss": 0.8514, + "num_input_tokens_seen": 108588928, + "step": 600 + }, + { + "epoch": 0.06579271463367908, + "grad_norm": 1.250539424852838, + "learning_rate": 4.946778181606702e-05, + "loss": 0.8285, + "num_input_tokens_seen": 108764768, + "step": 601 + }, + { + "epoch": 0.06590218670461699, + "grad_norm": 1.2325688439229106, + "learning_rate": 4.946601557111938e-05, + "loss": 0.6031, + "num_input_tokens_seen": 108962560, + "step": 602 + }, + { + "epoch": 0.06601165877555489, + "grad_norm": 1.424146409951856, + "learning_rate": 4.9464246431888885e-05, + "loss": 0.744, + "num_input_tokens_seen": 109157440, + "step": 603 + }, + { + "epoch": 0.06612113084649279, + "grad_norm": 1.2655459720757416, + "learning_rate": 4.946247439858479e-05, + "loss": 0.829, + "num_input_tokens_seen": 109349408, + "step": 604 + }, + { + "epoch": 0.06623060291743069, + "grad_norm": 1.1456937126868159, + "learning_rate": 4.9460699471416745e-05, + "loss": 0.5337, + "num_input_tokens_seen": 109546976, + "step": 605 + }, + { + "epoch": 0.0663400749883686, + "grad_norm": 1.3118746039547624, + "learning_rate": 4.945892165059472e-05, + "loss": 0.9072, + "num_input_tokens_seen": 109713184, + "step": 606 + }, + { + "epoch": 0.06644954705930649, + "grad_norm": 1.2852696887565285, + "learning_rate": 4.9457140936329004e-05, + "loss": 0.8036, + "num_input_tokens_seen": 109901120, + "step": 607 + }, + { + "epoch": 0.0665590191302444, + "grad_norm": 1.3242845997149462, + "learning_rate": 4.9455357328830275e-05, + "loss": 0.7907, + "num_input_tokens_seen": 110082784, + "step": 608 + }, + { + "epoch": 0.0666684912011823, + "grad_norm": 1.1412861096956648, + "learning_rate": 4.9453570828309536e-05, + "loss": 0.584, + "num_input_tokens_seen": 110241152, + "step": 609 + }, + { + "epoch": 0.0667779632721202, + "grad_norm": 1.1893810131343958, + "learning_rate": 4.9451781434978104e-05, + "loss": 0.6707, + "num_input_tokens_seen": 110409824, + "step": 610 + }, + { + "epoch": 0.0668874353430581, + "grad_norm": 1.1534451151482514, + "learning_rate": 4.944998914904768e-05, + "loss": 0.6707, + "num_input_tokens_seen": 110580960, + "step": 611 + }, + { + "epoch": 0.066996907413996, + "grad_norm": 1.1805146708919898, + "learning_rate": 4.944819397073027e-05, + "loss": 0.6299, + "num_input_tokens_seen": 110778752, + "step": 612 + }, + { + "epoch": 0.0671063794849339, + "grad_norm": 1.3814390441502067, + "learning_rate": 4.944639590023826e-05, + "loss": 0.7124, + "num_input_tokens_seen": 110918528, + "step": 613 + }, + { + "epoch": 0.06721585155587181, + "grad_norm": 1.2043078169324068, + "learning_rate": 4.9444594937784336e-05, + "loss": 0.6447, + "num_input_tokens_seen": 111130432, + "step": 614 + }, + { + "epoch": 0.06732532362680971, + "grad_norm": 1.3062325654394964, + "learning_rate": 4.9442791083581575e-05, + "loss": 0.6201, + "num_input_tokens_seen": 111285664, + "step": 615 + }, + { + "epoch": 0.06743479569774762, + "grad_norm": 1.3307173169150301, + "learning_rate": 4.944098433784335e-05, + "loss": 0.8074, + "num_input_tokens_seen": 111470688, + "step": 616 + }, + { + "epoch": 0.06754426776868551, + "grad_norm": 1.249080128758881, + "learning_rate": 4.94391747007834e-05, + "loss": 0.7161, + "num_input_tokens_seen": 111681472, + "step": 617 + }, + { + "epoch": 0.06765373983962342, + "grad_norm": 1.326819687210161, + "learning_rate": 4.9437362172615806e-05, + "loss": 0.7683, + "num_input_tokens_seen": 111847008, + "step": 618 + }, + { + "epoch": 0.06776321191056132, + "grad_norm": 1.326941198206124, + "learning_rate": 4.9435546753554985e-05, + "loss": 0.7126, + "num_input_tokens_seen": 112023072, + "step": 619 + }, + { + "epoch": 0.06787268398149922, + "grad_norm": 1.2992797525275548, + "learning_rate": 4.943372844381568e-05, + "loss": 0.7069, + "num_input_tokens_seen": 112181664, + "step": 620 + }, + { + "epoch": 0.06798215605243713, + "grad_norm": 1.1874583889547299, + "learning_rate": 4.943190724361303e-05, + "loss": 0.6956, + "num_input_tokens_seen": 112396928, + "step": 621 + }, + { + "epoch": 0.06809162812337502, + "grad_norm": 1.163400653887849, + "learning_rate": 4.9430083153162456e-05, + "loss": 0.7277, + "num_input_tokens_seen": 112597632, + "step": 622 + }, + { + "epoch": 0.06820110019431293, + "grad_norm": 1.1811871627967805, + "learning_rate": 4.942825617267973e-05, + "loss": 0.7074, + "num_input_tokens_seen": 112773248, + "step": 623 + }, + { + "epoch": 0.06831057226525082, + "grad_norm": 1.1610733158149706, + "learning_rate": 4.9426426302381014e-05, + "loss": 0.6544, + "num_input_tokens_seen": 112939456, + "step": 624 + }, + { + "epoch": 0.06842004433618873, + "grad_norm": 1.1701053736584912, + "learning_rate": 4.9424593542482754e-05, + "loss": 0.7088, + "num_input_tokens_seen": 113123584, + "step": 625 + }, + { + "epoch": 0.06852951640712664, + "grad_norm": 1.1717645211594758, + "learning_rate": 4.942275789320178e-05, + "loss": 0.7974, + "num_input_tokens_seen": 113329888, + "step": 626 + }, + { + "epoch": 0.06863898847806453, + "grad_norm": 1.3476177265355571, + "learning_rate": 4.9420919354755225e-05, + "loss": 0.7588, + "num_input_tokens_seen": 113494304, + "step": 627 + }, + { + "epoch": 0.06874846054900244, + "grad_norm": 1.210971757957954, + "learning_rate": 4.9419077927360605e-05, + "loss": 0.724, + "num_input_tokens_seen": 113697696, + "step": 628 + }, + { + "epoch": 0.06885793261994033, + "grad_norm": 1.2312897864560137, + "learning_rate": 4.9417233611235735e-05, + "loss": 0.7036, + "num_input_tokens_seen": 113878464, + "step": 629 + }, + { + "epoch": 0.06896740469087824, + "grad_norm": 1.2412145871184541, + "learning_rate": 4.9415386406598816e-05, + "loss": 0.8195, + "num_input_tokens_seen": 114079168, + "step": 630 + }, + { + "epoch": 0.06907687676181615, + "grad_norm": 1.2570574846013547, + "learning_rate": 4.941353631366836e-05, + "loss": 0.7532, + "num_input_tokens_seen": 114235744, + "step": 631 + }, + { + "epoch": 0.06918634883275404, + "grad_norm": 1.1984786443100726, + "learning_rate": 4.9411683332663225e-05, + "loss": 0.6755, + "num_input_tokens_seen": 114414944, + "step": 632 + }, + { + "epoch": 0.06929582090369195, + "grad_norm": 1.304859355640883, + "learning_rate": 4.940982746380262e-05, + "loss": 0.9562, + "num_input_tokens_seen": 114598624, + "step": 633 + }, + { + "epoch": 0.06940529297462984, + "grad_norm": 1.172316658716726, + "learning_rate": 4.9407968707306085e-05, + "loss": 0.8508, + "num_input_tokens_seen": 114815008, + "step": 634 + }, + { + "epoch": 0.06951476504556775, + "grad_norm": 1.1858179828796716, + "learning_rate": 4.940610706339351e-05, + "loss": 0.6447, + "num_input_tokens_seen": 114956352, + "step": 635 + }, + { + "epoch": 0.06962423711650566, + "grad_norm": 1.324518355513468, + "learning_rate": 4.940424253228514e-05, + "loss": 0.9054, + "num_input_tokens_seen": 115127488, + "step": 636 + }, + { + "epoch": 0.06973370918744355, + "grad_norm": 1.238731422785432, + "learning_rate": 4.940237511420152e-05, + "loss": 0.816, + "num_input_tokens_seen": 115307136, + "step": 637 + }, + { + "epoch": 0.06984318125838146, + "grad_norm": 1.3152400434443812, + "learning_rate": 4.9400504809363576e-05, + "loss": 0.693, + "num_input_tokens_seen": 115510080, + "step": 638 + }, + { + "epoch": 0.06995265332931935, + "grad_norm": 1.3375275279269363, + "learning_rate": 4.9398631617992565e-05, + "loss": 0.825, + "num_input_tokens_seen": 115663744, + "step": 639 + }, + { + "epoch": 0.07006212540025726, + "grad_norm": 1.4292233239927798, + "learning_rate": 4.939675554031007e-05, + "loss": 0.9089, + "num_input_tokens_seen": 115873408, + "step": 640 + }, + { + "epoch": 0.07017159747119515, + "grad_norm": 1.3234145611753318, + "learning_rate": 4.939487657653803e-05, + "loss": 0.6972, + "num_input_tokens_seen": 116007136, + "step": 641 + }, + { + "epoch": 0.07028106954213306, + "grad_norm": 1.316000946769258, + "learning_rate": 4.9392994726898735e-05, + "loss": 0.7176, + "num_input_tokens_seen": 116174912, + "step": 642 + }, + { + "epoch": 0.07039054161307097, + "grad_norm": 1.3336942700770582, + "learning_rate": 4.939110999161479e-05, + "loss": 0.8517, + "num_input_tokens_seen": 116358816, + "step": 643 + }, + { + "epoch": 0.07050001368400886, + "grad_norm": 1.1936019426172775, + "learning_rate": 4.938922237090916e-05, + "loss": 0.615, + "num_input_tokens_seen": 116531072, + "step": 644 + }, + { + "epoch": 0.07060948575494677, + "grad_norm": 1.3389585812995142, + "learning_rate": 4.938733186500515e-05, + "loss": 0.6759, + "num_input_tokens_seen": 116700864, + "step": 645 + }, + { + "epoch": 0.07071895782588467, + "grad_norm": 1.2223701995558744, + "learning_rate": 4.93854384741264e-05, + "loss": 0.7622, + "num_input_tokens_seen": 116912320, + "step": 646 + }, + { + "epoch": 0.07082842989682257, + "grad_norm": 1.2585075306263565, + "learning_rate": 4.938354219849689e-05, + "loss": 0.6514, + "num_input_tokens_seen": 117083232, + "step": 647 + }, + { + "epoch": 0.07093790196776048, + "grad_norm": 1.1661207704405006, + "learning_rate": 4.9381643038340966e-05, + "loss": 0.5842, + "num_input_tokens_seen": 117249664, + "step": 648 + }, + { + "epoch": 0.07104737403869837, + "grad_norm": 1.27622115241765, + "learning_rate": 4.937974099388326e-05, + "loss": 0.849, + "num_input_tokens_seen": 117439616, + "step": 649 + }, + { + "epoch": 0.07115684610963628, + "grad_norm": 1.2704499661810846, + "learning_rate": 4.9377836065348814e-05, + "loss": 0.6723, + "num_input_tokens_seen": 117614560, + "step": 650 + }, + { + "epoch": 0.07126631818057418, + "grad_norm": 1.6549038004503818, + "learning_rate": 4.937592825296297e-05, + "loss": 1.2026, + "num_input_tokens_seen": 117789280, + "step": 651 + }, + { + "epoch": 0.07137579025151208, + "grad_norm": 1.3464189682300394, + "learning_rate": 4.93740175569514e-05, + "loss": 0.7484, + "num_input_tokens_seen": 117956384, + "step": 652 + }, + { + "epoch": 0.07148526232244999, + "grad_norm": 1.3087219630957325, + "learning_rate": 4.9372103977540154e-05, + "loss": 0.8211, + "num_input_tokens_seen": 118143424, + "step": 653 + }, + { + "epoch": 0.07159473439338788, + "grad_norm": 1.1784062785906544, + "learning_rate": 4.937018751495559e-05, + "loss": 0.684, + "num_input_tokens_seen": 118340096, + "step": 654 + }, + { + "epoch": 0.07170420646432579, + "grad_norm": 1.122320746486529, + "learning_rate": 4.9368268169424444e-05, + "loss": 0.7986, + "num_input_tokens_seen": 118531168, + "step": 655 + }, + { + "epoch": 0.07181367853526369, + "grad_norm": 1.396174512511345, + "learning_rate": 4.936634594117375e-05, + "loss": 0.7139, + "num_input_tokens_seen": 118688640, + "step": 656 + }, + { + "epoch": 0.0719231506062016, + "grad_norm": 1.2073821017756186, + "learning_rate": 4.936442083043091e-05, + "loss": 0.8582, + "num_input_tokens_seen": 118901888, + "step": 657 + }, + { + "epoch": 0.07203262267713949, + "grad_norm": 1.1526899431973523, + "learning_rate": 4.936249283742367e-05, + "loss": 0.6959, + "num_input_tokens_seen": 119069216, + "step": 658 + }, + { + "epoch": 0.0721420947480774, + "grad_norm": 1.2936636449032681, + "learning_rate": 4.93605619623801e-05, + "loss": 0.6639, + "num_input_tokens_seen": 119239456, + "step": 659 + }, + { + "epoch": 0.0722515668190153, + "grad_norm": 1.1325249668762911, + "learning_rate": 4.935862820552861e-05, + "loss": 0.564, + "num_input_tokens_seen": 119405664, + "step": 660 + }, + { + "epoch": 0.0723610388899532, + "grad_norm": 1.3408023085639171, + "learning_rate": 4.935669156709798e-05, + "loss": 0.7854, + "num_input_tokens_seen": 119570976, + "step": 661 + }, + { + "epoch": 0.0724705109608911, + "grad_norm": 1.4512964865166293, + "learning_rate": 4.93547520473173e-05, + "loss": 0.7308, + "num_input_tokens_seen": 119732928, + "step": 662 + }, + { + "epoch": 0.072579983031829, + "grad_norm": 1.4501109705587412, + "learning_rate": 4.9352809646416e-05, + "loss": 0.7933, + "num_input_tokens_seen": 119936768, + "step": 663 + }, + { + "epoch": 0.0726894551027669, + "grad_norm": 1.2803268828828327, + "learning_rate": 4.935086436462388e-05, + "loss": 0.9965, + "num_input_tokens_seen": 120138816, + "step": 664 + }, + { + "epoch": 0.07279892717370481, + "grad_norm": 1.3209136858224562, + "learning_rate": 4.934891620217106e-05, + "loss": 0.8763, + "num_input_tokens_seen": 120327648, + "step": 665 + }, + { + "epoch": 0.07290839924464271, + "grad_norm": 1.2087405055875402, + "learning_rate": 4.934696515928799e-05, + "loss": 0.7658, + "num_input_tokens_seen": 120532832, + "step": 666 + }, + { + "epoch": 0.07301787131558062, + "grad_norm": 1.3318247704989479, + "learning_rate": 4.93450112362055e-05, + "loss": 0.9149, + "num_input_tokens_seen": 120699936, + "step": 667 + }, + { + "epoch": 0.07312734338651851, + "grad_norm": 1.13740178197698, + "learning_rate": 4.934305443315471e-05, + "loss": 0.6005, + "num_input_tokens_seen": 120858304, + "step": 668 + }, + { + "epoch": 0.07323681545745642, + "grad_norm": 1.7187939724932568, + "learning_rate": 4.9341094750367126e-05, + "loss": 0.7429, + "num_input_tokens_seen": 121033248, + "step": 669 + }, + { + "epoch": 0.07334628752839432, + "grad_norm": 1.2183971872482595, + "learning_rate": 4.9339132188074556e-05, + "loss": 0.7919, + "num_input_tokens_seen": 121201024, + "step": 670 + }, + { + "epoch": 0.07345575959933222, + "grad_norm": 1.213552690491384, + "learning_rate": 4.933716674650918e-05, + "loss": 0.8113, + "num_input_tokens_seen": 121359840, + "step": 671 + }, + { + "epoch": 0.07356523167027013, + "grad_norm": 1.2236823508060142, + "learning_rate": 4.9335198425903497e-05, + "loss": 0.6825, + "num_input_tokens_seen": 121547104, + "step": 672 + }, + { + "epoch": 0.07367470374120802, + "grad_norm": 1.1765998702681237, + "learning_rate": 4.933322722649037e-05, + "loss": 0.5768, + "num_input_tokens_seen": 121743552, + "step": 673 + }, + { + "epoch": 0.07378417581214593, + "grad_norm": 1.2111227078470743, + "learning_rate": 4.933125314850297e-05, + "loss": 0.6768, + "num_input_tokens_seen": 121933280, + "step": 674 + }, + { + "epoch": 0.07389364788308384, + "grad_norm": 1.1932347508864136, + "learning_rate": 4.9329276192174845e-05, + "loss": 0.7264, + "num_input_tokens_seen": 122102848, + "step": 675 + }, + { + "epoch": 0.07400311995402173, + "grad_norm": 1.2572595079676514, + "learning_rate": 4.932729635773985e-05, + "loss": 0.5635, + "num_input_tokens_seen": 122308704, + "step": 676 + }, + { + "epoch": 0.07411259202495964, + "grad_norm": 1.2071867907406242, + "learning_rate": 4.93253136454322e-05, + "loss": 0.5059, + "num_input_tokens_seen": 122492384, + "step": 677 + }, + { + "epoch": 0.07422206409589753, + "grad_norm": 1.4031255131574003, + "learning_rate": 4.9323328055486464e-05, + "loss": 0.7137, + "num_input_tokens_seen": 122689728, + "step": 678 + }, + { + "epoch": 0.07433153616683544, + "grad_norm": 1.4353964798185597, + "learning_rate": 4.93213395881375e-05, + "loss": 0.7443, + "num_input_tokens_seen": 122862432, + "step": 679 + }, + { + "epoch": 0.07444100823777333, + "grad_norm": 1.3992561629259987, + "learning_rate": 4.9319348243620566e-05, + "loss": 0.8724, + "num_input_tokens_seen": 123083968, + "step": 680 + }, + { + "epoch": 0.07455048030871124, + "grad_norm": 1.2900556033371344, + "learning_rate": 4.931735402217122e-05, + "loss": 0.7734, + "num_input_tokens_seen": 123272128, + "step": 681 + }, + { + "epoch": 0.07465995237964915, + "grad_norm": 1.2812828897116018, + "learning_rate": 4.931535692402538e-05, + "loss": 0.7111, + "num_input_tokens_seen": 123468352, + "step": 682 + }, + { + "epoch": 0.07476942445058704, + "grad_norm": 1.1783071361976705, + "learning_rate": 4.93133569494193e-05, + "loss": 0.6262, + "num_input_tokens_seen": 123663680, + "step": 683 + }, + { + "epoch": 0.07487889652152495, + "grad_norm": 1.2314681097184779, + "learning_rate": 4.931135409858958e-05, + "loss": 0.7148, + "num_input_tokens_seen": 123802560, + "step": 684 + }, + { + "epoch": 0.07498836859246284, + "grad_norm": 1.3036014810968666, + "learning_rate": 4.930934837177313e-05, + "loss": 0.7853, + "num_input_tokens_seen": 123977728, + "step": 685 + }, + { + "epoch": 0.07509784066340075, + "grad_norm": 1.2459612450060422, + "learning_rate": 4.9307339769207257e-05, + "loss": 0.8035, + "num_input_tokens_seen": 124157824, + "step": 686 + }, + { + "epoch": 0.07520731273433866, + "grad_norm": 1.248838648126801, + "learning_rate": 4.930532829112955e-05, + "loss": 0.9152, + "num_input_tokens_seen": 124358976, + "step": 687 + }, + { + "epoch": 0.07531678480527655, + "grad_norm": 1.1410900892219602, + "learning_rate": 4.930331393777796e-05, + "loss": 0.7773, + "num_input_tokens_seen": 124570208, + "step": 688 + }, + { + "epoch": 0.07542625687621446, + "grad_norm": 1.1958687180036813, + "learning_rate": 4.93012967093908e-05, + "loss": 0.7395, + "num_input_tokens_seen": 124765984, + "step": 689 + }, + { + "epoch": 0.07553572894715235, + "grad_norm": 1.3082851672367326, + "learning_rate": 4.92992766062067e-05, + "loss": 0.7581, + "num_input_tokens_seen": 124943616, + "step": 690 + }, + { + "epoch": 0.07564520101809026, + "grad_norm": 1.335366690420128, + "learning_rate": 4.9297253628464624e-05, + "loss": 0.7719, + "num_input_tokens_seen": 125125280, + "step": 691 + }, + { + "epoch": 0.07575467308902817, + "grad_norm": 1.289531229514433, + "learning_rate": 4.9295227776403893e-05, + "loss": 0.7598, + "num_input_tokens_seen": 125272448, + "step": 692 + }, + { + "epoch": 0.07586414515996606, + "grad_norm": 1.135713900003085, + "learning_rate": 4.929319905026416e-05, + "loss": 0.6961, + "num_input_tokens_seen": 125444704, + "step": 693 + }, + { + "epoch": 0.07597361723090397, + "grad_norm": 1.2613612279626847, + "learning_rate": 4.929116745028542e-05, + "loss": 0.681, + "num_input_tokens_seen": 125590976, + "step": 694 + }, + { + "epoch": 0.07608308930184186, + "grad_norm": 1.3110200167160078, + "learning_rate": 4.928913297670801e-05, + "loss": 0.7037, + "num_input_tokens_seen": 125737024, + "step": 695 + }, + { + "epoch": 0.07619256137277977, + "grad_norm": 1.257798331044637, + "learning_rate": 4.92870956297726e-05, + "loss": 0.6665, + "num_input_tokens_seen": 125925184, + "step": 696 + }, + { + "epoch": 0.07630203344371767, + "grad_norm": 1.260164892518217, + "learning_rate": 4.92850554097202e-05, + "loss": 0.6778, + "num_input_tokens_seen": 126109536, + "step": 697 + }, + { + "epoch": 0.07641150551465557, + "grad_norm": 1.338476924097404, + "learning_rate": 4.928301231679218e-05, + "loss": 0.688, + "num_input_tokens_seen": 126295904, + "step": 698 + }, + { + "epoch": 0.07652097758559348, + "grad_norm": 1.2499255635032878, + "learning_rate": 4.9280966351230226e-05, + "loss": 0.6903, + "num_input_tokens_seen": 126487424, + "step": 699 + }, + { + "epoch": 0.07663044965653137, + "grad_norm": 1.1642380140464534, + "learning_rate": 4.927891751327636e-05, + "loss": 0.6162, + "num_input_tokens_seen": 126685664, + "step": 700 + }, + { + "epoch": 0.07673992172746928, + "grad_norm": 1.3168204813255495, + "learning_rate": 4.9276865803172965e-05, + "loss": 0.8016, + "num_input_tokens_seen": 126863744, + "step": 701 + }, + { + "epoch": 0.07684939379840718, + "grad_norm": 1.3163957906698032, + "learning_rate": 4.9274811221162764e-05, + "loss": 0.858, + "num_input_tokens_seen": 127062208, + "step": 702 + }, + { + "epoch": 0.07695886586934508, + "grad_norm": 1.3982356687698347, + "learning_rate": 4.92727537674888e-05, + "loss": 0.8053, + "num_input_tokens_seen": 127258880, + "step": 703 + }, + { + "epoch": 0.07706833794028299, + "grad_norm": 1.296808355986015, + "learning_rate": 4.927069344239447e-05, + "loss": 0.8269, + "num_input_tokens_seen": 127457568, + "step": 704 + }, + { + "epoch": 0.07717781001122088, + "grad_norm": 1.159575757024653, + "learning_rate": 4.9268630246123495e-05, + "loss": 0.7908, + "num_input_tokens_seen": 127640128, + "step": 705 + }, + { + "epoch": 0.07728728208215879, + "grad_norm": 1.3826799490810884, + "learning_rate": 4.926656417891996e-05, + "loss": 0.8948, + "num_input_tokens_seen": 127787968, + "step": 706 + }, + { + "epoch": 0.07739675415309669, + "grad_norm": 1.1756282466050645, + "learning_rate": 4.926449524102826e-05, + "loss": 0.765, + "num_input_tokens_seen": 127941632, + "step": 707 + }, + { + "epoch": 0.0775062262240346, + "grad_norm": 1.2941152784891943, + "learning_rate": 4.9262423432693175e-05, + "loss": 0.6304, + "num_input_tokens_seen": 128130464, + "step": 708 + }, + { + "epoch": 0.0776156982949725, + "grad_norm": 1.1988072388222641, + "learning_rate": 4.926034875415977e-05, + "loss": 0.9715, + "num_input_tokens_seen": 128338784, + "step": 709 + }, + { + "epoch": 0.0777251703659104, + "grad_norm": 1.1402284115631782, + "learning_rate": 4.925827120567349e-05, + "loss": 0.748, + "num_input_tokens_seen": 128541280, + "step": 710 + }, + { + "epoch": 0.0778346424368483, + "grad_norm": 1.1787467388540809, + "learning_rate": 4.9256190787480104e-05, + "loss": 0.7341, + "num_input_tokens_seen": 128745792, + "step": 711 + }, + { + "epoch": 0.0779441145077862, + "grad_norm": 1.3596553732588554, + "learning_rate": 4.9254107499825705e-05, + "loss": 0.7121, + "num_input_tokens_seen": 128944928, + "step": 712 + }, + { + "epoch": 0.0780535865787241, + "grad_norm": 1.5628303178680372, + "learning_rate": 4.925202134295677e-05, + "loss": 1.073, + "num_input_tokens_seen": 129112032, + "step": 713 + }, + { + "epoch": 0.078163058649662, + "grad_norm": 1.3912120394399108, + "learning_rate": 4.924993231712006e-05, + "loss": 0.7132, + "num_input_tokens_seen": 129271072, + "step": 714 + }, + { + "epoch": 0.0782725307205999, + "grad_norm": 1.2277195812163006, + "learning_rate": 4.924784042256273e-05, + "loss": 0.7086, + "num_input_tokens_seen": 129470432, + "step": 715 + }, + { + "epoch": 0.07838200279153781, + "grad_norm": 1.2264116036478923, + "learning_rate": 4.9245745659532214e-05, + "loss": 0.708, + "num_input_tokens_seen": 129628576, + "step": 716 + }, + { + "epoch": 0.07849147486247571, + "grad_norm": 1.2093236197663673, + "learning_rate": 4.924364802827635e-05, + "loss": 0.8432, + "num_input_tokens_seen": 129841376, + "step": 717 + }, + { + "epoch": 0.07860094693341362, + "grad_norm": 1.2658401588940673, + "learning_rate": 4.924154752904326e-05, + "loss": 0.8947, + "num_input_tokens_seen": 130029536, + "step": 718 + }, + { + "epoch": 0.07871041900435151, + "grad_norm": 1.2711900408472923, + "learning_rate": 4.923944416208145e-05, + "loss": 0.7372, + "num_input_tokens_seen": 130197536, + "step": 719 + }, + { + "epoch": 0.07881989107528942, + "grad_norm": 1.148299487897358, + "learning_rate": 4.9237337927639725e-05, + "loss": 0.6077, + "num_input_tokens_seen": 130362400, + "step": 720 + }, + { + "epoch": 0.07892936314622732, + "grad_norm": 1.2702800244523906, + "learning_rate": 4.923522882596726e-05, + "loss": 0.757, + "num_input_tokens_seen": 130571840, + "step": 721 + }, + { + "epoch": 0.07903883521716522, + "grad_norm": 1.1588986448974745, + "learning_rate": 4.9233116857313554e-05, + "loss": 0.6372, + "num_input_tokens_seen": 130753952, + "step": 722 + }, + { + "epoch": 0.07914830728810313, + "grad_norm": 1.2302513778266408, + "learning_rate": 4.923100202192845e-05, + "loss": 0.5178, + "num_input_tokens_seen": 130910304, + "step": 723 + }, + { + "epoch": 0.07925777935904102, + "grad_norm": 1.1584267617472213, + "learning_rate": 4.922888432006213e-05, + "loss": 0.6108, + "num_input_tokens_seen": 131084352, + "step": 724 + }, + { + "epoch": 0.07936725142997893, + "grad_norm": 1.551549764313919, + "learning_rate": 4.922676375196511e-05, + "loss": 0.6605, + "num_input_tokens_seen": 131263776, + "step": 725 + }, + { + "epoch": 0.07947672350091683, + "grad_norm": 1.3283443494186002, + "learning_rate": 4.922464031788826e-05, + "loss": 0.7337, + "num_input_tokens_seen": 131479264, + "step": 726 + }, + { + "epoch": 0.07958619557185473, + "grad_norm": 1.2437549514288562, + "learning_rate": 4.922251401808276e-05, + "loss": 0.6106, + "num_input_tokens_seen": 131640992, + "step": 727 + }, + { + "epoch": 0.07969566764279264, + "grad_norm": 1.1289905133858236, + "learning_rate": 4.922038485280016e-05, + "loss": 0.6467, + "num_input_tokens_seen": 131836768, + "step": 728 + }, + { + "epoch": 0.07980513971373053, + "grad_norm": 1.2411957146336114, + "learning_rate": 4.921825282229233e-05, + "loss": 0.7412, + "num_input_tokens_seen": 131998272, + "step": 729 + }, + { + "epoch": 0.07991461178466844, + "grad_norm": 1.0246292048348578, + "learning_rate": 4.92161179268115e-05, + "loss": 0.54, + "num_input_tokens_seen": 132182848, + "step": 730 + }, + { + "epoch": 0.08002408385560633, + "grad_norm": 1.2843596241719786, + "learning_rate": 4.921398016661021e-05, + "loss": 0.7165, + "num_input_tokens_seen": 132351968, + "step": 731 + }, + { + "epoch": 0.08013355592654424, + "grad_norm": 1.3008935095198846, + "learning_rate": 4.9211839541941345e-05, + "loss": 0.8466, + "num_input_tokens_seen": 132549088, + "step": 732 + }, + { + "epoch": 0.08024302799748215, + "grad_norm": 1.265308670260493, + "learning_rate": 4.920969605305815e-05, + "loss": 0.7257, + "num_input_tokens_seen": 132720672, + "step": 733 + }, + { + "epoch": 0.08035250006842004, + "grad_norm": 1.3101878918151895, + "learning_rate": 4.92075497002142e-05, + "loss": 0.7737, + "num_input_tokens_seen": 132905024, + "step": 734 + }, + { + "epoch": 0.08046197213935795, + "grad_norm": 1.0475730845918771, + "learning_rate": 4.92054004836634e-05, + "loss": 0.5127, + "num_input_tokens_seen": 133055552, + "step": 735 + }, + { + "epoch": 0.08057144421029584, + "grad_norm": 1.186945836111294, + "learning_rate": 4.920324840365998e-05, + "loss": 0.5494, + "num_input_tokens_seen": 133226464, + "step": 736 + }, + { + "epoch": 0.08068091628123375, + "grad_norm": 1.0890877675610215, + "learning_rate": 4.9201093460458555e-05, + "loss": 0.6358, + "num_input_tokens_seen": 133423584, + "step": 737 + }, + { + "epoch": 0.08079038835217166, + "grad_norm": 1.26387161492268, + "learning_rate": 4.9198935654314036e-05, + "loss": 0.7337, + "num_input_tokens_seen": 133594048, + "step": 738 + }, + { + "epoch": 0.08089986042310955, + "grad_norm": 1.2055254682447782, + "learning_rate": 4.919677498548169e-05, + "loss": 0.6787, + "num_input_tokens_seen": 133781312, + "step": 739 + }, + { + "epoch": 0.08100933249404746, + "grad_norm": 1.4923391215201003, + "learning_rate": 4.9194611454217124e-05, + "loss": 0.9397, + "num_input_tokens_seen": 133964096, + "step": 740 + }, + { + "epoch": 0.08111880456498535, + "grad_norm": 1.3352677746796564, + "learning_rate": 4.9192445060776264e-05, + "loss": 0.7926, + "num_input_tokens_seen": 134168160, + "step": 741 + }, + { + "epoch": 0.08122827663592326, + "grad_norm": 1.2615119599279265, + "learning_rate": 4.919027580541541e-05, + "loss": 0.7434, + "num_input_tokens_seen": 134362144, + "step": 742 + }, + { + "epoch": 0.08133774870686117, + "grad_norm": 1.4338579895803654, + "learning_rate": 4.918810368839117e-05, + "loss": 0.8817, + "num_input_tokens_seen": 134545600, + "step": 743 + }, + { + "epoch": 0.08144722077779906, + "grad_norm": 1.1648333256726455, + "learning_rate": 4.91859287099605e-05, + "loss": 0.6353, + "num_input_tokens_seen": 134693216, + "step": 744 + }, + { + "epoch": 0.08155669284873697, + "grad_norm": 1.0726908967826114, + "learning_rate": 4.9183750870380704e-05, + "loss": 0.5205, + "num_input_tokens_seen": 134882048, + "step": 745 + }, + { + "epoch": 0.08166616491967486, + "grad_norm": 1.3234200558644338, + "learning_rate": 4.918157016990941e-05, + "loss": 1.1545, + "num_input_tokens_seen": 135092608, + "step": 746 + }, + { + "epoch": 0.08177563699061277, + "grad_norm": 1.2334515461243938, + "learning_rate": 4.917938660880459e-05, + "loss": 0.8898, + "num_input_tokens_seen": 135255008, + "step": 747 + }, + { + "epoch": 0.08188510906155068, + "grad_norm": 1.2840966955715916, + "learning_rate": 4.9177200187324556e-05, + "loss": 0.7135, + "num_input_tokens_seen": 135431296, + "step": 748 + }, + { + "epoch": 0.08199458113248857, + "grad_norm": 1.191942988328398, + "learning_rate": 4.917501090572797e-05, + "loss": 0.6785, + "num_input_tokens_seen": 135585184, + "step": 749 + }, + { + "epoch": 0.08210405320342648, + "grad_norm": 1.1738948453960796, + "learning_rate": 4.91728187642738e-05, + "loss": 0.6401, + "num_input_tokens_seen": 135749152, + "step": 750 + }, + { + "epoch": 0.08221352527436437, + "grad_norm": 1.1733253151085008, + "learning_rate": 4.917062376322138e-05, + "loss": 0.6404, + "num_input_tokens_seen": 135965312, + "step": 751 + }, + { + "epoch": 0.08232299734530228, + "grad_norm": 1.4027982522500546, + "learning_rate": 4.916842590283037e-05, + "loss": 0.9701, + "num_input_tokens_seen": 136155936, + "step": 752 + }, + { + "epoch": 0.08243246941624018, + "grad_norm": 1.2911566629862228, + "learning_rate": 4.916622518336079e-05, + "loss": 0.8394, + "num_input_tokens_seen": 136329088, + "step": 753 + }, + { + "epoch": 0.08254194148717808, + "grad_norm": 1.3042369224385537, + "learning_rate": 4.916402160507296e-05, + "loss": 0.9354, + "num_input_tokens_seen": 136530240, + "step": 754 + }, + { + "epoch": 0.08265141355811599, + "grad_norm": 1.333643097458927, + "learning_rate": 4.9161815168227576e-05, + "loss": 0.6836, + "num_input_tokens_seen": 136715264, + "step": 755 + }, + { + "epoch": 0.08276088562905388, + "grad_norm": 1.3516139202589865, + "learning_rate": 4.915960587308564e-05, + "loss": 0.6397, + "num_input_tokens_seen": 136866688, + "step": 756 + }, + { + "epoch": 0.08287035769999179, + "grad_norm": 1.186535845405159, + "learning_rate": 4.915739371990852e-05, + "loss": 0.5921, + "num_input_tokens_seen": 137036256, + "step": 757 + }, + { + "epoch": 0.08297982977092969, + "grad_norm": 1.3825563096483446, + "learning_rate": 4.9155178708957896e-05, + "loss": 0.6946, + "num_input_tokens_seen": 137239424, + "step": 758 + }, + { + "epoch": 0.0830893018418676, + "grad_norm": 1.588440371759743, + "learning_rate": 4.915296084049582e-05, + "loss": 0.9498, + "num_input_tokens_seen": 137408992, + "step": 759 + }, + { + "epoch": 0.0831987739128055, + "grad_norm": 1.243834261699686, + "learning_rate": 4.915074011478463e-05, + "loss": 0.673, + "num_input_tokens_seen": 137578112, + "step": 760 + }, + { + "epoch": 0.0833082459837434, + "grad_norm": 1.1729087339215427, + "learning_rate": 4.914851653208707e-05, + "loss": 0.7599, + "num_input_tokens_seen": 137744992, + "step": 761 + }, + { + "epoch": 0.0834177180546813, + "grad_norm": 1.0741041642543987, + "learning_rate": 4.9146290092666163e-05, + "loss": 0.6717, + "num_input_tokens_seen": 137918144, + "step": 762 + }, + { + "epoch": 0.0835271901256192, + "grad_norm": 1.1300925387410379, + "learning_rate": 4.91440607967853e-05, + "loss": 0.6558, + "num_input_tokens_seen": 138106080, + "step": 763 + }, + { + "epoch": 0.0836366621965571, + "grad_norm": 1.2267573499504083, + "learning_rate": 4.91418286447082e-05, + "loss": 0.5753, + "num_input_tokens_seen": 138282144, + "step": 764 + }, + { + "epoch": 0.08374613426749501, + "grad_norm": 1.277546857732223, + "learning_rate": 4.913959363669892e-05, + "loss": 0.7834, + "num_input_tokens_seen": 138457760, + "step": 765 + }, + { + "epoch": 0.0838556063384329, + "grad_norm": 1.324788154005103, + "learning_rate": 4.9137355773021856e-05, + "loss": 0.8654, + "num_input_tokens_seen": 138656672, + "step": 766 + }, + { + "epoch": 0.08396507840937081, + "grad_norm": 1.206818250115041, + "learning_rate": 4.913511505394175e-05, + "loss": 0.7416, + "num_input_tokens_seen": 138854016, + "step": 767 + }, + { + "epoch": 0.08407455048030871, + "grad_norm": 1.2401822777566882, + "learning_rate": 4.9132871479723675e-05, + "loss": 0.6422, + "num_input_tokens_seen": 139026048, + "step": 768 + }, + { + "epoch": 0.08418402255124662, + "grad_norm": 1.2560074927993379, + "learning_rate": 4.9130625050633036e-05, + "loss": 0.8059, + "num_input_tokens_seen": 139206592, + "step": 769 + }, + { + "epoch": 0.08429349462218451, + "grad_norm": 1.2427999078478624, + "learning_rate": 4.912837576693559e-05, + "loss": 0.7513, + "num_input_tokens_seen": 139348384, + "step": 770 + }, + { + "epoch": 0.08440296669312242, + "grad_norm": 1.443966047627523, + "learning_rate": 4.9126123628897406e-05, + "loss": 0.8439, + "num_input_tokens_seen": 139513024, + "step": 771 + }, + { + "epoch": 0.08451243876406032, + "grad_norm": 1.2563678192762369, + "learning_rate": 4.912386863678492e-05, + "loss": 0.6829, + "num_input_tokens_seen": 139649888, + "step": 772 + }, + { + "epoch": 0.08462191083499822, + "grad_norm": 1.419291654194922, + "learning_rate": 4.91216107908649e-05, + "loss": 0.9824, + "num_input_tokens_seen": 139833120, + "step": 773 + }, + { + "epoch": 0.08473138290593613, + "grad_norm": 1.289219008683699, + "learning_rate": 4.911935009140443e-05, + "loss": 0.7618, + "num_input_tokens_seen": 140047040, + "step": 774 + }, + { + "epoch": 0.08484085497687402, + "grad_norm": 1.268298821491457, + "learning_rate": 4.911708653867095e-05, + "loss": 0.9318, + "num_input_tokens_seen": 140231840, + "step": 775 + }, + { + "epoch": 0.08495032704781193, + "grad_norm": 1.2539066302812898, + "learning_rate": 4.911482013293224e-05, + "loss": 0.768, + "num_input_tokens_seen": 140409472, + "step": 776 + }, + { + "epoch": 0.08505979911874983, + "grad_norm": 1.422615696739116, + "learning_rate": 4.91125508744564e-05, + "loss": 0.8267, + "num_input_tokens_seen": 140611744, + "step": 777 + }, + { + "epoch": 0.08516927118968773, + "grad_norm": 1.27614072557913, + "learning_rate": 4.9110278763511897e-05, + "loss": 0.8866, + "num_input_tokens_seen": 140774368, + "step": 778 + }, + { + "epoch": 0.08527874326062564, + "grad_norm": 1.1423420458397293, + "learning_rate": 4.910800380036751e-05, + "loss": 0.6919, + "num_input_tokens_seen": 140976416, + "step": 779 + }, + { + "epoch": 0.08538821533156353, + "grad_norm": 1.212614313616261, + "learning_rate": 4.910572598529235e-05, + "loss": 0.6341, + "num_input_tokens_seen": 141143296, + "step": 780 + }, + { + "epoch": 0.08549768740250144, + "grad_norm": 1.1090653215610515, + "learning_rate": 4.910344531855589e-05, + "loss": 0.5611, + "num_input_tokens_seen": 141325632, + "step": 781 + }, + { + "epoch": 0.08560715947343935, + "grad_norm": 1.1316794681433646, + "learning_rate": 4.910116180042793e-05, + "loss": 0.7065, + "num_input_tokens_seen": 141512896, + "step": 782 + }, + { + "epoch": 0.08571663154437724, + "grad_norm": 1.2812186446656335, + "learning_rate": 4.90988754311786e-05, + "loss": 0.6342, + "num_input_tokens_seen": 141688512, + "step": 783 + }, + { + "epoch": 0.08582610361531515, + "grad_norm": 1.353860566193388, + "learning_rate": 4.9096586211078376e-05, + "loss": 0.615, + "num_input_tokens_seen": 141845536, + "step": 784 + }, + { + "epoch": 0.08593557568625304, + "grad_norm": 1.146457548331264, + "learning_rate": 4.9094294140398075e-05, + "loss": 0.7083, + "num_input_tokens_seen": 142004800, + "step": 785 + }, + { + "epoch": 0.08604504775719095, + "grad_norm": 1.2460992029722477, + "learning_rate": 4.909199921940883e-05, + "loss": 0.6695, + "num_input_tokens_seen": 142201472, + "step": 786 + }, + { + "epoch": 0.08615451982812884, + "grad_norm": 1.4336069714510806, + "learning_rate": 4.908970144838214e-05, + "loss": 0.9538, + "num_input_tokens_seen": 142384256, + "step": 787 + }, + { + "epoch": 0.08626399189906675, + "grad_norm": 1.4821857570933117, + "learning_rate": 4.9087400827589814e-05, + "loss": 0.9346, + "num_input_tokens_seen": 142551360, + "step": 788 + }, + { + "epoch": 0.08637346397000466, + "grad_norm": 1.2529948597442577, + "learning_rate": 4.908509735730402e-05, + "loss": 0.7631, + "num_input_tokens_seen": 142762816, + "step": 789 + }, + { + "epoch": 0.08648293604094255, + "grad_norm": 1.2034761486122711, + "learning_rate": 4.908279103779725e-05, + "loss": 0.6486, + "num_input_tokens_seen": 142943360, + "step": 790 + }, + { + "epoch": 0.08659240811188046, + "grad_norm": 1.2698685887450223, + "learning_rate": 4.908048186934234e-05, + "loss": 0.7172, + "num_input_tokens_seen": 143127040, + "step": 791 + }, + { + "epoch": 0.08670188018281835, + "grad_norm": 1.1946912160661454, + "learning_rate": 4.9078169852212454e-05, + "loss": 0.7696, + "num_input_tokens_seen": 143330880, + "step": 792 + }, + { + "epoch": 0.08681135225375626, + "grad_norm": 1.4363763813028423, + "learning_rate": 4.907585498668111e-05, + "loss": 0.81, + "num_input_tokens_seen": 143491264, + "step": 793 + }, + { + "epoch": 0.08692082432469417, + "grad_norm": 1.211931817157542, + "learning_rate": 4.907353727302214e-05, + "loss": 0.6442, + "num_input_tokens_seen": 143648288, + "step": 794 + }, + { + "epoch": 0.08703029639563206, + "grad_norm": 1.1848248918137543, + "learning_rate": 4.907121671150974e-05, + "loss": 0.5882, + "num_input_tokens_seen": 143825696, + "step": 795 + }, + { + "epoch": 0.08713976846656997, + "grad_norm": 1.2124176823309833, + "learning_rate": 4.906889330241842e-05, + "loss": 0.627, + "num_input_tokens_seen": 143986080, + "step": 796 + }, + { + "epoch": 0.08724924053750786, + "grad_norm": 1.1876417878480787, + "learning_rate": 4.9066567046023025e-05, + "loss": 0.8028, + "num_input_tokens_seen": 144177824, + "step": 797 + }, + { + "epoch": 0.08735871260844577, + "grad_norm": 1.3504895046889795, + "learning_rate": 4.906423794259876e-05, + "loss": 0.879, + "num_input_tokens_seen": 144341792, + "step": 798 + }, + { + "epoch": 0.08746818467938368, + "grad_norm": 1.1809497673727962, + "learning_rate": 4.906190599242115e-05, + "loss": 0.8659, + "num_input_tokens_seen": 144533984, + "step": 799 + }, + { + "epoch": 0.08757765675032157, + "grad_norm": 1.2484174724399963, + "learning_rate": 4.9059571195766066e-05, + "loss": 0.8295, + "num_input_tokens_seen": 144721024, + "step": 800 + }, + { + "epoch": 0.08768712882125948, + "grad_norm": 1.1612614976623759, + "learning_rate": 4.90572335529097e-05, + "loss": 0.6388, + "num_input_tokens_seen": 144908288, + "step": 801 + }, + { + "epoch": 0.08779660089219737, + "grad_norm": 1.2768293739117702, + "learning_rate": 4.9054893064128584e-05, + "loss": 0.5966, + "num_input_tokens_seen": 145097792, + "step": 802 + }, + { + "epoch": 0.08790607296313528, + "grad_norm": 1.1640415573796425, + "learning_rate": 4.905254972969962e-05, + "loss": 0.6211, + "num_input_tokens_seen": 145303424, + "step": 803 + }, + { + "epoch": 0.08801554503407318, + "grad_norm": 1.2617221513353922, + "learning_rate": 4.9050203549899984e-05, + "loss": 0.7511, + "num_input_tokens_seen": 145508160, + "step": 804 + }, + { + "epoch": 0.08812501710501108, + "grad_norm": 1.3907638651917673, + "learning_rate": 4.904785452500726e-05, + "loss": 0.733, + "num_input_tokens_seen": 145673920, + "step": 805 + }, + { + "epoch": 0.08823448917594899, + "grad_norm": 1.2671817577151991, + "learning_rate": 4.904550265529932e-05, + "loss": 0.6224, + "num_input_tokens_seen": 145849760, + "step": 806 + }, + { + "epoch": 0.08834396124688688, + "grad_norm": 1.268424904801417, + "learning_rate": 4.904314794105437e-05, + "loss": 0.798, + "num_input_tokens_seen": 146045536, + "step": 807 + }, + { + "epoch": 0.08845343331782479, + "grad_norm": 1.379289611607141, + "learning_rate": 4.9040790382550985e-05, + "loss": 0.8995, + "num_input_tokens_seen": 146232576, + "step": 808 + }, + { + "epoch": 0.08856290538876269, + "grad_norm": 1.2280165685101323, + "learning_rate": 4.903842998006806e-05, + "loss": 0.6881, + "num_input_tokens_seen": 146427456, + "step": 809 + }, + { + "epoch": 0.0886723774597006, + "grad_norm": 1.1547320428291532, + "learning_rate": 4.903606673388482e-05, + "loss": 0.7038, + "num_input_tokens_seen": 146614048, + "step": 810 + }, + { + "epoch": 0.0887818495306385, + "grad_norm": 1.47495630490513, + "learning_rate": 4.903370064428083e-05, + "loss": 1.0258, + "num_input_tokens_seen": 146790784, + "step": 811 + }, + { + "epoch": 0.0888913216015764, + "grad_norm": 1.3454314512683099, + "learning_rate": 4.903133171153601e-05, + "loss": 0.9551, + "num_input_tokens_seen": 146999328, + "step": 812 + }, + { + "epoch": 0.0890007936725143, + "grad_norm": 1.1417099482738813, + "learning_rate": 4.902895993593058e-05, + "loss": 0.5975, + "num_input_tokens_seen": 147183904, + "step": 813 + }, + { + "epoch": 0.0891102657434522, + "grad_norm": 1.210804741257785, + "learning_rate": 4.902658531774512e-05, + "loss": 0.6499, + "num_input_tokens_seen": 147375648, + "step": 814 + }, + { + "epoch": 0.0892197378143901, + "grad_norm": 1.263113284578123, + "learning_rate": 4.902420785726056e-05, + "loss": 0.8148, + "num_input_tokens_seen": 147548128, + "step": 815 + }, + { + "epoch": 0.08932920988532801, + "grad_norm": 1.2665833506455015, + "learning_rate": 4.902182755475813e-05, + "loss": 0.8289, + "num_input_tokens_seen": 147737856, + "step": 816 + }, + { + "epoch": 0.0894386819562659, + "grad_norm": 1.382550058407096, + "learning_rate": 4.9019444410519425e-05, + "loss": 0.9015, + "num_input_tokens_seen": 147918400, + "step": 817 + }, + { + "epoch": 0.08954815402720381, + "grad_norm": 1.2752201731696762, + "learning_rate": 4.9017058424826366e-05, + "loss": 0.6044, + "num_input_tokens_seen": 148081024, + "step": 818 + }, + { + "epoch": 0.08965762609814171, + "grad_norm": 1.0831481457420262, + "learning_rate": 4.901466959796121e-05, + "loss": 0.6394, + "num_input_tokens_seen": 148276352, + "step": 819 + }, + { + "epoch": 0.08976709816907961, + "grad_norm": 1.2932123605857773, + "learning_rate": 4.9012277930206536e-05, + "loss": 0.7425, + "num_input_tokens_seen": 148465632, + "step": 820 + }, + { + "epoch": 0.08987657024001751, + "grad_norm": 1.3096755836725669, + "learning_rate": 4.900988342184529e-05, + "loss": 0.7632, + "num_input_tokens_seen": 148653120, + "step": 821 + }, + { + "epoch": 0.08998604231095542, + "grad_norm": 1.3152016581541128, + "learning_rate": 4.9007486073160746e-05, + "loss": 0.6866, + "num_input_tokens_seen": 148813952, + "step": 822 + }, + { + "epoch": 0.09009551438189332, + "grad_norm": 1.3537884061654197, + "learning_rate": 4.900508588443649e-05, + "loss": 0.707, + "num_input_tokens_seen": 148983744, + "step": 823 + }, + { + "epoch": 0.09020498645283122, + "grad_norm": 1.3651076709634902, + "learning_rate": 4.900268285595645e-05, + "loss": 0.7952, + "num_input_tokens_seen": 149200800, + "step": 824 + }, + { + "epoch": 0.09031445852376913, + "grad_norm": 1.3318919894899437, + "learning_rate": 4.9000276988004925e-05, + "loss": 0.6926, + "num_input_tokens_seen": 149385824, + "step": 825 + }, + { + "epoch": 0.09042393059470702, + "grad_norm": 1.341742946201787, + "learning_rate": 4.899786828086651e-05, + "loss": 1.0813, + "num_input_tokens_seen": 149601984, + "step": 826 + }, + { + "epoch": 0.09053340266564493, + "grad_norm": 1.5622520250004222, + "learning_rate": 4.899545673482616e-05, + "loss": 0.7668, + "num_input_tokens_seen": 149744672, + "step": 827 + }, + { + "epoch": 0.09064287473658283, + "grad_norm": 1.4232415978367792, + "learning_rate": 4.8993042350169145e-05, + "loss": 0.8394, + "num_input_tokens_seen": 149964192, + "step": 828 + }, + { + "epoch": 0.09075234680752073, + "grad_norm": 1.2662216004842508, + "learning_rate": 4.899062512718109e-05, + "loss": 0.6735, + "num_input_tokens_seen": 150138912, + "step": 829 + }, + { + "epoch": 0.09086181887845864, + "grad_norm": 1.3164025371736865, + "learning_rate": 4.898820506614794e-05, + "loss": 0.6986, + "num_input_tokens_seen": 150272864, + "step": 830 + }, + { + "epoch": 0.09097129094939653, + "grad_norm": 1.2757134722653818, + "learning_rate": 4.898578216735599e-05, + "loss": 0.8561, + "num_input_tokens_seen": 150478496, + "step": 831 + }, + { + "epoch": 0.09108076302033444, + "grad_norm": 1.2431544253327116, + "learning_rate": 4.8983356431091864e-05, + "loss": 0.7374, + "num_input_tokens_seen": 150650304, + "step": 832 + }, + { + "epoch": 0.09119023509127235, + "grad_norm": 1.358872550975483, + "learning_rate": 4.8980927857642514e-05, + "loss": 0.7869, + "num_input_tokens_seen": 150804192, + "step": 833 + }, + { + "epoch": 0.09129970716221024, + "grad_norm": 1.1919895432736831, + "learning_rate": 4.897849644729525e-05, + "loss": 0.6249, + "num_input_tokens_seen": 150946208, + "step": 834 + }, + { + "epoch": 0.09140917923314815, + "grad_norm": 1.2930218089300873, + "learning_rate": 4.8976062200337695e-05, + "loss": 0.5683, + "num_input_tokens_seen": 151109952, + "step": 835 + }, + { + "epoch": 0.09151865130408604, + "grad_norm": 1.135807681634811, + "learning_rate": 4.897362511705781e-05, + "loss": 0.7016, + "num_input_tokens_seen": 151293856, + "step": 836 + }, + { + "epoch": 0.09162812337502395, + "grad_norm": 1.236315879339285, + "learning_rate": 4.897118519774391e-05, + "loss": 0.7584, + "num_input_tokens_seen": 151474848, + "step": 837 + }, + { + "epoch": 0.09173759544596186, + "grad_norm": 1.2983191848264728, + "learning_rate": 4.8968742442684625e-05, + "loss": 0.873, + "num_input_tokens_seen": 151634112, + "step": 838 + }, + { + "epoch": 0.09184706751689975, + "grad_norm": 1.3008830629203165, + "learning_rate": 4.896629685216892e-05, + "loss": 0.7044, + "num_input_tokens_seen": 151794048, + "step": 839 + }, + { + "epoch": 0.09195653958783766, + "grad_norm": 1.1253238847533245, + "learning_rate": 4.896384842648612e-05, + "loss": 0.5683, + "num_input_tokens_seen": 151947936, + "step": 840 + }, + { + "epoch": 0.09206601165877555, + "grad_norm": 1.2186873493230679, + "learning_rate": 4.8961397165925874e-05, + "loss": 0.6199, + "num_input_tokens_seen": 152135648, + "step": 841 + }, + { + "epoch": 0.09217548372971346, + "grad_norm": 1.1402029536937175, + "learning_rate": 4.895894307077814e-05, + "loss": 0.6604, + "num_input_tokens_seen": 152325152, + "step": 842 + }, + { + "epoch": 0.09228495580065135, + "grad_norm": 1.3229138632116375, + "learning_rate": 4.895648614133324e-05, + "loss": 0.7524, + "num_input_tokens_seen": 152524512, + "step": 843 + }, + { + "epoch": 0.09239442787158926, + "grad_norm": 1.3356570568530768, + "learning_rate": 4.895402637788183e-05, + "loss": 0.6847, + "num_input_tokens_seen": 152703936, + "step": 844 + }, + { + "epoch": 0.09250389994252717, + "grad_norm": 1.182628174785098, + "learning_rate": 4.895156378071489e-05, + "loss": 0.6819, + "num_input_tokens_seen": 152910688, + "step": 845 + }, + { + "epoch": 0.09261337201346506, + "grad_norm": 1.3630595641803553, + "learning_rate": 4.894909835012374e-05, + "loss": 0.7543, + "num_input_tokens_seen": 153072416, + "step": 846 + }, + { + "epoch": 0.09272284408440297, + "grad_norm": 1.1947005456945063, + "learning_rate": 4.894663008640004e-05, + "loss": 0.5573, + "num_input_tokens_seen": 153241088, + "step": 847 + }, + { + "epoch": 0.09283231615534086, + "grad_norm": 1.3869286176881777, + "learning_rate": 4.894415898983578e-05, + "loss": 0.8251, + "num_input_tokens_seen": 153431264, + "step": 848 + }, + { + "epoch": 0.09294178822627877, + "grad_norm": 1.258401199851704, + "learning_rate": 4.894168506072329e-05, + "loss": 0.6088, + "num_input_tokens_seen": 153616064, + "step": 849 + }, + { + "epoch": 0.09305126029721668, + "grad_norm": 1.1125934282997818, + "learning_rate": 4.8939208299355215e-05, + "loss": 0.7778, + "num_input_tokens_seen": 153795936, + "step": 850 + }, + { + "epoch": 0.09316073236815457, + "grad_norm": 1.2507231527890117, + "learning_rate": 4.893672870602457e-05, + "loss": 0.659, + "num_input_tokens_seen": 153950272, + "step": 851 + }, + { + "epoch": 0.09327020443909248, + "grad_norm": 1.2999321626423141, + "learning_rate": 4.893424628102468e-05, + "loss": 0.7125, + "num_input_tokens_seen": 154124544, + "step": 852 + }, + { + "epoch": 0.09337967651003037, + "grad_norm": 1.2413904764026746, + "learning_rate": 4.8931761024649206e-05, + "loss": 0.706, + "num_input_tokens_seen": 154330176, + "step": 853 + }, + { + "epoch": 0.09348914858096828, + "grad_norm": 1.4672027717096154, + "learning_rate": 4.8929272937192147e-05, + "loss": 0.8021, + "num_input_tokens_seen": 154488096, + "step": 854 + }, + { + "epoch": 0.09359862065190619, + "grad_norm": 1.5036540030440384, + "learning_rate": 4.892678201894785e-05, + "loss": 0.8338, + "num_input_tokens_seen": 154652512, + "step": 855 + }, + { + "epoch": 0.09370809272284408, + "grad_norm": 1.3196280697433382, + "learning_rate": 4.892428827021098e-05, + "loss": 0.7688, + "num_input_tokens_seen": 154825216, + "step": 856 + }, + { + "epoch": 0.09381756479378199, + "grad_norm": 1.2182167549474137, + "learning_rate": 4.892179169127654e-05, + "loss": 0.6916, + "num_input_tokens_seen": 155017184, + "step": 857 + }, + { + "epoch": 0.09392703686471988, + "grad_norm": 1.3375957526041429, + "learning_rate": 4.891929228243988e-05, + "loss": 0.7637, + "num_input_tokens_seen": 155211168, + "step": 858 + }, + { + "epoch": 0.09403650893565779, + "grad_norm": 1.2909056000564647, + "learning_rate": 4.8916790043996665e-05, + "loss": 0.6796, + "num_input_tokens_seen": 155384320, + "step": 859 + }, + { + "epoch": 0.09414598100659569, + "grad_norm": 1.1745247447889384, + "learning_rate": 4.891428497624291e-05, + "loss": 0.5568, + "num_input_tokens_seen": 155527680, + "step": 860 + }, + { + "epoch": 0.0942554530775336, + "grad_norm": 1.3367470413326512, + "learning_rate": 4.891177707947496e-05, + "loss": 0.6695, + "num_input_tokens_seen": 155728384, + "step": 861 + }, + { + "epoch": 0.0943649251484715, + "grad_norm": 1.3051381960222632, + "learning_rate": 4.890926635398949e-05, + "loss": 0.6203, + "num_input_tokens_seen": 155873984, + "step": 862 + }, + { + "epoch": 0.0944743972194094, + "grad_norm": 1.2545543196173956, + "learning_rate": 4.890675280008352e-05, + "loss": 0.7613, + "num_input_tokens_seen": 156072672, + "step": 863 + }, + { + "epoch": 0.0945838692903473, + "grad_norm": 1.3425484985598892, + "learning_rate": 4.8904236418054395e-05, + "loss": 0.7572, + "num_input_tokens_seen": 156239552, + "step": 864 + }, + { + "epoch": 0.0946933413612852, + "grad_norm": 1.1401662557235295, + "learning_rate": 4.890171720819979e-05, + "loss": 0.6268, + "num_input_tokens_seen": 156433984, + "step": 865 + }, + { + "epoch": 0.0948028134322231, + "grad_norm": 1.3519354509654928, + "learning_rate": 4.889919517081775e-05, + "loss": 0.8435, + "num_input_tokens_seen": 156588992, + "step": 866 + }, + { + "epoch": 0.09491228550316101, + "grad_norm": 1.3728937576458784, + "learning_rate": 4.889667030620659e-05, + "loss": 0.8546, + "num_input_tokens_seen": 156771776, + "step": 867 + }, + { + "epoch": 0.0950217575740989, + "grad_norm": 1.3846705825106247, + "learning_rate": 4.889414261466503e-05, + "loss": 0.8201, + "num_input_tokens_seen": 156948512, + "step": 868 + }, + { + "epoch": 0.09513122964503681, + "grad_norm": 1.1584287684162742, + "learning_rate": 4.8891612096492066e-05, + "loss": 0.553, + "num_input_tokens_seen": 157098816, + "step": 869 + }, + { + "epoch": 0.09524070171597471, + "grad_norm": 1.1247287529208052, + "learning_rate": 4.8889078751987074e-05, + "loss": 0.5557, + "num_input_tokens_seen": 157295712, + "step": 870 + }, + { + "epoch": 0.09535017378691261, + "grad_norm": 1.3104988464559975, + "learning_rate": 4.8886542581449726e-05, + "loss": 0.7024, + "num_input_tokens_seen": 157470656, + "step": 871 + }, + { + "epoch": 0.09545964585785052, + "grad_norm": 1.2277290967972865, + "learning_rate": 4.8884003585180053e-05, + "loss": 0.7516, + "num_input_tokens_seen": 157678304, + "step": 872 + }, + { + "epoch": 0.09556911792878842, + "grad_norm": 1.2719484762576951, + "learning_rate": 4.888146176347842e-05, + "loss": 0.7469, + "num_input_tokens_seen": 157853024, + "step": 873 + }, + { + "epoch": 0.09567858999972632, + "grad_norm": 1.2988070149247817, + "learning_rate": 4.8878917116645514e-05, + "loss": 0.7961, + "num_input_tokens_seen": 158029088, + "step": 874 + }, + { + "epoch": 0.09578806207066422, + "grad_norm": 1.5410966162919377, + "learning_rate": 4.887636964498236e-05, + "loss": 0.9732, + "num_input_tokens_seen": 158202016, + "step": 875 + }, + { + "epoch": 0.09589753414160213, + "grad_norm": 1.2235468341794697, + "learning_rate": 4.887381934879032e-05, + "loss": 0.605, + "num_input_tokens_seen": 158357248, + "step": 876 + }, + { + "epoch": 0.09600700621254002, + "grad_norm": 1.086006327032834, + "learning_rate": 4.887126622837109e-05, + "loss": 0.624, + "num_input_tokens_seen": 158556384, + "step": 877 + }, + { + "epoch": 0.09611647828347793, + "grad_norm": 1.5304641264007814, + "learning_rate": 4.88687102840267e-05, + "loss": 0.7061, + "num_input_tokens_seen": 158700864, + "step": 878 + }, + { + "epoch": 0.09622595035441583, + "grad_norm": 1.2213667141636533, + "learning_rate": 4.886615151605951e-05, + "loss": 0.8227, + "num_input_tokens_seen": 158870432, + "step": 879 + }, + { + "epoch": 0.09633542242535373, + "grad_norm": 1.2929751116547141, + "learning_rate": 4.886358992477222e-05, + "loss": 0.738, + "num_input_tokens_seen": 159068896, + "step": 880 + }, + { + "epoch": 0.09644489449629164, + "grad_norm": 1.2866709864169987, + "learning_rate": 4.886102551046786e-05, + "loss": 0.6958, + "num_input_tokens_seen": 159242048, + "step": 881 + }, + { + "epoch": 0.09655436656722953, + "grad_norm": 1.1535300145735674, + "learning_rate": 4.8858458273449806e-05, + "loss": 0.5828, + "num_input_tokens_seen": 159412064, + "step": 882 + }, + { + "epoch": 0.09666383863816744, + "grad_norm": 1.2175678486492871, + "learning_rate": 4.885588821402174e-05, + "loss": 0.898, + "num_input_tokens_seen": 159625312, + "step": 883 + }, + { + "epoch": 0.09677331070910535, + "grad_norm": 1.3318796379191193, + "learning_rate": 4.88533153324877e-05, + "loss": 0.8381, + "num_input_tokens_seen": 159813472, + "step": 884 + }, + { + "epoch": 0.09688278278004324, + "grad_norm": 1.0756042977645486, + "learning_rate": 4.885073962915207e-05, + "loss": 0.555, + "num_input_tokens_seen": 160008800, + "step": 885 + }, + { + "epoch": 0.09699225485098115, + "grad_norm": 1.3002185692812374, + "learning_rate": 4.8848161104319525e-05, + "loss": 0.6893, + "num_input_tokens_seen": 160192704, + "step": 886 + }, + { + "epoch": 0.09710172692191904, + "grad_norm": 1.2672096034143663, + "learning_rate": 4.8845579758295114e-05, + "loss": 0.6842, + "num_input_tokens_seen": 160349280, + "step": 887 + }, + { + "epoch": 0.09721119899285695, + "grad_norm": 1.2505971435921774, + "learning_rate": 4.88429955913842e-05, + "loss": 0.8236, + "num_input_tokens_seen": 160572608, + "step": 888 + }, + { + "epoch": 0.09732067106379486, + "grad_norm": 1.3572855709443572, + "learning_rate": 4.8840408603892495e-05, + "loss": 0.7238, + "num_input_tokens_seen": 160731648, + "step": 889 + }, + { + "epoch": 0.09743014313473275, + "grad_norm": 1.3190747390031055, + "learning_rate": 4.883781879612602e-05, + "loss": 0.7868, + "num_input_tokens_seen": 160928992, + "step": 890 + }, + { + "epoch": 0.09753961520567066, + "grad_norm": 1.3137162340871882, + "learning_rate": 4.883522616839116e-05, + "loss": 0.8868, + "num_input_tokens_seen": 161118720, + "step": 891 + }, + { + "epoch": 0.09764908727660855, + "grad_norm": 1.4073180487263695, + "learning_rate": 4.88326307209946e-05, + "loss": 0.6829, + "num_input_tokens_seen": 161288736, + "step": 892 + }, + { + "epoch": 0.09775855934754646, + "grad_norm": 1.2139060765341296, + "learning_rate": 4.883003245424339e-05, + "loss": 0.6839, + "num_input_tokens_seen": 161476224, + "step": 893 + }, + { + "epoch": 0.09786803141848435, + "grad_norm": 1.2518908980641335, + "learning_rate": 4.8827431368444896e-05, + "loss": 0.879, + "num_input_tokens_seen": 161682080, + "step": 894 + }, + { + "epoch": 0.09797750348942226, + "grad_norm": 1.380787762588453, + "learning_rate": 4.882482746390682e-05, + "loss": 0.9603, + "num_input_tokens_seen": 161846720, + "step": 895 + }, + { + "epoch": 0.09808697556036017, + "grad_norm": 1.2537833655682205, + "learning_rate": 4.8822220740937195e-05, + "loss": 0.6528, + "num_input_tokens_seen": 161958720, + "step": 896 + }, + { + "epoch": 0.09819644763129806, + "grad_norm": 1.129217718643049, + "learning_rate": 4.8819611199844406e-05, + "loss": 0.7086, + "num_input_tokens_seen": 162172416, + "step": 897 + }, + { + "epoch": 0.09830591970223597, + "grad_norm": 1.3460339498050224, + "learning_rate": 4.881699884093715e-05, + "loss": 0.8896, + "num_input_tokens_seen": 162360800, + "step": 898 + }, + { + "epoch": 0.09841539177317386, + "grad_norm": 1.2008561378412044, + "learning_rate": 4.881438366452446e-05, + "loss": 0.7655, + "num_input_tokens_seen": 162525216, + "step": 899 + }, + { + "epoch": 0.09852486384411177, + "grad_norm": 1.3613339920683738, + "learning_rate": 4.88117656709157e-05, + "loss": 0.9436, + "num_input_tokens_seen": 162717184, + "step": 900 + }, + { + "epoch": 0.09863433591504968, + "grad_norm": 1.3329206612850972, + "learning_rate": 4.880914486042059e-05, + "loss": 0.6904, + "num_input_tokens_seen": 162904000, + "step": 901 + }, + { + "epoch": 0.09874380798598757, + "grad_norm": 1.2789171893025388, + "learning_rate": 4.8806521233349146e-05, + "loss": 0.8643, + "num_input_tokens_seen": 163096416, + "step": 902 + }, + { + "epoch": 0.09885328005692548, + "grad_norm": 1.3181317620354924, + "learning_rate": 4.880389479001176e-05, + "loss": 0.7828, + "num_input_tokens_seen": 163276960, + "step": 903 + }, + { + "epoch": 0.09896275212786337, + "grad_norm": 1.2084213202877752, + "learning_rate": 4.880126553071912e-05, + "loss": 0.8228, + "num_input_tokens_seen": 163496256, + "step": 904 + }, + { + "epoch": 0.09907222419880128, + "grad_norm": 1.3070105697537173, + "learning_rate": 4.879863345578227e-05, + "loss": 0.752, + "num_input_tokens_seen": 163654176, + "step": 905 + }, + { + "epoch": 0.09918169626973919, + "grad_norm": 1.3629092170669566, + "learning_rate": 4.879599856551258e-05, + "loss": 0.8627, + "num_input_tokens_seen": 163834496, + "step": 906 + }, + { + "epoch": 0.09929116834067708, + "grad_norm": 1.2550279110392277, + "learning_rate": 4.879336086022175e-05, + "loss": 0.6578, + "num_input_tokens_seen": 164006528, + "step": 907 + }, + { + "epoch": 0.09940064041161499, + "grad_norm": 1.2801070348870096, + "learning_rate": 4.879072034022182e-05, + "loss": 0.8276, + "num_input_tokens_seen": 164215520, + "step": 908 + }, + { + "epoch": 0.09951011248255288, + "grad_norm": 1.698222672984846, + "learning_rate": 4.8788077005825146e-05, + "loss": 0.7947, + "num_input_tokens_seen": 164395168, + "step": 909 + }, + { + "epoch": 0.09961958455349079, + "grad_norm": 1.2818927083310452, + "learning_rate": 4.878543085734444e-05, + "loss": 0.6657, + "num_input_tokens_seen": 164566528, + "step": 910 + }, + { + "epoch": 0.09972905662442869, + "grad_norm": 1.2852236373533517, + "learning_rate": 4.8782781895092734e-05, + "loss": 0.9609, + "num_input_tokens_seen": 164762528, + "step": 911 + }, + { + "epoch": 0.0998385286953666, + "grad_norm": 1.2355349438525702, + "learning_rate": 4.878013011938339e-05, + "loss": 0.6463, + "num_input_tokens_seen": 164933664, + "step": 912 + }, + { + "epoch": 0.0999480007663045, + "grad_norm": 1.5445833281331884, + "learning_rate": 4.877747553053012e-05, + "loss": 0.8373, + "num_input_tokens_seen": 165124512, + "step": 913 + }, + { + "epoch": 0.1000574728372424, + "grad_norm": 1.250051497352295, + "learning_rate": 4.877481812884695e-05, + "loss": 0.7064, + "num_input_tokens_seen": 165311776, + "step": 914 + }, + { + "epoch": 0.1001669449081803, + "grad_norm": 1.2846823981129738, + "learning_rate": 4.877215791464824e-05, + "loss": 0.8218, + "num_input_tokens_seen": 165483584, + "step": 915 + }, + { + "epoch": 0.1002764169791182, + "grad_norm": 1.3256895286469361, + "learning_rate": 4.876949488824869e-05, + "loss": 0.7181, + "num_input_tokens_seen": 165640384, + "step": 916 + }, + { + "epoch": 0.1003858890500561, + "grad_norm": 1.2535704165971993, + "learning_rate": 4.8766829049963344e-05, + "loss": 0.7043, + "num_input_tokens_seen": 165851168, + "step": 917 + }, + { + "epoch": 0.10049536112099401, + "grad_norm": 1.2898514154161373, + "learning_rate": 4.876416040010755e-05, + "loss": 0.7461, + "num_input_tokens_seen": 166040672, + "step": 918 + }, + { + "epoch": 0.1006048331919319, + "grad_norm": 1.2277950241542317, + "learning_rate": 4.876148893899701e-05, + "loss": 0.6692, + "num_input_tokens_seen": 166219424, + "step": 919 + }, + { + "epoch": 0.10071430526286981, + "grad_norm": 1.2941962002497558, + "learning_rate": 4.8758814666947756e-05, + "loss": 0.8863, + "num_input_tokens_seen": 166421248, + "step": 920 + }, + { + "epoch": 0.10082377733380771, + "grad_norm": 1.3211918898753643, + "learning_rate": 4.875613758427614e-05, + "loss": 0.7305, + "num_input_tokens_seen": 166598208, + "step": 921 + }, + { + "epoch": 0.10093324940474561, + "grad_norm": 1.109651933947143, + "learning_rate": 4.875345769129887e-05, + "loss": 0.6228, + "num_input_tokens_seen": 166793536, + "step": 922 + }, + { + "epoch": 0.10104272147568352, + "grad_norm": 1.3041746307877378, + "learning_rate": 4.875077498833296e-05, + "loss": 0.8925, + "num_input_tokens_seen": 166970496, + "step": 923 + }, + { + "epoch": 0.10115219354662142, + "grad_norm": 1.3469222129105436, + "learning_rate": 4.874808947569577e-05, + "loss": 0.8568, + "num_input_tokens_seen": 167159552, + "step": 924 + }, + { + "epoch": 0.10126166561755932, + "grad_norm": 1.2002238680714796, + "learning_rate": 4.8745401153704996e-05, + "loss": 0.624, + "num_input_tokens_seen": 167337632, + "step": 925 + }, + { + "epoch": 0.10137113768849722, + "grad_norm": 1.2514967540374249, + "learning_rate": 4.874271002267866e-05, + "loss": 0.6356, + "num_input_tokens_seen": 167505408, + "step": 926 + }, + { + "epoch": 0.10148060975943513, + "grad_norm": 1.1662822271639217, + "learning_rate": 4.874001608293511e-05, + "loss": 0.7896, + "num_input_tokens_seen": 167707456, + "step": 927 + }, + { + "epoch": 0.10159008183037303, + "grad_norm": 1.4355046064940031, + "learning_rate": 4.873731933479305e-05, + "loss": 0.7985, + "num_input_tokens_seen": 167886208, + "step": 928 + }, + { + "epoch": 0.10169955390131093, + "grad_norm": 1.3802946627506991, + "learning_rate": 4.873461977857149e-05, + "loss": 0.8132, + "num_input_tokens_seen": 168025536, + "step": 929 + }, + { + "epoch": 0.10180902597224883, + "grad_norm": 1.276692168841782, + "learning_rate": 4.8731917414589776e-05, + "loss": 0.7088, + "num_input_tokens_seen": 168209216, + "step": 930 + }, + { + "epoch": 0.10191849804318673, + "grad_norm": 1.1717711847733017, + "learning_rate": 4.872921224316761e-05, + "loss": 0.5851, + "num_input_tokens_seen": 168390880, + "step": 931 + }, + { + "epoch": 0.10202797011412464, + "grad_norm": 1.1486843129369155, + "learning_rate": 4.872650426462499e-05, + "loss": 0.6904, + "num_input_tokens_seen": 168590240, + "step": 932 + }, + { + "epoch": 0.10213744218506253, + "grad_norm": 1.2590068099070286, + "learning_rate": 4.8723793479282274e-05, + "loss": 0.7619, + "num_input_tokens_seen": 168796544, + "step": 933 + }, + { + "epoch": 0.10224691425600044, + "grad_norm": 1.403207751852087, + "learning_rate": 4.872107988746014e-05, + "loss": 0.6939, + "num_input_tokens_seen": 168956704, + "step": 934 + }, + { + "epoch": 0.10235638632693835, + "grad_norm": 1.2999746668621421, + "learning_rate": 4.871836348947961e-05, + "loss": 0.6586, + "num_input_tokens_seen": 169133664, + "step": 935 + }, + { + "epoch": 0.10246585839787624, + "grad_norm": 1.1805066428928659, + "learning_rate": 4.871564428566201e-05, + "loss": 0.579, + "num_input_tokens_seen": 169330112, + "step": 936 + }, + { + "epoch": 0.10257533046881415, + "grad_norm": 1.1283295950459058, + "learning_rate": 4.8712922276329035e-05, + "loss": 0.6513, + "num_input_tokens_seen": 169520512, + "step": 937 + }, + { + "epoch": 0.10268480253975204, + "grad_norm": 1.2174271471288474, + "learning_rate": 4.8710197461802686e-05, + "loss": 0.7276, + "num_input_tokens_seen": 169714272, + "step": 938 + }, + { + "epoch": 0.10279427461068995, + "grad_norm": 1.0902498643923586, + "learning_rate": 4.8707469842405304e-05, + "loss": 0.5897, + "num_input_tokens_seen": 169885408, + "step": 939 + }, + { + "epoch": 0.10290374668162786, + "grad_norm": 1.2203392524082484, + "learning_rate": 4.870473941845955e-05, + "loss": 0.7027, + "num_input_tokens_seen": 170049376, + "step": 940 + }, + { + "epoch": 0.10301321875256575, + "grad_norm": 1.2806162778856154, + "learning_rate": 4.870200619028845e-05, + "loss": 0.6164, + "num_input_tokens_seen": 170244928, + "step": 941 + }, + { + "epoch": 0.10312269082350366, + "grad_norm": 1.495071819073051, + "learning_rate": 4.869927015821533e-05, + "loss": 0.9216, + "num_input_tokens_seen": 170375296, + "step": 942 + }, + { + "epoch": 0.10323216289444155, + "grad_norm": 1.1784312146246056, + "learning_rate": 4.8696531322563857e-05, + "loss": 0.6821, + "num_input_tokens_seen": 170571744, + "step": 943 + }, + { + "epoch": 0.10334163496537946, + "grad_norm": 1.304127967227795, + "learning_rate": 4.869378968365802e-05, + "loss": 0.6696, + "num_input_tokens_seen": 170708384, + "step": 944 + }, + { + "epoch": 0.10345110703631737, + "grad_norm": 1.2588005686871093, + "learning_rate": 4.869104524182216e-05, + "loss": 0.7371, + "num_input_tokens_seen": 170902816, + "step": 945 + }, + { + "epoch": 0.10356057910725526, + "grad_norm": 1.2386213732011966, + "learning_rate": 4.868829799738094e-05, + "loss": 0.6522, + "num_input_tokens_seen": 171067680, + "step": 946 + }, + { + "epoch": 0.10367005117819317, + "grad_norm": 1.1962541349333569, + "learning_rate": 4.8685547950659346e-05, + "loss": 0.6111, + "num_input_tokens_seen": 171226272, + "step": 947 + }, + { + "epoch": 0.10377952324913106, + "grad_norm": 1.3179640339674858, + "learning_rate": 4.868279510198271e-05, + "loss": 0.5951, + "num_input_tokens_seen": 171408160, + "step": 948 + }, + { + "epoch": 0.10388899532006897, + "grad_norm": 1.4114232592664757, + "learning_rate": 4.8680039451676695e-05, + "loss": 0.7885, + "num_input_tokens_seen": 171588704, + "step": 949 + }, + { + "epoch": 0.10399846739100686, + "grad_norm": 1.3491836958563093, + "learning_rate": 4.867728100006728e-05, + "loss": 0.6358, + "num_input_tokens_seen": 171774848, + "step": 950 + }, + { + "epoch": 0.10410793946194477, + "grad_norm": 1.2675684853585456, + "learning_rate": 4.8674519747480774e-05, + "loss": 0.7533, + "num_input_tokens_seen": 171970400, + "step": 951 + }, + { + "epoch": 0.10421741153288268, + "grad_norm": 1.2342665902348482, + "learning_rate": 4.867175569424385e-05, + "loss": 0.6149, + "num_input_tokens_seen": 172117120, + "step": 952 + }, + { + "epoch": 0.10432688360382057, + "grad_norm": 1.2384806090443572, + "learning_rate": 4.866898884068348e-05, + "loss": 0.715, + "num_input_tokens_seen": 172268096, + "step": 953 + }, + { + "epoch": 0.10443635567475848, + "grad_norm": 1.2826400752470273, + "learning_rate": 4.866621918712697e-05, + "loss": 0.7553, + "num_input_tokens_seen": 172415488, + "step": 954 + }, + { + "epoch": 0.10454582774569637, + "grad_norm": 1.2677159873628945, + "learning_rate": 4.866344673390198e-05, + "loss": 0.6576, + "num_input_tokens_seen": 172624928, + "step": 955 + }, + { + "epoch": 0.10465529981663428, + "grad_norm": 1.3895583187217604, + "learning_rate": 4.8660671481336475e-05, + "loss": 0.6706, + "num_input_tokens_seen": 172812192, + "step": 956 + }, + { + "epoch": 0.10476477188757219, + "grad_norm": 1.3270570388097325, + "learning_rate": 4.865789342975877e-05, + "loss": 0.6402, + "num_input_tokens_seen": 173019168, + "step": 957 + }, + { + "epoch": 0.10487424395851008, + "grad_norm": 1.2261571518086074, + "learning_rate": 4.865511257949749e-05, + "loss": 0.8545, + "num_input_tokens_seen": 173223008, + "step": 958 + }, + { + "epoch": 0.10498371602944799, + "grad_norm": 1.2363148186857988, + "learning_rate": 4.865232893088162e-05, + "loss": 0.7539, + "num_input_tokens_seen": 173417888, + "step": 959 + }, + { + "epoch": 0.10509318810038588, + "grad_norm": 1.1261174691958653, + "learning_rate": 4.864954248424045e-05, + "loss": 0.6802, + "num_input_tokens_seen": 173625536, + "step": 960 + }, + { + "epoch": 0.10520266017132379, + "grad_norm": 1.287553956465964, + "learning_rate": 4.864675323990361e-05, + "loss": 0.7212, + "num_input_tokens_seen": 173815712, + "step": 961 + }, + { + "epoch": 0.1053121322422617, + "grad_norm": 1.1735464256312476, + "learning_rate": 4.864396119820108e-05, + "loss": 0.6268, + "num_input_tokens_seen": 174001184, + "step": 962 + }, + { + "epoch": 0.1054216043131996, + "grad_norm": 1.3259011892668853, + "learning_rate": 4.864116635946313e-05, + "loss": 0.6949, + "num_input_tokens_seen": 174220032, + "step": 963 + }, + { + "epoch": 0.1055310763841375, + "grad_norm": 1.2055880612992829, + "learning_rate": 4.863836872402039e-05, + "loss": 0.5744, + "num_input_tokens_seen": 174402368, + "step": 964 + }, + { + "epoch": 0.1056405484550754, + "grad_norm": 1.3388281440189491, + "learning_rate": 4.863556829220383e-05, + "loss": 0.9442, + "num_input_tokens_seen": 174593440, + "step": 965 + }, + { + "epoch": 0.1057500205260133, + "grad_norm": 1.2805831848734925, + "learning_rate": 4.863276506434471e-05, + "loss": 0.6836, + "num_input_tokens_seen": 174748224, + "step": 966 + }, + { + "epoch": 0.1058594925969512, + "grad_norm": 1.3352311704272521, + "learning_rate": 4.862995904077468e-05, + "loss": 0.7871, + "num_input_tokens_seen": 174914432, + "step": 967 + }, + { + "epoch": 0.1059689646678891, + "grad_norm": 1.4122926374155322, + "learning_rate": 4.8627150221825654e-05, + "loss": 0.7583, + "num_input_tokens_seen": 175066752, + "step": 968 + }, + { + "epoch": 0.10607843673882701, + "grad_norm": 1.3398300926468887, + "learning_rate": 4.862433860782993e-05, + "loss": 0.6804, + "num_input_tokens_seen": 175255584, + "step": 969 + }, + { + "epoch": 0.1061879088097649, + "grad_norm": 1.3480835168192296, + "learning_rate": 4.8621524199120106e-05, + "loss": 0.7711, + "num_input_tokens_seen": 175453152, + "step": 970 + }, + { + "epoch": 0.10629738088070281, + "grad_norm": 1.3240551242618432, + "learning_rate": 4.861870699602913e-05, + "loss": 0.7371, + "num_input_tokens_seen": 175644000, + "step": 971 + }, + { + "epoch": 0.1064068529516407, + "grad_norm": 1.236230156317847, + "learning_rate": 4.8615886998890266e-05, + "loss": 0.8159, + "num_input_tokens_seen": 175830592, + "step": 972 + }, + { + "epoch": 0.10651632502257861, + "grad_norm": 1.1780394606562814, + "learning_rate": 4.861306420803712e-05, + "loss": 0.6739, + "num_input_tokens_seen": 176043616, + "step": 973 + }, + { + "epoch": 0.10662579709351652, + "grad_norm": 1.295463943371306, + "learning_rate": 4.861023862380361e-05, + "loss": 0.8644, + "num_input_tokens_seen": 176254624, + "step": 974 + }, + { + "epoch": 0.10673526916445442, + "grad_norm": 1.1985161428498285, + "learning_rate": 4.860741024652401e-05, + "loss": 0.7162, + "num_input_tokens_seen": 176423072, + "step": 975 + }, + { + "epoch": 0.10684474123539232, + "grad_norm": 1.3126598896547834, + "learning_rate": 4.860457907653291e-05, + "loss": 0.9685, + "num_input_tokens_seen": 176637440, + "step": 976 + }, + { + "epoch": 0.10695421330633022, + "grad_norm": 1.2878300179037752, + "learning_rate": 4.860174511416523e-05, + "loss": 0.7168, + "num_input_tokens_seen": 176812384, + "step": 977 + }, + { + "epoch": 0.10706368537726813, + "grad_norm": 1.327133212307466, + "learning_rate": 4.8598908359756226e-05, + "loss": 0.7853, + "num_input_tokens_seen": 177001664, + "step": 978 + }, + { + "epoch": 0.10717315744820603, + "grad_norm": 1.281851882986516, + "learning_rate": 4.859606881364146e-05, + "loss": 0.7571, + "num_input_tokens_seen": 177153536, + "step": 979 + }, + { + "epoch": 0.10728262951914393, + "grad_norm": 1.2766298404707541, + "learning_rate": 4.859322647615687e-05, + "loss": 0.6039, + "num_input_tokens_seen": 177336544, + "step": 980 + }, + { + "epoch": 0.10739210159008183, + "grad_norm": 1.4063261223429055, + "learning_rate": 4.85903813476387e-05, + "loss": 0.8699, + "num_input_tokens_seen": 177516640, + "step": 981 + }, + { + "epoch": 0.10750157366101973, + "grad_norm": 1.40743320279482, + "learning_rate": 4.8587533428423504e-05, + "loss": 0.7725, + "num_input_tokens_seen": 177671424, + "step": 982 + }, + { + "epoch": 0.10761104573195764, + "grad_norm": 1.3211939200172191, + "learning_rate": 4.85846827188482e-05, + "loss": 0.818, + "num_input_tokens_seen": 177877728, + "step": 983 + }, + { + "epoch": 0.10772051780289553, + "grad_norm": 1.3206806008314564, + "learning_rate": 4.858182921925001e-05, + "loss": 0.6957, + "num_input_tokens_seen": 178066560, + "step": 984 + }, + { + "epoch": 0.10782998987383344, + "grad_norm": 1.3426051030848907, + "learning_rate": 4.857897292996651e-05, + "loss": 0.8008, + "num_input_tokens_seen": 178253376, + "step": 985 + }, + { + "epoch": 0.10793946194477134, + "grad_norm": 1.3110898026029727, + "learning_rate": 4.857611385133559e-05, + "loss": 0.6187, + "num_input_tokens_seen": 178434816, + "step": 986 + }, + { + "epoch": 0.10804893401570924, + "grad_norm": 1.1398141147246283, + "learning_rate": 4.857325198369546e-05, + "loss": 0.7601, + "num_input_tokens_seen": 178634848, + "step": 987 + }, + { + "epoch": 0.10815840608664715, + "grad_norm": 1.3302042224952948, + "learning_rate": 4.8570387327384695e-05, + "loss": 0.8135, + "num_input_tokens_seen": 178818752, + "step": 988 + }, + { + "epoch": 0.10826787815758504, + "grad_norm": 1.2574514498732952, + "learning_rate": 4.856751988274216e-05, + "loss": 0.7095, + "num_input_tokens_seen": 179019680, + "step": 989 + }, + { + "epoch": 0.10837735022852295, + "grad_norm": 1.249889178131839, + "learning_rate": 4.8564649650107084e-05, + "loss": 0.6195, + "num_input_tokens_seen": 179179840, + "step": 990 + }, + { + "epoch": 0.10848682229946086, + "grad_norm": 1.24312872585469, + "learning_rate": 4.8561776629819e-05, + "loss": 0.6993, + "num_input_tokens_seen": 179350528, + "step": 991 + }, + { + "epoch": 0.10859629437039875, + "grad_norm": 1.2541090661611436, + "learning_rate": 4.855890082221778e-05, + "loss": 0.5838, + "num_input_tokens_seen": 179524576, + "step": 992 + }, + { + "epoch": 0.10870576644133666, + "grad_norm": 1.0915661945165824, + "learning_rate": 4.8556022227643636e-05, + "loss": 0.7074, + "num_input_tokens_seen": 179731776, + "step": 993 + }, + { + "epoch": 0.10881523851227455, + "grad_norm": 1.3705914656002247, + "learning_rate": 4.8553140846437094e-05, + "loss": 0.7154, + "num_input_tokens_seen": 179870880, + "step": 994 + }, + { + "epoch": 0.10892471058321246, + "grad_norm": 1.2189702055140015, + "learning_rate": 4.855025667893901e-05, + "loss": 0.7043, + "num_input_tokens_seen": 180044032, + "step": 995 + }, + { + "epoch": 0.10903418265415037, + "grad_norm": 1.505804830273296, + "learning_rate": 4.854736972549058e-05, + "loss": 0.9028, + "num_input_tokens_seen": 180214720, + "step": 996 + }, + { + "epoch": 0.10914365472508826, + "grad_norm": 1.336941614974714, + "learning_rate": 4.854447998643333e-05, + "loss": 0.8202, + "num_input_tokens_seen": 180392800, + "step": 997 + }, + { + "epoch": 0.10925312679602617, + "grad_norm": 1.3277951896905564, + "learning_rate": 4.8541587462109105e-05, + "loss": 0.8383, + "num_input_tokens_seen": 180569312, + "step": 998 + }, + { + "epoch": 0.10936259886696406, + "grad_norm": 1.4176738001391838, + "learning_rate": 4.8538692152860094e-05, + "loss": 0.8878, + "num_input_tokens_seen": 180761952, + "step": 999 + }, + { + "epoch": 0.10947207093790197, + "grad_norm": 1.3087005571646475, + "learning_rate": 4.853579405902879e-05, + "loss": 0.6891, + "num_input_tokens_seen": 180949216, + "step": 1000 + }, + { + "epoch": 0.10958154300883986, + "grad_norm": 1.264189058623954, + "learning_rate": 4.853289318095805e-05, + "loss": 0.6913, + "num_input_tokens_seen": 181110944, + "step": 1001 + }, + { + "epoch": 0.10969101507977777, + "grad_norm": 1.2916122189192794, + "learning_rate": 4.8529989518991033e-05, + "loss": 0.6892, + "num_input_tokens_seen": 181279392, + "step": 1002 + }, + { + "epoch": 0.10980048715071568, + "grad_norm": 1.2239247527148993, + "learning_rate": 4.8527083073471236e-05, + "loss": 0.7772, + "num_input_tokens_seen": 181448064, + "step": 1003 + }, + { + "epoch": 0.10990995922165357, + "grad_norm": 1.330045007861966, + "learning_rate": 4.852417384474248e-05, + "loss": 0.6991, + "num_input_tokens_seen": 181609344, + "step": 1004 + }, + { + "epoch": 0.11001943129259148, + "grad_norm": 1.3924964570526033, + "learning_rate": 4.852126183314894e-05, + "loss": 0.6477, + "num_input_tokens_seen": 181806016, + "step": 1005 + }, + { + "epoch": 0.11012890336352937, + "grad_norm": 1.2223970181048058, + "learning_rate": 4.851834703903508e-05, + "loss": 0.6692, + "num_input_tokens_seen": 181990816, + "step": 1006 + }, + { + "epoch": 0.11023837543446728, + "grad_norm": 1.2670770956879187, + "learning_rate": 4.851542946274573e-05, + "loss": 0.616, + "num_input_tokens_seen": 182144480, + "step": 1007 + }, + { + "epoch": 0.11034784750540519, + "grad_norm": 1.261682988182599, + "learning_rate": 4.8512509104626036e-05, + "loss": 0.7609, + "num_input_tokens_seen": 182318528, + "step": 1008 + }, + { + "epoch": 0.11045731957634308, + "grad_norm": 1.2300999164316744, + "learning_rate": 4.850958596502145e-05, + "loss": 0.6533, + "num_input_tokens_seen": 182477120, + "step": 1009 + }, + { + "epoch": 0.11056679164728099, + "grad_norm": 1.314663375658542, + "learning_rate": 4.85066600442778e-05, + "loss": 0.6286, + "num_input_tokens_seen": 182640416, + "step": 1010 + }, + { + "epoch": 0.11067626371821888, + "grad_norm": 1.2297566124229764, + "learning_rate": 4.8503731342741195e-05, + "loss": 0.6152, + "num_input_tokens_seen": 182837760, + "step": 1011 + }, + { + "epoch": 0.11078573578915679, + "grad_norm": 1.180298400746217, + "learning_rate": 4.8500799860758105e-05, + "loss": 0.8555, + "num_input_tokens_seen": 183048096, + "step": 1012 + }, + { + "epoch": 0.1108952078600947, + "grad_norm": 1.2699334549177896, + "learning_rate": 4.849786559867532e-05, + "loss": 0.7277, + "num_input_tokens_seen": 183228416, + "step": 1013 + }, + { + "epoch": 0.1110046799310326, + "grad_norm": 1.5127357393776852, + "learning_rate": 4.8494928556839946e-05, + "loss": 0.8214, + "num_input_tokens_seen": 183383200, + "step": 1014 + }, + { + "epoch": 0.1111141520019705, + "grad_norm": 1.1505597804626555, + "learning_rate": 4.849198873559945e-05, + "loss": 0.7543, + "num_input_tokens_seen": 183595552, + "step": 1015 + }, + { + "epoch": 0.1112236240729084, + "grad_norm": 1.246835087493518, + "learning_rate": 4.848904613530159e-05, + "loss": 0.6592, + "num_input_tokens_seen": 183762432, + "step": 1016 + }, + { + "epoch": 0.1113330961438463, + "grad_norm": 1.15868291839693, + "learning_rate": 4.848610075629447e-05, + "loss": 0.8744, + "num_input_tokens_seen": 183970080, + "step": 1017 + }, + { + "epoch": 0.11144256821478421, + "grad_norm": 1.2497388566938374, + "learning_rate": 4.848315259892654e-05, + "loss": 0.6931, + "num_input_tokens_seen": 184178624, + "step": 1018 + }, + { + "epoch": 0.1115520402857221, + "grad_norm": 1.3191940717005268, + "learning_rate": 4.848020166354654e-05, + "loss": 0.8305, + "num_input_tokens_seen": 184336320, + "step": 1019 + }, + { + "epoch": 0.11166151235666001, + "grad_norm": 1.1145008567645895, + "learning_rate": 4.847724795050358e-05, + "loss": 0.5545, + "num_input_tokens_seen": 184497376, + "step": 1020 + }, + { + "epoch": 0.1117709844275979, + "grad_norm": 1.3624914554013046, + "learning_rate": 4.847429146014706e-05, + "loss": 0.8031, + "num_input_tokens_seen": 184703680, + "step": 1021 + }, + { + "epoch": 0.11188045649853581, + "grad_norm": 1.1807010666922617, + "learning_rate": 4.847133219282674e-05, + "loss": 0.6681, + "num_input_tokens_seen": 184894752, + "step": 1022 + }, + { + "epoch": 0.1119899285694737, + "grad_norm": 1.2824166266722916, + "learning_rate": 4.846837014889269e-05, + "loss": 0.7536, + "num_input_tokens_seen": 185073504, + "step": 1023 + }, + { + "epoch": 0.11209940064041161, + "grad_norm": 1.3562868051764763, + "learning_rate": 4.8465405328695315e-05, + "loss": 0.8242, + "num_input_tokens_seen": 185254944, + "step": 1024 + }, + { + "epoch": 0.11220887271134952, + "grad_norm": 1.2934251884322432, + "learning_rate": 4.8462437732585345e-05, + "loss": 0.8374, + "num_input_tokens_seen": 185456320, + "step": 1025 + }, + { + "epoch": 0.11231834478228742, + "grad_norm": 1.2319330131538466, + "learning_rate": 4.845946736091384e-05, + "loss": 0.6488, + "num_input_tokens_seen": 185636640, + "step": 1026 + }, + { + "epoch": 0.11242781685322532, + "grad_norm": 1.1456855967676867, + "learning_rate": 4.8456494214032205e-05, + "loss": 0.5977, + "num_input_tokens_seen": 185827936, + "step": 1027 + }, + { + "epoch": 0.11253728892416322, + "grad_norm": 1.256286548443438, + "learning_rate": 4.8453518292292146e-05, + "loss": 0.7064, + "num_input_tokens_seen": 186013184, + "step": 1028 + }, + { + "epoch": 0.11264676099510113, + "grad_norm": 1.2097970329644323, + "learning_rate": 4.8450539596045694e-05, + "loss": 0.7034, + "num_input_tokens_seen": 186166624, + "step": 1029 + }, + { + "epoch": 0.11275623306603903, + "grad_norm": 1.212476773383669, + "learning_rate": 4.844755812564525e-05, + "loss": 0.6872, + "num_input_tokens_seen": 186342240, + "step": 1030 + }, + { + "epoch": 0.11286570513697693, + "grad_norm": 1.3027351070556414, + "learning_rate": 4.84445738814435e-05, + "loss": 0.7132, + "num_input_tokens_seen": 186536896, + "step": 1031 + }, + { + "epoch": 0.11297517720791483, + "grad_norm": 1.167696815908066, + "learning_rate": 4.8441586863793475e-05, + "loss": 0.7331, + "num_input_tokens_seen": 186743200, + "step": 1032 + }, + { + "epoch": 0.11308464927885273, + "grad_norm": 1.3243208764229133, + "learning_rate": 4.843859707304854e-05, + "loss": 0.6708, + "num_input_tokens_seen": 186953536, + "step": 1033 + }, + { + "epoch": 0.11319412134979064, + "grad_norm": 1.3088936533235824, + "learning_rate": 4.843560450956238e-05, + "loss": 1.0204, + "num_input_tokens_seen": 187169472, + "step": 1034 + }, + { + "epoch": 0.11330359342072854, + "grad_norm": 1.2057832852680448, + "learning_rate": 4.8432609173689004e-05, + "loss": 0.7375, + "num_input_tokens_seen": 187349120, + "step": 1035 + }, + { + "epoch": 0.11341306549166644, + "grad_norm": 1.4046843820437578, + "learning_rate": 4.8429611065782765e-05, + "loss": 0.7696, + "num_input_tokens_seen": 187537056, + "step": 1036 + }, + { + "epoch": 0.11352253756260434, + "grad_norm": 1.1007081634971505, + "learning_rate": 4.8426610186198315e-05, + "loss": 0.6767, + "num_input_tokens_seen": 187717824, + "step": 1037 + }, + { + "epoch": 0.11363200963354224, + "grad_norm": 1.176999142541954, + "learning_rate": 4.8423606535290675e-05, + "loss": 0.5902, + "num_input_tokens_seen": 187853792, + "step": 1038 + }, + { + "epoch": 0.11374148170448015, + "grad_norm": 1.1898485350183865, + "learning_rate": 4.842060011341516e-05, + "loss": 0.7321, + "num_input_tokens_seen": 188039936, + "step": 1039 + }, + { + "epoch": 0.11385095377541804, + "grad_norm": 1.2649578172639167, + "learning_rate": 4.841759092092741e-05, + "loss": 0.8522, + "num_input_tokens_seen": 188225856, + "step": 1040 + }, + { + "epoch": 0.11396042584635595, + "grad_norm": 1.195403002136428, + "learning_rate": 4.841457895818344e-05, + "loss": 0.6799, + "num_input_tokens_seen": 188357120, + "step": 1041 + }, + { + "epoch": 0.11406989791729386, + "grad_norm": 1.2081684076727561, + "learning_rate": 4.841156422553953e-05, + "loss": 0.7518, + "num_input_tokens_seen": 188529376, + "step": 1042 + }, + { + "epoch": 0.11417936998823175, + "grad_norm": 1.2161054529624125, + "learning_rate": 4.840854672335233e-05, + "loss": 0.6675, + "num_input_tokens_seen": 188713280, + "step": 1043 + }, + { + "epoch": 0.11428884205916966, + "grad_norm": 1.1678200821685252, + "learning_rate": 4.84055264519788e-05, + "loss": 0.632, + "num_input_tokens_seen": 188861120, + "step": 1044 + }, + { + "epoch": 0.11439831413010755, + "grad_norm": 1.2505189295315606, + "learning_rate": 4.8402503411776235e-05, + "loss": 0.5964, + "num_input_tokens_seen": 189041440, + "step": 1045 + }, + { + "epoch": 0.11450778620104546, + "grad_norm": 1.2643411034619259, + "learning_rate": 4.839947760310226e-05, + "loss": 0.9238, + "num_input_tokens_seen": 189237888, + "step": 1046 + }, + { + "epoch": 0.11461725827198337, + "grad_norm": 1.199134600044696, + "learning_rate": 4.8396449026314803e-05, + "loss": 0.5199, + "num_input_tokens_seen": 189417760, + "step": 1047 + }, + { + "epoch": 0.11472673034292126, + "grad_norm": 1.310206270940614, + "learning_rate": 4.839341768177217e-05, + "loss": 0.7467, + "num_input_tokens_seen": 189599872, + "step": 1048 + }, + { + "epoch": 0.11483620241385917, + "grad_norm": 1.2074556561338148, + "learning_rate": 4.839038356983293e-05, + "loss": 0.5758, + "num_input_tokens_seen": 189784000, + "step": 1049 + }, + { + "epoch": 0.11494567448479706, + "grad_norm": 1.1103251511947896, + "learning_rate": 4.838734669085604e-05, + "loss": 0.6175, + "num_input_tokens_seen": 189969024, + "step": 1050 + }, + { + "epoch": 0.11505514655573497, + "grad_norm": 1.21754014044982, + "learning_rate": 4.838430704520074e-05, + "loss": 0.7275, + "num_input_tokens_seen": 190148672, + "step": 1051 + }, + { + "epoch": 0.11516461862667288, + "grad_norm": 1.2452185735376655, + "learning_rate": 4.838126463322662e-05, + "loss": 0.6634, + "num_input_tokens_seen": 190325632, + "step": 1052 + }, + { + "epoch": 0.11527409069761077, + "grad_norm": 1.3871403013115746, + "learning_rate": 4.8378219455293595e-05, + "loss": 0.7543, + "num_input_tokens_seen": 190502368, + "step": 1053 + }, + { + "epoch": 0.11538356276854868, + "grad_norm": 1.2417094912387214, + "learning_rate": 4.8375171511761895e-05, + "loss": 0.8233, + "num_input_tokens_seen": 190693216, + "step": 1054 + }, + { + "epoch": 0.11549303483948657, + "grad_norm": 1.1855139437128566, + "learning_rate": 4.837212080299209e-05, + "loss": 0.5905, + "num_input_tokens_seen": 190835232, + "step": 1055 + }, + { + "epoch": 0.11560250691042448, + "grad_norm": 1.1836262814926477, + "learning_rate": 4.836906732934508e-05, + "loss": 0.7737, + "num_input_tokens_seen": 191023616, + "step": 1056 + }, + { + "epoch": 0.11571197898136237, + "grad_norm": 1.9669874869813078, + "learning_rate": 4.836601109118208e-05, + "loss": 1.0295, + "num_input_tokens_seen": 191180640, + "step": 1057 + }, + { + "epoch": 0.11582145105230028, + "grad_norm": 1.283276327939582, + "learning_rate": 4.836295208886463e-05, + "loss": 0.6463, + "num_input_tokens_seen": 191370144, + "step": 1058 + }, + { + "epoch": 0.11593092312323819, + "grad_norm": 1.3119514318030585, + "learning_rate": 4.835989032275461e-05, + "loss": 0.7186, + "num_input_tokens_seen": 191548224, + "step": 1059 + }, + { + "epoch": 0.11604039519417608, + "grad_norm": 1.1668720007900275, + "learning_rate": 4.835682579321423e-05, + "loss": 0.5458, + "num_input_tokens_seen": 191724512, + "step": 1060 + }, + { + "epoch": 0.11614986726511399, + "grad_norm": 1.3011486920616566, + "learning_rate": 4.8353758500606e-05, + "loss": 0.8529, + "num_input_tokens_seen": 191916032, + "step": 1061 + }, + { + "epoch": 0.11625933933605188, + "grad_norm": 1.3733464181532842, + "learning_rate": 4.8350688445292794e-05, + "loss": 0.7331, + "num_input_tokens_seen": 192100832, + "step": 1062 + }, + { + "epoch": 0.11636881140698979, + "grad_norm": 1.2121973190834348, + "learning_rate": 4.834761562763777e-05, + "loss": 0.6914, + "num_input_tokens_seen": 192265472, + "step": 1063 + }, + { + "epoch": 0.1164782834779277, + "grad_norm": 1.235193681240226, + "learning_rate": 4.834454004800446e-05, + "loss": 0.5881, + "num_input_tokens_seen": 192450048, + "step": 1064 + }, + { + "epoch": 0.1165877555488656, + "grad_norm": 1.211889274398994, + "learning_rate": 4.8341461706756686e-05, + "loss": 0.6059, + "num_input_tokens_seen": 192640224, + "step": 1065 + }, + { + "epoch": 0.1166972276198035, + "grad_norm": 1.2655524715787536, + "learning_rate": 4.833838060425862e-05, + "loss": 0.6311, + "num_input_tokens_seen": 192838464, + "step": 1066 + }, + { + "epoch": 0.1168066996907414, + "grad_norm": 1.2882532527326565, + "learning_rate": 4.8335296740874735e-05, + "loss": 0.7326, + "num_input_tokens_seen": 193019232, + "step": 1067 + }, + { + "epoch": 0.1169161717616793, + "grad_norm": 1.2564311053556372, + "learning_rate": 4.8332210116969855e-05, + "loss": 0.7613, + "num_input_tokens_seen": 193217472, + "step": 1068 + }, + { + "epoch": 0.11702564383261721, + "grad_norm": 1.2392114460765598, + "learning_rate": 4.832912073290913e-05, + "loss": 0.7442, + "num_input_tokens_seen": 193380992, + "step": 1069 + }, + { + "epoch": 0.1171351159035551, + "grad_norm": 1.3870622236380215, + "learning_rate": 4.832602858905801e-05, + "loss": 0.7619, + "num_input_tokens_seen": 193566240, + "step": 1070 + }, + { + "epoch": 0.11724458797449301, + "grad_norm": 1.3514575366237442, + "learning_rate": 4.8322933685782304e-05, + "loss": 0.6023, + "num_input_tokens_seen": 193738720, + "step": 1071 + }, + { + "epoch": 0.1173540600454309, + "grad_norm": 1.451138194374879, + "learning_rate": 4.831983602344813e-05, + "loss": 0.7588, + "num_input_tokens_seen": 193910080, + "step": 1072 + }, + { + "epoch": 0.11746353211636881, + "grad_norm": 1.1446549801134556, + "learning_rate": 4.8316735602421935e-05, + "loss": 0.7136, + "num_input_tokens_seen": 194097568, + "step": 1073 + }, + { + "epoch": 0.1175730041873067, + "grad_norm": 1.224792272121286, + "learning_rate": 4.831363242307049e-05, + "loss": 0.7162, + "num_input_tokens_seen": 194257728, + "step": 1074 + }, + { + "epoch": 0.11768247625824461, + "grad_norm": 1.3113023652565572, + "learning_rate": 4.83105264857609e-05, + "loss": 0.8369, + "num_input_tokens_seen": 194465600, + "step": 1075 + }, + { + "epoch": 0.11779194832918252, + "grad_norm": 1.2316532791315185, + "learning_rate": 4.8307417790860586e-05, + "loss": 0.5408, + "num_input_tokens_seen": 194622624, + "step": 1076 + }, + { + "epoch": 0.11790142040012042, + "grad_norm": 1.2929406292672283, + "learning_rate": 4.830430633873731e-05, + "loss": 0.6627, + "num_input_tokens_seen": 194790848, + "step": 1077 + }, + { + "epoch": 0.11801089247105832, + "grad_norm": 1.3054353318197554, + "learning_rate": 4.830119212975914e-05, + "loss": 0.8924, + "num_input_tokens_seen": 194989984, + "step": 1078 + }, + { + "epoch": 0.11812036454199622, + "grad_norm": 1.326150885571772, + "learning_rate": 4.8298075164294484e-05, + "loss": 0.7917, + "num_input_tokens_seen": 195196064, + "step": 1079 + }, + { + "epoch": 0.11822983661293412, + "grad_norm": 1.204399474231348, + "learning_rate": 4.829495544271208e-05, + "loss": 0.728, + "num_input_tokens_seen": 195366304, + "step": 1080 + }, + { + "epoch": 0.11833930868387203, + "grad_norm": 1.2662489495080054, + "learning_rate": 4.829183296538097e-05, + "loss": 0.7152, + "num_input_tokens_seen": 195539456, + "step": 1081 + }, + { + "epoch": 0.11844878075480993, + "grad_norm": 1.1783654086555315, + "learning_rate": 4.828870773267056e-05, + "loss": 0.5901, + "num_input_tokens_seen": 195724256, + "step": 1082 + }, + { + "epoch": 0.11855825282574783, + "grad_norm": 1.2312733275241727, + "learning_rate": 4.8285579744950535e-05, + "loss": 0.7107, + "num_input_tokens_seen": 195867616, + "step": 1083 + }, + { + "epoch": 0.11866772489668573, + "grad_norm": 1.2899569096132852, + "learning_rate": 4.828244900259094e-05, + "loss": 0.6382, + "num_input_tokens_seen": 196049728, + "step": 1084 + }, + { + "epoch": 0.11877719696762364, + "grad_norm": 1.4033384487862277, + "learning_rate": 4.827931550596214e-05, + "loss": 0.6994, + "num_input_tokens_seen": 196196224, + "step": 1085 + }, + { + "epoch": 0.11888666903856154, + "grad_norm": 1.4503480362026706, + "learning_rate": 4.827617925543482e-05, + "loss": 0.751, + "num_input_tokens_seen": 196380352, + "step": 1086 + }, + { + "epoch": 0.11899614110949944, + "grad_norm": 1.174158592485147, + "learning_rate": 4.8273040251379985e-05, + "loss": 0.5442, + "num_input_tokens_seen": 196557312, + "step": 1087 + }, + { + "epoch": 0.11910561318043734, + "grad_norm": 1.2124326274283683, + "learning_rate": 4.826989849416899e-05, + "loss": 0.6971, + "num_input_tokens_seen": 196709408, + "step": 1088 + }, + { + "epoch": 0.11921508525137524, + "grad_norm": 1.3224291115783262, + "learning_rate": 4.826675398417347e-05, + "loss": 0.8622, + "num_input_tokens_seen": 196915040, + "step": 1089 + }, + { + "epoch": 0.11932455732231315, + "grad_norm": 1.350830497307284, + "learning_rate": 4.826360672176544e-05, + "loss": 0.7381, + "num_input_tokens_seen": 197111488, + "step": 1090 + }, + { + "epoch": 0.11943402939325104, + "grad_norm": 1.5349976746482508, + "learning_rate": 4.826045670731722e-05, + "loss": 0.8174, + "num_input_tokens_seen": 197276576, + "step": 1091 + }, + { + "epoch": 0.11954350146418895, + "grad_norm": 1.3377939008290014, + "learning_rate": 4.825730394120142e-05, + "loss": 0.7172, + "num_input_tokens_seen": 197459136, + "step": 1092 + }, + { + "epoch": 0.11965297353512686, + "grad_norm": 1.149706029636365, + "learning_rate": 4.8254148423791035e-05, + "loss": 0.5935, + "num_input_tokens_seen": 197653792, + "step": 1093 + }, + { + "epoch": 0.11976244560606475, + "grad_norm": 1.3045242116018374, + "learning_rate": 4.825099015545934e-05, + "loss": 0.8727, + "num_input_tokens_seen": 197833216, + "step": 1094 + }, + { + "epoch": 0.11987191767700266, + "grad_norm": 1.2942273332844745, + "learning_rate": 4.824782913657996e-05, + "loss": 0.813, + "num_input_tokens_seen": 198018688, + "step": 1095 + }, + { + "epoch": 0.11998138974794055, + "grad_norm": 1.1065504086471327, + "learning_rate": 4.824466536752683e-05, + "loss": 0.7642, + "num_input_tokens_seen": 198196544, + "step": 1096 + }, + { + "epoch": 0.12009086181887846, + "grad_norm": 1.2432812368065265, + "learning_rate": 4.8241498848674236e-05, + "loss": 0.7276, + "num_input_tokens_seen": 198361408, + "step": 1097 + }, + { + "epoch": 0.12020033388981637, + "grad_norm": 1.38715267645568, + "learning_rate": 4.823832958039675e-05, + "loss": 0.7087, + "num_input_tokens_seen": 198539712, + "step": 1098 + }, + { + "epoch": 0.12030980596075426, + "grad_norm": 1.115772231188005, + "learning_rate": 4.82351575630693e-05, + "loss": 0.5878, + "num_input_tokens_seen": 198714656, + "step": 1099 + }, + { + "epoch": 0.12041927803169217, + "grad_norm": 1.1899278817642895, + "learning_rate": 4.823198279706713e-05, + "loss": 0.7793, + "num_input_tokens_seen": 198896096, + "step": 1100 + }, + { + "epoch": 0.12052875010263006, + "grad_norm": 1.1710326155266544, + "learning_rate": 4.8228805282765803e-05, + "loss": 0.6473, + "num_input_tokens_seen": 199071936, + "step": 1101 + }, + { + "epoch": 0.12063822217356797, + "grad_norm": 1.322514926054578, + "learning_rate": 4.822562502054122e-05, + "loss": 0.8471, + "num_input_tokens_seen": 199265920, + "step": 1102 + }, + { + "epoch": 0.12074769424450588, + "grad_norm": 1.1620392080970354, + "learning_rate": 4.82224420107696e-05, + "loss": 0.7165, + "num_input_tokens_seen": 199454752, + "step": 1103 + }, + { + "epoch": 0.12085716631544377, + "grad_norm": 1.2289457553690428, + "learning_rate": 4.821925625382748e-05, + "loss": 0.7643, + "num_input_tokens_seen": 199638880, + "step": 1104 + }, + { + "epoch": 0.12096663838638168, + "grad_norm": 1.2483827619524253, + "learning_rate": 4.821606775009173e-05, + "loss": 0.6976, + "num_input_tokens_seen": 199809120, + "step": 1105 + }, + { + "epoch": 0.12107611045731957, + "grad_norm": 1.2966261475581047, + "learning_rate": 4.8212876499939555e-05, + "loss": 0.6002, + "num_input_tokens_seen": 199952256, + "step": 1106 + }, + { + "epoch": 0.12118558252825748, + "grad_norm": 1.278236800249669, + "learning_rate": 4.8209682503748455e-05, + "loss": 0.668, + "num_input_tokens_seen": 200120480, + "step": 1107 + }, + { + "epoch": 0.12129505459919539, + "grad_norm": 1.3354627870083122, + "learning_rate": 4.820648576189629e-05, + "loss": 0.7596, + "num_input_tokens_seen": 200282880, + "step": 1108 + }, + { + "epoch": 0.12140452667013328, + "grad_norm": 1.44440823697087, + "learning_rate": 4.820328627476122e-05, + "loss": 0.8501, + "num_input_tokens_seen": 200468576, + "step": 1109 + }, + { + "epoch": 0.12151399874107119, + "grad_norm": 1.2855926514794742, + "learning_rate": 4.820008404272175e-05, + "loss": 0.7494, + "num_input_tokens_seen": 200686976, + "step": 1110 + }, + { + "epoch": 0.12162347081200908, + "grad_norm": 1.2877126388506628, + "learning_rate": 4.819687906615668e-05, + "loss": 0.6706, + "num_input_tokens_seen": 200863264, + "step": 1111 + }, + { + "epoch": 0.12173294288294699, + "grad_norm": 1.2605888574853705, + "learning_rate": 4.819367134544516e-05, + "loss": 0.648, + "num_input_tokens_seen": 201078976, + "step": 1112 + }, + { + "epoch": 0.12184241495388488, + "grad_norm": 1.2468947942018274, + "learning_rate": 4.819046088096666e-05, + "loss": 0.7267, + "num_input_tokens_seen": 201249216, + "step": 1113 + }, + { + "epoch": 0.12195188702482279, + "grad_norm": 1.2730149112046862, + "learning_rate": 4.818724767310098e-05, + "loss": 0.7926, + "num_input_tokens_seen": 201444320, + "step": 1114 + }, + { + "epoch": 0.1220613590957607, + "grad_norm": 1.117676047895474, + "learning_rate": 4.8184031722228216e-05, + "loss": 0.5634, + "num_input_tokens_seen": 201629344, + "step": 1115 + }, + { + "epoch": 0.12217083116669859, + "grad_norm": 1.2019422588284927, + "learning_rate": 4.818081302872882e-05, + "loss": 0.6993, + "num_input_tokens_seen": 201821088, + "step": 1116 + }, + { + "epoch": 0.1222803032376365, + "grad_norm": 1.3334778121315265, + "learning_rate": 4.817759159298356e-05, + "loss": 0.6645, + "num_input_tokens_seen": 201992896, + "step": 1117 + }, + { + "epoch": 0.1223897753085744, + "grad_norm": 1.194620967112966, + "learning_rate": 4.817436741537352e-05, + "loss": 0.6137, + "num_input_tokens_seen": 202160224, + "step": 1118 + }, + { + "epoch": 0.1224992473795123, + "grad_norm": 1.2189971966574324, + "learning_rate": 4.817114049628012e-05, + "loss": 0.5721, + "num_input_tokens_seen": 202324416, + "step": 1119 + }, + { + "epoch": 0.12260871945045021, + "grad_norm": 1.2073892105803354, + "learning_rate": 4.81679108360851e-05, + "loss": 0.7126, + "num_input_tokens_seen": 202502720, + "step": 1120 + }, + { + "epoch": 0.1227181915213881, + "grad_norm": 1.4162713602581432, + "learning_rate": 4.8164678435170505e-05, + "loss": 0.8152, + "num_input_tokens_seen": 202690208, + "step": 1121 + }, + { + "epoch": 0.12282766359232601, + "grad_norm": 1.3503406271535456, + "learning_rate": 4.8161443293918746e-05, + "loss": 0.7046, + "num_input_tokens_seen": 202841408, + "step": 1122 + }, + { + "epoch": 0.1229371356632639, + "grad_norm": 1.2555034127046607, + "learning_rate": 4.815820541271252e-05, + "loss": 0.7138, + "num_input_tokens_seen": 202994400, + "step": 1123 + }, + { + "epoch": 0.12304660773420181, + "grad_norm": 1.4571195273976028, + "learning_rate": 4.815496479193486e-05, + "loss": 0.826, + "num_input_tokens_seen": 203139552, + "step": 1124 + }, + { + "epoch": 0.12315607980513972, + "grad_norm": 1.3006386435295856, + "learning_rate": 4.815172143196913e-05, + "loss": 0.6858, + "num_input_tokens_seen": 203332192, + "step": 1125 + }, + { + "epoch": 0.12326555187607761, + "grad_norm": 1.2867434362048054, + "learning_rate": 4.814847533319902e-05, + "loss": 0.7775, + "num_input_tokens_seen": 203503328, + "step": 1126 + }, + { + "epoch": 0.12337502394701552, + "grad_norm": 1.2466394072797622, + "learning_rate": 4.814522649600852e-05, + "loss": 0.7706, + "num_input_tokens_seen": 203723520, + "step": 1127 + }, + { + "epoch": 0.12348449601795342, + "grad_norm": 1.249158003465074, + "learning_rate": 4.814197492078198e-05, + "loss": 0.8207, + "num_input_tokens_seen": 203896448, + "step": 1128 + }, + { + "epoch": 0.12359396808889132, + "grad_norm": 1.2049452153490186, + "learning_rate": 4.813872060790404e-05, + "loss": 0.7653, + "num_input_tokens_seen": 204089760, + "step": 1129 + }, + { + "epoch": 0.12370344015982922, + "grad_norm": 1.2989291732602692, + "learning_rate": 4.813546355775969e-05, + "loss": 0.6552, + "num_input_tokens_seen": 204246336, + "step": 1130 + }, + { + "epoch": 0.12381291223076712, + "grad_norm": 1.1764386642556488, + "learning_rate": 4.813220377073423e-05, + "loss": 0.6613, + "num_input_tokens_seen": 204424416, + "step": 1131 + }, + { + "epoch": 0.12392238430170503, + "grad_norm": 1.3132452438474604, + "learning_rate": 4.8128941247213286e-05, + "loss": 0.6629, + "num_input_tokens_seen": 204594656, + "step": 1132 + }, + { + "epoch": 0.12403185637264293, + "grad_norm": 1.271374581983222, + "learning_rate": 4.812567598758281e-05, + "loss": 0.8369, + "num_input_tokens_seen": 204794912, + "step": 1133 + }, + { + "epoch": 0.12414132844358083, + "grad_norm": 1.2166912590223924, + "learning_rate": 4.812240799222906e-05, + "loss": 0.6613, + "num_input_tokens_seen": 204964928, + "step": 1134 + }, + { + "epoch": 0.12425080051451873, + "grad_norm": 1.3109742333377357, + "learning_rate": 4.811913726153866e-05, + "loss": 0.5747, + "num_input_tokens_seen": 205109632, + "step": 1135 + }, + { + "epoch": 0.12436027258545664, + "grad_norm": 1.3690515508259613, + "learning_rate": 4.8115863795898514e-05, + "loss": 0.6956, + "num_input_tokens_seen": 205309888, + "step": 1136 + }, + { + "epoch": 0.12446974465639454, + "grad_norm": 1.277932360991897, + "learning_rate": 4.811258759569587e-05, + "loss": 0.8307, + "num_input_tokens_seen": 205496256, + "step": 1137 + }, + { + "epoch": 0.12457921672733244, + "grad_norm": 1.4191097156404167, + "learning_rate": 4.8109308661318296e-05, + "loss": 0.9829, + "num_input_tokens_seen": 205669632, + "step": 1138 + }, + { + "epoch": 0.12468868879827034, + "grad_norm": 1.3726112684023843, + "learning_rate": 4.810602699315369e-05, + "loss": 0.8558, + "num_input_tokens_seen": 205815904, + "step": 1139 + }, + { + "epoch": 0.12479816086920824, + "grad_norm": 1.4383680790888584, + "learning_rate": 4.810274259159026e-05, + "loss": 0.8638, + "num_input_tokens_seen": 206006976, + "step": 1140 + }, + { + "epoch": 0.12490763294014615, + "grad_norm": 1.1893712408634822, + "learning_rate": 4.809945545701654e-05, + "loss": 0.6241, + "num_input_tokens_seen": 206184832, + "step": 1141 + }, + { + "epoch": 0.12501710501108404, + "grad_norm": 1.1792413739553476, + "learning_rate": 4.8096165589821404e-05, + "loss": 0.6551, + "num_input_tokens_seen": 206352832, + "step": 1142 + }, + { + "epoch": 0.12512657708202196, + "grad_norm": 1.2615983273698874, + "learning_rate": 4.809287299039403e-05, + "loss": 0.7376, + "num_input_tokens_seen": 206538752, + "step": 1143 + }, + { + "epoch": 0.12523604915295986, + "grad_norm": 1.1522014465398438, + "learning_rate": 4.808957765912393e-05, + "loss": 0.7462, + "num_input_tokens_seen": 206732288, + "step": 1144 + }, + { + "epoch": 0.12534552122389775, + "grad_norm": 1.3162953134762971, + "learning_rate": 4.808627959640093e-05, + "loss": 0.8108, + "num_input_tokens_seen": 206917760, + "step": 1145 + }, + { + "epoch": 0.12545499329483564, + "grad_norm": 1.2170226743305133, + "learning_rate": 4.808297880261518e-05, + "loss": 0.6857, + "num_input_tokens_seen": 207113984, + "step": 1146 + }, + { + "epoch": 0.12556446536577356, + "grad_norm": 1.2283727323216735, + "learning_rate": 4.807967527815718e-05, + "loss": 0.5671, + "num_input_tokens_seen": 207319840, + "step": 1147 + }, + { + "epoch": 0.12567393743671146, + "grad_norm": 1.1166835795492929, + "learning_rate": 4.807636902341771e-05, + "loss": 0.5684, + "num_input_tokens_seen": 207490752, + "step": 1148 + }, + { + "epoch": 0.12578340950764935, + "grad_norm": 1.3619846856884947, + "learning_rate": 4.80730600387879e-05, + "loss": 0.6801, + "num_input_tokens_seen": 207681824, + "step": 1149 + }, + { + "epoch": 0.12589288157858727, + "grad_norm": 1.2771523708499368, + "learning_rate": 4.8069748324659193e-05, + "loss": 0.7386, + "num_input_tokens_seen": 207873568, + "step": 1150 + }, + { + "epoch": 0.12600235364952517, + "grad_norm": 1.2051526113084512, + "learning_rate": 4.8066433881423354e-05, + "loss": 0.6109, + "num_input_tokens_seen": 208036192, + "step": 1151 + }, + { + "epoch": 0.12611182572046306, + "grad_norm": 1.1711754300247579, + "learning_rate": 4.806311670947249e-05, + "loss": 0.6863, + "num_input_tokens_seen": 208189184, + "step": 1152 + }, + { + "epoch": 0.12622129779140098, + "grad_norm": 1.1879155787036257, + "learning_rate": 4.805979680919901e-05, + "loss": 0.6409, + "num_input_tokens_seen": 208350688, + "step": 1153 + }, + { + "epoch": 0.12633076986233888, + "grad_norm": 1.4005037950997512, + "learning_rate": 4.8056474180995645e-05, + "loss": 0.5998, + "num_input_tokens_seen": 208500096, + "step": 1154 + }, + { + "epoch": 0.12644024193327677, + "grad_norm": 1.2669821162357928, + "learning_rate": 4.8053148825255466e-05, + "loss": 0.7037, + "num_input_tokens_seen": 208676384, + "step": 1155 + }, + { + "epoch": 0.12654971400421466, + "grad_norm": 1.250493286075132, + "learning_rate": 4.804982074237185e-05, + "loss": 0.6879, + "num_input_tokens_seen": 208852448, + "step": 1156 + }, + { + "epoch": 0.12665918607515259, + "grad_norm": 1.282407795750059, + "learning_rate": 4.8046489932738504e-05, + "loss": 0.6181, + "num_input_tokens_seen": 209027168, + "step": 1157 + }, + { + "epoch": 0.12676865814609048, + "grad_norm": 1.8037633470026928, + "learning_rate": 4.8043156396749454e-05, + "loss": 1.3005, + "num_input_tokens_seen": 209203008, + "step": 1158 + }, + { + "epoch": 0.12687813021702837, + "grad_norm": 1.301401488750797, + "learning_rate": 4.8039820134799054e-05, + "loss": 0.7303, + "num_input_tokens_seen": 209386912, + "step": 1159 + }, + { + "epoch": 0.1269876022879663, + "grad_norm": 1.14946695496134, + "learning_rate": 4.8036481147281975e-05, + "loss": 0.7447, + "num_input_tokens_seen": 209587840, + "step": 1160 + }, + { + "epoch": 0.1270970743589042, + "grad_norm": 1.257691276821772, + "learning_rate": 4.8033139434593224e-05, + "loss": 0.693, + "num_input_tokens_seen": 209749120, + "step": 1161 + }, + { + "epoch": 0.12720654642984208, + "grad_norm": 1.2410345905164766, + "learning_rate": 4.8029794997128096e-05, + "loss": 0.9108, + "num_input_tokens_seen": 209935712, + "step": 1162 + }, + { + "epoch": 0.12731601850077998, + "grad_norm": 1.1695539249319244, + "learning_rate": 4.8026447835282256e-05, + "loss": 0.6871, + "num_input_tokens_seen": 210093408, + "step": 1163 + }, + { + "epoch": 0.1274254905717179, + "grad_norm": 1.1568360261210238, + "learning_rate": 4.802309794945165e-05, + "loss": 0.8759, + "num_input_tokens_seen": 210305536, + "step": 1164 + }, + { + "epoch": 0.1275349626426558, + "grad_norm": 1.2191310678075058, + "learning_rate": 4.8019745340032574e-05, + "loss": 0.7458, + "num_input_tokens_seen": 210506240, + "step": 1165 + }, + { + "epoch": 0.12764443471359369, + "grad_norm": 1.2162981557379047, + "learning_rate": 4.801639000742163e-05, + "loss": 0.7389, + "num_input_tokens_seen": 210697760, + "step": 1166 + }, + { + "epoch": 0.1277539067845316, + "grad_norm": 1.3483807371708305, + "learning_rate": 4.801303195201574e-05, + "loss": 0.7821, + "num_input_tokens_seen": 210884352, + "step": 1167 + }, + { + "epoch": 0.1278633788554695, + "grad_norm": 1.276849726997726, + "learning_rate": 4.8009671174212176e-05, + "loss": 0.6153, + "num_input_tokens_seen": 211022336, + "step": 1168 + }, + { + "epoch": 0.1279728509264074, + "grad_norm": 1.2671390002055734, + "learning_rate": 4.8006307674408494e-05, + "loss": 0.6053, + "num_input_tokens_seen": 211192352, + "step": 1169 + }, + { + "epoch": 0.12808232299734532, + "grad_norm": 1.26596982991396, + "learning_rate": 4.800294145300259e-05, + "loss": 0.7252, + "num_input_tokens_seen": 211379840, + "step": 1170 + }, + { + "epoch": 0.1281917950682832, + "grad_norm": 1.3891186645961995, + "learning_rate": 4.799957251039269e-05, + "loss": 0.8351, + "num_input_tokens_seen": 211567328, + "step": 1171 + }, + { + "epoch": 0.1283012671392211, + "grad_norm": 1.3186948448326272, + "learning_rate": 4.799620084697732e-05, + "loss": 0.697, + "num_input_tokens_seen": 211754592, + "step": 1172 + }, + { + "epoch": 0.128410739210159, + "grad_norm": 1.3229626574994937, + "learning_rate": 4.799282646315537e-05, + "loss": 0.8519, + "num_input_tokens_seen": 211957088, + "step": 1173 + }, + { + "epoch": 0.12852021128109692, + "grad_norm": 1.2322448506602734, + "learning_rate": 4.798944935932599e-05, + "loss": 0.7239, + "num_input_tokens_seen": 212139648, + "step": 1174 + }, + { + "epoch": 0.1286296833520348, + "grad_norm": 1.16145390850416, + "learning_rate": 4.798606953588871e-05, + "loss": 0.7863, + "num_input_tokens_seen": 212337440, + "step": 1175 + }, + { + "epoch": 0.1287391554229727, + "grad_norm": 1.3589132116533529, + "learning_rate": 4.7982686993243335e-05, + "loss": 0.7712, + "num_input_tokens_seen": 212512384, + "step": 1176 + }, + { + "epoch": 0.12884862749391063, + "grad_norm": 1.2575147288163278, + "learning_rate": 4.797930173179003e-05, + "loss": 0.8943, + "num_input_tokens_seen": 212702784, + "step": 1177 + }, + { + "epoch": 0.12895809956484852, + "grad_norm": 1.2576966321189296, + "learning_rate": 4.797591375192926e-05, + "loss": 0.6998, + "num_input_tokens_seen": 212879296, + "step": 1178 + }, + { + "epoch": 0.12906757163578642, + "grad_norm": 1.186718683701112, + "learning_rate": 4.7972523054061815e-05, + "loss": 0.7541, + "num_input_tokens_seen": 213052000, + "step": 1179 + }, + { + "epoch": 0.1291770437067243, + "grad_norm": 1.273209018921855, + "learning_rate": 4.7969129638588805e-05, + "loss": 0.6441, + "num_input_tokens_seen": 213231872, + "step": 1180 + }, + { + "epoch": 0.12928651577766223, + "grad_norm": 1.341535739947239, + "learning_rate": 4.796573350591167e-05, + "loss": 0.6092, + "num_input_tokens_seen": 213431904, + "step": 1181 + }, + { + "epoch": 0.12939598784860012, + "grad_norm": 1.340797863110385, + "learning_rate": 4.796233465643216e-05, + "loss": 0.6959, + "num_input_tokens_seen": 213620288, + "step": 1182 + }, + { + "epoch": 0.12950545991953802, + "grad_norm": 1.2438843850997057, + "learning_rate": 4.7958933090552365e-05, + "loss": 0.6207, + "num_input_tokens_seen": 213790752, + "step": 1183 + }, + { + "epoch": 0.12961493199047594, + "grad_norm": 1.470541814744604, + "learning_rate": 4.795552880867467e-05, + "loss": 0.8396, + "num_input_tokens_seen": 214008256, + "step": 1184 + }, + { + "epoch": 0.12972440406141383, + "grad_norm": 1.2364968038657251, + "learning_rate": 4.795212181120181e-05, + "loss": 0.5226, + "num_input_tokens_seen": 214210976, + "step": 1185 + }, + { + "epoch": 0.12983387613235173, + "grad_norm": 1.3619624538624022, + "learning_rate": 4.79487120985368e-05, + "loss": 0.8702, + "num_input_tokens_seen": 214397344, + "step": 1186 + }, + { + "epoch": 0.12994334820328965, + "grad_norm": 1.3621547821589408, + "learning_rate": 4.7945299671083036e-05, + "loss": 0.8755, + "num_input_tokens_seen": 214589088, + "step": 1187 + }, + { + "epoch": 0.13005282027422754, + "grad_norm": 1.282265981400936, + "learning_rate": 4.7941884529244175e-05, + "loss": 0.8339, + "num_input_tokens_seen": 214773888, + "step": 1188 + }, + { + "epoch": 0.13016229234516544, + "grad_norm": 1.2663443134277332, + "learning_rate": 4.793846667342423e-05, + "loss": 0.7587, + "num_input_tokens_seen": 214955552, + "step": 1189 + }, + { + "epoch": 0.13027176441610333, + "grad_norm": 1.2959265860380156, + "learning_rate": 4.793504610402754e-05, + "loss": 0.7152, + "num_input_tokens_seen": 215124896, + "step": 1190 + }, + { + "epoch": 0.13038123648704125, + "grad_norm": 1.2114623870300714, + "learning_rate": 4.7931622821458726e-05, + "loss": 0.717, + "num_input_tokens_seen": 215307904, + "step": 1191 + }, + { + "epoch": 0.13049070855797915, + "grad_norm": 1.2400243866737433, + "learning_rate": 4.7928196826122775e-05, + "loss": 0.8882, + "num_input_tokens_seen": 215475456, + "step": 1192 + }, + { + "epoch": 0.13060018062891704, + "grad_norm": 1.203729254839941, + "learning_rate": 4.7924768118424975e-05, + "loss": 0.7955, + "num_input_tokens_seen": 215666528, + "step": 1193 + }, + { + "epoch": 0.13070965269985496, + "grad_norm": 1.162527621361517, + "learning_rate": 4.7921336698770926e-05, + "loss": 0.9059, + "num_input_tokens_seen": 215871712, + "step": 1194 + }, + { + "epoch": 0.13081912477079286, + "grad_norm": 1.0809053698680653, + "learning_rate": 4.791790256756657e-05, + "loss": 0.7021, + "num_input_tokens_seen": 216077568, + "step": 1195 + }, + { + "epoch": 0.13092859684173075, + "grad_norm": 1.073556591907116, + "learning_rate": 4.791446572521815e-05, + "loss": 0.7179, + "num_input_tokens_seen": 216239744, + "step": 1196 + }, + { + "epoch": 0.13103806891266864, + "grad_norm": 1.174011876200535, + "learning_rate": 4.791102617213223e-05, + "loss": 0.7936, + "num_input_tokens_seen": 216435744, + "step": 1197 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 1.1697277487562339, + "learning_rate": 4.7907583908715725e-05, + "loss": 0.5434, + "num_input_tokens_seen": 216578208, + "step": 1198 + }, + { + "epoch": 0.13125701305454446, + "grad_norm": 1.4920246774407053, + "learning_rate": 4.790413893537583e-05, + "loss": 0.7392, + "num_input_tokens_seen": 216755840, + "step": 1199 + }, + { + "epoch": 0.13136648512548235, + "grad_norm": 1.2962596766405885, + "learning_rate": 4.790069125252009e-05, + "loss": 0.8309, + "num_input_tokens_seen": 216965504, + "step": 1200 + }, + { + "epoch": 0.13147595719642027, + "grad_norm": 1.2611610431887732, + "learning_rate": 4.7897240860556345e-05, + "loss": 0.7778, + "num_input_tokens_seen": 217160832, + "step": 1201 + }, + { + "epoch": 0.13158542926735817, + "grad_norm": 1.3479557754919818, + "learning_rate": 4.789378775989278e-05, + "loss": 0.6507, + "num_input_tokens_seen": 217335552, + "step": 1202 + }, + { + "epoch": 0.13169490133829606, + "grad_norm": 1.332762039499424, + "learning_rate": 4.789033195093789e-05, + "loss": 0.6143, + "num_input_tokens_seen": 217514752, + "step": 1203 + }, + { + "epoch": 0.13180437340923398, + "grad_norm": 1.226499713821471, + "learning_rate": 4.7886873434100486e-05, + "loss": 0.6973, + "num_input_tokens_seen": 217678272, + "step": 1204 + }, + { + "epoch": 0.13191384548017188, + "grad_norm": 1.2548155057794819, + "learning_rate": 4.7883412209789714e-05, + "loss": 0.6889, + "num_input_tokens_seen": 217859488, + "step": 1205 + }, + { + "epoch": 0.13202331755110977, + "grad_norm": 1.4049841375575491, + "learning_rate": 4.787994827841502e-05, + "loss": 0.6443, + "num_input_tokens_seen": 218029504, + "step": 1206 + }, + { + "epoch": 0.13213278962204766, + "grad_norm": 1.1889022779354503, + "learning_rate": 4.7876481640386184e-05, + "loss": 0.8995, + "num_input_tokens_seen": 218243424, + "step": 1207 + }, + { + "epoch": 0.13224226169298559, + "grad_norm": 1.2059260373090972, + "learning_rate": 4.78730122961133e-05, + "loss": 0.7954, + "num_input_tokens_seen": 218408512, + "step": 1208 + }, + { + "epoch": 0.13235173376392348, + "grad_norm": 1.0879557673950566, + "learning_rate": 4.78695402460068e-05, + "loss": 0.5628, + "num_input_tokens_seen": 218589728, + "step": 1209 + }, + { + "epoch": 0.13246120583486137, + "grad_norm": 1.1469855959419364, + "learning_rate": 4.7866065490477386e-05, + "loss": 0.6415, + "num_input_tokens_seen": 218760416, + "step": 1210 + }, + { + "epoch": 0.1325706779057993, + "grad_norm": 1.3804933313708145, + "learning_rate": 4.786258802993615e-05, + "loss": 0.8429, + "num_input_tokens_seen": 218953952, + "step": 1211 + }, + { + "epoch": 0.1326801499767372, + "grad_norm": 1.488090362722494, + "learning_rate": 4.785910786479445e-05, + "loss": 0.8278, + "num_input_tokens_seen": 219122176, + "step": 1212 + }, + { + "epoch": 0.13278962204767508, + "grad_norm": 1.3001012084151362, + "learning_rate": 4.7855624995464e-05, + "loss": 0.6846, + "num_input_tokens_seen": 219303168, + "step": 1213 + }, + { + "epoch": 0.13289909411861298, + "grad_norm": 1.3351778692701606, + "learning_rate": 4.785213942235679e-05, + "loss": 0.6599, + "num_input_tokens_seen": 219511712, + "step": 1214 + }, + { + "epoch": 0.1330085661895509, + "grad_norm": 1.3161328311947678, + "learning_rate": 4.784865114588518e-05, + "loss": 0.668, + "num_input_tokens_seen": 219695168, + "step": 1215 + }, + { + "epoch": 0.1331180382604888, + "grad_norm": 1.254951113018465, + "learning_rate": 4.784516016646182e-05, + "loss": 0.7701, + "num_input_tokens_seen": 219835616, + "step": 1216 + }, + { + "epoch": 0.13322751033142669, + "grad_norm": 1.2789643998578775, + "learning_rate": 4.784166648449969e-05, + "loss": 0.7214, + "num_input_tokens_seen": 220009440, + "step": 1217 + }, + { + "epoch": 0.1333369824023646, + "grad_norm": 1.220015626713465, + "learning_rate": 4.783817010041207e-05, + "loss": 0.7259, + "num_input_tokens_seen": 220192000, + "step": 1218 + }, + { + "epoch": 0.1334464544733025, + "grad_norm": 1.11807876994567, + "learning_rate": 4.783467101461259e-05, + "loss": 0.71, + "num_input_tokens_seen": 220387104, + "step": 1219 + }, + { + "epoch": 0.1335559265442404, + "grad_norm": 1.223774212996101, + "learning_rate": 4.783116922751518e-05, + "loss": 0.7189, + "num_input_tokens_seen": 220552864, + "step": 1220 + }, + { + "epoch": 0.13366539861517832, + "grad_norm": 1.1922481876390711, + "learning_rate": 4.78276647395341e-05, + "loss": 0.6644, + "num_input_tokens_seen": 220743264, + "step": 1221 + }, + { + "epoch": 0.1337748706861162, + "grad_norm": 1.2669237796103936, + "learning_rate": 4.782415755108392e-05, + "loss": 0.7144, + "num_input_tokens_seen": 220916192, + "step": 1222 + }, + { + "epoch": 0.1338843427570541, + "grad_norm": 1.413152623267223, + "learning_rate": 4.782064766257953e-05, + "loss": 0.6621, + "num_input_tokens_seen": 221097408, + "step": 1223 + }, + { + "epoch": 0.133993814827992, + "grad_norm": 1.2863371273296862, + "learning_rate": 4.781713507443615e-05, + "loss": 0.8934, + "num_input_tokens_seen": 221309984, + "step": 1224 + }, + { + "epoch": 0.13410328689892992, + "grad_norm": 1.342255448740804, + "learning_rate": 4.7813619787069314e-05, + "loss": 0.7203, + "num_input_tokens_seen": 221499488, + "step": 1225 + }, + { + "epoch": 0.1342127589698678, + "grad_norm": 1.1910661055494343, + "learning_rate": 4.781010180089487e-05, + "loss": 0.6175, + "num_input_tokens_seen": 221637248, + "step": 1226 + }, + { + "epoch": 0.1343222310408057, + "grad_norm": 1.3370000172103618, + "learning_rate": 4.7806581116328976e-05, + "loss": 0.7926, + "num_input_tokens_seen": 221845120, + "step": 1227 + }, + { + "epoch": 0.13443170311174363, + "grad_norm": 1.210057092375844, + "learning_rate": 4.780305773378815e-05, + "loss": 0.7109, + "num_input_tokens_seen": 222038208, + "step": 1228 + }, + { + "epoch": 0.13454117518268152, + "grad_norm": 1.301280982448442, + "learning_rate": 4.779953165368917e-05, + "loss": 0.6611, + "num_input_tokens_seen": 222203072, + "step": 1229 + }, + { + "epoch": 0.13465064725361942, + "grad_norm": 1.271622610708077, + "learning_rate": 4.779600287644919e-05, + "loss": 0.7126, + "num_input_tokens_seen": 222397728, + "step": 1230 + }, + { + "epoch": 0.1347601193245573, + "grad_norm": 1.2623793346037115, + "learning_rate": 4.779247140248565e-05, + "loss": 0.7454, + "num_input_tokens_seen": 222568640, + "step": 1231 + }, + { + "epoch": 0.13486959139549523, + "grad_norm": 1.2540019346432159, + "learning_rate": 4.778893723221631e-05, + "loss": 0.8002, + "num_input_tokens_seen": 222744480, + "step": 1232 + }, + { + "epoch": 0.13497906346643312, + "grad_norm": 1.234620564818056, + "learning_rate": 4.7785400366059266e-05, + "loss": 0.6589, + "num_input_tokens_seen": 222900160, + "step": 1233 + }, + { + "epoch": 0.13508853553737102, + "grad_norm": 1.1811602664349057, + "learning_rate": 4.778186080443291e-05, + "loss": 0.616, + "num_input_tokens_seen": 223074208, + "step": 1234 + }, + { + "epoch": 0.13519800760830894, + "grad_norm": 1.3846277940430414, + "learning_rate": 4.777831854775598e-05, + "loss": 0.6828, + "num_input_tokens_seen": 223231008, + "step": 1235 + }, + { + "epoch": 0.13530747967924683, + "grad_norm": 1.3802256121243637, + "learning_rate": 4.777477359644751e-05, + "loss": 0.8746, + "num_input_tokens_seen": 223399680, + "step": 1236 + }, + { + "epoch": 0.13541695175018473, + "grad_norm": 1.1944172816674965, + "learning_rate": 4.7771225950926854e-05, + "loss": 0.6213, + "num_input_tokens_seen": 223582016, + "step": 1237 + }, + { + "epoch": 0.13552642382112265, + "grad_norm": 1.2991014236077105, + "learning_rate": 4.7767675611613704e-05, + "loss": 0.7855, + "num_input_tokens_seen": 223764576, + "step": 1238 + }, + { + "epoch": 0.13563589589206054, + "grad_norm": 1.2863480164130678, + "learning_rate": 4.776412257892805e-05, + "loss": 0.7594, + "num_input_tokens_seen": 223937280, + "step": 1239 + }, + { + "epoch": 0.13574536796299844, + "grad_norm": 1.1471407049824307, + "learning_rate": 4.7760566853290215e-05, + "loss": 0.6476, + "num_input_tokens_seen": 224157920, + "step": 1240 + }, + { + "epoch": 0.13585484003393633, + "grad_norm": 1.292954828008743, + "learning_rate": 4.775700843512084e-05, + "loss": 0.6917, + "num_input_tokens_seen": 224325248, + "step": 1241 + }, + { + "epoch": 0.13596431210487425, + "grad_norm": 1.2510786647656589, + "learning_rate": 4.775344732484086e-05, + "loss": 0.8417, + "num_input_tokens_seen": 224527296, + "step": 1242 + }, + { + "epoch": 0.13607378417581215, + "grad_norm": 1.2095574667636704, + "learning_rate": 4.774988352287156e-05, + "loss": 0.6032, + "num_input_tokens_seen": 224714112, + "step": 1243 + }, + { + "epoch": 0.13618325624675004, + "grad_norm": 1.2729944968063416, + "learning_rate": 4.774631702963453e-05, + "loss": 0.6384, + "num_input_tokens_seen": 224904960, + "step": 1244 + }, + { + "epoch": 0.13629272831768796, + "grad_norm": 1.2799400518207416, + "learning_rate": 4.7742747845551685e-05, + "loss": 0.8046, + "num_input_tokens_seen": 225053696, + "step": 1245 + }, + { + "epoch": 0.13640220038862585, + "grad_norm": 1.2038616550907244, + "learning_rate": 4.773917597104525e-05, + "loss": 0.695, + "num_input_tokens_seen": 225238944, + "step": 1246 + }, + { + "epoch": 0.13651167245956375, + "grad_norm": 1.2013096179914688, + "learning_rate": 4.773560140653775e-05, + "loss": 0.6213, + "num_input_tokens_seen": 225415680, + "step": 1247 + }, + { + "epoch": 0.13662114453050164, + "grad_norm": 1.3310943619992779, + "learning_rate": 4.773202415245208e-05, + "loss": 0.7096, + "num_input_tokens_seen": 225587040, + "step": 1248 + }, + { + "epoch": 0.13673061660143956, + "grad_norm": 1.1944007138568868, + "learning_rate": 4.772844420921141e-05, + "loss": 0.5362, + "num_input_tokens_seen": 225751680, + "step": 1249 + }, + { + "epoch": 0.13684008867237746, + "grad_norm": 1.0938074914263827, + "learning_rate": 4.772486157723923e-05, + "loss": 0.5776, + "num_input_tokens_seen": 225953728, + "step": 1250 + }, + { + "epoch": 0.13694956074331535, + "grad_norm": 1.466329893909562, + "learning_rate": 4.772127625695937e-05, + "loss": 0.801, + "num_input_tokens_seen": 226098656, + "step": 1251 + }, + { + "epoch": 0.13705903281425327, + "grad_norm": 1.3077182858894472, + "learning_rate": 4.771768824879597e-05, + "loss": 0.645, + "num_input_tokens_seen": 226261504, + "step": 1252 + }, + { + "epoch": 0.13716850488519117, + "grad_norm": 1.3145303916044164, + "learning_rate": 4.771409755317348e-05, + "loss": 0.5859, + "num_input_tokens_seen": 226434656, + "step": 1253 + }, + { + "epoch": 0.13727797695612906, + "grad_norm": 1.1505927797164894, + "learning_rate": 4.771050417051667e-05, + "loss": 0.6591, + "num_input_tokens_seen": 226594368, + "step": 1254 + }, + { + "epoch": 0.13738744902706698, + "grad_norm": 1.1101988835910028, + "learning_rate": 4.770690810125062e-05, + "loss": 0.6216, + "num_input_tokens_seen": 226786336, + "step": 1255 + }, + { + "epoch": 0.13749692109800488, + "grad_norm": 1.4231378166549722, + "learning_rate": 4.7703309345800766e-05, + "loss": 0.7215, + "num_input_tokens_seen": 226956576, + "step": 1256 + }, + { + "epoch": 0.13760639316894277, + "grad_norm": 1.4217905711165215, + "learning_rate": 4.769970790459282e-05, + "loss": 0.9164, + "num_input_tokens_seen": 227139808, + "step": 1257 + }, + { + "epoch": 0.13771586523988066, + "grad_norm": 1.3545536173096833, + "learning_rate": 4.769610377805281e-05, + "loss": 0.7662, + "num_input_tokens_seen": 227322592, + "step": 1258 + }, + { + "epoch": 0.13782533731081859, + "grad_norm": 1.1983193867755657, + "learning_rate": 4.769249696660711e-05, + "loss": 0.806, + "num_input_tokens_seen": 227517920, + "step": 1259 + }, + { + "epoch": 0.13793480938175648, + "grad_norm": 1.2323856499572858, + "learning_rate": 4.768888747068241e-05, + "loss": 0.6878, + "num_input_tokens_seen": 227684128, + "step": 1260 + }, + { + "epoch": 0.13804428145269437, + "grad_norm": 1.1153906931445934, + "learning_rate": 4.7685275290705686e-05, + "loss": 0.577, + "num_input_tokens_seen": 227868256, + "step": 1261 + }, + { + "epoch": 0.1381537535236323, + "grad_norm": 1.3301956640171428, + "learning_rate": 4.7681660427104266e-05, + "loss": 0.6753, + "num_input_tokens_seen": 228028192, + "step": 1262 + }, + { + "epoch": 0.1382632255945702, + "grad_norm": 1.1826005047756274, + "learning_rate": 4.7678042880305785e-05, + "loss": 0.7942, + "num_input_tokens_seen": 228209184, + "step": 1263 + }, + { + "epoch": 0.13837269766550808, + "grad_norm": 1.317554414627979, + "learning_rate": 4.767442265073818e-05, + "loss": 0.6687, + "num_input_tokens_seen": 228361504, + "step": 1264 + }, + { + "epoch": 0.13848216973644598, + "grad_norm": 1.2593787257137041, + "learning_rate": 4.767079973882972e-05, + "loss": 0.6364, + "num_input_tokens_seen": 228532416, + "step": 1265 + }, + { + "epoch": 0.1385916418073839, + "grad_norm": 1.2173951761446271, + "learning_rate": 4.766717414500898e-05, + "loss": 0.7894, + "num_input_tokens_seen": 228704224, + "step": 1266 + }, + { + "epoch": 0.1387011138783218, + "grad_norm": 1.1126440533587043, + "learning_rate": 4.766354586970489e-05, + "loss": 0.6682, + "num_input_tokens_seen": 228890368, + "step": 1267 + }, + { + "epoch": 0.13881058594925968, + "grad_norm": 1.3151186297033706, + "learning_rate": 4.7659914913346634e-05, + "loss": 0.6803, + "num_input_tokens_seen": 229068224, + "step": 1268 + }, + { + "epoch": 0.1389200580201976, + "grad_norm": 1.2222754522253447, + "learning_rate": 4.7656281276363765e-05, + "loss": 0.666, + "num_input_tokens_seen": 229266688, + "step": 1269 + }, + { + "epoch": 0.1390295300911355, + "grad_norm": 1.333952069764051, + "learning_rate": 4.7652644959186146e-05, + "loss": 0.6594, + "num_input_tokens_seen": 229446784, + "step": 1270 + }, + { + "epoch": 0.1391390021620734, + "grad_norm": 1.2084809026823002, + "learning_rate": 4.764900596224392e-05, + "loss": 0.7602, + "num_input_tokens_seen": 229598208, + "step": 1271 + }, + { + "epoch": 0.13924847423301132, + "grad_norm": 1.2587338029260686, + "learning_rate": 4.7645364285967584e-05, + "loss": 0.788, + "num_input_tokens_seen": 229783904, + "step": 1272 + }, + { + "epoch": 0.1393579463039492, + "grad_norm": 1.2659979376753512, + "learning_rate": 4.764171993078795e-05, + "loss": 0.6389, + "num_input_tokens_seen": 229948320, + "step": 1273 + }, + { + "epoch": 0.1394674183748871, + "grad_norm": 1.3428975439806086, + "learning_rate": 4.763807289713613e-05, + "loss": 0.9654, + "num_input_tokens_seen": 230137824, + "step": 1274 + }, + { + "epoch": 0.139576890445825, + "grad_norm": 1.236987765098268, + "learning_rate": 4.763442318544356e-05, + "loss": 0.7913, + "num_input_tokens_seen": 230325088, + "step": 1275 + }, + { + "epoch": 0.13968636251676292, + "grad_norm": 1.2407801110762366, + "learning_rate": 4.7630770796142e-05, + "loss": 0.7346, + "num_input_tokens_seen": 230518624, + "step": 1276 + }, + { + "epoch": 0.1397958345877008, + "grad_norm": 1.2054405718394554, + "learning_rate": 4.762711572966352e-05, + "loss": 0.5652, + "num_input_tokens_seen": 230701408, + "step": 1277 + }, + { + "epoch": 0.1399053066586387, + "grad_norm": 1.20486329573483, + "learning_rate": 4.76234579864405e-05, + "loss": 0.7643, + "num_input_tokens_seen": 230885312, + "step": 1278 + }, + { + "epoch": 0.14001477872957663, + "grad_norm": 1.2571950783194157, + "learning_rate": 4.761979756690565e-05, + "loss": 0.7445, + "num_input_tokens_seen": 231066752, + "step": 1279 + }, + { + "epoch": 0.14012425080051452, + "grad_norm": 1.1514348988120833, + "learning_rate": 4.761613447149199e-05, + "loss": 0.7607, + "num_input_tokens_seen": 231277088, + "step": 1280 + }, + { + "epoch": 0.14023372287145242, + "grad_norm": 1.1544033459713374, + "learning_rate": 4.761246870063286e-05, + "loss": 0.7039, + "num_input_tokens_seen": 231484064, + "step": 1281 + }, + { + "epoch": 0.1403431949423903, + "grad_norm": 1.2397545073900942, + "learning_rate": 4.760880025476191e-05, + "loss": 0.658, + "num_input_tokens_seen": 231650496, + "step": 1282 + }, + { + "epoch": 0.14045266701332823, + "grad_norm": 1.2520186813258727, + "learning_rate": 4.76051291343131e-05, + "loss": 0.7526, + "num_input_tokens_seen": 231841344, + "step": 1283 + }, + { + "epoch": 0.14056213908426612, + "grad_norm": 1.3993690840207198, + "learning_rate": 4.7601455339720736e-05, + "loss": 0.7538, + "num_input_tokens_seen": 231998144, + "step": 1284 + }, + { + "epoch": 0.14067161115520402, + "grad_norm": 1.3464820058351927, + "learning_rate": 4.759777887141941e-05, + "loss": 0.5775, + "num_input_tokens_seen": 232174880, + "step": 1285 + }, + { + "epoch": 0.14078108322614194, + "grad_norm": 1.300562109831334, + "learning_rate": 4.7594099729844045e-05, + "loss": 0.6722, + "num_input_tokens_seen": 232333696, + "step": 1286 + }, + { + "epoch": 0.14089055529707983, + "grad_norm": 1.391919208881406, + "learning_rate": 4.759041791542987e-05, + "loss": 0.6954, + "num_input_tokens_seen": 232527680, + "step": 1287 + }, + { + "epoch": 0.14100002736801773, + "grad_norm": 1.2004187727413926, + "learning_rate": 4.7586733428612454e-05, + "loss": 0.637, + "num_input_tokens_seen": 232734432, + "step": 1288 + }, + { + "epoch": 0.14110949943895565, + "grad_norm": 1.189456031432461, + "learning_rate": 4.758304626982764e-05, + "loss": 0.6359, + "num_input_tokens_seen": 232939168, + "step": 1289 + }, + { + "epoch": 0.14121897150989354, + "grad_norm": 1.2555134298258799, + "learning_rate": 4.757935643951163e-05, + "loss": 0.7564, + "num_input_tokens_seen": 233103136, + "step": 1290 + }, + { + "epoch": 0.14132844358083144, + "grad_norm": 1.4503569952746342, + "learning_rate": 4.757566393810091e-05, + "loss": 0.9284, + "num_input_tokens_seen": 233278080, + "step": 1291 + }, + { + "epoch": 0.14143791565176933, + "grad_norm": 1.3231387164774853, + "learning_rate": 4.75719687660323e-05, + "loss": 0.8183, + "num_input_tokens_seen": 233470272, + "step": 1292 + }, + { + "epoch": 0.14154738772270725, + "grad_norm": 1.2865800014933446, + "learning_rate": 4.756827092374295e-05, + "loss": 0.8962, + "num_input_tokens_seen": 233653280, + "step": 1293 + }, + { + "epoch": 0.14165685979364515, + "grad_norm": 1.237149898056879, + "learning_rate": 4.7564570411670284e-05, + "loss": 0.8683, + "num_input_tokens_seen": 233832704, + "step": 1294 + }, + { + "epoch": 0.14176633186458304, + "grad_norm": 1.219030152424392, + "learning_rate": 4.756086723025208e-05, + "loss": 0.6789, + "num_input_tokens_seen": 234021088, + "step": 1295 + }, + { + "epoch": 0.14187580393552096, + "grad_norm": 1.231360218517389, + "learning_rate": 4.755716137992641e-05, + "loss": 0.764, + "num_input_tokens_seen": 234191776, + "step": 1296 + }, + { + "epoch": 0.14198527600645885, + "grad_norm": 1.1503419201940408, + "learning_rate": 4.755345286113166e-05, + "loss": 0.8036, + "num_input_tokens_seen": 234346560, + "step": 1297 + }, + { + "epoch": 0.14209474807739675, + "grad_norm": 1.2243846827110032, + "learning_rate": 4.7549741674306567e-05, + "loss": 0.7765, + "num_input_tokens_seen": 234530464, + "step": 1298 + }, + { + "epoch": 0.14220422014833464, + "grad_norm": 1.2130704245852162, + "learning_rate": 4.754602781989013e-05, + "loss": 0.7371, + "num_input_tokens_seen": 234709440, + "step": 1299 + }, + { + "epoch": 0.14231369221927256, + "grad_norm": 1.2503312148924817, + "learning_rate": 4.754231129832171e-05, + "loss": 0.6459, + "num_input_tokens_seen": 234885504, + "step": 1300 + }, + { + "epoch": 0.14242316429021046, + "grad_norm": 1.3428682051075442, + "learning_rate": 4.753859211004096e-05, + "loss": 0.8833, + "num_input_tokens_seen": 235082624, + "step": 1301 + }, + { + "epoch": 0.14253263636114835, + "grad_norm": 1.1703341208321023, + "learning_rate": 4.753487025548784e-05, + "loss": 0.722, + "num_input_tokens_seen": 235257792, + "step": 1302 + }, + { + "epoch": 0.14264210843208627, + "grad_norm": 1.6495064979973328, + "learning_rate": 4.753114573510265e-05, + "loss": 0.8241, + "num_input_tokens_seen": 235410560, + "step": 1303 + }, + { + "epoch": 0.14275158050302417, + "grad_norm": 1.0491553565080525, + "learning_rate": 4.7527418549326e-05, + "loss": 0.5141, + "num_input_tokens_seen": 235607456, + "step": 1304 + }, + { + "epoch": 0.14286105257396206, + "grad_norm": 1.2474636093978255, + "learning_rate": 4.752368869859879e-05, + "loss": 0.7885, + "num_input_tokens_seen": 235801440, + "step": 1305 + }, + { + "epoch": 0.14297052464489998, + "grad_norm": 1.3513384947616276, + "learning_rate": 4.751995618336227e-05, + "loss": 0.7191, + "num_input_tokens_seen": 236018944, + "step": 1306 + }, + { + "epoch": 0.14307999671583788, + "grad_norm": 1.3018159159394331, + "learning_rate": 4.751622100405798e-05, + "loss": 0.6686, + "num_input_tokens_seen": 236209344, + "step": 1307 + }, + { + "epoch": 0.14318946878677577, + "grad_norm": 1.1609775192325082, + "learning_rate": 4.7512483161127794e-05, + "loss": 0.5666, + "num_input_tokens_seen": 236404000, + "step": 1308 + }, + { + "epoch": 0.14329894085771366, + "grad_norm": 1.2545826831327334, + "learning_rate": 4.750874265501389e-05, + "loss": 0.6433, + "num_input_tokens_seen": 236577600, + "step": 1309 + }, + { + "epoch": 0.14340841292865159, + "grad_norm": 1.4550856622078279, + "learning_rate": 4.750499948615875e-05, + "loss": 0.7324, + "num_input_tokens_seen": 236741568, + "step": 1310 + }, + { + "epoch": 0.14351788499958948, + "grad_norm": 1.2955718789663584, + "learning_rate": 4.750125365500521e-05, + "loss": 0.6275, + "num_input_tokens_seen": 236910912, + "step": 1311 + }, + { + "epoch": 0.14362735707052737, + "grad_norm": 1.3801428245496934, + "learning_rate": 4.7497505161996356e-05, + "loss": 0.7143, + "num_input_tokens_seen": 237111616, + "step": 1312 + }, + { + "epoch": 0.1437368291414653, + "grad_norm": 1.309201045532928, + "learning_rate": 4.749375400757566e-05, + "loss": 0.8587, + "num_input_tokens_seen": 237275584, + "step": 1313 + }, + { + "epoch": 0.1438463012124032, + "grad_norm": 1.3334576081931535, + "learning_rate": 4.749000019218687e-05, + "loss": 0.6385, + "num_input_tokens_seen": 237445824, + "step": 1314 + }, + { + "epoch": 0.14395577328334108, + "grad_norm": 1.40318787227879, + "learning_rate": 4.7486243716274036e-05, + "loss": 0.788, + "num_input_tokens_seen": 237612928, + "step": 1315 + }, + { + "epoch": 0.14406524535427898, + "grad_norm": 1.3055126755799247, + "learning_rate": 4.748248458028157e-05, + "loss": 0.7473, + "num_input_tokens_seen": 237807584, + "step": 1316 + }, + { + "epoch": 0.1441747174252169, + "grad_norm": 1.2036907797850487, + "learning_rate": 4.747872278465416e-05, + "loss": 0.6219, + "num_input_tokens_seen": 237949152, + "step": 1317 + }, + { + "epoch": 0.1442841894961548, + "grad_norm": 1.310611410584441, + "learning_rate": 4.7474958329836805e-05, + "loss": 0.6392, + "num_input_tokens_seen": 238133280, + "step": 1318 + }, + { + "epoch": 0.14439366156709268, + "grad_norm": 1.164915911296146, + "learning_rate": 4.747119121627485e-05, + "loss": 0.656, + "num_input_tokens_seen": 238327936, + "step": 1319 + }, + { + "epoch": 0.1445031336380306, + "grad_norm": 1.2989643685150458, + "learning_rate": 4.746742144441393e-05, + "loss": 0.7357, + "num_input_tokens_seen": 238488096, + "step": 1320 + }, + { + "epoch": 0.1446126057089685, + "grad_norm": 1.1910489906849633, + "learning_rate": 4.7463649014700004e-05, + "loss": 0.6705, + "num_input_tokens_seen": 238660352, + "step": 1321 + }, + { + "epoch": 0.1447220777799064, + "grad_norm": 1.2109378445532524, + "learning_rate": 4.7459873927579345e-05, + "loss": 0.6446, + "num_input_tokens_seen": 238847616, + "step": 1322 + }, + { + "epoch": 0.14483154985084432, + "grad_norm": 1.2300344518163016, + "learning_rate": 4.745609618349853e-05, + "loss": 0.6836, + "num_input_tokens_seen": 239023008, + "step": 1323 + }, + { + "epoch": 0.1449410219217822, + "grad_norm": 1.3663297741941238, + "learning_rate": 4.7452315782904477e-05, + "loss": 0.8015, + "num_input_tokens_seen": 239192800, + "step": 1324 + }, + { + "epoch": 0.1450504939927201, + "grad_norm": 1.2884539005587476, + "learning_rate": 4.744853272624438e-05, + "loss": 0.6096, + "num_input_tokens_seen": 239378048, + "step": 1325 + }, + { + "epoch": 0.145159966063658, + "grad_norm": 1.2045939012079516, + "learning_rate": 4.7444747013965776e-05, + "loss": 0.7963, + "num_input_tokens_seen": 239589280, + "step": 1326 + }, + { + "epoch": 0.14526943813459592, + "grad_norm": 1.2749347426974065, + "learning_rate": 4.744095864651651e-05, + "loss": 0.7845, + "num_input_tokens_seen": 239787968, + "step": 1327 + }, + { + "epoch": 0.1453789102055338, + "grad_norm": 1.2111722657068265, + "learning_rate": 4.7437167624344736e-05, + "loss": 0.598, + "num_input_tokens_seen": 239953952, + "step": 1328 + }, + { + "epoch": 0.1454883822764717, + "grad_norm": 1.3765646528558426, + "learning_rate": 4.743337394789892e-05, + "loss": 1.0846, + "num_input_tokens_seen": 240152640, + "step": 1329 + }, + { + "epoch": 0.14559785434740963, + "grad_norm": 1.198433236682774, + "learning_rate": 4.7429577617627864e-05, + "loss": 0.5813, + "num_input_tokens_seen": 240359616, + "step": 1330 + }, + { + "epoch": 0.14570732641834752, + "grad_norm": 1.2159438472632138, + "learning_rate": 4.7425778633980636e-05, + "loss": 0.7968, + "num_input_tokens_seen": 240537696, + "step": 1331 + }, + { + "epoch": 0.14581679848928542, + "grad_norm": 1.278872680223479, + "learning_rate": 4.742197699740668e-05, + "loss": 0.8136, + "num_input_tokens_seen": 240732800, + "step": 1332 + }, + { + "epoch": 0.14592627056022334, + "grad_norm": 1.0917278124644823, + "learning_rate": 4.74181727083557e-05, + "loss": 0.6057, + "num_input_tokens_seen": 240911776, + "step": 1333 + }, + { + "epoch": 0.14603574263116123, + "grad_norm": 1.1570769265564937, + "learning_rate": 4.741436576727775e-05, + "loss": 0.756, + "num_input_tokens_seen": 241118304, + "step": 1334 + }, + { + "epoch": 0.14614521470209912, + "grad_norm": 1.2257750103161376, + "learning_rate": 4.741055617462318e-05, + "loss": 0.8881, + "num_input_tokens_seen": 241301760, + "step": 1335 + }, + { + "epoch": 0.14625468677303702, + "grad_norm": 1.1399478208579854, + "learning_rate": 4.7406743930842655e-05, + "loss": 0.6046, + "num_input_tokens_seen": 241471776, + "step": 1336 + }, + { + "epoch": 0.14636415884397494, + "grad_norm": 1.2439901838865937, + "learning_rate": 4.740292903638716e-05, + "loss": 0.6165, + "num_input_tokens_seen": 241629920, + "step": 1337 + }, + { + "epoch": 0.14647363091491283, + "grad_norm": 1.2330285950227589, + "learning_rate": 4.739911149170798e-05, + "loss": 0.772, + "num_input_tokens_seen": 241810464, + "step": 1338 + }, + { + "epoch": 0.14658310298585073, + "grad_norm": 1.3054081645355493, + "learning_rate": 4.7395291297256725e-05, + "loss": 0.749, + "num_input_tokens_seen": 241981600, + "step": 1339 + }, + { + "epoch": 0.14669257505678865, + "grad_norm": 1.22530850009708, + "learning_rate": 4.7391468453485334e-05, + "loss": 0.6604, + "num_input_tokens_seen": 242140192, + "step": 1340 + }, + { + "epoch": 0.14680204712772654, + "grad_norm": 1.3138826898119882, + "learning_rate": 4.738764296084603e-05, + "loss": 0.8946, + "num_input_tokens_seen": 242310656, + "step": 1341 + }, + { + "epoch": 0.14691151919866444, + "grad_norm": 1.1975487627840486, + "learning_rate": 4.738381481979136e-05, + "loss": 0.6092, + "num_input_tokens_seen": 242472832, + "step": 1342 + }, + { + "epoch": 0.14702099126960233, + "grad_norm": 1.424768208173345, + "learning_rate": 4.7379984030774184e-05, + "loss": 0.7997, + "num_input_tokens_seen": 242664128, + "step": 1343 + }, + { + "epoch": 0.14713046334054025, + "grad_norm": 1.3908222787203268, + "learning_rate": 4.737615059424768e-05, + "loss": 0.7865, + "num_input_tokens_seen": 242851840, + "step": 1344 + }, + { + "epoch": 0.14723993541147815, + "grad_norm": 1.2604312528076251, + "learning_rate": 4.737231451066534e-05, + "loss": 0.7408, + "num_input_tokens_seen": 243059488, + "step": 1345 + }, + { + "epoch": 0.14734940748241604, + "grad_norm": 1.300599598157628, + "learning_rate": 4.7368475780480956e-05, + "loss": 0.8787, + "num_input_tokens_seen": 243233760, + "step": 1346 + }, + { + "epoch": 0.14745887955335396, + "grad_norm": 1.3729174188004991, + "learning_rate": 4.7364634404148655e-05, + "loss": 0.7509, + "num_input_tokens_seen": 243382048, + "step": 1347 + }, + { + "epoch": 0.14756835162429185, + "grad_norm": 1.1777287573229671, + "learning_rate": 4.736079038212286e-05, + "loss": 0.7923, + "num_input_tokens_seen": 243578496, + "step": 1348 + }, + { + "epoch": 0.14767782369522975, + "grad_norm": 1.2454448194098706, + "learning_rate": 4.7356943714858306e-05, + "loss": 0.773, + "num_input_tokens_seen": 243770240, + "step": 1349 + }, + { + "epoch": 0.14778729576616767, + "grad_norm": 1.26645581326697, + "learning_rate": 4.735309440281005e-05, + "loss": 0.7505, + "num_input_tokens_seen": 243940704, + "step": 1350 + }, + { + "epoch": 0.14789676783710556, + "grad_norm": 1.2980063685509278, + "learning_rate": 4.7349242446433464e-05, + "loss": 0.9581, + "num_input_tokens_seen": 244129760, + "step": 1351 + }, + { + "epoch": 0.14800623990804346, + "grad_norm": 1.150484764508859, + "learning_rate": 4.734538784618421e-05, + "loss": 0.8142, + "num_input_tokens_seen": 244339872, + "step": 1352 + }, + { + "epoch": 0.14811571197898135, + "grad_norm": 1.0547787662153967, + "learning_rate": 4.734153060251829e-05, + "loss": 0.6503, + "num_input_tokens_seen": 244539904, + "step": 1353 + }, + { + "epoch": 0.14822518404991927, + "grad_norm": 1.1287519620115816, + "learning_rate": 4.733767071589202e-05, + "loss": 0.7656, + "num_input_tokens_seen": 244741952, + "step": 1354 + }, + { + "epoch": 0.14833465612085717, + "grad_norm": 1.222570446769614, + "learning_rate": 4.7333808186761996e-05, + "loss": 0.5994, + "num_input_tokens_seen": 244903456, + "step": 1355 + }, + { + "epoch": 0.14844412819179506, + "grad_norm": 1.4143577815350834, + "learning_rate": 4.732994301558516e-05, + "loss": 0.8564, + "num_input_tokens_seen": 245054656, + "step": 1356 + }, + { + "epoch": 0.14855360026273298, + "grad_norm": 1.2344770751468455, + "learning_rate": 4.7326075202818765e-05, + "loss": 0.7511, + "num_input_tokens_seen": 245230272, + "step": 1357 + }, + { + "epoch": 0.14866307233367088, + "grad_norm": 1.1613342264466149, + "learning_rate": 4.7322204748920345e-05, + "loss": 0.7001, + "num_input_tokens_seen": 245412160, + "step": 1358 + }, + { + "epoch": 0.14877254440460877, + "grad_norm": 1.310761799365042, + "learning_rate": 4.731833165434778e-05, + "loss": 0.8003, + "num_input_tokens_seen": 245590688, + "step": 1359 + }, + { + "epoch": 0.14888201647554666, + "grad_norm": 1.307485629380211, + "learning_rate": 4.731445591955924e-05, + "loss": 0.8152, + "num_input_tokens_seen": 245785120, + "step": 1360 + }, + { + "epoch": 0.14899148854648459, + "grad_norm": 1.2493661704044814, + "learning_rate": 4.7310577545013224e-05, + "loss": 0.6393, + "num_input_tokens_seen": 245950656, + "step": 1361 + }, + { + "epoch": 0.14910096061742248, + "grad_norm": 1.3754320766244807, + "learning_rate": 4.7306696531168535e-05, + "loss": 0.7065, + "num_input_tokens_seen": 246095808, + "step": 1362 + }, + { + "epoch": 0.14921043268836037, + "grad_norm": 1.237156305847267, + "learning_rate": 4.7302812878484294e-05, + "loss": 0.7815, + "num_input_tokens_seen": 246301888, + "step": 1363 + }, + { + "epoch": 0.1493199047592983, + "grad_norm": 1.1852847065522054, + "learning_rate": 4.7298926587419924e-05, + "loss": 0.7585, + "num_input_tokens_seen": 246471008, + "step": 1364 + }, + { + "epoch": 0.1494293768302362, + "grad_norm": 1.183489854740935, + "learning_rate": 4.729503765843516e-05, + "loss": 0.6733, + "num_input_tokens_seen": 246626688, + "step": 1365 + }, + { + "epoch": 0.14953884890117408, + "grad_norm": 1.1552969639140847, + "learning_rate": 4.7291146091990066e-05, + "loss": 0.6085, + "num_input_tokens_seen": 246809920, + "step": 1366 + }, + { + "epoch": 0.149648320972112, + "grad_norm": 1.3636221910231474, + "learning_rate": 4.7287251888545005e-05, + "loss": 0.6327, + "num_input_tokens_seen": 246989120, + "step": 1367 + }, + { + "epoch": 0.1497577930430499, + "grad_norm": 1.1997835401173815, + "learning_rate": 4.728335504856065e-05, + "loss": 0.6761, + "num_input_tokens_seen": 247184672, + "step": 1368 + }, + { + "epoch": 0.1498672651139878, + "grad_norm": 1.2830448604629014, + "learning_rate": 4.727945557249799e-05, + "loss": 0.5573, + "num_input_tokens_seen": 247376864, + "step": 1369 + }, + { + "epoch": 0.14997673718492568, + "grad_norm": 1.126840939549202, + "learning_rate": 4.727555346081833e-05, + "loss": 0.6686, + "num_input_tokens_seen": 247565024, + "step": 1370 + }, + { + "epoch": 0.1500862092558636, + "grad_norm": 1.2038474452901735, + "learning_rate": 4.7271648713983276e-05, + "loss": 0.8797, + "num_input_tokens_seen": 247761696, + "step": 1371 + }, + { + "epoch": 0.1501956813268015, + "grad_norm": 1.2143731386304912, + "learning_rate": 4.726774133245476e-05, + "loss": 0.6095, + "num_input_tokens_seen": 247919168, + "step": 1372 + }, + { + "epoch": 0.1503051533977394, + "grad_norm": 1.3978117496480789, + "learning_rate": 4.7263831316695005e-05, + "loss": 0.8636, + "num_input_tokens_seen": 248106432, + "step": 1373 + }, + { + "epoch": 0.15041462546867732, + "grad_norm": 1.1592634985431765, + "learning_rate": 4.725991866716657e-05, + "loss": 0.7765, + "num_input_tokens_seen": 248286080, + "step": 1374 + }, + { + "epoch": 0.1505240975396152, + "grad_norm": 1.238869897176999, + "learning_rate": 4.7256003384332314e-05, + "loss": 0.6218, + "num_input_tokens_seen": 248476704, + "step": 1375 + }, + { + "epoch": 0.1506335696105531, + "grad_norm": 1.4917869629471943, + "learning_rate": 4.72520854686554e-05, + "loss": 0.9153, + "num_input_tokens_seen": 248655680, + "step": 1376 + }, + { + "epoch": 0.150743041681491, + "grad_norm": 1.2356319064158108, + "learning_rate": 4.724816492059932e-05, + "loss": 0.6723, + "num_input_tokens_seen": 248838688, + "step": 1377 + }, + { + "epoch": 0.15085251375242892, + "grad_norm": 1.2027904491231851, + "learning_rate": 4.724424174062786e-05, + "loss": 0.6343, + "num_input_tokens_seen": 249004672, + "step": 1378 + }, + { + "epoch": 0.1509619858233668, + "grad_norm": 1.2050020017646845, + "learning_rate": 4.724031592920512e-05, + "loss": 0.6019, + "num_input_tokens_seen": 249170880, + "step": 1379 + }, + { + "epoch": 0.1510714578943047, + "grad_norm": 1.2409950146571131, + "learning_rate": 4.7236387486795525e-05, + "loss": 0.7691, + "num_input_tokens_seen": 249360160, + "step": 1380 + }, + { + "epoch": 0.15118092996524263, + "grad_norm": 1.1711723255469342, + "learning_rate": 4.72324564138638e-05, + "loss": 0.6459, + "num_input_tokens_seen": 249556832, + "step": 1381 + }, + { + "epoch": 0.15129040203618052, + "grad_norm": 1.35485628125996, + "learning_rate": 4.722852271087498e-05, + "loss": 0.7898, + "num_input_tokens_seen": 249722592, + "step": 1382 + }, + { + "epoch": 0.15139987410711842, + "grad_norm": 1.397161274362872, + "learning_rate": 4.722458637829442e-05, + "loss": 0.8013, + "num_input_tokens_seen": 249896864, + "step": 1383 + }, + { + "epoch": 0.15150934617805634, + "grad_norm": 1.167082485392511, + "learning_rate": 4.722064741658777e-05, + "loss": 0.6231, + "num_input_tokens_seen": 250087488, + "step": 1384 + }, + { + "epoch": 0.15161881824899423, + "grad_norm": 1.2958822472142364, + "learning_rate": 4.721670582622102e-05, + "loss": 0.6553, + "num_input_tokens_seen": 250248544, + "step": 1385 + }, + { + "epoch": 0.15172829031993212, + "grad_norm": 1.2307405201951513, + "learning_rate": 4.721276160766043e-05, + "loss": 0.618, + "num_input_tokens_seen": 250422368, + "step": 1386 + }, + { + "epoch": 0.15183776239087002, + "grad_norm": 1.182558620524138, + "learning_rate": 4.720881476137261e-05, + "loss": 0.8347, + "num_input_tokens_seen": 250632928, + "step": 1387 + }, + { + "epoch": 0.15194723446180794, + "grad_norm": 1.2140209195787128, + "learning_rate": 4.720486528782447e-05, + "loss": 0.6543, + "num_input_tokens_seen": 250820416, + "step": 1388 + }, + { + "epoch": 0.15205670653274583, + "grad_norm": 1.2020188237714107, + "learning_rate": 4.720091318748321e-05, + "loss": 0.8169, + "num_input_tokens_seen": 251015296, + "step": 1389 + }, + { + "epoch": 0.15216617860368373, + "grad_norm": 1.4546971993783697, + "learning_rate": 4.7196958460816356e-05, + "loss": 0.8252, + "num_input_tokens_seen": 251222944, + "step": 1390 + }, + { + "epoch": 0.15227565067462165, + "grad_norm": 1.2499545565927819, + "learning_rate": 4.719300110829174e-05, + "loss": 0.6979, + "num_input_tokens_seen": 251431264, + "step": 1391 + }, + { + "epoch": 0.15238512274555954, + "grad_norm": 1.3713415633782824, + "learning_rate": 4.718904113037754e-05, + "loss": 0.6846, + "num_input_tokens_seen": 251605760, + "step": 1392 + }, + { + "epoch": 0.15249459481649744, + "grad_norm": 1.2921286443847324, + "learning_rate": 4.718507852754218e-05, + "loss": 0.7524, + "num_input_tokens_seen": 251784064, + "step": 1393 + }, + { + "epoch": 0.15260406688743533, + "grad_norm": 1.2650055016710866, + "learning_rate": 4.718111330025444e-05, + "loss": 0.5712, + "num_input_tokens_seen": 251938176, + "step": 1394 + }, + { + "epoch": 0.15271353895837325, + "grad_norm": 1.3184121354460054, + "learning_rate": 4.717714544898341e-05, + "loss": 0.6583, + "num_input_tokens_seen": 252096992, + "step": 1395 + }, + { + "epoch": 0.15282301102931115, + "grad_norm": 1.3764447511503541, + "learning_rate": 4.717317497419846e-05, + "loss": 0.8149, + "num_input_tokens_seen": 252281792, + "step": 1396 + }, + { + "epoch": 0.15293248310024904, + "grad_norm": 1.3395875735930178, + "learning_rate": 4.7169201876369295e-05, + "loss": 0.6314, + "num_input_tokens_seen": 252428288, + "step": 1397 + }, + { + "epoch": 0.15304195517118696, + "grad_norm": 1.1463060906517382, + "learning_rate": 4.7165226155965936e-05, + "loss": 0.5287, + "num_input_tokens_seen": 252591136, + "step": 1398 + }, + { + "epoch": 0.15315142724212485, + "grad_norm": 1.4707910681137177, + "learning_rate": 4.7161247813458696e-05, + "loss": 0.9543, + "num_input_tokens_seen": 252768320, + "step": 1399 + }, + { + "epoch": 0.15326089931306275, + "grad_norm": 1.3121691695566209, + "learning_rate": 4.71572668493182e-05, + "loss": 0.8586, + "num_input_tokens_seen": 252968800, + "step": 1400 + }, + { + "epoch": 0.15337037138400067, + "grad_norm": 1.3521580789573022, + "learning_rate": 4.7153283264015394e-05, + "loss": 0.9379, + "num_input_tokens_seen": 253184288, + "step": 1401 + }, + { + "epoch": 0.15347984345493856, + "grad_norm": 1.3537419117609317, + "learning_rate": 4.714929705802153e-05, + "loss": 0.8419, + "num_input_tokens_seen": 253361696, + "step": 1402 + }, + { + "epoch": 0.15358931552587646, + "grad_norm": 1.273685735367256, + "learning_rate": 4.714530823180816e-05, + "loss": 0.7545, + "num_input_tokens_seen": 253536864, + "step": 1403 + }, + { + "epoch": 0.15369878759681435, + "grad_norm": 1.306186294827443, + "learning_rate": 4.7141316785847176e-05, + "loss": 0.7545, + "num_input_tokens_seen": 253716064, + "step": 1404 + }, + { + "epoch": 0.15380825966775227, + "grad_norm": 1.2576525568709414, + "learning_rate": 4.713732272061073e-05, + "loss": 0.6513, + "num_input_tokens_seen": 253875776, + "step": 1405 + }, + { + "epoch": 0.15391773173869017, + "grad_norm": 1.3336317006696436, + "learning_rate": 4.713332603657133e-05, + "loss": 0.6302, + "num_input_tokens_seen": 254041760, + "step": 1406 + }, + { + "epoch": 0.15402720380962806, + "grad_norm": 1.1624419946452886, + "learning_rate": 4.712932673420177e-05, + "loss": 0.6368, + "num_input_tokens_seen": 254229920, + "step": 1407 + }, + { + "epoch": 0.15413667588056598, + "grad_norm": 1.281965800119284, + "learning_rate": 4.7125324813975155e-05, + "loss": 0.5526, + "num_input_tokens_seen": 254380672, + "step": 1408 + }, + { + "epoch": 0.15424614795150388, + "grad_norm": 1.2632271929081917, + "learning_rate": 4.712132027636492e-05, + "loss": 0.7591, + "num_input_tokens_seen": 254564352, + "step": 1409 + }, + { + "epoch": 0.15435562002244177, + "grad_norm": 1.3673654059054248, + "learning_rate": 4.711731312184479e-05, + "loss": 0.7529, + "num_input_tokens_seen": 254723840, + "step": 1410 + }, + { + "epoch": 0.15446509209337966, + "grad_norm": 1.4013979812056585, + "learning_rate": 4.711330335088879e-05, + "loss": 0.9235, + "num_input_tokens_seen": 254932608, + "step": 1411 + }, + { + "epoch": 0.15457456416431758, + "grad_norm": 1.2709600771044636, + "learning_rate": 4.710929096397127e-05, + "loss": 0.6455, + "num_input_tokens_seen": 255107552, + "step": 1412 + }, + { + "epoch": 0.15468403623525548, + "grad_norm": 1.214909297241939, + "learning_rate": 4.710527596156691e-05, + "loss": 0.5425, + "num_input_tokens_seen": 255292352, + "step": 1413 + }, + { + "epoch": 0.15479350830619337, + "grad_norm": 1.2623270180669062, + "learning_rate": 4.710125834415065e-05, + "loss": 0.8238, + "num_input_tokens_seen": 255472224, + "step": 1414 + }, + { + "epoch": 0.1549029803771313, + "grad_norm": 1.3783059691204052, + "learning_rate": 4.709723811219779e-05, + "loss": 0.7698, + "num_input_tokens_seen": 255626336, + "step": 1415 + }, + { + "epoch": 0.1550124524480692, + "grad_norm": 1.2627427991300368, + "learning_rate": 4.70932152661839e-05, + "loss": 0.8038, + "num_input_tokens_seen": 255816288, + "step": 1416 + }, + { + "epoch": 0.15512192451900708, + "grad_norm": 1.341703231186909, + "learning_rate": 4.7089189806584874e-05, + "loss": 0.7319, + "num_input_tokens_seen": 255962112, + "step": 1417 + }, + { + "epoch": 0.155231396589945, + "grad_norm": 1.2213876498688434, + "learning_rate": 4.708516173387692e-05, + "loss": 0.922, + "num_input_tokens_seen": 256176256, + "step": 1418 + }, + { + "epoch": 0.1553408686608829, + "grad_norm": 1.1525062058585809, + "learning_rate": 4.7081131048536564e-05, + "loss": 0.5817, + "num_input_tokens_seen": 256363968, + "step": 1419 + }, + { + "epoch": 0.1554503407318208, + "grad_norm": 1.384329271099457, + "learning_rate": 4.70770977510406e-05, + "loss": 0.7691, + "num_input_tokens_seen": 256512480, + "step": 1420 + }, + { + "epoch": 0.15555981280275868, + "grad_norm": 1.1925848962246102, + "learning_rate": 4.70730618418662e-05, + "loss": 0.6224, + "num_input_tokens_seen": 256665696, + "step": 1421 + }, + { + "epoch": 0.1556692848736966, + "grad_norm": 1.119304278859051, + "learning_rate": 4.7069023321490754e-05, + "loss": 0.5609, + "num_input_tokens_seen": 256837504, + "step": 1422 + }, + { + "epoch": 0.1557787569446345, + "grad_norm": 1.3845616716375435, + "learning_rate": 4.706498219039206e-05, + "loss": 0.7151, + "num_input_tokens_seen": 257006848, + "step": 1423 + }, + { + "epoch": 0.1558882290155724, + "grad_norm": 1.1412626600451172, + "learning_rate": 4.706093844904814e-05, + "loss": 0.6813, + "num_input_tokens_seen": 257175520, + "step": 1424 + }, + { + "epoch": 0.15599770108651032, + "grad_norm": 1.2638653425787705, + "learning_rate": 4.7056892097937376e-05, + "loss": 0.6728, + "num_input_tokens_seen": 257380256, + "step": 1425 + }, + { + "epoch": 0.1561071731574482, + "grad_norm": 1.2169692895504736, + "learning_rate": 4.705284313753845e-05, + "loss": 0.6883, + "num_input_tokens_seen": 257591712, + "step": 1426 + }, + { + "epoch": 0.1562166452283861, + "grad_norm": 1.3117324310370848, + "learning_rate": 4.7048791568330333e-05, + "loss": 0.7034, + "num_input_tokens_seen": 257786144, + "step": 1427 + }, + { + "epoch": 0.156326117299324, + "grad_norm": 1.3045151648087747, + "learning_rate": 4.7044737390792326e-05, + "loss": 0.7254, + "num_input_tokens_seen": 257976544, + "step": 1428 + }, + { + "epoch": 0.15643558937026192, + "grad_norm": 1.2351223095857897, + "learning_rate": 4.704068060540402e-05, + "loss": 0.8161, + "num_input_tokens_seen": 258157984, + "step": 1429 + }, + { + "epoch": 0.1565450614411998, + "grad_norm": 1.2807819162711187, + "learning_rate": 4.703662121264535e-05, + "loss": 0.905, + "num_input_tokens_seen": 258364736, + "step": 1430 + }, + { + "epoch": 0.1566545335121377, + "grad_norm": 1.300213343280532, + "learning_rate": 4.70325592129965e-05, + "loss": 0.701, + "num_input_tokens_seen": 258581344, + "step": 1431 + }, + { + "epoch": 0.15676400558307563, + "grad_norm": 1.3520971574747662, + "learning_rate": 4.7028494606938025e-05, + "loss": 0.9636, + "num_input_tokens_seen": 258758080, + "step": 1432 + }, + { + "epoch": 0.15687347765401352, + "grad_norm": 1.332905457308302, + "learning_rate": 4.7024427394950745e-05, + "loss": 0.7459, + "num_input_tokens_seen": 258933696, + "step": 1433 + }, + { + "epoch": 0.15698294972495141, + "grad_norm": 1.3503088015130704, + "learning_rate": 4.702035757751581e-05, + "loss": 0.7365, + "num_input_tokens_seen": 259109984, + "step": 1434 + }, + { + "epoch": 0.15709242179588934, + "grad_norm": 1.3061527088200424, + "learning_rate": 4.701628515511467e-05, + "loss": 0.7074, + "num_input_tokens_seen": 259321216, + "step": 1435 + }, + { + "epoch": 0.15720189386682723, + "grad_norm": 1.2976687656375487, + "learning_rate": 4.701221012822908e-05, + "loss": 0.6889, + "num_input_tokens_seen": 259504672, + "step": 1436 + }, + { + "epoch": 0.15731136593776512, + "grad_norm": 1.358148262467214, + "learning_rate": 4.7008132497341116e-05, + "loss": 0.9133, + "num_input_tokens_seen": 259678496, + "step": 1437 + }, + { + "epoch": 0.15742083800870302, + "grad_norm": 1.1908834342125327, + "learning_rate": 4.700405226293314e-05, + "loss": 0.7161, + "num_input_tokens_seen": 259845376, + "step": 1438 + }, + { + "epoch": 0.15753031007964094, + "grad_norm": 1.3199925232444325, + "learning_rate": 4.6999969425487864e-05, + "loss": 0.6339, + "num_input_tokens_seen": 259999936, + "step": 1439 + }, + { + "epoch": 0.15763978215057883, + "grad_norm": 1.232860746998816, + "learning_rate": 4.699588398548825e-05, + "loss": 0.6262, + "num_input_tokens_seen": 260186304, + "step": 1440 + }, + { + "epoch": 0.15774925422151673, + "grad_norm": 1.2629791672378572, + "learning_rate": 4.699179594341761e-05, + "loss": 0.7425, + "num_input_tokens_seen": 260391488, + "step": 1441 + }, + { + "epoch": 0.15785872629245465, + "grad_norm": 1.2355075903504593, + "learning_rate": 4.698770529975956e-05, + "loss": 0.6812, + "num_input_tokens_seen": 260565536, + "step": 1442 + }, + { + "epoch": 0.15796819836339254, + "grad_norm": 1.3098123370861867, + "learning_rate": 4.698361205499799e-05, + "loss": 0.7717, + "num_input_tokens_seen": 260740928, + "step": 1443 + }, + { + "epoch": 0.15807767043433044, + "grad_norm": 1.2037535672714283, + "learning_rate": 4.6979516209617144e-05, + "loss": 0.7051, + "num_input_tokens_seen": 260930208, + "step": 1444 + }, + { + "epoch": 0.15818714250526833, + "grad_norm": 1.1275848786405476, + "learning_rate": 4.697541776410156e-05, + "loss": 0.5688, + "num_input_tokens_seen": 261116800, + "step": 1445 + }, + { + "epoch": 0.15829661457620625, + "grad_norm": 1.2158137922610055, + "learning_rate": 4.697131671893605e-05, + "loss": 0.5024, + "num_input_tokens_seen": 261270464, + "step": 1446 + }, + { + "epoch": 0.15840608664714415, + "grad_norm": 1.2986046161364526, + "learning_rate": 4.696721307460579e-05, + "loss": 0.7838, + "num_input_tokens_seen": 261453920, + "step": 1447 + }, + { + "epoch": 0.15851555871808204, + "grad_norm": 1.3654833390634793, + "learning_rate": 4.6963106831596206e-05, + "loss": 0.8274, + "num_input_tokens_seen": 261604000, + "step": 1448 + }, + { + "epoch": 0.15862503078901996, + "grad_norm": 1.3412000725404543, + "learning_rate": 4.695899799039307e-05, + "loss": 0.6234, + "num_input_tokens_seen": 261785888, + "step": 1449 + }, + { + "epoch": 0.15873450285995785, + "grad_norm": 1.357343493057917, + "learning_rate": 4.695488655148245e-05, + "loss": 0.8017, + "num_input_tokens_seen": 261957920, + "step": 1450 + }, + { + "epoch": 0.15884397493089575, + "grad_norm": 1.2539649545588016, + "learning_rate": 4.695077251535073e-05, + "loss": 0.7015, + "num_input_tokens_seen": 262151232, + "step": 1451 + }, + { + "epoch": 0.15895344700183367, + "grad_norm": 1.3909298851886331, + "learning_rate": 4.6946655882484575e-05, + "loss": 0.7989, + "num_input_tokens_seen": 262328640, + "step": 1452 + }, + { + "epoch": 0.15906291907277156, + "grad_norm": 1.3859090670966627, + "learning_rate": 4.694253665337099e-05, + "loss": 0.8251, + "num_input_tokens_seen": 262537184, + "step": 1453 + }, + { + "epoch": 0.15917239114370946, + "grad_norm": 1.5101924949121033, + "learning_rate": 4.693841482849726e-05, + "loss": 0.8237, + "num_input_tokens_seen": 262702720, + "step": 1454 + }, + { + "epoch": 0.15928186321464735, + "grad_norm": 1.29846544293061, + "learning_rate": 4.6934290408351e-05, + "loss": 0.8038, + "num_input_tokens_seen": 262893344, + "step": 1455 + }, + { + "epoch": 0.15939133528558527, + "grad_norm": 1.2067877761558066, + "learning_rate": 4.693016339342011e-05, + "loss": 0.5497, + "num_input_tokens_seen": 263058208, + "step": 1456 + }, + { + "epoch": 0.15950080735652317, + "grad_norm": 1.2761723925003017, + "learning_rate": 4.692603378419282e-05, + "loss": 0.5789, + "num_input_tokens_seen": 263219712, + "step": 1457 + }, + { + "epoch": 0.15961027942746106, + "grad_norm": 1.2239104836491879, + "learning_rate": 4.692190158115765e-05, + "loss": 0.7238, + "num_input_tokens_seen": 263372256, + "step": 1458 + }, + { + "epoch": 0.15971975149839898, + "grad_norm": 1.2584622049760372, + "learning_rate": 4.691776678480343e-05, + "loss": 0.7467, + "num_input_tokens_seen": 263571168, + "step": 1459 + }, + { + "epoch": 0.15982922356933688, + "grad_norm": 1.2627952400430766, + "learning_rate": 4.69136293956193e-05, + "loss": 0.7387, + "num_input_tokens_seen": 263740288, + "step": 1460 + }, + { + "epoch": 0.15993869564027477, + "grad_norm": 1.1856550139075401, + "learning_rate": 4.6909489414094694e-05, + "loss": 0.8547, + "num_input_tokens_seen": 263924640, + "step": 1461 + }, + { + "epoch": 0.16004816771121266, + "grad_norm": 1.1883600032073194, + "learning_rate": 4.6905346840719386e-05, + "loss": 0.6846, + "num_input_tokens_seen": 264108544, + "step": 1462 + }, + { + "epoch": 0.16015763978215058, + "grad_norm": 1.2459326852246653, + "learning_rate": 4.690120167598341e-05, + "loss": 0.6742, + "num_input_tokens_seen": 264300064, + "step": 1463 + }, + { + "epoch": 0.16026711185308848, + "grad_norm": 1.2385899011680386, + "learning_rate": 4.689705392037716e-05, + "loss": 0.7547, + "num_input_tokens_seen": 264469408, + "step": 1464 + }, + { + "epoch": 0.16037658392402637, + "grad_norm": 1.2310889240378007, + "learning_rate": 4.689290357439128e-05, + "loss": 0.5629, + "num_input_tokens_seen": 264658240, + "step": 1465 + }, + { + "epoch": 0.1604860559949643, + "grad_norm": 1.3447199580986828, + "learning_rate": 4.688875063851676e-05, + "loss": 0.7411, + "num_input_tokens_seen": 264799584, + "step": 1466 + }, + { + "epoch": 0.1605955280659022, + "grad_norm": 1.2983536161830642, + "learning_rate": 4.688459511324489e-05, + "loss": 0.7588, + "num_input_tokens_seen": 264943616, + "step": 1467 + }, + { + "epoch": 0.16070500013684008, + "grad_norm": 1.2472380642544434, + "learning_rate": 4.688043699906725e-05, + "loss": 0.6871, + "num_input_tokens_seen": 265153280, + "step": 1468 + }, + { + "epoch": 0.160814472207778, + "grad_norm": 1.3472038947275635, + "learning_rate": 4.687627629647573e-05, + "loss": 0.7719, + "num_input_tokens_seen": 265312320, + "step": 1469 + }, + { + "epoch": 0.1609239442787159, + "grad_norm": 1.3299923055469254, + "learning_rate": 4.687211300596256e-05, + "loss": 0.9027, + "num_input_tokens_seen": 265487712, + "step": 1470 + }, + { + "epoch": 0.1610334163496538, + "grad_norm": 1.4132678080310175, + "learning_rate": 4.686794712802023e-05, + "loss": 0.865, + "num_input_tokens_seen": 265636672, + "step": 1471 + }, + { + "epoch": 0.16114288842059168, + "grad_norm": 1.31331817919965, + "learning_rate": 4.6863778663141556e-05, + "loss": 0.6975, + "num_input_tokens_seen": 265793696, + "step": 1472 + }, + { + "epoch": 0.1612523604915296, + "grad_norm": 1.1240078471782073, + "learning_rate": 4.6859607611819664e-05, + "loss": 0.6301, + "num_input_tokens_seen": 265956320, + "step": 1473 + }, + { + "epoch": 0.1613618325624675, + "grad_norm": 1.2305431040747914, + "learning_rate": 4.685543397454799e-05, + "loss": 0.6863, + "num_input_tokens_seen": 266108864, + "step": 1474 + }, + { + "epoch": 0.1614713046334054, + "grad_norm": 1.1898055533216199, + "learning_rate": 4.685125775182024e-05, + "loss": 0.79, + "num_input_tokens_seen": 266278880, + "step": 1475 + }, + { + "epoch": 0.16158077670434332, + "grad_norm": 1.2888683750906125, + "learning_rate": 4.684707894413048e-05, + "loss": 0.7672, + "num_input_tokens_seen": 266479808, + "step": 1476 + }, + { + "epoch": 0.1616902487752812, + "grad_norm": 1.1406058740971943, + "learning_rate": 4.684289755197305e-05, + "loss": 0.6446, + "num_input_tokens_seen": 266652960, + "step": 1477 + }, + { + "epoch": 0.1617997208462191, + "grad_norm": 1.19959752565455, + "learning_rate": 4.683871357584259e-05, + "loss": 0.6489, + "num_input_tokens_seen": 266805728, + "step": 1478 + }, + { + "epoch": 0.161909192917157, + "grad_norm": 1.276795389136854, + "learning_rate": 4.6834527016234065e-05, + "loss": 0.644, + "num_input_tokens_seen": 266987168, + "step": 1479 + }, + { + "epoch": 0.16201866498809492, + "grad_norm": 1.2392268857110484, + "learning_rate": 4.6830337873642724e-05, + "loss": 0.6839, + "num_input_tokens_seen": 267159872, + "step": 1480 + }, + { + "epoch": 0.1621281370590328, + "grad_norm": 1.379482937570751, + "learning_rate": 4.682614614856416e-05, + "loss": 0.7335, + "num_input_tokens_seen": 267318912, + "step": 1481 + }, + { + "epoch": 0.1622376091299707, + "grad_norm": 1.3085712943356635, + "learning_rate": 4.6821951841494225e-05, + "loss": 0.7909, + "num_input_tokens_seen": 267514464, + "step": 1482 + }, + { + "epoch": 0.16234708120090863, + "grad_norm": 1.2715747525723349, + "learning_rate": 4.6817754952929106e-05, + "loss": 0.6752, + "num_input_tokens_seen": 267699936, + "step": 1483 + }, + { + "epoch": 0.16245655327184652, + "grad_norm": 1.245119006585681, + "learning_rate": 4.681355548336528e-05, + "loss": 0.8886, + "num_input_tokens_seen": 267889216, + "step": 1484 + }, + { + "epoch": 0.16256602534278441, + "grad_norm": 1.2562204556763046, + "learning_rate": 4.680935343329954e-05, + "loss": 0.6516, + "num_input_tokens_seen": 268088576, + "step": 1485 + }, + { + "epoch": 0.16267549741372234, + "grad_norm": 1.4681826368169115, + "learning_rate": 4.680514880322898e-05, + "loss": 0.7621, + "num_input_tokens_seen": 268278752, + "step": 1486 + }, + { + "epoch": 0.16278496948466023, + "grad_norm": 1.1888402101963236, + "learning_rate": 4.680094159365101e-05, + "loss": 0.6528, + "num_input_tokens_seen": 268470048, + "step": 1487 + }, + { + "epoch": 0.16289444155559812, + "grad_norm": 1.2624755101614147, + "learning_rate": 4.679673180506332e-05, + "loss": 0.7321, + "num_input_tokens_seen": 268643872, + "step": 1488 + }, + { + "epoch": 0.16300391362653602, + "grad_norm": 1.322320798908892, + "learning_rate": 4.679251943796393e-05, + "loss": 0.8103, + "num_input_tokens_seen": 268818816, + "step": 1489 + }, + { + "epoch": 0.16311338569747394, + "grad_norm": 1.1390723543303376, + "learning_rate": 4.678830449285114e-05, + "loss": 0.6323, + "num_input_tokens_seen": 268993536, + "step": 1490 + }, + { + "epoch": 0.16322285776841183, + "grad_norm": 1.197050987034131, + "learning_rate": 4.6784086970223596e-05, + "loss": 0.6342, + "num_input_tokens_seen": 269173856, + "step": 1491 + }, + { + "epoch": 0.16333232983934973, + "grad_norm": 1.3724053784882375, + "learning_rate": 4.677986687058019e-05, + "loss": 0.921, + "num_input_tokens_seen": 269360672, + "step": 1492 + }, + { + "epoch": 0.16344180191028765, + "grad_norm": 1.186605317230116, + "learning_rate": 4.6775644194420184e-05, + "loss": 0.6604, + "num_input_tokens_seen": 269553312, + "step": 1493 + }, + { + "epoch": 0.16355127398122554, + "grad_norm": 1.2238165374217027, + "learning_rate": 4.6771418942243096e-05, + "loss": 0.6726, + "num_input_tokens_seen": 269760288, + "step": 1494 + }, + { + "epoch": 0.16366074605216344, + "grad_norm": 1.217967442382737, + "learning_rate": 4.6767191114548755e-05, + "loss": 0.9044, + "num_input_tokens_seen": 269953152, + "step": 1495 + }, + { + "epoch": 0.16377021812310136, + "grad_norm": 1.1333488510041865, + "learning_rate": 4.676296071183733e-05, + "loss": 0.5553, + "num_input_tokens_seen": 270132576, + "step": 1496 + }, + { + "epoch": 0.16387969019403925, + "grad_norm": 1.1436302622670804, + "learning_rate": 4.6758727734609256e-05, + "loss": 0.5936, + "num_input_tokens_seen": 270307968, + "step": 1497 + }, + { + "epoch": 0.16398916226497715, + "grad_norm": 1.0697237051994772, + "learning_rate": 4.675449218336528e-05, + "loss": 0.5061, + "num_input_tokens_seen": 270504192, + "step": 1498 + }, + { + "epoch": 0.16409863433591504, + "grad_norm": 1.114854898376968, + "learning_rate": 4.6750254058606467e-05, + "loss": 0.507, + "num_input_tokens_seen": 270676224, + "step": 1499 + }, + { + "epoch": 0.16420810640685296, + "grad_norm": 1.2841752316937907, + "learning_rate": 4.6746013360834184e-05, + "loss": 0.7169, + "num_input_tokens_seen": 270822720, + "step": 1500 + }, + { + "epoch": 0.16431757847779085, + "grad_norm": 1.353951564235214, + "learning_rate": 4.6741770090550084e-05, + "loss": 0.7463, + "num_input_tokens_seen": 271001248, + "step": 1501 + }, + { + "epoch": 0.16442705054872875, + "grad_norm": 1.3256381819167207, + "learning_rate": 4.673752424825615e-05, + "loss": 0.7822, + "num_input_tokens_seen": 271184032, + "step": 1502 + }, + { + "epoch": 0.16453652261966667, + "grad_norm": 1.428835672394795, + "learning_rate": 4.673327583445465e-05, + "loss": 0.8156, + "num_input_tokens_seen": 271376000, + "step": 1503 + }, + { + "epoch": 0.16464599469060456, + "grad_norm": 1.3195648333633168, + "learning_rate": 4.672902484964817e-05, + "loss": 0.6269, + "num_input_tokens_seen": 271561696, + "step": 1504 + }, + { + "epoch": 0.16475546676154246, + "grad_norm": 1.42735274990107, + "learning_rate": 4.672477129433959e-05, + "loss": 0.8395, + "num_input_tokens_seen": 271745600, + "step": 1505 + }, + { + "epoch": 0.16486493883248035, + "grad_norm": 1.4922037173683116, + "learning_rate": 4.672051516903209e-05, + "loss": 0.7432, + "num_input_tokens_seen": 271912704, + "step": 1506 + }, + { + "epoch": 0.16497441090341827, + "grad_norm": 1.3512579955253516, + "learning_rate": 4.671625647422917e-05, + "loss": 0.7383, + "num_input_tokens_seen": 272101088, + "step": 1507 + }, + { + "epoch": 0.16508388297435617, + "grad_norm": 1.2987696125359718, + "learning_rate": 4.6711995210434625e-05, + "loss": 0.7614, + "num_input_tokens_seen": 272298432, + "step": 1508 + }, + { + "epoch": 0.16519335504529406, + "grad_norm": 1.3327150798939118, + "learning_rate": 4.670773137815255e-05, + "loss": 0.9494, + "num_input_tokens_seen": 272484352, + "step": 1509 + }, + { + "epoch": 0.16530282711623198, + "grad_norm": 1.2469393453033224, + "learning_rate": 4.670346497788736e-05, + "loss": 0.7336, + "num_input_tokens_seen": 272658400, + "step": 1510 + }, + { + "epoch": 0.16541229918716988, + "grad_norm": 1.3755374204980502, + "learning_rate": 4.669919601014374e-05, + "loss": 0.8216, + "num_input_tokens_seen": 272849920, + "step": 1511 + }, + { + "epoch": 0.16552177125810777, + "grad_norm": 1.2335905165626906, + "learning_rate": 4.669492447542673e-05, + "loss": 0.6921, + "num_input_tokens_seen": 273032704, + "step": 1512 + }, + { + "epoch": 0.1656312433290457, + "grad_norm": 1.1602877661146447, + "learning_rate": 4.669065037424161e-05, + "loss": 0.5839, + "num_input_tokens_seen": 273241920, + "step": 1513 + }, + { + "epoch": 0.16574071539998358, + "grad_norm": 1.1857333343029577, + "learning_rate": 4.668637370709403e-05, + "loss": 0.7662, + "num_input_tokens_seen": 273432320, + "step": 1514 + }, + { + "epoch": 0.16585018747092148, + "grad_norm": 1.3862314576954688, + "learning_rate": 4.668209447448989e-05, + "loss": 0.6626, + "num_input_tokens_seen": 273617344, + "step": 1515 + }, + { + "epoch": 0.16595965954185937, + "grad_norm": 1.4069815957872704, + "learning_rate": 4.667781267693543e-05, + "loss": 0.8782, + "num_input_tokens_seen": 273786464, + "step": 1516 + }, + { + "epoch": 0.1660691316127973, + "grad_norm": 1.2478583108358574, + "learning_rate": 4.6673528314937166e-05, + "loss": 0.6219, + "num_input_tokens_seen": 273967456, + "step": 1517 + }, + { + "epoch": 0.1661786036837352, + "grad_norm": 1.3163098942387763, + "learning_rate": 4.666924138900194e-05, + "loss": 0.7154, + "num_input_tokens_seen": 274119776, + "step": 1518 + }, + { + "epoch": 0.16628807575467308, + "grad_norm": 1.1875665545635514, + "learning_rate": 4.666495189963688e-05, + "loss": 0.7352, + "num_input_tokens_seen": 274295840, + "step": 1519 + }, + { + "epoch": 0.166397547825611, + "grad_norm": 1.1690057342134128, + "learning_rate": 4.666065984734942e-05, + "loss": 0.7133, + "num_input_tokens_seen": 274492288, + "step": 1520 + }, + { + "epoch": 0.1665070198965489, + "grad_norm": 1.1710212649459562, + "learning_rate": 4.6656365232647316e-05, + "loss": 0.6485, + "num_input_tokens_seen": 274670592, + "step": 1521 + }, + { + "epoch": 0.1666164919674868, + "grad_norm": 1.2328152519939926, + "learning_rate": 4.66520680560386e-05, + "loss": 0.6739, + "num_input_tokens_seen": 274866592, + "step": 1522 + }, + { + "epoch": 0.16672596403842468, + "grad_norm": 1.2287521255955982, + "learning_rate": 4.664776831803163e-05, + "loss": 0.6798, + "num_input_tokens_seen": 275029440, + "step": 1523 + }, + { + "epoch": 0.1668354361093626, + "grad_norm": 1.2309215862856018, + "learning_rate": 4.664346601913504e-05, + "loss": 0.6982, + "num_input_tokens_seen": 275215136, + "step": 1524 + }, + { + "epoch": 0.1669449081803005, + "grad_norm": 1.225426360300258, + "learning_rate": 4.663916115985781e-05, + "loss": 0.7057, + "num_input_tokens_seen": 275398816, + "step": 1525 + }, + { + "epoch": 0.1670543802512384, + "grad_norm": 1.287073670327634, + "learning_rate": 4.663485374070917e-05, + "loss": 0.7479, + "num_input_tokens_seen": 275601984, + "step": 1526 + }, + { + "epoch": 0.16716385232217632, + "grad_norm": 1.2138490683199934, + "learning_rate": 4.66305437621987e-05, + "loss": 0.6276, + "num_input_tokens_seen": 275782976, + "step": 1527 + }, + { + "epoch": 0.1672733243931142, + "grad_norm": 1.3061376952303583, + "learning_rate": 4.6626231224836245e-05, + "loss": 0.8127, + "num_input_tokens_seen": 275975616, + "step": 1528 + }, + { + "epoch": 0.1673827964640521, + "grad_norm": 1.350174383980448, + "learning_rate": 4.662191612913199e-05, + "loss": 0.827, + "num_input_tokens_seen": 276165344, + "step": 1529 + }, + { + "epoch": 0.16749226853499002, + "grad_norm": 1.2433251982943498, + "learning_rate": 4.661759847559638e-05, + "loss": 0.5903, + "num_input_tokens_seen": 276365824, + "step": 1530 + }, + { + "epoch": 0.16760174060592792, + "grad_norm": 1.5395954584135494, + "learning_rate": 4.66132782647402e-05, + "loss": 0.8826, + "num_input_tokens_seen": 276572800, + "step": 1531 + }, + { + "epoch": 0.1677112126768658, + "grad_norm": 1.1903930858715057, + "learning_rate": 4.6608955497074526e-05, + "loss": 0.6796, + "num_input_tokens_seen": 276726688, + "step": 1532 + }, + { + "epoch": 0.1678206847478037, + "grad_norm": 1.252008731458156, + "learning_rate": 4.660463017311072e-05, + "loss": 0.8858, + "num_input_tokens_seen": 276923584, + "step": 1533 + }, + { + "epoch": 0.16793015681874163, + "grad_norm": 1.060680458268902, + "learning_rate": 4.660030229336046e-05, + "loss": 0.5721, + "num_input_tokens_seen": 277092480, + "step": 1534 + }, + { + "epoch": 0.16803962888967952, + "grad_norm": 1.1328212343076534, + "learning_rate": 4.659597185833574e-05, + "loss": 0.6596, + "num_input_tokens_seen": 277287136, + "step": 1535 + }, + { + "epoch": 0.16814910096061741, + "grad_norm": 1.1800938089711797, + "learning_rate": 4.6591638868548824e-05, + "loss": 0.6495, + "num_input_tokens_seen": 277486720, + "step": 1536 + }, + { + "epoch": 0.16825857303155534, + "grad_norm": 1.2334823277834595, + "learning_rate": 4.6587303324512324e-05, + "loss": 0.9191, + "num_input_tokens_seen": 277689664, + "step": 1537 + }, + { + "epoch": 0.16836804510249323, + "grad_norm": 1.2408433275266493, + "learning_rate": 4.6582965226739094e-05, + "loss": 0.6179, + "num_input_tokens_seen": 277872896, + "step": 1538 + }, + { + "epoch": 0.16847751717343112, + "grad_norm": 1.0328698008190726, + "learning_rate": 4.6578624575742335e-05, + "loss": 0.5461, + "num_input_tokens_seen": 278048960, + "step": 1539 + }, + { + "epoch": 0.16858698924436902, + "grad_norm": 1.1906468206693912, + "learning_rate": 4.6574281372035546e-05, + "loss": 0.542, + "num_input_tokens_seen": 278248320, + "step": 1540 + }, + { + "epoch": 0.16869646131530694, + "grad_norm": 1.2080966235290234, + "learning_rate": 4.6569935616132516e-05, + "loss": 0.6508, + "num_input_tokens_seen": 278406912, + "step": 1541 + }, + { + "epoch": 0.16880593338624483, + "grad_norm": 1.4217814320389484, + "learning_rate": 4.6565587308547334e-05, + "loss": 0.9863, + "num_input_tokens_seen": 278599776, + "step": 1542 + }, + { + "epoch": 0.16891540545718273, + "grad_norm": 1.343300145278022, + "learning_rate": 4.65612364497944e-05, + "loss": 0.739, + "num_input_tokens_seen": 278774944, + "step": 1543 + }, + { + "epoch": 0.16902487752812065, + "grad_norm": 1.2744952062738897, + "learning_rate": 4.655688304038841e-05, + "loss": 0.7061, + "num_input_tokens_seen": 278942496, + "step": 1544 + }, + { + "epoch": 0.16913434959905854, + "grad_norm": 1.3543694515493097, + "learning_rate": 4.6552527080844374e-05, + "loss": 0.9761, + "num_input_tokens_seen": 279133344, + "step": 1545 + }, + { + "epoch": 0.16924382166999644, + "grad_norm": 1.2065483040655003, + "learning_rate": 4.6548168571677574e-05, + "loss": 0.584, + "num_input_tokens_seen": 279330464, + "step": 1546 + }, + { + "epoch": 0.16935329374093436, + "grad_norm": 1.2200309672881127, + "learning_rate": 4.6543807513403636e-05, + "loss": 0.6647, + "num_input_tokens_seen": 279518400, + "step": 1547 + }, + { + "epoch": 0.16946276581187225, + "grad_norm": 1.2063493994045023, + "learning_rate": 4.653944390653845e-05, + "loss": 0.6937, + "num_input_tokens_seen": 279676992, + "step": 1548 + }, + { + "epoch": 0.16957223788281015, + "grad_norm": 1.109813388654608, + "learning_rate": 4.6535077751598224e-05, + "loss": 0.6195, + "num_input_tokens_seen": 279858432, + "step": 1549 + }, + { + "epoch": 0.16968170995374804, + "grad_norm": 1.289694983477056, + "learning_rate": 4.653070904909947e-05, + "loss": 0.6947, + "num_input_tokens_seen": 280040096, + "step": 1550 + }, + { + "epoch": 0.16979118202468596, + "grad_norm": 1.2512120093066867, + "learning_rate": 4.6526337799559e-05, + "loss": 0.6698, + "num_input_tokens_seen": 280220416, + "step": 1551 + }, + { + "epoch": 0.16990065409562385, + "grad_norm": 1.1790705197948994, + "learning_rate": 4.652196400349391e-05, + "loss": 0.5624, + "num_input_tokens_seen": 280369376, + "step": 1552 + }, + { + "epoch": 0.17001012616656175, + "grad_norm": 1.2875361798582476, + "learning_rate": 4.651758766142162e-05, + "loss": 0.7419, + "num_input_tokens_seen": 280562240, + "step": 1553 + }, + { + "epoch": 0.17011959823749967, + "grad_norm": 1.3949515513412265, + "learning_rate": 4.6513208773859854e-05, + "loss": 0.7965, + "num_input_tokens_seen": 280753984, + "step": 1554 + }, + { + "epoch": 0.17022907030843756, + "grad_norm": 1.239691379657403, + "learning_rate": 4.650882734132661e-05, + "loss": 0.8029, + "num_input_tokens_seen": 280964320, + "step": 1555 + }, + { + "epoch": 0.17033854237937546, + "grad_norm": 1.2516004806641985, + "learning_rate": 4.650444336434021e-05, + "loss": 0.5798, + "num_input_tokens_seen": 281123808, + "step": 1556 + }, + { + "epoch": 0.17044801445031335, + "grad_norm": 1.2846428677677715, + "learning_rate": 4.650005684341928e-05, + "loss": 0.7068, + "num_input_tokens_seen": 281309056, + "step": 1557 + }, + { + "epoch": 0.17055748652125127, + "grad_norm": 1.3062979529673062, + "learning_rate": 4.6495667779082716e-05, + "loss": 0.711, + "num_input_tokens_seen": 281483328, + "step": 1558 + }, + { + "epoch": 0.17066695859218917, + "grad_norm": 1.3901749214735555, + "learning_rate": 4.649127617184975e-05, + "loss": 0.7761, + "num_input_tokens_seen": 281652000, + "step": 1559 + }, + { + "epoch": 0.17077643066312706, + "grad_norm": 1.3233961402888668, + "learning_rate": 4.6486882022239895e-05, + "loss": 0.733, + "num_input_tokens_seen": 281862560, + "step": 1560 + }, + { + "epoch": 0.17088590273406498, + "grad_norm": 1.3050668587493781, + "learning_rate": 4.648248533077297e-05, + "loss": 0.559, + "num_input_tokens_seen": 282020032, + "step": 1561 + }, + { + "epoch": 0.17099537480500288, + "grad_norm": 1.383801682979345, + "learning_rate": 4.6478086097969104e-05, + "loss": 0.9127, + "num_input_tokens_seen": 282205280, + "step": 1562 + }, + { + "epoch": 0.17110484687594077, + "grad_norm": 1.2825157147461632, + "learning_rate": 4.647368432434871e-05, + "loss": 0.7845, + "num_input_tokens_seen": 282407328, + "step": 1563 + }, + { + "epoch": 0.1712143189468787, + "grad_norm": 1.4722013302086405, + "learning_rate": 4.646928001043251e-05, + "loss": 0.6889, + "num_input_tokens_seen": 282550464, + "step": 1564 + }, + { + "epoch": 0.17132379101781658, + "grad_norm": 1.3254701446054593, + "learning_rate": 4.646487315674153e-05, + "loss": 0.7499, + "num_input_tokens_seen": 282727648, + "step": 1565 + }, + { + "epoch": 0.17143326308875448, + "grad_norm": 1.14861348159732, + "learning_rate": 4.646046376379708e-05, + "loss": 0.7225, + "num_input_tokens_seen": 282921856, + "step": 1566 + }, + { + "epoch": 0.17154273515969237, + "grad_norm": 1.2488494823492156, + "learning_rate": 4.64560518321208e-05, + "loss": 0.8488, + "num_input_tokens_seen": 283110464, + "step": 1567 + }, + { + "epoch": 0.1716522072306303, + "grad_norm": 1.214551540937966, + "learning_rate": 4.6451637362234604e-05, + "loss": 0.8527, + "num_input_tokens_seen": 283308256, + "step": 1568 + }, + { + "epoch": 0.1717616793015682, + "grad_norm": 1.161716478110826, + "learning_rate": 4.644722035466072e-05, + "loss": 0.5577, + "num_input_tokens_seen": 283481408, + "step": 1569 + }, + { + "epoch": 0.17187115137250608, + "grad_norm": 1.296953819361576, + "learning_rate": 4.644280080992166e-05, + "loss": 0.8266, + "num_input_tokens_seen": 283679872, + "step": 1570 + }, + { + "epoch": 0.171980623443444, + "grad_norm": 1.20145781300331, + "learning_rate": 4.643837872854027e-05, + "loss": 0.6238, + "num_input_tokens_seen": 283857056, + "step": 1571 + }, + { + "epoch": 0.1720900955143819, + "grad_norm": 1.2659760918310783, + "learning_rate": 4.643395411103965e-05, + "loss": 0.8533, + "num_input_tokens_seen": 284046784, + "step": 1572 + }, + { + "epoch": 0.1721995675853198, + "grad_norm": 1.2832206364892695, + "learning_rate": 4.642952695794323e-05, + "loss": 0.6206, + "num_input_tokens_seen": 284218592, + "step": 1573 + }, + { + "epoch": 0.17230903965625768, + "grad_norm": 1.2488144020374221, + "learning_rate": 4.6425097269774744e-05, + "loss": 0.7451, + "num_input_tokens_seen": 284411232, + "step": 1574 + }, + { + "epoch": 0.1724185117271956, + "grad_norm": 1.2556943414205635, + "learning_rate": 4.64206650470582e-05, + "loss": 0.7152, + "num_input_tokens_seen": 284583712, + "step": 1575 + }, + { + "epoch": 0.1725279837981335, + "grad_norm": 1.3827776284858688, + "learning_rate": 4.6416230290317934e-05, + "loss": 0.8709, + "num_input_tokens_seen": 284754176, + "step": 1576 + }, + { + "epoch": 0.1726374558690714, + "grad_norm": 1.2395400619434043, + "learning_rate": 4.641179300007857e-05, + "loss": 0.6065, + "num_input_tokens_seen": 284897984, + "step": 1577 + }, + { + "epoch": 0.17274692794000931, + "grad_norm": 1.3785152583393356, + "learning_rate": 4.640735317686502e-05, + "loss": 0.713, + "num_input_tokens_seen": 285065536, + "step": 1578 + }, + { + "epoch": 0.1728564000109472, + "grad_norm": 1.2886881891740243, + "learning_rate": 4.6402910821202525e-05, + "loss": 0.7582, + "num_input_tokens_seen": 285237568, + "step": 1579 + }, + { + "epoch": 0.1729658720818851, + "grad_norm": 1.2701818589040808, + "learning_rate": 4.6398465933616585e-05, + "loss": 0.8263, + "num_input_tokens_seen": 285436032, + "step": 1580 + }, + { + "epoch": 0.17307534415282302, + "grad_norm": 1.3159950134710434, + "learning_rate": 4.6394018514633033e-05, + "loss": 0.6285, + "num_input_tokens_seen": 285630016, + "step": 1581 + }, + { + "epoch": 0.17318481622376092, + "grad_norm": 1.4241910244350724, + "learning_rate": 4.6389568564777994e-05, + "loss": 0.7992, + "num_input_tokens_seen": 285788608, + "step": 1582 + }, + { + "epoch": 0.1732942882946988, + "grad_norm": 1.3345344269543515, + "learning_rate": 4.6385116084577874e-05, + "loss": 0.8261, + "num_input_tokens_seen": 285963104, + "step": 1583 + }, + { + "epoch": 0.1734037603656367, + "grad_norm": 1.4626524136670942, + "learning_rate": 4.638066107455941e-05, + "loss": 0.7439, + "num_input_tokens_seen": 286128416, + "step": 1584 + }, + { + "epoch": 0.17351323243657463, + "grad_norm": 1.3546521368107949, + "learning_rate": 4.637620353524962e-05, + "loss": 0.8627, + "num_input_tokens_seen": 286308288, + "step": 1585 + }, + { + "epoch": 0.17362270450751252, + "grad_norm": 1.3684858859877602, + "learning_rate": 4.637174346717581e-05, + "loss": 0.7389, + "num_input_tokens_seen": 286471136, + "step": 1586 + }, + { + "epoch": 0.17373217657845041, + "grad_norm": 1.2982083555668178, + "learning_rate": 4.63672808708656e-05, + "loss": 0.7723, + "num_input_tokens_seen": 286649664, + "step": 1587 + }, + { + "epoch": 0.17384164864938834, + "grad_norm": 1.0539612141409147, + "learning_rate": 4.636281574684691e-05, + "loss": 0.5475, + "num_input_tokens_seen": 286856416, + "step": 1588 + }, + { + "epoch": 0.17395112072032623, + "grad_norm": 1.0831770356231727, + "learning_rate": 4.635834809564796e-05, + "loss": 0.7104, + "num_input_tokens_seen": 287033152, + "step": 1589 + }, + { + "epoch": 0.17406059279126412, + "grad_norm": 1.1550257619651705, + "learning_rate": 4.635387791779726e-05, + "loss": 0.5432, + "num_input_tokens_seen": 287200032, + "step": 1590 + }, + { + "epoch": 0.17417006486220202, + "grad_norm": 1.2445372423888612, + "learning_rate": 4.634940521382362e-05, + "loss": 0.7116, + "num_input_tokens_seen": 287390880, + "step": 1591 + }, + { + "epoch": 0.17427953693313994, + "grad_norm": 1.1699452596884263, + "learning_rate": 4.634492998425616e-05, + "loss": 0.7421, + "num_input_tokens_seen": 287597856, + "step": 1592 + }, + { + "epoch": 0.17438900900407783, + "grad_norm": 1.255501608664858, + "learning_rate": 4.6340452229624286e-05, + "loss": 0.6394, + "num_input_tokens_seen": 287777056, + "step": 1593 + }, + { + "epoch": 0.17449848107501573, + "grad_norm": 1.215957523554606, + "learning_rate": 4.6335971950457715e-05, + "loss": 0.5943, + "num_input_tokens_seen": 287943264, + "step": 1594 + }, + { + "epoch": 0.17460795314595365, + "grad_norm": 1.231869968460158, + "learning_rate": 4.6331489147286444e-05, + "loss": 0.7475, + "num_input_tokens_seen": 288130752, + "step": 1595 + }, + { + "epoch": 0.17471742521689154, + "grad_norm": 1.821709812848131, + "learning_rate": 4.632700382064079e-05, + "loss": 0.9054, + "num_input_tokens_seen": 288286208, + "step": 1596 + }, + { + "epoch": 0.17482689728782944, + "grad_norm": 1.2452559569575719, + "learning_rate": 4.632251597105135e-05, + "loss": 0.7435, + "num_input_tokens_seen": 288461376, + "step": 1597 + }, + { + "epoch": 0.17493636935876736, + "grad_norm": 1.1001795752205836, + "learning_rate": 4.631802559904903e-05, + "loss": 0.7022, + "num_input_tokens_seen": 288645056, + "step": 1598 + }, + { + "epoch": 0.17504584142970525, + "grad_norm": 1.2069220632407736, + "learning_rate": 4.631353270516504e-05, + "loss": 0.6644, + "num_input_tokens_seen": 288820000, + "step": 1599 + }, + { + "epoch": 0.17515531350064314, + "grad_norm": 1.3015526211330481, + "learning_rate": 4.6309037289930875e-05, + "loss": 0.7047, + "num_input_tokens_seen": 288994272, + "step": 1600 + }, + { + "epoch": 0.17526478557158104, + "grad_norm": 1.169328491329954, + "learning_rate": 4.630453935387833e-05, + "loss": 0.7493, + "num_input_tokens_seen": 289202816, + "step": 1601 + }, + { + "epoch": 0.17537425764251896, + "grad_norm": 1.2433156103161311, + "learning_rate": 4.630003889753951e-05, + "loss": 0.6486, + "num_input_tokens_seen": 289386496, + "step": 1602 + }, + { + "epoch": 0.17548372971345685, + "grad_norm": 1.1595512379572457, + "learning_rate": 4.629553592144681e-05, + "loss": 0.6121, + "num_input_tokens_seen": 289581824, + "step": 1603 + }, + { + "epoch": 0.17559320178439475, + "grad_norm": 1.2315429358391232, + "learning_rate": 4.629103042613292e-05, + "loss": 0.6971, + "num_input_tokens_seen": 289776480, + "step": 1604 + }, + { + "epoch": 0.17570267385533267, + "grad_norm": 1.278931683547857, + "learning_rate": 4.628652241213083e-05, + "loss": 0.7046, + "num_input_tokens_seen": 289928352, + "step": 1605 + }, + { + "epoch": 0.17581214592627056, + "grad_norm": 1.260372802139238, + "learning_rate": 4.6282011879973833e-05, + "loss": 0.8522, + "num_input_tokens_seen": 290111136, + "step": 1606 + }, + { + "epoch": 0.17592161799720846, + "grad_norm": 1.3434535298319523, + "learning_rate": 4.627749883019551e-05, + "loss": 0.9065, + "num_input_tokens_seen": 290292128, + "step": 1607 + }, + { + "epoch": 0.17603109006814635, + "grad_norm": 1.2645514373695108, + "learning_rate": 4.627298326332975e-05, + "loss": 0.9542, + "num_input_tokens_seen": 290497760, + "step": 1608 + }, + { + "epoch": 0.17614056213908427, + "grad_norm": 1.1724388292049779, + "learning_rate": 4.626846517991075e-05, + "loss": 0.5236, + "num_input_tokens_seen": 290698688, + "step": 1609 + }, + { + "epoch": 0.17625003421002217, + "grad_norm": 1.24962381424332, + "learning_rate": 4.626394458047296e-05, + "loss": 0.5208, + "num_input_tokens_seen": 290864448, + "step": 1610 + }, + { + "epoch": 0.17635950628096006, + "grad_norm": 1.143927665627743, + "learning_rate": 4.625942146555119e-05, + "loss": 0.6694, + "num_input_tokens_seen": 291056416, + "step": 1611 + }, + { + "epoch": 0.17646897835189798, + "grad_norm": 1.438745953830097, + "learning_rate": 4.62548958356805e-05, + "loss": 0.7317, + "num_input_tokens_seen": 291248832, + "step": 1612 + }, + { + "epoch": 0.17657845042283588, + "grad_norm": 1.2866866440696276, + "learning_rate": 4.625036769139626e-05, + "loss": 0.6754, + "num_input_tokens_seen": 291439232, + "step": 1613 + }, + { + "epoch": 0.17668792249377377, + "grad_norm": 1.2544398613294057, + "learning_rate": 4.624583703323415e-05, + "loss": 0.8223, + "num_input_tokens_seen": 291600960, + "step": 1614 + }, + { + "epoch": 0.1767973945647117, + "grad_norm": 1.3081826546435158, + "learning_rate": 4.624130386173013e-05, + "loss": 0.7705, + "num_input_tokens_seen": 291793824, + "step": 1615 + }, + { + "epoch": 0.17690686663564958, + "grad_norm": 1.267527715917927, + "learning_rate": 4.623676817742047e-05, + "loss": 0.6863, + "num_input_tokens_seen": 291940768, + "step": 1616 + }, + { + "epoch": 0.17701633870658748, + "grad_norm": 1.2038586844145809, + "learning_rate": 4.623222998084174e-05, + "loss": 0.5966, + "num_input_tokens_seen": 292139456, + "step": 1617 + }, + { + "epoch": 0.17712581077752537, + "grad_norm": 1.2497784895132564, + "learning_rate": 4.6227689272530785e-05, + "loss": 0.7892, + "num_input_tokens_seen": 292348448, + "step": 1618 + }, + { + "epoch": 0.1772352828484633, + "grad_norm": 1.2248920801853898, + "learning_rate": 4.622314605302477e-05, + "loss": 0.6496, + "num_input_tokens_seen": 292542880, + "step": 1619 + }, + { + "epoch": 0.1773447549194012, + "grad_norm": 1.3860164529531207, + "learning_rate": 4.621860032286115e-05, + "loss": 0.7724, + "num_input_tokens_seen": 292732384, + "step": 1620 + }, + { + "epoch": 0.17745422699033908, + "grad_norm": 1.2500393384465465, + "learning_rate": 4.621405208257767e-05, + "loss": 0.667, + "num_input_tokens_seen": 292902848, + "step": 1621 + }, + { + "epoch": 0.177563699061277, + "grad_norm": 1.2375379672150537, + "learning_rate": 4.620950133271239e-05, + "loss": 0.6736, + "num_input_tokens_seen": 293098624, + "step": 1622 + }, + { + "epoch": 0.1776731711322149, + "grad_norm": 1.2655885008270256, + "learning_rate": 4.620494807380365e-05, + "loss": 0.7815, + "num_input_tokens_seen": 293305152, + "step": 1623 + }, + { + "epoch": 0.1777826432031528, + "grad_norm": 1.3023243134346283, + "learning_rate": 4.620039230639008e-05, + "loss": 0.6715, + "num_input_tokens_seen": 293465760, + "step": 1624 + }, + { + "epoch": 0.17789211527409068, + "grad_norm": 1.3745433309114565, + "learning_rate": 4.619583403101063e-05, + "loss": 0.8146, + "num_input_tokens_seen": 293627712, + "step": 1625 + }, + { + "epoch": 0.1780015873450286, + "grad_norm": 1.3175101702911047, + "learning_rate": 4.619127324820454e-05, + "loss": 0.9117, + "num_input_tokens_seen": 293799744, + "step": 1626 + }, + { + "epoch": 0.1781110594159665, + "grad_norm": 1.1979084622751768, + "learning_rate": 4.6186709958511334e-05, + "loss": 0.6862, + "num_input_tokens_seen": 293968640, + "step": 1627 + }, + { + "epoch": 0.1782205314869044, + "grad_norm": 1.2648896801003642, + "learning_rate": 4.618214416247084e-05, + "loss": 0.7929, + "num_input_tokens_seen": 294150752, + "step": 1628 + }, + { + "epoch": 0.17833000355784231, + "grad_norm": 1.4282733197477717, + "learning_rate": 4.617757586062319e-05, + "loss": 0.8336, + "num_input_tokens_seen": 294321664, + "step": 1629 + }, + { + "epoch": 0.1784394756287802, + "grad_norm": 1.2266350985867145, + "learning_rate": 4.61730050535088e-05, + "loss": 0.584, + "num_input_tokens_seen": 294498400, + "step": 1630 + }, + { + "epoch": 0.1785489476997181, + "grad_norm": 1.2207026367186524, + "learning_rate": 4.6168431741668386e-05, + "loss": 0.6597, + "num_input_tokens_seen": 294658112, + "step": 1631 + }, + { + "epoch": 0.17865841977065602, + "grad_norm": 1.221822387265037, + "learning_rate": 4.6163855925642955e-05, + "loss": 0.6758, + "num_input_tokens_seen": 294864864, + "step": 1632 + }, + { + "epoch": 0.17876789184159392, + "grad_norm": 1.3927278371323537, + "learning_rate": 4.6159277605973836e-05, + "loss": 0.8215, + "num_input_tokens_seen": 295054592, + "step": 1633 + }, + { + "epoch": 0.1788773639125318, + "grad_norm": 1.146294339236582, + "learning_rate": 4.615469678320262e-05, + "loss": 0.5943, + "num_input_tokens_seen": 295243872, + "step": 1634 + }, + { + "epoch": 0.1789868359834697, + "grad_norm": 1.3220612721606768, + "learning_rate": 4.615011345787122e-05, + "loss": 0.9439, + "num_input_tokens_seen": 295434944, + "step": 1635 + }, + { + "epoch": 0.17909630805440763, + "grad_norm": 1.3369819618297922, + "learning_rate": 4.6145527630521834e-05, + "loss": 0.7777, + "num_input_tokens_seen": 295609888, + "step": 1636 + }, + { + "epoch": 0.17920578012534552, + "grad_norm": 1.3558044018010096, + "learning_rate": 4.614093930169695e-05, + "loss": 0.7905, + "num_input_tokens_seen": 295775424, + "step": 1637 + }, + { + "epoch": 0.17931525219628341, + "grad_norm": 1.1934157637589229, + "learning_rate": 4.613634847193936e-05, + "loss": 0.6148, + "num_input_tokens_seen": 295973888, + "step": 1638 + }, + { + "epoch": 0.17942472426722134, + "grad_norm": 1.298461678804285, + "learning_rate": 4.613175514179215e-05, + "loss": 0.7418, + "num_input_tokens_seen": 296153760, + "step": 1639 + }, + { + "epoch": 0.17953419633815923, + "grad_norm": 1.2787911616736176, + "learning_rate": 4.6127159311798705e-05, + "loss": 0.7874, + "num_input_tokens_seen": 296353568, + "step": 1640 + }, + { + "epoch": 0.17964366840909712, + "grad_norm": 1.2289973104585623, + "learning_rate": 4.61225609825027e-05, + "loss": 0.6322, + "num_input_tokens_seen": 296537248, + "step": 1641 + }, + { + "epoch": 0.17975314048003502, + "grad_norm": 1.310124927708191, + "learning_rate": 4.6117960154448115e-05, + "loss": 0.7468, + "num_input_tokens_seen": 296731008, + "step": 1642 + }, + { + "epoch": 0.17986261255097294, + "grad_norm": 1.293272737569748, + "learning_rate": 4.611335682817921e-05, + "loss": 0.7362, + "num_input_tokens_seen": 296926336, + "step": 1643 + }, + { + "epoch": 0.17997208462191083, + "grad_norm": 1.3975503334439434, + "learning_rate": 4.610875100424056e-05, + "loss": 0.7212, + "num_input_tokens_seen": 297065216, + "step": 1644 + }, + { + "epoch": 0.18008155669284873, + "grad_norm": 1.2382543422531187, + "learning_rate": 4.610414268317701e-05, + "loss": 0.5876, + "num_input_tokens_seen": 297235680, + "step": 1645 + }, + { + "epoch": 0.18019102876378665, + "grad_norm": 1.230188392307427, + "learning_rate": 4.609953186553373e-05, + "loss": 0.6713, + "num_input_tokens_seen": 297435040, + "step": 1646 + }, + { + "epoch": 0.18030050083472454, + "grad_norm": 1.3598720474919421, + "learning_rate": 4.609491855185616e-05, + "loss": 0.587, + "num_input_tokens_seen": 297590496, + "step": 1647 + }, + { + "epoch": 0.18040997290566244, + "grad_norm": 1.2904010643086121, + "learning_rate": 4.609030274269006e-05, + "loss": 0.6131, + "num_input_tokens_seen": 297746624, + "step": 1648 + }, + { + "epoch": 0.18051944497660036, + "grad_norm": 1.2594788690712373, + "learning_rate": 4.6085684438581464e-05, + "loss": 0.6679, + "num_input_tokens_seen": 297921120, + "step": 1649 + }, + { + "epoch": 0.18062891704753825, + "grad_norm": 1.2163276073634308, + "learning_rate": 4.60810636400767e-05, + "loss": 0.6286, + "num_input_tokens_seen": 298105696, + "step": 1650 + }, + { + "epoch": 0.18073838911847614, + "grad_norm": 1.357602685097111, + "learning_rate": 4.6076440347722415e-05, + "loss": 0.7899, + "num_input_tokens_seen": 298285792, + "step": 1651 + }, + { + "epoch": 0.18084786118941404, + "grad_norm": 1.369596658418826, + "learning_rate": 4.6071814562065524e-05, + "loss": 0.7693, + "num_input_tokens_seen": 298481568, + "step": 1652 + }, + { + "epoch": 0.18095733326035196, + "grad_norm": 1.309070329567595, + "learning_rate": 4.6067186283653255e-05, + "loss": 0.707, + "num_input_tokens_seen": 298643296, + "step": 1653 + }, + { + "epoch": 0.18106680533128985, + "grad_norm": 1.3382309084107156, + "learning_rate": 4.606255551303312e-05, + "loss": 0.7795, + "num_input_tokens_seen": 298839744, + "step": 1654 + }, + { + "epoch": 0.18117627740222775, + "grad_norm": 1.275982052195494, + "learning_rate": 4.6057922250752935e-05, + "loss": 0.7214, + "num_input_tokens_seen": 299018496, + "step": 1655 + }, + { + "epoch": 0.18128574947316567, + "grad_norm": 1.303994183006652, + "learning_rate": 4.60532864973608e-05, + "loss": 0.7098, + "num_input_tokens_seen": 299235776, + "step": 1656 + }, + { + "epoch": 0.18139522154410356, + "grad_norm": 1.3966418197549053, + "learning_rate": 4.604864825340512e-05, + "loss": 0.7857, + "num_input_tokens_seen": 299423936, + "step": 1657 + }, + { + "epoch": 0.18150469361504146, + "grad_norm": 1.1260051475529202, + "learning_rate": 4.6044007519434594e-05, + "loss": 0.6768, + "num_input_tokens_seen": 299578720, + "step": 1658 + }, + { + "epoch": 0.18161416568597935, + "grad_norm": 1.152248779521353, + "learning_rate": 4.603936429599821e-05, + "loss": 0.6078, + "num_input_tokens_seen": 299726112, + "step": 1659 + }, + { + "epoch": 0.18172363775691727, + "grad_norm": 1.27556110078121, + "learning_rate": 4.6034718583645244e-05, + "loss": 0.818, + "num_input_tokens_seen": 299916960, + "step": 1660 + }, + { + "epoch": 0.18183310982785517, + "grad_norm": 1.267747958756934, + "learning_rate": 4.603007038292528e-05, + "loss": 0.6143, + "num_input_tokens_seen": 300115648, + "step": 1661 + }, + { + "epoch": 0.18194258189879306, + "grad_norm": 1.254027363759993, + "learning_rate": 4.602541969438819e-05, + "loss": 0.7028, + "num_input_tokens_seen": 300279840, + "step": 1662 + }, + { + "epoch": 0.18205205396973098, + "grad_norm": 1.201333086314019, + "learning_rate": 4.602076651858416e-05, + "loss": 0.7645, + "num_input_tokens_seen": 300503392, + "step": 1663 + }, + { + "epoch": 0.18216152604066888, + "grad_norm": 1.3195752224052082, + "learning_rate": 4.601611085606362e-05, + "loss": 0.7749, + "num_input_tokens_seen": 300698496, + "step": 1664 + }, + { + "epoch": 0.18227099811160677, + "grad_norm": 1.3085125798344788, + "learning_rate": 4.601145270737735e-05, + "loss": 0.6195, + "num_input_tokens_seen": 300862016, + "step": 1665 + }, + { + "epoch": 0.1823804701825447, + "grad_norm": 1.23178345214746, + "learning_rate": 4.6006792073076385e-05, + "loss": 0.7233, + "num_input_tokens_seen": 301058688, + "step": 1666 + }, + { + "epoch": 0.18248994225348258, + "grad_norm": 1.279424157253496, + "learning_rate": 4.600212895371208e-05, + "loss": 0.8297, + "num_input_tokens_seen": 301235200, + "step": 1667 + }, + { + "epoch": 0.18259941432442048, + "grad_norm": 1.256603346898116, + "learning_rate": 4.5997463349836066e-05, + "loss": 0.873, + "num_input_tokens_seen": 301441504, + "step": 1668 + }, + { + "epoch": 0.18270888639535837, + "grad_norm": 1.271898099797309, + "learning_rate": 4.5992795262000285e-05, + "loss": 0.6468, + "num_input_tokens_seen": 301591808, + "step": 1669 + }, + { + "epoch": 0.1828183584662963, + "grad_norm": 1.2704808366449465, + "learning_rate": 4.598812469075695e-05, + "loss": 0.8206, + "num_input_tokens_seen": 301788032, + "step": 1670 + }, + { + "epoch": 0.1829278305372342, + "grad_norm": 1.10654760764916, + "learning_rate": 4.598345163665859e-05, + "loss": 0.695, + "num_input_tokens_seen": 301994784, + "step": 1671 + }, + { + "epoch": 0.18303730260817208, + "grad_norm": 1.166384129145198, + "learning_rate": 4.5978776100258006e-05, + "loss": 0.6364, + "num_input_tokens_seen": 302156960, + "step": 1672 + }, + { + "epoch": 0.18314677467911, + "grad_norm": 1.3237428063043182, + "learning_rate": 4.597409808210832e-05, + "loss": 0.8763, + "num_input_tokens_seen": 302351840, + "step": 1673 + }, + { + "epoch": 0.1832562467500479, + "grad_norm": 1.341060030763405, + "learning_rate": 4.596941758276293e-05, + "loss": 0.7779, + "num_input_tokens_seen": 302553664, + "step": 1674 + }, + { + "epoch": 0.1833657188209858, + "grad_norm": 1.3461096250690552, + "learning_rate": 4.596473460277553e-05, + "loss": 0.7778, + "num_input_tokens_seen": 302754144, + "step": 1675 + }, + { + "epoch": 0.1834751908919237, + "grad_norm": 1.123581734047834, + "learning_rate": 4.5960049142700096e-05, + "loss": 0.5931, + "num_input_tokens_seen": 302931776, + "step": 1676 + }, + { + "epoch": 0.1835846629628616, + "grad_norm": 1.185551500389968, + "learning_rate": 4.595536120309092e-05, + "loss": 0.6655, + "num_input_tokens_seen": 303115232, + "step": 1677 + }, + { + "epoch": 0.1836941350337995, + "grad_norm": 1.4129627229426387, + "learning_rate": 4.595067078450257e-05, + "loss": 0.8402, + "num_input_tokens_seen": 303300032, + "step": 1678 + }, + { + "epoch": 0.1838036071047374, + "grad_norm": 1.3138374598579168, + "learning_rate": 4.5945977887489925e-05, + "loss": 0.8069, + "num_input_tokens_seen": 303513504, + "step": 1679 + }, + { + "epoch": 0.18391307917567531, + "grad_norm": 1.1105877869081329, + "learning_rate": 4.594128251260813e-05, + "loss": 0.4643, + "num_input_tokens_seen": 303698976, + "step": 1680 + }, + { + "epoch": 0.1840225512466132, + "grad_norm": 1.4573677226741393, + "learning_rate": 4.593658466041265e-05, + "loss": 0.7694, + "num_input_tokens_seen": 303853536, + "step": 1681 + }, + { + "epoch": 0.1841320233175511, + "grad_norm": 1.2086729964430527, + "learning_rate": 4.593188433145923e-05, + "loss": 0.6043, + "num_input_tokens_seen": 304040128, + "step": 1682 + }, + { + "epoch": 0.18424149538848902, + "grad_norm": 1.25872991998561, + "learning_rate": 4.5927181526303906e-05, + "loss": 0.5601, + "num_input_tokens_seen": 304212608, + "step": 1683 + }, + { + "epoch": 0.18435096745942692, + "grad_norm": 1.4165269651906323, + "learning_rate": 4.592247624550301e-05, + "loss": 1.0625, + "num_input_tokens_seen": 304425184, + "step": 1684 + }, + { + "epoch": 0.1844604395303648, + "grad_norm": 1.343291448377158, + "learning_rate": 4.591776848961318e-05, + "loss": 0.7646, + "num_input_tokens_seen": 304639328, + "step": 1685 + }, + { + "epoch": 0.1845699116013027, + "grad_norm": 1.3590601852513062, + "learning_rate": 4.591305825919132e-05, + "loss": 0.6684, + "num_input_tokens_seen": 304817408, + "step": 1686 + }, + { + "epoch": 0.18467938367224063, + "grad_norm": 1.4640171821607373, + "learning_rate": 4.590834555479465e-05, + "loss": 0.702, + "num_input_tokens_seen": 305009824, + "step": 1687 + }, + { + "epoch": 0.18478885574317852, + "grad_norm": 1.3727958959890478, + "learning_rate": 4.590363037698067e-05, + "loss": 0.604, + "num_input_tokens_seen": 305182976, + "step": 1688 + }, + { + "epoch": 0.18489832781411641, + "grad_norm": 1.3893636919763497, + "learning_rate": 4.589891272630717e-05, + "loss": 0.8205, + "num_input_tokens_seen": 305355008, + "step": 1689 + }, + { + "epoch": 0.18500779988505434, + "grad_norm": 1.4153135045393976, + "learning_rate": 4.5894192603332254e-05, + "loss": 0.8788, + "num_input_tokens_seen": 305548544, + "step": 1690 + }, + { + "epoch": 0.18511727195599223, + "grad_norm": 1.2153281077302076, + "learning_rate": 4.58894700086143e-05, + "loss": 0.6037, + "num_input_tokens_seen": 305742304, + "step": 1691 + }, + { + "epoch": 0.18522674402693012, + "grad_norm": 1.3114849888782016, + "learning_rate": 4.5884744942711964e-05, + "loss": 0.9564, + "num_input_tokens_seen": 305950848, + "step": 1692 + }, + { + "epoch": 0.18533621609786805, + "grad_norm": 1.3808225980228934, + "learning_rate": 4.588001740618424e-05, + "loss": 0.819, + "num_input_tokens_seen": 306120640, + "step": 1693 + }, + { + "epoch": 0.18544568816880594, + "grad_norm": 1.397379128376054, + "learning_rate": 4.587528739959036e-05, + "loss": 0.8601, + "num_input_tokens_seen": 306301632, + "step": 1694 + }, + { + "epoch": 0.18555516023974383, + "grad_norm": 1.1727222177552468, + "learning_rate": 4.58705549234899e-05, + "loss": 0.6594, + "num_input_tokens_seen": 306441632, + "step": 1695 + }, + { + "epoch": 0.18566463231068173, + "grad_norm": 1.2339179242999476, + "learning_rate": 4.5865819978442685e-05, + "loss": 0.6408, + "num_input_tokens_seen": 306586784, + "step": 1696 + }, + { + "epoch": 0.18577410438161965, + "grad_norm": 1.227020664050412, + "learning_rate": 4.586108256500885e-05, + "loss": 0.8256, + "num_input_tokens_seen": 306764864, + "step": 1697 + }, + { + "epoch": 0.18588357645255754, + "grad_norm": 1.1124577203496004, + "learning_rate": 4.585634268374884e-05, + "loss": 0.5926, + "num_input_tokens_seen": 306965568, + "step": 1698 + }, + { + "epoch": 0.18599304852349544, + "grad_norm": 1.1675860438201164, + "learning_rate": 4.585160033522335e-05, + "loss": 0.5739, + "num_input_tokens_seen": 307133792, + "step": 1699 + }, + { + "epoch": 0.18610252059443336, + "grad_norm": 1.3405853532383984, + "learning_rate": 4.5846855519993404e-05, + "loss": 0.7133, + "num_input_tokens_seen": 307281632, + "step": 1700 + }, + { + "epoch": 0.18621199266537125, + "grad_norm": 1.2221941577454007, + "learning_rate": 4.584210823862031e-05, + "loss": 0.7289, + "num_input_tokens_seen": 307441792, + "step": 1701 + }, + { + "epoch": 0.18632146473630914, + "grad_norm": 1.2094644754908535, + "learning_rate": 4.583735849166564e-05, + "loss": 0.5245, + "num_input_tokens_seen": 307607328, + "step": 1702 + }, + { + "epoch": 0.18643093680724704, + "grad_norm": 1.2825408108856495, + "learning_rate": 4.583260627969131e-05, + "loss": 0.7066, + "num_input_tokens_seen": 307763008, + "step": 1703 + }, + { + "epoch": 0.18654040887818496, + "grad_norm": 1.3042781438907747, + "learning_rate": 4.5827851603259475e-05, + "loss": 0.7144, + "num_input_tokens_seen": 307930336, + "step": 1704 + }, + { + "epoch": 0.18664988094912285, + "grad_norm": 1.440746538906385, + "learning_rate": 4.582309446293261e-05, + "loss": 0.9109, + "num_input_tokens_seen": 308096320, + "step": 1705 + }, + { + "epoch": 0.18675935302006075, + "grad_norm": 1.3154326327715031, + "learning_rate": 4.581833485927348e-05, + "loss": 0.6924, + "num_input_tokens_seen": 308263648, + "step": 1706 + }, + { + "epoch": 0.18686882509099867, + "grad_norm": 1.4989521022063588, + "learning_rate": 4.5813572792845134e-05, + "loss": 0.7552, + "num_input_tokens_seen": 308417984, + "step": 1707 + }, + { + "epoch": 0.18697829716193656, + "grad_norm": 1.2401478654621994, + "learning_rate": 4.580880826421091e-05, + "loss": 0.6805, + "num_input_tokens_seen": 308620256, + "step": 1708 + }, + { + "epoch": 0.18708776923287446, + "grad_norm": 1.3708310215166226, + "learning_rate": 4.580404127393445e-05, + "loss": 0.7154, + "num_input_tokens_seen": 308807744, + "step": 1709 + }, + { + "epoch": 0.18719724130381238, + "grad_norm": 1.2149983887210414, + "learning_rate": 4.579927182257968e-05, + "loss": 0.5956, + "num_input_tokens_seen": 308975072, + "step": 1710 + }, + { + "epoch": 0.18730671337475027, + "grad_norm": 1.5097150115576239, + "learning_rate": 4.579449991071082e-05, + "loss": 0.8904, + "num_input_tokens_seen": 309158080, + "step": 1711 + }, + { + "epoch": 0.18741618544568817, + "grad_norm": 1.2623070918843389, + "learning_rate": 4.578972553889237e-05, + "loss": 0.5257, + "num_input_tokens_seen": 309339968, + "step": 1712 + }, + { + "epoch": 0.18752565751662606, + "grad_norm": 1.318844673410409, + "learning_rate": 4.578494870768912e-05, + "loss": 0.8125, + "num_input_tokens_seen": 309524096, + "step": 1713 + }, + { + "epoch": 0.18763512958756398, + "grad_norm": 1.3605173068868186, + "learning_rate": 4.578016941766619e-05, + "loss": 0.9795, + "num_input_tokens_seen": 309719424, + "step": 1714 + }, + { + "epoch": 0.18774460165850188, + "grad_norm": 1.1809895384710347, + "learning_rate": 4.5775387669388935e-05, + "loss": 0.5501, + "num_input_tokens_seen": 309914976, + "step": 1715 + }, + { + "epoch": 0.18785407372943977, + "grad_norm": 1.1558247377002213, + "learning_rate": 4.5770603463423035e-05, + "loss": 0.7762, + "num_input_tokens_seen": 310101344, + "step": 1716 + }, + { + "epoch": 0.1879635458003777, + "grad_norm": 1.3285933622404302, + "learning_rate": 4.576581680033445e-05, + "loss": 0.7358, + "num_input_tokens_seen": 310286816, + "step": 1717 + }, + { + "epoch": 0.18807301787131558, + "grad_norm": 1.335264783882044, + "learning_rate": 4.576102768068944e-05, + "loss": 0.7913, + "num_input_tokens_seen": 310462432, + "step": 1718 + }, + { + "epoch": 0.18818248994225348, + "grad_norm": 1.3438167111892472, + "learning_rate": 4.5756236105054534e-05, + "loss": 0.6864, + "num_input_tokens_seen": 310600864, + "step": 1719 + }, + { + "epoch": 0.18829196201319137, + "grad_norm": 1.258148527574445, + "learning_rate": 4.575144207399658e-05, + "loss": 0.7649, + "num_input_tokens_seen": 310794624, + "step": 1720 + }, + { + "epoch": 0.1884014340841293, + "grad_norm": 1.1692135916492499, + "learning_rate": 4.574664558808271e-05, + "loss": 0.556, + "num_input_tokens_seen": 310951424, + "step": 1721 + }, + { + "epoch": 0.1885109061550672, + "grad_norm": 1.364403030869454, + "learning_rate": 4.574184664788031e-05, + "loss": 0.7297, + "num_input_tokens_seen": 311130848, + "step": 1722 + }, + { + "epoch": 0.18862037822600508, + "grad_norm": 1.2915789923086811, + "learning_rate": 4.573704525395711e-05, + "loss": 0.7508, + "num_input_tokens_seen": 311330656, + "step": 1723 + }, + { + "epoch": 0.188729850296943, + "grad_norm": 1.2245240493568545, + "learning_rate": 4.573224140688111e-05, + "loss": 0.7034, + "num_input_tokens_seen": 311480288, + "step": 1724 + }, + { + "epoch": 0.1888393223678809, + "grad_norm": 1.1876032433050214, + "learning_rate": 4.5727435107220576e-05, + "loss": 0.5854, + "num_input_tokens_seen": 311653216, + "step": 1725 + }, + { + "epoch": 0.1889487944388188, + "grad_norm": 1.3172828630745155, + "learning_rate": 4.5722626355544085e-05, + "loss": 0.6283, + "num_input_tokens_seen": 311787616, + "step": 1726 + }, + { + "epoch": 0.1890582665097567, + "grad_norm": 1.2900577748838022, + "learning_rate": 4.5717815152420515e-05, + "loss": 0.8857, + "num_input_tokens_seen": 311982944, + "step": 1727 + }, + { + "epoch": 0.1891677385806946, + "grad_norm": 1.3895309945486647, + "learning_rate": 4.5713001498419025e-05, + "loss": 0.8558, + "num_input_tokens_seen": 312163040, + "step": 1728 + }, + { + "epoch": 0.1892772106516325, + "grad_norm": 1.1814117966708257, + "learning_rate": 4.570818539410905e-05, + "loss": 0.7916, + "num_input_tokens_seen": 312342240, + "step": 1729 + }, + { + "epoch": 0.1893866827225704, + "grad_norm": 1.2136927116380762, + "learning_rate": 4.5703366840060335e-05, + "loss": 0.6366, + "num_input_tokens_seen": 312512928, + "step": 1730 + }, + { + "epoch": 0.18949615479350831, + "grad_norm": 1.3623152756364967, + "learning_rate": 4.5698545836842896e-05, + "loss": 0.7346, + "num_input_tokens_seen": 312678016, + "step": 1731 + }, + { + "epoch": 0.1896056268644462, + "grad_norm": 1.4040325908247366, + "learning_rate": 4.569372238502705e-05, + "loss": 0.7743, + "num_input_tokens_seen": 312838400, + "step": 1732 + }, + { + "epoch": 0.1897150989353841, + "grad_norm": 1.3044090659167191, + "learning_rate": 4.568889648518341e-05, + "loss": 0.7329, + "num_input_tokens_seen": 312994528, + "step": 1733 + }, + { + "epoch": 0.18982457100632202, + "grad_norm": 1.2043349882090377, + "learning_rate": 4.568406813788287e-05, + "loss": 0.8068, + "num_input_tokens_seen": 313172384, + "step": 1734 + }, + { + "epoch": 0.18993404307725992, + "grad_norm": 1.185773347091126, + "learning_rate": 4.5679237343696604e-05, + "loss": 0.7402, + "num_input_tokens_seen": 313359872, + "step": 1735 + }, + { + "epoch": 0.1900435151481978, + "grad_norm": 1.3483405545815013, + "learning_rate": 4.567440410319609e-05, + "loss": 0.7864, + "num_input_tokens_seen": 313520704, + "step": 1736 + }, + { + "epoch": 0.1901529872191357, + "grad_norm": 1.2627140052633197, + "learning_rate": 4.56695684169531e-05, + "loss": 0.9233, + "num_input_tokens_seen": 313712224, + "step": 1737 + }, + { + "epoch": 0.19026245929007363, + "grad_norm": 1.3193747898031163, + "learning_rate": 4.5664730285539684e-05, + "loss": 0.8014, + "num_input_tokens_seen": 313900832, + "step": 1738 + }, + { + "epoch": 0.19037193136101152, + "grad_norm": 1.256655095390317, + "learning_rate": 4.565988970952817e-05, + "loss": 0.7296, + "num_input_tokens_seen": 314086304, + "step": 1739 + }, + { + "epoch": 0.19048140343194941, + "grad_norm": 1.313999726999449, + "learning_rate": 4.5655046689491204e-05, + "loss": 0.761, + "num_input_tokens_seen": 314264384, + "step": 1740 + }, + { + "epoch": 0.19059087550288734, + "grad_norm": 1.3535448211559575, + "learning_rate": 4.56502012260017e-05, + "loss": 0.7569, + "num_input_tokens_seen": 314404608, + "step": 1741 + }, + { + "epoch": 0.19070034757382523, + "grad_norm": 1.2265642008192577, + "learning_rate": 4.564535331963287e-05, + "loss": 0.6949, + "num_input_tokens_seen": 314576192, + "step": 1742 + }, + { + "epoch": 0.19080981964476312, + "grad_norm": 1.180489475919094, + "learning_rate": 4.56405029709582e-05, + "loss": 0.6304, + "num_input_tokens_seen": 314738592, + "step": 1743 + }, + { + "epoch": 0.19091929171570104, + "grad_norm": 1.364391847332586, + "learning_rate": 4.5635650180551494e-05, + "loss": 0.6863, + "num_input_tokens_seen": 314924960, + "step": 1744 + }, + { + "epoch": 0.19102876378663894, + "grad_norm": 1.2266734856609496, + "learning_rate": 4.5630794948986814e-05, + "loss": 0.5743, + "num_input_tokens_seen": 315058016, + "step": 1745 + }, + { + "epoch": 0.19113823585757683, + "grad_norm": 1.2649992820677398, + "learning_rate": 4.562593727683854e-05, + "loss": 0.6203, + "num_input_tokens_seen": 315254912, + "step": 1746 + }, + { + "epoch": 0.19124770792851473, + "grad_norm": 1.3815159727147224, + "learning_rate": 4.562107716468131e-05, + "loss": 0.6785, + "num_input_tokens_seen": 315426944, + "step": 1747 + }, + { + "epoch": 0.19135717999945265, + "grad_norm": 1.1569094452761166, + "learning_rate": 4.561621461309007e-05, + "loss": 0.6488, + "num_input_tokens_seen": 315621824, + "step": 1748 + }, + { + "epoch": 0.19146665207039054, + "grad_norm": 1.2573091909751197, + "learning_rate": 4.561134962264006e-05, + "loss": 0.7032, + "num_input_tokens_seen": 315816928, + "step": 1749 + }, + { + "epoch": 0.19157612414132844, + "grad_norm": 1.24555436182395, + "learning_rate": 4.560648219390678e-05, + "loss": 0.6696, + "num_input_tokens_seen": 315982688, + "step": 1750 + }, + { + "epoch": 0.19168559621226636, + "grad_norm": 1.265288555496231, + "learning_rate": 4.560161232746606e-05, + "loss": 0.8413, + "num_input_tokens_seen": 316194816, + "step": 1751 + }, + { + "epoch": 0.19179506828320425, + "grad_norm": 1.2566237429835765, + "learning_rate": 4.5596740023893986e-05, + "loss": 0.8308, + "num_input_tokens_seen": 316410752, + "step": 1752 + }, + { + "epoch": 0.19190454035414214, + "grad_norm": 1.1322065640180912, + "learning_rate": 4.559186528376694e-05, + "loss": 0.54, + "num_input_tokens_seen": 316593088, + "step": 1753 + }, + { + "epoch": 0.19201401242508004, + "grad_norm": 1.2353629974921603, + "learning_rate": 4.558698810766159e-05, + "loss": 0.7964, + "num_input_tokens_seen": 316767808, + "step": 1754 + }, + { + "epoch": 0.19212348449601796, + "grad_norm": 1.133739381633188, + "learning_rate": 4.558210849615491e-05, + "loss": 0.4845, + "num_input_tokens_seen": 316934688, + "step": 1755 + }, + { + "epoch": 0.19223295656695585, + "grad_norm": 1.2772266205913851, + "learning_rate": 4.557722644982414e-05, + "loss": 0.7998, + "num_input_tokens_seen": 317108288, + "step": 1756 + }, + { + "epoch": 0.19234242863789375, + "grad_norm": 1.253848921276349, + "learning_rate": 4.5572341969246814e-05, + "loss": 0.6236, + "num_input_tokens_seen": 317283456, + "step": 1757 + }, + { + "epoch": 0.19245190070883167, + "grad_norm": 1.2685578355757465, + "learning_rate": 4.556745505500076e-05, + "loss": 0.8564, + "num_input_tokens_seen": 317472960, + "step": 1758 + }, + { + "epoch": 0.19256137277976956, + "grad_norm": 1.1408451860665263, + "learning_rate": 4.55625657076641e-05, + "loss": 0.5888, + "num_input_tokens_seen": 317666496, + "step": 1759 + }, + { + "epoch": 0.19267084485070746, + "grad_norm": 1.3125794704537757, + "learning_rate": 4.555767392781522e-05, + "loss": 0.8448, + "num_input_tokens_seen": 317863840, + "step": 1760 + }, + { + "epoch": 0.19278031692164538, + "grad_norm": 1.3263593268747846, + "learning_rate": 4.5552779716032815e-05, + "loss": 0.7811, + "num_input_tokens_seen": 318042368, + "step": 1761 + }, + { + "epoch": 0.19288978899258327, + "grad_norm": 1.3689743136886487, + "learning_rate": 4.554788307289585e-05, + "loss": 0.8339, + "num_input_tokens_seen": 318253600, + "step": 1762 + }, + { + "epoch": 0.19299926106352117, + "grad_norm": 1.1823214500585768, + "learning_rate": 4.5542983998983605e-05, + "loss": 0.7666, + "num_input_tokens_seen": 318432576, + "step": 1763 + }, + { + "epoch": 0.19310873313445906, + "grad_norm": 1.0951512760893545, + "learning_rate": 4.5538082494875626e-05, + "loss": 0.4976, + "num_input_tokens_seen": 318582880, + "step": 1764 + }, + { + "epoch": 0.19321820520539698, + "grad_norm": 1.402577882735999, + "learning_rate": 4.553317856115176e-05, + "loss": 0.8529, + "num_input_tokens_seen": 318781568, + "step": 1765 + }, + { + "epoch": 0.19332767727633487, + "grad_norm": 1.199040601588542, + "learning_rate": 4.552827219839211e-05, + "loss": 0.7746, + "num_input_tokens_seen": 318959872, + "step": 1766 + }, + { + "epoch": 0.19343714934727277, + "grad_norm": 1.1547898016225275, + "learning_rate": 4.55233634071771e-05, + "loss": 0.5816, + "num_input_tokens_seen": 319149152, + "step": 1767 + }, + { + "epoch": 0.1935466214182107, + "grad_norm": 1.3060656823616341, + "learning_rate": 4.5518452188087444e-05, + "loss": 0.89, + "num_input_tokens_seen": 319311104, + "step": 1768 + }, + { + "epoch": 0.19365609348914858, + "grad_norm": 1.302289941290542, + "learning_rate": 4.551353854170411e-05, + "loss": 0.7718, + "num_input_tokens_seen": 319489184, + "step": 1769 + }, + { + "epoch": 0.19376556556008648, + "grad_norm": 1.216233663498252, + "learning_rate": 4.550862246860839e-05, + "loss": 0.7554, + "num_input_tokens_seen": 319679584, + "step": 1770 + }, + { + "epoch": 0.19387503763102437, + "grad_norm": 1.2798970685491187, + "learning_rate": 4.5503703969381826e-05, + "loss": 0.7449, + "num_input_tokens_seen": 319879392, + "step": 1771 + }, + { + "epoch": 0.1939845097019623, + "grad_norm": 1.3103765840368895, + "learning_rate": 4.5498783044606285e-05, + "loss": 0.6338, + "num_input_tokens_seen": 320044256, + "step": 1772 + }, + { + "epoch": 0.1940939817729002, + "grad_norm": 1.2036539376532247, + "learning_rate": 4.5493859694863894e-05, + "loss": 0.6581, + "num_input_tokens_seen": 320241152, + "step": 1773 + }, + { + "epoch": 0.19420345384383808, + "grad_norm": 1.3460591458451832, + "learning_rate": 4.5488933920737087e-05, + "loss": 0.8496, + "num_input_tokens_seen": 320391680, + "step": 1774 + }, + { + "epoch": 0.194312925914776, + "grad_norm": 1.3270136053829515, + "learning_rate": 4.5484005722808566e-05, + "loss": 0.7812, + "num_input_tokens_seen": 320592832, + "step": 1775 + }, + { + "epoch": 0.1944223979857139, + "grad_norm": 1.2490906268577968, + "learning_rate": 4.5479075101661316e-05, + "loss": 0.7774, + "num_input_tokens_seen": 320757696, + "step": 1776 + }, + { + "epoch": 0.1945318700566518, + "grad_norm": 1.2421564542141665, + "learning_rate": 4.5474142057878636e-05, + "loss": 0.664, + "num_input_tokens_seen": 320967584, + "step": 1777 + }, + { + "epoch": 0.1946413421275897, + "grad_norm": 1.383317542656239, + "learning_rate": 4.546920659204409e-05, + "loss": 0.806, + "num_input_tokens_seen": 321136704, + "step": 1778 + }, + { + "epoch": 0.1947508141985276, + "grad_norm": 1.2963523271097281, + "learning_rate": 4.546426870474154e-05, + "loss": 0.7577, + "num_input_tokens_seen": 321349504, + "step": 1779 + }, + { + "epoch": 0.1948602862694655, + "grad_norm": 1.2200807983971018, + "learning_rate": 4.5459328396555114e-05, + "loss": 0.7678, + "num_input_tokens_seen": 321558720, + "step": 1780 + }, + { + "epoch": 0.1949697583404034, + "grad_norm": 1.1520201665152925, + "learning_rate": 4.545438566806925e-05, + "loss": 0.693, + "num_input_tokens_seen": 321740160, + "step": 1781 + }, + { + "epoch": 0.19507923041134131, + "grad_norm": 1.2752283995009912, + "learning_rate": 4.5449440519868675e-05, + "loss": 0.7572, + "num_input_tokens_seen": 321929888, + "step": 1782 + }, + { + "epoch": 0.1951887024822792, + "grad_norm": 1.2857947806864307, + "learning_rate": 4.544449295253837e-05, + "loss": 0.5926, + "num_input_tokens_seen": 322055328, + "step": 1783 + }, + { + "epoch": 0.1952981745532171, + "grad_norm": 1.251537045566439, + "learning_rate": 4.543954296666363e-05, + "loss": 0.708, + "num_input_tokens_seen": 322217728, + "step": 1784 + }, + { + "epoch": 0.19540764662415502, + "grad_norm": 1.1882681119193779, + "learning_rate": 4.5434590562830035e-05, + "loss": 0.6501, + "num_input_tokens_seen": 322398048, + "step": 1785 + }, + { + "epoch": 0.19551711869509292, + "grad_norm": 1.390195501630253, + "learning_rate": 4.542963574162344e-05, + "loss": 0.7077, + "num_input_tokens_seen": 322562688, + "step": 1786 + }, + { + "epoch": 0.1956265907660308, + "grad_norm": 1.262070267312561, + "learning_rate": 4.542467850363e-05, + "loss": 0.6376, + "num_input_tokens_seen": 322744352, + "step": 1787 + }, + { + "epoch": 0.1957360628369687, + "grad_norm": 1.9429088894680697, + "learning_rate": 4.541971884943613e-05, + "loss": 1.2777, + "num_input_tokens_seen": 322963424, + "step": 1788 + }, + { + "epoch": 0.19584553490790663, + "grad_norm": 1.2336852162366725, + "learning_rate": 4.5414756779628556e-05, + "loss": 0.8742, + "num_input_tokens_seen": 323149792, + "step": 1789 + }, + { + "epoch": 0.19595500697884452, + "grad_norm": 1.024073919768651, + "learning_rate": 4.5409792294794284e-05, + "loss": 0.6351, + "num_input_tokens_seen": 323338176, + "step": 1790 + }, + { + "epoch": 0.19606447904978241, + "grad_norm": 1.33444563345936, + "learning_rate": 4.54048253955206e-05, + "loss": 0.6919, + "num_input_tokens_seen": 323499904, + "step": 1791 + }, + { + "epoch": 0.19617395112072034, + "grad_norm": 1.099869800578315, + "learning_rate": 4.5399856082395074e-05, + "loss": 0.787, + "num_input_tokens_seen": 323684480, + "step": 1792 + }, + { + "epoch": 0.19628342319165823, + "grad_norm": 1.2160104135707372, + "learning_rate": 4.5394884356005574e-05, + "loss": 0.6584, + "num_input_tokens_seen": 323868832, + "step": 1793 + }, + { + "epoch": 0.19639289526259612, + "grad_norm": 1.3475664191782848, + "learning_rate": 4.538991021694025e-05, + "loss": 0.9239, + "num_input_tokens_seen": 324041312, + "step": 1794 + }, + { + "epoch": 0.19650236733353404, + "grad_norm": 1.2612991816418306, + "learning_rate": 4.5384933665787524e-05, + "loss": 0.764, + "num_input_tokens_seen": 324250304, + "step": 1795 + }, + { + "epoch": 0.19661183940447194, + "grad_norm": 1.257896894384011, + "learning_rate": 4.537995470313611e-05, + "loss": 0.7487, + "num_input_tokens_seen": 324412032, + "step": 1796 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 1.2217089606055234, + "learning_rate": 4.537497332957501e-05, + "loss": 0.6143, + "num_input_tokens_seen": 324591008, + "step": 1797 + }, + { + "epoch": 0.19683078354634773, + "grad_norm": 1.3054458333075145, + "learning_rate": 4.536998954569353e-05, + "loss": 0.8605, + "num_input_tokens_seen": 324757888, + "step": 1798 + }, + { + "epoch": 0.19694025561728565, + "grad_norm": 1.1003722124654851, + "learning_rate": 4.536500335208121e-05, + "loss": 0.5422, + "num_input_tokens_seen": 324932160, + "step": 1799 + }, + { + "epoch": 0.19704972768822354, + "grad_norm": 1.227926382840362, + "learning_rate": 4.536001474932793e-05, + "loss": 0.7896, + "num_input_tokens_seen": 325134880, + "step": 1800 + }, + { + "epoch": 0.19715919975916144, + "grad_norm": 1.356230771679966, + "learning_rate": 4.535502373802383e-05, + "loss": 0.92, + "num_input_tokens_seen": 325292800, + "step": 1801 + }, + { + "epoch": 0.19726867183009936, + "grad_norm": 1.251132642670986, + "learning_rate": 4.535003031875934e-05, + "loss": 0.9008, + "num_input_tokens_seen": 325484096, + "step": 1802 + }, + { + "epoch": 0.19737814390103725, + "grad_norm": 1.150463004822577, + "learning_rate": 4.534503449212516e-05, + "loss": 0.6776, + "num_input_tokens_seen": 325674272, + "step": 1803 + }, + { + "epoch": 0.19748761597197514, + "grad_norm": 1.076570523568471, + "learning_rate": 4.534003625871229e-05, + "loss": 0.6254, + "num_input_tokens_seen": 325862656, + "step": 1804 + }, + { + "epoch": 0.19759708804291304, + "grad_norm": 1.2415050815350765, + "learning_rate": 4.533503561911202e-05, + "loss": 0.8917, + "num_input_tokens_seen": 326057984, + "step": 1805 + }, + { + "epoch": 0.19770656011385096, + "grad_norm": 1.320878218039604, + "learning_rate": 4.5330032573915903e-05, + "loss": 0.8582, + "num_input_tokens_seen": 326232256, + "step": 1806 + }, + { + "epoch": 0.19781603218478885, + "grad_norm": 1.2037244021980877, + "learning_rate": 4.53250271237158e-05, + "loss": 0.734, + "num_input_tokens_seen": 326412800, + "step": 1807 + }, + { + "epoch": 0.19792550425572675, + "grad_norm": 1.1527712271528507, + "learning_rate": 4.532001926910385e-05, + "loss": 0.7288, + "num_input_tokens_seen": 326589312, + "step": 1808 + }, + { + "epoch": 0.19803497632666467, + "grad_norm": 1.1961174542092077, + "learning_rate": 4.531500901067246e-05, + "loss": 0.812, + "num_input_tokens_seen": 326775008, + "step": 1809 + }, + { + "epoch": 0.19814444839760256, + "grad_norm": 1.4125977828351783, + "learning_rate": 4.5309996349014336e-05, + "loss": 0.7664, + "num_input_tokens_seen": 326953536, + "step": 1810 + }, + { + "epoch": 0.19825392046854046, + "grad_norm": 1.1330017984010676, + "learning_rate": 4.5304981284722484e-05, + "loss": 0.684, + "num_input_tokens_seen": 327135424, + "step": 1811 + }, + { + "epoch": 0.19836339253947838, + "grad_norm": 1.2003143474998648, + "learning_rate": 4.5299963818390144e-05, + "loss": 0.8002, + "num_input_tokens_seen": 327329632, + "step": 1812 + }, + { + "epoch": 0.19847286461041627, + "grad_norm": 1.3107206908708386, + "learning_rate": 4.5294943950610904e-05, + "loss": 0.7857, + "num_input_tokens_seen": 327479936, + "step": 1813 + }, + { + "epoch": 0.19858233668135417, + "grad_norm": 1.2435609913506391, + "learning_rate": 4.528992168197859e-05, + "loss": 0.6425, + "num_input_tokens_seen": 327668320, + "step": 1814 + }, + { + "epoch": 0.19869180875229206, + "grad_norm": 1.333410896588379, + "learning_rate": 4.5284897013087326e-05, + "loss": 0.7205, + "num_input_tokens_seen": 327841920, + "step": 1815 + }, + { + "epoch": 0.19880128082322998, + "grad_norm": 1.2471229344962922, + "learning_rate": 4.527986994453152e-05, + "loss": 0.8387, + "num_input_tokens_seen": 328045536, + "step": 1816 + }, + { + "epoch": 0.19891075289416787, + "grad_norm": 1.2350041681288835, + "learning_rate": 4.5274840476905873e-05, + "loss": 0.5109, + "num_input_tokens_seen": 328208832, + "step": 1817 + }, + { + "epoch": 0.19902022496510577, + "grad_norm": 1.37072483761958, + "learning_rate": 4.526980861080535e-05, + "loss": 0.8093, + "num_input_tokens_seen": 328397888, + "step": 1818 + }, + { + "epoch": 0.1991296970360437, + "grad_norm": 1.2605366084931124, + "learning_rate": 4.5264774346825226e-05, + "loss": 0.6252, + "num_input_tokens_seen": 328565216, + "step": 1819 + }, + { + "epoch": 0.19923916910698158, + "grad_norm": 1.2905442478142337, + "learning_rate": 4.5259737685561035e-05, + "loss": 0.7201, + "num_input_tokens_seen": 328753152, + "step": 1820 + }, + { + "epoch": 0.19934864117791948, + "grad_norm": 1.2502566074195978, + "learning_rate": 4.52546986276086e-05, + "loss": 0.7332, + "num_input_tokens_seen": 328952512, + "step": 1821 + }, + { + "epoch": 0.19945811324885737, + "grad_norm": 1.0953153036084393, + "learning_rate": 4.524965717356405e-05, + "loss": 0.5897, + "num_input_tokens_seen": 329144704, + "step": 1822 + }, + { + "epoch": 0.1995675853197953, + "grad_norm": 1.0913203910855627, + "learning_rate": 4.524461332402375e-05, + "loss": 0.5396, + "num_input_tokens_seen": 329324800, + "step": 1823 + }, + { + "epoch": 0.1996770573907332, + "grad_norm": 1.2209278601721594, + "learning_rate": 4.523956707958441e-05, + "loss": 0.6519, + "num_input_tokens_seen": 329487200, + "step": 1824 + }, + { + "epoch": 0.19978652946167108, + "grad_norm": 1.2864859522687477, + "learning_rate": 4.523451844084297e-05, + "loss": 0.8408, + "num_input_tokens_seen": 329679392, + "step": 1825 + }, + { + "epoch": 0.199896001532609, + "grad_norm": 1.2888195849335313, + "learning_rate": 4.5229467408396686e-05, + "loss": 0.6825, + "num_input_tokens_seen": 329848960, + "step": 1826 + }, + { + "epoch": 0.2000054736035469, + "grad_norm": 1.2343866854730212, + "learning_rate": 4.5224413982843075e-05, + "loss": 0.7168, + "num_input_tokens_seen": 330012704, + "step": 1827 + }, + { + "epoch": 0.2001149456744848, + "grad_norm": 1.3061186655996957, + "learning_rate": 4.521935816477995e-05, + "loss": 0.6705, + "num_input_tokens_seen": 330208704, + "step": 1828 + }, + { + "epoch": 0.2002244177454227, + "grad_norm": 1.2068886780716892, + "learning_rate": 4.5214299954805404e-05, + "loss": 0.5973, + "num_input_tokens_seen": 330363040, + "step": 1829 + }, + { + "epoch": 0.2003338898163606, + "grad_norm": 1.3294822041302163, + "learning_rate": 4.520923935351782e-05, + "loss": 0.6642, + "num_input_tokens_seen": 330551648, + "step": 1830 + }, + { + "epoch": 0.2004433618872985, + "grad_norm": 1.2032846617343804, + "learning_rate": 4.520417636151586e-05, + "loss": 0.6393, + "num_input_tokens_seen": 330732640, + "step": 1831 + }, + { + "epoch": 0.2005528339582364, + "grad_norm": 1.1315290876047248, + "learning_rate": 4.5199110979398454e-05, + "loss": 0.728, + "num_input_tokens_seen": 330914528, + "step": 1832 + }, + { + "epoch": 0.20066230602917431, + "grad_norm": 1.384447930268508, + "learning_rate": 4.5194043207764835e-05, + "loss": 0.6575, + "num_input_tokens_seen": 331119712, + "step": 1833 + }, + { + "epoch": 0.2007717781001122, + "grad_norm": 1.3592579670655562, + "learning_rate": 4.5188973047214514e-05, + "loss": 0.7018, + "num_input_tokens_seen": 331300704, + "step": 1834 + }, + { + "epoch": 0.2008812501710501, + "grad_norm": 1.2033577235665667, + "learning_rate": 4.518390049834727e-05, + "loss": 0.6257, + "num_input_tokens_seen": 331492224, + "step": 1835 + }, + { + "epoch": 0.20099072224198802, + "grad_norm": 1.3640476466328242, + "learning_rate": 4.517882556176318e-05, + "loss": 0.7575, + "num_input_tokens_seen": 331644768, + "step": 1836 + }, + { + "epoch": 0.20110019431292592, + "grad_norm": 1.3768358978800495, + "learning_rate": 4.51737482380626e-05, + "loss": 0.7399, + "num_input_tokens_seen": 331826432, + "step": 1837 + }, + { + "epoch": 0.2012096663838638, + "grad_norm": 1.113670147346219, + "learning_rate": 4.516866852784618e-05, + "loss": 0.6275, + "num_input_tokens_seen": 332001152, + "step": 1838 + }, + { + "epoch": 0.2013191384548017, + "grad_norm": 1.3728450013569828, + "learning_rate": 4.516358643171482e-05, + "loss": 0.8783, + "num_input_tokens_seen": 332196928, + "step": 1839 + }, + { + "epoch": 0.20142861052573963, + "grad_norm": 1.21147916430962, + "learning_rate": 4.515850195026974e-05, + "loss": 0.6919, + "num_input_tokens_seen": 332383744, + "step": 1840 + }, + { + "epoch": 0.20153808259667752, + "grad_norm": 1.2210579074277892, + "learning_rate": 4.5153415084112406e-05, + "loss": 0.6655, + "num_input_tokens_seen": 332574592, + "step": 1841 + }, + { + "epoch": 0.20164755466761541, + "grad_norm": 1.2770458655697765, + "learning_rate": 4.5148325833844595e-05, + "loss": 0.7179, + "num_input_tokens_seen": 332767456, + "step": 1842 + }, + { + "epoch": 0.20175702673855334, + "grad_norm": 1.255429065126984, + "learning_rate": 4.514323420006836e-05, + "loss": 0.5503, + "num_input_tokens_seen": 332904992, + "step": 1843 + }, + { + "epoch": 0.20186649880949123, + "grad_norm": 1.4760159888913436, + "learning_rate": 4.5138140183386025e-05, + "loss": 1.0478, + "num_input_tokens_seen": 333109952, + "step": 1844 + }, + { + "epoch": 0.20197597088042912, + "grad_norm": 1.168133523092511, + "learning_rate": 4.51330437844002e-05, + "loss": 0.7447, + "num_input_tokens_seen": 333285344, + "step": 1845 + }, + { + "epoch": 0.20208544295136704, + "grad_norm": 1.1853813043201646, + "learning_rate": 4.512794500371379e-05, + "loss": 0.7165, + "num_input_tokens_seen": 333478432, + "step": 1846 + }, + { + "epoch": 0.20219491502230494, + "grad_norm": 1.229672710324915, + "learning_rate": 4.5122843841929965e-05, + "loss": 0.8159, + "num_input_tokens_seen": 333687872, + "step": 1847 + }, + { + "epoch": 0.20230438709324283, + "grad_norm": 1.306080742415381, + "learning_rate": 4.5117740299652175e-05, + "loss": 0.7467, + "num_input_tokens_seen": 333809504, + "step": 1848 + }, + { + "epoch": 0.20241385916418073, + "grad_norm": 1.4086696999925366, + "learning_rate": 4.511263437748416e-05, + "loss": 0.8614, + "num_input_tokens_seen": 333958464, + "step": 1849 + }, + { + "epoch": 0.20252333123511865, + "grad_norm": 1.280874429877787, + "learning_rate": 4.510752607602996e-05, + "loss": 0.7666, + "num_input_tokens_seen": 334115264, + "step": 1850 + }, + { + "epoch": 0.20263280330605654, + "grad_norm": 1.0672052206017542, + "learning_rate": 4.510241539589386e-05, + "loss": 0.5959, + "num_input_tokens_seen": 334293792, + "step": 1851 + }, + { + "epoch": 0.20274227537699444, + "grad_norm": 1.1691615925191852, + "learning_rate": 4.509730233768045e-05, + "loss": 0.662, + "num_input_tokens_seen": 334467616, + "step": 1852 + }, + { + "epoch": 0.20285174744793236, + "grad_norm": 1.2125496745994684, + "learning_rate": 4.5092186901994594e-05, + "loss": 0.551, + "num_input_tokens_seen": 334666304, + "step": 1853 + }, + { + "epoch": 0.20296121951887025, + "grad_norm": 1.3071609624238614, + "learning_rate": 4.5087069089441434e-05, + "loss": 0.6095, + "num_input_tokens_seen": 334850880, + "step": 1854 + }, + { + "epoch": 0.20307069158980814, + "grad_norm": 1.202581951893846, + "learning_rate": 4.50819489006264e-05, + "loss": 0.6165, + "num_input_tokens_seen": 335013728, + "step": 1855 + }, + { + "epoch": 0.20318016366074607, + "grad_norm": 1.4970842474023265, + "learning_rate": 4.5076826336155196e-05, + "loss": 0.8556, + "num_input_tokens_seen": 335142752, + "step": 1856 + }, + { + "epoch": 0.20328963573168396, + "grad_norm": 1.4861690721864915, + "learning_rate": 4.507170139663382e-05, + "loss": 0.8702, + "num_input_tokens_seen": 335284320, + "step": 1857 + }, + { + "epoch": 0.20339910780262185, + "grad_norm": 1.6231004910386817, + "learning_rate": 4.506657408266855e-05, + "loss": 0.7581, + "num_input_tokens_seen": 335446944, + "step": 1858 + }, + { + "epoch": 0.20350857987355975, + "grad_norm": 1.3355819050428244, + "learning_rate": 4.506144439486591e-05, + "loss": 1.0415, + "num_input_tokens_seen": 335659296, + "step": 1859 + }, + { + "epoch": 0.20361805194449767, + "grad_norm": 1.1058968298150083, + "learning_rate": 4.5056312333832764e-05, + "loss": 0.5702, + "num_input_tokens_seen": 335815872, + "step": 1860 + }, + { + "epoch": 0.20372752401543556, + "grad_norm": 1.141856169229549, + "learning_rate": 4.505117790017621e-05, + "loss": 0.631, + "num_input_tokens_seen": 336027552, + "step": 1861 + }, + { + "epoch": 0.20383699608637346, + "grad_norm": 1.2929250473519602, + "learning_rate": 4.504604109450363e-05, + "loss": 0.6895, + "num_input_tokens_seen": 336215264, + "step": 1862 + }, + { + "epoch": 0.20394646815731138, + "grad_norm": 1.4034654811668088, + "learning_rate": 4.504090191742272e-05, + "loss": 0.8398, + "num_input_tokens_seen": 336386400, + "step": 1863 + }, + { + "epoch": 0.20405594022824927, + "grad_norm": 1.2694455631389368, + "learning_rate": 4.503576036954142e-05, + "loss": 0.8084, + "num_input_tokens_seen": 336585760, + "step": 1864 + }, + { + "epoch": 0.20416541229918717, + "grad_norm": 1.221307418785342, + "learning_rate": 4.5030616451467964e-05, + "loss": 0.8067, + "num_input_tokens_seen": 336792064, + "step": 1865 + }, + { + "epoch": 0.20427488437012506, + "grad_norm": 1.4665574285979164, + "learning_rate": 4.502547016381089e-05, + "loss": 0.7804, + "num_input_tokens_seen": 336964320, + "step": 1866 + }, + { + "epoch": 0.20438435644106298, + "grad_norm": 1.3138340119740763, + "learning_rate": 4.5020321507178965e-05, + "loss": 0.8149, + "num_input_tokens_seen": 337153824, + "step": 1867 + }, + { + "epoch": 0.20449382851200087, + "grad_norm": 1.245377529078543, + "learning_rate": 4.501517048218128e-05, + "loss": 0.7003, + "num_input_tokens_seen": 337320480, + "step": 1868 + }, + { + "epoch": 0.20460330058293877, + "grad_norm": 1.4590860150130092, + "learning_rate": 4.5010017089427195e-05, + "loss": 0.7328, + "num_input_tokens_seen": 337478848, + "step": 1869 + }, + { + "epoch": 0.2047127726538767, + "grad_norm": 1.1359392357646374, + "learning_rate": 4.500486132952634e-05, + "loss": 0.6813, + "num_input_tokens_seen": 337669248, + "step": 1870 + }, + { + "epoch": 0.20482224472481458, + "grad_norm": 1.2348336442111272, + "learning_rate": 4.499970320308863e-05, + "loss": 0.5509, + "num_input_tokens_seen": 337808128, + "step": 1871 + }, + { + "epoch": 0.20493171679575248, + "grad_norm": 1.2922519414071296, + "learning_rate": 4.4994542710724264e-05, + "loss": 0.5976, + "num_input_tokens_seen": 337979264, + "step": 1872 + }, + { + "epoch": 0.2050411888666904, + "grad_norm": 1.2393593412132442, + "learning_rate": 4.498937985304371e-05, + "loss": 0.619, + "num_input_tokens_seen": 338153088, + "step": 1873 + }, + { + "epoch": 0.2051506609376283, + "grad_norm": 1.2177864080134486, + "learning_rate": 4.4984214630657744e-05, + "loss": 0.7421, + "num_input_tokens_seen": 338347744, + "step": 1874 + }, + { + "epoch": 0.2052601330085662, + "grad_norm": 1.4050769151560984, + "learning_rate": 4.497904704417739e-05, + "loss": 1.0407, + "num_input_tokens_seen": 338535680, + "step": 1875 + }, + { + "epoch": 0.20536960507950408, + "grad_norm": 1.3625250962772133, + "learning_rate": 4.4973877094213954e-05, + "loss": 0.6343, + "num_input_tokens_seen": 338662240, + "step": 1876 + }, + { + "epoch": 0.205479077150442, + "grad_norm": 1.3836102959081336, + "learning_rate": 4.496870478137906e-05, + "loss": 0.7782, + "num_input_tokens_seen": 338840992, + "step": 1877 + }, + { + "epoch": 0.2055885492213799, + "grad_norm": 1.2465219748434844, + "learning_rate": 4.496353010628455e-05, + "loss": 0.6019, + "num_input_tokens_seen": 339032064, + "step": 1878 + }, + { + "epoch": 0.2056980212923178, + "grad_norm": 1.2390294747683719, + "learning_rate": 4.495835306954259e-05, + "loss": 0.6098, + "num_input_tokens_seen": 339198944, + "step": 1879 + }, + { + "epoch": 0.2058074933632557, + "grad_norm": 1.351740566276479, + "learning_rate": 4.495317367176562e-05, + "loss": 0.6677, + "num_input_tokens_seen": 339359552, + "step": 1880 + }, + { + "epoch": 0.2059169654341936, + "grad_norm": 1.3224085134724883, + "learning_rate": 4.4947991913566355e-05, + "loss": 0.8619, + "num_input_tokens_seen": 339544128, + "step": 1881 + }, + { + "epoch": 0.2060264375051315, + "grad_norm": 1.3714232307000847, + "learning_rate": 4.494280779555777e-05, + "loss": 1.0113, + "num_input_tokens_seen": 339720192, + "step": 1882 + }, + { + "epoch": 0.2061359095760694, + "grad_norm": 1.1184978136937376, + "learning_rate": 4.493762131835315e-05, + "loss": 0.5132, + "num_input_tokens_seen": 339892448, + "step": 1883 + }, + { + "epoch": 0.20624538164700731, + "grad_norm": 1.2468893925115805, + "learning_rate": 4.4932432482566045e-05, + "loss": 0.5052, + "num_input_tokens_seen": 340049024, + "step": 1884 + }, + { + "epoch": 0.2063548537179452, + "grad_norm": 1.4199811458679166, + "learning_rate": 4.492724128881029e-05, + "loss": 0.7394, + "num_input_tokens_seen": 340253312, + "step": 1885 + }, + { + "epoch": 0.2064643257888831, + "grad_norm": 1.2779890291223666, + "learning_rate": 4.492204773769997e-05, + "loss": 0.6585, + "num_input_tokens_seen": 340421088, + "step": 1886 + }, + { + "epoch": 0.20657379785982102, + "grad_norm": 1.3117187081950978, + "learning_rate": 4.491685182984949e-05, + "loss": 0.6671, + "num_input_tokens_seen": 340583488, + "step": 1887 + }, + { + "epoch": 0.20668326993075892, + "grad_norm": 1.185059297904869, + "learning_rate": 4.4911653565873524e-05, + "loss": 0.5914, + "num_input_tokens_seen": 340788448, + "step": 1888 + }, + { + "epoch": 0.2067927420016968, + "grad_norm": 1.2621625468164477, + "learning_rate": 4.4906452946386995e-05, + "loss": 0.6703, + "num_input_tokens_seen": 340983328, + "step": 1889 + }, + { + "epoch": 0.20690221407263473, + "grad_norm": 1.196842336582463, + "learning_rate": 4.490124997200514e-05, + "loss": 0.6257, + "num_input_tokens_seen": 341184256, + "step": 1890 + }, + { + "epoch": 0.20701168614357263, + "grad_norm": 1.385871994035939, + "learning_rate": 4.489604464334346e-05, + "loss": 0.7084, + "num_input_tokens_seen": 341379360, + "step": 1891 + }, + { + "epoch": 0.20712115821451052, + "grad_norm": 1.2164829394127663, + "learning_rate": 4.489083696101773e-05, + "loss": 0.6465, + "num_input_tokens_seen": 341516224, + "step": 1892 + }, + { + "epoch": 0.20723063028544841, + "grad_norm": 1.3441101301348608, + "learning_rate": 4.4885626925644016e-05, + "loss": 0.7529, + "num_input_tokens_seen": 341673920, + "step": 1893 + }, + { + "epoch": 0.20734010235638634, + "grad_norm": 1.318819319001496, + "learning_rate": 4.4880414537838643e-05, + "loss": 0.7849, + "num_input_tokens_seen": 341837888, + "step": 1894 + }, + { + "epoch": 0.20744957442732423, + "grad_norm": 1.2064902071375563, + "learning_rate": 4.487519979821824e-05, + "loss": 0.6271, + "num_input_tokens_seen": 342005664, + "step": 1895 + }, + { + "epoch": 0.20755904649826212, + "grad_norm": 1.4366731338768453, + "learning_rate": 4.486998270739971e-05, + "loss": 1.0387, + "num_input_tokens_seen": 342178592, + "step": 1896 + }, + { + "epoch": 0.20766851856920004, + "grad_norm": 1.2134134882521663, + "learning_rate": 4.486476326600019e-05, + "loss": 0.8455, + "num_input_tokens_seen": 342353312, + "step": 1897 + }, + { + "epoch": 0.20777799064013794, + "grad_norm": 1.2129641401583389, + "learning_rate": 4.4859541474637153e-05, + "loss": 0.7172, + "num_input_tokens_seen": 342528704, + "step": 1898 + }, + { + "epoch": 0.20788746271107583, + "grad_norm": 1.1453582067593862, + "learning_rate": 4.4854317333928335e-05, + "loss": 0.6398, + "num_input_tokens_seen": 342722464, + "step": 1899 + }, + { + "epoch": 0.20799693478201373, + "grad_norm": 1.1517412065121027, + "learning_rate": 4.484909084449172e-05, + "loss": 0.6009, + "num_input_tokens_seen": 342896960, + "step": 1900 + }, + { + "epoch": 0.20810640685295165, + "grad_norm": 1.2380116647592938, + "learning_rate": 4.484386200694561e-05, + "loss": 0.8514, + "num_input_tokens_seen": 343096544, + "step": 1901 + }, + { + "epoch": 0.20821587892388954, + "grad_norm": 1.250549195759857, + "learning_rate": 4.4838630821908564e-05, + "loss": 0.6413, + "num_input_tokens_seen": 343256704, + "step": 1902 + }, + { + "epoch": 0.20832535099482744, + "grad_norm": 1.2710020495292411, + "learning_rate": 4.483339728999941e-05, + "loss": 0.7912, + "num_input_tokens_seen": 343473536, + "step": 1903 + }, + { + "epoch": 0.20843482306576536, + "grad_norm": 1.3000071286959696, + "learning_rate": 4.482816141183728e-05, + "loss": 0.7125, + "num_input_tokens_seen": 343610624, + "step": 1904 + }, + { + "epoch": 0.20854429513670325, + "grad_norm": 1.3547220069511896, + "learning_rate": 4.4822923188041555e-05, + "loss": 0.7315, + "num_input_tokens_seen": 343779296, + "step": 1905 + }, + { + "epoch": 0.20865376720764114, + "grad_norm": 1.2980917312994649, + "learning_rate": 4.481768261923191e-05, + "loss": 0.7621, + "num_input_tokens_seen": 343981120, + "step": 1906 + }, + { + "epoch": 0.20876323927857907, + "grad_norm": 1.241061822201285, + "learning_rate": 4.48124397060283e-05, + "loss": 0.5179, + "num_input_tokens_seen": 344165920, + "step": 1907 + }, + { + "epoch": 0.20887271134951696, + "grad_norm": 1.2647722927405276, + "learning_rate": 4.4807194449050936e-05, + "loss": 0.7012, + "num_input_tokens_seen": 344377376, + "step": 1908 + }, + { + "epoch": 0.20898218342045485, + "grad_norm": 1.3358976123945245, + "learning_rate": 4.480194684892035e-05, + "loss": 0.7162, + "num_input_tokens_seen": 344533952, + "step": 1909 + }, + { + "epoch": 0.20909165549139275, + "grad_norm": 1.4008256776469257, + "learning_rate": 4.4796696906257294e-05, + "loss": 0.8421, + "num_input_tokens_seen": 344714272, + "step": 1910 + }, + { + "epoch": 0.20920112756233067, + "grad_norm": 1.3611933068133595, + "learning_rate": 4.479144462168284e-05, + "loss": 0.6532, + "num_input_tokens_seen": 344885632, + "step": 1911 + }, + { + "epoch": 0.20931059963326856, + "grad_norm": 1.2296936015616324, + "learning_rate": 4.478618999581833e-05, + "loss": 0.7443, + "num_input_tokens_seen": 345076032, + "step": 1912 + }, + { + "epoch": 0.20942007170420646, + "grad_norm": 1.2401906884376446, + "learning_rate": 4.4780933029285365e-05, + "loss": 0.5801, + "num_input_tokens_seen": 345253664, + "step": 1913 + }, + { + "epoch": 0.20952954377514438, + "grad_norm": 1.1979047304747412, + "learning_rate": 4.4775673722705836e-05, + "loss": 0.7066, + "num_input_tokens_seen": 345447200, + "step": 1914 + }, + { + "epoch": 0.20963901584608227, + "grad_norm": 1.3522013659258894, + "learning_rate": 4.47704120767019e-05, + "loss": 0.8532, + "num_input_tokens_seen": 345657984, + "step": 1915 + }, + { + "epoch": 0.20974848791702017, + "grad_norm": 1.3069702553753755, + "learning_rate": 4.476514809189603e-05, + "loss": 0.7004, + "num_input_tokens_seen": 345868992, + "step": 1916 + }, + { + "epoch": 0.20985795998795806, + "grad_norm": 1.361492962483161, + "learning_rate": 4.4759881768910915e-05, + "loss": 0.8298, + "num_input_tokens_seen": 346066112, + "step": 1917 + }, + { + "epoch": 0.20996743205889598, + "grad_norm": 1.3849979523185714, + "learning_rate": 4.475461310836957e-05, + "loss": 0.6776, + "num_input_tokens_seen": 346249120, + "step": 1918 + }, + { + "epoch": 0.21007690412983387, + "grad_norm": 1.3719919419900024, + "learning_rate": 4.4749342110895244e-05, + "loss": 0.8439, + "num_input_tokens_seen": 346409280, + "step": 1919 + }, + { + "epoch": 0.21018637620077177, + "grad_norm": 1.2549720583993, + "learning_rate": 4.4744068777111506e-05, + "loss": 0.5991, + "num_input_tokens_seen": 346559136, + "step": 1920 + }, + { + "epoch": 0.2102958482717097, + "grad_norm": 1.1617133996631217, + "learning_rate": 4.4738793107642174e-05, + "loss": 0.5326, + "num_input_tokens_seen": 346733856, + "step": 1921 + }, + { + "epoch": 0.21040532034264758, + "grad_norm": 1.2241579528785842, + "learning_rate": 4.4733515103111356e-05, + "loss": 0.6439, + "num_input_tokens_seen": 346907232, + "step": 1922 + }, + { + "epoch": 0.21051479241358548, + "grad_norm": 1.3772741498147214, + "learning_rate": 4.472823476414343e-05, + "loss": 0.859, + "num_input_tokens_seen": 347088448, + "step": 1923 + }, + { + "epoch": 0.2106242644845234, + "grad_norm": 1.2441564346339018, + "learning_rate": 4.4722952091363034e-05, + "loss": 0.7572, + "num_input_tokens_seen": 347292288, + "step": 1924 + }, + { + "epoch": 0.2107337365554613, + "grad_norm": 1.3296601791354266, + "learning_rate": 4.471766708539512e-05, + "loss": 0.8705, + "num_input_tokens_seen": 347478880, + "step": 1925 + }, + { + "epoch": 0.2108432086263992, + "grad_norm": 1.5513949393064579, + "learning_rate": 4.4712379746864876e-05, + "loss": 0.8501, + "num_input_tokens_seen": 347609472, + "step": 1926 + }, + { + "epoch": 0.21095268069733708, + "grad_norm": 1.279963056359019, + "learning_rate": 4.4707090076397795e-05, + "loss": 0.8559, + "num_input_tokens_seen": 347778368, + "step": 1927 + }, + { + "epoch": 0.211062152768275, + "grad_norm": 1.3571650200303327, + "learning_rate": 4.4701798074619626e-05, + "loss": 0.6504, + "num_input_tokens_seen": 347960256, + "step": 1928 + }, + { + "epoch": 0.2111716248392129, + "grad_norm": 1.341442833269959, + "learning_rate": 4.4696503742156414e-05, + "loss": 0.6153, + "num_input_tokens_seen": 348138336, + "step": 1929 + }, + { + "epoch": 0.2112810969101508, + "grad_norm": 1.6587807376086285, + "learning_rate": 4.469120707963447e-05, + "loss": 0.8939, + "num_input_tokens_seen": 348288416, + "step": 1930 + }, + { + "epoch": 0.2113905689810887, + "grad_norm": 1.2099746321548952, + "learning_rate": 4.468590808768036e-05, + "loss": 0.5701, + "num_input_tokens_seen": 348462464, + "step": 1931 + }, + { + "epoch": 0.2115000410520266, + "grad_norm": 1.2984852732696168, + "learning_rate": 4.4680606766920954e-05, + "loss": 0.6949, + "num_input_tokens_seen": 348642112, + "step": 1932 + }, + { + "epoch": 0.2116095131229645, + "grad_norm": 1.3562438683437141, + "learning_rate": 4.46753031179834e-05, + "loss": 0.9169, + "num_input_tokens_seen": 348832736, + "step": 1933 + }, + { + "epoch": 0.2117189851939024, + "grad_norm": 1.346195169747301, + "learning_rate": 4.4669997141495095e-05, + "loss": 0.688, + "num_input_tokens_seen": 349028512, + "step": 1934 + }, + { + "epoch": 0.21182845726484031, + "grad_norm": 1.1081775058585908, + "learning_rate": 4.466468883808373e-05, + "loss": 0.77, + "num_input_tokens_seen": 349212192, + "step": 1935 + }, + { + "epoch": 0.2119379293357782, + "grad_norm": 1.2301439613514404, + "learning_rate": 4.4659378208377276e-05, + "loss": 0.7475, + "num_input_tokens_seen": 349396768, + "step": 1936 + }, + { + "epoch": 0.2120474014067161, + "grad_norm": 1.3244182695176554, + "learning_rate": 4.465406525300395e-05, + "loss": 0.7057, + "num_input_tokens_seen": 349573952, + "step": 1937 + }, + { + "epoch": 0.21215687347765402, + "grad_norm": 1.322851369653043, + "learning_rate": 4.4648749972592286e-05, + "loss": 0.9661, + "num_input_tokens_seen": 349761664, + "step": 1938 + }, + { + "epoch": 0.21226634554859192, + "grad_norm": 1.240921094714402, + "learning_rate": 4.464343236777106e-05, + "loss": 0.6221, + "num_input_tokens_seen": 349921152, + "step": 1939 + }, + { + "epoch": 0.2123758176195298, + "grad_norm": 1.2933141241334862, + "learning_rate": 4.463811243916933e-05, + "loss": 0.8982, + "num_input_tokens_seen": 350095424, + "step": 1940 + }, + { + "epoch": 0.21248528969046773, + "grad_norm": 1.3106040885250285, + "learning_rate": 4.463279018741645e-05, + "loss": 0.8376, + "num_input_tokens_seen": 350275744, + "step": 1941 + }, + { + "epoch": 0.21259476176140563, + "grad_norm": 1.2323314796824534, + "learning_rate": 4.4627465613142014e-05, + "loss": 0.6347, + "num_input_tokens_seen": 350449792, + "step": 1942 + }, + { + "epoch": 0.21270423383234352, + "grad_norm": 1.1340977185379006, + "learning_rate": 4.462213871697592e-05, + "loss": 0.6734, + "num_input_tokens_seen": 350644000, + "step": 1943 + }, + { + "epoch": 0.2128137059032814, + "grad_norm": 1.290215271575339, + "learning_rate": 4.4616809499548334e-05, + "loss": 0.798, + "num_input_tokens_seen": 350821856, + "step": 1944 + }, + { + "epoch": 0.21292317797421934, + "grad_norm": 1.2489489428008367, + "learning_rate": 4.461147796148968e-05, + "loss": 0.74, + "num_input_tokens_seen": 350984928, + "step": 1945 + }, + { + "epoch": 0.21303265004515723, + "grad_norm": 1.2992000190820598, + "learning_rate": 4.460614410343067e-05, + "loss": 0.6716, + "num_input_tokens_seen": 351154496, + "step": 1946 + }, + { + "epoch": 0.21314212211609512, + "grad_norm": 1.3648879412531802, + "learning_rate": 4.46008079260023e-05, + "loss": 0.703, + "num_input_tokens_seen": 351318240, + "step": 1947 + }, + { + "epoch": 0.21325159418703304, + "grad_norm": 1.3888530800759664, + "learning_rate": 4.4595469429835826e-05, + "loss": 0.6723, + "num_input_tokens_seen": 351494752, + "step": 1948 + }, + { + "epoch": 0.21336106625797094, + "grad_norm": 1.2049402192035288, + "learning_rate": 4.4590128615562765e-05, + "loss": 0.7073, + "num_input_tokens_seen": 351673952, + "step": 1949 + }, + { + "epoch": 0.21347053832890883, + "grad_norm": 1.1264142048618757, + "learning_rate": 4.458478548381495e-05, + "loss": 0.6136, + "num_input_tokens_seen": 351849120, + "step": 1950 + }, + { + "epoch": 0.21358001039984673, + "grad_norm": 1.136857748545596, + "learning_rate": 4.4579440035224446e-05, + "loss": 0.4825, + "num_input_tokens_seen": 352012864, + "step": 1951 + }, + { + "epoch": 0.21368948247078465, + "grad_norm": 1.236563613424506, + "learning_rate": 4.457409227042362e-05, + "loss": 0.7573, + "num_input_tokens_seen": 352215584, + "step": 1952 + }, + { + "epoch": 0.21379895454172254, + "grad_norm": 1.1725961627852526, + "learning_rate": 4.456874219004509e-05, + "loss": 0.514, + "num_input_tokens_seen": 352405536, + "step": 1953 + }, + { + "epoch": 0.21390842661266044, + "grad_norm": 1.2421680664845567, + "learning_rate": 4.4563389794721776e-05, + "loss": 0.7939, + "num_input_tokens_seen": 352610272, + "step": 1954 + }, + { + "epoch": 0.21401789868359836, + "grad_norm": 1.1934106694012583, + "learning_rate": 4.455803508508685e-05, + "loss": 0.8915, + "num_input_tokens_seen": 352784544, + "step": 1955 + }, + { + "epoch": 0.21412737075453625, + "grad_norm": 1.3952826604610669, + "learning_rate": 4.455267806177376e-05, + "loss": 0.8095, + "num_input_tokens_seen": 352955680, + "step": 1956 + }, + { + "epoch": 0.21423684282547414, + "grad_norm": 1.3026941113061945, + "learning_rate": 4.454731872541622e-05, + "loss": 0.76, + "num_input_tokens_seen": 353110240, + "step": 1957 + }, + { + "epoch": 0.21434631489641207, + "grad_norm": 1.2917391182463467, + "learning_rate": 4.454195707664825e-05, + "loss": 0.8617, + "num_input_tokens_seen": 353297952, + "step": 1958 + }, + { + "epoch": 0.21445578696734996, + "grad_norm": 1.3081590528288862, + "learning_rate": 4.4536593116104125e-05, + "loss": 0.658, + "num_input_tokens_seen": 353442880, + "step": 1959 + }, + { + "epoch": 0.21456525903828785, + "grad_norm": 1.23044535296766, + "learning_rate": 4.453122684441837e-05, + "loss": 0.6638, + "num_input_tokens_seen": 353624320, + "step": 1960 + }, + { + "epoch": 0.21467473110922575, + "grad_norm": 1.2540860626493833, + "learning_rate": 4.452585826222583e-05, + "loss": 0.723, + "num_input_tokens_seen": 353829056, + "step": 1961 + }, + { + "epoch": 0.21478420318016367, + "grad_norm": 1.2246169678667125, + "learning_rate": 4.4520487370161576e-05, + "loss": 0.7129, + "num_input_tokens_seen": 354013632, + "step": 1962 + }, + { + "epoch": 0.21489367525110156, + "grad_norm": 1.336567958618177, + "learning_rate": 4.451511416886099e-05, + "loss": 0.8509, + "num_input_tokens_seen": 354243008, + "step": 1963 + }, + { + "epoch": 0.21500314732203946, + "grad_norm": 1.204569061434021, + "learning_rate": 4.45097386589597e-05, + "loss": 0.7095, + "num_input_tokens_seen": 354410336, + "step": 1964 + }, + { + "epoch": 0.21511261939297738, + "grad_norm": 1.3564819594637845, + "learning_rate": 4.450436084109362e-05, + "loss": 0.8761, + "num_input_tokens_seen": 354592448, + "step": 1965 + }, + { + "epoch": 0.21522209146391527, + "grad_norm": 1.2281184441389061, + "learning_rate": 4.449898071589894e-05, + "loss": 0.6093, + "num_input_tokens_seen": 354762240, + "step": 1966 + }, + { + "epoch": 0.21533156353485317, + "grad_norm": 1.3423688133261293, + "learning_rate": 4.449359828401212e-05, + "loss": 0.7803, + "num_input_tokens_seen": 354939648, + "step": 1967 + }, + { + "epoch": 0.21544103560579106, + "grad_norm": 1.2020261130551073, + "learning_rate": 4.4488213546069884e-05, + "loss": 0.8137, + "num_input_tokens_seen": 355115488, + "step": 1968 + }, + { + "epoch": 0.21555050767672898, + "grad_norm": 1.2956611743814193, + "learning_rate": 4.448282650270924e-05, + "loss": 0.6437, + "num_input_tokens_seen": 355254816, + "step": 1969 + }, + { + "epoch": 0.21565997974766687, + "grad_norm": 1.2288692189898307, + "learning_rate": 4.447743715456747e-05, + "loss": 0.6666, + "num_input_tokens_seen": 355415872, + "step": 1970 + }, + { + "epoch": 0.21576945181860477, + "grad_norm": 1.2793626608204616, + "learning_rate": 4.4472045502282115e-05, + "loss": 0.7389, + "num_input_tokens_seen": 355606720, + "step": 1971 + }, + { + "epoch": 0.2158789238895427, + "grad_norm": 1.6983961898707067, + "learning_rate": 4.4466651546491006e-05, + "loss": 1.0017, + "num_input_tokens_seen": 355809664, + "step": 1972 + }, + { + "epoch": 0.21598839596048058, + "grad_norm": 1.2681127046954452, + "learning_rate": 4.4461255287832235e-05, + "loss": 0.8947, + "num_input_tokens_seen": 355991552, + "step": 1973 + }, + { + "epoch": 0.21609786803141848, + "grad_norm": 1.1753898805696432, + "learning_rate": 4.4455856726944155e-05, + "loss": 0.806, + "num_input_tokens_seen": 356207040, + "step": 1974 + }, + { + "epoch": 0.2162073401023564, + "grad_norm": 1.2428344865043057, + "learning_rate": 4.445045586446543e-05, + "loss": 0.8259, + "num_input_tokens_seen": 356398784, + "step": 1975 + }, + { + "epoch": 0.2163168121732943, + "grad_norm": 1.1907476887605426, + "learning_rate": 4.4445052701034955e-05, + "loss": 0.6571, + "num_input_tokens_seen": 356596800, + "step": 1976 + }, + { + "epoch": 0.2164262842442322, + "grad_norm": 1.236362065094954, + "learning_rate": 4.443964723729191e-05, + "loss": 0.9279, + "num_input_tokens_seen": 356788096, + "step": 1977 + }, + { + "epoch": 0.21653575631517008, + "grad_norm": 1.2550526544840037, + "learning_rate": 4.443423947387577e-05, + "loss": 0.8563, + "num_input_tokens_seen": 356972672, + "step": 1978 + }, + { + "epoch": 0.216645228386108, + "grad_norm": 1.1415879874486934, + "learning_rate": 4.4428829411426254e-05, + "loss": 0.6347, + "num_input_tokens_seen": 357163968, + "step": 1979 + }, + { + "epoch": 0.2167547004570459, + "grad_norm": 1.2429546170776897, + "learning_rate": 4.442341705058335e-05, + "loss": 0.7792, + "num_input_tokens_seen": 357355936, + "step": 1980 + }, + { + "epoch": 0.2168641725279838, + "grad_norm": 1.33125649571513, + "learning_rate": 4.4418002391987345e-05, + "loss": 0.8749, + "num_input_tokens_seen": 357555072, + "step": 1981 + }, + { + "epoch": 0.2169736445989217, + "grad_norm": 1.3130102982099041, + "learning_rate": 4.441258543627879e-05, + "loss": 0.7566, + "num_input_tokens_seen": 357720608, + "step": 1982 + }, + { + "epoch": 0.2170831166698596, + "grad_norm": 1.3165530793572569, + "learning_rate": 4.440716618409847e-05, + "loss": 0.8343, + "num_input_tokens_seen": 357904512, + "step": 1983 + }, + { + "epoch": 0.2171925887407975, + "grad_norm": 1.213964702344433, + "learning_rate": 4.4401744636087495e-05, + "loss": 0.6336, + "num_input_tokens_seen": 358073632, + "step": 1984 + }, + { + "epoch": 0.2173020608117354, + "grad_norm": 1.414865760905842, + "learning_rate": 4.439632079288722e-05, + "loss": 0.7854, + "num_input_tokens_seen": 358283744, + "step": 1985 + }, + { + "epoch": 0.21741153288267331, + "grad_norm": 1.3112428639147744, + "learning_rate": 4.439089465513928e-05, + "loss": 0.6696, + "num_input_tokens_seen": 358437856, + "step": 1986 + }, + { + "epoch": 0.2175210049536112, + "grad_norm": 1.3089071325577157, + "learning_rate": 4.438546622348557e-05, + "loss": 0.8466, + "num_input_tokens_seen": 358619072, + "step": 1987 + }, + { + "epoch": 0.2176304770245491, + "grad_norm": 1.2890701756104341, + "learning_rate": 4.438003549856826e-05, + "loss": 0.5911, + "num_input_tokens_seen": 358778112, + "step": 1988 + }, + { + "epoch": 0.21773994909548702, + "grad_norm": 1.340752829835186, + "learning_rate": 4.4374602481029807e-05, + "loss": 0.858, + "num_input_tokens_seen": 358991136, + "step": 1989 + }, + { + "epoch": 0.21784942116642492, + "grad_norm": 1.156582037186851, + "learning_rate": 4.43691671715129e-05, + "loss": 0.6973, + "num_input_tokens_seen": 359128672, + "step": 1990 + }, + { + "epoch": 0.2179588932373628, + "grad_norm": 1.265981459163956, + "learning_rate": 4.436372957066056e-05, + "loss": 0.6666, + "num_input_tokens_seen": 359299360, + "step": 1991 + }, + { + "epoch": 0.21806836530830073, + "grad_norm": 1.3168868367695583, + "learning_rate": 4.4358289679116026e-05, + "loss": 0.7876, + "num_input_tokens_seen": 359490208, + "step": 1992 + }, + { + "epoch": 0.21817783737923863, + "grad_norm": 1.2270958099421776, + "learning_rate": 4.435284749752283e-05, + "loss": 0.6753, + "num_input_tokens_seen": 359661120, + "step": 1993 + }, + { + "epoch": 0.21828730945017652, + "grad_norm": 1.5737881888073026, + "learning_rate": 4.434740302652477e-05, + "loss": 0.8226, + "num_input_tokens_seen": 359840096, + "step": 1994 + }, + { + "epoch": 0.2183967815211144, + "grad_norm": 1.1710525678502657, + "learning_rate": 4.434195626676592e-05, + "loss": 0.569, + "num_input_tokens_seen": 360019520, + "step": 1995 + }, + { + "epoch": 0.21850625359205234, + "grad_norm": 1.2646258613603854, + "learning_rate": 4.4336507218890624e-05, + "loss": 0.688, + "num_input_tokens_seen": 360202528, + "step": 1996 + }, + { + "epoch": 0.21861572566299023, + "grad_norm": 1.3055623028402725, + "learning_rate": 4.433105588354348e-05, + "loss": 0.7879, + "num_input_tokens_seen": 360382848, + "step": 1997 + }, + { + "epoch": 0.21872519773392812, + "grad_norm": 1.186013244702907, + "learning_rate": 4.4325602261369384e-05, + "loss": 0.8162, + "num_input_tokens_seen": 360595200, + "step": 1998 + }, + { + "epoch": 0.21883466980486604, + "grad_norm": 1.2035009118913873, + "learning_rate": 4.432014635301348e-05, + "loss": 0.4785, + "num_input_tokens_seen": 360760960, + "step": 1999 + }, + { + "epoch": 0.21894414187580394, + "grad_norm": 1.3128403267820437, + "learning_rate": 4.43146881591212e-05, + "loss": 0.8412, + "num_input_tokens_seen": 360958752, + "step": 2000 + }, + { + "epoch": 0.21905361394674183, + "grad_norm": 1.6039577529746962, + "learning_rate": 4.430922768033824e-05, + "loss": 1.052, + "num_input_tokens_seen": 361143552, + "step": 2001 + }, + { + "epoch": 0.21916308601767973, + "grad_norm": 1.1411288990393318, + "learning_rate": 4.4303764917310555e-05, + "loss": 0.711, + "num_input_tokens_seen": 361340224, + "step": 2002 + }, + { + "epoch": 0.21927255808861765, + "grad_norm": 1.3061230009116436, + "learning_rate": 4.429829987068438e-05, + "loss": 0.6528, + "num_input_tokens_seen": 361512032, + "step": 2003 + }, + { + "epoch": 0.21938203015955554, + "grad_norm": 1.3159784363281475, + "learning_rate": 4.4292832541106214e-05, + "loss": 0.8474, + "num_input_tokens_seen": 361712288, + "step": 2004 + }, + { + "epoch": 0.21949150223049343, + "grad_norm": 1.2990301218923364, + "learning_rate": 4.428736292922285e-05, + "loss": 0.8661, + "num_input_tokens_seen": 361905600, + "step": 2005 + }, + { + "epoch": 0.21960097430143136, + "grad_norm": 1.2791547157565992, + "learning_rate": 4.428189103568132e-05, + "loss": 0.6566, + "num_input_tokens_seen": 362109216, + "step": 2006 + }, + { + "epoch": 0.21971044637236925, + "grad_norm": 1.2151180828467785, + "learning_rate": 4.427641686112894e-05, + "loss": 0.6232, + "num_input_tokens_seen": 362308576, + "step": 2007 + }, + { + "epoch": 0.21981991844330714, + "grad_norm": 1.2985610113605868, + "learning_rate": 4.4270940406213304e-05, + "loss": 0.6635, + "num_input_tokens_seen": 362481952, + "step": 2008 + }, + { + "epoch": 0.21992939051424507, + "grad_norm": 1.3112984561603587, + "learning_rate": 4.4265461671582254e-05, + "loss": 0.6922, + "num_input_tokens_seen": 362630912, + "step": 2009 + }, + { + "epoch": 0.22003886258518296, + "grad_norm": 1.320706100031772, + "learning_rate": 4.4259980657883916e-05, + "loss": 1.0244, + "num_input_tokens_seen": 362855136, + "step": 2010 + }, + { + "epoch": 0.22014833465612085, + "grad_norm": 1.1695522940965075, + "learning_rate": 4.425449736576668e-05, + "loss": 0.7141, + "num_input_tokens_seen": 363052256, + "step": 2011 + }, + { + "epoch": 0.22025780672705875, + "grad_norm": 1.2660607423779093, + "learning_rate": 4.424901179587922e-05, + "loss": 0.8426, + "num_input_tokens_seen": 363216000, + "step": 2012 + }, + { + "epoch": 0.22036727879799667, + "grad_norm": 1.2762190040314065, + "learning_rate": 4.4243523948870465e-05, + "loss": 0.6569, + "num_input_tokens_seen": 363372352, + "step": 2013 + }, + { + "epoch": 0.22047675086893456, + "grad_norm": 1.2131255531997638, + "learning_rate": 4.4238033825389605e-05, + "loss": 0.6161, + "num_input_tokens_seen": 363582464, + "step": 2014 + }, + { + "epoch": 0.22058622293987246, + "grad_norm": 1.2390107133200514, + "learning_rate": 4.423254142608613e-05, + "loss": 0.6432, + "num_input_tokens_seen": 363736352, + "step": 2015 + }, + { + "epoch": 0.22069569501081038, + "grad_norm": 1.0665482087718874, + "learning_rate": 4.422704675160976e-05, + "loss": 0.6241, + "num_input_tokens_seen": 363946688, + "step": 2016 + }, + { + "epoch": 0.22080516708174827, + "grad_norm": 1.315577214838842, + "learning_rate": 4.422154980261053e-05, + "loss": 0.6576, + "num_input_tokens_seen": 364140896, + "step": 2017 + }, + { + "epoch": 0.22091463915268617, + "grad_norm": 1.2099495087484862, + "learning_rate": 4.4216050579738685e-05, + "loss": 0.7891, + "num_input_tokens_seen": 364314272, + "step": 2018 + }, + { + "epoch": 0.22102411122362406, + "grad_norm": 1.2522125213461284, + "learning_rate": 4.42105490836448e-05, + "loss": 0.6892, + "num_input_tokens_seen": 364537376, + "step": 2019 + }, + { + "epoch": 0.22113358329456198, + "grad_norm": 1.3871285706042964, + "learning_rate": 4.420504531497969e-05, + "loss": 0.7486, + "num_input_tokens_seen": 364732480, + "step": 2020 + }, + { + "epoch": 0.22124305536549987, + "grad_norm": 1.2576103758860755, + "learning_rate": 4.419953927439443e-05, + "loss": 0.6361, + "num_input_tokens_seen": 364914144, + "step": 2021 + }, + { + "epoch": 0.22135252743643777, + "grad_norm": 1.241518668287497, + "learning_rate": 4.419403096254037e-05, + "loss": 0.5429, + "num_input_tokens_seen": 365098720, + "step": 2022 + }, + { + "epoch": 0.2214619995073757, + "grad_norm": 1.2380847474149805, + "learning_rate": 4.4188520380069145e-05, + "loss": 0.7161, + "num_input_tokens_seen": 365254400, + "step": 2023 + }, + { + "epoch": 0.22157147157831358, + "grad_norm": 1.2480451556986396, + "learning_rate": 4.418300752763264e-05, + "loss": 0.7134, + "num_input_tokens_seen": 365401120, + "step": 2024 + }, + { + "epoch": 0.22168094364925148, + "grad_norm": 1.3397469888786053, + "learning_rate": 4.4177492405883016e-05, + "loss": 0.6245, + "num_input_tokens_seen": 365595776, + "step": 2025 + }, + { + "epoch": 0.2217904157201894, + "grad_norm": 1.2476404808925396, + "learning_rate": 4.4171975015472705e-05, + "loss": 0.5532, + "num_input_tokens_seen": 365770048, + "step": 2026 + }, + { + "epoch": 0.2218998877911273, + "grad_norm": 1.426647602396594, + "learning_rate": 4.4166455357054394e-05, + "loss": 0.6425, + "num_input_tokens_seen": 365929312, + "step": 2027 + }, + { + "epoch": 0.2220093598620652, + "grad_norm": 1.2953606461986147, + "learning_rate": 4.416093343128106e-05, + "loss": 0.5842, + "num_input_tokens_seen": 366089248, + "step": 2028 + }, + { + "epoch": 0.22211883193300308, + "grad_norm": 1.3663041667199873, + "learning_rate": 4.415540923880593e-05, + "loss": 0.8142, + "num_input_tokens_seen": 366283232, + "step": 2029 + }, + { + "epoch": 0.222228304003941, + "grad_norm": 1.5335191534616754, + "learning_rate": 4.41498827802825e-05, + "loss": 1.0114, + "num_input_tokens_seen": 366466240, + "step": 2030 + }, + { + "epoch": 0.2223377760748789, + "grad_norm": 1.4035567878783233, + "learning_rate": 4.414435405636455e-05, + "loss": 0.8991, + "num_input_tokens_seen": 366662464, + "step": 2031 + }, + { + "epoch": 0.2224472481458168, + "grad_norm": 1.3305338836334932, + "learning_rate": 4.4138823067706116e-05, + "loss": 0.6525, + "num_input_tokens_seen": 366836960, + "step": 2032 + }, + { + "epoch": 0.2225567202167547, + "grad_norm": 1.3098259433836064, + "learning_rate": 4.413328981496149e-05, + "loss": 0.5975, + "num_input_tokens_seen": 366969120, + "step": 2033 + }, + { + "epoch": 0.2226661922876926, + "grad_norm": 1.3787472554903217, + "learning_rate": 4.412775429878527e-05, + "loss": 0.7552, + "num_input_tokens_seen": 367142496, + "step": 2034 + }, + { + "epoch": 0.2227756643586305, + "grad_norm": 1.1571819055366903, + "learning_rate": 4.412221651983227e-05, + "loss": 0.6497, + "num_input_tokens_seen": 367314528, + "step": 2035 + }, + { + "epoch": 0.22288513642956842, + "grad_norm": 1.525905468518804, + "learning_rate": 4.4116676478757616e-05, + "loss": 0.7986, + "num_input_tokens_seen": 367483200, + "step": 2036 + }, + { + "epoch": 0.22299460850050631, + "grad_norm": 1.2284774410457298, + "learning_rate": 4.4111134176216685e-05, + "loss": 0.9344, + "num_input_tokens_seen": 367675392, + "step": 2037 + }, + { + "epoch": 0.2231040805714442, + "grad_norm": 1.1985462800831257, + "learning_rate": 4.410558961286511e-05, + "loss": 0.803, + "num_input_tokens_seen": 367848544, + "step": 2038 + }, + { + "epoch": 0.2232135526423821, + "grad_norm": 1.1603599395218414, + "learning_rate": 4.41000427893588e-05, + "loss": 0.4954, + "num_input_tokens_seen": 368026624, + "step": 2039 + }, + { + "epoch": 0.22332302471332002, + "grad_norm": 1.304950801755078, + "learning_rate": 4.409449370635395e-05, + "loss": 0.79, + "num_input_tokens_seen": 368216128, + "step": 2040 + }, + { + "epoch": 0.22343249678425792, + "grad_norm": 1.1938646311421013, + "learning_rate": 4.4088942364506994e-05, + "loss": 0.8221, + "num_input_tokens_seen": 368432512, + "step": 2041 + }, + { + "epoch": 0.2235419688551958, + "grad_norm": 1.093368354697796, + "learning_rate": 4.408338876447465e-05, + "loss": 0.7129, + "num_input_tokens_seen": 368652032, + "step": 2042 + }, + { + "epoch": 0.22365144092613373, + "grad_norm": 1.1968018970126402, + "learning_rate": 4.4077832906913895e-05, + "loss": 0.7354, + "num_input_tokens_seen": 368857440, + "step": 2043 + }, + { + "epoch": 0.22376091299707163, + "grad_norm": 1.4858470644541932, + "learning_rate": 4.407227479248198e-05, + "loss": 1.0417, + "num_input_tokens_seen": 369048512, + "step": 2044 + }, + { + "epoch": 0.22387038506800952, + "grad_norm": 1.2020078649692, + "learning_rate": 4.406671442183642e-05, + "loss": 0.8381, + "num_input_tokens_seen": 369241600, + "step": 2045 + }, + { + "epoch": 0.2239798571389474, + "grad_norm": 1.1881428534517366, + "learning_rate": 4.4061151795634985e-05, + "loss": 0.7005, + "num_input_tokens_seen": 369445440, + "step": 2046 + }, + { + "epoch": 0.22408932920988534, + "grad_norm": 1.2708081206441983, + "learning_rate": 4.405558691453574e-05, + "loss": 0.6792, + "num_input_tokens_seen": 369628448, + "step": 2047 + }, + { + "epoch": 0.22419880128082323, + "grad_norm": 1.2413650284987883, + "learning_rate": 4.4050019779196984e-05, + "loss": 0.6395, + "num_input_tokens_seen": 369781216, + "step": 2048 + }, + { + "epoch": 0.22430827335176112, + "grad_norm": 1.29413582029179, + "learning_rate": 4.4044450390277306e-05, + "loss": 0.7212, + "num_input_tokens_seen": 369962208, + "step": 2049 + }, + { + "epoch": 0.22441774542269904, + "grad_norm": 1.2452476283542708, + "learning_rate": 4.403887874843556e-05, + "loss": 0.8673, + "num_input_tokens_seen": 370166048, + "step": 2050 + }, + { + "epoch": 0.22452721749363694, + "grad_norm": 1.4821103136754281, + "learning_rate": 4.403330485433085e-05, + "loss": 0.7634, + "num_input_tokens_seen": 370328448, + "step": 2051 + }, + { + "epoch": 0.22463668956457483, + "grad_norm": 1.32614203125938, + "learning_rate": 4.4027728708622555e-05, + "loss": 0.7397, + "num_input_tokens_seen": 370497792, + "step": 2052 + }, + { + "epoch": 0.22474616163551275, + "grad_norm": 1.2553450271232638, + "learning_rate": 4.4022150311970335e-05, + "loss": 0.6868, + "num_input_tokens_seen": 370686176, + "step": 2053 + }, + { + "epoch": 0.22485563370645065, + "grad_norm": 1.259952596347952, + "learning_rate": 4.4016569665034105e-05, + "loss": 0.8084, + "num_input_tokens_seen": 370887552, + "step": 2054 + }, + { + "epoch": 0.22496510577738854, + "grad_norm": 1.3751753781920044, + "learning_rate": 4.401098676847402e-05, + "loss": 0.7042, + "num_input_tokens_seen": 371058912, + "step": 2055 + }, + { + "epoch": 0.22507457784832643, + "grad_norm": 1.3002546922906733, + "learning_rate": 4.400540162295056e-05, + "loss": 0.702, + "num_input_tokens_seen": 371208096, + "step": 2056 + }, + { + "epoch": 0.22518404991926436, + "grad_norm": 1.2061960732062293, + "learning_rate": 4.399981422912441e-05, + "loss": 0.5679, + "num_input_tokens_seen": 371367136, + "step": 2057 + }, + { + "epoch": 0.22529352199020225, + "grad_norm": 1.2822017391268608, + "learning_rate": 4.3994224587656556e-05, + "loss": 0.6447, + "num_input_tokens_seen": 371550368, + "step": 2058 + }, + { + "epoch": 0.22540299406114014, + "grad_norm": 1.1461159213157208, + "learning_rate": 4.398863269920825e-05, + "loss": 0.5606, + "num_input_tokens_seen": 371719040, + "step": 2059 + }, + { + "epoch": 0.22551246613207807, + "grad_norm": 1.2708629959063305, + "learning_rate": 4.398303856444099e-05, + "loss": 0.9015, + "num_input_tokens_seen": 371906304, + "step": 2060 + }, + { + "epoch": 0.22562193820301596, + "grad_norm": 1.1850681501018168, + "learning_rate": 4.397744218401657e-05, + "loss": 0.8008, + "num_input_tokens_seen": 372101408, + "step": 2061 + }, + { + "epoch": 0.22573141027395385, + "grad_norm": 1.3241206219979893, + "learning_rate": 4.3971843558597e-05, + "loss": 0.6843, + "num_input_tokens_seen": 372314880, + "step": 2062 + }, + { + "epoch": 0.22584088234489175, + "grad_norm": 1.1555975026582674, + "learning_rate": 4.396624268884462e-05, + "loss": 0.5991, + "num_input_tokens_seen": 372480640, + "step": 2063 + }, + { + "epoch": 0.22595035441582967, + "grad_norm": 1.4555189041120906, + "learning_rate": 4.396063957542198e-05, + "loss": 0.9296, + "num_input_tokens_seen": 372680000, + "step": 2064 + }, + { + "epoch": 0.22605982648676756, + "grad_norm": 1.3803226879418395, + "learning_rate": 4.3955034218991934e-05, + "loss": 0.8449, + "num_input_tokens_seen": 372882944, + "step": 2065 + }, + { + "epoch": 0.22616929855770546, + "grad_norm": 1.1550903691199836, + "learning_rate": 4.394942662021756e-05, + "loss": 0.7749, + "num_input_tokens_seen": 373067968, + "step": 2066 + }, + { + "epoch": 0.22627877062864338, + "grad_norm": 1.3493301831533453, + "learning_rate": 4.3943816779762256e-05, + "loss": 0.7659, + "num_input_tokens_seen": 373221184, + "step": 2067 + }, + { + "epoch": 0.22638824269958127, + "grad_norm": 1.340125674290003, + "learning_rate": 4.393820469828964e-05, + "loss": 0.9793, + "num_input_tokens_seen": 373432416, + "step": 2068 + }, + { + "epoch": 0.22649771477051917, + "grad_norm": 1.277782213277075, + "learning_rate": 4.39325903764636e-05, + "loss": 0.6913, + "num_input_tokens_seen": 373606688, + "step": 2069 + }, + { + "epoch": 0.2266071868414571, + "grad_norm": 1.3068245842365949, + "learning_rate": 4.392697381494832e-05, + "loss": 0.681, + "num_input_tokens_seen": 373793728, + "step": 2070 + }, + { + "epoch": 0.22671665891239498, + "grad_norm": 1.2733174366899058, + "learning_rate": 4.3921355014408226e-05, + "loss": 0.6642, + "num_input_tokens_seen": 373954112, + "step": 2071 + }, + { + "epoch": 0.22682613098333287, + "grad_norm": 1.3226218708615285, + "learning_rate": 4.3915733975508e-05, + "loss": 0.8819, + "num_input_tokens_seen": 374159968, + "step": 2072 + }, + { + "epoch": 0.22693560305427077, + "grad_norm": 1.2738096945329185, + "learning_rate": 4.39101106989126e-05, + "loss": 0.7061, + "num_input_tokens_seen": 374335584, + "step": 2073 + }, + { + "epoch": 0.2270450751252087, + "grad_norm": 1.2928524368384142, + "learning_rate": 4.3904485185287256e-05, + "loss": 0.7383, + "num_input_tokens_seen": 374519040, + "step": 2074 + }, + { + "epoch": 0.22715454719614658, + "grad_norm": 1.3477960486395308, + "learning_rate": 4.389885743529746e-05, + "loss": 0.8211, + "num_input_tokens_seen": 374700480, + "step": 2075 + }, + { + "epoch": 0.22726401926708448, + "grad_norm": 1.4026803380447395, + "learning_rate": 4.389322744960895e-05, + "loss": 0.7222, + "num_input_tokens_seen": 374862432, + "step": 2076 + }, + { + "epoch": 0.2273734913380224, + "grad_norm": 1.2762402542145663, + "learning_rate": 4.388759522888776e-05, + "loss": 0.7916, + "num_input_tokens_seen": 375050816, + "step": 2077 + }, + { + "epoch": 0.2274829634089603, + "grad_norm": 1.1766451579893678, + "learning_rate": 4.3881960773800154e-05, + "loss": 0.659, + "num_input_tokens_seen": 375220384, + "step": 2078 + }, + { + "epoch": 0.2275924354798982, + "grad_norm": 1.3065108846392777, + "learning_rate": 4.387632408501269e-05, + "loss": 0.8047, + "num_input_tokens_seen": 375421312, + "step": 2079 + }, + { + "epoch": 0.22770190755083608, + "grad_norm": 1.2887034985563852, + "learning_rate": 4.3870685163192165e-05, + "loss": 0.8133, + "num_input_tokens_seen": 375584384, + "step": 2080 + }, + { + "epoch": 0.227811379621774, + "grad_norm": 1.1491437543909009, + "learning_rate": 4.386504400900566e-05, + "loss": 0.6045, + "num_input_tokens_seen": 375797632, + "step": 2081 + }, + { + "epoch": 0.2279208516927119, + "grad_norm": 1.2148506679521929, + "learning_rate": 4.3859400623120515e-05, + "loss": 0.6957, + "num_input_tokens_seen": 375979744, + "step": 2082 + }, + { + "epoch": 0.2280303237636498, + "grad_norm": 1.3039643345004146, + "learning_rate": 4.3853755006204334e-05, + "loss": 0.8164, + "num_input_tokens_seen": 376154016, + "step": 2083 + }, + { + "epoch": 0.2281397958345877, + "grad_norm": 1.2608995169552213, + "learning_rate": 4.384810715892498e-05, + "loss": 0.6439, + "num_input_tokens_seen": 376335904, + "step": 2084 + }, + { + "epoch": 0.2282492679055256, + "grad_norm": 1.3093455873522604, + "learning_rate": 4.3842457081950575e-05, + "loss": 0.6883, + "num_input_tokens_seen": 376517568, + "step": 2085 + }, + { + "epoch": 0.2283587399764635, + "grad_norm": 1.331584463952241, + "learning_rate": 4.383680477594951e-05, + "loss": 0.6989, + "num_input_tokens_seen": 376665632, + "step": 2086 + }, + { + "epoch": 0.22846821204740142, + "grad_norm": 1.3471588986165082, + "learning_rate": 4.3831150241590464e-05, + "loss": 0.9081, + "num_input_tokens_seen": 376847072, + "step": 2087 + }, + { + "epoch": 0.2285776841183393, + "grad_norm": 1.1730572141432418, + "learning_rate": 4.382549347954233e-05, + "loss": 0.5016, + "num_input_tokens_seen": 377031872, + "step": 2088 + }, + { + "epoch": 0.2286871561892772, + "grad_norm": 1.4034415705674266, + "learning_rate": 4.381983449047432e-05, + "loss": 0.7132, + "num_input_tokens_seen": 377209728, + "step": 2089 + }, + { + "epoch": 0.2287966282602151, + "grad_norm": 1.243894687460368, + "learning_rate": 4.381417327505586e-05, + "loss": 0.7477, + "num_input_tokens_seen": 377408864, + "step": 2090 + }, + { + "epoch": 0.22890610033115302, + "grad_norm": 1.1176629821924433, + "learning_rate": 4.3808509833956666e-05, + "loss": 0.4857, + "num_input_tokens_seen": 377609344, + "step": 2091 + }, + { + "epoch": 0.22901557240209092, + "grad_norm": 1.306022691798272, + "learning_rate": 4.380284416784672e-05, + "loss": 0.6178, + "num_input_tokens_seen": 377783616, + "step": 2092 + }, + { + "epoch": 0.2291250444730288, + "grad_norm": 1.3357713830297655, + "learning_rate": 4.3797176277396245e-05, + "loss": 0.8217, + "num_input_tokens_seen": 377975360, + "step": 2093 + }, + { + "epoch": 0.22923451654396673, + "grad_norm": 1.4485346986817098, + "learning_rate": 4.3791506163275764e-05, + "loss": 0.8519, + "num_input_tokens_seen": 378153888, + "step": 2094 + }, + { + "epoch": 0.22934398861490463, + "grad_norm": 1.537950917932838, + "learning_rate": 4.378583382615601e-05, + "loss": 0.8439, + "num_input_tokens_seen": 378318528, + "step": 2095 + }, + { + "epoch": 0.22945346068584252, + "grad_norm": 1.1915424206570322, + "learning_rate": 4.378015926670804e-05, + "loss": 0.682, + "num_input_tokens_seen": 378505120, + "step": 2096 + }, + { + "epoch": 0.2295629327567804, + "grad_norm": 1.162405178345752, + "learning_rate": 4.377448248560313e-05, + "loss": 0.8057, + "num_input_tokens_seen": 378686560, + "step": 2097 + }, + { + "epoch": 0.22967240482771833, + "grad_norm": 1.307245590593551, + "learning_rate": 4.376880348351283e-05, + "loss": 0.7287, + "num_input_tokens_seen": 378836416, + "step": 2098 + }, + { + "epoch": 0.22978187689865623, + "grad_norm": 1.1523999733867596, + "learning_rate": 4.376312226110895e-05, + "loss": 0.5919, + "num_input_tokens_seen": 378994336, + "step": 2099 + }, + { + "epoch": 0.22989134896959412, + "grad_norm": 1.336734779212754, + "learning_rate": 4.375743881906359e-05, + "loss": 0.7046, + "num_input_tokens_seen": 379149568, + "step": 2100 + }, + { + "epoch": 0.23000082104053204, + "grad_norm": 1.6055175328326479, + "learning_rate": 4.3751753158049065e-05, + "loss": 1.1376, + "num_input_tokens_seen": 379326304, + "step": 2101 + }, + { + "epoch": 0.23011029311146994, + "grad_norm": 1.3370650591403406, + "learning_rate": 4.374606527873799e-05, + "loss": 0.7187, + "num_input_tokens_seen": 379507968, + "step": 2102 + }, + { + "epoch": 0.23021976518240783, + "grad_norm": 1.3106480202679232, + "learning_rate": 4.3740375181803225e-05, + "loss": 0.6193, + "num_input_tokens_seen": 379709792, + "step": 2103 + }, + { + "epoch": 0.23032923725334575, + "grad_norm": 1.164639374823897, + "learning_rate": 4.373468286791792e-05, + "loss": 0.6533, + "num_input_tokens_seen": 379883616, + "step": 2104 + }, + { + "epoch": 0.23043870932428365, + "grad_norm": 1.3682854303825618, + "learning_rate": 4.3728988337755426e-05, + "loss": 0.8281, + "num_input_tokens_seen": 380048704, + "step": 2105 + }, + { + "epoch": 0.23054818139522154, + "grad_norm": 1.1054973800779964, + "learning_rate": 4.372329159198943e-05, + "loss": 0.6358, + "num_input_tokens_seen": 380246720, + "step": 2106 + }, + { + "epoch": 0.23065765346615943, + "grad_norm": 1.3341391879906812, + "learning_rate": 4.371759263129382e-05, + "loss": 0.7766, + "num_input_tokens_seen": 380412256, + "step": 2107 + }, + { + "epoch": 0.23076712553709736, + "grad_norm": 1.1824244902134164, + "learning_rate": 4.371189145634279e-05, + "loss": 0.7225, + "num_input_tokens_seen": 380597952, + "step": 2108 + }, + { + "epoch": 0.23087659760803525, + "grad_norm": 1.2739528245201166, + "learning_rate": 4.3706188067810766e-05, + "loss": 0.6908, + "num_input_tokens_seen": 380758336, + "step": 2109 + }, + { + "epoch": 0.23098606967897314, + "grad_norm": 1.258279701799449, + "learning_rate": 4.370048246637246e-05, + "loss": 0.6135, + "num_input_tokens_seen": 380951648, + "step": 2110 + }, + { + "epoch": 0.23109554174991107, + "grad_norm": 1.2639886615250662, + "learning_rate": 4.369477465270282e-05, + "loss": 0.7887, + "num_input_tokens_seen": 381150112, + "step": 2111 + }, + { + "epoch": 0.23120501382084896, + "grad_norm": 1.3439656461628013, + "learning_rate": 4.3689064627477084e-05, + "loss": 0.7214, + "num_input_tokens_seen": 381359104, + "step": 2112 + }, + { + "epoch": 0.23131448589178685, + "grad_norm": 1.3883839669597884, + "learning_rate": 4.368335239137073e-05, + "loss": 0.658, + "num_input_tokens_seen": 381538080, + "step": 2113 + }, + { + "epoch": 0.23142395796272475, + "grad_norm": 1.2865559107213524, + "learning_rate": 4.36776379450595e-05, + "loss": 0.6878, + "num_input_tokens_seen": 381725344, + "step": 2114 + }, + { + "epoch": 0.23153343003366267, + "grad_norm": 1.4873677651748494, + "learning_rate": 4.3671921289219415e-05, + "loss": 0.8352, + "num_input_tokens_seen": 381894240, + "step": 2115 + }, + { + "epoch": 0.23164290210460056, + "grad_norm": 1.1778544654161416, + "learning_rate": 4.3666202424526724e-05, + "loss": 0.6355, + "num_input_tokens_seen": 382082176, + "step": 2116 + }, + { + "epoch": 0.23175237417553846, + "grad_norm": 1.3476113132227256, + "learning_rate": 4.366048135165798e-05, + "loss": 0.8084, + "num_input_tokens_seen": 382264736, + "step": 2117 + }, + { + "epoch": 0.23186184624647638, + "grad_norm": 1.2980387877706852, + "learning_rate": 4.365475807128996e-05, + "loss": 0.727, + "num_input_tokens_seen": 382443264, + "step": 2118 + }, + { + "epoch": 0.23197131831741427, + "grad_norm": 1.3755773719262017, + "learning_rate": 4.364903258409973e-05, + "loss": 0.7689, + "num_input_tokens_seen": 382629632, + "step": 2119 + }, + { + "epoch": 0.23208079038835217, + "grad_norm": 1.2006278938445245, + "learning_rate": 4.364330489076458e-05, + "loss": 0.6362, + "num_input_tokens_seen": 382795840, + "step": 2120 + }, + { + "epoch": 0.2321902624592901, + "grad_norm": 1.3537594354077032, + "learning_rate": 4.3637574991962113e-05, + "loss": 0.7211, + "num_input_tokens_seen": 382955104, + "step": 2121 + }, + { + "epoch": 0.23229973453022798, + "grad_norm": 1.2728784656359475, + "learning_rate": 4.3631842888370154e-05, + "loss": 0.8299, + "num_input_tokens_seen": 383152896, + "step": 2122 + }, + { + "epoch": 0.23240920660116587, + "grad_norm": 1.3198680693990599, + "learning_rate": 4.362610858066679e-05, + "loss": 0.6905, + "num_input_tokens_seen": 383333888, + "step": 2123 + }, + { + "epoch": 0.23251867867210377, + "grad_norm": 1.1019682441331424, + "learning_rate": 4.3620372069530404e-05, + "loss": 0.7238, + "num_input_tokens_seen": 383508832, + "step": 2124 + }, + { + "epoch": 0.2326281507430417, + "grad_norm": 1.2409151866926986, + "learning_rate": 4.361463335563959e-05, + "loss": 0.6629, + "num_input_tokens_seen": 383673472, + "step": 2125 + }, + { + "epoch": 0.23273762281397958, + "grad_norm": 1.067269559193553, + "learning_rate": 4.3608892439673234e-05, + "loss": 0.5215, + "num_input_tokens_seen": 383874624, + "step": 2126 + }, + { + "epoch": 0.23284709488491748, + "grad_norm": 1.1820123261262463, + "learning_rate": 4.360314932231048e-05, + "loss": 0.5942, + "num_input_tokens_seen": 384067936, + "step": 2127 + }, + { + "epoch": 0.2329565669558554, + "grad_norm": 1.1466870104478422, + "learning_rate": 4.3597404004230714e-05, + "loss": 0.5175, + "num_input_tokens_seen": 384252064, + "step": 2128 + }, + { + "epoch": 0.2330660390267933, + "grad_norm": 1.2581553495397297, + "learning_rate": 4.3591656486113616e-05, + "loss": 0.7117, + "num_input_tokens_seen": 384441120, + "step": 2129 + }, + { + "epoch": 0.2331755110977312, + "grad_norm": 1.1881479703959306, + "learning_rate": 4.3585906768639095e-05, + "loss": 0.6636, + "num_input_tokens_seen": 384623232, + "step": 2130 + }, + { + "epoch": 0.23328498316866908, + "grad_norm": 1.3121650359194772, + "learning_rate": 4.358015485248733e-05, + "loss": 0.7299, + "num_input_tokens_seen": 384808928, + "step": 2131 + }, + { + "epoch": 0.233394455239607, + "grad_norm": 1.2697039207994636, + "learning_rate": 4.357440073833877e-05, + "loss": 0.6303, + "num_input_tokens_seen": 384949376, + "step": 2132 + }, + { + "epoch": 0.2335039273105449, + "grad_norm": 1.6021667829126138, + "learning_rate": 4.356864442687411e-05, + "loss": 0.8497, + "num_input_tokens_seen": 385141120, + "step": 2133 + }, + { + "epoch": 0.2336133993814828, + "grad_norm": 1.221684176088377, + "learning_rate": 4.356288591877431e-05, + "loss": 0.6641, + "num_input_tokens_seen": 385357280, + "step": 2134 + }, + { + "epoch": 0.2337228714524207, + "grad_norm": 1.1785214958512515, + "learning_rate": 4.355712521472059e-05, + "loss": 0.6397, + "num_input_tokens_seen": 385529984, + "step": 2135 + }, + { + "epoch": 0.2338323435233586, + "grad_norm": 1.3176823891157594, + "learning_rate": 4.355136231539443e-05, + "loss": 0.8616, + "num_input_tokens_seen": 385716352, + "step": 2136 + }, + { + "epoch": 0.2339418155942965, + "grad_norm": 1.2672635997494759, + "learning_rate": 4.3545597221477585e-05, + "loss": 0.7354, + "num_input_tokens_seen": 385882112, + "step": 2137 + }, + { + "epoch": 0.23405128766523442, + "grad_norm": 1.204869430005136, + "learning_rate": 4.353982993365203e-05, + "loss": 0.6767, + "num_input_tokens_seen": 386033312, + "step": 2138 + }, + { + "epoch": 0.2341607597361723, + "grad_norm": 1.145104638944575, + "learning_rate": 4.3534060452600046e-05, + "loss": 0.5562, + "num_input_tokens_seen": 386233792, + "step": 2139 + }, + { + "epoch": 0.2342702318071102, + "grad_norm": 1.3397783980822024, + "learning_rate": 4.3528288779004135e-05, + "loss": 0.7539, + "num_input_tokens_seen": 386407616, + "step": 2140 + }, + { + "epoch": 0.2343797038780481, + "grad_norm": 1.2033643608284688, + "learning_rate": 4.352251491354708e-05, + "loss": 0.6199, + "num_input_tokens_seen": 386567776, + "step": 2141 + }, + { + "epoch": 0.23448917594898602, + "grad_norm": 1.2804312066840429, + "learning_rate": 4.351673885691192e-05, + "loss": 0.6844, + "num_input_tokens_seen": 386748544, + "step": 2142 + }, + { + "epoch": 0.23459864801992392, + "grad_norm": 1.254087916253771, + "learning_rate": 4.3510960609781954e-05, + "loss": 0.8967, + "num_input_tokens_seen": 386940736, + "step": 2143 + }, + { + "epoch": 0.2347081200908618, + "grad_norm": 0.9997064636473308, + "learning_rate": 4.350518017284073e-05, + "loss": 0.5102, + "num_input_tokens_seen": 387100896, + "step": 2144 + }, + { + "epoch": 0.23481759216179973, + "grad_norm": 1.2526136729724977, + "learning_rate": 4.349939754677208e-05, + "loss": 0.6685, + "num_input_tokens_seen": 387310112, + "step": 2145 + }, + { + "epoch": 0.23492706423273763, + "grad_norm": 1.3612472094182326, + "learning_rate": 4.349361273226005e-05, + "loss": 0.6578, + "num_input_tokens_seen": 387446528, + "step": 2146 + }, + { + "epoch": 0.23503653630367552, + "grad_norm": 1.3835404629733932, + "learning_rate": 4.3487825729988995e-05, + "loss": 0.9187, + "num_input_tokens_seen": 387650816, + "step": 2147 + }, + { + "epoch": 0.2351460083746134, + "grad_norm": 1.218519286790812, + "learning_rate": 4.34820365406435e-05, + "loss": 0.6088, + "num_input_tokens_seen": 387808288, + "step": 2148 + }, + { + "epoch": 0.23525548044555133, + "grad_norm": 1.3053045127282719, + "learning_rate": 4.347624516490841e-05, + "loss": 0.7736, + "num_input_tokens_seen": 388017056, + "step": 2149 + }, + { + "epoch": 0.23536495251648923, + "grad_norm": 1.3815311594516526, + "learning_rate": 4.3470451603468836e-05, + "loss": 0.7928, + "num_input_tokens_seen": 388217760, + "step": 2150 + }, + { + "epoch": 0.23547442458742712, + "grad_norm": 1.2591593858334595, + "learning_rate": 4.346465585701015e-05, + "loss": 0.7631, + "num_input_tokens_seen": 388377248, + "step": 2151 + }, + { + "epoch": 0.23558389665836504, + "grad_norm": 1.296656670633994, + "learning_rate": 4.345885792621798e-05, + "loss": 0.7203, + "num_input_tokens_seen": 388578848, + "step": 2152 + }, + { + "epoch": 0.23569336872930294, + "grad_norm": 1.3092771652926054, + "learning_rate": 4.34530578117782e-05, + "loss": 0.7063, + "num_input_tokens_seen": 388758720, + "step": 2153 + }, + { + "epoch": 0.23580284080024083, + "grad_norm": 1.1401862254207518, + "learning_rate": 4.344725551437695e-05, + "loss": 0.6471, + "num_input_tokens_seen": 388932544, + "step": 2154 + }, + { + "epoch": 0.23591231287117875, + "grad_norm": 1.1616560877387163, + "learning_rate": 4.344145103470065e-05, + "loss": 0.699, + "num_input_tokens_seen": 389117792, + "step": 2155 + }, + { + "epoch": 0.23602178494211665, + "grad_norm": 1.1706618069118464, + "learning_rate": 4.343564437343594e-05, + "loss": 0.8703, + "num_input_tokens_seen": 389288256, + "step": 2156 + }, + { + "epoch": 0.23613125701305454, + "grad_norm": 1.1953627631895336, + "learning_rate": 4.342983553126974e-05, + "loss": 0.5769, + "num_input_tokens_seen": 389463200, + "step": 2157 + }, + { + "epoch": 0.23624072908399243, + "grad_norm": 1.118670512950576, + "learning_rate": 4.342402450888924e-05, + "loss": 0.8318, + "num_input_tokens_seen": 389688320, + "step": 2158 + }, + { + "epoch": 0.23635020115493036, + "grad_norm": 1.2620111368779128, + "learning_rate": 4.341821130698185e-05, + "loss": 0.6685, + "num_input_tokens_seen": 389885664, + "step": 2159 + }, + { + "epoch": 0.23645967322586825, + "grad_norm": 1.4668809076815188, + "learning_rate": 4.341239592623527e-05, + "loss": 1.1616, + "num_input_tokens_seen": 390085696, + "step": 2160 + }, + { + "epoch": 0.23656914529680614, + "grad_norm": 1.1714382629375764, + "learning_rate": 4.3406578367337466e-05, + "loss": 0.6276, + "num_input_tokens_seen": 390266688, + "step": 2161 + }, + { + "epoch": 0.23667861736774407, + "grad_norm": 1.2419705951299653, + "learning_rate": 4.340075863097662e-05, + "loss": 0.7528, + "num_input_tokens_seen": 390484640, + "step": 2162 + }, + { + "epoch": 0.23678808943868196, + "grad_norm": 1.4222695777794134, + "learning_rate": 4.33949367178412e-05, + "loss": 0.7423, + "num_input_tokens_seen": 390656000, + "step": 2163 + }, + { + "epoch": 0.23689756150961985, + "grad_norm": 1.1772898883814025, + "learning_rate": 4.338911262861993e-05, + "loss": 0.5792, + "num_input_tokens_seen": 390824000, + "step": 2164 + }, + { + "epoch": 0.23700703358055775, + "grad_norm": 1.2807853600563, + "learning_rate": 4.3383286364001794e-05, + "loss": 0.5947, + "num_input_tokens_seen": 391000064, + "step": 2165 + }, + { + "epoch": 0.23711650565149567, + "grad_norm": 1.3992987596564002, + "learning_rate": 4.337745792467604e-05, + "loss": 0.677, + "num_input_tokens_seen": 391172544, + "step": 2166 + }, + { + "epoch": 0.23722597772243356, + "grad_norm": 1.3216128745482976, + "learning_rate": 4.337162731133212e-05, + "loss": 0.6756, + "num_input_tokens_seen": 391335392, + "step": 2167 + }, + { + "epoch": 0.23733544979337146, + "grad_norm": 1.2115918760204265, + "learning_rate": 4.336579452465982e-05, + "loss": 0.6934, + "num_input_tokens_seen": 391483232, + "step": 2168 + }, + { + "epoch": 0.23744492186430938, + "grad_norm": 1.3212814387223468, + "learning_rate": 4.335995956534914e-05, + "loss": 0.6566, + "num_input_tokens_seen": 391667136, + "step": 2169 + }, + { + "epoch": 0.23755439393524727, + "grad_norm": 1.2893338929214235, + "learning_rate": 4.335412243409034e-05, + "loss": 0.7516, + "num_input_tokens_seen": 391818784, + "step": 2170 + }, + { + "epoch": 0.23766386600618516, + "grad_norm": 1.1411325553470928, + "learning_rate": 4.3348283131573944e-05, + "loss": 0.5706, + "num_input_tokens_seen": 391975360, + "step": 2171 + }, + { + "epoch": 0.2377733380771231, + "grad_norm": 1.325941152826079, + "learning_rate": 4.3342441658490724e-05, + "loss": 0.7973, + "num_input_tokens_seen": 392146048, + "step": 2172 + }, + { + "epoch": 0.23788281014806098, + "grad_norm": 1.2665456555100787, + "learning_rate": 4.333659801553173e-05, + "loss": 0.7025, + "num_input_tokens_seen": 392308672, + "step": 2173 + }, + { + "epoch": 0.23799228221899887, + "grad_norm": 1.2362720060945238, + "learning_rate": 4.3330752203388234e-05, + "loss": 0.8281, + "num_input_tokens_seen": 392503104, + "step": 2174 + }, + { + "epoch": 0.23810175428993677, + "grad_norm": 1.2493078699821467, + "learning_rate": 4.3324904222751796e-05, + "loss": 0.8117, + "num_input_tokens_seen": 392684096, + "step": 2175 + }, + { + "epoch": 0.2382112263608747, + "grad_norm": 1.3133701891173368, + "learning_rate": 4.331905407431422e-05, + "loss": 0.6373, + "num_input_tokens_seen": 392870912, + "step": 2176 + }, + { + "epoch": 0.23832069843181258, + "grad_norm": 1.3035974574616445, + "learning_rate": 4.3313201758767574e-05, + "loss": 0.6628, + "num_input_tokens_seen": 393057952, + "step": 2177 + }, + { + "epoch": 0.23843017050275048, + "grad_norm": 1.2915494106763805, + "learning_rate": 4.330734727680417e-05, + "loss": 0.8184, + "num_input_tokens_seen": 393260896, + "step": 2178 + }, + { + "epoch": 0.2385396425736884, + "grad_norm": 1.315679966736526, + "learning_rate": 4.330149062911657e-05, + "loss": 0.7815, + "num_input_tokens_seen": 393437408, + "step": 2179 + }, + { + "epoch": 0.2386491146446263, + "grad_norm": 1.2650901700477482, + "learning_rate": 4.3295631816397626e-05, + "loss": 0.8418, + "num_input_tokens_seen": 393630272, + "step": 2180 + }, + { + "epoch": 0.23875858671556419, + "grad_norm": 1.4008853735363962, + "learning_rate": 4.32897708393404e-05, + "loss": 0.743, + "num_input_tokens_seen": 393832320, + "step": 2181 + }, + { + "epoch": 0.23886805878650208, + "grad_norm": 1.259266457246225, + "learning_rate": 4.328390769863826e-05, + "loss": 0.6785, + "num_input_tokens_seen": 394022048, + "step": 2182 + }, + { + "epoch": 0.23897753085744, + "grad_norm": 1.295122272065545, + "learning_rate": 4.327804239498479e-05, + "loss": 0.6435, + "num_input_tokens_seen": 394197216, + "step": 2183 + }, + { + "epoch": 0.2390870029283779, + "grad_norm": 1.4129031576539952, + "learning_rate": 4.3272174929073846e-05, + "loss": 0.7714, + "num_input_tokens_seen": 394355584, + "step": 2184 + }, + { + "epoch": 0.2391964749993158, + "grad_norm": 1.2240482973624112, + "learning_rate": 4.326630530159954e-05, + "loss": 0.6722, + "num_input_tokens_seen": 394539488, + "step": 2185 + }, + { + "epoch": 0.2393059470702537, + "grad_norm": 1.2437833216051783, + "learning_rate": 4.3260433513256227e-05, + "loss": 0.8328, + "num_input_tokens_seen": 394732800, + "step": 2186 + }, + { + "epoch": 0.2394154191411916, + "grad_norm": 1.1364404396632128, + "learning_rate": 4.325455956473854e-05, + "loss": 0.6082, + "num_input_tokens_seen": 394925888, + "step": 2187 + }, + { + "epoch": 0.2395248912121295, + "grad_norm": 1.4275065813063181, + "learning_rate": 4.324868345674136e-05, + "loss": 0.7465, + "num_input_tokens_seen": 395087168, + "step": 2188 + }, + { + "epoch": 0.23963436328306742, + "grad_norm": 1.3596978078323, + "learning_rate": 4.324280518995981e-05, + "loss": 0.6871, + "num_input_tokens_seen": 395254720, + "step": 2189 + }, + { + "epoch": 0.2397438353540053, + "grad_norm": 1.6859984605178049, + "learning_rate": 4.3236924765089284e-05, + "loss": 0.8124, + "num_input_tokens_seen": 395418016, + "step": 2190 + }, + { + "epoch": 0.2398533074249432, + "grad_norm": 1.2791805767748534, + "learning_rate": 4.323104218282542e-05, + "loss": 0.7596, + "num_input_tokens_seen": 395592064, + "step": 2191 + }, + { + "epoch": 0.2399627794958811, + "grad_norm": 1.292081591914452, + "learning_rate": 4.322515744386411e-05, + "loss": 0.8633, + "num_input_tokens_seen": 395768576, + "step": 2192 + }, + { + "epoch": 0.24007225156681902, + "grad_norm": 1.1867886972814217, + "learning_rate": 4.321927054890153e-05, + "loss": 0.7009, + "num_input_tokens_seen": 395962336, + "step": 2193 + }, + { + "epoch": 0.24018172363775692, + "grad_norm": 1.347956173458839, + "learning_rate": 4.3213381498634056e-05, + "loss": 0.871, + "num_input_tokens_seen": 396158784, + "step": 2194 + }, + { + "epoch": 0.2402911957086948, + "grad_norm": 1.2958978395669072, + "learning_rate": 4.3207490293758374e-05, + "loss": 0.8203, + "num_input_tokens_seen": 396340672, + "step": 2195 + }, + { + "epoch": 0.24040066777963273, + "grad_norm": 1.076061651604967, + "learning_rate": 4.32015969349714e-05, + "loss": 0.6214, + "num_input_tokens_seen": 396516736, + "step": 2196 + }, + { + "epoch": 0.24051013985057063, + "grad_norm": 1.2704160453383662, + "learning_rate": 4.31957014229703e-05, + "loss": 0.6514, + "num_input_tokens_seen": 396698848, + "step": 2197 + }, + { + "epoch": 0.24061961192150852, + "grad_norm": 1.1698457568948681, + "learning_rate": 4.3189803758452504e-05, + "loss": 0.7347, + "num_input_tokens_seen": 396894400, + "step": 2198 + }, + { + "epoch": 0.24072908399244644, + "grad_norm": 1.3069298485402467, + "learning_rate": 4.318390394211571e-05, + "loss": 0.8095, + "num_input_tokens_seen": 397070912, + "step": 2199 + }, + { + "epoch": 0.24083855606338433, + "grad_norm": 1.156775638593815, + "learning_rate": 4.3178001974657836e-05, + "loss": 0.6231, + "num_input_tokens_seen": 397223904, + "step": 2200 + }, + { + "epoch": 0.24094802813432223, + "grad_norm": 1.2401012439311498, + "learning_rate": 4.317209785677707e-05, + "loss": 0.8237, + "num_input_tokens_seen": 397432224, + "step": 2201 + }, + { + "epoch": 0.24105750020526012, + "grad_norm": 1.251357247694325, + "learning_rate": 4.3166191589171875e-05, + "loss": 0.767, + "num_input_tokens_seen": 397635616, + "step": 2202 + }, + { + "epoch": 0.24116697227619804, + "grad_norm": 1.1810140666786104, + "learning_rate": 4.316028317254094e-05, + "loss": 0.7334, + "num_input_tokens_seen": 397838112, + "step": 2203 + }, + { + "epoch": 0.24127644434713594, + "grad_norm": 1.0436995442695642, + "learning_rate": 4.315437260758322e-05, + "loss": 0.6005, + "num_input_tokens_seen": 398018432, + "step": 2204 + }, + { + "epoch": 0.24138591641807383, + "grad_norm": 1.4071511242439017, + "learning_rate": 4.3148459894997926e-05, + "loss": 1.0094, + "num_input_tokens_seen": 398202112, + "step": 2205 + }, + { + "epoch": 0.24149538848901175, + "grad_norm": 1.3419075688397193, + "learning_rate": 4.3142545035484526e-05, + "loss": 0.9189, + "num_input_tokens_seen": 398374592, + "step": 2206 + }, + { + "epoch": 0.24160486055994965, + "grad_norm": 1.4143766612718698, + "learning_rate": 4.3136628029742735e-05, + "loss": 0.9183, + "num_input_tokens_seen": 398574848, + "step": 2207 + }, + { + "epoch": 0.24171433263088754, + "grad_norm": 1.2334311051320126, + "learning_rate": 4.3130708878472505e-05, + "loss": 0.6346, + "num_input_tokens_seen": 398737472, + "step": 2208 + }, + { + "epoch": 0.24182380470182543, + "grad_norm": 1.2853970287601455, + "learning_rate": 4.312478758237408e-05, + "loss": 0.6345, + "num_input_tokens_seen": 398908160, + "step": 2209 + }, + { + "epoch": 0.24193327677276336, + "grad_norm": 1.2611077781013291, + "learning_rate": 4.3118864142147944e-05, + "loss": 0.6804, + "num_input_tokens_seen": 399111552, + "step": 2210 + }, + { + "epoch": 0.24204274884370125, + "grad_norm": 1.4186151885794938, + "learning_rate": 4.31129385584948e-05, + "loss": 0.9816, + "num_input_tokens_seen": 399280000, + "step": 2211 + }, + { + "epoch": 0.24215222091463914, + "grad_norm": 1.2884729597831481, + "learning_rate": 4.310701083211566e-05, + "loss": 0.7062, + "num_input_tokens_seen": 399433664, + "step": 2212 + }, + { + "epoch": 0.24226169298557707, + "grad_norm": 1.208352905421857, + "learning_rate": 4.310108096371175e-05, + "loss": 0.6091, + "num_input_tokens_seen": 399615328, + "step": 2213 + }, + { + "epoch": 0.24237116505651496, + "grad_norm": 1.3674366751171048, + "learning_rate": 4.309514895398456e-05, + "loss": 0.803, + "num_input_tokens_seen": 399776160, + "step": 2214 + }, + { + "epoch": 0.24248063712745285, + "grad_norm": 1.3571799961544047, + "learning_rate": 4.308921480363586e-05, + "loss": 0.7244, + "num_input_tokens_seen": 399945728, + "step": 2215 + }, + { + "epoch": 0.24259010919839077, + "grad_norm": 1.3714084101157178, + "learning_rate": 4.308327851336762e-05, + "loss": 0.6973, + "num_input_tokens_seen": 400110368, + "step": 2216 + }, + { + "epoch": 0.24269958126932867, + "grad_norm": 1.3674439979816237, + "learning_rate": 4.307734008388209e-05, + "loss": 0.8851, + "num_input_tokens_seen": 400268288, + "step": 2217 + }, + { + "epoch": 0.24280905334026656, + "grad_norm": 1.177973885798868, + "learning_rate": 4.307139951588179e-05, + "loss": 0.658, + "num_input_tokens_seen": 400423520, + "step": 2218 + }, + { + "epoch": 0.24291852541120446, + "grad_norm": 1.1218615836613726, + "learning_rate": 4.306545681006949e-05, + "loss": 0.593, + "num_input_tokens_seen": 400591072, + "step": 2219 + }, + { + "epoch": 0.24302799748214238, + "grad_norm": 1.5370068449266316, + "learning_rate": 4.305951196714817e-05, + "loss": 0.8656, + "num_input_tokens_seen": 400777440, + "step": 2220 + }, + { + "epoch": 0.24313746955308027, + "grad_norm": 1.232717584357471, + "learning_rate": 4.305356498782112e-05, + "loss": 0.6704, + "num_input_tokens_seen": 400961568, + "step": 2221 + }, + { + "epoch": 0.24324694162401816, + "grad_norm": 1.271000830238125, + "learning_rate": 4.304761587279183e-05, + "loss": 0.6563, + "num_input_tokens_seen": 401110976, + "step": 2222 + }, + { + "epoch": 0.2433564136949561, + "grad_norm": 1.1922077422217168, + "learning_rate": 4.304166462276409e-05, + "loss": 0.6782, + "num_input_tokens_seen": 401308096, + "step": 2223 + }, + { + "epoch": 0.24346588576589398, + "grad_norm": 1.2628071817287498, + "learning_rate": 4.303571123844191e-05, + "loss": 0.8364, + "num_input_tokens_seen": 401477664, + "step": 2224 + }, + { + "epoch": 0.24357535783683187, + "grad_norm": 1.2380293340181174, + "learning_rate": 4.3029755720529576e-05, + "loss": 0.5803, + "num_input_tokens_seen": 401613632, + "step": 2225 + }, + { + "epoch": 0.24368482990776977, + "grad_norm": 1.340849340559436, + "learning_rate": 4.30237980697316e-05, + "loss": 0.692, + "num_input_tokens_seen": 401804704, + "step": 2226 + }, + { + "epoch": 0.2437943019787077, + "grad_norm": 1.3132989813012133, + "learning_rate": 4.3017838286752776e-05, + "loss": 0.8151, + "num_input_tokens_seen": 401953216, + "step": 2227 + }, + { + "epoch": 0.24390377404964558, + "grad_norm": 1.14848228938157, + "learning_rate": 4.301187637229812e-05, + "loss": 0.5109, + "num_input_tokens_seen": 402080672, + "step": 2228 + }, + { + "epoch": 0.24401324612058348, + "grad_norm": 1.1447875481283554, + "learning_rate": 4.300591232707293e-05, + "loss": 0.659, + "num_input_tokens_seen": 402278688, + "step": 2229 + }, + { + "epoch": 0.2441227181915214, + "grad_norm": 1.3528104103655592, + "learning_rate": 4.2999946151782735e-05, + "loss": 0.7858, + "num_input_tokens_seen": 402450720, + "step": 2230 + }, + { + "epoch": 0.2442321902624593, + "grad_norm": 1.2731244223082574, + "learning_rate": 4.29939778471333e-05, + "loss": 0.7344, + "num_input_tokens_seen": 402622976, + "step": 2231 + }, + { + "epoch": 0.24434166233339719, + "grad_norm": 1.278819453720234, + "learning_rate": 4.298800741383071e-05, + "loss": 0.6002, + "num_input_tokens_seen": 402821440, + "step": 2232 + }, + { + "epoch": 0.2444511344043351, + "grad_norm": 1.0930308021016408, + "learning_rate": 4.298203485258122e-05, + "loss": 0.5028, + "num_input_tokens_seen": 403008480, + "step": 2233 + }, + { + "epoch": 0.244560606475273, + "grad_norm": 1.2580163920749152, + "learning_rate": 4.2976060164091384e-05, + "loss": 0.7309, + "num_input_tokens_seen": 403154080, + "step": 2234 + }, + { + "epoch": 0.2446700785462109, + "grad_norm": 1.2071520797348168, + "learning_rate": 4.297008334906798e-05, + "loss": 0.5743, + "num_input_tokens_seen": 403335296, + "step": 2235 + }, + { + "epoch": 0.2447795506171488, + "grad_norm": 1.3947900697538975, + "learning_rate": 4.2964104408218085e-05, + "loss": 0.76, + "num_input_tokens_seen": 403505984, + "step": 2236 + }, + { + "epoch": 0.2448890226880867, + "grad_norm": 1.216437076871346, + "learning_rate": 4.295812334224898e-05, + "loss": 0.5893, + "num_input_tokens_seen": 403695712, + "step": 2237 + }, + { + "epoch": 0.2449984947590246, + "grad_norm": 1.1414351003396355, + "learning_rate": 4.2952140151868204e-05, + "loss": 0.6426, + "num_input_tokens_seen": 403891264, + "step": 2238 + }, + { + "epoch": 0.2451079668299625, + "grad_norm": 1.1741009742620356, + "learning_rate": 4.294615483778358e-05, + "loss": 0.5291, + "num_input_tokens_seen": 404044480, + "step": 2239 + }, + { + "epoch": 0.24521743890090042, + "grad_norm": 1.2182649478601995, + "learning_rate": 4.2940167400703134e-05, + "loss": 0.7297, + "num_input_tokens_seen": 404213824, + "step": 2240 + }, + { + "epoch": 0.2453269109718383, + "grad_norm": 1.4716240379377659, + "learning_rate": 4.293417784133519e-05, + "loss": 0.9402, + "num_input_tokens_seen": 404383840, + "step": 2241 + }, + { + "epoch": 0.2454363830427762, + "grad_norm": 1.3382026698285256, + "learning_rate": 4.2928186160388286e-05, + "loss": 0.6448, + "num_input_tokens_seen": 404550720, + "step": 2242 + }, + { + "epoch": 0.2455458551137141, + "grad_norm": 1.22563418107239, + "learning_rate": 4.292219235857123e-05, + "loss": 0.5925, + "num_input_tokens_seen": 404729472, + "step": 2243 + }, + { + "epoch": 0.24565532718465202, + "grad_norm": 1.1834256899526188, + "learning_rate": 4.291619643659308e-05, + "loss": 0.6788, + "num_input_tokens_seen": 404905760, + "step": 2244 + }, + { + "epoch": 0.24576479925558992, + "grad_norm": 1.1104890307142754, + "learning_rate": 4.291019839516314e-05, + "loss": 0.5877, + "num_input_tokens_seen": 405104672, + "step": 2245 + }, + { + "epoch": 0.2458742713265278, + "grad_norm": 1.3656490280167461, + "learning_rate": 4.290419823499098e-05, + "loss": 0.8504, + "num_input_tokens_seen": 405281184, + "step": 2246 + }, + { + "epoch": 0.24598374339746573, + "grad_norm": 1.4035259141340473, + "learning_rate": 4.289819595678638e-05, + "loss": 0.8372, + "num_input_tokens_seen": 405455680, + "step": 2247 + }, + { + "epoch": 0.24609321546840363, + "grad_norm": 1.2512585979398732, + "learning_rate": 4.289219156125942e-05, + "loss": 0.7548, + "num_input_tokens_seen": 405636672, + "step": 2248 + }, + { + "epoch": 0.24620268753934152, + "grad_norm": 1.2087938593074157, + "learning_rate": 4.288618504912041e-05, + "loss": 0.7107, + "num_input_tokens_seen": 405825952, + "step": 2249 + }, + { + "epoch": 0.24631215961027944, + "grad_norm": 1.237933088870126, + "learning_rate": 4.2880176421079896e-05, + "loss": 0.5628, + "num_input_tokens_seen": 405968864, + "step": 2250 + }, + { + "epoch": 0.24642163168121733, + "grad_norm": 1.3306878438706098, + "learning_rate": 4.287416567784869e-05, + "loss": 0.6264, + "num_input_tokens_seen": 406159264, + "step": 2251 + }, + { + "epoch": 0.24653110375215523, + "grad_norm": 1.29712679153676, + "learning_rate": 4.2868152820137855e-05, + "loss": 0.7524, + "num_input_tokens_seen": 406335328, + "step": 2252 + }, + { + "epoch": 0.24664057582309312, + "grad_norm": 1.3255309858351783, + "learning_rate": 4.28621378486587e-05, + "loss": 0.6801, + "num_input_tokens_seen": 406509152, + "step": 2253 + }, + { + "epoch": 0.24675004789403104, + "grad_norm": 1.2643743858769614, + "learning_rate": 4.285612076412279e-05, + "loss": 0.8152, + "num_input_tokens_seen": 406694176, + "step": 2254 + }, + { + "epoch": 0.24685951996496894, + "grad_norm": 1.0493381616920796, + "learning_rate": 4.285010156724192e-05, + "loss": 0.5984, + "num_input_tokens_seen": 406907200, + "step": 2255 + }, + { + "epoch": 0.24696899203590683, + "grad_norm": 1.2890008160253388, + "learning_rate": 4.2844080258728156e-05, + "loss": 0.7221, + "num_input_tokens_seen": 407088864, + "step": 2256 + }, + { + "epoch": 0.24707846410684475, + "grad_norm": 1.1976573110477453, + "learning_rate": 4.2838056839293816e-05, + "loss": 0.7122, + "num_input_tokens_seen": 407253728, + "step": 2257 + }, + { + "epoch": 0.24718793617778265, + "grad_norm": 1.4156315986530035, + "learning_rate": 4.283203130965145e-05, + "loss": 0.8215, + "num_input_tokens_seen": 407422624, + "step": 2258 + }, + { + "epoch": 0.24729740824872054, + "grad_norm": 1.3268691464482338, + "learning_rate": 4.282600367051387e-05, + "loss": 0.7149, + "num_input_tokens_seen": 407548960, + "step": 2259 + }, + { + "epoch": 0.24740688031965843, + "grad_norm": 1.1891034995190113, + "learning_rate": 4.2819973922594134e-05, + "loss": 0.6445, + "num_input_tokens_seen": 407739584, + "step": 2260 + }, + { + "epoch": 0.24751635239059636, + "grad_norm": 1.1679583392747765, + "learning_rate": 4.281394206660555e-05, + "loss": 0.7869, + "num_input_tokens_seen": 407932448, + "step": 2261 + }, + { + "epoch": 0.24762582446153425, + "grad_norm": 1.204358051143178, + "learning_rate": 4.2807908103261674e-05, + "loss": 0.6026, + "num_input_tokens_seen": 408093056, + "step": 2262 + }, + { + "epoch": 0.24773529653247214, + "grad_norm": 1.308966694538556, + "learning_rate": 4.280187203327631e-05, + "loss": 0.8459, + "num_input_tokens_seen": 408274048, + "step": 2263 + }, + { + "epoch": 0.24784476860341006, + "grad_norm": 1.2323544056072475, + "learning_rate": 4.2795833857363515e-05, + "loss": 0.6728, + "num_input_tokens_seen": 408434432, + "step": 2264 + }, + { + "epoch": 0.24795424067434796, + "grad_norm": 1.1670916271539225, + "learning_rate": 4.2789793576237594e-05, + "loss": 0.6107, + "num_input_tokens_seen": 408621024, + "step": 2265 + }, + { + "epoch": 0.24806371274528585, + "grad_norm": 1.3685717741012706, + "learning_rate": 4.278375119061311e-05, + "loss": 0.6844, + "num_input_tokens_seen": 408799776, + "step": 2266 + }, + { + "epoch": 0.24817318481622377, + "grad_norm": 1.238579699052938, + "learning_rate": 4.2777706701204846e-05, + "loss": 0.7267, + "num_input_tokens_seen": 408999360, + "step": 2267 + }, + { + "epoch": 0.24828265688716167, + "grad_norm": 1.2141715394766448, + "learning_rate": 4.277166010872787e-05, + "loss": 0.7287, + "num_input_tokens_seen": 409167584, + "step": 2268 + }, + { + "epoch": 0.24839212895809956, + "grad_norm": 2.493470438725189, + "learning_rate": 4.276561141389748e-05, + "loss": 0.8652, + "num_input_tokens_seen": 409370080, + "step": 2269 + }, + { + "epoch": 0.24850160102903746, + "grad_norm": 1.3037691824828568, + "learning_rate": 4.275956061742921e-05, + "loss": 0.5777, + "num_input_tokens_seen": 409536064, + "step": 2270 + }, + { + "epoch": 0.24861107309997538, + "grad_norm": 1.4918198057591354, + "learning_rate": 4.275350772003888e-05, + "loss": 0.7637, + "num_input_tokens_seen": 409689952, + "step": 2271 + }, + { + "epoch": 0.24872054517091327, + "grad_norm": 1.4829121659673161, + "learning_rate": 4.2747452722442524e-05, + "loss": 0.9454, + "num_input_tokens_seen": 409857728, + "step": 2272 + }, + { + "epoch": 0.24883001724185116, + "grad_norm": 1.19746613807028, + "learning_rate": 4.274139562535643e-05, + "loss": 0.6154, + "num_input_tokens_seen": 410054624, + "step": 2273 + }, + { + "epoch": 0.2489394893127891, + "grad_norm": 1.2549380041668865, + "learning_rate": 4.2735336429497166e-05, + "loss": 0.8847, + "num_input_tokens_seen": 410273696, + "step": 2274 + }, + { + "epoch": 0.24904896138372698, + "grad_norm": 1.2801712891589125, + "learning_rate": 4.272927513558149e-05, + "loss": 0.7859, + "num_input_tokens_seen": 410477760, + "step": 2275 + }, + { + "epoch": 0.24915843345466487, + "grad_norm": 1.351834793445274, + "learning_rate": 4.272321174432646e-05, + "loss": 0.8957, + "num_input_tokens_seen": 410657856, + "step": 2276 + }, + { + "epoch": 0.24926790552560277, + "grad_norm": 1.3297953704219834, + "learning_rate": 4.271714625644937e-05, + "loss": 0.7914, + "num_input_tokens_seen": 410850720, + "step": 2277 + }, + { + "epoch": 0.2493773775965407, + "grad_norm": 1.2411642598545067, + "learning_rate": 4.271107867266775e-05, + "loss": 0.8169, + "num_input_tokens_seen": 411024320, + "step": 2278 + }, + { + "epoch": 0.24948684966747858, + "grad_norm": 1.444068662146947, + "learning_rate": 4.270500899369937e-05, + "loss": 0.8569, + "num_input_tokens_seen": 411218080, + "step": 2279 + }, + { + "epoch": 0.24959632173841648, + "grad_norm": 1.1905821404540007, + "learning_rate": 4.269893722026228e-05, + "loss": 0.6996, + "num_input_tokens_seen": 411384512, + "step": 2280 + }, + { + "epoch": 0.2497057938093544, + "grad_norm": 1.3365376334723487, + "learning_rate": 4.2692863353074745e-05, + "loss": 0.6996, + "num_input_tokens_seen": 411570880, + "step": 2281 + }, + { + "epoch": 0.2498152658802923, + "grad_norm": 1.2141981463953986, + "learning_rate": 4.26867873928553e-05, + "loss": 0.6311, + "num_input_tokens_seen": 411710208, + "step": 2282 + }, + { + "epoch": 0.24992473795123019, + "grad_norm": 1.1650472994031171, + "learning_rate": 4.2680709340322725e-05, + "loss": 0.6481, + "num_input_tokens_seen": 411864096, + "step": 2283 + }, + { + "epoch": 0.2500342100221681, + "grad_norm": 1.3111557207133524, + "learning_rate": 4.2674629196196025e-05, + "loss": 0.9176, + "num_input_tokens_seen": 412073312, + "step": 2284 + }, + { + "epoch": 0.250143682093106, + "grad_norm": 1.1905250667574967, + "learning_rate": 4.266854696119449e-05, + "loss": 0.5857, + "num_input_tokens_seen": 412282528, + "step": 2285 + }, + { + "epoch": 0.2502531541640439, + "grad_norm": 1.4145614839569651, + "learning_rate": 4.266246263603761e-05, + "loss": 0.9749, + "num_input_tokens_seen": 412494880, + "step": 2286 + }, + { + "epoch": 0.2503626262349818, + "grad_norm": 1.4019027909250885, + "learning_rate": 4.2656376221445185e-05, + "loss": 0.8228, + "num_input_tokens_seen": 412664000, + "step": 2287 + }, + { + "epoch": 0.2504720983059197, + "grad_norm": 1.4022743824243227, + "learning_rate": 4.265028771813719e-05, + "loss": 0.7435, + "num_input_tokens_seen": 412849920, + "step": 2288 + }, + { + "epoch": 0.25058157037685763, + "grad_norm": 1.326928845470652, + "learning_rate": 4.2644197126833906e-05, + "loss": 0.7253, + "num_input_tokens_seen": 413002912, + "step": 2289 + }, + { + "epoch": 0.2506910424477955, + "grad_norm": 1.3517041435330022, + "learning_rate": 4.263810444825583e-05, + "loss": 0.8337, + "num_input_tokens_seen": 413161056, + "step": 2290 + }, + { + "epoch": 0.2508005145187334, + "grad_norm": 1.2084303466174777, + "learning_rate": 4.2632009683123716e-05, + "loss": 0.7558, + "num_input_tokens_seen": 413329952, + "step": 2291 + }, + { + "epoch": 0.2509099865896713, + "grad_norm": 1.1784177097792248, + "learning_rate": 4.262591283215857e-05, + "loss": 0.5995, + "num_input_tokens_seen": 413518560, + "step": 2292 + }, + { + "epoch": 0.2510194586606092, + "grad_norm": 1.303086080358741, + "learning_rate": 4.261981389608162e-05, + "loss": 0.6912, + "num_input_tokens_seen": 413678720, + "step": 2293 + }, + { + "epoch": 0.25112893073154713, + "grad_norm": 1.4017635834867315, + "learning_rate": 4.2613712875614374e-05, + "loss": 0.6114, + "num_input_tokens_seen": 413848064, + "step": 2294 + }, + { + "epoch": 0.251238402802485, + "grad_norm": 1.3374187890469602, + "learning_rate": 4.260760977147858e-05, + "loss": 0.7621, + "num_input_tokens_seen": 414013152, + "step": 2295 + }, + { + "epoch": 0.2513478748734229, + "grad_norm": 1.2058008336079344, + "learning_rate": 4.260150458439619e-05, + "loss": 0.7083, + "num_input_tokens_seen": 414172640, + "step": 2296 + }, + { + "epoch": 0.25145734694436084, + "grad_norm": 1.3132871356494098, + "learning_rate": 4.259539731508947e-05, + "loss": 0.7597, + "num_input_tokens_seen": 414339968, + "step": 2297 + }, + { + "epoch": 0.2515668190152987, + "grad_norm": 1.2624898154726074, + "learning_rate": 4.2589287964280874e-05, + "loss": 0.6881, + "num_input_tokens_seen": 414521856, + "step": 2298 + }, + { + "epoch": 0.2516762910862366, + "grad_norm": 1.3079258822897946, + "learning_rate": 4.2583176532693136e-05, + "loss": 0.7224, + "num_input_tokens_seen": 414725248, + "step": 2299 + }, + { + "epoch": 0.25178576315717455, + "grad_norm": 1.3403070388437208, + "learning_rate": 4.257706302104924e-05, + "loss": 0.7737, + "num_input_tokens_seen": 414920800, + "step": 2300 + }, + { + "epoch": 0.2518952352281124, + "grad_norm": 1.1973811680361806, + "learning_rate": 4.2570947430072384e-05, + "loss": 0.6697, + "num_input_tokens_seen": 415101792, + "step": 2301 + }, + { + "epoch": 0.25200470729905033, + "grad_norm": 1.2049761316301921, + "learning_rate": 4.256482976048603e-05, + "loss": 0.6812, + "num_input_tokens_seen": 415268448, + "step": 2302 + }, + { + "epoch": 0.25211417936998826, + "grad_norm": 1.3120316169232173, + "learning_rate": 4.2558710013013906e-05, + "loss": 0.521, + "num_input_tokens_seen": 415420320, + "step": 2303 + }, + { + "epoch": 0.2522236514409261, + "grad_norm": 1.3369594034067478, + "learning_rate": 4.255258818837994e-05, + "loss": 0.9001, + "num_input_tokens_seen": 415593024, + "step": 2304 + }, + { + "epoch": 0.25233312351186404, + "grad_norm": 1.13225499608173, + "learning_rate": 4.254646428730835e-05, + "loss": 0.5922, + "num_input_tokens_seen": 415806272, + "step": 2305 + }, + { + "epoch": 0.25244259558280197, + "grad_norm": 1.1647483290254814, + "learning_rate": 4.254033831052359e-05, + "loss": 0.5669, + "num_input_tokens_seen": 415979200, + "step": 2306 + }, + { + "epoch": 0.25255206765373983, + "grad_norm": 1.1992642769885173, + "learning_rate": 4.253421025875033e-05, + "loss": 0.6982, + "num_input_tokens_seen": 416184384, + "step": 2307 + }, + { + "epoch": 0.25266153972467775, + "grad_norm": 1.494033310969504, + "learning_rate": 4.252808013271351e-05, + "loss": 0.71, + "num_input_tokens_seen": 416343648, + "step": 2308 + }, + { + "epoch": 0.2527710117956156, + "grad_norm": 1.3583662740929638, + "learning_rate": 4.252194793313833e-05, + "loss": 0.8952, + "num_input_tokens_seen": 416525984, + "step": 2309 + }, + { + "epoch": 0.25288048386655354, + "grad_norm": 1.3855949468799145, + "learning_rate": 4.25158136607502e-05, + "loss": 0.6391, + "num_input_tokens_seen": 416680544, + "step": 2310 + }, + { + "epoch": 0.25298995593749146, + "grad_norm": 1.1563963926276855, + "learning_rate": 4.25096773162748e-05, + "loss": 0.609, + "num_input_tokens_seen": 416866240, + "step": 2311 + }, + { + "epoch": 0.25309942800842933, + "grad_norm": 1.2764094957900436, + "learning_rate": 4.2503538900438044e-05, + "loss": 0.7208, + "num_input_tokens_seen": 417051712, + "step": 2312 + }, + { + "epoch": 0.25320890007936725, + "grad_norm": 1.1209064287272044, + "learning_rate": 4.2497398413966094e-05, + "loss": 0.5881, + "num_input_tokens_seen": 417230688, + "step": 2313 + }, + { + "epoch": 0.25331837215030517, + "grad_norm": 1.2442277192153348, + "learning_rate": 4.249125585758537e-05, + "loss": 0.6289, + "num_input_tokens_seen": 417422880, + "step": 2314 + }, + { + "epoch": 0.25342784422124304, + "grad_norm": 1.3349243694841806, + "learning_rate": 4.248511123202251e-05, + "loss": 0.6618, + "num_input_tokens_seen": 417602976, + "step": 2315 + }, + { + "epoch": 0.25353731629218096, + "grad_norm": 1.234151409517641, + "learning_rate": 4.247896453800442e-05, + "loss": 0.7165, + "num_input_tokens_seen": 417800544, + "step": 2316 + }, + { + "epoch": 0.2536467883631189, + "grad_norm": 1.3639631779222796, + "learning_rate": 4.247281577625824e-05, + "loss": 0.6589, + "num_input_tokens_seen": 417988256, + "step": 2317 + }, + { + "epoch": 0.25375626043405675, + "grad_norm": 1.436147136142105, + "learning_rate": 4.2466664947511356e-05, + "loss": 0.8643, + "num_input_tokens_seen": 418140800, + "step": 2318 + }, + { + "epoch": 0.25386573250499467, + "grad_norm": 1.3004939131368816, + "learning_rate": 4.246051205249139e-05, + "loss": 0.764, + "num_input_tokens_seen": 418354720, + "step": 2319 + }, + { + "epoch": 0.2539752045759326, + "grad_norm": 1.2467067729309143, + "learning_rate": 4.2454357091926236e-05, + "loss": 0.676, + "num_input_tokens_seen": 418517792, + "step": 2320 + }, + { + "epoch": 0.25408467664687046, + "grad_norm": 1.1789025733238208, + "learning_rate": 4.244820006654401e-05, + "loss": 0.5762, + "num_input_tokens_seen": 418690944, + "step": 2321 + }, + { + "epoch": 0.2541941487178084, + "grad_norm": 1.1725802524650795, + "learning_rate": 4.244204097707306e-05, + "loss": 0.7981, + "num_input_tokens_seen": 418912480, + "step": 2322 + }, + { + "epoch": 0.2543036207887463, + "grad_norm": 1.7746157324255347, + "learning_rate": 4.243587982424201e-05, + "loss": 0.701, + "num_input_tokens_seen": 419089888, + "step": 2323 + }, + { + "epoch": 0.25441309285968416, + "grad_norm": 1.2247528527517242, + "learning_rate": 4.242971660877971e-05, + "loss": 0.6268, + "num_input_tokens_seen": 419246240, + "step": 2324 + }, + { + "epoch": 0.2545225649306221, + "grad_norm": 1.1627402252232586, + "learning_rate": 4.242355133141525e-05, + "loss": 0.6317, + "num_input_tokens_seen": 419442688, + "step": 2325 + }, + { + "epoch": 0.25463203700155995, + "grad_norm": 1.299591253284433, + "learning_rate": 4.241738399287798e-05, + "loss": 0.6422, + "num_input_tokens_seen": 419623680, + "step": 2326 + }, + { + "epoch": 0.2547415090724979, + "grad_norm": 1.2376068396638833, + "learning_rate": 4.2411214593897486e-05, + "loss": 0.7002, + "num_input_tokens_seen": 419819008, + "step": 2327 + }, + { + "epoch": 0.2548509811434358, + "grad_norm": 1.374378974426546, + "learning_rate": 4.2405043135203584e-05, + "loss": 0.9622, + "num_input_tokens_seen": 420017696, + "step": 2328 + }, + { + "epoch": 0.25496045321437366, + "grad_norm": 1.3429622559004835, + "learning_rate": 4.239886961752635e-05, + "loss": 0.7019, + "num_input_tokens_seen": 420210784, + "step": 2329 + }, + { + "epoch": 0.2550699252853116, + "grad_norm": 1.3405364000613809, + "learning_rate": 4.23926940415961e-05, + "loss": 0.6885, + "num_input_tokens_seen": 420407904, + "step": 2330 + }, + { + "epoch": 0.2551793973562495, + "grad_norm": 1.3000967613302523, + "learning_rate": 4.2386516408143404e-05, + "loss": 0.5546, + "num_input_tokens_seen": 420589120, + "step": 2331 + }, + { + "epoch": 0.25528886942718737, + "grad_norm": 1.363385303100592, + "learning_rate": 4.2380336717899044e-05, + "loss": 0.6421, + "num_input_tokens_seen": 420744352, + "step": 2332 + }, + { + "epoch": 0.2553983414981253, + "grad_norm": 1.2822787643928024, + "learning_rate": 4.237415497159408e-05, + "loss": 0.6979, + "num_input_tokens_seen": 420938112, + "step": 2333 + }, + { + "epoch": 0.2555078135690632, + "grad_norm": 1.353006330400588, + "learning_rate": 4.2367971169959796e-05, + "loss": 0.6976, + "num_input_tokens_seen": 421084160, + "step": 2334 + }, + { + "epoch": 0.2556172856400011, + "grad_norm": 1.100682224776786, + "learning_rate": 4.2361785313727726e-05, + "loss": 0.6831, + "num_input_tokens_seen": 421281728, + "step": 2335 + }, + { + "epoch": 0.255726757710939, + "grad_norm": 1.2725357888515674, + "learning_rate": 4.235559740362964e-05, + "loss": 0.8004, + "num_input_tokens_seen": 421481984, + "step": 2336 + }, + { + "epoch": 0.2558362297818769, + "grad_norm": 1.2939242148928616, + "learning_rate": 4.2349407440397566e-05, + "loss": 0.7721, + "num_input_tokens_seen": 421669696, + "step": 2337 + }, + { + "epoch": 0.2559457018528148, + "grad_norm": 1.1614352282405564, + "learning_rate": 4.234321542476375e-05, + "loss": 0.6656, + "num_input_tokens_seen": 421873312, + "step": 2338 + }, + { + "epoch": 0.2560551739237527, + "grad_norm": 1.0998466059629655, + "learning_rate": 4.2337021357460706e-05, + "loss": 0.6814, + "num_input_tokens_seen": 422064832, + "step": 2339 + }, + { + "epoch": 0.25616464599469063, + "grad_norm": 1.2425171516662268, + "learning_rate": 4.2330825239221186e-05, + "loss": 0.8215, + "num_input_tokens_seen": 422230144, + "step": 2340 + }, + { + "epoch": 0.2562741180656285, + "grad_norm": 1.3450512906047876, + "learning_rate": 4.232462707077816e-05, + "loss": 0.8495, + "num_input_tokens_seen": 422457280, + "step": 2341 + }, + { + "epoch": 0.2563835901365664, + "grad_norm": 1.3525998759455655, + "learning_rate": 4.231842685286488e-05, + "loss": 0.9702, + "num_input_tokens_seen": 422669856, + "step": 2342 + }, + { + "epoch": 0.2564930622075043, + "grad_norm": 1.247067779323362, + "learning_rate": 4.23122245862148e-05, + "loss": 0.5781, + "num_input_tokens_seen": 422831136, + "step": 2343 + }, + { + "epoch": 0.2566025342784422, + "grad_norm": 1.2774083765695645, + "learning_rate": 4.2306020271561656e-05, + "loss": 0.8055, + "num_input_tokens_seen": 423015488, + "step": 2344 + }, + { + "epoch": 0.25671200634938013, + "grad_norm": 1.2178483708996242, + "learning_rate": 4.2299813909639395e-05, + "loss": 0.6582, + "num_input_tokens_seen": 423209024, + "step": 2345 + }, + { + "epoch": 0.256821478420318, + "grad_norm": 1.3047323162125246, + "learning_rate": 4.229360550118222e-05, + "loss": 0.7001, + "num_input_tokens_seen": 423365824, + "step": 2346 + }, + { + "epoch": 0.2569309504912559, + "grad_norm": 1.3187918850723994, + "learning_rate": 4.228739504692457e-05, + "loss": 0.6909, + "num_input_tokens_seen": 423554880, + "step": 2347 + }, + { + "epoch": 0.25704042256219384, + "grad_norm": 1.209218386186564, + "learning_rate": 4.228118254760114e-05, + "loss": 0.5869, + "num_input_tokens_seen": 423735872, + "step": 2348 + }, + { + "epoch": 0.2571498946331317, + "grad_norm": 1.3227982455539287, + "learning_rate": 4.2274968003946845e-05, + "loss": 0.7899, + "num_input_tokens_seen": 423918880, + "step": 2349 + }, + { + "epoch": 0.2572593667040696, + "grad_norm": 1.2755625026257469, + "learning_rate": 4.226875141669686e-05, + "loss": 0.6313, + "num_input_tokens_seen": 424086656, + "step": 2350 + }, + { + "epoch": 0.25736883877500755, + "grad_norm": 1.2869407531777197, + "learning_rate": 4.22625327865866e-05, + "loss": 0.6408, + "num_input_tokens_seen": 424259136, + "step": 2351 + }, + { + "epoch": 0.2574783108459454, + "grad_norm": 1.332449659931827, + "learning_rate": 4.22563121143517e-05, + "loss": 0.6255, + "num_input_tokens_seen": 424428704, + "step": 2352 + }, + { + "epoch": 0.25758778291688333, + "grad_norm": 1.3073035413144496, + "learning_rate": 4.225008940072808e-05, + "loss": 0.8997, + "num_input_tokens_seen": 424625824, + "step": 2353 + }, + { + "epoch": 0.25769725498782126, + "grad_norm": 1.3787019054086187, + "learning_rate": 4.224386464645186e-05, + "loss": 0.8799, + "num_input_tokens_seen": 424826304, + "step": 2354 + }, + { + "epoch": 0.2578067270587591, + "grad_norm": 1.1793700037265669, + "learning_rate": 4.22376378522594e-05, + "loss": 0.6415, + "num_input_tokens_seen": 424968320, + "step": 2355 + }, + { + "epoch": 0.25791619912969704, + "grad_norm": 1.1141035289084318, + "learning_rate": 4.223140901888736e-05, + "loss": 0.6528, + "num_input_tokens_seen": 425156480, + "step": 2356 + }, + { + "epoch": 0.25802567120063497, + "grad_norm": 1.3475456745283945, + "learning_rate": 4.222517814707255e-05, + "loss": 0.959, + "num_input_tokens_seen": 425366368, + "step": 2357 + }, + { + "epoch": 0.25813514327157283, + "grad_norm": 1.2805017286688931, + "learning_rate": 4.221894523755211e-05, + "loss": 0.6475, + "num_input_tokens_seen": 425564608, + "step": 2358 + }, + { + "epoch": 0.25824461534251075, + "grad_norm": 1.2488171703155262, + "learning_rate": 4.2212710291063354e-05, + "loss": 0.5597, + "num_input_tokens_seen": 425725888, + "step": 2359 + }, + { + "epoch": 0.2583540874134486, + "grad_norm": 1.2852410285605425, + "learning_rate": 4.220647330834389e-05, + "loss": 0.7329, + "num_input_tokens_seen": 425882688, + "step": 2360 + }, + { + "epoch": 0.25846355948438654, + "grad_norm": 1.3134404400818234, + "learning_rate": 4.2200234290131515e-05, + "loss": 0.679, + "num_input_tokens_seen": 426037024, + "step": 2361 + }, + { + "epoch": 0.25857303155532446, + "grad_norm": 1.1855524556313002, + "learning_rate": 4.219399323716431e-05, + "loss": 0.6232, + "num_input_tokens_seen": 426195392, + "step": 2362 + }, + { + "epoch": 0.25868250362626233, + "grad_norm": 1.2722987600807265, + "learning_rate": 4.2187750150180574e-05, + "loss": 0.7653, + "num_input_tokens_seen": 426348160, + "step": 2363 + }, + { + "epoch": 0.25879197569720025, + "grad_norm": 1.3772795161839038, + "learning_rate": 4.2181505029918847e-05, + "loss": 0.797, + "num_input_tokens_seen": 426523328, + "step": 2364 + }, + { + "epoch": 0.25890144776813817, + "grad_norm": 1.4562845643488913, + "learning_rate": 4.217525787711792e-05, + "loss": 0.9774, + "num_input_tokens_seen": 426702304, + "step": 2365 + }, + { + "epoch": 0.25901091983907604, + "grad_norm": 1.3340778457713331, + "learning_rate": 4.216900869251683e-05, + "loss": 0.7212, + "num_input_tokens_seen": 426879936, + "step": 2366 + }, + { + "epoch": 0.25912039191001396, + "grad_norm": 1.1478582984683832, + "learning_rate": 4.216275747685482e-05, + "loss": 0.6235, + "num_input_tokens_seen": 427081760, + "step": 2367 + }, + { + "epoch": 0.2592298639809519, + "grad_norm": 1.233628252355038, + "learning_rate": 4.215650423087142e-05, + "loss": 0.6489, + "num_input_tokens_seen": 427258944, + "step": 2368 + }, + { + "epoch": 0.25933933605188975, + "grad_norm": 1.326555019757464, + "learning_rate": 4.215024895530636e-05, + "loss": 0.6584, + "num_input_tokens_seen": 427439488, + "step": 2369 + }, + { + "epoch": 0.25944880812282767, + "grad_norm": 1.184604729186238, + "learning_rate": 4.2143991650899646e-05, + "loss": 0.6371, + "num_input_tokens_seen": 427616896, + "step": 2370 + }, + { + "epoch": 0.2595582801937656, + "grad_norm": 1.21539416830595, + "learning_rate": 4.213773231839149e-05, + "loss": 0.6665, + "num_input_tokens_seen": 427818944, + "step": 2371 + }, + { + "epoch": 0.25966775226470346, + "grad_norm": 1.3238199809508548, + "learning_rate": 4.213147095852235e-05, + "loss": 0.7532, + "num_input_tokens_seen": 428003520, + "step": 2372 + }, + { + "epoch": 0.2597772243356414, + "grad_norm": 1.2433234245239533, + "learning_rate": 4.2125207572032954e-05, + "loss": 0.7031, + "num_input_tokens_seen": 428199296, + "step": 2373 + }, + { + "epoch": 0.2598866964065793, + "grad_norm": 1.24933950139996, + "learning_rate": 4.211894215966424e-05, + "loss": 0.6883, + "num_input_tokens_seen": 428368192, + "step": 2374 + }, + { + "epoch": 0.25999616847751716, + "grad_norm": 1.3910499095039315, + "learning_rate": 4.21126747221574e-05, + "loss": 0.7356, + "num_input_tokens_seen": 428549408, + "step": 2375 + }, + { + "epoch": 0.2601056405484551, + "grad_norm": 1.143442202149445, + "learning_rate": 4.210640526025384e-05, + "loss": 0.6502, + "num_input_tokens_seen": 428732192, + "step": 2376 + }, + { + "epoch": 0.26021511261939295, + "grad_norm": 1.2730939904969807, + "learning_rate": 4.210013377469525e-05, + "loss": 0.7482, + "num_input_tokens_seen": 428913184, + "step": 2377 + }, + { + "epoch": 0.2603245846903309, + "grad_norm": 1.237801636434149, + "learning_rate": 4.209386026622353e-05, + "loss": 0.584, + "num_input_tokens_seen": 429097984, + "step": 2378 + }, + { + "epoch": 0.2604340567612688, + "grad_norm": 1.2637303617406122, + "learning_rate": 4.208758473558081e-05, + "loss": 0.5518, + "num_input_tokens_seen": 429265760, + "step": 2379 + }, + { + "epoch": 0.26054352883220666, + "grad_norm": 1.3330901738807424, + "learning_rate": 4.208130718350948e-05, + "loss": 0.7819, + "num_input_tokens_seen": 429449888, + "step": 2380 + }, + { + "epoch": 0.2606530009031446, + "grad_norm": 1.297881689360643, + "learning_rate": 4.207502761075217e-05, + "loss": 0.7541, + "num_input_tokens_seen": 429643648, + "step": 2381 + }, + { + "epoch": 0.2607624729740825, + "grad_norm": 1.2356832307824093, + "learning_rate": 4.206874601805173e-05, + "loss": 0.5879, + "num_input_tokens_seen": 429833824, + "step": 2382 + }, + { + "epoch": 0.26087194504502037, + "grad_norm": 1.28486096546544, + "learning_rate": 4.206246240615127e-05, + "loss": 0.8328, + "num_input_tokens_seen": 429999136, + "step": 2383 + }, + { + "epoch": 0.2609814171159583, + "grad_norm": 1.2937498931147582, + "learning_rate": 4.205617677579413e-05, + "loss": 0.8094, + "num_input_tokens_seen": 430203200, + "step": 2384 + }, + { + "epoch": 0.2610908891868962, + "grad_norm": 1.1989027967717725, + "learning_rate": 4.2049889127723876e-05, + "loss": 0.6311, + "num_input_tokens_seen": 430389792, + "step": 2385 + }, + { + "epoch": 0.2612003612578341, + "grad_norm": 1.1384310590387465, + "learning_rate": 4.204359946268432e-05, + "loss": 0.5677, + "num_input_tokens_seen": 430579520, + "step": 2386 + }, + { + "epoch": 0.261309833328772, + "grad_norm": 1.353841987224983, + "learning_rate": 4.203730778141955e-05, + "loss": 0.8018, + "num_input_tokens_seen": 430761184, + "step": 2387 + }, + { + "epoch": 0.2614193053997099, + "grad_norm": 1.1278072934811372, + "learning_rate": 4.203101408467383e-05, + "loss": 0.5371, + "num_input_tokens_seen": 430934112, + "step": 2388 + }, + { + "epoch": 0.2615287774706478, + "grad_norm": 1.3312572120865152, + "learning_rate": 4.2024718373191705e-05, + "loss": 0.9066, + "num_input_tokens_seen": 431126304, + "step": 2389 + }, + { + "epoch": 0.2616382495415857, + "grad_norm": 1.3275409985262017, + "learning_rate": 4.201842064771794e-05, + "loss": 0.7265, + "num_input_tokens_seen": 431313344, + "step": 2390 + }, + { + "epoch": 0.26174772161252363, + "grad_norm": 1.1992745650532775, + "learning_rate": 4.2012120908997546e-05, + "loss": 0.7215, + "num_input_tokens_seen": 431490752, + "step": 2391 + }, + { + "epoch": 0.2618571936834615, + "grad_norm": 1.2627846670729703, + "learning_rate": 4.200581915777577e-05, + "loss": 0.7194, + "num_input_tokens_seen": 431678688, + "step": 2392 + }, + { + "epoch": 0.2619666657543994, + "grad_norm": 1.1432753302335006, + "learning_rate": 4.19995153947981e-05, + "loss": 0.5698, + "num_input_tokens_seen": 431856768, + "step": 2393 + }, + { + "epoch": 0.2620761378253373, + "grad_norm": 1.2780607585105093, + "learning_rate": 4.1993209620810255e-05, + "loss": 0.6913, + "num_input_tokens_seen": 432067328, + "step": 2394 + }, + { + "epoch": 0.2621856098962752, + "grad_norm": 1.2190360687214599, + "learning_rate": 4.19869018365582e-05, + "loss": 0.7507, + "num_input_tokens_seen": 432260640, + "step": 2395 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 1.1709727056141812, + "learning_rate": 4.198059204278813e-05, + "loss": 0.5902, + "num_input_tokens_seen": 432423264, + "step": 2396 + }, + { + "epoch": 0.262404554038151, + "grad_norm": 1.23118570421296, + "learning_rate": 4.1974280240246477e-05, + "loss": 0.6058, + "num_input_tokens_seen": 432619264, + "step": 2397 + }, + { + "epoch": 0.2625140261090889, + "grad_norm": 1.2196452447372776, + "learning_rate": 4.196796642967992e-05, + "loss": 0.6659, + "num_input_tokens_seen": 432791744, + "step": 2398 + }, + { + "epoch": 0.26262349818002684, + "grad_norm": 1.4836063201677112, + "learning_rate": 4.1961650611835376e-05, + "loss": 1.0284, + "num_input_tokens_seen": 432996704, + "step": 2399 + }, + { + "epoch": 0.2627329702509647, + "grad_norm": 1.241499704404712, + "learning_rate": 4.195533278745999e-05, + "loss": 0.6089, + "num_input_tokens_seen": 433174336, + "step": 2400 + }, + { + "epoch": 0.2628424423219026, + "grad_norm": 1.285455083453299, + "learning_rate": 4.194901295730115e-05, + "loss": 0.6598, + "num_input_tokens_seen": 433387584, + "step": 2401 + }, + { + "epoch": 0.26295191439284055, + "grad_norm": 1.178975528327178, + "learning_rate": 4.1942691122106484e-05, + "loss": 0.6033, + "num_input_tokens_seen": 433560960, + "step": 2402 + }, + { + "epoch": 0.2630613864637784, + "grad_norm": 1.251809479421965, + "learning_rate": 4.1936367282623836e-05, + "loss": 0.6755, + "num_input_tokens_seen": 433741952, + "step": 2403 + }, + { + "epoch": 0.26317085853471633, + "grad_norm": 1.3002209072221207, + "learning_rate": 4.1930041439601316e-05, + "loss": 0.8841, + "num_input_tokens_seen": 433956096, + "step": 2404 + }, + { + "epoch": 0.26328033060565426, + "grad_norm": 1.251091575844587, + "learning_rate": 4.192371359378726e-05, + "loss": 0.6934, + "num_input_tokens_seen": 434138208, + "step": 2405 + }, + { + "epoch": 0.2633898026765921, + "grad_norm": 1.189635915757652, + "learning_rate": 4.191738374593024e-05, + "loss": 0.6385, + "num_input_tokens_seen": 434327264, + "step": 2406 + }, + { + "epoch": 0.26349927474753004, + "grad_norm": 1.217258812672477, + "learning_rate": 4.191105189677906e-05, + "loss": 0.5218, + "num_input_tokens_seen": 434476224, + "step": 2407 + }, + { + "epoch": 0.26360874681846796, + "grad_norm": 1.2854442331747313, + "learning_rate": 4.190471804708278e-05, + "loss": 0.861, + "num_input_tokens_seen": 434660800, + "step": 2408 + }, + { + "epoch": 0.26371821888940583, + "grad_norm": 1.1461597613441483, + "learning_rate": 4.189838219759066e-05, + "loss": 0.5958, + "num_input_tokens_seen": 434808192, + "step": 2409 + }, + { + "epoch": 0.26382769096034375, + "grad_norm": 1.3584017722707988, + "learning_rate": 4.1892044349052234e-05, + "loss": 0.8483, + "num_input_tokens_seen": 434996576, + "step": 2410 + }, + { + "epoch": 0.2639371630312816, + "grad_norm": 1.3342416719030565, + "learning_rate": 4.1885704502217255e-05, + "loss": 0.9089, + "num_input_tokens_seen": 435221248, + "step": 2411 + }, + { + "epoch": 0.26404663510221954, + "grad_norm": 1.398078190340092, + "learning_rate": 4.187936265783571e-05, + "loss": 0.6613, + "num_input_tokens_seen": 435386112, + "step": 2412 + }, + { + "epoch": 0.26415610717315746, + "grad_norm": 1.2768086937237906, + "learning_rate": 4.187301881665783e-05, + "loss": 0.7273, + "num_input_tokens_seen": 435560832, + "step": 2413 + }, + { + "epoch": 0.2642655792440953, + "grad_norm": 1.3420850508184536, + "learning_rate": 4.1866672979434084e-05, + "loss": 0.8307, + "num_input_tokens_seen": 435733312, + "step": 2414 + }, + { + "epoch": 0.26437505131503325, + "grad_norm": 1.180928114777798, + "learning_rate": 4.1860325146915166e-05, + "loss": 0.7134, + "num_input_tokens_seen": 435938048, + "step": 2415 + }, + { + "epoch": 0.26448452338597117, + "grad_norm": 1.3389608070814043, + "learning_rate": 4.1853975319852015e-05, + "loss": 0.7758, + "num_input_tokens_seen": 436129568, + "step": 2416 + }, + { + "epoch": 0.26459399545690904, + "grad_norm": 1.2289815483135929, + "learning_rate": 4.18476234989958e-05, + "loss": 0.6861, + "num_input_tokens_seen": 436316608, + "step": 2417 + }, + { + "epoch": 0.26470346752784696, + "grad_norm": 1.213633925351378, + "learning_rate": 4.184126968509794e-05, + "loss": 0.6837, + "num_input_tokens_seen": 436530304, + "step": 2418 + }, + { + "epoch": 0.2648129395987849, + "grad_norm": 1.3016797419358193, + "learning_rate": 4.183491387891007e-05, + "loss": 0.7014, + "num_input_tokens_seen": 436719808, + "step": 2419 + }, + { + "epoch": 0.26492241166972275, + "grad_norm": 1.1456228525294578, + "learning_rate": 4.1828556081184064e-05, + "loss": 0.8221, + "num_input_tokens_seen": 436912672, + "step": 2420 + }, + { + "epoch": 0.26503188374066067, + "grad_norm": 1.1635479397833737, + "learning_rate": 4.1822196292672045e-05, + "loss": 0.5469, + "num_input_tokens_seen": 437094336, + "step": 2421 + }, + { + "epoch": 0.2651413558115986, + "grad_norm": 1.1599572521587302, + "learning_rate": 4.1815834514126366e-05, + "loss": 0.5706, + "num_input_tokens_seen": 437259200, + "step": 2422 + }, + { + "epoch": 0.26525082788253646, + "grad_norm": 1.3746469217751511, + "learning_rate": 4.180947074629961e-05, + "loss": 0.8705, + "num_input_tokens_seen": 437459904, + "step": 2423 + }, + { + "epoch": 0.2653602999534744, + "grad_norm": 1.3427182605850918, + "learning_rate": 4.1803104989944594e-05, + "loss": 0.7482, + "num_input_tokens_seen": 437634176, + "step": 2424 + }, + { + "epoch": 0.2654697720244123, + "grad_norm": 1.37733906416764, + "learning_rate": 4.1796737245814396e-05, + "loss": 0.8687, + "num_input_tokens_seen": 437843616, + "step": 2425 + }, + { + "epoch": 0.26557924409535016, + "grad_norm": 1.381497377363457, + "learning_rate": 4.1790367514662276e-05, + "loss": 0.8499, + "num_input_tokens_seen": 438044096, + "step": 2426 + }, + { + "epoch": 0.2656887161662881, + "grad_norm": 1.3989086103467352, + "learning_rate": 4.178399579724178e-05, + "loss": 0.6385, + "num_input_tokens_seen": 438193280, + "step": 2427 + }, + { + "epoch": 0.26579818823722595, + "grad_norm": 1.3239935849486695, + "learning_rate": 4.177762209430667e-05, + "loss": 0.6204, + "num_input_tokens_seen": 438355904, + "step": 2428 + }, + { + "epoch": 0.2659076603081639, + "grad_norm": 1.2559583752471233, + "learning_rate": 4.177124640661094e-05, + "loss": 0.5386, + "num_input_tokens_seen": 438549440, + "step": 2429 + }, + { + "epoch": 0.2660171323791018, + "grad_norm": 1.3594982047537385, + "learning_rate": 4.176486873490882e-05, + "loss": 0.7279, + "num_input_tokens_seen": 438728864, + "step": 2430 + }, + { + "epoch": 0.26612660445003966, + "grad_norm": 1.3828578014819775, + "learning_rate": 4.1758489079954774e-05, + "loss": 0.8481, + "num_input_tokens_seen": 438902464, + "step": 2431 + }, + { + "epoch": 0.2662360765209776, + "grad_norm": 1.3671875435965395, + "learning_rate": 4.1752107442503505e-05, + "loss": 0.6231, + "num_input_tokens_seen": 439053888, + "step": 2432 + }, + { + "epoch": 0.2663455485919155, + "grad_norm": 1.204151755815913, + "learning_rate": 4.174572382330996e-05, + "loss": 0.7068, + "num_input_tokens_seen": 439234432, + "step": 2433 + }, + { + "epoch": 0.26645502066285337, + "grad_norm": 1.1656192891898893, + "learning_rate": 4.1739338223129294e-05, + "loss": 0.7253, + "num_input_tokens_seen": 439432896, + "step": 2434 + }, + { + "epoch": 0.2665644927337913, + "grad_norm": 1.2700523832736803, + "learning_rate": 4.1732950642716916e-05, + "loss": 0.7188, + "num_input_tokens_seen": 439590816, + "step": 2435 + }, + { + "epoch": 0.2666739648047292, + "grad_norm": 1.323453158631631, + "learning_rate": 4.1726561082828466e-05, + "loss": 0.9017, + "num_input_tokens_seen": 439791072, + "step": 2436 + }, + { + "epoch": 0.2667834368756671, + "grad_norm": 1.239023653939162, + "learning_rate": 4.172016954421981e-05, + "loss": 0.7513, + "num_input_tokens_seen": 439971840, + "step": 2437 + }, + { + "epoch": 0.266892908946605, + "grad_norm": 1.2451577333318697, + "learning_rate": 4.171377602764707e-05, + "loss": 0.6477, + "num_input_tokens_seen": 440125280, + "step": 2438 + }, + { + "epoch": 0.2670023810175429, + "grad_norm": 1.3225307452372435, + "learning_rate": 4.170738053386657e-05, + "loss": 0.6648, + "num_input_tokens_seen": 440295296, + "step": 2439 + }, + { + "epoch": 0.2671118530884808, + "grad_norm": 1.1969200245329128, + "learning_rate": 4.170098306363489e-05, + "loss": 0.7852, + "num_input_tokens_seen": 440485472, + "step": 2440 + }, + { + "epoch": 0.2672213251594187, + "grad_norm": 1.2513902086022113, + "learning_rate": 4.169458361770885e-05, + "loss": 0.6549, + "num_input_tokens_seen": 440642944, + "step": 2441 + }, + { + "epoch": 0.26733079723035663, + "grad_norm": 1.2774618485011267, + "learning_rate": 4.168818219684548e-05, + "loss": 0.6812, + "num_input_tokens_seen": 440801760, + "step": 2442 + }, + { + "epoch": 0.2674402693012945, + "grad_norm": 1.2361603392514982, + "learning_rate": 4.168177880180205e-05, + "loss": 0.6429, + "num_input_tokens_seen": 440986336, + "step": 2443 + }, + { + "epoch": 0.2675497413722324, + "grad_norm": 1.4298071472465301, + "learning_rate": 4.167537343333608e-05, + "loss": 0.7478, + "num_input_tokens_seen": 441152320, + "step": 2444 + }, + { + "epoch": 0.2676592134431703, + "grad_norm": 1.3127752651209028, + "learning_rate": 4.166896609220532e-05, + "loss": 0.7742, + "num_input_tokens_seen": 441335328, + "step": 2445 + }, + { + "epoch": 0.2677686855141082, + "grad_norm": 1.2283176571912386, + "learning_rate": 4.1662556779167735e-05, + "loss": 0.5799, + "num_input_tokens_seen": 441495488, + "step": 2446 + }, + { + "epoch": 0.26787815758504613, + "grad_norm": 1.2641536495885564, + "learning_rate": 4.165614549498152e-05, + "loss": 0.6634, + "num_input_tokens_seen": 441693280, + "step": 2447 + }, + { + "epoch": 0.267987629655984, + "grad_norm": 1.322244167858121, + "learning_rate": 4.164973224040516e-05, + "loss": 0.7197, + "num_input_tokens_seen": 441866656, + "step": 2448 + }, + { + "epoch": 0.2680971017269219, + "grad_norm": 1.2032502839884451, + "learning_rate": 4.164331701619729e-05, + "loss": 0.7235, + "num_input_tokens_seen": 442048768, + "step": 2449 + }, + { + "epoch": 0.26820657379785984, + "grad_norm": 1.331309909099366, + "learning_rate": 4.1636899823116835e-05, + "loss": 0.6798, + "num_input_tokens_seen": 442212960, + "step": 2450 + }, + { + "epoch": 0.2683160458687977, + "grad_norm": 1.2649978685172598, + "learning_rate": 4.1630480661922935e-05, + "loss": 0.7217, + "num_input_tokens_seen": 442365728, + "step": 2451 + }, + { + "epoch": 0.2684255179397356, + "grad_norm": 1.2223565950124715, + "learning_rate": 4.162405953337497e-05, + "loss": 0.7866, + "num_input_tokens_seen": 442572704, + "step": 2452 + }, + { + "epoch": 0.26853499001067355, + "grad_norm": 1.1163705893521136, + "learning_rate": 4.161763643823253e-05, + "loss": 0.5574, + "num_input_tokens_seen": 442731072, + "step": 2453 + }, + { + "epoch": 0.2686444620816114, + "grad_norm": 1.260043044187605, + "learning_rate": 4.1611211377255473e-05, + "loss": 0.6538, + "num_input_tokens_seen": 442924160, + "step": 2454 + }, + { + "epoch": 0.26875393415254933, + "grad_norm": 1.1808769343386463, + "learning_rate": 4.1604784351203876e-05, + "loss": 0.7989, + "num_input_tokens_seen": 443128896, + "step": 2455 + }, + { + "epoch": 0.26886340622348726, + "grad_norm": 1.2676130151869545, + "learning_rate": 4.1598355360838016e-05, + "loss": 0.6935, + "num_input_tokens_seen": 443291296, + "step": 2456 + }, + { + "epoch": 0.2689728782944251, + "grad_norm": 1.3491904992976642, + "learning_rate": 4.159192440691846e-05, + "loss": 1.0139, + "num_input_tokens_seen": 443492672, + "step": 2457 + }, + { + "epoch": 0.26908235036536304, + "grad_norm": 1.276954101487443, + "learning_rate": 4.1585491490205965e-05, + "loss": 0.6998, + "num_input_tokens_seen": 443674560, + "step": 2458 + }, + { + "epoch": 0.26919182243630096, + "grad_norm": 1.2458734108252698, + "learning_rate": 4.157905661146152e-05, + "loss": 0.7053, + "num_input_tokens_seen": 443854432, + "step": 2459 + }, + { + "epoch": 0.26930129450723883, + "grad_norm": 1.1763196453917024, + "learning_rate": 4.157261977144638e-05, + "loss": 0.7108, + "num_input_tokens_seen": 444051104, + "step": 2460 + }, + { + "epoch": 0.26941076657817675, + "grad_norm": 1.1914093517825914, + "learning_rate": 4.1566180970922006e-05, + "loss": 0.6096, + "num_input_tokens_seen": 444240608, + "step": 2461 + }, + { + "epoch": 0.2695202386491146, + "grad_norm": 1.2053957726076114, + "learning_rate": 4.155974021065009e-05, + "loss": 0.7337, + "num_input_tokens_seen": 444428096, + "step": 2462 + }, + { + "epoch": 0.26962971072005254, + "grad_norm": 1.1964467470215951, + "learning_rate": 4.1553297491392564e-05, + "loss": 0.71, + "num_input_tokens_seen": 444596992, + "step": 2463 + }, + { + "epoch": 0.26973918279099046, + "grad_norm": 1.0350054361136125, + "learning_rate": 4.154685281391158e-05, + "loss": 0.6077, + "num_input_tokens_seen": 444800608, + "step": 2464 + }, + { + "epoch": 0.2698486548619283, + "grad_norm": 1.3693672895889113, + "learning_rate": 4.1540406178969553e-05, + "loss": 0.8101, + "num_input_tokens_seen": 444963456, + "step": 2465 + }, + { + "epoch": 0.26995812693286625, + "grad_norm": 1.1739236853066515, + "learning_rate": 4.153395758732909e-05, + "loss": 0.5022, + "num_input_tokens_seen": 445115104, + "step": 2466 + }, + { + "epoch": 0.27006759900380417, + "grad_norm": 1.4147636403533264, + "learning_rate": 4.152750703975305e-05, + "loss": 0.8127, + "num_input_tokens_seen": 445315136, + "step": 2467 + }, + { + "epoch": 0.27017707107474204, + "grad_norm": 1.3984980649992698, + "learning_rate": 4.152105453700452e-05, + "loss": 0.8784, + "num_input_tokens_seen": 445531072, + "step": 2468 + }, + { + "epoch": 0.27028654314567996, + "grad_norm": 1.301257805118367, + "learning_rate": 4.151460007984683e-05, + "loss": 0.7166, + "num_input_tokens_seen": 445703104, + "step": 2469 + }, + { + "epoch": 0.2703960152166179, + "grad_norm": 1.2534462153213313, + "learning_rate": 4.150814366904352e-05, + "loss": 0.7559, + "num_input_tokens_seen": 445850720, + "step": 2470 + }, + { + "epoch": 0.27050548728755575, + "grad_norm": 1.2186531126195839, + "learning_rate": 4.150168530535837e-05, + "loss": 0.7502, + "num_input_tokens_seen": 446035744, + "step": 2471 + }, + { + "epoch": 0.27061495935849367, + "grad_norm": 1.3173589230598954, + "learning_rate": 4.149522498955539e-05, + "loss": 0.6764, + "num_input_tokens_seen": 446230624, + "step": 2472 + }, + { + "epoch": 0.2707244314294316, + "grad_norm": 1.1964982079613253, + "learning_rate": 4.148876272239883e-05, + "loss": 0.7967, + "num_input_tokens_seen": 446413856, + "step": 2473 + }, + { + "epoch": 0.27083390350036946, + "grad_norm": 1.1073035659700925, + "learning_rate": 4.148229850465316e-05, + "loss": 0.6252, + "num_input_tokens_seen": 446599104, + "step": 2474 + }, + { + "epoch": 0.2709433755713074, + "grad_norm": 1.1851419326028056, + "learning_rate": 4.1475832337083085e-05, + "loss": 0.6244, + "num_input_tokens_seen": 446778304, + "step": 2475 + }, + { + "epoch": 0.2710528476422453, + "grad_norm": 1.2167068864541126, + "learning_rate": 4.1469364220453546e-05, + "loss": 0.714, + "num_input_tokens_seen": 446964224, + "step": 2476 + }, + { + "epoch": 0.27116231971318316, + "grad_norm": 1.4496891872480886, + "learning_rate": 4.1462894155529706e-05, + "loss": 0.9179, + "num_input_tokens_seen": 447130880, + "step": 2477 + }, + { + "epoch": 0.2712717917841211, + "grad_norm": 1.1741601153954209, + "learning_rate": 4.145642214307695e-05, + "loss": 0.6579, + "num_input_tokens_seen": 447331136, + "step": 2478 + }, + { + "epoch": 0.27138126385505895, + "grad_norm": 1.112847601774286, + "learning_rate": 4.144994818386092e-05, + "loss": 0.5765, + "num_input_tokens_seen": 447512576, + "step": 2479 + }, + { + "epoch": 0.2714907359259969, + "grad_norm": 1.3255679029140306, + "learning_rate": 4.144347227864747e-05, + "loss": 0.6908, + "num_input_tokens_seen": 447699616, + "step": 2480 + }, + { + "epoch": 0.2716002079969348, + "grad_norm": 1.3556861812983827, + "learning_rate": 4.1436994428202667e-05, + "loss": 0.7807, + "num_input_tokens_seen": 447861120, + "step": 2481 + }, + { + "epoch": 0.27170968006787266, + "grad_norm": 1.4238239387691374, + "learning_rate": 4.143051463329286e-05, + "loss": 0.6498, + "num_input_tokens_seen": 448017696, + "step": 2482 + }, + { + "epoch": 0.2718191521388106, + "grad_norm": 1.3153236533335813, + "learning_rate": 4.1424032894684584e-05, + "loss": 0.6461, + "num_input_tokens_seen": 448163520, + "step": 2483 + }, + { + "epoch": 0.2719286242097485, + "grad_norm": 1.2198454995116526, + "learning_rate": 4.141754921314461e-05, + "loss": 0.5423, + "num_input_tokens_seen": 448348768, + "step": 2484 + }, + { + "epoch": 0.27203809628068637, + "grad_norm": 1.316756114255298, + "learning_rate": 4.141106358943995e-05, + "loss": 0.8305, + "num_input_tokens_seen": 448559104, + "step": 2485 + }, + { + "epoch": 0.2721475683516243, + "grad_norm": 1.4778736892845383, + "learning_rate": 4.140457602433784e-05, + "loss": 0.7723, + "num_input_tokens_seen": 448746816, + "step": 2486 + }, + { + "epoch": 0.2722570404225622, + "grad_norm": 1.3042607780275903, + "learning_rate": 4.139808651860574e-05, + "loss": 0.7453, + "num_input_tokens_seen": 448911232, + "step": 2487 + }, + { + "epoch": 0.2723665124935001, + "grad_norm": 1.3333336065212607, + "learning_rate": 4.139159507301136e-05, + "loss": 0.5576, + "num_input_tokens_seen": 449090880, + "step": 2488 + }, + { + "epoch": 0.272475984564438, + "grad_norm": 1.3687791986704172, + "learning_rate": 4.138510168832261e-05, + "loss": 0.8987, + "num_input_tokens_seen": 449277472, + "step": 2489 + }, + { + "epoch": 0.2725854566353759, + "grad_norm": 1.3153049378688633, + "learning_rate": 4.137860636530766e-05, + "loss": 0.7678, + "num_input_tokens_seen": 449475936, + "step": 2490 + }, + { + "epoch": 0.2726949287063138, + "grad_norm": 1.2423795635562918, + "learning_rate": 4.1372109104734886e-05, + "loss": 0.7396, + "num_input_tokens_seen": 449685152, + "step": 2491 + }, + { + "epoch": 0.2728044007772517, + "grad_norm": 1.3225476458643597, + "learning_rate": 4.1365609907372905e-05, + "loss": 0.6865, + "num_input_tokens_seen": 449833664, + "step": 2492 + }, + { + "epoch": 0.27291387284818963, + "grad_norm": 1.2001055690899674, + "learning_rate": 4.135910877399055e-05, + "loss": 0.6291, + "num_input_tokens_seen": 450011968, + "step": 2493 + }, + { + "epoch": 0.2730233449191275, + "grad_norm": 1.159578584137454, + "learning_rate": 4.13526057053569e-05, + "loss": 0.6542, + "num_input_tokens_seen": 450195424, + "step": 2494 + }, + { + "epoch": 0.2731328169900654, + "grad_norm": 1.2427573186658505, + "learning_rate": 4.134610070224127e-05, + "loss": 0.7338, + "num_input_tokens_seen": 450359616, + "step": 2495 + }, + { + "epoch": 0.2732422890610033, + "grad_norm": 1.186360565435354, + "learning_rate": 4.133959376541317e-05, + "loss": 0.6666, + "num_input_tokens_seen": 450537248, + "step": 2496 + }, + { + "epoch": 0.2733517611319412, + "grad_norm": 1.306901982602183, + "learning_rate": 4.133308489564236e-05, + "loss": 0.6672, + "num_input_tokens_seen": 450717120, + "step": 2497 + }, + { + "epoch": 0.27346123320287913, + "grad_norm": 1.4294161513008916, + "learning_rate": 4.132657409369883e-05, + "loss": 0.8104, + "num_input_tokens_seen": 450876832, + "step": 2498 + }, + { + "epoch": 0.273570705273817, + "grad_norm": 1.3005441902129724, + "learning_rate": 4.1320061360352804e-05, + "loss": 0.8793, + "num_input_tokens_seen": 451063872, + "step": 2499 + }, + { + "epoch": 0.2736801773447549, + "grad_norm": 1.2514019733403032, + "learning_rate": 4.13135466963747e-05, + "loss": 0.7204, + "num_input_tokens_seen": 451229856, + "step": 2500 + }, + { + "epoch": 0.27378964941569284, + "grad_norm": 1.2478355742245422, + "learning_rate": 4.130703010253523e-05, + "loss": 0.6425, + "num_input_tokens_seen": 451394272, + "step": 2501 + }, + { + "epoch": 0.2738991214866307, + "grad_norm": 1.4361552707384817, + "learning_rate": 4.130051157960526e-05, + "loss": 0.6298, + "num_input_tokens_seen": 451577952, + "step": 2502 + }, + { + "epoch": 0.2740085935575686, + "grad_norm": 1.1573880754904975, + "learning_rate": 4.1293991128355934e-05, + "loss": 0.5696, + "num_input_tokens_seen": 451759392, + "step": 2503 + }, + { + "epoch": 0.27411806562850655, + "grad_norm": 1.102175116127384, + "learning_rate": 4.12874687495586e-05, + "loss": 0.6699, + "num_input_tokens_seen": 451978240, + "step": 2504 + }, + { + "epoch": 0.2742275376994444, + "grad_norm": 1.3124934605026817, + "learning_rate": 4.128094444398486e-05, + "loss": 0.7837, + "num_input_tokens_seen": 452204480, + "step": 2505 + }, + { + "epoch": 0.27433700977038233, + "grad_norm": 1.2028574955790545, + "learning_rate": 4.127441821240651e-05, + "loss": 0.7288, + "num_input_tokens_seen": 452376960, + "step": 2506 + }, + { + "epoch": 0.27444648184132026, + "grad_norm": 1.2872387556191336, + "learning_rate": 4.12678900555956e-05, + "loss": 0.6966, + "num_input_tokens_seen": 452565344, + "step": 2507 + }, + { + "epoch": 0.2745559539122581, + "grad_norm": 1.2360039115646342, + "learning_rate": 4.1261359974324387e-05, + "loss": 0.5866, + "num_input_tokens_seen": 452757984, + "step": 2508 + }, + { + "epoch": 0.27466542598319604, + "grad_norm": 1.3033474190350016, + "learning_rate": 4.1254827969365387e-05, + "loss": 0.913, + "num_input_tokens_seen": 452954880, + "step": 2509 + }, + { + "epoch": 0.27477489805413396, + "grad_norm": 1.2435680371312816, + "learning_rate": 4.1248294041491306e-05, + "loss": 0.6852, + "num_input_tokens_seen": 453153792, + "step": 2510 + }, + { + "epoch": 0.27488437012507183, + "grad_norm": 1.363807205133276, + "learning_rate": 4.12417581914751e-05, + "loss": 0.7437, + "num_input_tokens_seen": 453320224, + "step": 2511 + }, + { + "epoch": 0.27499384219600975, + "grad_norm": 1.2664166082373947, + "learning_rate": 4.123522042008996e-05, + "loss": 0.7121, + "num_input_tokens_seen": 453485760, + "step": 2512 + }, + { + "epoch": 0.2751033142669476, + "grad_norm": 1.268287354478459, + "learning_rate": 4.122868072810927e-05, + "loss": 0.7123, + "num_input_tokens_seen": 453683776, + "step": 2513 + }, + { + "epoch": 0.27521278633788554, + "grad_norm": 1.1581405442620822, + "learning_rate": 4.122213911630667e-05, + "loss": 0.644, + "num_input_tokens_seen": 453868576, + "step": 2514 + }, + { + "epoch": 0.27532225840882346, + "grad_norm": 1.2472209555646152, + "learning_rate": 4.121559558545603e-05, + "loss": 0.7475, + "num_input_tokens_seen": 454073760, + "step": 2515 + }, + { + "epoch": 0.2754317304797613, + "grad_norm": 1.3036038586936605, + "learning_rate": 4.120905013633143e-05, + "loss": 0.6714, + "num_input_tokens_seen": 454247136, + "step": 2516 + }, + { + "epoch": 0.27554120255069925, + "grad_norm": 1.4265687204916833, + "learning_rate": 4.1202502769707184e-05, + "loss": 0.7585, + "num_input_tokens_seen": 454406400, + "step": 2517 + }, + { + "epoch": 0.27565067462163717, + "grad_norm": 1.363947314875372, + "learning_rate": 4.119595348635784e-05, + "loss": 0.6338, + "num_input_tokens_seen": 454582912, + "step": 2518 + }, + { + "epoch": 0.27576014669257504, + "grad_norm": 1.2913532133109924, + "learning_rate": 4.118940228705815e-05, + "loss": 0.8511, + "num_input_tokens_seen": 454765472, + "step": 2519 + }, + { + "epoch": 0.27586961876351296, + "grad_norm": 1.304282531019626, + "learning_rate": 4.1182849172583135e-05, + "loss": 0.9503, + "num_input_tokens_seen": 454977152, + "step": 2520 + }, + { + "epoch": 0.2759790908344509, + "grad_norm": 1.1863411217794975, + "learning_rate": 4.117629414370799e-05, + "loss": 0.6124, + "num_input_tokens_seen": 455160832, + "step": 2521 + }, + { + "epoch": 0.27608856290538875, + "grad_norm": 1.206934310822745, + "learning_rate": 4.116973720120817e-05, + "loss": 0.7638, + "num_input_tokens_seen": 455349440, + "step": 2522 + }, + { + "epoch": 0.27619803497632667, + "grad_norm": 1.1275258319712191, + "learning_rate": 4.116317834585935e-05, + "loss": 0.5594, + "num_input_tokens_seen": 455513184, + "step": 2523 + }, + { + "epoch": 0.2763075070472646, + "grad_norm": 1.2043046988778454, + "learning_rate": 4.115661757843743e-05, + "loss": 0.7322, + "num_input_tokens_seen": 455706272, + "step": 2524 + }, + { + "epoch": 0.27641697911820245, + "grad_norm": 1.1137199742029953, + "learning_rate": 4.115005489971854e-05, + "loss": 0.7606, + "num_input_tokens_seen": 455899136, + "step": 2525 + }, + { + "epoch": 0.2765264511891404, + "grad_norm": 1.317037368852524, + "learning_rate": 4.114349031047903e-05, + "loss": 0.8265, + "num_input_tokens_seen": 456079680, + "step": 2526 + }, + { + "epoch": 0.2766359232600783, + "grad_norm": 1.294302169246487, + "learning_rate": 4.1136923811495475e-05, + "loss": 0.7435, + "num_input_tokens_seen": 456276128, + "step": 2527 + }, + { + "epoch": 0.27674539533101616, + "grad_norm": 1.195436957368714, + "learning_rate": 4.1130355403544675e-05, + "loss": 0.5567, + "num_input_tokens_seen": 456467872, + "step": 2528 + }, + { + "epoch": 0.2768548674019541, + "grad_norm": 1.3138493459144263, + "learning_rate": 4.1123785087403676e-05, + "loss": 0.6926, + "num_input_tokens_seen": 456671264, + "step": 2529 + }, + { + "epoch": 0.27696433947289195, + "grad_norm": 1.269290091277573, + "learning_rate": 4.111721286384972e-05, + "loss": 0.6822, + "num_input_tokens_seen": 456855616, + "step": 2530 + }, + { + "epoch": 0.2770738115438299, + "grad_norm": 1.2820203372693115, + "learning_rate": 4.1110638733660294e-05, + "loss": 0.6777, + "num_input_tokens_seen": 457012864, + "step": 2531 + }, + { + "epoch": 0.2771832836147678, + "grad_norm": 1.3307243491407212, + "learning_rate": 4.110406269761311e-05, + "loss": 1.0981, + "num_input_tokens_seen": 457236416, + "step": 2532 + }, + { + "epoch": 0.27729275568570566, + "grad_norm": 1.3141107212431369, + "learning_rate": 4.109748475648609e-05, + "loss": 0.6851, + "num_input_tokens_seen": 457399040, + "step": 2533 + }, + { + "epoch": 0.2774022277566436, + "grad_norm": 1.2742077890716392, + "learning_rate": 4.109090491105739e-05, + "loss": 0.6021, + "num_input_tokens_seen": 457568160, + "step": 2534 + }, + { + "epoch": 0.2775116998275815, + "grad_norm": 1.2649927797224532, + "learning_rate": 4.108432316210541e-05, + "loss": 0.6856, + "num_input_tokens_seen": 457742208, + "step": 2535 + }, + { + "epoch": 0.27762117189851937, + "grad_norm": 1.4717533166561918, + "learning_rate": 4.107773951040874e-05, + "loss": 0.9781, + "num_input_tokens_seen": 457935744, + "step": 2536 + }, + { + "epoch": 0.2777306439694573, + "grad_norm": 1.4577740368772465, + "learning_rate": 4.107115395674623e-05, + "loss": 0.9058, + "num_input_tokens_seen": 458108000, + "step": 2537 + }, + { + "epoch": 0.2778401160403952, + "grad_norm": 1.3659075603987199, + "learning_rate": 4.1064566501896925e-05, + "loss": 0.7031, + "num_input_tokens_seen": 458277344, + "step": 2538 + }, + { + "epoch": 0.2779495881113331, + "grad_norm": 1.3295348818343053, + "learning_rate": 4.1057977146640115e-05, + "loss": 0.7866, + "num_input_tokens_seen": 458465504, + "step": 2539 + }, + { + "epoch": 0.278059060182271, + "grad_norm": 1.1275320698137845, + "learning_rate": 4.10513858917553e-05, + "loss": 0.6235, + "num_input_tokens_seen": 458659712, + "step": 2540 + }, + { + "epoch": 0.2781685322532089, + "grad_norm": 1.1045636357167932, + "learning_rate": 4.104479273802222e-05, + "loss": 0.5307, + "num_input_tokens_seen": 458844288, + "step": 2541 + }, + { + "epoch": 0.2782780043241468, + "grad_norm": 1.275885026101736, + "learning_rate": 4.1038197686220837e-05, + "loss": 0.6158, + "num_input_tokens_seen": 459038272, + "step": 2542 + }, + { + "epoch": 0.2783874763950847, + "grad_norm": 1.3072968846352486, + "learning_rate": 4.1031600737131326e-05, + "loss": 0.8242, + "num_input_tokens_seen": 459245696, + "step": 2543 + }, + { + "epoch": 0.27849694846602263, + "grad_norm": 1.2761064889233922, + "learning_rate": 4.102500189153409e-05, + "loss": 0.7983, + "num_input_tokens_seen": 459439008, + "step": 2544 + }, + { + "epoch": 0.2786064205369605, + "grad_norm": 1.1219396290984434, + "learning_rate": 4.1018401150209776e-05, + "loss": 0.5546, + "num_input_tokens_seen": 459610816, + "step": 2545 + }, + { + "epoch": 0.2787158926078984, + "grad_norm": 1.2173000905528184, + "learning_rate": 4.101179851393921e-05, + "loss": 0.5729, + "num_input_tokens_seen": 459783072, + "step": 2546 + }, + { + "epoch": 0.2788253646788363, + "grad_norm": 1.2488531096449693, + "learning_rate": 4.100519398350351e-05, + "loss": 0.5532, + "num_input_tokens_seen": 459951296, + "step": 2547 + }, + { + "epoch": 0.2789348367497742, + "grad_norm": 1.195225382104733, + "learning_rate": 4.099858755968394e-05, + "loss": 0.8026, + "num_input_tokens_seen": 460122880, + "step": 2548 + }, + { + "epoch": 0.27904430882071213, + "grad_norm": 1.274344980991053, + "learning_rate": 4.0991979243262054e-05, + "loss": 0.7547, + "num_input_tokens_seen": 460291552, + "step": 2549 + }, + { + "epoch": 0.27915378089165, + "grad_norm": 1.4047568977703921, + "learning_rate": 4.09853690350196e-05, + "loss": 0.8274, + "num_input_tokens_seen": 460471648, + "step": 2550 + }, + { + "epoch": 0.2792632529625879, + "grad_norm": 1.2093545719757572, + "learning_rate": 4.097875693573854e-05, + "loss": 0.659, + "num_input_tokens_seen": 460651520, + "step": 2551 + }, + { + "epoch": 0.27937272503352584, + "grad_norm": 1.1800983547180153, + "learning_rate": 4.0972142946201086e-05, + "loss": 0.6332, + "num_input_tokens_seen": 460831392, + "step": 2552 + }, + { + "epoch": 0.2794821971044637, + "grad_norm": 1.1504354937729633, + "learning_rate": 4.0965527067189655e-05, + "loss": 0.6982, + "num_input_tokens_seen": 461026272, + "step": 2553 + }, + { + "epoch": 0.2795916691754016, + "grad_norm": 1.2720842250283984, + "learning_rate": 4.095890929948689e-05, + "loss": 0.8168, + "num_input_tokens_seen": 461203904, + "step": 2554 + }, + { + "epoch": 0.27970114124633955, + "grad_norm": 1.3108522880785953, + "learning_rate": 4.095228964387566e-05, + "loss": 0.6863, + "num_input_tokens_seen": 461378624, + "step": 2555 + }, + { + "epoch": 0.2798106133172774, + "grad_norm": 1.2875664092038306, + "learning_rate": 4.094566810113907e-05, + "loss": 0.822, + "num_input_tokens_seen": 461555360, + "step": 2556 + }, + { + "epoch": 0.27992008538821533, + "grad_norm": 1.2428913161867932, + "learning_rate": 4.0939044672060426e-05, + "loss": 0.7704, + "num_input_tokens_seen": 461741056, + "step": 2557 + }, + { + "epoch": 0.28002955745915326, + "grad_norm": 1.159790340994083, + "learning_rate": 4.093241935742326e-05, + "loss": 0.647, + "num_input_tokens_seen": 461936832, + "step": 2558 + }, + { + "epoch": 0.2801390295300911, + "grad_norm": 1.1216689172885606, + "learning_rate": 4.0925792158011345e-05, + "loss": 0.6284, + "num_input_tokens_seen": 462098112, + "step": 2559 + }, + { + "epoch": 0.28024850160102904, + "grad_norm": 1.295952940242453, + "learning_rate": 4.091916307460866e-05, + "loss": 0.679, + "num_input_tokens_seen": 462267680, + "step": 2560 + }, + { + "epoch": 0.28035797367196696, + "grad_norm": 1.321394166903571, + "learning_rate": 4.091253210799942e-05, + "loss": 0.7662, + "num_input_tokens_seen": 462473760, + "step": 2561 + }, + { + "epoch": 0.28046744574290483, + "grad_norm": 1.382769869562135, + "learning_rate": 4.0905899258968046e-05, + "loss": 0.8217, + "num_input_tokens_seen": 462670432, + "step": 2562 + }, + { + "epoch": 0.28057691781384275, + "grad_norm": 1.2205215685297068, + "learning_rate": 4.08992645282992e-05, + "loss": 0.7336, + "num_input_tokens_seen": 462876736, + "step": 2563 + }, + { + "epoch": 0.2806863898847806, + "grad_norm": 1.379044998580003, + "learning_rate": 4.089262791677775e-05, + "loss": 0.7736, + "num_input_tokens_seen": 463031744, + "step": 2564 + }, + { + "epoch": 0.28079586195571854, + "grad_norm": 1.1094428565535508, + "learning_rate": 4.0885989425188806e-05, + "loss": 0.56, + "num_input_tokens_seen": 463221024, + "step": 2565 + }, + { + "epoch": 0.28090533402665646, + "grad_norm": 1.2500052451977355, + "learning_rate": 4.087934905431768e-05, + "loss": 0.7438, + "num_input_tokens_seen": 463410528, + "step": 2566 + }, + { + "epoch": 0.2810148060975943, + "grad_norm": 1.344394085906895, + "learning_rate": 4.087270680494992e-05, + "loss": 0.7131, + "num_input_tokens_seen": 463609440, + "step": 2567 + }, + { + "epoch": 0.28112427816853225, + "grad_norm": 1.2417825482631895, + "learning_rate": 4.086606267787128e-05, + "loss": 0.8143, + "num_input_tokens_seen": 463799616, + "step": 2568 + }, + { + "epoch": 0.28123375023947017, + "grad_norm": 1.2793355921353795, + "learning_rate": 4.0859416673867755e-05, + "loss": 0.6703, + "num_input_tokens_seen": 463951040, + "step": 2569 + }, + { + "epoch": 0.28134322231040804, + "grad_norm": 1.2772061801754901, + "learning_rate": 4.085276879372557e-05, + "loss": 0.8107, + "num_input_tokens_seen": 464143008, + "step": 2570 + }, + { + "epoch": 0.28145269438134596, + "grad_norm": 1.331442604996697, + "learning_rate": 4.084611903823113e-05, + "loss": 0.842, + "num_input_tokens_seen": 464292192, + "step": 2571 + }, + { + "epoch": 0.2815621664522839, + "grad_norm": 1.4299674671980733, + "learning_rate": 4.083946740817111e-05, + "loss": 0.7741, + "num_input_tokens_seen": 464448992, + "step": 2572 + }, + { + "epoch": 0.28167163852322175, + "grad_norm": 1.2336053017866, + "learning_rate": 4.083281390433236e-05, + "loss": 0.9608, + "num_input_tokens_seen": 464658208, + "step": 2573 + }, + { + "epoch": 0.28178111059415967, + "grad_norm": 1.3089306753465437, + "learning_rate": 4.0826158527502e-05, + "loss": 0.7138, + "num_input_tokens_seen": 464808512, + "step": 2574 + }, + { + "epoch": 0.2818905826650976, + "grad_norm": 1.203687462058627, + "learning_rate": 4.081950127846735e-05, + "loss": 0.6454, + "num_input_tokens_seen": 464975392, + "step": 2575 + }, + { + "epoch": 0.28200005473603545, + "grad_norm": 1.2551518608253007, + "learning_rate": 4.081284215801593e-05, + "loss": 0.679, + "num_input_tokens_seen": 465122560, + "step": 2576 + }, + { + "epoch": 0.2821095268069734, + "grad_norm": 1.4038497361498228, + "learning_rate": 4.080618116693551e-05, + "loss": 0.8124, + "num_input_tokens_seen": 465326848, + "step": 2577 + }, + { + "epoch": 0.2822189988779113, + "grad_norm": 1.4887576804935845, + "learning_rate": 4.079951830601408e-05, + "loss": 0.7779, + "num_input_tokens_seen": 465508736, + "step": 2578 + }, + { + "epoch": 0.28232847094884916, + "grad_norm": 1.277763787598431, + "learning_rate": 4.079285357603984e-05, + "loss": 0.8101, + "num_input_tokens_seen": 465688832, + "step": 2579 + }, + { + "epoch": 0.2824379430197871, + "grad_norm": 1.163381031437584, + "learning_rate": 4.078618697780121e-05, + "loss": 0.68, + "num_input_tokens_seen": 465867360, + "step": 2580 + }, + { + "epoch": 0.28254741509072495, + "grad_norm": 1.1615769136164178, + "learning_rate": 4.0779518512086834e-05, + "loss": 0.8635, + "num_input_tokens_seen": 466077248, + "step": 2581 + }, + { + "epoch": 0.2826568871616629, + "grad_norm": 1.0891125047716268, + "learning_rate": 4.077284817968559e-05, + "loss": 0.5325, + "num_input_tokens_seen": 466269216, + "step": 2582 + }, + { + "epoch": 0.2827663592326008, + "grad_norm": 1.3672198046546162, + "learning_rate": 4.0766175981386556e-05, + "loss": 0.6551, + "num_input_tokens_seen": 466404736, + "step": 2583 + }, + { + "epoch": 0.28287583130353866, + "grad_norm": 1.3255631365767564, + "learning_rate": 4.0759501917979035e-05, + "loss": 0.8677, + "num_input_tokens_seen": 466550336, + "step": 2584 + }, + { + "epoch": 0.2829853033744766, + "grad_norm": 1.2747830729977547, + "learning_rate": 4.0752825990252574e-05, + "loss": 0.8391, + "num_input_tokens_seen": 466731328, + "step": 2585 + }, + { + "epoch": 0.2830947754454145, + "grad_norm": 1.2365762904441533, + "learning_rate": 4.074614819899692e-05, + "loss": 0.6522, + "num_input_tokens_seen": 466886112, + "step": 2586 + }, + { + "epoch": 0.28320424751635237, + "grad_norm": 1.3299001162889714, + "learning_rate": 4.073946854500202e-05, + "loss": 0.7409, + "num_input_tokens_seen": 467059488, + "step": 2587 + }, + { + "epoch": 0.2833137195872903, + "grad_norm": 1.2388211586508493, + "learning_rate": 4.073278702905809e-05, + "loss": 0.6733, + "num_input_tokens_seen": 467238240, + "step": 2588 + }, + { + "epoch": 0.2834231916582282, + "grad_norm": 1.3294205125228495, + "learning_rate": 4.0726103651955525e-05, + "loss": 0.4938, + "num_input_tokens_seen": 467406464, + "step": 2589 + }, + { + "epoch": 0.2835326637291661, + "grad_norm": 1.6321502510683195, + "learning_rate": 4.071941841448496e-05, + "loss": 0.9921, + "num_input_tokens_seen": 467623072, + "step": 2590 + }, + { + "epoch": 0.283642135800104, + "grad_norm": 1.191985892951618, + "learning_rate": 4.071273131743725e-05, + "loss": 0.4916, + "num_input_tokens_seen": 467781664, + "step": 2591 + }, + { + "epoch": 0.2837516078710419, + "grad_norm": 1.3495392843283123, + "learning_rate": 4.070604236160347e-05, + "loss": 0.7407, + "num_input_tokens_seen": 467980352, + "step": 2592 + }, + { + "epoch": 0.2838610799419798, + "grad_norm": 1.3912603234304786, + "learning_rate": 4.06993515477749e-05, + "loss": 0.9193, + "num_input_tokens_seen": 468190464, + "step": 2593 + }, + { + "epoch": 0.2839705520129177, + "grad_norm": 1.3967360049738575, + "learning_rate": 4.069265887674304e-05, + "loss": 0.6743, + "num_input_tokens_seen": 468334048, + "step": 2594 + }, + { + "epoch": 0.28408002408385563, + "grad_norm": 1.2434931676789664, + "learning_rate": 4.068596434929965e-05, + "loss": 0.7248, + "num_input_tokens_seen": 468519968, + "step": 2595 + }, + { + "epoch": 0.2841894961547935, + "grad_norm": 1.4198448023808448, + "learning_rate": 4.067926796623666e-05, + "loss": 0.7753, + "num_input_tokens_seen": 468702304, + "step": 2596 + }, + { + "epoch": 0.2842989682257314, + "grad_norm": 1.3572992283143759, + "learning_rate": 4.067256972834623e-05, + "loss": 0.8119, + "num_input_tokens_seen": 468868064, + "step": 2597 + }, + { + "epoch": 0.2844084402966693, + "grad_norm": 1.1248930244447233, + "learning_rate": 4.066586963642078e-05, + "loss": 0.692, + "num_input_tokens_seen": 469047264, + "step": 2598 + }, + { + "epoch": 0.2845179123676072, + "grad_norm": 1.3042677244006105, + "learning_rate": 4.06591676912529e-05, + "loss": 0.6408, + "num_input_tokens_seen": 469197120, + "step": 2599 + }, + { + "epoch": 0.28462738443854513, + "grad_norm": 1.320642328541304, + "learning_rate": 4.065246389363541e-05, + "loss": 0.8267, + "num_input_tokens_seen": 469388864, + "step": 2600 + }, + { + "epoch": 0.284736856509483, + "grad_norm": 1.1327637760601836, + "learning_rate": 4.064575824436136e-05, + "loss": 0.4828, + "num_input_tokens_seen": 469549920, + "step": 2601 + }, + { + "epoch": 0.2848463285804209, + "grad_norm": 1.140434510511054, + "learning_rate": 4.063905074422403e-05, + "loss": 0.6579, + "num_input_tokens_seen": 469704928, + "step": 2602 + }, + { + "epoch": 0.28495580065135884, + "grad_norm": 1.3682781120212908, + "learning_rate": 4.0632341394016884e-05, + "loss": 0.7735, + "num_input_tokens_seen": 469904288, + "step": 2603 + }, + { + "epoch": 0.2850652727222967, + "grad_norm": 1.3017786000016445, + "learning_rate": 4.062563019453364e-05, + "loss": 0.7966, + "num_input_tokens_seen": 470104768, + "step": 2604 + }, + { + "epoch": 0.2851747447932346, + "grad_norm": 1.3097970468976825, + "learning_rate": 4.0618917146568214e-05, + "loss": 0.6812, + "num_input_tokens_seen": 470258432, + "step": 2605 + }, + { + "epoch": 0.28528421686417255, + "grad_norm": 1.3659467462153856, + "learning_rate": 4.061220225091474e-05, + "loss": 0.6137, + "num_input_tokens_seen": 470419712, + "step": 2606 + }, + { + "epoch": 0.2853936889351104, + "grad_norm": 1.3496834807745692, + "learning_rate": 4.06054855083676e-05, + "loss": 0.6157, + "num_input_tokens_seen": 470585024, + "step": 2607 + }, + { + "epoch": 0.28550316100604833, + "grad_norm": 1.0963568547715, + "learning_rate": 4.059876691972135e-05, + "loss": 0.572, + "num_input_tokens_seen": 470768256, + "step": 2608 + }, + { + "epoch": 0.28561263307698626, + "grad_norm": 1.2556263661507694, + "learning_rate": 4.05920464857708e-05, + "loss": 0.7178, + "num_input_tokens_seen": 470948576, + "step": 2609 + }, + { + "epoch": 0.2857221051479241, + "grad_norm": 1.333397262749295, + "learning_rate": 4.0585324207310946e-05, + "loss": 0.7708, + "num_input_tokens_seen": 471140992, + "step": 2610 + }, + { + "epoch": 0.28583157721886204, + "grad_norm": 1.3183991150694487, + "learning_rate": 4.057860008513703e-05, + "loss": 0.6023, + "num_input_tokens_seen": 471283456, + "step": 2611 + }, + { + "epoch": 0.28594104928979996, + "grad_norm": 1.2017175442951578, + "learning_rate": 4.057187412004452e-05, + "loss": 0.7144, + "num_input_tokens_seen": 471489088, + "step": 2612 + }, + { + "epoch": 0.28605052136073783, + "grad_norm": 1.1855227422379047, + "learning_rate": 4.056514631282906e-05, + "loss": 0.5612, + "num_input_tokens_seen": 471686432, + "step": 2613 + }, + { + "epoch": 0.28615999343167575, + "grad_norm": 1.2782092414086152, + "learning_rate": 4.055841666428655e-05, + "loss": 0.8474, + "num_input_tokens_seen": 471893856, + "step": 2614 + }, + { + "epoch": 0.2862694655026136, + "grad_norm": 1.2329318144756092, + "learning_rate": 4.0551685175213094e-05, + "loss": 0.5951, + "num_input_tokens_seen": 472048864, + "step": 2615 + }, + { + "epoch": 0.28637893757355154, + "grad_norm": 1.2270690940889988, + "learning_rate": 4.0544951846405e-05, + "loss": 0.6453, + "num_input_tokens_seen": 472209920, + "step": 2616 + }, + { + "epoch": 0.28648840964448946, + "grad_norm": 1.3084497628777318, + "learning_rate": 4.053821667865883e-05, + "loss": 0.6488, + "num_input_tokens_seen": 472371648, + "step": 2617 + }, + { + "epoch": 0.2865978817154273, + "grad_norm": 1.3395717333602366, + "learning_rate": 4.053147967277133e-05, + "loss": 0.7562, + "num_input_tokens_seen": 472534944, + "step": 2618 + }, + { + "epoch": 0.28670735378636525, + "grad_norm": 1.413470613309752, + "learning_rate": 4.052474082953948e-05, + "loss": 0.6095, + "num_input_tokens_seen": 472691520, + "step": 2619 + }, + { + "epoch": 0.28681682585730317, + "grad_norm": 1.2038624967809657, + "learning_rate": 4.051800014976046e-05, + "loss": 0.6513, + "num_input_tokens_seen": 472858400, + "step": 2620 + }, + { + "epoch": 0.28692629792824104, + "grad_norm": 1.2344491670756434, + "learning_rate": 4.051125763423169e-05, + "loss": 0.6595, + "num_input_tokens_seen": 473011616, + "step": 2621 + }, + { + "epoch": 0.28703576999917896, + "grad_norm": 1.1291797662986285, + "learning_rate": 4.050451328375079e-05, + "loss": 0.6963, + "num_input_tokens_seen": 473207616, + "step": 2622 + }, + { + "epoch": 0.2871452420701169, + "grad_norm": 1.2264574819909293, + "learning_rate": 4.0497767099115615e-05, + "loss": 0.8038, + "num_input_tokens_seen": 473411904, + "step": 2623 + }, + { + "epoch": 0.28725471414105475, + "grad_norm": 1.2441122629714638, + "learning_rate": 4.0491019081124216e-05, + "loss": 0.6942, + "num_input_tokens_seen": 473600288, + "step": 2624 + }, + { + "epoch": 0.28736418621199267, + "grad_norm": 1.1786496479120474, + "learning_rate": 4.048426923057488e-05, + "loss": 0.886, + "num_input_tokens_seen": 473797408, + "step": 2625 + }, + { + "epoch": 0.2874736582829306, + "grad_norm": 1.1648927326508496, + "learning_rate": 4.047751754826608e-05, + "loss": 0.6483, + "num_input_tokens_seen": 473976608, + "step": 2626 + }, + { + "epoch": 0.28758313035386845, + "grad_norm": 1.1641118083981827, + "learning_rate": 4.0470764034996556e-05, + "loss": 0.6745, + "num_input_tokens_seen": 474164992, + "step": 2627 + }, + { + "epoch": 0.2876926024248064, + "grad_norm": 1.2340264129965057, + "learning_rate": 4.046400869156521e-05, + "loss": 0.7982, + "num_input_tokens_seen": 474351584, + "step": 2628 + }, + { + "epoch": 0.2878020744957443, + "grad_norm": 1.3730556873022464, + "learning_rate": 4.045725151877121e-05, + "loss": 0.7603, + "num_input_tokens_seen": 474533696, + "step": 2629 + }, + { + "epoch": 0.28791154656668216, + "grad_norm": 1.6898154867897652, + "learning_rate": 4.04504925174139e-05, + "loss": 0.9596, + "num_input_tokens_seen": 474712896, + "step": 2630 + }, + { + "epoch": 0.2880210186376201, + "grad_norm": 1.2778639827664928, + "learning_rate": 4.0443731688292866e-05, + "loss": 0.8781, + "num_input_tokens_seen": 474922560, + "step": 2631 + }, + { + "epoch": 0.28813049070855795, + "grad_norm": 1.26688600050785, + "learning_rate": 4.043696903220788e-05, + "loss": 0.5924, + "num_input_tokens_seen": 475078912, + "step": 2632 + }, + { + "epoch": 0.2882399627794959, + "grad_norm": 1.1718011451336665, + "learning_rate": 4.0430204549958986e-05, + "loss": 0.551, + "num_input_tokens_seen": 475252288, + "step": 2633 + }, + { + "epoch": 0.2883494348504338, + "grad_norm": 1.1919257860410717, + "learning_rate": 4.0423438242346386e-05, + "loss": 0.783, + "num_input_tokens_seen": 475461056, + "step": 2634 + }, + { + "epoch": 0.28845890692137166, + "grad_norm": 1.2912683744498783, + "learning_rate": 4.0416670110170526e-05, + "loss": 0.7863, + "num_input_tokens_seen": 475655488, + "step": 2635 + }, + { + "epoch": 0.2885683789923096, + "grad_norm": 1.4153176317145095, + "learning_rate": 4.040990015423206e-05, + "loss": 0.7313, + "num_input_tokens_seen": 475814528, + "step": 2636 + }, + { + "epoch": 0.2886778510632475, + "grad_norm": 1.2275510072695994, + "learning_rate": 4.040312837533187e-05, + "loss": 0.7351, + "num_input_tokens_seen": 475992608, + "step": 2637 + }, + { + "epoch": 0.28878732313418537, + "grad_norm": 1.2488533482824784, + "learning_rate": 4.039635477427103e-05, + "loss": 1.0363, + "num_input_tokens_seen": 476207648, + "step": 2638 + }, + { + "epoch": 0.2888967952051233, + "grad_norm": 1.3372036026147576, + "learning_rate": 4.038957935185086e-05, + "loss": 0.7408, + "num_input_tokens_seen": 476387520, + "step": 2639 + }, + { + "epoch": 0.2890062672760612, + "grad_norm": 1.2503674443916748, + "learning_rate": 4.038280210887287e-05, + "loss": 0.8477, + "num_input_tokens_seen": 476592256, + "step": 2640 + }, + { + "epoch": 0.2891157393469991, + "grad_norm": 1.17326536986003, + "learning_rate": 4.0376023046138803e-05, + "loss": 0.5161, + "num_input_tokens_seen": 476774144, + "step": 2641 + }, + { + "epoch": 0.289225211417937, + "grad_norm": 1.3398074932030641, + "learning_rate": 4.036924216445061e-05, + "loss": 0.7096, + "num_input_tokens_seen": 476962304, + "step": 2642 + }, + { + "epoch": 0.2893346834888749, + "grad_norm": 1.290342354478756, + "learning_rate": 4.036245946461043e-05, + "loss": 0.7925, + "num_input_tokens_seen": 477129632, + "step": 2643 + }, + { + "epoch": 0.2894441555598128, + "grad_norm": 1.218751320471415, + "learning_rate": 4.0355674947420676e-05, + "loss": 0.6097, + "num_input_tokens_seen": 477313984, + "step": 2644 + }, + { + "epoch": 0.2895536276307507, + "grad_norm": 1.4389506773935774, + "learning_rate": 4.0348888613683925e-05, + "loss": 0.9934, + "num_input_tokens_seen": 477486688, + "step": 2645 + }, + { + "epoch": 0.28966309970168863, + "grad_norm": 1.3376948526643153, + "learning_rate": 4.0342100464203e-05, + "loss": 0.7809, + "num_input_tokens_seen": 477656480, + "step": 2646 + }, + { + "epoch": 0.2897725717726265, + "grad_norm": 1.3960902371439743, + "learning_rate": 4.033531049978091e-05, + "loss": 0.7938, + "num_input_tokens_seen": 477843744, + "step": 2647 + }, + { + "epoch": 0.2898820438435644, + "grad_norm": 1.2724645911725654, + "learning_rate": 4.032851872122091e-05, + "loss": 0.6259, + "num_input_tokens_seen": 477998528, + "step": 2648 + }, + { + "epoch": 0.28999151591450234, + "grad_norm": 1.275343375623269, + "learning_rate": 4.0321725129326446e-05, + "loss": 0.6605, + "num_input_tokens_seen": 478177056, + "step": 2649 + }, + { + "epoch": 0.2901009879854402, + "grad_norm": 1.189594830370036, + "learning_rate": 4.031492972490119e-05, + "loss": 0.5733, + "num_input_tokens_seen": 478352224, + "step": 2650 + }, + { + "epoch": 0.29021046005637813, + "grad_norm": 1.516431456272345, + "learning_rate": 4.030813250874903e-05, + "loss": 0.6928, + "num_input_tokens_seen": 478516640, + "step": 2651 + }, + { + "epoch": 0.290319932127316, + "grad_norm": 1.2793305137812594, + "learning_rate": 4.030133348167405e-05, + "loss": 0.7149, + "num_input_tokens_seen": 478701440, + "step": 2652 + }, + { + "epoch": 0.2904294041982539, + "grad_norm": 1.2326735830424158, + "learning_rate": 4.0294532644480576e-05, + "loss": 0.6239, + "num_input_tokens_seen": 478897440, + "step": 2653 + }, + { + "epoch": 0.29053887626919184, + "grad_norm": 1.3595629595018042, + "learning_rate": 4.028772999797313e-05, + "loss": 0.8451, + "num_input_tokens_seen": 479069920, + "step": 2654 + }, + { + "epoch": 0.2906483483401297, + "grad_norm": 1.3240746613826995, + "learning_rate": 4.028092554295645e-05, + "loss": 0.8026, + "num_input_tokens_seen": 479253824, + "step": 2655 + }, + { + "epoch": 0.2907578204110676, + "grad_norm": 1.3899965473516382, + "learning_rate": 4.027411928023549e-05, + "loss": 0.7605, + "num_input_tokens_seen": 479444448, + "step": 2656 + }, + { + "epoch": 0.29086729248200555, + "grad_norm": 1.2539301124274898, + "learning_rate": 4.026731121061541e-05, + "loss": 0.7468, + "num_input_tokens_seen": 479627680, + "step": 2657 + }, + { + "epoch": 0.2909767645529434, + "grad_norm": 1.2217961903007986, + "learning_rate": 4.026050133490161e-05, + "loss": 0.8872, + "num_input_tokens_seen": 479837344, + "step": 2658 + }, + { + "epoch": 0.29108623662388133, + "grad_norm": 1.6074327767467116, + "learning_rate": 4.025368965389967e-05, + "loss": 1.1086, + "num_input_tokens_seen": 480029312, + "step": 2659 + }, + { + "epoch": 0.29119570869481926, + "grad_norm": 1.2947502655423826, + "learning_rate": 4.02468761684154e-05, + "loss": 0.5807, + "num_input_tokens_seen": 480184768, + "step": 2660 + }, + { + "epoch": 0.2913051807657571, + "grad_norm": 1.3684750842612723, + "learning_rate": 4.024006087925484e-05, + "loss": 0.8739, + "num_input_tokens_seen": 480375840, + "step": 2661 + }, + { + "epoch": 0.29141465283669504, + "grad_norm": 1.261377199788945, + "learning_rate": 4.02332437872242e-05, + "loss": 0.7637, + "num_input_tokens_seen": 480583040, + "step": 2662 + }, + { + "epoch": 0.29152412490763296, + "grad_norm": 1.3352192068686437, + "learning_rate": 4.022642489312994e-05, + "loss": 0.8802, + "num_input_tokens_seen": 480779936, + "step": 2663 + }, + { + "epoch": 0.29163359697857083, + "grad_norm": 1.2614540318427023, + "learning_rate": 4.0219604197778725e-05, + "loss": 0.8596, + "num_input_tokens_seen": 480967200, + "step": 2664 + }, + { + "epoch": 0.29174306904950875, + "grad_norm": 1.2052214551171223, + "learning_rate": 4.0212781701977434e-05, + "loss": 0.6464, + "num_input_tokens_seen": 481154912, + "step": 2665 + }, + { + "epoch": 0.2918525411204467, + "grad_norm": 1.2601497094208702, + "learning_rate": 4.020595740653315e-05, + "loss": 0.803, + "num_input_tokens_seen": 481326496, + "step": 2666 + }, + { + "epoch": 0.29196201319138454, + "grad_norm": 1.29620964723201, + "learning_rate": 4.019913131225318e-05, + "loss": 0.7658, + "num_input_tokens_seen": 481483296, + "step": 2667 + }, + { + "epoch": 0.29207148526232246, + "grad_norm": 1.1934935249309584, + "learning_rate": 4.019230341994501e-05, + "loss": 0.6103, + "num_input_tokens_seen": 481626432, + "step": 2668 + }, + { + "epoch": 0.2921809573332603, + "grad_norm": 1.1210273962288804, + "learning_rate": 4.018547373041641e-05, + "loss": 0.7102, + "num_input_tokens_seen": 481790176, + "step": 2669 + }, + { + "epoch": 0.29229042940419825, + "grad_norm": 1.2506875531415473, + "learning_rate": 4.017864224447528e-05, + "loss": 0.6012, + "num_input_tokens_seen": 481959968, + "step": 2670 + }, + { + "epoch": 0.29239990147513617, + "grad_norm": 1.4488006564509033, + "learning_rate": 4.01718089629298e-05, + "loss": 0.9035, + "num_input_tokens_seen": 482166272, + "step": 2671 + }, + { + "epoch": 0.29250937354607404, + "grad_norm": 1.3670126230930941, + "learning_rate": 4.016497388658832e-05, + "loss": 0.6891, + "num_input_tokens_seen": 482344128, + "step": 2672 + }, + { + "epoch": 0.29261884561701196, + "grad_norm": 1.2084605325418132, + "learning_rate": 4.015813701625942e-05, + "loss": 0.569, + "num_input_tokens_seen": 482541696, + "step": 2673 + }, + { + "epoch": 0.2927283176879499, + "grad_norm": 1.1908557057575972, + "learning_rate": 4.015129835275189e-05, + "loss": 0.7128, + "num_input_tokens_seen": 482749120, + "step": 2674 + }, + { + "epoch": 0.29283778975888775, + "grad_norm": 1.4138269807885016, + "learning_rate": 4.014445789687472e-05, + "loss": 0.7743, + "num_input_tokens_seen": 482941312, + "step": 2675 + }, + { + "epoch": 0.29294726182982567, + "grad_norm": 1.1041493984287625, + "learning_rate": 4.013761564943714e-05, + "loss": 0.5479, + "num_input_tokens_seen": 483154560, + "step": 2676 + }, + { + "epoch": 0.2930567339007636, + "grad_norm": 1.198789588107467, + "learning_rate": 4.013077161124857e-05, + "loss": 0.6463, + "num_input_tokens_seen": 483349664, + "step": 2677 + }, + { + "epoch": 0.29316620597170145, + "grad_norm": 1.3632016746228637, + "learning_rate": 4.012392578311864e-05, + "loss": 0.7593, + "num_input_tokens_seen": 483523488, + "step": 2678 + }, + { + "epoch": 0.2932756780426394, + "grad_norm": 1.220364894548076, + "learning_rate": 4.0117078165857205e-05, + "loss": 0.5259, + "num_input_tokens_seen": 483681632, + "step": 2679 + }, + { + "epoch": 0.2933851501135773, + "grad_norm": 1.3278077588139237, + "learning_rate": 4.0110228760274314e-05, + "loss": 0.7735, + "num_input_tokens_seen": 483855008, + "step": 2680 + }, + { + "epoch": 0.29349462218451516, + "grad_norm": 1.2902307476254877, + "learning_rate": 4.010337756718026e-05, + "loss": 0.629, + "num_input_tokens_seen": 484040256, + "step": 2681 + }, + { + "epoch": 0.2936040942554531, + "grad_norm": 1.409322899238147, + "learning_rate": 4.00965245873855e-05, + "loss": 0.905, + "num_input_tokens_seen": 484216320, + "step": 2682 + }, + { + "epoch": 0.293713566326391, + "grad_norm": 1.1930677491525756, + "learning_rate": 4.008966982170074e-05, + "loss": 0.7703, + "num_input_tokens_seen": 484379616, + "step": 2683 + }, + { + "epoch": 0.2938230383973289, + "grad_norm": 1.3438230095479837, + "learning_rate": 4.008281327093689e-05, + "loss": 0.875, + "num_input_tokens_seen": 484580096, + "step": 2684 + }, + { + "epoch": 0.2939325104682668, + "grad_norm": 1.3292476621402416, + "learning_rate": 4.007595493590506e-05, + "loss": 0.6374, + "num_input_tokens_seen": 484784160, + "step": 2685 + }, + { + "epoch": 0.29404198253920466, + "grad_norm": 1.2374008851866503, + "learning_rate": 4.006909481741659e-05, + "loss": 0.7606, + "num_input_tokens_seen": 485000096, + "step": 2686 + }, + { + "epoch": 0.2941514546101426, + "grad_norm": 1.2284923848438971, + "learning_rate": 4.006223291628301e-05, + "loss": 0.6953, + "num_input_tokens_seen": 485183776, + "step": 2687 + }, + { + "epoch": 0.2942609266810805, + "grad_norm": 1.249527269619184, + "learning_rate": 4.0055369233316063e-05, + "loss": 0.6848, + "num_input_tokens_seen": 485366560, + "step": 2688 + }, + { + "epoch": 0.29437039875201837, + "grad_norm": 1.2452674924597926, + "learning_rate": 4.004850376932772e-05, + "loss": 0.605, + "num_input_tokens_seen": 485536800, + "step": 2689 + }, + { + "epoch": 0.2944798708229563, + "grad_norm": 1.362261590310123, + "learning_rate": 4.0041636525130156e-05, + "loss": 0.7007, + "num_input_tokens_seen": 485678144, + "step": 2690 + }, + { + "epoch": 0.2945893428938942, + "grad_norm": 1.443791346762984, + "learning_rate": 4.003476750153573e-05, + "loss": 0.8169, + "num_input_tokens_seen": 485838304, + "step": 2691 + }, + { + "epoch": 0.2946988149648321, + "grad_norm": 1.2441949039039066, + "learning_rate": 4.002789669935706e-05, + "loss": 0.6483, + "num_input_tokens_seen": 486014144, + "step": 2692 + }, + { + "epoch": 0.29480828703577, + "grad_norm": 1.2220282846289718, + "learning_rate": 4.002102411940694e-05, + "loss": 0.7284, + "num_input_tokens_seen": 486202304, + "step": 2693 + }, + { + "epoch": 0.2949177591067079, + "grad_norm": 1.3199493089248175, + "learning_rate": 4.001414976249839e-05, + "loss": 0.8499, + "num_input_tokens_seen": 486386656, + "step": 2694 + }, + { + "epoch": 0.2950272311776458, + "grad_norm": 1.1433617667748888, + "learning_rate": 4.000727362944461e-05, + "loss": 0.7277, + "num_input_tokens_seen": 486580192, + "step": 2695 + }, + { + "epoch": 0.2951367032485837, + "grad_norm": 1.1980730977405352, + "learning_rate": 4.0000395721059053e-05, + "loss": 0.8296, + "num_input_tokens_seen": 486748416, + "step": 2696 + }, + { + "epoch": 0.29524617531952163, + "grad_norm": 1.3190947113767428, + "learning_rate": 3.999351603815536e-05, + "loss": 0.6923, + "num_input_tokens_seen": 486920224, + "step": 2697 + }, + { + "epoch": 0.2953556473904595, + "grad_norm": 1.2536391214345046, + "learning_rate": 3.998663458154738e-05, + "loss": 0.6927, + "num_input_tokens_seen": 487101664, + "step": 2698 + }, + { + "epoch": 0.2954651194613974, + "grad_norm": 1.1736528770393637, + "learning_rate": 3.997975135204918e-05, + "loss": 0.6878, + "num_input_tokens_seen": 487281088, + "step": 2699 + }, + { + "epoch": 0.29557459153233534, + "grad_norm": 1.230765751916297, + "learning_rate": 3.997286635047503e-05, + "loss": 0.7752, + "num_input_tokens_seen": 487471040, + "step": 2700 + }, + { + "epoch": 0.2956840636032732, + "grad_norm": 1.3694438081670646, + "learning_rate": 3.9965979577639416e-05, + "loss": 0.808, + "num_input_tokens_seen": 487660992, + "step": 2701 + }, + { + "epoch": 0.2957935356742111, + "grad_norm": 1.3214162693408396, + "learning_rate": 3.9959091034357036e-05, + "loss": 0.6672, + "num_input_tokens_seen": 487835264, + "step": 2702 + }, + { + "epoch": 0.295903007745149, + "grad_norm": 1.4842878617258428, + "learning_rate": 3.995220072144277e-05, + "loss": 0.8301, + "num_input_tokens_seen": 488037984, + "step": 2703 + }, + { + "epoch": 0.2960124798160869, + "grad_norm": 1.2772443073539337, + "learning_rate": 3.994530863971175e-05, + "loss": 0.701, + "num_input_tokens_seen": 488238688, + "step": 2704 + }, + { + "epoch": 0.29612195188702484, + "grad_norm": 1.2658734960674227, + "learning_rate": 3.993841478997928e-05, + "loss": 0.6376, + "num_input_tokens_seen": 488438048, + "step": 2705 + }, + { + "epoch": 0.2962314239579627, + "grad_norm": 1.150922202330488, + "learning_rate": 3.993151917306091e-05, + "loss": 0.6891, + "num_input_tokens_seen": 488614560, + "step": 2706 + }, + { + "epoch": 0.2963408960289006, + "grad_norm": 1.181720522866993, + "learning_rate": 3.992462178977235e-05, + "loss": 0.5963, + "num_input_tokens_seen": 488818624, + "step": 2707 + }, + { + "epoch": 0.29645036809983855, + "grad_norm": 1.1831912121026646, + "learning_rate": 3.9917722640929576e-05, + "loss": 0.6143, + "num_input_tokens_seen": 489001408, + "step": 2708 + }, + { + "epoch": 0.2965598401707764, + "grad_norm": 1.3329772722148236, + "learning_rate": 3.991082172734874e-05, + "loss": 0.6866, + "num_input_tokens_seen": 489194272, + "step": 2709 + }, + { + "epoch": 0.29666931224171433, + "grad_norm": 1.2853141154347032, + "learning_rate": 3.990391904984618e-05, + "loss": 0.828, + "num_input_tokens_seen": 489374592, + "step": 2710 + }, + { + "epoch": 0.29677878431265226, + "grad_norm": 1.2448709640889495, + "learning_rate": 3.9897014609238496e-05, + "loss": 0.6197, + "num_input_tokens_seen": 489537664, + "step": 2711 + }, + { + "epoch": 0.2968882563835901, + "grad_norm": 1.2160153642330018, + "learning_rate": 3.9890108406342455e-05, + "loss": 0.7954, + "num_input_tokens_seen": 489692224, + "step": 2712 + }, + { + "epoch": 0.29699772845452804, + "grad_norm": 1.1961847252012583, + "learning_rate": 3.988320044197507e-05, + "loss": 0.5368, + "num_input_tokens_seen": 489853728, + "step": 2713 + }, + { + "epoch": 0.29710720052546596, + "grad_norm": 1.1531537328929198, + "learning_rate": 3.987629071695351e-05, + "loss": 0.4457, + "num_input_tokens_seen": 490021952, + "step": 2714 + }, + { + "epoch": 0.29721667259640383, + "grad_norm": 1.1474593272096414, + "learning_rate": 3.9869379232095204e-05, + "loss": 0.6428, + "num_input_tokens_seen": 490218400, + "step": 2715 + }, + { + "epoch": 0.29732614466734175, + "grad_norm": 1.3803587009884315, + "learning_rate": 3.986246598821776e-05, + "loss": 0.778, + "num_input_tokens_seen": 490405664, + "step": 2716 + }, + { + "epoch": 0.2974356167382797, + "grad_norm": 1.296375040407615, + "learning_rate": 3.9855550986139e-05, + "loss": 0.8182, + "num_input_tokens_seen": 490602336, + "step": 2717 + }, + { + "epoch": 0.29754508880921754, + "grad_norm": 1.1979701099360667, + "learning_rate": 3.984863422667695e-05, + "loss": 0.6533, + "num_input_tokens_seen": 490776384, + "step": 2718 + }, + { + "epoch": 0.29765456088015546, + "grad_norm": 1.4206412695487187, + "learning_rate": 3.9841715710649865e-05, + "loss": 0.7878, + "num_input_tokens_seen": 490936768, + "step": 2719 + }, + { + "epoch": 0.2977640329510933, + "grad_norm": 1.2309120469610413, + "learning_rate": 3.983479543887618e-05, + "loss": 0.698, + "num_input_tokens_seen": 491087520, + "step": 2720 + }, + { + "epoch": 0.29787350502203125, + "grad_norm": 1.2522714956932388, + "learning_rate": 3.9827873412174565e-05, + "loss": 0.6897, + "num_input_tokens_seen": 491289120, + "step": 2721 + }, + { + "epoch": 0.29798297709296917, + "grad_norm": 1.1859839951720257, + "learning_rate": 3.9820949631363855e-05, + "loss": 0.6004, + "num_input_tokens_seen": 491491616, + "step": 2722 + }, + { + "epoch": 0.29809244916390704, + "grad_norm": 1.3091455463395005, + "learning_rate": 3.9814024097263154e-05, + "loss": 0.6376, + "num_input_tokens_seen": 491645056, + "step": 2723 + }, + { + "epoch": 0.29820192123484496, + "grad_norm": 1.3302347816381082, + "learning_rate": 3.980709681069171e-05, + "loss": 0.7212, + "num_input_tokens_seen": 491822016, + "step": 2724 + }, + { + "epoch": 0.2983113933057829, + "grad_norm": 1.2948306412583948, + "learning_rate": 3.980016777246902e-05, + "loss": 0.763, + "num_input_tokens_seen": 492016000, + "step": 2725 + }, + { + "epoch": 0.29842086537672075, + "grad_norm": 1.1739292196450821, + "learning_rate": 3.979323698341478e-05, + "loss": 0.6382, + "num_input_tokens_seen": 492198784, + "step": 2726 + }, + { + "epoch": 0.29853033744765867, + "grad_norm": 1.6334536813007825, + "learning_rate": 3.978630444434888e-05, + "loss": 0.8562, + "num_input_tokens_seen": 492373056, + "step": 2727 + }, + { + "epoch": 0.2986398095185966, + "grad_norm": 1.2402473028515442, + "learning_rate": 3.977937015609143e-05, + "loss": 0.6055, + "num_input_tokens_seen": 492550464, + "step": 2728 + }, + { + "epoch": 0.29874928158953445, + "grad_norm": 1.3729729883491701, + "learning_rate": 3.9772434119462754e-05, + "loss": 0.8492, + "num_input_tokens_seen": 492757888, + "step": 2729 + }, + { + "epoch": 0.2988587536604724, + "grad_norm": 1.2658253263948975, + "learning_rate": 3.976549633528336e-05, + "loss": 0.7463, + "num_input_tokens_seen": 492923200, + "step": 2730 + }, + { + "epoch": 0.2989682257314103, + "grad_norm": 1.339466453345761, + "learning_rate": 3.975855680437397e-05, + "loss": 0.6969, + "num_input_tokens_seen": 493085824, + "step": 2731 + }, + { + "epoch": 0.29907769780234816, + "grad_norm": 1.2097833362991277, + "learning_rate": 3.975161552755552e-05, + "loss": 0.8239, + "num_input_tokens_seen": 493272864, + "step": 2732 + }, + { + "epoch": 0.2991871698732861, + "grad_norm": 1.16071221487678, + "learning_rate": 3.974467250564916e-05, + "loss": 0.575, + "num_input_tokens_seen": 493435936, + "step": 2733 + }, + { + "epoch": 0.299296641944224, + "grad_norm": 1.2506998487166676, + "learning_rate": 3.973772773947623e-05, + "loss": 0.7081, + "num_input_tokens_seen": 493616032, + "step": 2734 + }, + { + "epoch": 0.2994061140151619, + "grad_norm": 1.3313387863567718, + "learning_rate": 3.9730781229858284e-05, + "loss": 0.732, + "num_input_tokens_seen": 493755136, + "step": 2735 + }, + { + "epoch": 0.2995155860860998, + "grad_norm": 1.1910324761089086, + "learning_rate": 3.972383297761707e-05, + "loss": 0.5326, + "num_input_tokens_seen": 493895360, + "step": 2736 + }, + { + "epoch": 0.29962505815703766, + "grad_norm": 1.1272779290856743, + "learning_rate": 3.971688298357457e-05, + "loss": 0.6188, + "num_input_tokens_seen": 494047008, + "step": 2737 + }, + { + "epoch": 0.2997345302279756, + "grad_norm": 1.1742380350017831, + "learning_rate": 3.9709931248552944e-05, + "loss": 0.5434, + "num_input_tokens_seen": 494216352, + "step": 2738 + }, + { + "epoch": 0.2998440022989135, + "grad_norm": 1.436026771933404, + "learning_rate": 3.9702977773374576e-05, + "loss": 0.9036, + "num_input_tokens_seen": 494405408, + "step": 2739 + }, + { + "epoch": 0.29995347436985137, + "grad_norm": 1.378359980888733, + "learning_rate": 3.969602255886203e-05, + "loss": 0.8111, + "num_input_tokens_seen": 494597600, + "step": 2740 + }, + { + "epoch": 0.3000629464407893, + "grad_norm": 1.4721903987460019, + "learning_rate": 3.968906560583813e-05, + "loss": 0.8526, + "num_input_tokens_seen": 494785536, + "step": 2741 + }, + { + "epoch": 0.3001724185117272, + "grad_norm": 1.207244706029101, + "learning_rate": 3.968210691512584e-05, + "loss": 0.6901, + "num_input_tokens_seen": 494995872, + "step": 2742 + }, + { + "epoch": 0.3002818905826651, + "grad_norm": 1.3745635813927932, + "learning_rate": 3.9675146487548364e-05, + "loss": 0.8486, + "num_input_tokens_seen": 495192096, + "step": 2743 + }, + { + "epoch": 0.300391362653603, + "grad_norm": 1.2972471323736978, + "learning_rate": 3.966818432392912e-05, + "loss": 0.7542, + "num_input_tokens_seen": 495364352, + "step": 2744 + }, + { + "epoch": 0.3005008347245409, + "grad_norm": 1.248344087507699, + "learning_rate": 3.9661220425091705e-05, + "loss": 0.7337, + "num_input_tokens_seen": 495536608, + "step": 2745 + }, + { + "epoch": 0.3006103067954788, + "grad_norm": 1.2713438738706209, + "learning_rate": 3.9654254791859943e-05, + "loss": 0.6002, + "num_input_tokens_seen": 495678176, + "step": 2746 + }, + { + "epoch": 0.3007197788664167, + "grad_norm": 1.1425584057793567, + "learning_rate": 3.9647287425057864e-05, + "loss": 0.5132, + "num_input_tokens_seen": 495846400, + "step": 2747 + }, + { + "epoch": 0.30082925093735463, + "grad_norm": 1.2473124699885756, + "learning_rate": 3.9640318325509676e-05, + "loss": 0.6548, + "num_input_tokens_seen": 496030304, + "step": 2748 + }, + { + "epoch": 0.3009387230082925, + "grad_norm": 1.1660371455725762, + "learning_rate": 3.9633347494039814e-05, + "loss": 0.6137, + "num_input_tokens_seen": 496235040, + "step": 2749 + }, + { + "epoch": 0.3010481950792304, + "grad_norm": 1.0308651061366714, + "learning_rate": 3.962637493147292e-05, + "loss": 0.461, + "num_input_tokens_seen": 496402816, + "step": 2750 + }, + { + "epoch": 0.30115766715016834, + "grad_norm": 1.316228656505185, + "learning_rate": 3.961940063863383e-05, + "loss": 0.7966, + "num_input_tokens_seen": 496564768, + "step": 2751 + }, + { + "epoch": 0.3012671392211062, + "grad_norm": 1.1944550075113218, + "learning_rate": 3.9612424616347596e-05, + "loss": 0.6718, + "num_input_tokens_seen": 496741504, + "step": 2752 + }, + { + "epoch": 0.3013766112920441, + "grad_norm": 1.1562641246036032, + "learning_rate": 3.9605446865439466e-05, + "loss": 0.5075, + "num_input_tokens_seen": 496932128, + "step": 2753 + }, + { + "epoch": 0.301486083362982, + "grad_norm": 1.3434612939878854, + "learning_rate": 3.959846738673488e-05, + "loss": 0.7756, + "num_input_tokens_seen": 497142688, + "step": 2754 + }, + { + "epoch": 0.3015955554339199, + "grad_norm": 1.4076266861541245, + "learning_rate": 3.9591486181059524e-05, + "loss": 0.7264, + "num_input_tokens_seen": 497320320, + "step": 2755 + }, + { + "epoch": 0.30170502750485784, + "grad_norm": 1.431875170637343, + "learning_rate": 3.958450324923924e-05, + "loss": 0.8953, + "num_input_tokens_seen": 497532224, + "step": 2756 + }, + { + "epoch": 0.3018144995757957, + "grad_norm": 1.3143778945230709, + "learning_rate": 3.9577518592100114e-05, + "loss": 0.7377, + "num_input_tokens_seen": 497718592, + "step": 2757 + }, + { + "epoch": 0.3019239716467336, + "grad_norm": 1.1588343529311098, + "learning_rate": 3.957053221046839e-05, + "loss": 0.5535, + "num_input_tokens_seen": 497867776, + "step": 2758 + }, + { + "epoch": 0.30203344371767155, + "grad_norm": 1.157923441113594, + "learning_rate": 3.956354410517057e-05, + "loss": 0.6142, + "num_input_tokens_seen": 498040928, + "step": 2759 + }, + { + "epoch": 0.3021429157886094, + "grad_norm": 1.223469765190188, + "learning_rate": 3.955655427703332e-05, + "loss": 0.7489, + "num_input_tokens_seen": 498231104, + "step": 2760 + }, + { + "epoch": 0.30225238785954733, + "grad_norm": 1.2304451107600296, + "learning_rate": 3.954956272688353e-05, + "loss": 0.5415, + "num_input_tokens_seen": 498414560, + "step": 2761 + }, + { + "epoch": 0.30236185993048525, + "grad_norm": 1.1628901575489277, + "learning_rate": 3.954256945554827e-05, + "loss": 0.6429, + "num_input_tokens_seen": 498616832, + "step": 2762 + }, + { + "epoch": 0.3024713320014231, + "grad_norm": 1.2435258577125543, + "learning_rate": 3.9535574463854856e-05, + "loss": 0.6342, + "num_input_tokens_seen": 498804544, + "step": 2763 + }, + { + "epoch": 0.30258080407236104, + "grad_norm": 1.1399340692494973, + "learning_rate": 3.952857775263077e-05, + "loss": 0.5288, + "num_input_tokens_seen": 498985088, + "step": 2764 + }, + { + "epoch": 0.30269027614329896, + "grad_norm": 1.2431259449091676, + "learning_rate": 3.9521579322703704e-05, + "loss": 0.6922, + "num_input_tokens_seen": 499167200, + "step": 2765 + }, + { + "epoch": 0.30279974821423683, + "grad_norm": 1.1430425759576786, + "learning_rate": 3.951457917490157e-05, + "loss": 0.7798, + "num_input_tokens_seen": 499329824, + "step": 2766 + }, + { + "epoch": 0.30290922028517475, + "grad_norm": 1.2129155400615053, + "learning_rate": 3.950757731005247e-05, + "loss": 0.7547, + "num_input_tokens_seen": 499481696, + "step": 2767 + }, + { + "epoch": 0.3030186923561127, + "grad_norm": 1.2308485141421466, + "learning_rate": 3.95005737289847e-05, + "loss": 0.686, + "num_input_tokens_seen": 499690912, + "step": 2768 + }, + { + "epoch": 0.30312816442705054, + "grad_norm": 1.203205749972534, + "learning_rate": 3.9493568432526787e-05, + "loss": 0.7246, + "num_input_tokens_seen": 499900576, + "step": 2769 + }, + { + "epoch": 0.30323763649798846, + "grad_norm": 1.4052395899383154, + "learning_rate": 3.948656142150742e-05, + "loss": 0.8593, + "num_input_tokens_seen": 500088288, + "step": 2770 + }, + { + "epoch": 0.3033471085689263, + "grad_norm": 1.3050970017347647, + "learning_rate": 3.947955269675554e-05, + "loss": 0.6, + "num_input_tokens_seen": 500239712, + "step": 2771 + }, + { + "epoch": 0.30345658063986425, + "grad_norm": 1.4001421634836881, + "learning_rate": 3.9472542259100264e-05, + "loss": 0.7396, + "num_input_tokens_seen": 500411744, + "step": 2772 + }, + { + "epoch": 0.30356605271080217, + "grad_norm": 1.3268583203784354, + "learning_rate": 3.94655301093709e-05, + "loss": 0.7097, + "num_input_tokens_seen": 500605280, + "step": 2773 + }, + { + "epoch": 0.30367552478174004, + "grad_norm": 1.1802162854745175, + "learning_rate": 3.945851624839697e-05, + "loss": 0.6901, + "num_input_tokens_seen": 500793440, + "step": 2774 + }, + { + "epoch": 0.30378499685267796, + "grad_norm": 1.2840595609808139, + "learning_rate": 3.9451500677008213e-05, + "loss": 0.6398, + "num_input_tokens_seen": 500969280, + "step": 2775 + }, + { + "epoch": 0.3038944689236159, + "grad_norm": 1.3958125705977753, + "learning_rate": 3.944448339603455e-05, + "loss": 0.7351, + "num_input_tokens_seen": 501124064, + "step": 2776 + }, + { + "epoch": 0.30400394099455375, + "grad_norm": 1.2827156336984864, + "learning_rate": 3.9437464406306124e-05, + "loss": 0.9323, + "num_input_tokens_seen": 501310880, + "step": 2777 + }, + { + "epoch": 0.30411341306549167, + "grad_norm": 1.2308813462979467, + "learning_rate": 3.9430443708653255e-05, + "loss": 0.7429, + "num_input_tokens_seen": 501473280, + "step": 2778 + }, + { + "epoch": 0.3042228851364296, + "grad_norm": 1.3252107398667914, + "learning_rate": 3.9423421303906474e-05, + "loss": 0.7402, + "num_input_tokens_seen": 501661664, + "step": 2779 + }, + { + "epoch": 0.30433235720736745, + "grad_norm": 1.3374086277491333, + "learning_rate": 3.9416397192896523e-05, + "loss": 0.7956, + "num_input_tokens_seen": 501830784, + "step": 2780 + }, + { + "epoch": 0.3044418292783054, + "grad_norm": 1.4002030378658805, + "learning_rate": 3.940937137645435e-05, + "loss": 0.881, + "num_input_tokens_seen": 502003712, + "step": 2781 + }, + { + "epoch": 0.3045513013492433, + "grad_norm": 1.3412467795099594, + "learning_rate": 3.94023438554111e-05, + "loss": 0.8933, + "num_input_tokens_seen": 502203296, + "step": 2782 + }, + { + "epoch": 0.30466077342018116, + "grad_norm": 1.1696791357240222, + "learning_rate": 3.939531463059809e-05, + "loss": 0.8568, + "num_input_tokens_seen": 502424384, + "step": 2783 + }, + { + "epoch": 0.3047702454911191, + "grad_norm": 1.5579897826548978, + "learning_rate": 3.9388283702846876e-05, + "loss": 0.9976, + "num_input_tokens_seen": 502628672, + "step": 2784 + }, + { + "epoch": 0.304879717562057, + "grad_norm": 1.157035457750251, + "learning_rate": 3.9381251072989216e-05, + "loss": 0.6313, + "num_input_tokens_seen": 502828480, + "step": 2785 + }, + { + "epoch": 0.3049891896329949, + "grad_norm": 1.2130680169504795, + "learning_rate": 3.937421674185704e-05, + "loss": 0.7279, + "num_input_tokens_seen": 503030752, + "step": 2786 + }, + { + "epoch": 0.3050986617039328, + "grad_norm": 1.2109145623772635, + "learning_rate": 3.9367180710282504e-05, + "loss": 0.7397, + "num_input_tokens_seen": 503205472, + "step": 2787 + }, + { + "epoch": 0.30520813377487066, + "grad_norm": 1.2886757935287783, + "learning_rate": 3.936014297909796e-05, + "loss": 0.839, + "num_input_tokens_seen": 503388032, + "step": 2788 + }, + { + "epoch": 0.3053176058458086, + "grad_norm": 1.266629326838865, + "learning_rate": 3.935310354913595e-05, + "loss": 0.6767, + "num_input_tokens_seen": 503568352, + "step": 2789 + }, + { + "epoch": 0.3054270779167465, + "grad_norm": 1.1967355572219673, + "learning_rate": 3.934606242122922e-05, + "loss": 0.6014, + "num_input_tokens_seen": 503736576, + "step": 2790 + }, + { + "epoch": 0.30553654998768437, + "grad_norm": 1.1582419274763311, + "learning_rate": 3.9339019596210746e-05, + "loss": 0.6478, + "num_input_tokens_seen": 503934592, + "step": 2791 + }, + { + "epoch": 0.3056460220586223, + "grad_norm": 1.1806742097404361, + "learning_rate": 3.933197507491366e-05, + "loss": 0.5422, + "num_input_tokens_seen": 504105280, + "step": 2792 + }, + { + "epoch": 0.3057554941295602, + "grad_norm": 1.5822059817424212, + "learning_rate": 3.932492885817132e-05, + "loss": 0.7855, + "num_input_tokens_seen": 504296800, + "step": 2793 + }, + { + "epoch": 0.3058649662004981, + "grad_norm": 1.3235000865779167, + "learning_rate": 3.9317880946817274e-05, + "loss": 0.7483, + "num_input_tokens_seen": 504493696, + "step": 2794 + }, + { + "epoch": 0.305974438271436, + "grad_norm": 1.2874384967224717, + "learning_rate": 3.931083134168529e-05, + "loss": 0.7795, + "num_input_tokens_seen": 504699552, + "step": 2795 + }, + { + "epoch": 0.3060839103423739, + "grad_norm": 1.1721555246920785, + "learning_rate": 3.9303780043609315e-05, + "loss": 0.596, + "num_input_tokens_seen": 504885024, + "step": 2796 + }, + { + "epoch": 0.3061933824133118, + "grad_norm": 1.2458564747646057, + "learning_rate": 3.9296727053423506e-05, + "loss": 0.7177, + "num_input_tokens_seen": 505080128, + "step": 2797 + }, + { + "epoch": 0.3063028544842497, + "grad_norm": 1.265672047058503, + "learning_rate": 3.9289672371962214e-05, + "loss": 0.6644, + "num_input_tokens_seen": 505268288, + "step": 2798 + }, + { + "epoch": 0.30641232655518763, + "grad_norm": 1.0769697994010887, + "learning_rate": 3.928261600006e-05, + "loss": 0.6082, + "num_input_tokens_seen": 505439200, + "step": 2799 + }, + { + "epoch": 0.3065217986261255, + "grad_norm": 1.2494738902141835, + "learning_rate": 3.9275557938551614e-05, + "loss": 0.8565, + "num_input_tokens_seen": 505626016, + "step": 2800 + }, + { + "epoch": 0.3066312706970634, + "grad_norm": 1.1779555181126227, + "learning_rate": 3.926849818827202e-05, + "loss": 0.5678, + "num_input_tokens_seen": 505794688, + "step": 2801 + }, + { + "epoch": 0.30674074276800134, + "grad_norm": 1.0817450409950333, + "learning_rate": 3.9261436750056364e-05, + "loss": 0.5778, + "num_input_tokens_seen": 505969408, + "step": 2802 + }, + { + "epoch": 0.3068502148389392, + "grad_norm": 1.2392583415674354, + "learning_rate": 3.925437362474001e-05, + "loss": 0.7556, + "num_input_tokens_seen": 506154880, + "step": 2803 + }, + { + "epoch": 0.3069596869098771, + "grad_norm": 1.2336917869647606, + "learning_rate": 3.924730881315849e-05, + "loss": 0.7718, + "num_input_tokens_seen": 506317280, + "step": 2804 + }, + { + "epoch": 0.307069158980815, + "grad_norm": 1.2292679028389513, + "learning_rate": 3.9240242316147586e-05, + "loss": 0.6171, + "num_input_tokens_seen": 506506112, + "step": 2805 + }, + { + "epoch": 0.3071786310517529, + "grad_norm": 1.2992760366356375, + "learning_rate": 3.923317413454324e-05, + "loss": 0.7287, + "num_input_tokens_seen": 506709504, + "step": 2806 + }, + { + "epoch": 0.30728810312269084, + "grad_norm": 1.263126969220317, + "learning_rate": 3.922610426918159e-05, + "loss": 0.7014, + "num_input_tokens_seen": 506902816, + "step": 2807 + }, + { + "epoch": 0.3073975751936287, + "grad_norm": 1.2778081953648366, + "learning_rate": 3.921903272089901e-05, + "loss": 0.6937, + "num_input_tokens_seen": 507052000, + "step": 2808 + }, + { + "epoch": 0.3075070472645666, + "grad_norm": 1.3727182182171862, + "learning_rate": 3.9211959490532044e-05, + "loss": 0.8297, + "num_input_tokens_seen": 507236800, + "step": 2809 + }, + { + "epoch": 0.30761651933550455, + "grad_norm": 1.1163472570445327, + "learning_rate": 3.920488457891743e-05, + "loss": 0.4767, + "num_input_tokens_seen": 507393824, + "step": 2810 + }, + { + "epoch": 0.3077259914064424, + "grad_norm": 1.4131663733882955, + "learning_rate": 3.919780798689213e-05, + "loss": 0.8102, + "num_input_tokens_seen": 507573024, + "step": 2811 + }, + { + "epoch": 0.30783546347738033, + "grad_norm": 1.202328604143609, + "learning_rate": 3.919072971529329e-05, + "loss": 0.7186, + "num_input_tokens_seen": 507764320, + "step": 2812 + }, + { + "epoch": 0.30794493554831825, + "grad_norm": 1.4669761497363425, + "learning_rate": 3.918364976495825e-05, + "loss": 0.6168, + "num_input_tokens_seen": 507919552, + "step": 2813 + }, + { + "epoch": 0.3080544076192561, + "grad_norm": 1.3330441648116689, + "learning_rate": 3.917656813672456e-05, + "loss": 0.7816, + "num_input_tokens_seen": 508092256, + "step": 2814 + }, + { + "epoch": 0.30816387969019404, + "grad_norm": 1.2826859870629974, + "learning_rate": 3.916948483142996e-05, + "loss": 0.6203, + "num_input_tokens_seen": 508259360, + "step": 2815 + }, + { + "epoch": 0.30827335176113196, + "grad_norm": 1.5309028231915265, + "learning_rate": 3.916239984991239e-05, + "loss": 0.8996, + "num_input_tokens_seen": 508426912, + "step": 2816 + }, + { + "epoch": 0.30838282383206983, + "grad_norm": 1.2914862301631833, + "learning_rate": 3.915531319300999e-05, + "loss": 0.7073, + "num_input_tokens_seen": 508605888, + "step": 2817 + }, + { + "epoch": 0.30849229590300775, + "grad_norm": 1.2094850258469958, + "learning_rate": 3.9148224861561105e-05, + "loss": 0.6224, + "num_input_tokens_seen": 508782176, + "step": 2818 + }, + { + "epoch": 0.3086017679739457, + "grad_norm": 1.1037355457020452, + "learning_rate": 3.914113485640426e-05, + "loss": 0.6324, + "num_input_tokens_seen": 508975040, + "step": 2819 + }, + { + "epoch": 0.30871124004488354, + "grad_norm": 1.3335807938217346, + "learning_rate": 3.91340431783782e-05, + "loss": 0.7823, + "num_input_tokens_seen": 509184928, + "step": 2820 + }, + { + "epoch": 0.30882071211582146, + "grad_norm": 1.2758773178867362, + "learning_rate": 3.912694982832185e-05, + "loss": 0.7183, + "num_input_tokens_seen": 509351360, + "step": 2821 + }, + { + "epoch": 0.3089301841867593, + "grad_norm": 1.2216314829243806, + "learning_rate": 3.9119854807074336e-05, + "loss": 0.7218, + "num_input_tokens_seen": 509547584, + "step": 2822 + }, + { + "epoch": 0.30903965625769725, + "grad_norm": 1.1282545274254412, + "learning_rate": 3.911275811547499e-05, + "loss": 0.4951, + "num_input_tokens_seen": 509699680, + "step": 2823 + }, + { + "epoch": 0.30914912832863517, + "grad_norm": 1.3541722322007275, + "learning_rate": 3.910565975436335e-05, + "loss": 0.8541, + "num_input_tokens_seen": 509892992, + "step": 2824 + }, + { + "epoch": 0.30925860039957304, + "grad_norm": 1.2681112946154651, + "learning_rate": 3.909855972457912e-05, + "loss": 0.8201, + "num_input_tokens_seen": 510073984, + "step": 2825 + }, + { + "epoch": 0.30936807247051096, + "grad_norm": 1.233781454580069, + "learning_rate": 3.9091458026962226e-05, + "loss": 0.6367, + "num_input_tokens_seen": 510246464, + "step": 2826 + }, + { + "epoch": 0.3094775445414489, + "grad_norm": 1.260625785104926, + "learning_rate": 3.9084354662352784e-05, + "loss": 0.7428, + "num_input_tokens_seen": 510419616, + "step": 2827 + }, + { + "epoch": 0.30958701661238675, + "grad_norm": 1.2801370206152476, + "learning_rate": 3.9077249631591106e-05, + "loss": 0.6168, + "num_input_tokens_seen": 510577984, + "step": 2828 + }, + { + "epoch": 0.30969648868332467, + "grad_norm": 1.3069115145529298, + "learning_rate": 3.9070142935517714e-05, + "loss": 0.9965, + "num_input_tokens_seen": 510769056, + "step": 2829 + }, + { + "epoch": 0.3098059607542626, + "grad_norm": 1.3047073271381269, + "learning_rate": 3.906303457497331e-05, + "loss": 0.7856, + "num_input_tokens_seen": 510964832, + "step": 2830 + }, + { + "epoch": 0.30991543282520045, + "grad_norm": 1.1555509774003263, + "learning_rate": 3.9055924550798806e-05, + "loss": 0.6376, + "num_input_tokens_seen": 511126784, + "step": 2831 + }, + { + "epoch": 0.3100249048961384, + "grad_norm": 1.3687031655159105, + "learning_rate": 3.904881286383529e-05, + "loss": 0.6484, + "num_input_tokens_seen": 511289184, + "step": 2832 + }, + { + "epoch": 0.3101343769670763, + "grad_norm": 1.3479401220357263, + "learning_rate": 3.904169951492407e-05, + "loss": 0.8467, + "num_input_tokens_seen": 511478464, + "step": 2833 + }, + { + "epoch": 0.31024384903801416, + "grad_norm": 1.1664516841269457, + "learning_rate": 3.903458450490664e-05, + "loss": 0.6678, + "num_input_tokens_seen": 511657664, + "step": 2834 + }, + { + "epoch": 0.3103533211089521, + "grad_norm": 1.4522732935616853, + "learning_rate": 3.9027467834624696e-05, + "loss": 0.7997, + "num_input_tokens_seen": 511820288, + "step": 2835 + }, + { + "epoch": 0.31046279317989, + "grad_norm": 1.3214080148003773, + "learning_rate": 3.902034950492012e-05, + "loss": 0.7553, + "num_input_tokens_seen": 511995680, + "step": 2836 + }, + { + "epoch": 0.3105722652508279, + "grad_norm": 1.7139304820488324, + "learning_rate": 3.9013229516635e-05, + "loss": 0.7594, + "num_input_tokens_seen": 512193472, + "step": 2837 + }, + { + "epoch": 0.3106817373217658, + "grad_norm": 1.2462670376147742, + "learning_rate": 3.900610787061162e-05, + "loss": 0.8091, + "num_input_tokens_seen": 512401344, + "step": 2838 + }, + { + "epoch": 0.31079120939270366, + "grad_norm": 1.4507880338593413, + "learning_rate": 3.899898456769245e-05, + "loss": 0.6557, + "num_input_tokens_seen": 512578752, + "step": 2839 + }, + { + "epoch": 0.3109006814636416, + "grad_norm": 1.2575955411135515, + "learning_rate": 3.899185960872016e-05, + "loss": 0.826, + "num_input_tokens_seen": 512783488, + "step": 2840 + }, + { + "epoch": 0.3110101535345795, + "grad_norm": 1.283605596635124, + "learning_rate": 3.8984732994537644e-05, + "loss": 0.7326, + "num_input_tokens_seen": 512990688, + "step": 2841 + }, + { + "epoch": 0.31111962560551737, + "grad_norm": 1.28340498034804, + "learning_rate": 3.8977604725987936e-05, + "loss": 0.6607, + "num_input_tokens_seen": 513169216, + "step": 2842 + }, + { + "epoch": 0.3112290976764553, + "grad_norm": 1.2908211473579725, + "learning_rate": 3.897047480391431e-05, + "loss": 0.6077, + "num_input_tokens_seen": 513366112, + "step": 2843 + }, + { + "epoch": 0.3113385697473932, + "grad_norm": 1.2161637276987791, + "learning_rate": 3.8963343229160235e-05, + "loss": 0.5833, + "num_input_tokens_seen": 513538144, + "step": 2844 + }, + { + "epoch": 0.3114480418183311, + "grad_norm": 1.2594979408401794, + "learning_rate": 3.8956210002569334e-05, + "loss": 0.5917, + "num_input_tokens_seen": 513708608, + "step": 2845 + }, + { + "epoch": 0.311557513889269, + "grad_norm": 1.1642063231234394, + "learning_rate": 3.894907512498548e-05, + "loss": 0.6332, + "num_input_tokens_seen": 513868096, + "step": 2846 + }, + { + "epoch": 0.3116669859602069, + "grad_norm": 1.1964110269459862, + "learning_rate": 3.89419385972527e-05, + "loss": 0.6904, + "num_input_tokens_seen": 514042368, + "step": 2847 + }, + { + "epoch": 0.3117764580311448, + "grad_norm": 1.155159358553559, + "learning_rate": 3.893480042021523e-05, + "loss": 0.6416, + "num_input_tokens_seen": 514250016, + "step": 2848 + }, + { + "epoch": 0.3118859301020827, + "grad_norm": 1.4287003407896923, + "learning_rate": 3.892766059471752e-05, + "loss": 0.7677, + "num_input_tokens_seen": 514442656, + "step": 2849 + }, + { + "epoch": 0.31199540217302063, + "grad_norm": 1.433286878291355, + "learning_rate": 3.892051912160418e-05, + "loss": 0.7286, + "num_input_tokens_seen": 514614912, + "step": 2850 + }, + { + "epoch": 0.3121048742439585, + "grad_norm": 1.3482319362426942, + "learning_rate": 3.8913376001720046e-05, + "loss": 0.8638, + "num_input_tokens_seen": 514799488, + "step": 2851 + }, + { + "epoch": 0.3122143463148964, + "grad_norm": 1.3229317952089519, + "learning_rate": 3.890623123591013e-05, + "loss": 0.7756, + "num_input_tokens_seen": 514996832, + "step": 2852 + }, + { + "epoch": 0.31232381838583434, + "grad_norm": 1.2188738246517326, + "learning_rate": 3.889908482501963e-05, + "loss": 0.5609, + "num_input_tokens_seen": 515172896, + "step": 2853 + }, + { + "epoch": 0.3124332904567722, + "grad_norm": 1.3931576671851975, + "learning_rate": 3.889193676989398e-05, + "loss": 0.8509, + "num_input_tokens_seen": 515345152, + "step": 2854 + }, + { + "epoch": 0.3125427625277101, + "grad_norm": 1.318655882093791, + "learning_rate": 3.888478707137875e-05, + "loss": 0.8009, + "num_input_tokens_seen": 515557952, + "step": 2855 + }, + { + "epoch": 0.312652234598648, + "grad_norm": 1.2231749875356148, + "learning_rate": 3.8877635730319774e-05, + "loss": 0.6578, + "num_input_tokens_seen": 515750592, + "step": 2856 + }, + { + "epoch": 0.3127617066695859, + "grad_norm": 1.4184856050801002, + "learning_rate": 3.8870482747563006e-05, + "loss": 0.8557, + "num_input_tokens_seen": 515917248, + "step": 2857 + }, + { + "epoch": 0.31287117874052384, + "grad_norm": 1.3347437471913866, + "learning_rate": 3.886332812395465e-05, + "loss": 0.9126, + "num_input_tokens_seen": 516117952, + "step": 2858 + }, + { + "epoch": 0.3129806508114617, + "grad_norm": 1.2627497850839706, + "learning_rate": 3.885617186034107e-05, + "loss": 0.7197, + "num_input_tokens_seen": 516309248, + "step": 2859 + }, + { + "epoch": 0.3130901228823996, + "grad_norm": 1.200368870228628, + "learning_rate": 3.884901395756886e-05, + "loss": 0.8931, + "num_input_tokens_seen": 516502112, + "step": 2860 + }, + { + "epoch": 0.31319959495333755, + "grad_norm": 1.373580546736005, + "learning_rate": 3.884185441648477e-05, + "loss": 1.0365, + "num_input_tokens_seen": 516731488, + "step": 2861 + }, + { + "epoch": 0.3133090670242754, + "grad_norm": 1.1777058308496795, + "learning_rate": 3.883469323793576e-05, + "loss": 0.7319, + "num_input_tokens_seen": 516924352, + "step": 2862 + }, + { + "epoch": 0.31341853909521333, + "grad_norm": 1.3714683433992967, + "learning_rate": 3.882753042276899e-05, + "loss": 0.9392, + "num_input_tokens_seen": 517138496, + "step": 2863 + }, + { + "epoch": 0.31352801116615125, + "grad_norm": 1.232597084709581, + "learning_rate": 3.882036597183181e-05, + "loss": 0.8536, + "num_input_tokens_seen": 517339872, + "step": 2864 + }, + { + "epoch": 0.3136374832370891, + "grad_norm": 1.1441749304528332, + "learning_rate": 3.881319988597174e-05, + "loss": 0.8045, + "num_input_tokens_seen": 517532960, + "step": 2865 + }, + { + "epoch": 0.31374695530802704, + "grad_norm": 1.193613328150183, + "learning_rate": 3.8806032166036545e-05, + "loss": 0.8623, + "num_input_tokens_seen": 517729408, + "step": 2866 + }, + { + "epoch": 0.31385642737896496, + "grad_norm": 1.244072832928473, + "learning_rate": 3.8798862812874136e-05, + "loss": 0.7046, + "num_input_tokens_seen": 517910624, + "step": 2867 + }, + { + "epoch": 0.31396589944990283, + "grad_norm": 1.4205299135500553, + "learning_rate": 3.8791691827332627e-05, + "loss": 0.6361, + "num_input_tokens_seen": 518083328, + "step": 2868 + }, + { + "epoch": 0.31407537152084075, + "grad_norm": 1.296264090907466, + "learning_rate": 3.8784519210260343e-05, + "loss": 0.6417, + "num_input_tokens_seen": 518250880, + "step": 2869 + }, + { + "epoch": 0.3141848435917787, + "grad_norm": 1.4433241079768309, + "learning_rate": 3.877734496250579e-05, + "loss": 0.7183, + "num_input_tokens_seen": 518419776, + "step": 2870 + }, + { + "epoch": 0.31429431566271654, + "grad_norm": 1.2492161200270842, + "learning_rate": 3.877016908491767e-05, + "loss": 0.6618, + "num_input_tokens_seen": 518585088, + "step": 2871 + }, + { + "epoch": 0.31440378773365446, + "grad_norm": 1.4161940047935724, + "learning_rate": 3.8762991578344864e-05, + "loss": 0.8113, + "num_input_tokens_seen": 518769888, + "step": 2872 + }, + { + "epoch": 0.3145132598045923, + "grad_norm": 1.8311604787147309, + "learning_rate": 3.8755812443636466e-05, + "loss": 0.6016, + "num_input_tokens_seen": 518951328, + "step": 2873 + }, + { + "epoch": 0.31462273187553025, + "grad_norm": 1.3035620215014943, + "learning_rate": 3.8748631681641757e-05, + "loss": 0.7797, + "num_input_tokens_seen": 519157408, + "step": 2874 + }, + { + "epoch": 0.31473220394646817, + "grad_norm": 1.1980076741464734, + "learning_rate": 3.8741449293210194e-05, + "loss": 0.6789, + "num_input_tokens_seen": 519308608, + "step": 2875 + }, + { + "epoch": 0.31484167601740604, + "grad_norm": 1.3566318326050608, + "learning_rate": 3.8734265279191455e-05, + "loss": 0.7589, + "num_input_tokens_seen": 519516032, + "step": 2876 + }, + { + "epoch": 0.31495114808834396, + "grad_norm": 1.176058866354245, + "learning_rate": 3.872707964043539e-05, + "loss": 0.5528, + "num_input_tokens_seen": 519686944, + "step": 2877 + }, + { + "epoch": 0.3150606201592819, + "grad_norm": 1.3308301415800272, + "learning_rate": 3.871989237779204e-05, + "loss": 0.6317, + "num_input_tokens_seen": 519847776, + "step": 2878 + }, + { + "epoch": 0.31517009223021974, + "grad_norm": 1.242963632396854, + "learning_rate": 3.8712703492111656e-05, + "loss": 0.5936, + "num_input_tokens_seen": 520044224, + "step": 2879 + }, + { + "epoch": 0.31527956430115767, + "grad_norm": 1.1353541307405528, + "learning_rate": 3.8705512984244665e-05, + "loss": 0.5583, + "num_input_tokens_seen": 520232160, + "step": 2880 + }, + { + "epoch": 0.3153890363720956, + "grad_norm": 1.3244633631551577, + "learning_rate": 3.869832085504168e-05, + "loss": 0.6576, + "num_input_tokens_seen": 520425472, + "step": 2881 + }, + { + "epoch": 0.31549850844303345, + "grad_norm": 1.139419702687726, + "learning_rate": 3.869112710535353e-05, + "loss": 0.6297, + "num_input_tokens_seen": 520643872, + "step": 2882 + }, + { + "epoch": 0.3156079805139714, + "grad_norm": 1.240326260744834, + "learning_rate": 3.868393173603122e-05, + "loss": 0.5198, + "num_input_tokens_seen": 520814336, + "step": 2883 + }, + { + "epoch": 0.3157174525849093, + "grad_norm": 1.3332804530306877, + "learning_rate": 3.867673474792593e-05, + "loss": 0.6114, + "num_input_tokens_seen": 520967552, + "step": 2884 + }, + { + "epoch": 0.31582692465584716, + "grad_norm": 1.2995248861685655, + "learning_rate": 3.866953614188908e-05, + "loss": 0.6074, + "num_input_tokens_seen": 521151008, + "step": 2885 + }, + { + "epoch": 0.3159363967267851, + "grad_norm": 1.1994337951497984, + "learning_rate": 3.866233591877223e-05, + "loss": 0.5478, + "num_input_tokens_seen": 521344768, + "step": 2886 + }, + { + "epoch": 0.316045868797723, + "grad_norm": 1.1975163606390136, + "learning_rate": 3.865513407942716e-05, + "loss": 0.4603, + "num_input_tokens_seen": 521489472, + "step": 2887 + }, + { + "epoch": 0.3161553408686609, + "grad_norm": 1.3290706185264014, + "learning_rate": 3.864793062470583e-05, + "loss": 0.792, + "num_input_tokens_seen": 521684352, + "step": 2888 + }, + { + "epoch": 0.3162648129395988, + "grad_norm": 1.2928291545087474, + "learning_rate": 3.864072555546041e-05, + "loss": 0.8604, + "num_input_tokens_seen": 521887296, + "step": 2889 + }, + { + "epoch": 0.31637428501053666, + "grad_norm": 1.2311920946321018, + "learning_rate": 3.863351887254322e-05, + "loss": 0.787, + "num_input_tokens_seen": 522084640, + "step": 2890 + }, + { + "epoch": 0.3164837570814746, + "grad_norm": 1.2910149324677842, + "learning_rate": 3.862631057680681e-05, + "loss": 0.691, + "num_input_tokens_seen": 522277504, + "step": 2891 + }, + { + "epoch": 0.3165932291524125, + "grad_norm": 1.3461548776413037, + "learning_rate": 3.8619100669103916e-05, + "loss": 0.7287, + "num_input_tokens_seen": 522471488, + "step": 2892 + }, + { + "epoch": 0.31670270122335037, + "grad_norm": 1.3088271203404676, + "learning_rate": 3.861188915028744e-05, + "loss": 0.7116, + "num_input_tokens_seen": 522693024, + "step": 2893 + }, + { + "epoch": 0.3168121732942883, + "grad_norm": 1.3464778004688591, + "learning_rate": 3.8604676021210506e-05, + "loss": 0.7532, + "num_input_tokens_seen": 522905600, + "step": 2894 + }, + { + "epoch": 0.3169216453652262, + "grad_norm": 1.2650966718924679, + "learning_rate": 3.85974612827264e-05, + "loss": 0.5196, + "num_input_tokens_seen": 523067552, + "step": 2895 + }, + { + "epoch": 0.3170311174361641, + "grad_norm": 1.2260143731816466, + "learning_rate": 3.859024493568862e-05, + "loss": 0.8541, + "num_input_tokens_seen": 523273408, + "step": 2896 + }, + { + "epoch": 0.317140589507102, + "grad_norm": 1.3672553345336749, + "learning_rate": 3.8583026980950846e-05, + "loss": 0.7167, + "num_input_tokens_seen": 523433344, + "step": 2897 + }, + { + "epoch": 0.3172500615780399, + "grad_norm": 1.216689299455663, + "learning_rate": 3.857580741936695e-05, + "loss": 0.5849, + "num_input_tokens_seen": 523590816, + "step": 2898 + }, + { + "epoch": 0.3173595336489778, + "grad_norm": 1.3296616584248075, + "learning_rate": 3.856858625179098e-05, + "loss": 0.7044, + "num_input_tokens_seen": 523795776, + "step": 2899 + }, + { + "epoch": 0.3174690057199157, + "grad_norm": 1.2090137582884295, + "learning_rate": 3.85613634790772e-05, + "loss": 0.7911, + "num_input_tokens_seen": 523971840, + "step": 2900 + }, + { + "epoch": 0.31757847779085363, + "grad_norm": 1.305775993531043, + "learning_rate": 3.8554139102080044e-05, + "loss": 0.7645, + "num_input_tokens_seen": 524167840, + "step": 2901 + }, + { + "epoch": 0.3176879498617915, + "grad_norm": 1.4520550962719194, + "learning_rate": 3.854691312165414e-05, + "loss": 0.6904, + "num_input_tokens_seen": 524330240, + "step": 2902 + }, + { + "epoch": 0.3177974219327294, + "grad_norm": 1.2667359550665633, + "learning_rate": 3.8539685538654325e-05, + "loss": 0.7055, + "num_input_tokens_seen": 524507200, + "step": 2903 + }, + { + "epoch": 0.31790689400366734, + "grad_norm": 1.32829198348534, + "learning_rate": 3.853245635393558e-05, + "loss": 0.7334, + "num_input_tokens_seen": 524675648, + "step": 2904 + }, + { + "epoch": 0.3180163660746052, + "grad_norm": 1.2265060800587502, + "learning_rate": 3.852522556835313e-05, + "loss": 0.6949, + "num_input_tokens_seen": 524858432, + "step": 2905 + }, + { + "epoch": 0.3181258381455431, + "grad_norm": 1.1655427878179225, + "learning_rate": 3.8517993182762334e-05, + "loss": 0.6758, + "num_input_tokens_seen": 525032480, + "step": 2906 + }, + { + "epoch": 0.318235310216481, + "grad_norm": 1.1949752631081145, + "learning_rate": 3.8510759198018805e-05, + "loss": 0.6856, + "num_input_tokens_seen": 525216832, + "step": 2907 + }, + { + "epoch": 0.3183447822874189, + "grad_norm": 1.2822012742651019, + "learning_rate": 3.8503523614978274e-05, + "loss": 0.6203, + "num_input_tokens_seen": 525407008, + "step": 2908 + }, + { + "epoch": 0.31845425435835684, + "grad_norm": 1.358295011662404, + "learning_rate": 3.849628643449673e-05, + "loss": 0.7456, + "num_input_tokens_seen": 525571648, + "step": 2909 + }, + { + "epoch": 0.3185637264292947, + "grad_norm": 1.1868348015299945, + "learning_rate": 3.8489047657430286e-05, + "loss": 0.5976, + "num_input_tokens_seen": 525740096, + "step": 2910 + }, + { + "epoch": 0.3186731985002326, + "grad_norm": 1.3253449458832625, + "learning_rate": 3.84818072846353e-05, + "loss": 0.8062, + "num_input_tokens_seen": 525909216, + "step": 2911 + }, + { + "epoch": 0.31878267057117055, + "grad_norm": 1.4743212285733298, + "learning_rate": 3.8474565316968284e-05, + "loss": 0.7807, + "num_input_tokens_seen": 526065344, + "step": 2912 + }, + { + "epoch": 0.3188921426421084, + "grad_norm": 1.470736519686431, + "learning_rate": 3.846732175528595e-05, + "loss": 0.8759, + "num_input_tokens_seen": 526248576, + "step": 2913 + }, + { + "epoch": 0.31900161471304633, + "grad_norm": 1.4531413251718641, + "learning_rate": 3.84600766004452e-05, + "loss": 0.7472, + "num_input_tokens_seen": 526432704, + "step": 2914 + }, + { + "epoch": 0.31911108678398425, + "grad_norm": 1.376690605515789, + "learning_rate": 3.845282985330311e-05, + "loss": 0.7634, + "num_input_tokens_seen": 526586368, + "step": 2915 + }, + { + "epoch": 0.3192205588549221, + "grad_norm": 1.3395585626869928, + "learning_rate": 3.8445581514716977e-05, + "loss": 0.601, + "num_input_tokens_seen": 526721216, + "step": 2916 + }, + { + "epoch": 0.31933003092586004, + "grad_norm": 1.4855759430498192, + "learning_rate": 3.843833158554425e-05, + "loss": 0.8711, + "num_input_tokens_seen": 526950592, + "step": 2917 + }, + { + "epoch": 0.31943950299679796, + "grad_norm": 1.2665185952641846, + "learning_rate": 3.843108006664259e-05, + "loss": 0.7113, + "num_input_tokens_seen": 527112096, + "step": 2918 + }, + { + "epoch": 0.31954897506773583, + "grad_norm": 1.1016411178346455, + "learning_rate": 3.8423826958869825e-05, + "loss": 0.7535, + "num_input_tokens_seen": 527303616, + "step": 2919 + }, + { + "epoch": 0.31965844713867375, + "grad_norm": 1.13504970231652, + "learning_rate": 3.841657226308399e-05, + "loss": 0.8099, + "num_input_tokens_seen": 527494688, + "step": 2920 + }, + { + "epoch": 0.3197679192096117, + "grad_norm": 1.4004289736212172, + "learning_rate": 3.840931598014332e-05, + "loss": 0.5972, + "num_input_tokens_seen": 527643872, + "step": 2921 + }, + { + "epoch": 0.31987739128054954, + "grad_norm": 1.1545392345173537, + "learning_rate": 3.840205811090619e-05, + "loss": 0.5544, + "num_input_tokens_seen": 527814336, + "step": 2922 + }, + { + "epoch": 0.31998686335148746, + "grad_norm": 1.2966626004855641, + "learning_rate": 3.8394798656231215e-05, + "loss": 0.8348, + "num_input_tokens_seen": 528016160, + "step": 2923 + }, + { + "epoch": 0.3200963354224253, + "grad_norm": 1.2900420195367321, + "learning_rate": 3.8387537616977165e-05, + "loss": 0.9048, + "num_input_tokens_seen": 528228064, + "step": 2924 + }, + { + "epoch": 0.32020580749336325, + "grad_norm": 1.0958800691470185, + "learning_rate": 3.8380274994003e-05, + "loss": 0.562, + "num_input_tokens_seen": 528374112, + "step": 2925 + }, + { + "epoch": 0.32031527956430117, + "grad_norm": 1.1362078590258247, + "learning_rate": 3.837301078816789e-05, + "loss": 0.7073, + "num_input_tokens_seen": 528534944, + "step": 2926 + }, + { + "epoch": 0.32042475163523904, + "grad_norm": 1.3234615805640855, + "learning_rate": 3.8365745000331164e-05, + "loss": 0.6107, + "num_input_tokens_seen": 528705856, + "step": 2927 + }, + { + "epoch": 0.32053422370617696, + "grad_norm": 1.2809912257433766, + "learning_rate": 3.8358477631352364e-05, + "loss": 0.5801, + "num_input_tokens_seen": 528861312, + "step": 2928 + }, + { + "epoch": 0.3206436957771149, + "grad_norm": 1.415527006888111, + "learning_rate": 3.8351208682091185e-05, + "loss": 0.8095, + "num_input_tokens_seen": 529051040, + "step": 2929 + }, + { + "epoch": 0.32075316784805274, + "grad_norm": 1.2436952377608494, + "learning_rate": 3.834393815340754e-05, + "loss": 0.6655, + "num_input_tokens_seen": 529252416, + "step": 2930 + }, + { + "epoch": 0.32086263991899067, + "grad_norm": 1.2363545443588135, + "learning_rate": 3.833666604616153e-05, + "loss": 0.6824, + "num_input_tokens_seen": 529439904, + "step": 2931 + }, + { + "epoch": 0.3209721119899286, + "grad_norm": 1.3598062116334892, + "learning_rate": 3.832939236121342e-05, + "loss": 0.7335, + "num_input_tokens_seen": 529600512, + "step": 2932 + }, + { + "epoch": 0.32108158406086645, + "grad_norm": 1.3167237032032926, + "learning_rate": 3.8322117099423674e-05, + "loss": 0.7757, + "num_input_tokens_seen": 529796064, + "step": 2933 + }, + { + "epoch": 0.3211910561318044, + "grad_norm": 1.3149878673936566, + "learning_rate": 3.8314840261652954e-05, + "loss": 0.7436, + "num_input_tokens_seen": 529973024, + "step": 2934 + }, + { + "epoch": 0.3213005282027423, + "grad_norm": 1.2284560439273244, + "learning_rate": 3.8307561848762066e-05, + "loss": 0.6174, + "num_input_tokens_seen": 530156928, + "step": 2935 + }, + { + "epoch": 0.32141000027368016, + "grad_norm": 1.2419657479291593, + "learning_rate": 3.8300281861612056e-05, + "loss": 0.8203, + "num_input_tokens_seen": 530369056, + "step": 2936 + }, + { + "epoch": 0.3215194723446181, + "grad_norm": 1.1790565167438787, + "learning_rate": 3.829300030106413e-05, + "loss": 0.69, + "num_input_tokens_seen": 530553408, + "step": 2937 + }, + { + "epoch": 0.321628944415556, + "grad_norm": 1.281832399818227, + "learning_rate": 3.828571716797968e-05, + "loss": 0.836, + "num_input_tokens_seen": 530759040, + "step": 2938 + }, + { + "epoch": 0.3217384164864939, + "grad_norm": 1.3514176660453145, + "learning_rate": 3.827843246322029e-05, + "loss": 0.668, + "num_input_tokens_seen": 530961536, + "step": 2939 + }, + { + "epoch": 0.3218478885574318, + "grad_norm": 1.3320927759543233, + "learning_rate": 3.827114618764772e-05, + "loss": 0.9635, + "num_input_tokens_seen": 531151712, + "step": 2940 + }, + { + "epoch": 0.32195736062836966, + "grad_norm": 1.2956389546137401, + "learning_rate": 3.8263858342123936e-05, + "loss": 0.8707, + "num_input_tokens_seen": 531353312, + "step": 2941 + }, + { + "epoch": 0.3220668326993076, + "grad_norm": 1.1021546199556906, + "learning_rate": 3.8256568927511047e-05, + "loss": 0.7634, + "num_input_tokens_seen": 531543488, + "step": 2942 + }, + { + "epoch": 0.3221763047702455, + "grad_norm": 1.314936193433898, + "learning_rate": 3.8249277944671415e-05, + "loss": 0.7338, + "num_input_tokens_seen": 531733216, + "step": 2943 + }, + { + "epoch": 0.32228577684118337, + "grad_norm": 1.272810003220591, + "learning_rate": 3.824198539446752e-05, + "loss": 0.7396, + "num_input_tokens_seen": 531911520, + "step": 2944 + }, + { + "epoch": 0.3223952489121213, + "grad_norm": 1.1597907521348356, + "learning_rate": 3.823469127776208e-05, + "loss": 0.6152, + "num_input_tokens_seen": 532068096, + "step": 2945 + }, + { + "epoch": 0.3225047209830592, + "grad_norm": 1.19287803919578, + "learning_rate": 3.822739559541795e-05, + "loss": 0.5899, + "num_input_tokens_seen": 532240576, + "step": 2946 + }, + { + "epoch": 0.3226141930539971, + "grad_norm": 1.2761925225387196, + "learning_rate": 3.8220098348298204e-05, + "loss": 0.6232, + "num_input_tokens_seen": 532431648, + "step": 2947 + }, + { + "epoch": 0.322723665124935, + "grad_norm": 1.3478789656887744, + "learning_rate": 3.8212799537266105e-05, + "loss": 0.7026, + "num_input_tokens_seen": 532594944, + "step": 2948 + }, + { + "epoch": 0.3228331371958729, + "grad_norm": 1.264189624406083, + "learning_rate": 3.8205499163185074e-05, + "loss": 0.796, + "num_input_tokens_seen": 532771008, + "step": 2949 + }, + { + "epoch": 0.3229426092668108, + "grad_norm": 1.2969363094683026, + "learning_rate": 3.819819722691874e-05, + "loss": 0.7151, + "num_input_tokens_seen": 532947072, + "step": 2950 + }, + { + "epoch": 0.3230520813377487, + "grad_norm": 1.3675286439619005, + "learning_rate": 3.8190893729330904e-05, + "loss": 0.8587, + "num_input_tokens_seen": 533166592, + "step": 2951 + }, + { + "epoch": 0.32316155340868663, + "grad_norm": 1.4200734893157372, + "learning_rate": 3.8183588671285556e-05, + "loss": 0.7172, + "num_input_tokens_seen": 533376480, + "step": 2952 + }, + { + "epoch": 0.3232710254796245, + "grad_norm": 1.1347139382860874, + "learning_rate": 3.817628205364687e-05, + "loss": 0.6343, + "num_input_tokens_seen": 533547168, + "step": 2953 + }, + { + "epoch": 0.3233804975505624, + "grad_norm": 1.4725837993254465, + "learning_rate": 3.816897387727921e-05, + "loss": 0.8268, + "num_input_tokens_seen": 533713376, + "step": 2954 + }, + { + "epoch": 0.32348996962150034, + "grad_norm": 1.2255378379349975, + "learning_rate": 3.816166414304711e-05, + "loss": 0.8066, + "num_input_tokens_seen": 533893696, + "step": 2955 + }, + { + "epoch": 0.3235994416924382, + "grad_norm": 1.2983554524964347, + "learning_rate": 3.81543528518153e-05, + "loss": 0.7821, + "num_input_tokens_seen": 534101792, + "step": 2956 + }, + { + "epoch": 0.3237089137633761, + "grad_norm": 1.2428433108617107, + "learning_rate": 3.81470400044487e-05, + "loss": 0.8894, + "num_input_tokens_seen": 534293536, + "step": 2957 + }, + { + "epoch": 0.323818385834314, + "grad_norm": 1.2166727899806642, + "learning_rate": 3.81397256018124e-05, + "loss": 0.7484, + "num_input_tokens_seen": 534464896, + "step": 2958 + }, + { + "epoch": 0.3239278579052519, + "grad_norm": 1.1077354565773943, + "learning_rate": 3.8132409644771683e-05, + "loss": 0.605, + "num_input_tokens_seen": 534665600, + "step": 2959 + }, + { + "epoch": 0.32403732997618984, + "grad_norm": 1.2910258744333596, + "learning_rate": 3.812509213419201e-05, + "loss": 0.6027, + "num_input_tokens_seen": 534832928, + "step": 2960 + }, + { + "epoch": 0.3241468020471277, + "grad_norm": 1.2203163938179364, + "learning_rate": 3.8117773070939025e-05, + "loss": 0.8371, + "num_input_tokens_seen": 535006080, + "step": 2961 + }, + { + "epoch": 0.3242562741180656, + "grad_norm": 1.224210926802615, + "learning_rate": 3.811045245587856e-05, + "loss": 0.7262, + "num_input_tokens_seen": 535208576, + "step": 2962 + }, + { + "epoch": 0.32436574618900355, + "grad_norm": 1.2472136436786645, + "learning_rate": 3.810313028987663e-05, + "loss": 0.7924, + "num_input_tokens_seen": 535373664, + "step": 2963 + }, + { + "epoch": 0.3244752182599414, + "grad_norm": 1.2765977646848916, + "learning_rate": 3.809580657379944e-05, + "loss": 0.6628, + "num_input_tokens_seen": 535570560, + "step": 2964 + }, + { + "epoch": 0.32458469033087933, + "grad_norm": 1.290839756054268, + "learning_rate": 3.8088481308513375e-05, + "loss": 0.5767, + "num_input_tokens_seen": 535761408, + "step": 2965 + }, + { + "epoch": 0.32469416240181725, + "grad_norm": 1.300753939688437, + "learning_rate": 3.808115449488499e-05, + "loss": 0.7851, + "num_input_tokens_seen": 535956512, + "step": 2966 + }, + { + "epoch": 0.3248036344727551, + "grad_norm": 1.3184769641569443, + "learning_rate": 3.8073826133781026e-05, + "loss": 0.7067, + "num_input_tokens_seen": 536103008, + "step": 2967 + }, + { + "epoch": 0.32491310654369304, + "grad_norm": 1.1722877792710644, + "learning_rate": 3.8066496226068426e-05, + "loss": 0.6918, + "num_input_tokens_seen": 536293856, + "step": 2968 + }, + { + "epoch": 0.32502257861463096, + "grad_norm": 1.4037618453493304, + "learning_rate": 3.8059164772614304e-05, + "loss": 0.7358, + "num_input_tokens_seen": 536477984, + "step": 2969 + }, + { + "epoch": 0.32513205068556883, + "grad_norm": 1.3629262292507616, + "learning_rate": 3.805183177428595e-05, + "loss": 0.7322, + "num_input_tokens_seen": 536648224, + "step": 2970 + }, + { + "epoch": 0.32524152275650675, + "grad_norm": 1.3682907884436726, + "learning_rate": 3.8044497231950855e-05, + "loss": 0.5611, + "num_input_tokens_seen": 536829440, + "step": 2971 + }, + { + "epoch": 0.3253509948274447, + "grad_norm": 1.2639627726179141, + "learning_rate": 3.803716114647667e-05, + "loss": 0.6641, + "num_input_tokens_seen": 537007520, + "step": 2972 + }, + { + "epoch": 0.32546046689838254, + "grad_norm": 1.1744343085790458, + "learning_rate": 3.8029823518731247e-05, + "loss": 0.7223, + "num_input_tokens_seen": 537198816, + "step": 2973 + }, + { + "epoch": 0.32556993896932046, + "grad_norm": 1.3153051191337213, + "learning_rate": 3.802248434958261e-05, + "loss": 0.6333, + "num_input_tokens_seen": 537365248, + "step": 2974 + }, + { + "epoch": 0.3256794110402583, + "grad_norm": 1.2819846303438056, + "learning_rate": 3.801514363989897e-05, + "loss": 0.6795, + "num_input_tokens_seen": 537539296, + "step": 2975 + }, + { + "epoch": 0.32578888311119625, + "grad_norm": 1.2296939893306527, + "learning_rate": 3.8007801390548706e-05, + "loss": 0.5947, + "num_input_tokens_seen": 537716480, + "step": 2976 + }, + { + "epoch": 0.32589835518213417, + "grad_norm": 1.2404629714433373, + "learning_rate": 3.800045760240042e-05, + "loss": 0.6689, + "num_input_tokens_seen": 537888512, + "step": 2977 + }, + { + "epoch": 0.32600782725307204, + "grad_norm": 1.2306066042259907, + "learning_rate": 3.799311227632284e-05, + "loss": 0.6666, + "num_input_tokens_seen": 538080928, + "step": 2978 + }, + { + "epoch": 0.32611729932400996, + "grad_norm": 1.2619796341493597, + "learning_rate": 3.7985765413184924e-05, + "loss": 0.7422, + "num_input_tokens_seen": 538241536, + "step": 2979 + }, + { + "epoch": 0.3262267713949479, + "grad_norm": 1.1835993398402191, + "learning_rate": 3.797841701385578e-05, + "loss": 0.6312, + "num_input_tokens_seen": 538422976, + "step": 2980 + }, + { + "epoch": 0.32633624346588574, + "grad_norm": 1.363173078819859, + "learning_rate": 3.7971067079204726e-05, + "loss": 0.7662, + "num_input_tokens_seen": 538588288, + "step": 2981 + }, + { + "epoch": 0.32644571553682367, + "grad_norm": 1.330155335368547, + "learning_rate": 3.7963715610101215e-05, + "loss": 0.7762, + "num_input_tokens_seen": 538786752, + "step": 2982 + }, + { + "epoch": 0.3265551876077616, + "grad_norm": 1.3661652585315107, + "learning_rate": 3.795636260741494e-05, + "loss": 1.0204, + "num_input_tokens_seen": 538953632, + "step": 2983 + }, + { + "epoch": 0.32666465967869945, + "grad_norm": 1.4263161687856005, + "learning_rate": 3.794900807201574e-05, + "loss": 0.7219, + "num_input_tokens_seen": 539126112, + "step": 2984 + }, + { + "epoch": 0.3267741317496374, + "grad_norm": 1.2270199839760803, + "learning_rate": 3.794165200477363e-05, + "loss": 0.4956, + "num_input_tokens_seen": 539273504, + "step": 2985 + }, + { + "epoch": 0.3268836038205753, + "grad_norm": 1.3353075918245356, + "learning_rate": 3.793429440655884e-05, + "loss": 0.7768, + "num_input_tokens_seen": 539477120, + "step": 2986 + }, + { + "epoch": 0.32699307589151316, + "grad_norm": 1.3905685499070737, + "learning_rate": 3.792693527824174e-05, + "loss": 0.7814, + "num_input_tokens_seen": 539651616, + "step": 2987 + }, + { + "epoch": 0.3271025479624511, + "grad_norm": 1.348177513263475, + "learning_rate": 3.791957462069291e-05, + "loss": 0.7395, + "num_input_tokens_seen": 539804160, + "step": 2988 + }, + { + "epoch": 0.327212020033389, + "grad_norm": 1.157597040677193, + "learning_rate": 3.7912212434783095e-05, + "loss": 0.5808, + "num_input_tokens_seen": 539979776, + "step": 2989 + }, + { + "epoch": 0.32732149210432687, + "grad_norm": 1.3791451828004546, + "learning_rate": 3.7904848721383234e-05, + "loss": 0.6056, + "num_input_tokens_seen": 540129184, + "step": 2990 + }, + { + "epoch": 0.3274309641752648, + "grad_norm": 1.3298303314489368, + "learning_rate": 3.789748348136444e-05, + "loss": 0.7609, + "num_input_tokens_seen": 540339072, + "step": 2991 + }, + { + "epoch": 0.3275404362462027, + "grad_norm": 1.2856086931691966, + "learning_rate": 3.7890116715598013e-05, + "loss": 0.5684, + "num_input_tokens_seen": 540545376, + "step": 2992 + }, + { + "epoch": 0.3276499083171406, + "grad_norm": 1.360067257059586, + "learning_rate": 3.7882748424955414e-05, + "loss": 0.9684, + "num_input_tokens_seen": 540744736, + "step": 2993 + }, + { + "epoch": 0.3277593803880785, + "grad_norm": 1.29288908836923, + "learning_rate": 3.7875378610308306e-05, + "loss": 0.7161, + "num_input_tokens_seen": 540930208, + "step": 2994 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 1.4165827595414702, + "learning_rate": 3.7868007272528524e-05, + "loss": 0.8304, + "num_input_tokens_seen": 541080960, + "step": 2995 + }, + { + "epoch": 0.3279783245299543, + "grad_norm": 1.3488757821332586, + "learning_rate": 3.786063441248808e-05, + "loss": 0.6485, + "num_input_tokens_seen": 541271136, + "step": 2996 + }, + { + "epoch": 0.3280877966008922, + "grad_norm": 1.505996796785674, + "learning_rate": 3.785326003105916e-05, + "loss": 0.8814, + "num_input_tokens_seen": 541427488, + "step": 2997 + }, + { + "epoch": 0.3281972686718301, + "grad_norm": 1.3098395950449886, + "learning_rate": 3.784588412911416e-05, + "loss": 0.5283, + "num_input_tokens_seen": 541617888, + "step": 2998 + }, + { + "epoch": 0.328306740742768, + "grad_norm": 1.266598409562021, + "learning_rate": 3.783850670752563e-05, + "loss": 0.6947, + "num_input_tokens_seen": 541816352, + "step": 2999 + }, + { + "epoch": 0.3284162128137059, + "grad_norm": 1.1955609530869105, + "learning_rate": 3.783112776716629e-05, + "loss": 0.821, + "num_input_tokens_seen": 541993760, + "step": 3000 + }, + { + "epoch": 0.3285256848846438, + "grad_norm": 1.4135991807489445, + "learning_rate": 3.782374730890908e-05, + "loss": 1.0811, + "num_input_tokens_seen": 542173632, + "step": 3001 + }, + { + "epoch": 0.3286351569555817, + "grad_norm": 1.372382533704484, + "learning_rate": 3.781636533362706e-05, + "loss": 0.7887, + "num_input_tokens_seen": 542372320, + "step": 3002 + }, + { + "epoch": 0.32874462902651963, + "grad_norm": 1.3186081489095949, + "learning_rate": 3.780898184219352e-05, + "loss": 0.7109, + "num_input_tokens_seen": 542521728, + "step": 3003 + }, + { + "epoch": 0.3288541010974575, + "grad_norm": 1.2984063632504785, + "learning_rate": 3.780159683548192e-05, + "loss": 0.904, + "num_input_tokens_seen": 542699584, + "step": 3004 + }, + { + "epoch": 0.3289635731683954, + "grad_norm": 1.2534254346564997, + "learning_rate": 3.779421031436588e-05, + "loss": 0.8152, + "num_input_tokens_seen": 542902752, + "step": 3005 + }, + { + "epoch": 0.32907304523933334, + "grad_norm": 1.1773944319844636, + "learning_rate": 3.7786822279719237e-05, + "loss": 0.5504, + "num_input_tokens_seen": 543088224, + "step": 3006 + }, + { + "epoch": 0.3291825173102712, + "grad_norm": 1.1792762999684065, + "learning_rate": 3.777943273241595e-05, + "loss": 0.7772, + "num_input_tokens_seen": 543258016, + "step": 3007 + }, + { + "epoch": 0.3292919893812091, + "grad_norm": 1.2296330596234675, + "learning_rate": 3.77720416733302e-05, + "loss": 0.6468, + "num_input_tokens_seen": 543426912, + "step": 3008 + }, + { + "epoch": 0.32940146145214705, + "grad_norm": 1.0441333683445342, + "learning_rate": 3.776464910333635e-05, + "loss": 0.5287, + "num_input_tokens_seen": 543617312, + "step": 3009 + }, + { + "epoch": 0.3295109335230849, + "grad_norm": 1.3469843419558987, + "learning_rate": 3.77572550233089e-05, + "loss": 0.6494, + "num_input_tokens_seen": 543810848, + "step": 3010 + }, + { + "epoch": 0.32962040559402284, + "grad_norm": 1.2435501589841667, + "learning_rate": 3.774985943412257e-05, + "loss": 0.6514, + "num_input_tokens_seen": 543986912, + "step": 3011 + }, + { + "epoch": 0.3297298776649607, + "grad_norm": 1.454436499947368, + "learning_rate": 3.774246233665224e-05, + "loss": 0.9398, + "num_input_tokens_seen": 544188960, + "step": 3012 + }, + { + "epoch": 0.3298393497358986, + "grad_norm": 1.1999305327494494, + "learning_rate": 3.773506373177298e-05, + "loss": 0.5982, + "num_input_tokens_seen": 544376896, + "step": 3013 + }, + { + "epoch": 0.32994882180683655, + "grad_norm": 1.1085783018470048, + "learning_rate": 3.7727663620360026e-05, + "loss": 0.5543, + "num_input_tokens_seen": 544550944, + "step": 3014 + }, + { + "epoch": 0.3300582938777744, + "grad_norm": 1.222677651117487, + "learning_rate": 3.772026200328879e-05, + "loss": 0.5163, + "num_input_tokens_seen": 544731936, + "step": 3015 + }, + { + "epoch": 0.33016776594871233, + "grad_norm": 1.4411647356524118, + "learning_rate": 3.771285888143489e-05, + "loss": 0.7702, + "num_input_tokens_seen": 544918080, + "step": 3016 + }, + { + "epoch": 0.33027723801965025, + "grad_norm": 1.3100066798527594, + "learning_rate": 3.7705454255674064e-05, + "loss": 0.5687, + "num_input_tokens_seen": 545073984, + "step": 3017 + }, + { + "epoch": 0.3303867100905881, + "grad_norm": 1.3314797610593703, + "learning_rate": 3.769804812688231e-05, + "loss": 0.8614, + "num_input_tokens_seen": 545245120, + "step": 3018 + }, + { + "epoch": 0.33049618216152604, + "grad_norm": 1.417891991589572, + "learning_rate": 3.769064049593573e-05, + "loss": 0.748, + "num_input_tokens_seen": 545417600, + "step": 3019 + }, + { + "epoch": 0.33060565423246396, + "grad_norm": 1.3555000147662977, + "learning_rate": 3.768323136371064e-05, + "loss": 0.6956, + "num_input_tokens_seen": 545569024, + "step": 3020 + }, + { + "epoch": 0.33071512630340183, + "grad_norm": 1.3814476303324918, + "learning_rate": 3.7675820731083526e-05, + "loss": 0.6547, + "num_input_tokens_seen": 545751360, + "step": 3021 + }, + { + "epoch": 0.33082459837433975, + "grad_norm": 1.248306271336528, + "learning_rate": 3.766840859893105e-05, + "loss": 0.8081, + "num_input_tokens_seen": 545925184, + "step": 3022 + }, + { + "epoch": 0.3309340704452777, + "grad_norm": 1.3107980865441269, + "learning_rate": 3.766099496813006e-05, + "loss": 0.6221, + "num_input_tokens_seen": 546068544, + "step": 3023 + }, + { + "epoch": 0.33104354251621554, + "grad_norm": 1.1257799941406021, + "learning_rate": 3.765357983955756e-05, + "loss": 0.5686, + "num_input_tokens_seen": 546239680, + "step": 3024 + }, + { + "epoch": 0.33115301458715346, + "grad_norm": 1.1937277766850392, + "learning_rate": 3.764616321409076e-05, + "loss": 0.6519, + "num_input_tokens_seen": 546415072, + "step": 3025 + }, + { + "epoch": 0.3312624866580914, + "grad_norm": 1.3998666205902655, + "learning_rate": 3.763874509260702e-05, + "loss": 0.7772, + "num_input_tokens_seen": 546616672, + "step": 3026 + }, + { + "epoch": 0.33137195872902925, + "grad_norm": 1.3478037877799587, + "learning_rate": 3.7631325475983905e-05, + "loss": 0.6952, + "num_input_tokens_seen": 546797664, + "step": 3027 + }, + { + "epoch": 0.33148143079996717, + "grad_norm": 1.3093793190677758, + "learning_rate": 3.7623904365099134e-05, + "loss": 0.7839, + "num_input_tokens_seen": 546989632, + "step": 3028 + }, + { + "epoch": 0.33159090287090504, + "grad_norm": 1.5965498471911361, + "learning_rate": 3.761648176083061e-05, + "loss": 0.8631, + "num_input_tokens_seen": 547180032, + "step": 3029 + }, + { + "epoch": 0.33170037494184296, + "grad_norm": 1.2560177906319285, + "learning_rate": 3.76090576640564e-05, + "loss": 0.6256, + "num_input_tokens_seen": 547363712, + "step": 3030 + }, + { + "epoch": 0.3318098470127809, + "grad_norm": 1.254824198381831, + "learning_rate": 3.7601632075654784e-05, + "loss": 0.7486, + "num_input_tokens_seen": 547529248, + "step": 3031 + }, + { + "epoch": 0.33191931908371874, + "grad_norm": 1.3009987497492301, + "learning_rate": 3.759420499650419e-05, + "loss": 0.6361, + "num_input_tokens_seen": 547691872, + "step": 3032 + }, + { + "epoch": 0.33202879115465667, + "grad_norm": 1.1641421194817674, + "learning_rate": 3.758677642748321e-05, + "loss": 0.7569, + "num_input_tokens_seen": 547911392, + "step": 3033 + }, + { + "epoch": 0.3321382632255946, + "grad_norm": 1.287779614723621, + "learning_rate": 3.757934636947064e-05, + "loss": 0.6862, + "num_input_tokens_seen": 548075808, + "step": 3034 + }, + { + "epoch": 0.33224773529653245, + "grad_norm": 1.2612909117188946, + "learning_rate": 3.7571914823345444e-05, + "loss": 0.7132, + "num_input_tokens_seen": 548250080, + "step": 3035 + }, + { + "epoch": 0.3323572073674704, + "grad_norm": 1.3036503124071213, + "learning_rate": 3.756448178998676e-05, + "loss": 0.701, + "num_input_tokens_seen": 548462432, + "step": 3036 + }, + { + "epoch": 0.3324666794384083, + "grad_norm": 1.2554619666096107, + "learning_rate": 3.755704727027389e-05, + "loss": 0.6908, + "num_input_tokens_seen": 548632000, + "step": 3037 + }, + { + "epoch": 0.33257615150934616, + "grad_norm": 1.2434316198876747, + "learning_rate": 3.754961126508634e-05, + "loss": 0.5264, + "num_input_tokens_seen": 548783648, + "step": 3038 + }, + { + "epoch": 0.3326856235802841, + "grad_norm": 1.1857355963684482, + "learning_rate": 3.754217377530377e-05, + "loss": 0.5652, + "num_input_tokens_seen": 548983680, + "step": 3039 + }, + { + "epoch": 0.332795095651222, + "grad_norm": 1.2645679816498314, + "learning_rate": 3.753473480180603e-05, + "loss": 0.701, + "num_input_tokens_seen": 549152576, + "step": 3040 + }, + { + "epoch": 0.33290456772215987, + "grad_norm": 1.268724672412347, + "learning_rate": 3.752729434547311e-05, + "loss": 0.6705, + "num_input_tokens_seen": 549351040, + "step": 3041 + }, + { + "epoch": 0.3330140397930978, + "grad_norm": 1.3276784707196803, + "learning_rate": 3.751985240718522e-05, + "loss": 0.6188, + "num_input_tokens_seen": 549493728, + "step": 3042 + }, + { + "epoch": 0.3331235118640357, + "grad_norm": 1.507629620623974, + "learning_rate": 3.7512408987822724e-05, + "loss": 0.8571, + "num_input_tokens_seen": 549695552, + "step": 3043 + }, + { + "epoch": 0.3332329839349736, + "grad_norm": 1.2011331195316681, + "learning_rate": 3.750496408826616e-05, + "loss": 0.6649, + "num_input_tokens_seen": 549878560, + "step": 3044 + }, + { + "epoch": 0.3333424560059115, + "grad_norm": 1.4110567804992713, + "learning_rate": 3.749751770939626e-05, + "loss": 0.6378, + "num_input_tokens_seen": 550043200, + "step": 3045 + }, + { + "epoch": 0.33345192807684937, + "grad_norm": 1.394534541107483, + "learning_rate": 3.74900698520939e-05, + "loss": 0.7189, + "num_input_tokens_seen": 550253536, + "step": 3046 + }, + { + "epoch": 0.3335614001477873, + "grad_norm": 1.2107377410526414, + "learning_rate": 3.7482620517240155e-05, + "loss": 0.5551, + "num_input_tokens_seen": 550438336, + "step": 3047 + }, + { + "epoch": 0.3336708722187252, + "grad_norm": 1.430023737490314, + "learning_rate": 3.747516970571626e-05, + "loss": 0.8268, + "num_input_tokens_seen": 550610368, + "step": 3048 + }, + { + "epoch": 0.3337803442896631, + "grad_norm": 1.1498193018382856, + "learning_rate": 3.746771741840365e-05, + "loss": 0.5868, + "num_input_tokens_seen": 550766944, + "step": 3049 + }, + { + "epoch": 0.333889816360601, + "grad_norm": 1.347053238123199, + "learning_rate": 3.746026365618389e-05, + "loss": 0.6358, + "num_input_tokens_seen": 550944128, + "step": 3050 + }, + { + "epoch": 0.3339992884315389, + "grad_norm": 1.3687770213765105, + "learning_rate": 3.745280841993876e-05, + "loss": 0.7253, + "num_input_tokens_seen": 551118624, + "step": 3051 + }, + { + "epoch": 0.3341087605024768, + "grad_norm": 1.2358845039898139, + "learning_rate": 3.744535171055021e-05, + "loss": 0.6385, + "num_input_tokens_seen": 551301856, + "step": 3052 + }, + { + "epoch": 0.3342182325734147, + "grad_norm": 1.2631648135623172, + "learning_rate": 3.743789352890034e-05, + "loss": 0.7595, + "num_input_tokens_seen": 551468736, + "step": 3053 + }, + { + "epoch": 0.33432770464435263, + "grad_norm": 1.4208818257727416, + "learning_rate": 3.743043387587144e-05, + "loss": 0.8359, + "num_input_tokens_seen": 551643456, + "step": 3054 + }, + { + "epoch": 0.3344371767152905, + "grad_norm": 1.1922618857759342, + "learning_rate": 3.742297275234598e-05, + "loss": 0.6378, + "num_input_tokens_seen": 551815712, + "step": 3055 + }, + { + "epoch": 0.3345466487862284, + "grad_norm": 1.2720851621459517, + "learning_rate": 3.7415510159206593e-05, + "loss": 0.6353, + "num_input_tokens_seen": 551983936, + "step": 3056 + }, + { + "epoch": 0.33465612085716634, + "grad_norm": 1.244469759296663, + "learning_rate": 3.740804609733608e-05, + "loss": 0.6878, + "num_input_tokens_seen": 552176800, + "step": 3057 + }, + { + "epoch": 0.3347655929281042, + "grad_norm": 1.2607596327492918, + "learning_rate": 3.740058056761743e-05, + "loss": 0.7228, + "num_input_tokens_seen": 552345696, + "step": 3058 + }, + { + "epoch": 0.3348750649990421, + "grad_norm": 1.2697500656527483, + "learning_rate": 3.739311357093382e-05, + "loss": 0.5975, + "num_input_tokens_seen": 552532960, + "step": 3059 + }, + { + "epoch": 0.33498453706998005, + "grad_norm": 1.3088845911928728, + "learning_rate": 3.738564510816856e-05, + "loss": 0.8951, + "num_input_tokens_seen": 552701632, + "step": 3060 + }, + { + "epoch": 0.3350940091409179, + "grad_norm": 1.311539480149579, + "learning_rate": 3.737817518020516e-05, + "loss": 0.7841, + "num_input_tokens_seen": 552892480, + "step": 3061 + }, + { + "epoch": 0.33520348121185584, + "grad_norm": 1.2451659667987889, + "learning_rate": 3.73707037879273e-05, + "loss": 0.642, + "num_input_tokens_seen": 553065856, + "step": 3062 + }, + { + "epoch": 0.3353129532827937, + "grad_norm": 1.4573889899141457, + "learning_rate": 3.736323093221884e-05, + "loss": 0.9782, + "num_input_tokens_seen": 553250656, + "step": 3063 + }, + { + "epoch": 0.3354224253537316, + "grad_norm": 1.3135325139831266, + "learning_rate": 3.735575661396378e-05, + "loss": 0.6574, + "num_input_tokens_seen": 553420000, + "step": 3064 + }, + { + "epoch": 0.33553189742466955, + "grad_norm": 1.2633846375122426, + "learning_rate": 3.7348280834046334e-05, + "loss": 0.7347, + "num_input_tokens_seen": 553614656, + "step": 3065 + }, + { + "epoch": 0.3356413694956074, + "grad_norm": 1.1616191439073065, + "learning_rate": 3.7340803593350884e-05, + "loss": 0.838, + "num_input_tokens_seen": 553818048, + "step": 3066 + }, + { + "epoch": 0.33575084156654533, + "grad_norm": 1.409073982318888, + "learning_rate": 3.733332489276195e-05, + "loss": 0.81, + "num_input_tokens_seen": 553992096, + "step": 3067 + }, + { + "epoch": 0.33586031363748325, + "grad_norm": 1.2032969958735542, + "learning_rate": 3.7325844733164256e-05, + "loss": 0.7872, + "num_input_tokens_seen": 554184288, + "step": 3068 + }, + { + "epoch": 0.3359697857084211, + "grad_norm": 1.255760224553047, + "learning_rate": 3.73183631154427e-05, + "loss": 0.6419, + "num_input_tokens_seen": 554396192, + "step": 3069 + }, + { + "epoch": 0.33607925777935904, + "grad_norm": 1.278951537134971, + "learning_rate": 3.7310880040482335e-05, + "loss": 0.7937, + "num_input_tokens_seen": 554600704, + "step": 3070 + }, + { + "epoch": 0.33618872985029696, + "grad_norm": 1.1030612786667922, + "learning_rate": 3.730339550916839e-05, + "loss": 0.7986, + "num_input_tokens_seen": 554803648, + "step": 3071 + }, + { + "epoch": 0.33629820192123483, + "grad_norm": 1.2355685198103346, + "learning_rate": 3.729590952238628e-05, + "loss": 0.6929, + "num_input_tokens_seen": 554989792, + "step": 3072 + }, + { + "epoch": 0.33640767399217275, + "grad_norm": 1.3097002504019823, + "learning_rate": 3.728842208102158e-05, + "loss": 0.7243, + "num_input_tokens_seen": 555178176, + "step": 3073 + }, + { + "epoch": 0.3365171460631107, + "grad_norm": 1.1617955914181757, + "learning_rate": 3.728093318596004e-05, + "loss": 0.5647, + "num_input_tokens_seen": 555370368, + "step": 3074 + }, + { + "epoch": 0.33662661813404854, + "grad_norm": 1.3089739802217315, + "learning_rate": 3.7273442838087584e-05, + "loss": 0.7666, + "num_input_tokens_seen": 555529632, + "step": 3075 + }, + { + "epoch": 0.33673609020498646, + "grad_norm": 1.328706860707769, + "learning_rate": 3.7265951038290305e-05, + "loss": 0.6633, + "num_input_tokens_seen": 555668288, + "step": 3076 + }, + { + "epoch": 0.3368455622759244, + "grad_norm": 1.2294094794150408, + "learning_rate": 3.725845778745446e-05, + "loss": 0.6464, + "num_input_tokens_seen": 555896096, + "step": 3077 + }, + { + "epoch": 0.33695503434686225, + "grad_norm": 1.1761083813179478, + "learning_rate": 3.725096308646649e-05, + "loss": 0.5667, + "num_input_tokens_seen": 556085152, + "step": 3078 + }, + { + "epoch": 0.33706450641780017, + "grad_norm": 1.286168172548632, + "learning_rate": 3.724346693621301e-05, + "loss": 0.7083, + "num_input_tokens_seen": 556254944, + "step": 3079 + }, + { + "epoch": 0.33717397848873804, + "grad_norm": 1.4485676991894896, + "learning_rate": 3.72359693375808e-05, + "loss": 0.804, + "num_input_tokens_seen": 556428992, + "step": 3080 + }, + { + "epoch": 0.33728345055967596, + "grad_norm": 1.2511245913924602, + "learning_rate": 3.722847029145681e-05, + "loss": 0.6773, + "num_input_tokens_seen": 556602144, + "step": 3081 + }, + { + "epoch": 0.3373929226306139, + "grad_norm": 1.322942788574559, + "learning_rate": 3.722096979872815e-05, + "loss": 0.7295, + "num_input_tokens_seen": 556771264, + "step": 3082 + }, + { + "epoch": 0.33750239470155174, + "grad_norm": 1.3647314190288853, + "learning_rate": 3.7213467860282144e-05, + "loss": 0.9204, + "num_input_tokens_seen": 556961664, + "step": 3083 + }, + { + "epoch": 0.33761186677248967, + "grad_norm": 1.294744511075786, + "learning_rate": 3.720596447700623e-05, + "loss": 0.6539, + "num_input_tokens_seen": 557126528, + "step": 3084 + }, + { + "epoch": 0.3377213388434276, + "grad_norm": 1.2538014305724823, + "learning_rate": 3.7198459649788045e-05, + "loss": 0.6929, + "num_input_tokens_seen": 557306624, + "step": 3085 + }, + { + "epoch": 0.33783081091436545, + "grad_norm": 1.204810460565377, + "learning_rate": 3.7190953379515404e-05, + "loss": 0.7993, + "num_input_tokens_seen": 557520320, + "step": 3086 + }, + { + "epoch": 0.3379402829853034, + "grad_norm": 1.1914101022126504, + "learning_rate": 3.718344566707629e-05, + "loss": 0.7108, + "num_input_tokens_seen": 557704896, + "step": 3087 + }, + { + "epoch": 0.3380497550562413, + "grad_norm": 1.2883962123046548, + "learning_rate": 3.717593651335884e-05, + "loss": 0.6535, + "num_input_tokens_seen": 557868864, + "step": 3088 + }, + { + "epoch": 0.33815922712717916, + "grad_norm": 1.2783641414253903, + "learning_rate": 3.716842591925138e-05, + "loss": 0.6744, + "num_input_tokens_seen": 558042016, + "step": 3089 + }, + { + "epoch": 0.3382686991981171, + "grad_norm": 1.2072723049205385, + "learning_rate": 3.71609138856424e-05, + "loss": 0.6626, + "num_input_tokens_seen": 558191648, + "step": 3090 + }, + { + "epoch": 0.338378171269055, + "grad_norm": 1.2022310378875114, + "learning_rate": 3.715340041342055e-05, + "loss": 0.737, + "num_input_tokens_seen": 558379584, + "step": 3091 + }, + { + "epoch": 0.33848764333999287, + "grad_norm": 1.2621471516260645, + "learning_rate": 3.7145885503474654e-05, + "loss": 0.7874, + "num_input_tokens_seen": 558549152, + "step": 3092 + }, + { + "epoch": 0.3385971154109308, + "grad_norm": 1.2066729856469651, + "learning_rate": 3.713836915669373e-05, + "loss": 0.6163, + "num_input_tokens_seen": 558719168, + "step": 3093 + }, + { + "epoch": 0.3387065874818687, + "grad_norm": 1.263997573976965, + "learning_rate": 3.713085137396694e-05, + "loss": 0.8031, + "num_input_tokens_seen": 558892096, + "step": 3094 + }, + { + "epoch": 0.3388160595528066, + "grad_norm": 1.3612145440814687, + "learning_rate": 3.712333215618363e-05, + "loss": 0.9104, + "num_input_tokens_seen": 559080704, + "step": 3095 + }, + { + "epoch": 0.3389255316237445, + "grad_norm": 1.185340222316872, + "learning_rate": 3.71158115042333e-05, + "loss": 0.6035, + "num_input_tokens_seen": 559244224, + "step": 3096 + }, + { + "epoch": 0.33903500369468237, + "grad_norm": 1.4044306960561348, + "learning_rate": 3.7108289419005625e-05, + "loss": 0.7184, + "num_input_tokens_seen": 559429472, + "step": 3097 + }, + { + "epoch": 0.3391444757656203, + "grad_norm": 1.3729236270726173, + "learning_rate": 3.710076590139045e-05, + "loss": 0.8382, + "num_input_tokens_seen": 559622784, + "step": 3098 + }, + { + "epoch": 0.3392539478365582, + "grad_norm": 1.1992469809126172, + "learning_rate": 3.7093240952277816e-05, + "loss": 0.8304, + "num_input_tokens_seen": 559803104, + "step": 3099 + }, + { + "epoch": 0.3393634199074961, + "grad_norm": 1.2260025592920372, + "learning_rate": 3.708571457255789e-05, + "loss": 0.7293, + "num_input_tokens_seen": 559989024, + "step": 3100 + }, + { + "epoch": 0.339472891978434, + "grad_norm": 1.3109605025591045, + "learning_rate": 3.7078186763121034e-05, + "loss": 0.8897, + "num_input_tokens_seen": 560163072, + "step": 3101 + }, + { + "epoch": 0.3395823640493719, + "grad_norm": 1.266474309336157, + "learning_rate": 3.7070657524857786e-05, + "loss": 0.523, + "num_input_tokens_seen": 560325920, + "step": 3102 + }, + { + "epoch": 0.3396918361203098, + "grad_norm": 1.42904031415577, + "learning_rate": 3.706312685865881e-05, + "loss": 0.7425, + "num_input_tokens_seen": 560524832, + "step": 3103 + }, + { + "epoch": 0.3398013081912477, + "grad_norm": 1.3000554531447979, + "learning_rate": 3.7055594765415e-05, + "loss": 0.7011, + "num_input_tokens_seen": 560701344, + "step": 3104 + }, + { + "epoch": 0.33991078026218563, + "grad_norm": 1.3251640074369833, + "learning_rate": 3.704806124601736e-05, + "loss": 0.6051, + "num_input_tokens_seen": 560850976, + "step": 3105 + }, + { + "epoch": 0.3400202523331235, + "grad_norm": 1.2284806433034974, + "learning_rate": 3.704052630135713e-05, + "loss": 0.6006, + "num_input_tokens_seen": 561039808, + "step": 3106 + }, + { + "epoch": 0.3401297244040614, + "grad_norm": 1.2587895360077175, + "learning_rate": 3.7032989932325634e-05, + "loss": 0.5989, + "num_input_tokens_seen": 561188320, + "step": 3107 + }, + { + "epoch": 0.34023919647499934, + "grad_norm": 1.2010213121783138, + "learning_rate": 3.7025452139814445e-05, + "loss": 0.7302, + "num_input_tokens_seen": 561352288, + "step": 3108 + }, + { + "epoch": 0.3403486685459372, + "grad_norm": 1.4643777741924062, + "learning_rate": 3.7017912924715257e-05, + "loss": 0.92, + "num_input_tokens_seen": 561546496, + "step": 3109 + }, + { + "epoch": 0.3404581406168751, + "grad_norm": 1.33482425997416, + "learning_rate": 3.701037228791993e-05, + "loss": 0.6253, + "num_input_tokens_seen": 561729280, + "step": 3110 + }, + { + "epoch": 0.34056761268781305, + "grad_norm": 1.2659248185305356, + "learning_rate": 3.7002830230320537e-05, + "loss": 0.6745, + "num_input_tokens_seen": 561910496, + "step": 3111 + }, + { + "epoch": 0.3406770847587509, + "grad_norm": 1.2839613812334234, + "learning_rate": 3.699528675280926e-05, + "loss": 0.9247, + "num_input_tokens_seen": 562082080, + "step": 3112 + }, + { + "epoch": 0.34078655682968884, + "grad_norm": 1.2609828539022203, + "learning_rate": 3.69877418562785e-05, + "loss": 0.6365, + "num_input_tokens_seen": 562282560, + "step": 3113 + }, + { + "epoch": 0.3408960289006267, + "grad_norm": 1.298703203432123, + "learning_rate": 3.69801955416208e-05, + "loss": 0.8286, + "num_input_tokens_seen": 562476768, + "step": 3114 + }, + { + "epoch": 0.3410055009715646, + "grad_norm": 1.2412828234033073, + "learning_rate": 3.697264780972886e-05, + "loss": 0.7051, + "num_input_tokens_seen": 562667840, + "step": 3115 + }, + { + "epoch": 0.34111497304250255, + "grad_norm": 1.1622735428455484, + "learning_rate": 3.696509866149558e-05, + "loss": 0.7157, + "num_input_tokens_seen": 562840544, + "step": 3116 + }, + { + "epoch": 0.3412244451134404, + "grad_norm": 1.5167272433385723, + "learning_rate": 3.6957548097814e-05, + "loss": 0.8251, + "num_input_tokens_seen": 563028928, + "step": 3117 + }, + { + "epoch": 0.34133391718437833, + "grad_norm": 1.3006866695626313, + "learning_rate": 3.6949996119577335e-05, + "loss": 0.7393, + "num_input_tokens_seen": 563203424, + "step": 3118 + }, + { + "epoch": 0.34144338925531625, + "grad_norm": 1.0770713522642532, + "learning_rate": 3.694244272767897e-05, + "loss": 0.4825, + "num_input_tokens_seen": 563385088, + "step": 3119 + }, + { + "epoch": 0.3415528613262541, + "grad_norm": 1.3545277432022174, + "learning_rate": 3.693488792301247e-05, + "loss": 0.7199, + "num_input_tokens_seen": 563563392, + "step": 3120 + }, + { + "epoch": 0.34166233339719204, + "grad_norm": 1.3178383482871434, + "learning_rate": 3.6927331706471536e-05, + "loss": 0.6731, + "num_input_tokens_seen": 563747072, + "step": 3121 + }, + { + "epoch": 0.34177180546812996, + "grad_norm": 1.6077178274739974, + "learning_rate": 3.6919774078950065e-05, + "loss": 0.9561, + "num_input_tokens_seen": 563935232, + "step": 3122 + }, + { + "epoch": 0.34188127753906783, + "grad_norm": 1.2583380128550306, + "learning_rate": 3.691221504134211e-05, + "loss": 0.7495, + "num_input_tokens_seen": 564103680, + "step": 3123 + }, + { + "epoch": 0.34199074961000575, + "grad_norm": 1.220512143263365, + "learning_rate": 3.6904654594541885e-05, + "loss": 0.5662, + "num_input_tokens_seen": 564279072, + "step": 3124 + }, + { + "epoch": 0.3421002216809437, + "grad_norm": 1.3835407214608353, + "learning_rate": 3.689709273944378e-05, + "loss": 0.9131, + "num_input_tokens_seen": 564487616, + "step": 3125 + }, + { + "epoch": 0.34220969375188154, + "grad_norm": 1.2656205730596284, + "learning_rate": 3.6889529476942344e-05, + "loss": 0.664, + "num_input_tokens_seen": 564654944, + "step": 3126 + }, + { + "epoch": 0.34231916582281946, + "grad_norm": 1.3068238544717938, + "learning_rate": 3.6881964807932306e-05, + "loss": 0.7431, + "num_input_tokens_seen": 564873344, + "step": 3127 + }, + { + "epoch": 0.3424286378937574, + "grad_norm": 1.3029582733417453, + "learning_rate": 3.6874398733308544e-05, + "loss": 0.7563, + "num_input_tokens_seen": 565071584, + "step": 3128 + }, + { + "epoch": 0.34253810996469525, + "grad_norm": 1.3783257318784496, + "learning_rate": 3.686683125396611e-05, + "loss": 0.739, + "num_input_tokens_seen": 565204640, + "step": 3129 + }, + { + "epoch": 0.34264758203563317, + "grad_norm": 1.3546718486510934, + "learning_rate": 3.685926237080023e-05, + "loss": 0.779, + "num_input_tokens_seen": 565410048, + "step": 3130 + }, + { + "epoch": 0.34275705410657104, + "grad_norm": 1.2544440901526495, + "learning_rate": 3.6851692084706266e-05, + "loss": 0.6577, + "num_input_tokens_seen": 565572448, + "step": 3131 + }, + { + "epoch": 0.34286652617750896, + "grad_norm": 1.2074855208861592, + "learning_rate": 3.68441203965798e-05, + "loss": 0.6851, + "num_input_tokens_seen": 565735072, + "step": 3132 + }, + { + "epoch": 0.3429759982484469, + "grad_norm": 1.1588698939709943, + "learning_rate": 3.6836547307316524e-05, + "loss": 0.5406, + "num_input_tokens_seen": 565940032, + "step": 3133 + }, + { + "epoch": 0.34308547031938474, + "grad_norm": 1.1752469188890802, + "learning_rate": 3.682897281781234e-05, + "loss": 0.6749, + "num_input_tokens_seen": 566097280, + "step": 3134 + }, + { + "epoch": 0.34319494239032267, + "grad_norm": 1.2570273276600357, + "learning_rate": 3.682139692896328e-05, + "loss": 0.6863, + "num_input_tokens_seen": 566263040, + "step": 3135 + }, + { + "epoch": 0.3433044144612606, + "grad_norm": 1.3747860568670556, + "learning_rate": 3.681381964166556e-05, + "loss": 0.7361, + "num_input_tokens_seen": 566426784, + "step": 3136 + }, + { + "epoch": 0.34341388653219845, + "grad_norm": 1.2946098492553424, + "learning_rate": 3.680624095681557e-05, + "loss": 0.605, + "num_input_tokens_seen": 566561856, + "step": 3137 + }, + { + "epoch": 0.3435233586031364, + "grad_norm": 1.3753868339159134, + "learning_rate": 3.6798660875309836e-05, + "loss": 0.6285, + "num_input_tokens_seen": 566719776, + "step": 3138 + }, + { + "epoch": 0.3436328306740743, + "grad_norm": 1.251579240741757, + "learning_rate": 3.679107939804507e-05, + "loss": 0.5439, + "num_input_tokens_seen": 566885984, + "step": 3139 + }, + { + "epoch": 0.34374230274501216, + "grad_norm": 1.1120179085455801, + "learning_rate": 3.678349652591816e-05, + "loss": 0.4707, + "num_input_tokens_seen": 567065632, + "step": 3140 + }, + { + "epoch": 0.3438517748159501, + "grad_norm": 1.2563745085852203, + "learning_rate": 3.677591225982614e-05, + "loss": 0.7488, + "num_input_tokens_seen": 567249536, + "step": 3141 + }, + { + "epoch": 0.343961246886888, + "grad_norm": 1.2915657475742406, + "learning_rate": 3.67683266006662e-05, + "loss": 0.7251, + "num_input_tokens_seen": 567454272, + "step": 3142 + }, + { + "epoch": 0.34407071895782587, + "grad_norm": 1.270221229267941, + "learning_rate": 3.676073954933573e-05, + "loss": 0.6472, + "num_input_tokens_seen": 567641312, + "step": 3143 + }, + { + "epoch": 0.3441801910287638, + "grad_norm": 1.1470458753136936, + "learning_rate": 3.6753151106732255e-05, + "loss": 0.7084, + "num_input_tokens_seen": 567819168, + "step": 3144 + }, + { + "epoch": 0.3442896630997017, + "grad_norm": 1.2027419335033942, + "learning_rate": 3.674556127375347e-05, + "loss": 0.6171, + "num_input_tokens_seen": 568002400, + "step": 3145 + }, + { + "epoch": 0.3443991351706396, + "grad_norm": 1.274803926313666, + "learning_rate": 3.6737970051297234e-05, + "loss": 0.8196, + "num_input_tokens_seen": 568205792, + "step": 3146 + }, + { + "epoch": 0.3445086072415775, + "grad_norm": 1.3307800681375557, + "learning_rate": 3.673037744026159e-05, + "loss": 0.7456, + "num_input_tokens_seen": 568401120, + "step": 3147 + }, + { + "epoch": 0.34461807931251537, + "grad_norm": 1.2517375790250405, + "learning_rate": 3.672278344154471e-05, + "loss": 0.5945, + "num_input_tokens_seen": 568557696, + "step": 3148 + }, + { + "epoch": 0.3447275513834533, + "grad_norm": 1.2734490610545712, + "learning_rate": 3.671518805604496e-05, + "loss": 0.7462, + "num_input_tokens_seen": 568737792, + "step": 3149 + }, + { + "epoch": 0.3448370234543912, + "grad_norm": 1.4044536137021673, + "learning_rate": 3.670759128466087e-05, + "loss": 0.7897, + "num_input_tokens_seen": 568940960, + "step": 3150 + }, + { + "epoch": 0.3449464955253291, + "grad_norm": 1.033635587991637, + "learning_rate": 3.669999312829111e-05, + "loss": 0.4074, + "num_input_tokens_seen": 569113888, + "step": 3151 + }, + { + "epoch": 0.345055967596267, + "grad_norm": 1.3764983596320863, + "learning_rate": 3.669239358783452e-05, + "loss": 0.7551, + "num_input_tokens_seen": 569288160, + "step": 3152 + }, + { + "epoch": 0.3451654396672049, + "grad_norm": 1.1414643426272684, + "learning_rate": 3.668479266419012e-05, + "loss": 0.5305, + "num_input_tokens_seen": 569438016, + "step": 3153 + }, + { + "epoch": 0.3452749117381428, + "grad_norm": 1.2941912723190974, + "learning_rate": 3.6677190358257086e-05, + "loss": 0.8747, + "num_input_tokens_seen": 569623040, + "step": 3154 + }, + { + "epoch": 0.3453843838090807, + "grad_norm": 1.1804898798507566, + "learning_rate": 3.666958667093476e-05, + "loss": 0.7497, + "num_input_tokens_seen": 569811424, + "step": 3155 + }, + { + "epoch": 0.34549385588001863, + "grad_norm": 1.0800301677411126, + "learning_rate": 3.6661981603122645e-05, + "loss": 0.5819, + "num_input_tokens_seen": 569986368, + "step": 3156 + }, + { + "epoch": 0.3456033279509565, + "grad_norm": 1.2936213480058998, + "learning_rate": 3.665437515572039e-05, + "loss": 0.6702, + "num_input_tokens_seen": 570184160, + "step": 3157 + }, + { + "epoch": 0.3457128000218944, + "grad_norm": 1.3578515559682323, + "learning_rate": 3.664676732962784e-05, + "loss": 0.8348, + "num_input_tokens_seen": 570394720, + "step": 3158 + }, + { + "epoch": 0.34582227209283234, + "grad_norm": 1.107290808508975, + "learning_rate": 3.663915812574497e-05, + "loss": 0.5655, + "num_input_tokens_seen": 570571008, + "step": 3159 + }, + { + "epoch": 0.3459317441637702, + "grad_norm": 1.3634569115036153, + "learning_rate": 3.663154754497196e-05, + "loss": 0.6114, + "num_input_tokens_seen": 570739008, + "step": 3160 + }, + { + "epoch": 0.3460412162347081, + "grad_norm": 1.3186663686982125, + "learning_rate": 3.66239355882091e-05, + "loss": 0.6208, + "num_input_tokens_seen": 570895360, + "step": 3161 + }, + { + "epoch": 0.34615068830564605, + "grad_norm": 1.2674843587418156, + "learning_rate": 3.6616322256356884e-05, + "loss": 0.7707, + "num_input_tokens_seen": 571085536, + "step": 3162 + }, + { + "epoch": 0.3462601603765839, + "grad_norm": 1.3791206344498057, + "learning_rate": 3.6608707550315944e-05, + "loss": 0.6948, + "num_input_tokens_seen": 571273024, + "step": 3163 + }, + { + "epoch": 0.34636963244752184, + "grad_norm": 1.3871303323652, + "learning_rate": 3.660109147098711e-05, + "loss": 0.7548, + "num_input_tokens_seen": 571444160, + "step": 3164 + }, + { + "epoch": 0.3464791045184597, + "grad_norm": 1.3379419024588337, + "learning_rate": 3.659347401927131e-05, + "loss": 0.7098, + "num_input_tokens_seen": 571642400, + "step": 3165 + }, + { + "epoch": 0.3465885765893976, + "grad_norm": 1.3376714596520292, + "learning_rate": 3.6585855196069704e-05, + "loss": 0.6282, + "num_input_tokens_seen": 571820032, + "step": 3166 + }, + { + "epoch": 0.34669804866033554, + "grad_norm": 1.395966419119393, + "learning_rate": 3.657823500228359e-05, + "loss": 0.7717, + "num_input_tokens_seen": 571995648, + "step": 3167 + }, + { + "epoch": 0.3468075207312734, + "grad_norm": 1.1743536104586028, + "learning_rate": 3.65706134388144e-05, + "loss": 0.5266, + "num_input_tokens_seen": 572149536, + "step": 3168 + }, + { + "epoch": 0.34691699280221133, + "grad_norm": 1.2680667352804493, + "learning_rate": 3.656299050656376e-05, + "loss": 0.6338, + "num_input_tokens_seen": 572310816, + "step": 3169 + }, + { + "epoch": 0.34702646487314925, + "grad_norm": 1.258934042241385, + "learning_rate": 3.655536620643345e-05, + "loss": 0.6667, + "num_input_tokens_seen": 572476576, + "step": 3170 + }, + { + "epoch": 0.3471359369440871, + "grad_norm": 1.3085950709094312, + "learning_rate": 3.654774053932541e-05, + "loss": 0.6294, + "num_input_tokens_seen": 572661376, + "step": 3171 + }, + { + "epoch": 0.34724540901502504, + "grad_norm": 1.3760749343134937, + "learning_rate": 3.6540113506141734e-05, + "loss": 0.8661, + "num_input_tokens_seen": 572833184, + "step": 3172 + }, + { + "epoch": 0.34735488108596296, + "grad_norm": 1.3268903041968627, + "learning_rate": 3.653248510778469e-05, + "loss": 0.7283, + "num_input_tokens_seen": 573025600, + "step": 3173 + }, + { + "epoch": 0.34746435315690083, + "grad_norm": 1.2735589092395156, + "learning_rate": 3.652485534515671e-05, + "loss": 0.7794, + "num_input_tokens_seen": 573230336, + "step": 3174 + }, + { + "epoch": 0.34757382522783875, + "grad_norm": 1.31667947622096, + "learning_rate": 3.6517224219160365e-05, + "loss": 0.7147, + "num_input_tokens_seen": 573406176, + "step": 3175 + }, + { + "epoch": 0.3476832972987767, + "grad_norm": 1.0706227542031912, + "learning_rate": 3.6509591730698416e-05, + "loss": 0.5735, + "num_input_tokens_seen": 573601504, + "step": 3176 + }, + { + "epoch": 0.34779276936971454, + "grad_norm": 1.2362241776626697, + "learning_rate": 3.6501957880673775e-05, + "loss": 0.6359, + "num_input_tokens_seen": 573781376, + "step": 3177 + }, + { + "epoch": 0.34790224144065246, + "grad_norm": 1.1716577455679318, + "learning_rate": 3.64943226699895e-05, + "loss": 0.6908, + "num_input_tokens_seen": 573964160, + "step": 3178 + }, + { + "epoch": 0.3480117135115904, + "grad_norm": 1.2568247453003023, + "learning_rate": 3.648668609954883e-05, + "loss": 0.7793, + "num_input_tokens_seen": 574157920, + "step": 3179 + }, + { + "epoch": 0.34812118558252825, + "grad_norm": 1.2315419194733754, + "learning_rate": 3.647904817025514e-05, + "loss": 0.5795, + "num_input_tokens_seen": 574319648, + "step": 3180 + }, + { + "epoch": 0.34823065765346617, + "grad_norm": 1.2212230338142502, + "learning_rate": 3.6471408883012006e-05, + "loss": 0.8677, + "num_input_tokens_seen": 574479136, + "step": 3181 + }, + { + "epoch": 0.34834012972440404, + "grad_norm": 1.316989033907549, + "learning_rate": 3.646376823872313e-05, + "loss": 1.0189, + "num_input_tokens_seen": 574696864, + "step": 3182 + }, + { + "epoch": 0.34844960179534196, + "grad_norm": 1.2464209817506158, + "learning_rate": 3.6456126238292394e-05, + "loss": 0.6874, + "num_input_tokens_seen": 574873824, + "step": 3183 + }, + { + "epoch": 0.3485590738662799, + "grad_norm": 1.3749404807646937, + "learning_rate": 3.6448482882623814e-05, + "loss": 0.8923, + "num_input_tokens_seen": 575080576, + "step": 3184 + }, + { + "epoch": 0.34866854593721774, + "grad_norm": 1.1755833253262638, + "learning_rate": 3.64408381726216e-05, + "loss": 0.7943, + "num_input_tokens_seen": 575275232, + "step": 3185 + }, + { + "epoch": 0.34877801800815567, + "grad_norm": 1.3631813865308713, + "learning_rate": 3.6433192109190096e-05, + "loss": 0.929, + "num_input_tokens_seen": 575455776, + "step": 3186 + }, + { + "epoch": 0.3488874900790936, + "grad_norm": 1.0801977605250168, + "learning_rate": 3.642554469323382e-05, + "loss": 0.5777, + "num_input_tokens_seen": 575619296, + "step": 3187 + }, + { + "epoch": 0.34899696215003145, + "grad_norm": 1.3033977232309815, + "learning_rate": 3.641789592565746e-05, + "loss": 0.7902, + "num_input_tokens_seen": 575823136, + "step": 3188 + }, + { + "epoch": 0.3491064342209694, + "grad_norm": 1.287989175327496, + "learning_rate": 3.641024580736583e-05, + "loss": 0.6671, + "num_input_tokens_seen": 576006592, + "step": 3189 + }, + { + "epoch": 0.3492159062919073, + "grad_norm": 1.4398247538000148, + "learning_rate": 3.640259433926394e-05, + "loss": 0.7917, + "num_input_tokens_seen": 576158464, + "step": 3190 + }, + { + "epoch": 0.34932537836284516, + "grad_norm": 1.3466101525427518, + "learning_rate": 3.639494152225693e-05, + "loss": 0.9047, + "num_input_tokens_seen": 576366784, + "step": 3191 + }, + { + "epoch": 0.3494348504337831, + "grad_norm": 1.2182979723594969, + "learning_rate": 3.638728735725013e-05, + "loss": 0.5282, + "num_input_tokens_seen": 576524256, + "step": 3192 + }, + { + "epoch": 0.349544322504721, + "grad_norm": 1.2194199799063137, + "learning_rate": 3.6379631845148995e-05, + "loss": 0.6554, + "num_input_tokens_seen": 576708832, + "step": 3193 + }, + { + "epoch": 0.34965379457565887, + "grad_norm": 1.2010074161742068, + "learning_rate": 3.637197498685917e-05, + "loss": 0.4771, + "num_input_tokens_seen": 576884896, + "step": 3194 + }, + { + "epoch": 0.3497632666465968, + "grad_norm": 1.3466932753886425, + "learning_rate": 3.636431678328646e-05, + "loss": 0.8125, + "num_input_tokens_seen": 577090752, + "step": 3195 + }, + { + "epoch": 0.3498727387175347, + "grad_norm": 1.3788636717017586, + "learning_rate": 3.635665723533678e-05, + "loss": 0.58, + "num_input_tokens_seen": 577250688, + "step": 3196 + }, + { + "epoch": 0.3499822107884726, + "grad_norm": 1.22983057336118, + "learning_rate": 3.634899634391626e-05, + "loss": 0.7901, + "num_input_tokens_seen": 577427872, + "step": 3197 + }, + { + "epoch": 0.3500916828594105, + "grad_norm": 1.2236262851793682, + "learning_rate": 3.634133410993117e-05, + "loss": 0.5697, + "num_input_tokens_seen": 577572128, + "step": 3198 + }, + { + "epoch": 0.35020115493034837, + "grad_norm": 1.1376279884687261, + "learning_rate": 3.6333670534287945e-05, + "loss": 0.6586, + "num_input_tokens_seen": 577743936, + "step": 3199 + }, + { + "epoch": 0.3503106270012863, + "grad_norm": 1.3907788223744268, + "learning_rate": 3.632600561789315e-05, + "loss": 0.6503, + "num_input_tokens_seen": 577902528, + "step": 3200 + }, + { + "epoch": 0.3504200990722242, + "grad_norm": 2.2042004280413203, + "learning_rate": 3.6318339361653545e-05, + "loss": 0.9012, + "num_input_tokens_seen": 578095168, + "step": 3201 + }, + { + "epoch": 0.3505295711431621, + "grad_norm": 1.223414420620259, + "learning_rate": 3.631067176647603e-05, + "loss": 0.6675, + "num_input_tokens_seen": 578265184, + "step": 3202 + }, + { + "epoch": 0.3506390432141, + "grad_norm": 1.2965393551268316, + "learning_rate": 3.630300283326768e-05, + "loss": 0.7568, + "num_input_tokens_seen": 578458720, + "step": 3203 + }, + { + "epoch": 0.3507485152850379, + "grad_norm": 1.2542674177305089, + "learning_rate": 3.629533256293569e-05, + "loss": 0.648, + "num_input_tokens_seen": 578649344, + "step": 3204 + }, + { + "epoch": 0.3508579873559758, + "grad_norm": 1.19035182632535, + "learning_rate": 3.6287660956387454e-05, + "loss": 0.5904, + "num_input_tokens_seen": 578803008, + "step": 3205 + }, + { + "epoch": 0.3509674594269137, + "grad_norm": 1.328911223225648, + "learning_rate": 3.62799880145305e-05, + "loss": 0.736, + "num_input_tokens_seen": 578962496, + "step": 3206 + }, + { + "epoch": 0.35107693149785163, + "grad_norm": 1.2824501371450299, + "learning_rate": 3.627231373827253e-05, + "loss": 0.8662, + "num_input_tokens_seen": 579156256, + "step": 3207 + }, + { + "epoch": 0.3511864035687895, + "grad_norm": 1.228110193449653, + "learning_rate": 3.62646381285214e-05, + "loss": 0.5092, + "num_input_tokens_seen": 579323136, + "step": 3208 + }, + { + "epoch": 0.3512958756397274, + "grad_norm": 1.2733704250901714, + "learning_rate": 3.6256961186185115e-05, + "loss": 0.6725, + "num_input_tokens_seen": 579508608, + "step": 3209 + }, + { + "epoch": 0.35140534771066534, + "grad_norm": 1.197211110303685, + "learning_rate": 3.624928291217184e-05, + "loss": 0.6277, + "num_input_tokens_seen": 579673920, + "step": 3210 + }, + { + "epoch": 0.3515148197816032, + "grad_norm": 1.1726969570182562, + "learning_rate": 3.624160330738989e-05, + "loss": 0.5944, + "num_input_tokens_seen": 579851104, + "step": 3211 + }, + { + "epoch": 0.3516242918525411, + "grad_norm": 1.3004683384544622, + "learning_rate": 3.623392237274777e-05, + "loss": 0.6536, + "num_input_tokens_seen": 580018880, + "step": 3212 + }, + { + "epoch": 0.35173376392347905, + "grad_norm": 1.238025626864827, + "learning_rate": 3.6226240109154105e-05, + "loss": 0.6492, + "num_input_tokens_seen": 580183520, + "step": 3213 + }, + { + "epoch": 0.3518432359944169, + "grad_norm": 1.2432507936439459, + "learning_rate": 3.6218556517517695e-05, + "loss": 0.8064, + "num_input_tokens_seen": 580401472, + "step": 3214 + }, + { + "epoch": 0.35195270806535484, + "grad_norm": 1.3762950866964079, + "learning_rate": 3.6210871598747495e-05, + "loss": 0.6504, + "num_input_tokens_seen": 580597248, + "step": 3215 + }, + { + "epoch": 0.3520621801362927, + "grad_norm": 1.449581954149842, + "learning_rate": 3.620318535375262e-05, + "loss": 0.9963, + "num_input_tokens_seen": 580798624, + "step": 3216 + }, + { + "epoch": 0.3521716522072306, + "grad_norm": 1.259911249304838, + "learning_rate": 3.6195497783442336e-05, + "loss": 0.6888, + "num_input_tokens_seen": 581004032, + "step": 3217 + }, + { + "epoch": 0.35228112427816854, + "grad_norm": 1.2933520529726108, + "learning_rate": 3.618780888872606e-05, + "loss": 0.8282, + "num_input_tokens_seen": 581174048, + "step": 3218 + }, + { + "epoch": 0.3523905963491064, + "grad_norm": 1.3872797301436213, + "learning_rate": 3.618011867051339e-05, + "loss": 0.69, + "num_input_tokens_seen": 581343616, + "step": 3219 + }, + { + "epoch": 0.35250006842004433, + "grad_norm": 1.2111294994115154, + "learning_rate": 3.6172427129714036e-05, + "loss": 0.8062, + "num_input_tokens_seen": 581519680, + "step": 3220 + }, + { + "epoch": 0.35260954049098225, + "grad_norm": 1.3394285398347963, + "learning_rate": 3.616473426723792e-05, + "loss": 0.6634, + "num_input_tokens_seen": 581679168, + "step": 3221 + }, + { + "epoch": 0.3527190125619201, + "grad_norm": 1.391736561434349, + "learning_rate": 3.615704008399509e-05, + "loss": 0.8617, + "num_input_tokens_seen": 581851872, + "step": 3222 + }, + { + "epoch": 0.35282848463285804, + "grad_norm": 1.253061931288234, + "learning_rate": 3.614934458089575e-05, + "loss": 0.6769, + "num_input_tokens_seen": 582036672, + "step": 3223 + }, + { + "epoch": 0.35293795670379596, + "grad_norm": 1.1598877258885318, + "learning_rate": 3.614164775885025e-05, + "loss": 0.8119, + "num_input_tokens_seen": 582243424, + "step": 3224 + }, + { + "epoch": 0.35304742877473383, + "grad_norm": 1.1607704463409252, + "learning_rate": 3.613394961876912e-05, + "loss": 0.5529, + "num_input_tokens_seen": 582410752, + "step": 3225 + }, + { + "epoch": 0.35315690084567175, + "grad_norm": 1.266546079057158, + "learning_rate": 3.612625016156303e-05, + "loss": 0.8328, + "num_input_tokens_seen": 582577632, + "step": 3226 + }, + { + "epoch": 0.35326637291660967, + "grad_norm": 1.1713041822670318, + "learning_rate": 3.611854938814282e-05, + "loss": 0.527, + "num_input_tokens_seen": 582725248, + "step": 3227 + }, + { + "epoch": 0.35337584498754754, + "grad_norm": 1.3869996122237298, + "learning_rate": 3.6110847299419474e-05, + "loss": 0.7511, + "num_input_tokens_seen": 582890560, + "step": 3228 + }, + { + "epoch": 0.35348531705848546, + "grad_norm": 1.3259241156404886, + "learning_rate": 3.6103143896304136e-05, + "loss": 0.6117, + "num_input_tokens_seen": 583074016, + "step": 3229 + }, + { + "epoch": 0.3535947891294234, + "grad_norm": 1.2417842762353872, + "learning_rate": 3.6095439179708096e-05, + "loss": 0.7153, + "num_input_tokens_seen": 583249184, + "step": 3230 + }, + { + "epoch": 0.35370426120036125, + "grad_norm": 1.2066831117549126, + "learning_rate": 3.6087733150542814e-05, + "loss": 0.6756, + "num_input_tokens_seen": 583436448, + "step": 3231 + }, + { + "epoch": 0.35381373327129917, + "grad_norm": 1.2268286252305656, + "learning_rate": 3.6080025809719894e-05, + "loss": 0.6668, + "num_input_tokens_seen": 583600192, + "step": 3232 + }, + { + "epoch": 0.35392320534223703, + "grad_norm": 1.096998730497921, + "learning_rate": 3.607231715815111e-05, + "loss": 0.6171, + "num_input_tokens_seen": 583804032, + "step": 3233 + }, + { + "epoch": 0.35403267741317496, + "grad_norm": 1.250192102929626, + "learning_rate": 3.6064607196748365e-05, + "loss": 0.6807, + "num_input_tokens_seen": 583993760, + "step": 3234 + }, + { + "epoch": 0.3541421494841129, + "grad_norm": 1.2996266140609951, + "learning_rate": 3.6056895926423736e-05, + "loss": 0.8425, + "num_input_tokens_seen": 584178112, + "step": 3235 + }, + { + "epoch": 0.35425162155505074, + "grad_norm": 1.1761846008929924, + "learning_rate": 3.604918334808947e-05, + "loss": 0.5065, + "num_input_tokens_seen": 584347456, + "step": 3236 + }, + { + "epoch": 0.35436109362598867, + "grad_norm": 1.376797151862617, + "learning_rate": 3.6041469462657926e-05, + "loss": 0.7108, + "num_input_tokens_seen": 584520160, + "step": 3237 + }, + { + "epoch": 0.3544705656969266, + "grad_norm": 1.2315251251179133, + "learning_rate": 3.6033754271041654e-05, + "loss": 0.5951, + "num_input_tokens_seen": 584683008, + "step": 3238 + }, + { + "epoch": 0.35458003776786445, + "grad_norm": 1.231986571997377, + "learning_rate": 3.602603777415335e-05, + "loss": 0.8605, + "num_input_tokens_seen": 584884608, + "step": 3239 + }, + { + "epoch": 0.3546895098388024, + "grad_norm": 1.3487780780715608, + "learning_rate": 3.601831997290585e-05, + "loss": 0.7763, + "num_input_tokens_seen": 585055072, + "step": 3240 + }, + { + "epoch": 0.3547989819097403, + "grad_norm": 1.282940423978197, + "learning_rate": 3.6010600868212156e-05, + "loss": 0.6473, + "num_input_tokens_seen": 585220160, + "step": 3241 + }, + { + "epoch": 0.35490845398067816, + "grad_norm": 1.3996925902055837, + "learning_rate": 3.6002880460985436e-05, + "loss": 0.6425, + "num_input_tokens_seen": 585380320, + "step": 3242 + }, + { + "epoch": 0.3550179260516161, + "grad_norm": 1.2709535114601347, + "learning_rate": 3.599515875213899e-05, + "loss": 0.6272, + "num_input_tokens_seen": 585570944, + "step": 3243 + }, + { + "epoch": 0.355127398122554, + "grad_norm": 1.2690695054970038, + "learning_rate": 3.5987435742586286e-05, + "loss": 0.6361, + "num_input_tokens_seen": 585733792, + "step": 3244 + }, + { + "epoch": 0.35523687019349187, + "grad_norm": 1.159171717422711, + "learning_rate": 3.5979711433240934e-05, + "loss": 0.606, + "num_input_tokens_seen": 585858336, + "step": 3245 + }, + { + "epoch": 0.3553463422644298, + "grad_norm": 1.2427454241290634, + "learning_rate": 3.597198582501671e-05, + "loss": 0.6926, + "num_input_tokens_seen": 586074272, + "step": 3246 + }, + { + "epoch": 0.3554558143353677, + "grad_norm": 1.2418700478334566, + "learning_rate": 3.596425891882754e-05, + "loss": 0.65, + "num_input_tokens_seen": 586266912, + "step": 3247 + }, + { + "epoch": 0.3555652864063056, + "grad_norm": 1.20051335837905, + "learning_rate": 3.59565307155875e-05, + "loss": 0.7093, + "num_input_tokens_seen": 586441632, + "step": 3248 + }, + { + "epoch": 0.3556747584772435, + "grad_norm": 1.4559704393540507, + "learning_rate": 3.594880121621081e-05, + "loss": 0.7655, + "num_input_tokens_seen": 586600224, + "step": 3249 + }, + { + "epoch": 0.35578423054818137, + "grad_norm": 1.1442934381575705, + "learning_rate": 3.5941070421611874e-05, + "loss": 0.6586, + "num_input_tokens_seen": 586775616, + "step": 3250 + }, + { + "epoch": 0.3558937026191193, + "grad_norm": 1.1515470692650167, + "learning_rate": 3.5933338332705225e-05, + "loss": 0.6395, + "num_input_tokens_seen": 586973632, + "step": 3251 + }, + { + "epoch": 0.3560031746900572, + "grad_norm": 1.1406966409001886, + "learning_rate": 3.592560495040556e-05, + "loss": 0.5088, + "num_input_tokens_seen": 587128416, + "step": 3252 + }, + { + "epoch": 0.3561126467609951, + "grad_norm": 1.2128364686039934, + "learning_rate": 3.59178702756277e-05, + "loss": 0.6844, + "num_input_tokens_seen": 587306496, + "step": 3253 + }, + { + "epoch": 0.356222118831933, + "grad_norm": 1.3656299983790776, + "learning_rate": 3.591013430928666e-05, + "loss": 0.746, + "num_input_tokens_seen": 587474048, + "step": 3254 + }, + { + "epoch": 0.3563315909028709, + "grad_norm": 1.2268422287670873, + "learning_rate": 3.59023970522976e-05, + "loss": 0.5208, + "num_input_tokens_seen": 587648096, + "step": 3255 + }, + { + "epoch": 0.3564410629738088, + "grad_norm": 1.5287724907656235, + "learning_rate": 3.5894658505575805e-05, + "loss": 0.7146, + "num_input_tokens_seen": 587833568, + "step": 3256 + }, + { + "epoch": 0.3565505350447467, + "grad_norm": 1.2504652111304162, + "learning_rate": 3.588691867003673e-05, + "loss": 0.6186, + "num_input_tokens_seen": 587983648, + "step": 3257 + }, + { + "epoch": 0.35666000711568463, + "grad_norm": 1.3084054384387354, + "learning_rate": 3.5879177546595996e-05, + "loss": 0.7749, + "num_input_tokens_seen": 588159712, + "step": 3258 + }, + { + "epoch": 0.3567694791866225, + "grad_norm": 1.10753597364707, + "learning_rate": 3.5871435136169355e-05, + "loss": 0.5357, + "num_input_tokens_seen": 588341376, + "step": 3259 + }, + { + "epoch": 0.3568789512575604, + "grad_norm": 1.368207540100359, + "learning_rate": 3.5863691439672715e-05, + "loss": 0.9149, + "num_input_tokens_seen": 588528640, + "step": 3260 + }, + { + "epoch": 0.35698842332849834, + "grad_norm": 1.2424368937870989, + "learning_rate": 3.5855946458022145e-05, + "loss": 0.6582, + "num_input_tokens_seen": 588685888, + "step": 3261 + }, + { + "epoch": 0.3570978953994362, + "grad_norm": 1.316941557126967, + "learning_rate": 3.5848200192133866e-05, + "loss": 0.6487, + "num_input_tokens_seen": 588860832, + "step": 3262 + }, + { + "epoch": 0.3572073674703741, + "grad_norm": 1.5081778760573494, + "learning_rate": 3.5840452642924243e-05, + "loss": 0.8504, + "num_input_tokens_seen": 589020096, + "step": 3263 + }, + { + "epoch": 0.35731683954131205, + "grad_norm": 1.197084198440326, + "learning_rate": 3.5832703811309795e-05, + "loss": 0.7016, + "num_input_tokens_seen": 589203328, + "step": 3264 + }, + { + "epoch": 0.3574263116122499, + "grad_norm": 1.448994333828733, + "learning_rate": 3.58249536982072e-05, + "loss": 1.0058, + "num_input_tokens_seen": 589383424, + "step": 3265 + }, + { + "epoch": 0.35753578368318784, + "grad_norm": 1.180877439087535, + "learning_rate": 3.581720230453327e-05, + "loss": 0.8033, + "num_input_tokens_seen": 589581440, + "step": 3266 + }, + { + "epoch": 0.3576452557541257, + "grad_norm": 1.0987780394464515, + "learning_rate": 3.5809449631204985e-05, + "loss": 0.682, + "num_input_tokens_seen": 589745632, + "step": 3267 + }, + { + "epoch": 0.3577547278250636, + "grad_norm": 1.2498075813966976, + "learning_rate": 3.580169567913947e-05, + "loss": 0.7716, + "num_input_tokens_seen": 589922368, + "step": 3268 + }, + { + "epoch": 0.35786419989600154, + "grad_norm": 1.205872111844835, + "learning_rate": 3.5793940449254016e-05, + "loss": 0.6651, + "num_input_tokens_seen": 590061696, + "step": 3269 + }, + { + "epoch": 0.3579736719669394, + "grad_norm": 1.1498279587969835, + "learning_rate": 3.578618394246603e-05, + "loss": 0.6599, + "num_input_tokens_seen": 590231712, + "step": 3270 + }, + { + "epoch": 0.35808314403787733, + "grad_norm": 1.3164881559071695, + "learning_rate": 3.577842615969311e-05, + "loss": 0.7798, + "num_input_tokens_seen": 590412480, + "step": 3271 + }, + { + "epoch": 0.35819261610881525, + "grad_norm": 1.3134791491174578, + "learning_rate": 3.577066710185298e-05, + "loss": 0.6977, + "num_input_tokens_seen": 590569952, + "step": 3272 + }, + { + "epoch": 0.3583020881797531, + "grad_norm": 1.3006069765395698, + "learning_rate": 3.576290676986352e-05, + "loss": 0.6471, + "num_input_tokens_seen": 590773792, + "step": 3273 + }, + { + "epoch": 0.35841156025069104, + "grad_norm": 1.277795274356708, + "learning_rate": 3.575514516464277e-05, + "loss": 0.7674, + "num_input_tokens_seen": 590960160, + "step": 3274 + }, + { + "epoch": 0.35852103232162896, + "grad_norm": 1.113893735282631, + "learning_rate": 3.57473822871089e-05, + "loss": 0.5837, + "num_input_tokens_seen": 591161088, + "step": 3275 + }, + { + "epoch": 0.35863050439256683, + "grad_norm": 1.2496478538396154, + "learning_rate": 3.5739618138180254e-05, + "loss": 0.708, + "num_input_tokens_seen": 591330432, + "step": 3276 + }, + { + "epoch": 0.35873997646350475, + "grad_norm": 1.4135823989130756, + "learning_rate": 3.573185271877531e-05, + "loss": 0.7754, + "num_input_tokens_seen": 591480288, + "step": 3277 + }, + { + "epoch": 0.35884944853444267, + "grad_norm": 1.5128190326766324, + "learning_rate": 3.572408602981271e-05, + "loss": 0.7743, + "num_input_tokens_seen": 591699360, + "step": 3278 + }, + { + "epoch": 0.35895892060538054, + "grad_norm": 1.274497965539062, + "learning_rate": 3.571631807221123e-05, + "loss": 0.6722, + "num_input_tokens_seen": 591897600, + "step": 3279 + }, + { + "epoch": 0.35906839267631846, + "grad_norm": 1.1976624371094466, + "learning_rate": 3.570854884688981e-05, + "loss": 0.6769, + "num_input_tokens_seen": 592092032, + "step": 3280 + }, + { + "epoch": 0.3591778647472564, + "grad_norm": 1.575117890926121, + "learning_rate": 3.570077835476753e-05, + "loss": 0.9675, + "num_input_tokens_seen": 592286688, + "step": 3281 + }, + { + "epoch": 0.35928733681819425, + "grad_norm": 1.287116136084528, + "learning_rate": 3.569300659676363e-05, + "loss": 0.6696, + "num_input_tokens_seen": 592456704, + "step": 3282 + }, + { + "epoch": 0.35939680888913217, + "grad_norm": 1.0957468061528972, + "learning_rate": 3.568523357379749e-05, + "loss": 0.6586, + "num_input_tokens_seen": 592663904, + "step": 3283 + }, + { + "epoch": 0.35950628096007003, + "grad_norm": 1.2120061281508823, + "learning_rate": 3.5677459286788645e-05, + "loss": 0.6057, + "num_input_tokens_seen": 592863712, + "step": 3284 + }, + { + "epoch": 0.35961575303100796, + "grad_norm": 1.0922399723835137, + "learning_rate": 3.566968373665678e-05, + "loss": 0.6455, + "num_input_tokens_seen": 593078528, + "step": 3285 + }, + { + "epoch": 0.3597252251019459, + "grad_norm": 1.485097046965908, + "learning_rate": 3.5661906924321723e-05, + "loss": 0.7128, + "num_input_tokens_seen": 593238688, + "step": 3286 + }, + { + "epoch": 0.35983469717288374, + "grad_norm": 1.240701422271319, + "learning_rate": 3.5654128850703464e-05, + "loss": 0.5496, + "num_input_tokens_seen": 593433568, + "step": 3287 + }, + { + "epoch": 0.35994416924382167, + "grad_norm": 1.107433123926692, + "learning_rate": 3.564634951672212e-05, + "loss": 0.5224, + "num_input_tokens_seen": 593597984, + "step": 3288 + }, + { + "epoch": 0.3600536413147596, + "grad_norm": 1.131863268702444, + "learning_rate": 3.5638568923297985e-05, + "loss": 0.5969, + "num_input_tokens_seen": 593759040, + "step": 3289 + }, + { + "epoch": 0.36016311338569745, + "grad_norm": 1.3528175039853123, + "learning_rate": 3.563078707135149e-05, + "loss": 0.7618, + "num_input_tokens_seen": 593949888, + "step": 3290 + }, + { + "epoch": 0.3602725854566354, + "grad_norm": 1.280854838771297, + "learning_rate": 3.56230039618032e-05, + "loss": 0.7381, + "num_input_tokens_seen": 594138496, + "step": 3291 + }, + { + "epoch": 0.3603820575275733, + "grad_norm": 1.2141036941039782, + "learning_rate": 3.561521959557385e-05, + "loss": 0.524, + "num_input_tokens_seen": 594287904, + "step": 3292 + }, + { + "epoch": 0.36049152959851116, + "grad_norm": 1.3388311268564173, + "learning_rate": 3.5607433973584316e-05, + "loss": 0.5885, + "num_input_tokens_seen": 594470912, + "step": 3293 + }, + { + "epoch": 0.3606010016694491, + "grad_norm": 1.2692826247681763, + "learning_rate": 3.5599647096755624e-05, + "loss": 0.7536, + "num_input_tokens_seen": 594664448, + "step": 3294 + }, + { + "epoch": 0.360710473740387, + "grad_norm": 1.3222249643224355, + "learning_rate": 3.5591858966008935e-05, + "loss": 0.7582, + "num_input_tokens_seen": 594830208, + "step": 3295 + }, + { + "epoch": 0.36081994581132487, + "grad_norm": 1.3137567044131508, + "learning_rate": 3.558406958226559e-05, + "loss": 0.7423, + "num_input_tokens_seen": 594965952, + "step": 3296 + }, + { + "epoch": 0.3609294178822628, + "grad_norm": 1.2908663988075266, + "learning_rate": 3.5576278946447036e-05, + "loss": 0.6987, + "num_input_tokens_seen": 595152992, + "step": 3297 + }, + { + "epoch": 0.3610388899532007, + "grad_norm": 1.45439264933488, + "learning_rate": 3.556848705947491e-05, + "loss": 0.7194, + "num_input_tokens_seen": 595372512, + "step": 3298 + }, + { + "epoch": 0.3611483620241386, + "grad_norm": 1.1607011741903714, + "learning_rate": 3.556069392227096e-05, + "loss": 0.6593, + "num_input_tokens_seen": 595542752, + "step": 3299 + }, + { + "epoch": 0.3612578340950765, + "grad_norm": 1.2201046872952745, + "learning_rate": 3.5552899535757115e-05, + "loss": 0.5969, + "num_input_tokens_seen": 595731360, + "step": 3300 + }, + { + "epoch": 0.36136730616601437, + "grad_norm": 1.2223691268016785, + "learning_rate": 3.554510390085543e-05, + "loss": 0.5062, + "num_input_tokens_seen": 595918624, + "step": 3301 + }, + { + "epoch": 0.3614767782369523, + "grad_norm": 1.1900997815653052, + "learning_rate": 3.5537307018488095e-05, + "loss": 0.6638, + "num_input_tokens_seen": 596109472, + "step": 3302 + }, + { + "epoch": 0.3615862503078902, + "grad_norm": 1.298603239165919, + "learning_rate": 3.55295088895775e-05, + "loss": 0.6893, + "num_input_tokens_seen": 596267168, + "step": 3303 + }, + { + "epoch": 0.3616957223788281, + "grad_norm": 1.2189917202418765, + "learning_rate": 3.552170951504613e-05, + "loss": 0.6731, + "num_input_tokens_seen": 596422400, + "step": 3304 + }, + { + "epoch": 0.361805194449766, + "grad_norm": 1.090333343245404, + "learning_rate": 3.551390889581664e-05, + "loss": 0.6372, + "num_input_tokens_seen": 596632736, + "step": 3305 + }, + { + "epoch": 0.3619146665207039, + "grad_norm": 1.2394994285246719, + "learning_rate": 3.550610703281182e-05, + "loss": 0.683, + "num_input_tokens_seen": 596809024, + "step": 3306 + }, + { + "epoch": 0.3620241385916418, + "grad_norm": 1.313820129613371, + "learning_rate": 3.5498303926954626e-05, + "loss": 0.5589, + "num_input_tokens_seen": 596996288, + "step": 3307 + }, + { + "epoch": 0.3621336106625797, + "grad_norm": 1.2594077378175697, + "learning_rate": 3.549049957916815e-05, + "loss": 0.5323, + "num_input_tokens_seen": 597149504, + "step": 3308 + }, + { + "epoch": 0.36224308273351763, + "grad_norm": 1.3482934744467834, + "learning_rate": 3.548269399037562e-05, + "loss": 0.7142, + "num_input_tokens_seen": 597329824, + "step": 3309 + }, + { + "epoch": 0.3623525548044555, + "grad_norm": 1.365436631748295, + "learning_rate": 3.547488716150044e-05, + "loss": 0.7663, + "num_input_tokens_seen": 597491552, + "step": 3310 + }, + { + "epoch": 0.3624620268753934, + "grad_norm": 1.1853229242053014, + "learning_rate": 3.546707909346613e-05, + "loss": 0.6858, + "num_input_tokens_seen": 597678144, + "step": 3311 + }, + { + "epoch": 0.36257149894633134, + "grad_norm": 1.3052270196346778, + "learning_rate": 3.545926978719637e-05, + "loss": 0.8191, + "num_input_tokens_seen": 597886464, + "step": 3312 + }, + { + "epoch": 0.3626809710172692, + "grad_norm": 1.240386713238866, + "learning_rate": 3.545145924361499e-05, + "loss": 0.9161, + "num_input_tokens_seen": 598096576, + "step": 3313 + }, + { + "epoch": 0.3627904430882071, + "grad_norm": 1.256764751513937, + "learning_rate": 3.544364746364596e-05, + "loss": 0.7122, + "num_input_tokens_seen": 598273088, + "step": 3314 + }, + { + "epoch": 0.36289991515914505, + "grad_norm": 1.2439842425265268, + "learning_rate": 3.54358344482134e-05, + "loss": 0.6235, + "num_input_tokens_seen": 598450720, + "step": 3315 + }, + { + "epoch": 0.3630093872300829, + "grad_norm": 1.2022858207285971, + "learning_rate": 3.542802019824158e-05, + "loss": 0.7491, + "num_input_tokens_seen": 598638880, + "step": 3316 + }, + { + "epoch": 0.36311885930102084, + "grad_norm": 1.182767623795866, + "learning_rate": 3.5420204714654906e-05, + "loss": 0.5544, + "num_input_tokens_seen": 598834880, + "step": 3317 + }, + { + "epoch": 0.3632283313719587, + "grad_norm": 1.1095656110317795, + "learning_rate": 3.5412387998377926e-05, + "loss": 0.6329, + "num_input_tokens_seen": 599006240, + "step": 3318 + }, + { + "epoch": 0.3633378034428966, + "grad_norm": 1.3782132922757029, + "learning_rate": 3.5404570050335354e-05, + "loss": 0.9257, + "num_input_tokens_seen": 599208064, + "step": 3319 + }, + { + "epoch": 0.36344727551383454, + "grad_norm": 1.17444055103277, + "learning_rate": 3.5396750871452036e-05, + "loss": 0.5907, + "num_input_tokens_seen": 599379648, + "step": 3320 + }, + { + "epoch": 0.3635567475847724, + "grad_norm": 1.2131340040671663, + "learning_rate": 3.538893046265297e-05, + "loss": 0.8463, + "num_input_tokens_seen": 599581920, + "step": 3321 + }, + { + "epoch": 0.36366621965571033, + "grad_norm": 1.1781892167052, + "learning_rate": 3.5381108824863284e-05, + "loss": 0.6833, + "num_input_tokens_seen": 599766048, + "step": 3322 + }, + { + "epoch": 0.36377569172664825, + "grad_norm": 1.1612018535804212, + "learning_rate": 3.5373285959008265e-05, + "loss": 0.6742, + "num_input_tokens_seen": 599989376, + "step": 3323 + }, + { + "epoch": 0.3638851637975861, + "grad_norm": 1.2323728330983734, + "learning_rate": 3.536546186601336e-05, + "loss": 0.6352, + "num_input_tokens_seen": 600173952, + "step": 3324 + }, + { + "epoch": 0.36399463586852404, + "grad_norm": 1.3142089163439863, + "learning_rate": 3.5357636546804125e-05, + "loss": 0.8862, + "num_input_tokens_seen": 600350016, + "step": 3325 + }, + { + "epoch": 0.36410410793946196, + "grad_norm": 1.189861509374681, + "learning_rate": 3.534981000230629e-05, + "loss": 0.6444, + "num_input_tokens_seen": 600536608, + "step": 3326 + }, + { + "epoch": 0.36421358001039983, + "grad_norm": 1.2859889987724171, + "learning_rate": 3.5341982233445715e-05, + "loss": 0.7013, + "num_input_tokens_seen": 600719616, + "step": 3327 + }, + { + "epoch": 0.36432305208133775, + "grad_norm": 1.3601282159588306, + "learning_rate": 3.533415324114841e-05, + "loss": 0.8, + "num_input_tokens_seen": 600907552, + "step": 3328 + }, + { + "epoch": 0.36443252415227567, + "grad_norm": 1.1420461606351757, + "learning_rate": 3.532632302634053e-05, + "loss": 0.5329, + "num_input_tokens_seen": 601085856, + "step": 3329 + }, + { + "epoch": 0.36454199622321354, + "grad_norm": 1.4271166256921775, + "learning_rate": 3.531849158994839e-05, + "loss": 0.6869, + "num_input_tokens_seen": 601250272, + "step": 3330 + }, + { + "epoch": 0.36465146829415146, + "grad_norm": 1.2350022376190795, + "learning_rate": 3.531065893289841e-05, + "loss": 0.6708, + "num_input_tokens_seen": 601406624, + "step": 3331 + }, + { + "epoch": 0.3647609403650894, + "grad_norm": 1.4459633676207355, + "learning_rate": 3.530282505611719e-05, + "loss": 0.8848, + "num_input_tokens_seen": 601601728, + "step": 3332 + }, + { + "epoch": 0.36487041243602725, + "grad_norm": 1.3855044784060524, + "learning_rate": 3.5294989960531456e-05, + "loss": 0.7992, + "num_input_tokens_seen": 601800640, + "step": 3333 + }, + { + "epoch": 0.36497988450696517, + "grad_norm": 1.399601286924108, + "learning_rate": 3.52871536470681e-05, + "loss": 0.684, + "num_input_tokens_seen": 601942880, + "step": 3334 + }, + { + "epoch": 0.36508935657790303, + "grad_norm": 1.2482786246986108, + "learning_rate": 3.5279316116654126e-05, + "loss": 0.665, + "num_input_tokens_seen": 602105952, + "step": 3335 + }, + { + "epoch": 0.36519882864884096, + "grad_norm": 1.2480743357423696, + "learning_rate": 3.527147737021671e-05, + "loss": 0.7021, + "num_input_tokens_seen": 602279104, + "step": 3336 + }, + { + "epoch": 0.3653083007197789, + "grad_norm": 1.4479139680174051, + "learning_rate": 3.526363740868316e-05, + "loss": 0.6554, + "num_input_tokens_seen": 602478016, + "step": 3337 + }, + { + "epoch": 0.36541777279071674, + "grad_norm": 1.2607761322344988, + "learning_rate": 3.525579623298092e-05, + "loss": 0.6294, + "num_input_tokens_seen": 602686560, + "step": 3338 + }, + { + "epoch": 0.36552724486165467, + "grad_norm": 1.2903397676742219, + "learning_rate": 3.52479538440376e-05, + "loss": 0.7855, + "num_input_tokens_seen": 602887936, + "step": 3339 + }, + { + "epoch": 0.3656367169325926, + "grad_norm": 1.2560414231266848, + "learning_rate": 3.5240110242780916e-05, + "loss": 0.9159, + "num_input_tokens_seen": 603089088, + "step": 3340 + }, + { + "epoch": 0.36574618900353045, + "grad_norm": 1.3248227972620348, + "learning_rate": 3.5232265430138776e-05, + "loss": 0.5979, + "num_input_tokens_seen": 603250144, + "step": 3341 + }, + { + "epoch": 0.3658556610744684, + "grad_norm": 1.3744514845122373, + "learning_rate": 3.52244194070392e-05, + "loss": 0.836, + "num_input_tokens_seen": 603428000, + "step": 3342 + }, + { + "epoch": 0.3659651331454063, + "grad_norm": 1.359371185297382, + "learning_rate": 3.521657217441034e-05, + "loss": 0.6995, + "num_input_tokens_seen": 603585472, + "step": 3343 + }, + { + "epoch": 0.36607460521634416, + "grad_norm": 1.1421124414081456, + "learning_rate": 3.520872373318053e-05, + "loss": 0.6596, + "num_input_tokens_seen": 603764896, + "step": 3344 + }, + { + "epoch": 0.3661840772872821, + "grad_norm": 1.2394047402773085, + "learning_rate": 3.520087408427822e-05, + "loss": 0.8945, + "num_input_tokens_seen": 603971648, + "step": 3345 + }, + { + "epoch": 0.36629354935822, + "grad_norm": 1.3492771739050429, + "learning_rate": 3.5193023228632003e-05, + "loss": 0.9512, + "num_input_tokens_seen": 604164064, + "step": 3346 + }, + { + "epoch": 0.36640302142915787, + "grad_norm": 1.2825936970680654, + "learning_rate": 3.518517116717063e-05, + "loss": 0.7553, + "num_input_tokens_seen": 604334528, + "step": 3347 + }, + { + "epoch": 0.3665124935000958, + "grad_norm": 1.2220219926150575, + "learning_rate": 3.5177317900822974e-05, + "loss": 0.6105, + "num_input_tokens_seen": 604531424, + "step": 3348 + }, + { + "epoch": 0.3666219655710337, + "grad_norm": 1.1605876801159831, + "learning_rate": 3.516946343051806e-05, + "loss": 0.6353, + "num_input_tokens_seen": 604732800, + "step": 3349 + }, + { + "epoch": 0.3667314376419716, + "grad_norm": 1.229327977904931, + "learning_rate": 3.516160775718508e-05, + "loss": 0.7061, + "num_input_tokens_seen": 604938880, + "step": 3350 + }, + { + "epoch": 0.3668409097129095, + "grad_norm": 1.2010754058060338, + "learning_rate": 3.5153750881753314e-05, + "loss": 0.6514, + "num_input_tokens_seen": 605102624, + "step": 3351 + }, + { + "epoch": 0.3669503817838474, + "grad_norm": 1.3081036461236837, + "learning_rate": 3.514589280515223e-05, + "loss": 0.7267, + "num_input_tokens_seen": 605268608, + "step": 3352 + }, + { + "epoch": 0.3670598538547853, + "grad_norm": 1.4000800092859116, + "learning_rate": 3.513803352831143e-05, + "loss": 0.9207, + "num_input_tokens_seen": 605448256, + "step": 3353 + }, + { + "epoch": 0.3671693259257232, + "grad_norm": 1.2479898024808747, + "learning_rate": 3.5130173052160645e-05, + "loss": 0.7838, + "num_input_tokens_seen": 605639552, + "step": 3354 + }, + { + "epoch": 0.3672787979966611, + "grad_norm": 1.1304073715155887, + "learning_rate": 3.512231137762975e-05, + "loss": 0.5323, + "num_input_tokens_seen": 605824576, + "step": 3355 + }, + { + "epoch": 0.367388270067599, + "grad_norm": 1.1945881362852193, + "learning_rate": 3.5114448505648754e-05, + "loss": 0.7502, + "num_input_tokens_seen": 606016768, + "step": 3356 + }, + { + "epoch": 0.3674977421385369, + "grad_norm": 1.3719206487407882, + "learning_rate": 3.510658443714785e-05, + "loss": 0.5687, + "num_input_tokens_seen": 606185216, + "step": 3357 + }, + { + "epoch": 0.3676072142094748, + "grad_norm": 1.3481333012650256, + "learning_rate": 3.509871917305734e-05, + "loss": 0.6255, + "num_input_tokens_seen": 606374272, + "step": 3358 + }, + { + "epoch": 0.3677166862804127, + "grad_norm": 1.2382608405997473, + "learning_rate": 3.509085271430764e-05, + "loss": 0.5482, + "num_input_tokens_seen": 606539360, + "step": 3359 + }, + { + "epoch": 0.36782615835135063, + "grad_norm": 1.2023990472145138, + "learning_rate": 3.508298506182936e-05, + "loss": 0.6908, + "num_input_tokens_seen": 606727968, + "step": 3360 + }, + { + "epoch": 0.3679356304222885, + "grad_norm": 1.1918125147810765, + "learning_rate": 3.5075116216553225e-05, + "loss": 0.5082, + "num_input_tokens_seen": 606932928, + "step": 3361 + }, + { + "epoch": 0.3680451024932264, + "grad_norm": 1.4191623844627932, + "learning_rate": 3.50672461794101e-05, + "loss": 0.9114, + "num_input_tokens_seen": 607099584, + "step": 3362 + }, + { + "epoch": 0.36815457456416434, + "grad_norm": 1.3298948276496352, + "learning_rate": 3.5059374951330995e-05, + "loss": 0.8327, + "num_input_tokens_seen": 607264224, + "step": 3363 + }, + { + "epoch": 0.3682640466351022, + "grad_norm": 1.1489962302097134, + "learning_rate": 3.505150253324706e-05, + "loss": 0.6214, + "num_input_tokens_seen": 607459104, + "step": 3364 + }, + { + "epoch": 0.3683735187060401, + "grad_norm": 1.2295237954510365, + "learning_rate": 3.5043628926089596e-05, + "loss": 0.7016, + "num_input_tokens_seen": 607644800, + "step": 3365 + }, + { + "epoch": 0.36848299077697805, + "grad_norm": 1.2691171293470174, + "learning_rate": 3.503575413079003e-05, + "loss": 0.6754, + "num_input_tokens_seen": 607854016, + "step": 3366 + }, + { + "epoch": 0.3685924628479159, + "grad_norm": 1.1954404974328614, + "learning_rate": 3.502787814827994e-05, + "loss": 0.6725, + "num_input_tokens_seen": 608042400, + "step": 3367 + }, + { + "epoch": 0.36870193491885384, + "grad_norm": 1.3171719551300443, + "learning_rate": 3.5020000979491025e-05, + "loss": 0.6151, + "num_input_tokens_seen": 608211072, + "step": 3368 + }, + { + "epoch": 0.36881140698979176, + "grad_norm": 1.1167837627184185, + "learning_rate": 3.501212262535515e-05, + "loss": 0.6844, + "num_input_tokens_seen": 608418496, + "step": 3369 + }, + { + "epoch": 0.3689208790607296, + "grad_norm": 1.3329158318053804, + "learning_rate": 3.500424308680431e-05, + "loss": 0.6813, + "num_input_tokens_seen": 608574624, + "step": 3370 + }, + { + "epoch": 0.36903035113166754, + "grad_norm": 1.1927904436702141, + "learning_rate": 3.499636236477064e-05, + "loss": 0.5584, + "num_input_tokens_seen": 608753600, + "step": 3371 + }, + { + "epoch": 0.3691398232026054, + "grad_norm": 1.2895398267680072, + "learning_rate": 3.498848046018641e-05, + "loss": 0.747, + "num_input_tokens_seen": 608956768, + "step": 3372 + }, + { + "epoch": 0.36924929527354333, + "grad_norm": 1.40639139100392, + "learning_rate": 3.498059737398405e-05, + "loss": 0.7515, + "num_input_tokens_seen": 609147392, + "step": 3373 + }, + { + "epoch": 0.36935876734448125, + "grad_norm": 1.3833041421652164, + "learning_rate": 3.497271310709608e-05, + "loss": 0.6842, + "num_input_tokens_seen": 609293888, + "step": 3374 + }, + { + "epoch": 0.3694682394154191, + "grad_norm": 1.273515055348399, + "learning_rate": 3.4964827660455226e-05, + "loss": 0.7378, + "num_input_tokens_seen": 609474432, + "step": 3375 + }, + { + "epoch": 0.36957771148635704, + "grad_norm": 1.1033459558273921, + "learning_rate": 3.495694103499431e-05, + "loss": 0.4927, + "num_input_tokens_seen": 609666176, + "step": 3376 + }, + { + "epoch": 0.36968718355729496, + "grad_norm": 1.2506358913422075, + "learning_rate": 3.494905323164629e-05, + "loss": 0.7194, + "num_input_tokens_seen": 609844928, + "step": 3377 + }, + { + "epoch": 0.36979665562823283, + "grad_norm": 1.195490705382984, + "learning_rate": 3.4941164251344306e-05, + "loss": 0.6797, + "num_input_tokens_seen": 610026368, + "step": 3378 + }, + { + "epoch": 0.36990612769917075, + "grad_norm": 1.249729890249883, + "learning_rate": 3.493327409502159e-05, + "loss": 0.8618, + "num_input_tokens_seen": 610220576, + "step": 3379 + }, + { + "epoch": 0.37001559977010867, + "grad_norm": 1.3312762405603202, + "learning_rate": 3.492538276361154e-05, + "loss": 0.7187, + "num_input_tokens_seen": 610367296, + "step": 3380 + }, + { + "epoch": 0.37012507184104654, + "grad_norm": 1.379505837599932, + "learning_rate": 3.491749025804768e-05, + "loss": 0.8269, + "num_input_tokens_seen": 610542016, + "step": 3381 + }, + { + "epoch": 0.37023454391198446, + "grad_norm": 1.301666599978658, + "learning_rate": 3.4909596579263685e-05, + "loss": 0.7192, + "num_input_tokens_seen": 610747872, + "step": 3382 + }, + { + "epoch": 0.3703440159829224, + "grad_norm": 1.3000626053774087, + "learning_rate": 3.490170172819336e-05, + "loss": 0.5798, + "num_input_tokens_seen": 610901536, + "step": 3383 + }, + { + "epoch": 0.37045348805386025, + "grad_norm": 1.2122103984309354, + "learning_rate": 3.489380570577064e-05, + "loss": 0.6032, + "num_input_tokens_seen": 611066848, + "step": 3384 + }, + { + "epoch": 0.37056296012479817, + "grad_norm": 1.2151252935507493, + "learning_rate": 3.488590851292963e-05, + "loss": 0.759, + "num_input_tokens_seen": 611283680, + "step": 3385 + }, + { + "epoch": 0.3706724321957361, + "grad_norm": 1.2063946572962068, + "learning_rate": 3.487801015060453e-05, + "loss": 0.6417, + "num_input_tokens_seen": 611478560, + "step": 3386 + }, + { + "epoch": 0.37078190426667396, + "grad_norm": 1.4167402846711903, + "learning_rate": 3.487011061972972e-05, + "loss": 1.0781, + "num_input_tokens_seen": 611686432, + "step": 3387 + }, + { + "epoch": 0.3708913763376119, + "grad_norm": 1.1996925735068251, + "learning_rate": 3.48622099212397e-05, + "loss": 0.6617, + "num_input_tokens_seen": 611868768, + "step": 3388 + }, + { + "epoch": 0.37100084840854974, + "grad_norm": 1.201016101195615, + "learning_rate": 3.485430805606909e-05, + "loss": 0.5927, + "num_input_tokens_seen": 612009216, + "step": 3389 + }, + { + "epoch": 0.37111032047948767, + "grad_norm": 1.3048786320281784, + "learning_rate": 3.484640502515267e-05, + "loss": 0.8317, + "num_input_tokens_seen": 612200736, + "step": 3390 + }, + { + "epoch": 0.3712197925504256, + "grad_norm": 1.1491209319163118, + "learning_rate": 3.483850082942537e-05, + "loss": 0.8839, + "num_input_tokens_seen": 612423840, + "step": 3391 + }, + { + "epoch": 0.37132926462136345, + "grad_norm": 1.3472097348205798, + "learning_rate": 3.4830595469822224e-05, + "loss": 0.7767, + "num_input_tokens_seen": 612597888, + "step": 3392 + }, + { + "epoch": 0.3714387366923014, + "grad_norm": 1.246924001165882, + "learning_rate": 3.482268894727843e-05, + "loss": 0.7403, + "num_input_tokens_seen": 612759616, + "step": 3393 + }, + { + "epoch": 0.3715482087632393, + "grad_norm": 1.3596200393287454, + "learning_rate": 3.481478126272931e-05, + "loss": 0.7046, + "num_input_tokens_seen": 612924928, + "step": 3394 + }, + { + "epoch": 0.37165768083417716, + "grad_norm": 1.1058421228936757, + "learning_rate": 3.4806872417110333e-05, + "loss": 0.6082, + "num_input_tokens_seen": 613099424, + "step": 3395 + }, + { + "epoch": 0.3717671529051151, + "grad_norm": 1.3519851376574352, + "learning_rate": 3.479896241135709e-05, + "loss": 0.8083, + "num_input_tokens_seen": 613279744, + "step": 3396 + }, + { + "epoch": 0.371876624976053, + "grad_norm": 1.2305196115790236, + "learning_rate": 3.4791051246405326e-05, + "loss": 0.5648, + "num_input_tokens_seen": 613451552, + "step": 3397 + }, + { + "epoch": 0.37198609704699087, + "grad_norm": 1.2332968047033595, + "learning_rate": 3.478313892319092e-05, + "loss": 0.6135, + "num_input_tokens_seen": 613638592, + "step": 3398 + }, + { + "epoch": 0.3720955691179288, + "grad_norm": 1.2023117983255265, + "learning_rate": 3.477522544264988e-05, + "loss": 0.5899, + "num_input_tokens_seen": 613812192, + "step": 3399 + }, + { + "epoch": 0.3722050411888667, + "grad_norm": 1.202172484416909, + "learning_rate": 3.4767310805718355e-05, + "loss": 0.6572, + "num_input_tokens_seen": 614010208, + "step": 3400 + }, + { + "epoch": 0.3723145132598046, + "grad_norm": 1.3128453663155928, + "learning_rate": 3.475939501333264e-05, + "loss": 0.8875, + "num_input_tokens_seen": 614209120, + "step": 3401 + }, + { + "epoch": 0.3724239853307425, + "grad_norm": 1.1838793520158186, + "learning_rate": 3.4751478066429156e-05, + "loss": 0.574, + "num_input_tokens_seen": 614400640, + "step": 3402 + }, + { + "epoch": 0.3725334574016804, + "grad_norm": 1.3008437041448215, + "learning_rate": 3.474355996594445e-05, + "loss": 0.79, + "num_input_tokens_seen": 614602016, + "step": 3403 + }, + { + "epoch": 0.3726429294726183, + "grad_norm": 1.2410240722983525, + "learning_rate": 3.473564071281522e-05, + "loss": 0.8379, + "num_input_tokens_seen": 614802720, + "step": 3404 + }, + { + "epoch": 0.3727524015435562, + "grad_norm": 1.3267048142729745, + "learning_rate": 3.472772030797832e-05, + "loss": 0.7605, + "num_input_tokens_seen": 614975648, + "step": 3405 + }, + { + "epoch": 0.3728618736144941, + "grad_norm": 1.1061699466512045, + "learning_rate": 3.4719798752370694e-05, + "loss": 0.6571, + "num_input_tokens_seen": 615192704, + "step": 3406 + }, + { + "epoch": 0.372971345685432, + "grad_norm": 1.256129495335379, + "learning_rate": 3.471187604692945e-05, + "loss": 0.7727, + "num_input_tokens_seen": 615376160, + "step": 3407 + }, + { + "epoch": 0.3730808177563699, + "grad_norm": 1.2763836253245269, + "learning_rate": 3.470395219259185e-05, + "loss": 0.6472, + "num_input_tokens_seen": 615542592, + "step": 3408 + }, + { + "epoch": 0.3731902898273078, + "grad_norm": 1.1247364901180472, + "learning_rate": 3.469602719029526e-05, + "loss": 0.486, + "num_input_tokens_seen": 615714400, + "step": 3409 + }, + { + "epoch": 0.3732997618982457, + "grad_norm": 1.259137847380401, + "learning_rate": 3.4688101040977164e-05, + "loss": 0.7045, + "num_input_tokens_seen": 615856192, + "step": 3410 + }, + { + "epoch": 0.37340923396918363, + "grad_norm": 1.302872588962702, + "learning_rate": 3.468017374557526e-05, + "loss": 0.8153, + "num_input_tokens_seen": 616062048, + "step": 3411 + }, + { + "epoch": 0.3735187060401215, + "grad_norm": 1.3744821440257438, + "learning_rate": 3.46722453050273e-05, + "loss": 0.8219, + "num_input_tokens_seen": 616221312, + "step": 3412 + }, + { + "epoch": 0.3736281781110594, + "grad_norm": 1.3826299428317768, + "learning_rate": 3.466431572027121e-05, + "loss": 0.7456, + "num_input_tokens_seen": 616406336, + "step": 3413 + }, + { + "epoch": 0.37373765018199734, + "grad_norm": 1.5277997285055895, + "learning_rate": 3.465638499224504e-05, + "loss": 0.7126, + "num_input_tokens_seen": 616576352, + "step": 3414 + }, + { + "epoch": 0.3738471222529352, + "grad_norm": 1.2919173561390773, + "learning_rate": 3.4648453121886994e-05, + "loss": 0.7554, + "num_input_tokens_seen": 616733824, + "step": 3415 + }, + { + "epoch": 0.3739565943238731, + "grad_norm": 1.216626983587658, + "learning_rate": 3.464052011013539e-05, + "loss": 0.75, + "num_input_tokens_seen": 616903168, + "step": 3416 + }, + { + "epoch": 0.37406606639481105, + "grad_norm": 1.1667234997303337, + "learning_rate": 3.463258595792867e-05, + "loss": 0.6599, + "num_input_tokens_seen": 617118880, + "step": 3417 + }, + { + "epoch": 0.3741755384657489, + "grad_norm": 1.3508646491953058, + "learning_rate": 3.462465066620546e-05, + "loss": 0.7821, + "num_input_tokens_seen": 617292032, + "step": 3418 + }, + { + "epoch": 0.37428501053668684, + "grad_norm": 1.273923769357481, + "learning_rate": 3.461671423590447e-05, + "loss": 0.7024, + "num_input_tokens_seen": 617464288, + "step": 3419 + }, + { + "epoch": 0.37439448260762476, + "grad_norm": 1.3575027112424412, + "learning_rate": 3.460877666796457e-05, + "loss": 0.7639, + "num_input_tokens_seen": 617644384, + "step": 3420 + }, + { + "epoch": 0.3745039546785626, + "grad_norm": 1.3675195345276638, + "learning_rate": 3.460083796332476e-05, + "loss": 0.6281, + "num_input_tokens_seen": 617805664, + "step": 3421 + }, + { + "epoch": 0.37461342674950054, + "grad_norm": 1.36240010901533, + "learning_rate": 3.459289812292418e-05, + "loss": 0.8818, + "num_input_tokens_seen": 617983744, + "step": 3422 + }, + { + "epoch": 0.3747228988204384, + "grad_norm": 1.2408320391110002, + "learning_rate": 3.458495714770208e-05, + "loss": 0.7425, + "num_input_tokens_seen": 618167200, + "step": 3423 + }, + { + "epoch": 0.37483237089137633, + "grad_norm": 1.1549947987889349, + "learning_rate": 3.4577015038597874e-05, + "loss": 0.5721, + "num_input_tokens_seen": 618313248, + "step": 3424 + }, + { + "epoch": 0.37494184296231425, + "grad_norm": 1.4026966979050681, + "learning_rate": 3.45690717965511e-05, + "loss": 0.7513, + "num_input_tokens_seen": 618495136, + "step": 3425 + }, + { + "epoch": 0.3750513150332521, + "grad_norm": 1.2799024240733845, + "learning_rate": 3.456112742250143e-05, + "loss": 0.5456, + "num_input_tokens_seen": 618659104, + "step": 3426 + }, + { + "epoch": 0.37516078710419004, + "grad_norm": 1.1670863157481872, + "learning_rate": 3.4553181917388664e-05, + "loss": 0.5851, + "num_input_tokens_seen": 618840096, + "step": 3427 + }, + { + "epoch": 0.37527025917512796, + "grad_norm": 1.4462054818966494, + "learning_rate": 3.4545235282152724e-05, + "loss": 0.9804, + "num_input_tokens_seen": 619003392, + "step": 3428 + }, + { + "epoch": 0.37537973124606583, + "grad_norm": 1.2110258993061558, + "learning_rate": 3.4537287517733713e-05, + "loss": 0.555, + "num_input_tokens_seen": 619192000, + "step": 3429 + }, + { + "epoch": 0.37548920331700375, + "grad_norm": 1.2866692260772734, + "learning_rate": 3.452933862507182e-05, + "loss": 0.7426, + "num_input_tokens_seen": 619360224, + "step": 3430 + }, + { + "epoch": 0.37559867538794167, + "grad_norm": 1.3082047524816551, + "learning_rate": 3.452138860510737e-05, + "loss": 0.7633, + "num_input_tokens_seen": 619561824, + "step": 3431 + }, + { + "epoch": 0.37570814745887954, + "grad_norm": 1.2631579714647487, + "learning_rate": 3.451343745878086e-05, + "loss": 0.779, + "num_input_tokens_seen": 619759168, + "step": 3432 + }, + { + "epoch": 0.37581761952981746, + "grad_norm": 1.271347015039858, + "learning_rate": 3.4505485187032894e-05, + "loss": 0.5906, + "num_input_tokens_seen": 619975776, + "step": 3433 + }, + { + "epoch": 0.3759270916007554, + "grad_norm": 1.1767232169609902, + "learning_rate": 3.4497531790804194e-05, + "loss": 0.6173, + "num_input_tokens_seen": 620158336, + "step": 3434 + }, + { + "epoch": 0.37603656367169325, + "grad_norm": 1.229456991187297, + "learning_rate": 3.448957727103564e-05, + "loss": 0.8563, + "num_input_tokens_seen": 620345376, + "step": 3435 + }, + { + "epoch": 0.37614603574263117, + "grad_norm": 1.0914433270261559, + "learning_rate": 3.448162162866823e-05, + "loss": 0.5848, + "num_input_tokens_seen": 620533984, + "step": 3436 + }, + { + "epoch": 0.3762555078135691, + "grad_norm": 1.1578352602885034, + "learning_rate": 3.447366486464312e-05, + "loss": 0.7495, + "num_input_tokens_seen": 620738272, + "step": 3437 + }, + { + "epoch": 0.37636497988450696, + "grad_norm": 1.3097884460882088, + "learning_rate": 3.446570697990155e-05, + "loss": 0.7797, + "num_input_tokens_seen": 620902912, + "step": 3438 + }, + { + "epoch": 0.3764744519554449, + "grad_norm": 1.372350480876314, + "learning_rate": 3.445774797538495e-05, + "loss": 0.8431, + "num_input_tokens_seen": 621094432, + "step": 3439 + }, + { + "epoch": 0.37658392402638274, + "grad_norm": 1.1764659218835851, + "learning_rate": 3.444978785203484e-05, + "loss": 0.5268, + "num_input_tokens_seen": 621257504, + "step": 3440 + }, + { + "epoch": 0.37669339609732067, + "grad_norm": 1.1265552155448328, + "learning_rate": 3.44418266107929e-05, + "loss": 0.5065, + "num_input_tokens_seen": 621409600, + "step": 3441 + }, + { + "epoch": 0.3768028681682586, + "grad_norm": 1.3244531024623913, + "learning_rate": 3.4433864252600916e-05, + "loss": 0.7604, + "num_input_tokens_seen": 621602464, + "step": 3442 + }, + { + "epoch": 0.37691234023919645, + "grad_norm": 1.2795030267611827, + "learning_rate": 3.442590077840083e-05, + "loss": 0.5679, + "num_input_tokens_seen": 621746720, + "step": 3443 + }, + { + "epoch": 0.3770218123101344, + "grad_norm": 1.4586079838389747, + "learning_rate": 3.441793618913469e-05, + "loss": 0.7557, + "num_input_tokens_seen": 621880000, + "step": 3444 + }, + { + "epoch": 0.3771312843810723, + "grad_norm": 1.2859533093432007, + "learning_rate": 3.4409970485744714e-05, + "loss": 0.5577, + "num_input_tokens_seen": 622058528, + "step": 3445 + }, + { + "epoch": 0.37724075645201016, + "grad_norm": 1.2018166401025188, + "learning_rate": 3.440200366917321e-05, + "loss": 0.604, + "num_input_tokens_seen": 622214208, + "step": 3446 + }, + { + "epoch": 0.3773502285229481, + "grad_norm": 1.3298522936753048, + "learning_rate": 3.439403574036266e-05, + "loss": 0.6832, + "num_input_tokens_seen": 622395648, + "step": 3447 + }, + { + "epoch": 0.377459700593886, + "grad_norm": 1.306070246032623, + "learning_rate": 3.438606670025563e-05, + "loss": 0.7479, + "num_input_tokens_seen": 622581344, + "step": 3448 + }, + { + "epoch": 0.37756917266482387, + "grad_norm": 1.2725812689680116, + "learning_rate": 3.437809654979485e-05, + "loss": 0.6944, + "num_input_tokens_seen": 622775104, + "step": 3449 + }, + { + "epoch": 0.3776786447357618, + "grad_norm": 1.3842695071151634, + "learning_rate": 3.4370125289923176e-05, + "loss": 0.7201, + "num_input_tokens_seen": 622923392, + "step": 3450 + }, + { + "epoch": 0.3777881168066997, + "grad_norm": 1.246609285193408, + "learning_rate": 3.436215292158359e-05, + "loss": 0.752, + "num_input_tokens_seen": 623118272, + "step": 3451 + }, + { + "epoch": 0.3778975888776376, + "grad_norm": 1.1892754683591216, + "learning_rate": 3.435417944571922e-05, + "loss": 0.6996, + "num_input_tokens_seen": 623289632, + "step": 3452 + }, + { + "epoch": 0.3780070609485755, + "grad_norm": 1.1973673293460085, + "learning_rate": 3.4346204863273304e-05, + "loss": 0.5636, + "num_input_tokens_seen": 623467488, + "step": 3453 + }, + { + "epoch": 0.3781165330195134, + "grad_norm": 1.1978669640001736, + "learning_rate": 3.433822917518921e-05, + "loss": 0.7555, + "num_input_tokens_seen": 623661920, + "step": 3454 + }, + { + "epoch": 0.3782260050904513, + "grad_norm": 1.300086079095129, + "learning_rate": 3.433025238241047e-05, + "loss": 0.6554, + "num_input_tokens_seen": 623831040, + "step": 3455 + }, + { + "epoch": 0.3783354771613892, + "grad_norm": 1.2668489730713364, + "learning_rate": 3.43222744858807e-05, + "loss": 0.631, + "num_input_tokens_seen": 624018976, + "step": 3456 + }, + { + "epoch": 0.3784449492323271, + "grad_norm": 1.3947027191829233, + "learning_rate": 3.431429548654368e-05, + "loss": 0.6552, + "num_input_tokens_seen": 624187648, + "step": 3457 + }, + { + "epoch": 0.378554421303265, + "grad_norm": 1.3333506831391357, + "learning_rate": 3.4306315385343316e-05, + "loss": 0.714, + "num_input_tokens_seen": 624377600, + "step": 3458 + }, + { + "epoch": 0.3786638933742029, + "grad_norm": 1.2157059336102114, + "learning_rate": 3.4298334183223624e-05, + "loss": 0.5447, + "num_input_tokens_seen": 624547392, + "step": 3459 + }, + { + "epoch": 0.3787733654451408, + "grad_norm": 1.293423160787777, + "learning_rate": 3.4290351881128767e-05, + "loss": 0.7613, + "num_input_tokens_seen": 624742720, + "step": 3460 + }, + { + "epoch": 0.3788828375160787, + "grad_norm": 1.4127072747661655, + "learning_rate": 3.4282368480003056e-05, + "loss": 0.6768, + "num_input_tokens_seen": 624959776, + "step": 3461 + }, + { + "epoch": 0.37899230958701663, + "grad_norm": 1.239462785175235, + "learning_rate": 3.42743839807909e-05, + "loss": 0.6703, + "num_input_tokens_seen": 625115680, + "step": 3462 + }, + { + "epoch": 0.3791017816579545, + "grad_norm": 1.3356679454819875, + "learning_rate": 3.426639838443684e-05, + "loss": 0.7018, + "num_input_tokens_seen": 625303616, + "step": 3463 + }, + { + "epoch": 0.3792112537288924, + "grad_norm": 1.1783747663771849, + "learning_rate": 3.4258411691885575e-05, + "loss": 0.5344, + "num_input_tokens_seen": 625484384, + "step": 3464 + }, + { + "epoch": 0.37932072579983034, + "grad_norm": 1.2146923301999188, + "learning_rate": 3.425042390408189e-05, + "loss": 0.6884, + "num_input_tokens_seen": 625635360, + "step": 3465 + }, + { + "epoch": 0.3794301978707682, + "grad_norm": 1.377323787931864, + "learning_rate": 3.424243502197076e-05, + "loss": 0.7606, + "num_input_tokens_seen": 625817696, + "step": 3466 + }, + { + "epoch": 0.3795396699417061, + "grad_norm": 1.5102588001052297, + "learning_rate": 3.4234445046497225e-05, + "loss": 0.723, + "num_input_tokens_seen": 625996672, + "step": 3467 + }, + { + "epoch": 0.37964914201264405, + "grad_norm": 1.2395391963933398, + "learning_rate": 3.42264539786065e-05, + "loss": 0.5086, + "num_input_tokens_seen": 626164224, + "step": 3468 + }, + { + "epoch": 0.3797586140835819, + "grad_norm": 1.274280386158485, + "learning_rate": 3.421846181924391e-05, + "loss": 0.8155, + "num_input_tokens_seen": 626350816, + "step": 3469 + }, + { + "epoch": 0.37986808615451984, + "grad_norm": 1.212565158784255, + "learning_rate": 3.421046856935489e-05, + "loss": 0.511, + "num_input_tokens_seen": 626517248, + "step": 3470 + }, + { + "epoch": 0.37997755822545776, + "grad_norm": 1.2676770564003004, + "learning_rate": 3.420247422988506e-05, + "loss": 0.5787, + "num_input_tokens_seen": 626670464, + "step": 3471 + }, + { + "epoch": 0.3800870302963956, + "grad_norm": 1.1293662958701651, + "learning_rate": 3.4194478801780116e-05, + "loss": 0.6905, + "num_input_tokens_seen": 626848544, + "step": 3472 + }, + { + "epoch": 0.38019650236733354, + "grad_norm": 1.1833158361905418, + "learning_rate": 3.4186482285985915e-05, + "loss": 0.645, + "num_input_tokens_seen": 627019232, + "step": 3473 + }, + { + "epoch": 0.3803059744382714, + "grad_norm": 1.1370651399007994, + "learning_rate": 3.417848468344842e-05, + "loss": 0.6214, + "num_input_tokens_seen": 627213664, + "step": 3474 + }, + { + "epoch": 0.38041544650920933, + "grad_norm": 1.268807495392936, + "learning_rate": 3.417048599511373e-05, + "loss": 0.726, + "num_input_tokens_seen": 627378752, + "step": 3475 + }, + { + "epoch": 0.38052491858014725, + "grad_norm": 1.1351062046894016, + "learning_rate": 3.416248622192807e-05, + "loss": 0.5725, + "num_input_tokens_seen": 627549888, + "step": 3476 + }, + { + "epoch": 0.3806343906510851, + "grad_norm": 1.2923411894874264, + "learning_rate": 3.415448536483782e-05, + "loss": 0.6593, + "num_input_tokens_seen": 627709152, + "step": 3477 + }, + { + "epoch": 0.38074386272202304, + "grad_norm": 1.4463999188661552, + "learning_rate": 3.4146483424789445e-05, + "loss": 0.8815, + "num_input_tokens_seen": 627892160, + "step": 3478 + }, + { + "epoch": 0.38085333479296096, + "grad_norm": 1.2039797335309914, + "learning_rate": 3.4138480402729564e-05, + "loss": 0.6754, + "num_input_tokens_seen": 628064640, + "step": 3479 + }, + { + "epoch": 0.38096280686389883, + "grad_norm": 1.4628139419727841, + "learning_rate": 3.413047629960492e-05, + "loss": 0.9209, + "num_input_tokens_seen": 628253696, + "step": 3480 + }, + { + "epoch": 0.38107227893483675, + "grad_norm": 1.3128160822691666, + "learning_rate": 3.412247111636239e-05, + "loss": 0.7456, + "num_input_tokens_seen": 628425280, + "step": 3481 + }, + { + "epoch": 0.38118175100577467, + "grad_norm": 1.3006723719213804, + "learning_rate": 3.411446485394896e-05, + "loss": 0.781, + "num_input_tokens_seen": 628607616, + "step": 3482 + }, + { + "epoch": 0.38129122307671254, + "grad_norm": 1.1195352600835096, + "learning_rate": 3.410645751331176e-05, + "loss": 0.5653, + "num_input_tokens_seen": 628793536, + "step": 3483 + }, + { + "epoch": 0.38140069514765046, + "grad_norm": 1.087208230790539, + "learning_rate": 3.4098449095398054e-05, + "loss": 0.5763, + "num_input_tokens_seen": 628984160, + "step": 3484 + }, + { + "epoch": 0.3815101672185884, + "grad_norm": 1.3657288535846608, + "learning_rate": 3.409043960115521e-05, + "loss": 0.6494, + "num_input_tokens_seen": 629142304, + "step": 3485 + }, + { + "epoch": 0.38161963928952625, + "grad_norm": 1.1636684185897446, + "learning_rate": 3.408242903153074e-05, + "loss": 0.7008, + "num_input_tokens_seen": 629330016, + "step": 3486 + }, + { + "epoch": 0.38172911136046417, + "grad_norm": 1.2375580032318567, + "learning_rate": 3.4074417387472274e-05, + "loss": 0.7423, + "num_input_tokens_seen": 629520864, + "step": 3487 + }, + { + "epoch": 0.3818385834314021, + "grad_norm": 1.2712890193656132, + "learning_rate": 3.406640466992758e-05, + "loss": 0.5757, + "num_input_tokens_seen": 629708128, + "step": 3488 + }, + { + "epoch": 0.38194805550233996, + "grad_norm": 1.144011812522564, + "learning_rate": 3.405839087984455e-05, + "loss": 0.5804, + "num_input_tokens_seen": 629896288, + "step": 3489 + }, + { + "epoch": 0.3820575275732779, + "grad_norm": 1.2694581935109253, + "learning_rate": 3.405037601817119e-05, + "loss": 0.607, + "num_input_tokens_seen": 630104384, + "step": 3490 + }, + { + "epoch": 0.38216699964421574, + "grad_norm": 1.2477808327003856, + "learning_rate": 3.4042360085855654e-05, + "loss": 0.6528, + "num_input_tokens_seen": 630323456, + "step": 3491 + }, + { + "epoch": 0.38227647171515367, + "grad_norm": 1.363995078256962, + "learning_rate": 3.40343430838462e-05, + "loss": 0.6488, + "num_input_tokens_seen": 630471296, + "step": 3492 + }, + { + "epoch": 0.3823859437860916, + "grad_norm": 1.234940797568262, + "learning_rate": 3.4026325013091224e-05, + "loss": 0.5574, + "num_input_tokens_seen": 630644224, + "step": 3493 + }, + { + "epoch": 0.38249541585702945, + "grad_norm": 1.166413308062856, + "learning_rate": 3.4018305874539264e-05, + "loss": 0.4858, + "num_input_tokens_seen": 630816256, + "step": 3494 + }, + { + "epoch": 0.3826048879279674, + "grad_norm": 1.2526390351050785, + "learning_rate": 3.401028566913896e-05, + "loss": 0.6602, + "num_input_tokens_seen": 631001728, + "step": 3495 + }, + { + "epoch": 0.3827143599989053, + "grad_norm": 1.320921808081457, + "learning_rate": 3.400226439783908e-05, + "loss": 0.7076, + "num_input_tokens_seen": 631164800, + "step": 3496 + }, + { + "epoch": 0.38282383206984316, + "grad_norm": 1.2641866068984684, + "learning_rate": 3.399424206158855e-05, + "loss": 0.7095, + "num_input_tokens_seen": 631336384, + "step": 3497 + }, + { + "epoch": 0.3829333041407811, + "grad_norm": 1.4617607530668426, + "learning_rate": 3.3986218661336355e-05, + "loss": 0.8616, + "num_input_tokens_seen": 631544704, + "step": 3498 + }, + { + "epoch": 0.383042776211719, + "grad_norm": 1.51884260483974, + "learning_rate": 3.397819419803168e-05, + "loss": 0.8851, + "num_input_tokens_seen": 631728608, + "step": 3499 + }, + { + "epoch": 0.38315224828265687, + "grad_norm": 1.2663136716087604, + "learning_rate": 3.397016867262379e-05, + "loss": 0.684, + "num_input_tokens_seen": 631898624, + "step": 3500 + }, + { + "epoch": 0.3832617203535948, + "grad_norm": 1.2730995619137504, + "learning_rate": 3.39621420860621e-05, + "loss": 0.8396, + "num_input_tokens_seen": 632070656, + "step": 3501 + }, + { + "epoch": 0.3833711924245327, + "grad_norm": 1.18770140897406, + "learning_rate": 3.395411443929613e-05, + "loss": 0.6993, + "num_input_tokens_seen": 632255456, + "step": 3502 + }, + { + "epoch": 0.3834806644954706, + "grad_norm": 1.2459947313531035, + "learning_rate": 3.394608573327554e-05, + "loss": 0.6201, + "num_input_tokens_seen": 632425696, + "step": 3503 + }, + { + "epoch": 0.3835901365664085, + "grad_norm": 1.2743177121743539, + "learning_rate": 3.393805596895011e-05, + "loss": 0.7592, + "num_input_tokens_seen": 632585632, + "step": 3504 + }, + { + "epoch": 0.3836996086373464, + "grad_norm": 1.3479655919735007, + "learning_rate": 3.3930025147269746e-05, + "loss": 0.9398, + "num_input_tokens_seen": 632756544, + "step": 3505 + }, + { + "epoch": 0.3838090807082843, + "grad_norm": 1.2916998038604603, + "learning_rate": 3.3921993269184474e-05, + "loss": 0.675, + "num_input_tokens_seen": 632908416, + "step": 3506 + }, + { + "epoch": 0.3839185527792222, + "grad_norm": 1.2344998646608991, + "learning_rate": 3.391396033564446e-05, + "loss": 0.7226, + "num_input_tokens_seen": 633104640, + "step": 3507 + }, + { + "epoch": 0.3840280248501601, + "grad_norm": 1.3065128007298932, + "learning_rate": 3.390592634759998e-05, + "loss": 0.726, + "num_input_tokens_seen": 633298176, + "step": 3508 + }, + { + "epoch": 0.384137496921098, + "grad_norm": 1.2880244844135265, + "learning_rate": 3.389789130600144e-05, + "loss": 0.583, + "num_input_tokens_seen": 633474688, + "step": 3509 + }, + { + "epoch": 0.3842469689920359, + "grad_norm": 1.2328191682081622, + "learning_rate": 3.388985521179937e-05, + "loss": 0.6932, + "num_input_tokens_seen": 633633504, + "step": 3510 + }, + { + "epoch": 0.3843564410629738, + "grad_norm": 1.2569267044548398, + "learning_rate": 3.3881818065944416e-05, + "loss": 0.7529, + "num_input_tokens_seen": 633821664, + "step": 3511 + }, + { + "epoch": 0.3844659131339117, + "grad_norm": 1.259847049890961, + "learning_rate": 3.3873779869387356e-05, + "loss": 0.5711, + "num_input_tokens_seen": 633989440, + "step": 3512 + }, + { + "epoch": 0.38457538520484963, + "grad_norm": 1.3115399346127146, + "learning_rate": 3.3865740623079116e-05, + "loss": 0.7372, + "num_input_tokens_seen": 634195744, + "step": 3513 + }, + { + "epoch": 0.3846848572757875, + "grad_norm": 1.1957323421583768, + "learning_rate": 3.3857700327970696e-05, + "loss": 0.5626, + "num_input_tokens_seen": 634389056, + "step": 3514 + }, + { + "epoch": 0.3847943293467254, + "grad_norm": 1.3787928202283624, + "learning_rate": 3.384965898501327e-05, + "loss": 0.7285, + "num_input_tokens_seen": 634581920, + "step": 3515 + }, + { + "epoch": 0.38490380141766334, + "grad_norm": 1.190104339176669, + "learning_rate": 3.384161659515811e-05, + "loss": 0.6266, + "num_input_tokens_seen": 634760896, + "step": 3516 + }, + { + "epoch": 0.3850132734886012, + "grad_norm": 1.4607525876773266, + "learning_rate": 3.38335731593566e-05, + "loss": 0.8768, + "num_input_tokens_seen": 634932704, + "step": 3517 + }, + { + "epoch": 0.3851227455595391, + "grad_norm": 1.273768937678592, + "learning_rate": 3.382552867856027e-05, + "loss": 0.5249, + "num_input_tokens_seen": 635088160, + "step": 3518 + }, + { + "epoch": 0.38523221763047705, + "grad_norm": 1.297616401992974, + "learning_rate": 3.381748315372077e-05, + "loss": 0.7001, + "num_input_tokens_seen": 635257056, + "step": 3519 + }, + { + "epoch": 0.3853416897014149, + "grad_norm": 1.1636977680420588, + "learning_rate": 3.380943658578987e-05, + "loss": 0.6903, + "num_input_tokens_seen": 635440064, + "step": 3520 + }, + { + "epoch": 0.38545116177235283, + "grad_norm": 1.255464245468253, + "learning_rate": 3.380138897571946e-05, + "loss": 0.691, + "num_input_tokens_seen": 635608288, + "step": 3521 + }, + { + "epoch": 0.38556063384329076, + "grad_norm": 1.2700866892656646, + "learning_rate": 3.379334032446157e-05, + "loss": 0.6702, + "num_input_tokens_seen": 635775168, + "step": 3522 + }, + { + "epoch": 0.3856701059142286, + "grad_norm": 1.4158263612739805, + "learning_rate": 3.378529063296832e-05, + "loss": 0.7594, + "num_input_tokens_seen": 635963776, + "step": 3523 + }, + { + "epoch": 0.38577957798516654, + "grad_norm": 1.0678225324392214, + "learning_rate": 3.377723990219198e-05, + "loss": 0.5231, + "num_input_tokens_seen": 636139840, + "step": 3524 + }, + { + "epoch": 0.3858890500561044, + "grad_norm": 1.3130078468553132, + "learning_rate": 3.376918813308495e-05, + "loss": 0.6448, + "num_input_tokens_seen": 636298656, + "step": 3525 + }, + { + "epoch": 0.38599852212704233, + "grad_norm": 1.1859038819384373, + "learning_rate": 3.3761135326599716e-05, + "loss": 0.5687, + "num_input_tokens_seen": 636446496, + "step": 3526 + }, + { + "epoch": 0.38610799419798025, + "grad_norm": 1.2476672339866572, + "learning_rate": 3.375308148368893e-05, + "loss": 0.648, + "num_input_tokens_seen": 636618528, + "step": 3527 + }, + { + "epoch": 0.3862174662689181, + "grad_norm": 1.2914732613920998, + "learning_rate": 3.374502660530534e-05, + "loss": 0.6723, + "num_input_tokens_seen": 636798176, + "step": 3528 + }, + { + "epoch": 0.38632693833985604, + "grad_norm": 1.3341156879544331, + "learning_rate": 3.373697069240181e-05, + "loss": 0.6954, + "num_input_tokens_seen": 636998880, + "step": 3529 + }, + { + "epoch": 0.38643641041079396, + "grad_norm": 1.3130666781024043, + "learning_rate": 3.3728913745931356e-05, + "loss": 0.6488, + "num_input_tokens_seen": 637168224, + "step": 3530 + }, + { + "epoch": 0.38654588248173183, + "grad_norm": 1.213110420104061, + "learning_rate": 3.372085576684709e-05, + "loss": 0.8162, + "num_input_tokens_seen": 637372960, + "step": 3531 + }, + { + "epoch": 0.38665535455266975, + "grad_norm": 1.287816225494054, + "learning_rate": 3.371279675610226e-05, + "loss": 0.6996, + "num_input_tokens_seen": 637532224, + "step": 3532 + }, + { + "epoch": 0.38676482662360767, + "grad_norm": 1.2499525061167753, + "learning_rate": 3.370473671465022e-05, + "loss": 0.6445, + "num_input_tokens_seen": 637691264, + "step": 3533 + }, + { + "epoch": 0.38687429869454554, + "grad_norm": 1.6889532683814816, + "learning_rate": 3.369667564344449e-05, + "loss": 0.838, + "num_input_tokens_seen": 637835744, + "step": 3534 + }, + { + "epoch": 0.38698377076548346, + "grad_norm": 1.351228661734808, + "learning_rate": 3.368861354343863e-05, + "loss": 0.8217, + "num_input_tokens_seen": 638006432, + "step": 3535 + }, + { + "epoch": 0.3870932428364214, + "grad_norm": 1.4494886121351884, + "learning_rate": 3.3680550415586416e-05, + "loss": 0.7862, + "num_input_tokens_seen": 638203104, + "step": 3536 + }, + { + "epoch": 0.38720271490735925, + "grad_norm": 1.14987067241748, + "learning_rate": 3.367248626084168e-05, + "loss": 0.6576, + "num_input_tokens_seen": 638397088, + "step": 3537 + }, + { + "epoch": 0.38731218697829717, + "grad_norm": 1.2514911817025853, + "learning_rate": 3.3664421080158394e-05, + "loss": 0.8003, + "num_input_tokens_seen": 638563968, + "step": 3538 + }, + { + "epoch": 0.3874216590492351, + "grad_norm": 1.2773435166851612, + "learning_rate": 3.365635487449065e-05, + "loss": 0.7296, + "num_input_tokens_seen": 638744288, + "step": 3539 + }, + { + "epoch": 0.38753113112017296, + "grad_norm": 1.2567058457372855, + "learning_rate": 3.364828764479269e-05, + "loss": 0.7647, + "num_input_tokens_seen": 638930432, + "step": 3540 + }, + { + "epoch": 0.3876406031911109, + "grad_norm": 1.1821872424196733, + "learning_rate": 3.3640219392018824e-05, + "loss": 0.9648, + "num_input_tokens_seen": 639144352, + "step": 3541 + }, + { + "epoch": 0.38775007526204874, + "grad_norm": 1.2598631355339376, + "learning_rate": 3.3632150117123524e-05, + "loss": 0.5847, + "num_input_tokens_seen": 639315712, + "step": 3542 + }, + { + "epoch": 0.38785954733298666, + "grad_norm": 1.2752892072700863, + "learning_rate": 3.362407982106136e-05, + "loss": 0.7576, + "num_input_tokens_seen": 639479456, + "step": 3543 + }, + { + "epoch": 0.3879690194039246, + "grad_norm": 1.1159820455453637, + "learning_rate": 3.361600850478704e-05, + "loss": 0.5505, + "num_input_tokens_seen": 639658208, + "step": 3544 + }, + { + "epoch": 0.38807849147486245, + "grad_norm": 1.2300956039180218, + "learning_rate": 3.3607936169255396e-05, + "loss": 0.7474, + "num_input_tokens_seen": 639846368, + "step": 3545 + }, + { + "epoch": 0.3881879635458004, + "grad_norm": 1.2062133190527207, + "learning_rate": 3.359986281542135e-05, + "loss": 0.6702, + "num_input_tokens_seen": 640042368, + "step": 3546 + }, + { + "epoch": 0.3882974356167383, + "grad_norm": 1.4262798116980147, + "learning_rate": 3.359178844423998e-05, + "loss": 0.7807, + "num_input_tokens_seen": 640197824, + "step": 3547 + }, + { + "epoch": 0.38840690768767616, + "grad_norm": 1.3033871595108721, + "learning_rate": 3.3583713056666454e-05, + "loss": 0.5957, + "num_input_tokens_seen": 640378592, + "step": 3548 + }, + { + "epoch": 0.3885163797586141, + "grad_norm": 1.2712409142363628, + "learning_rate": 3.3575636653656094e-05, + "loss": 0.8118, + "num_input_tokens_seen": 640575936, + "step": 3549 + }, + { + "epoch": 0.388625851829552, + "grad_norm": 1.3670515592293624, + "learning_rate": 3.35675592361643e-05, + "loss": 0.6425, + "num_input_tokens_seen": 640761856, + "step": 3550 + }, + { + "epoch": 0.38873532390048987, + "grad_norm": 1.4408881023728284, + "learning_rate": 3.3559480805146634e-05, + "loss": 0.9478, + "num_input_tokens_seen": 640956512, + "step": 3551 + }, + { + "epoch": 0.3888447959714278, + "grad_norm": 1.4417724144286173, + "learning_rate": 3.355140136155875e-05, + "loss": 0.6906, + "num_input_tokens_seen": 641124288, + "step": 3552 + }, + { + "epoch": 0.3889542680423657, + "grad_norm": 1.4904500222821624, + "learning_rate": 3.354332090635643e-05, + "loss": 0.6998, + "num_input_tokens_seen": 641308416, + "step": 3553 + }, + { + "epoch": 0.3890637401133036, + "grad_norm": 1.274303025125641, + "learning_rate": 3.353523944049558e-05, + "loss": 0.6442, + "num_input_tokens_seen": 641491872, + "step": 3554 + }, + { + "epoch": 0.3891732121842415, + "grad_norm": 1.325317242269165, + "learning_rate": 3.352715696493222e-05, + "loss": 0.7195, + "num_input_tokens_seen": 641643520, + "step": 3555 + }, + { + "epoch": 0.3892826842551794, + "grad_norm": 1.1855476794169428, + "learning_rate": 3.3519073480622495e-05, + "loss": 0.6733, + "num_input_tokens_seen": 641850272, + "step": 3556 + }, + { + "epoch": 0.3893921563261173, + "grad_norm": 1.2834406477576314, + "learning_rate": 3.351098898852266e-05, + "loss": 0.8, + "num_input_tokens_seen": 642023200, + "step": 3557 + }, + { + "epoch": 0.3895016283970552, + "grad_norm": 1.2212063415907466, + "learning_rate": 3.35029034895891e-05, + "loss": 0.7061, + "num_input_tokens_seen": 642213600, + "step": 3558 + }, + { + "epoch": 0.3896111004679931, + "grad_norm": 1.2301096558731714, + "learning_rate": 3.349481698477831e-05, + "loss": 0.6759, + "num_input_tokens_seen": 642375328, + "step": 3559 + }, + { + "epoch": 0.389720572538931, + "grad_norm": 1.375501064556657, + "learning_rate": 3.348672947504691e-05, + "loss": 0.7953, + "num_input_tokens_seen": 642554976, + "step": 3560 + }, + { + "epoch": 0.3898300446098689, + "grad_norm": 1.3399131912609306, + "learning_rate": 3.3478640961351635e-05, + "loss": 0.596, + "num_input_tokens_seen": 642734176, + "step": 3561 + }, + { + "epoch": 0.3899395166808068, + "grad_norm": 1.3753826302553607, + "learning_rate": 3.3470551444649346e-05, + "loss": 0.696, + "num_input_tokens_seen": 642904864, + "step": 3562 + }, + { + "epoch": 0.3900489887517447, + "grad_norm": 1.2634350231900258, + "learning_rate": 3.346246092589702e-05, + "loss": 0.7262, + "num_input_tokens_seen": 643093696, + "step": 3563 + }, + { + "epoch": 0.39015846082268263, + "grad_norm": 1.3173215949424226, + "learning_rate": 3.3454369406051736e-05, + "loss": 0.7172, + "num_input_tokens_seen": 643247808, + "step": 3564 + }, + { + "epoch": 0.3902679328936205, + "grad_norm": 1.248691589318223, + "learning_rate": 3.344627688607071e-05, + "loss": 0.755, + "num_input_tokens_seen": 643417152, + "step": 3565 + }, + { + "epoch": 0.3903774049645584, + "grad_norm": 1.1703389591337876, + "learning_rate": 3.343818336691128e-05, + "loss": 0.7633, + "num_input_tokens_seen": 643612928, + "step": 3566 + }, + { + "epoch": 0.39048687703549634, + "grad_norm": 1.3581597168475015, + "learning_rate": 3.3430088849530886e-05, + "loss": 0.8775, + "num_input_tokens_seen": 643809600, + "step": 3567 + }, + { + "epoch": 0.3905963491064342, + "grad_norm": 1.2805737362194176, + "learning_rate": 3.34219933348871e-05, + "loss": 0.8137, + "num_input_tokens_seen": 643994400, + "step": 3568 + }, + { + "epoch": 0.3907058211773721, + "grad_norm": 1.0562180328046171, + "learning_rate": 3.34138968239376e-05, + "loss": 0.5446, + "num_input_tokens_seen": 644182784, + "step": 3569 + }, + { + "epoch": 0.39081529324831005, + "grad_norm": 1.0796564255036636, + "learning_rate": 3.3405799317640196e-05, + "loss": 0.5222, + "num_input_tokens_seen": 644384832, + "step": 3570 + }, + { + "epoch": 0.3909247653192479, + "grad_norm": 1.2206941894204206, + "learning_rate": 3.3397700816952795e-05, + "loss": 0.7357, + "num_input_tokens_seen": 644547008, + "step": 3571 + }, + { + "epoch": 0.39103423739018583, + "grad_norm": 1.3159253065368237, + "learning_rate": 3.3389601322833454e-05, + "loss": 0.7655, + "num_input_tokens_seen": 644707616, + "step": 3572 + }, + { + "epoch": 0.39114370946112376, + "grad_norm": 1.2684138148601778, + "learning_rate": 3.3381500836240296e-05, + "loss": 0.6157, + "num_input_tokens_seen": 644873152, + "step": 3573 + }, + { + "epoch": 0.3912531815320616, + "grad_norm": 1.3135177208787245, + "learning_rate": 3.337339935813163e-05, + "loss": 0.6822, + "num_input_tokens_seen": 645043168, + "step": 3574 + }, + { + "epoch": 0.39136265360299954, + "grad_norm": 1.1272707380839386, + "learning_rate": 3.3365296889465814e-05, + "loss": 0.5988, + "num_input_tokens_seen": 645246112, + "step": 3575 + }, + { + "epoch": 0.3914721256739374, + "grad_norm": 1.094590708651246, + "learning_rate": 3.3357193431201374e-05, + "loss": 0.6227, + "num_input_tokens_seen": 645443232, + "step": 3576 + }, + { + "epoch": 0.39158159774487533, + "grad_norm": 1.3726788783284434, + "learning_rate": 3.3349088984296916e-05, + "loss": 0.7044, + "num_input_tokens_seen": 645619296, + "step": 3577 + }, + { + "epoch": 0.39169106981581325, + "grad_norm": 1.170473404990806, + "learning_rate": 3.33409835497112e-05, + "loss": 0.6025, + "num_input_tokens_seen": 645810816, + "step": 3578 + }, + { + "epoch": 0.3918005418867511, + "grad_norm": 1.1625564869108442, + "learning_rate": 3.333287712840308e-05, + "loss": 0.5436, + "num_input_tokens_seen": 645967616, + "step": 3579 + }, + { + "epoch": 0.39191001395768904, + "grad_norm": 1.363762144949005, + "learning_rate": 3.3324769721331515e-05, + "loss": 0.6143, + "num_input_tokens_seen": 646097984, + "step": 3580 + }, + { + "epoch": 0.39201948602862696, + "grad_norm": 1.228761487656813, + "learning_rate": 3.331666132945562e-05, + "loss": 0.7269, + "num_input_tokens_seen": 646266656, + "step": 3581 + }, + { + "epoch": 0.39212895809956483, + "grad_norm": 1.2940646134566307, + "learning_rate": 3.3308551953734576e-05, + "loss": 0.7231, + "num_input_tokens_seen": 646443168, + "step": 3582 + }, + { + "epoch": 0.39223843017050275, + "grad_norm": 1.303883653265525, + "learning_rate": 3.330044159512773e-05, + "loss": 0.6935, + "num_input_tokens_seen": 646636928, + "step": 3583 + }, + { + "epoch": 0.39234790224144067, + "grad_norm": 1.077965987268732, + "learning_rate": 3.3292330254594504e-05, + "loss": 0.6439, + "num_input_tokens_seen": 646803360, + "step": 3584 + }, + { + "epoch": 0.39245737431237854, + "grad_norm": 1.1884772144776612, + "learning_rate": 3.3284217933094465e-05, + "loss": 0.6084, + "num_input_tokens_seen": 647002272, + "step": 3585 + }, + { + "epoch": 0.39256684638331646, + "grad_norm": 1.186322984128483, + "learning_rate": 3.3276104631587274e-05, + "loss": 0.7016, + "num_input_tokens_seen": 647190432, + "step": 3586 + }, + { + "epoch": 0.3926763184542544, + "grad_norm": 1.190385625209975, + "learning_rate": 3.326799035103273e-05, + "loss": 0.7, + "num_input_tokens_seen": 647349696, + "step": 3587 + }, + { + "epoch": 0.39278579052519225, + "grad_norm": 1.2462223668224963, + "learning_rate": 3.325987509239074e-05, + "loss": 0.6261, + "num_input_tokens_seen": 647548160, + "step": 3588 + }, + { + "epoch": 0.39289526259613017, + "grad_norm": 1.3320126797900231, + "learning_rate": 3.3251758856621303e-05, + "loss": 0.6884, + "num_input_tokens_seen": 647726016, + "step": 3589 + }, + { + "epoch": 0.3930047346670681, + "grad_norm": 1.144188683185246, + "learning_rate": 3.324364164468458e-05, + "loss": 0.5285, + "num_input_tokens_seen": 647920224, + "step": 3590 + }, + { + "epoch": 0.39311420673800596, + "grad_norm": 1.27043124649191, + "learning_rate": 3.3235523457540805e-05, + "loss": 0.6309, + "num_input_tokens_seen": 648098752, + "step": 3591 + }, + { + "epoch": 0.3932236788089439, + "grad_norm": 1.340730068117669, + "learning_rate": 3.322740429615035e-05, + "loss": 0.6196, + "num_input_tokens_seen": 648280864, + "step": 3592 + }, + { + "epoch": 0.39333315087988174, + "grad_norm": 1.3109387694433383, + "learning_rate": 3.32192841614737e-05, + "loss": 0.8418, + "num_input_tokens_seen": 648464768, + "step": 3593 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 1.320735795825061, + "learning_rate": 3.321116305447143e-05, + "loss": 0.5551, + "num_input_tokens_seen": 648630528, + "step": 3594 + }, + { + "epoch": 0.3935520950217576, + "grad_norm": 1.1608416655096068, + "learning_rate": 3.3203040976104285e-05, + "loss": 0.6282, + "num_input_tokens_seen": 648822048, + "step": 3595 + }, + { + "epoch": 0.39366156709269545, + "grad_norm": 1.2073681308777233, + "learning_rate": 3.319491792733307e-05, + "loss": 0.7658, + "num_input_tokens_seen": 648998336, + "step": 3596 + }, + { + "epoch": 0.3937710391636334, + "grad_norm": 1.2663758017134827, + "learning_rate": 3.318679390911873e-05, + "loss": 0.9055, + "num_input_tokens_seen": 649200608, + "step": 3597 + }, + { + "epoch": 0.3938805112345713, + "grad_norm": 1.2470689264228545, + "learning_rate": 3.317866892242231e-05, + "loss": 0.6582, + "num_input_tokens_seen": 649363008, + "step": 3598 + }, + { + "epoch": 0.39398998330550916, + "grad_norm": 1.191243144893236, + "learning_rate": 3.3170542968205e-05, + "loss": 0.8219, + "num_input_tokens_seen": 649561248, + "step": 3599 + }, + { + "epoch": 0.3940994553764471, + "grad_norm": 1.2888305917843326, + "learning_rate": 3.316241604742807e-05, + "loss": 0.7225, + "num_input_tokens_seen": 649715136, + "step": 3600 + }, + { + "epoch": 0.394208927447385, + "grad_norm": 1.3329266981101362, + "learning_rate": 3.3154288161052936e-05, + "loss": 0.8177, + "num_input_tokens_seen": 649896576, + "step": 3601 + }, + { + "epoch": 0.39431839951832287, + "grad_norm": 1.2269979785103533, + "learning_rate": 3.3146159310041095e-05, + "loss": 0.6565, + "num_input_tokens_seen": 650093024, + "step": 3602 + }, + { + "epoch": 0.3944278715892608, + "grad_norm": 1.377782260894318, + "learning_rate": 3.3138029495354184e-05, + "loss": 0.9172, + "num_input_tokens_seen": 650274912, + "step": 3603 + }, + { + "epoch": 0.3945373436601987, + "grad_norm": 1.2502226154461102, + "learning_rate": 3.3129898717953946e-05, + "loss": 0.6353, + "num_input_tokens_seen": 650463296, + "step": 3604 + }, + { + "epoch": 0.3946468157311366, + "grad_norm": 1.26283469909625, + "learning_rate": 3.312176697880222e-05, + "loss": 0.661, + "num_input_tokens_seen": 650634208, + "step": 3605 + }, + { + "epoch": 0.3947562878020745, + "grad_norm": 1.2285668100146678, + "learning_rate": 3.3113634278860994e-05, + "loss": 0.6968, + "num_input_tokens_seen": 650821024, + "step": 3606 + }, + { + "epoch": 0.3948657598730124, + "grad_norm": 1.266769598032897, + "learning_rate": 3.310550061909233e-05, + "loss": 0.8367, + "num_input_tokens_seen": 650989472, + "step": 3607 + }, + { + "epoch": 0.3949752319439503, + "grad_norm": 1.310839920140568, + "learning_rate": 3.3097366000458454e-05, + "loss": 0.5892, + "num_input_tokens_seen": 651161952, + "step": 3608 + }, + { + "epoch": 0.3950847040148882, + "grad_norm": 1.2365080836548983, + "learning_rate": 3.308923042392165e-05, + "loss": 0.6177, + "num_input_tokens_seen": 651331072, + "step": 3609 + }, + { + "epoch": 0.3951941760858261, + "grad_norm": 1.2738391266283098, + "learning_rate": 3.308109389044436e-05, + "loss": 0.6569, + "num_input_tokens_seen": 651499296, + "step": 3610 + }, + { + "epoch": 0.395303648156764, + "grad_norm": 1.2677549641458778, + "learning_rate": 3.3072956400989103e-05, + "loss": 0.5741, + "num_input_tokens_seen": 651671552, + "step": 3611 + }, + { + "epoch": 0.3954131202277019, + "grad_norm": 1.3662905557377094, + "learning_rate": 3.306481795651854e-05, + "loss": 0.7297, + "num_input_tokens_seen": 651842016, + "step": 3612 + }, + { + "epoch": 0.3955225922986398, + "grad_norm": 1.4891330153909912, + "learning_rate": 3.3056678557995434e-05, + "loss": 0.5838, + "num_input_tokens_seen": 652005536, + "step": 3613 + }, + { + "epoch": 0.3956320643695777, + "grad_norm": 1.2231512560631004, + "learning_rate": 3.3048538206382645e-05, + "loss": 0.5869, + "num_input_tokens_seen": 652154272, + "step": 3614 + }, + { + "epoch": 0.39574153644051563, + "grad_norm": 1.2589303966458743, + "learning_rate": 3.3040396902643186e-05, + "loss": 0.6406, + "num_input_tokens_seen": 652318240, + "step": 3615 + }, + { + "epoch": 0.3958510085114535, + "grad_norm": 1.1263786451598032, + "learning_rate": 3.3032254647740135e-05, + "loss": 0.6463, + "num_input_tokens_seen": 652484448, + "step": 3616 + }, + { + "epoch": 0.3959604805823914, + "grad_norm": 1.2446715271672166, + "learning_rate": 3.3024111442636716e-05, + "loss": 0.63, + "num_input_tokens_seen": 652647968, + "step": 3617 + }, + { + "epoch": 0.39606995265332934, + "grad_norm": 1.2280848586338806, + "learning_rate": 3.3015967288296256e-05, + "loss": 0.7043, + "num_input_tokens_seen": 652804992, + "step": 3618 + }, + { + "epoch": 0.3961794247242672, + "grad_norm": 1.1692391825288329, + "learning_rate": 3.300782218568218e-05, + "loss": 0.6614, + "num_input_tokens_seen": 653006368, + "step": 3619 + }, + { + "epoch": 0.3962888967952051, + "grad_norm": 1.0807395413678265, + "learning_rate": 3.299967613575806e-05, + "loss": 0.5085, + "num_input_tokens_seen": 653202144, + "step": 3620 + }, + { + "epoch": 0.39639836886614305, + "grad_norm": 1.3740556681959624, + "learning_rate": 3.299152913948754e-05, + "loss": 0.7079, + "num_input_tokens_seen": 653367904, + "step": 3621 + }, + { + "epoch": 0.3965078409370809, + "grad_norm": 1.2717832340062187, + "learning_rate": 3.298338119783439e-05, + "loss": 0.6152, + "num_input_tokens_seen": 653579584, + "step": 3622 + }, + { + "epoch": 0.39661731300801883, + "grad_norm": 1.2752676608057942, + "learning_rate": 3.297523231176253e-05, + "loss": 0.6164, + "num_input_tokens_seen": 653740192, + "step": 3623 + }, + { + "epoch": 0.39672678507895676, + "grad_norm": 1.263821623948704, + "learning_rate": 3.296708248223592e-05, + "loss": 0.7469, + "num_input_tokens_seen": 653919168, + "step": 3624 + }, + { + "epoch": 0.3968362571498946, + "grad_norm": 1.2043484498894914, + "learning_rate": 3.295893171021868e-05, + "loss": 0.7527, + "num_input_tokens_seen": 654093888, + "step": 3625 + }, + { + "epoch": 0.39694572922083254, + "grad_norm": 1.2429253168387997, + "learning_rate": 3.295077999667504e-05, + "loss": 0.7998, + "num_input_tokens_seen": 654303776, + "step": 3626 + }, + { + "epoch": 0.3970552012917704, + "grad_norm": 1.2930717312003301, + "learning_rate": 3.294262734256933e-05, + "loss": 0.5605, + "num_input_tokens_seen": 654472000, + "step": 3627 + }, + { + "epoch": 0.39716467336270833, + "grad_norm": 1.3317386607895256, + "learning_rate": 3.2934473748865976e-05, + "loss": 0.5277, + "num_input_tokens_seen": 654634400, + "step": 3628 + }, + { + "epoch": 0.39727414543364625, + "grad_norm": 1.2690993293203154, + "learning_rate": 3.292631921652955e-05, + "loss": 0.7428, + "num_input_tokens_seen": 654824576, + "step": 3629 + }, + { + "epoch": 0.3973836175045841, + "grad_norm": 1.3802729417566013, + "learning_rate": 3.2918163746524714e-05, + "loss": 0.6288, + "num_input_tokens_seen": 654989888, + "step": 3630 + }, + { + "epoch": 0.39749308957552204, + "grad_norm": 1.164028397643034, + "learning_rate": 3.291000733981624e-05, + "loss": 0.7729, + "num_input_tokens_seen": 655178272, + "step": 3631 + }, + { + "epoch": 0.39760256164645996, + "grad_norm": 1.402317567543707, + "learning_rate": 3.290184999736903e-05, + "loss": 0.6577, + "num_input_tokens_seen": 655374944, + "step": 3632 + }, + { + "epoch": 0.39771203371739783, + "grad_norm": 1.3285255725626135, + "learning_rate": 3.2893691720148064e-05, + "loss": 0.8782, + "num_input_tokens_seen": 655561312, + "step": 3633 + }, + { + "epoch": 0.39782150578833575, + "grad_norm": 1.2818462101136592, + "learning_rate": 3.2885532509118446e-05, + "loss": 0.7927, + "num_input_tokens_seen": 655758656, + "step": 3634 + }, + { + "epoch": 0.39793097785927367, + "grad_norm": 1.2217396478247546, + "learning_rate": 3.2877372365245426e-05, + "loss": 0.6926, + "num_input_tokens_seen": 655941888, + "step": 3635 + }, + { + "epoch": 0.39804044993021154, + "grad_norm": 1.2455841265995515, + "learning_rate": 3.28692112894943e-05, + "loss": 0.7179, + "num_input_tokens_seen": 656100032, + "step": 3636 + }, + { + "epoch": 0.39814992200114946, + "grad_norm": 1.09391826970123, + "learning_rate": 3.286104928283054e-05, + "loss": 0.5905, + "num_input_tokens_seen": 656296704, + "step": 3637 + }, + { + "epoch": 0.3982593940720874, + "grad_norm": 1.2616370207118368, + "learning_rate": 3.285288634621966e-05, + "loss": 0.6305, + "num_input_tokens_seen": 656470528, + "step": 3638 + }, + { + "epoch": 0.39836886614302525, + "grad_norm": 1.2925541146643078, + "learning_rate": 3.2844722480627346e-05, + "loss": 0.8518, + "num_input_tokens_seen": 656636960, + "step": 3639 + }, + { + "epoch": 0.39847833821396317, + "grad_norm": 1.2377343158255998, + "learning_rate": 3.2836557687019356e-05, + "loss": 0.7705, + "num_input_tokens_seen": 656814368, + "step": 3640 + }, + { + "epoch": 0.3985878102849011, + "grad_norm": 1.308449261787635, + "learning_rate": 3.2828391966361574e-05, + "loss": 0.7442, + "num_input_tokens_seen": 657002080, + "step": 3641 + }, + { + "epoch": 0.39869728235583896, + "grad_norm": 1.3069297117203984, + "learning_rate": 3.2820225319619985e-05, + "loss": 0.7696, + "num_input_tokens_seen": 657186656, + "step": 3642 + }, + { + "epoch": 0.3988067544267769, + "grad_norm": 1.397152699425544, + "learning_rate": 3.281205774776069e-05, + "loss": 0.6857, + "num_input_tokens_seen": 657345696, + "step": 3643 + }, + { + "epoch": 0.39891622649771474, + "grad_norm": 1.2724797678540034, + "learning_rate": 3.280388925174991e-05, + "loss": 0.6552, + "num_input_tokens_seen": 657547296, + "step": 3644 + }, + { + "epoch": 0.39902569856865266, + "grad_norm": 1.173183067156921, + "learning_rate": 3.279571983255394e-05, + "loss": 0.4807, + "num_input_tokens_seen": 657695584, + "step": 3645 + }, + { + "epoch": 0.3991351706395906, + "grad_norm": 1.242954904801652, + "learning_rate": 3.278754949113921e-05, + "loss": 0.5602, + "num_input_tokens_seen": 657857088, + "step": 3646 + }, + { + "epoch": 0.39924464271052845, + "grad_norm": 1.131322999569468, + "learning_rate": 3.277937822847228e-05, + "loss": 0.6172, + "num_input_tokens_seen": 658014784, + "step": 3647 + }, + { + "epoch": 0.3993541147814664, + "grad_norm": 1.2692983090830543, + "learning_rate": 3.277120604551976e-05, + "loss": 0.5639, + "num_input_tokens_seen": 658137984, + "step": 3648 + }, + { + "epoch": 0.3994635868524043, + "grad_norm": 1.269377901645877, + "learning_rate": 3.276303294324843e-05, + "loss": 0.8239, + "num_input_tokens_seen": 658332864, + "step": 3649 + }, + { + "epoch": 0.39957305892334216, + "grad_norm": 1.3896857958320614, + "learning_rate": 3.275485892262514e-05, + "loss": 0.7238, + "num_input_tokens_seen": 658513408, + "step": 3650 + }, + { + "epoch": 0.3996825309942801, + "grad_norm": 1.3958951974321174, + "learning_rate": 3.274668398461686e-05, + "loss": 0.7913, + "num_input_tokens_seen": 658691712, + "step": 3651 + }, + { + "epoch": 0.399792003065218, + "grad_norm": 1.4004636184205108, + "learning_rate": 3.273850813019068e-05, + "loss": 0.8298, + "num_input_tokens_seen": 658890400, + "step": 3652 + }, + { + "epoch": 0.39990147513615587, + "grad_norm": 1.3085576294426045, + "learning_rate": 3.273033136031378e-05, + "loss": 0.6915, + "num_input_tokens_seen": 659069376, + "step": 3653 + }, + { + "epoch": 0.4000109472070938, + "grad_norm": 1.3150618845371322, + "learning_rate": 3.272215367595346e-05, + "loss": 0.852, + "num_input_tokens_seen": 659232000, + "step": 3654 + }, + { + "epoch": 0.4001204192780317, + "grad_norm": 1.4104768172059068, + "learning_rate": 3.271397507807712e-05, + "loss": 0.7987, + "num_input_tokens_seen": 659419264, + "step": 3655 + }, + { + "epoch": 0.4002298913489696, + "grad_norm": 1.3544353316406197, + "learning_rate": 3.2705795567652276e-05, + "loss": 0.7644, + "num_input_tokens_seen": 659593536, + "step": 3656 + }, + { + "epoch": 0.4003393634199075, + "grad_norm": 1.3863142256310037, + "learning_rate": 3.269761514564655e-05, + "loss": 0.8496, + "num_input_tokens_seen": 659793120, + "step": 3657 + }, + { + "epoch": 0.4004488354908454, + "grad_norm": 1.1732675035570337, + "learning_rate": 3.268943381302767e-05, + "loss": 0.6602, + "num_input_tokens_seen": 659992032, + "step": 3658 + }, + { + "epoch": 0.4005583075617833, + "grad_norm": 1.2539254540609825, + "learning_rate": 3.268125157076346e-05, + "loss": 0.8245, + "num_input_tokens_seen": 660171680, + "step": 3659 + }, + { + "epoch": 0.4006677796327212, + "grad_norm": 1.389121281996713, + "learning_rate": 3.267306841982188e-05, + "loss": 0.8781, + "num_input_tokens_seen": 660325792, + "step": 3660 + }, + { + "epoch": 0.4007772517036591, + "grad_norm": 1.191549073710659, + "learning_rate": 3.266488436117097e-05, + "loss": 0.7475, + "num_input_tokens_seen": 660498944, + "step": 3661 + }, + { + "epoch": 0.400886723774597, + "grad_norm": 1.2752438237291768, + "learning_rate": 3.265669939577889e-05, + "loss": 0.7077, + "num_input_tokens_seen": 660675680, + "step": 3662 + }, + { + "epoch": 0.4009961958455349, + "grad_norm": 1.0770621105196265, + "learning_rate": 3.264851352461391e-05, + "loss": 0.5512, + "num_input_tokens_seen": 660871904, + "step": 3663 + }, + { + "epoch": 0.4011056679164728, + "grad_norm": 1.2632799911728663, + "learning_rate": 3.26403267486444e-05, + "loss": 0.8656, + "num_input_tokens_seen": 661040352, + "step": 3664 + }, + { + "epoch": 0.4012151399874107, + "grad_norm": 1.0695110020974459, + "learning_rate": 3.263213906883885e-05, + "loss": 0.7723, + "num_input_tokens_seen": 661238816, + "step": 3665 + }, + { + "epoch": 0.40132461205834863, + "grad_norm": 1.2395630468850964, + "learning_rate": 3.262395048616584e-05, + "loss": 0.6963, + "num_input_tokens_seen": 661412864, + "step": 3666 + }, + { + "epoch": 0.4014340841292865, + "grad_norm": 1.2889644238702296, + "learning_rate": 3.2615761001594055e-05, + "loss": 0.7936, + "num_input_tokens_seen": 661580864, + "step": 3667 + }, + { + "epoch": 0.4015435562002244, + "grad_norm": 1.3078531475427257, + "learning_rate": 3.26075706160923e-05, + "loss": 0.8228, + "num_input_tokens_seen": 661773952, + "step": 3668 + }, + { + "epoch": 0.40165302827116234, + "grad_norm": 1.2653919994490215, + "learning_rate": 3.259937933062949e-05, + "loss": 0.7054, + "num_input_tokens_seen": 661946656, + "step": 3669 + }, + { + "epoch": 0.4017625003421002, + "grad_norm": 1.1054932284930643, + "learning_rate": 3.2591187146174636e-05, + "loss": 0.5957, + "num_input_tokens_seen": 662106816, + "step": 3670 + }, + { + "epoch": 0.4018719724130381, + "grad_norm": 1.3708221079512648, + "learning_rate": 3.258299406369685e-05, + "loss": 0.95, + "num_input_tokens_seen": 662288032, + "step": 3671 + }, + { + "epoch": 0.40198144448397605, + "grad_norm": 1.1613410012320147, + "learning_rate": 3.257480008416536e-05, + "loss": 0.7413, + "num_input_tokens_seen": 662493888, + "step": 3672 + }, + { + "epoch": 0.4020909165549139, + "grad_norm": 1.2296061565246283, + "learning_rate": 3.25666052085495e-05, + "loss": 0.7711, + "num_input_tokens_seen": 662673088, + "step": 3673 + }, + { + "epoch": 0.40220038862585183, + "grad_norm": 1.2701056956065053, + "learning_rate": 3.2558409437818714e-05, + "loss": 0.8008, + "num_input_tokens_seen": 662841088, + "step": 3674 + }, + { + "epoch": 0.40230986069678976, + "grad_norm": 1.212205481399557, + "learning_rate": 3.255021277294253e-05, + "loss": 0.6629, + "num_input_tokens_seen": 663009088, + "step": 3675 + }, + { + "epoch": 0.4024193327677276, + "grad_norm": 1.2206261694492846, + "learning_rate": 3.254201521489062e-05, + "loss": 0.801, + "num_input_tokens_seen": 663183808, + "step": 3676 + }, + { + "epoch": 0.40252880483866554, + "grad_norm": 1.4016140285107783, + "learning_rate": 3.253381676463273e-05, + "loss": 0.8856, + "num_input_tokens_seen": 663379584, + "step": 3677 + }, + { + "epoch": 0.4026382769096034, + "grad_norm": 1.2131598967653145, + "learning_rate": 3.252561742313871e-05, + "loss": 0.7237, + "num_input_tokens_seen": 663546688, + "step": 3678 + }, + { + "epoch": 0.40274774898054133, + "grad_norm": 1.6379427100282962, + "learning_rate": 3.2517417191378544e-05, + "loss": 0.7983, + "num_input_tokens_seen": 663757248, + "step": 3679 + }, + { + "epoch": 0.40285722105147925, + "grad_norm": 1.2255325366540977, + "learning_rate": 3.250921607032229e-05, + "loss": 0.7509, + "num_input_tokens_seen": 663943168, + "step": 3680 + }, + { + "epoch": 0.4029666931224171, + "grad_norm": 1.3817897398003052, + "learning_rate": 3.2501014060940135e-05, + "loss": 0.845, + "num_input_tokens_seen": 664134912, + "step": 3681 + }, + { + "epoch": 0.40307616519335504, + "grad_norm": 1.2974342151642344, + "learning_rate": 3.249281116420234e-05, + "loss": 0.661, + "num_input_tokens_seen": 664336288, + "step": 3682 + }, + { + "epoch": 0.40318563726429296, + "grad_norm": 1.3138004853686573, + "learning_rate": 3.248460738107932e-05, + "loss": 0.7089, + "num_input_tokens_seen": 664490176, + "step": 3683 + }, + { + "epoch": 0.40329510933523083, + "grad_norm": 1.242513218047603, + "learning_rate": 3.2476402712541556e-05, + "loss": 0.6887, + "num_input_tokens_seen": 664677440, + "step": 3684 + }, + { + "epoch": 0.40340458140616875, + "grad_norm": 1.2717386627231846, + "learning_rate": 3.246819715955964e-05, + "loss": 0.7156, + "num_input_tokens_seen": 664863360, + "step": 3685 + }, + { + "epoch": 0.40351405347710667, + "grad_norm": 1.286092863280707, + "learning_rate": 3.2459990723104285e-05, + "loss": 0.691, + "num_input_tokens_seen": 665021280, + "step": 3686 + }, + { + "epoch": 0.40362352554804454, + "grad_norm": 1.1662299440055666, + "learning_rate": 3.245178340414628e-05, + "loss": 0.6117, + "num_input_tokens_seen": 665220864, + "step": 3687 + }, + { + "epoch": 0.40373299761898246, + "grad_norm": 1.1990452743458149, + "learning_rate": 3.244357520365654e-05, + "loss": 0.6945, + "num_input_tokens_seen": 665399840, + "step": 3688 + }, + { + "epoch": 0.4038424696899204, + "grad_norm": 1.1704539520383377, + "learning_rate": 3.243536612260609e-05, + "loss": 0.6683, + "num_input_tokens_seen": 665576128, + "step": 3689 + }, + { + "epoch": 0.40395194176085825, + "grad_norm": 1.287552891728141, + "learning_rate": 3.242715616196604e-05, + "loss": 0.507, + "num_input_tokens_seen": 665744576, + "step": 3690 + }, + { + "epoch": 0.40406141383179617, + "grad_norm": 1.2030954728900063, + "learning_rate": 3.241894532270762e-05, + "loss": 0.6367, + "num_input_tokens_seen": 665898240, + "step": 3691 + }, + { + "epoch": 0.4041708859027341, + "grad_norm": 1.1396115647654064, + "learning_rate": 3.2410733605802146e-05, + "loss": 0.5868, + "num_input_tokens_seen": 666071392, + "step": 3692 + }, + { + "epoch": 0.40428035797367196, + "grad_norm": 1.546714928561888, + "learning_rate": 3.240252101222105e-05, + "loss": 0.9129, + "num_input_tokens_seen": 666230208, + "step": 3693 + }, + { + "epoch": 0.4043898300446099, + "grad_norm": 1.272651570254738, + "learning_rate": 3.2394307542935876e-05, + "loss": 0.8384, + "num_input_tokens_seen": 666440768, + "step": 3694 + }, + { + "epoch": 0.4044993021155478, + "grad_norm": 1.2800190758028767, + "learning_rate": 3.2386093198918246e-05, + "loss": 0.6633, + "num_input_tokens_seen": 666601376, + "step": 3695 + }, + { + "epoch": 0.40460877418648566, + "grad_norm": 1.267239612114587, + "learning_rate": 3.237787798113992e-05, + "loss": 0.7502, + "num_input_tokens_seen": 666761760, + "step": 3696 + }, + { + "epoch": 0.4047182462574236, + "grad_norm": 1.277364421525562, + "learning_rate": 3.236966189057273e-05, + "loss": 0.7542, + "num_input_tokens_seen": 666946784, + "step": 3697 + }, + { + "epoch": 0.40482771832836145, + "grad_norm": 1.3170148761259581, + "learning_rate": 3.236144492818862e-05, + "loss": 0.6308, + "num_input_tokens_seen": 667107168, + "step": 3698 + }, + { + "epoch": 0.4049371903992994, + "grad_norm": 1.2279859410733063, + "learning_rate": 3.235322709495966e-05, + "loss": 0.8963, + "num_input_tokens_seen": 667283008, + "step": 3699 + }, + { + "epoch": 0.4050466624702373, + "grad_norm": 1.3487147500396395, + "learning_rate": 3.234500839185799e-05, + "loss": 0.7218, + "num_input_tokens_seen": 667429952, + "step": 3700 + }, + { + "epoch": 0.40515613454117516, + "grad_norm": 1.3047888710706277, + "learning_rate": 3.233678881985586e-05, + "loss": 0.7665, + "num_input_tokens_seen": 667631776, + "step": 3701 + }, + { + "epoch": 0.4052656066121131, + "grad_norm": 1.228903171077104, + "learning_rate": 3.232856837992564e-05, + "loss": 0.9228, + "num_input_tokens_seen": 667821728, + "step": 3702 + }, + { + "epoch": 0.405375078683051, + "grad_norm": 1.3546740046162247, + "learning_rate": 3.232034707303979e-05, + "loss": 0.7327, + "num_input_tokens_seen": 667997120, + "step": 3703 + }, + { + "epoch": 0.40548455075398887, + "grad_norm": 1.2747899462135568, + "learning_rate": 3.231212490017088e-05, + "loss": 0.6943, + "num_input_tokens_seen": 668197152, + "step": 3704 + }, + { + "epoch": 0.4055940228249268, + "grad_norm": 1.258332897127257, + "learning_rate": 3.230390186229157e-05, + "loss": 0.6198, + "num_input_tokens_seen": 668370976, + "step": 3705 + }, + { + "epoch": 0.4057034948958647, + "grad_norm": 1.2346644484700517, + "learning_rate": 3.229567796037463e-05, + "loss": 0.7217, + "num_input_tokens_seen": 668548832, + "step": 3706 + }, + { + "epoch": 0.4058129669668026, + "grad_norm": 1.320327126687195, + "learning_rate": 3.228745319539294e-05, + "loss": 0.5537, + "num_input_tokens_seen": 668709888, + "step": 3707 + }, + { + "epoch": 0.4059224390377405, + "grad_norm": 1.3191903215377156, + "learning_rate": 3.227922756831947e-05, + "loss": 0.6726, + "num_input_tokens_seen": 668870496, + "step": 3708 + }, + { + "epoch": 0.4060319111086784, + "grad_norm": 1.3539414829924061, + "learning_rate": 3.227100108012728e-05, + "loss": 0.7782, + "num_input_tokens_seen": 669060224, + "step": 3709 + }, + { + "epoch": 0.4061413831796163, + "grad_norm": 1.1912747670147819, + "learning_rate": 3.226277373178957e-05, + "loss": 0.5244, + "num_input_tokens_seen": 669217024, + "step": 3710 + }, + { + "epoch": 0.4062508552505542, + "grad_norm": 1.394285249783988, + "learning_rate": 3.2254545524279626e-05, + "loss": 0.6058, + "num_input_tokens_seen": 669393984, + "step": 3711 + }, + { + "epoch": 0.40636032732149213, + "grad_norm": 1.1371531492172813, + "learning_rate": 3.22463164585708e-05, + "loss": 0.6531, + "num_input_tokens_seen": 669539360, + "step": 3712 + }, + { + "epoch": 0.40646979939243, + "grad_norm": 1.2974318722017992, + "learning_rate": 3.223808653563659e-05, + "loss": 0.6643, + "num_input_tokens_seen": 669680928, + "step": 3713 + }, + { + "epoch": 0.4065792714633679, + "grad_norm": 1.3872738868745347, + "learning_rate": 3.222985575645058e-05, + "loss": 0.7938, + "num_input_tokens_seen": 669882976, + "step": 3714 + }, + { + "epoch": 0.4066887435343058, + "grad_norm": 1.1899319892169435, + "learning_rate": 3.222162412198646e-05, + "loss": 0.7022, + "num_input_tokens_seen": 670060608, + "step": 3715 + }, + { + "epoch": 0.4067982156052437, + "grad_norm": 1.2406411292423316, + "learning_rate": 3.221339163321801e-05, + "loss": 0.6649, + "num_input_tokens_seen": 670197920, + "step": 3716 + }, + { + "epoch": 0.40690768767618163, + "grad_norm": 1.1157628292138513, + "learning_rate": 3.220515829111911e-05, + "loss": 0.6229, + "num_input_tokens_seen": 670403104, + "step": 3717 + }, + { + "epoch": 0.4070171597471195, + "grad_norm": 1.1632652368717944, + "learning_rate": 3.219692409666377e-05, + "loss": 0.8802, + "num_input_tokens_seen": 670611200, + "step": 3718 + }, + { + "epoch": 0.4071266318180574, + "grad_norm": 1.175329787054426, + "learning_rate": 3.218868905082606e-05, + "loss": 0.6061, + "num_input_tokens_seen": 670811680, + "step": 3719 + }, + { + "epoch": 0.40723610388899534, + "grad_norm": 1.2680709656643339, + "learning_rate": 3.218045315458018e-05, + "loss": 0.6817, + "num_input_tokens_seen": 670949888, + "step": 3720 + }, + { + "epoch": 0.4073455759599332, + "grad_norm": 1.2905873383206412, + "learning_rate": 3.2172216408900426e-05, + "loss": 0.6153, + "num_input_tokens_seen": 671121024, + "step": 3721 + }, + { + "epoch": 0.4074550480308711, + "grad_norm": 1.2963838681406983, + "learning_rate": 3.2163978814761174e-05, + "loss": 0.7762, + "num_input_tokens_seen": 671307840, + "step": 3722 + }, + { + "epoch": 0.40756452010180905, + "grad_norm": 1.2570227756060484, + "learning_rate": 3.215574037313692e-05, + "loss": 0.7713, + "num_input_tokens_seen": 671491968, + "step": 3723 + }, + { + "epoch": 0.4076739921727469, + "grad_norm": 1.2582187825708802, + "learning_rate": 3.214750108500227e-05, + "loss": 0.6724, + "num_input_tokens_seen": 671687072, + "step": 3724 + }, + { + "epoch": 0.40778346424368483, + "grad_norm": 1.2832533827974348, + "learning_rate": 3.21392609513319e-05, + "loss": 0.7907, + "num_input_tokens_seen": 671864032, + "step": 3725 + }, + { + "epoch": 0.40789293631462276, + "grad_norm": 1.2755488579401064, + "learning_rate": 3.21310199731006e-05, + "loss": 0.878, + "num_input_tokens_seen": 672034720, + "step": 3726 + }, + { + "epoch": 0.4080024083855606, + "grad_norm": 1.1215942541576578, + "learning_rate": 3.212277815128328e-05, + "loss": 0.5762, + "num_input_tokens_seen": 672210336, + "step": 3727 + }, + { + "epoch": 0.40811188045649854, + "grad_norm": 1.3624782438028582, + "learning_rate": 3.2114535486854915e-05, + "loss": 0.6817, + "num_input_tokens_seen": 672370944, + "step": 3728 + }, + { + "epoch": 0.40822135252743647, + "grad_norm": 1.5544024642278875, + "learning_rate": 3.210629198079061e-05, + "loss": 0.7638, + "num_input_tokens_seen": 672532672, + "step": 3729 + }, + { + "epoch": 0.40833082459837433, + "grad_norm": 1.3236214970362117, + "learning_rate": 3.209804763406554e-05, + "loss": 0.6333, + "num_input_tokens_seen": 672665056, + "step": 3730 + }, + { + "epoch": 0.40844029666931225, + "grad_norm": 1.2358532034077632, + "learning_rate": 3.2089802447655006e-05, + "loss": 0.7198, + "num_input_tokens_seen": 672848512, + "step": 3731 + }, + { + "epoch": 0.4085497687402501, + "grad_norm": 1.2264317728342489, + "learning_rate": 3.20815564225344e-05, + "loss": 0.7495, + "num_input_tokens_seen": 673031072, + "step": 3732 + }, + { + "epoch": 0.40865924081118804, + "grad_norm": 1.1294756446667837, + "learning_rate": 3.20733095596792e-05, + "loss": 0.6638, + "num_input_tokens_seen": 673205568, + "step": 3733 + }, + { + "epoch": 0.40876871288212596, + "grad_norm": 1.248930759883833, + "learning_rate": 3.2065061860065016e-05, + "loss": 0.809, + "num_input_tokens_seen": 673403584, + "step": 3734 + }, + { + "epoch": 0.40887818495306383, + "grad_norm": 1.341604027361958, + "learning_rate": 3.20568133246675e-05, + "loss": 0.7813, + "num_input_tokens_seen": 673571808, + "step": 3735 + }, + { + "epoch": 0.40898765702400175, + "grad_norm": 1.233231026470076, + "learning_rate": 3.204856395446247e-05, + "loss": 0.6224, + "num_input_tokens_seen": 673741152, + "step": 3736 + }, + { + "epoch": 0.40909712909493967, + "grad_norm": 1.303223662449265, + "learning_rate": 3.204031375042579e-05, + "loss": 0.6876, + "num_input_tokens_seen": 673935584, + "step": 3737 + }, + { + "epoch": 0.40920660116587754, + "grad_norm": 1.2418519532551822, + "learning_rate": 3.2032062713533464e-05, + "loss": 0.6662, + "num_input_tokens_seen": 674134944, + "step": 3738 + }, + { + "epoch": 0.40931607323681546, + "grad_norm": 1.173462669620729, + "learning_rate": 3.2023810844761554e-05, + "loss": 0.7518, + "num_input_tokens_seen": 674304736, + "step": 3739 + }, + { + "epoch": 0.4094255453077534, + "grad_norm": 1.1609401430739124, + "learning_rate": 3.201555814508626e-05, + "loss": 0.7462, + "num_input_tokens_seen": 674494016, + "step": 3740 + }, + { + "epoch": 0.40953501737869125, + "grad_norm": 1.21380624902034, + "learning_rate": 3.200730461548384e-05, + "loss": 0.7647, + "num_input_tokens_seen": 674651488, + "step": 3741 + }, + { + "epoch": 0.40964448944962917, + "grad_norm": 1.3235299448560751, + "learning_rate": 3.199905025693067e-05, + "loss": 0.6252, + "num_input_tokens_seen": 674807616, + "step": 3742 + }, + { + "epoch": 0.4097539615205671, + "grad_norm": 1.2536075509595557, + "learning_rate": 3.199079507040324e-05, + "loss": 0.8545, + "num_input_tokens_seen": 674978080, + "step": 3743 + }, + { + "epoch": 0.40986343359150496, + "grad_norm": 1.2971235749427192, + "learning_rate": 3.198253905687813e-05, + "loss": 0.6787, + "num_input_tokens_seen": 675174528, + "step": 3744 + }, + { + "epoch": 0.4099729056624429, + "grad_norm": 1.2107974555958887, + "learning_rate": 3.1974282217331985e-05, + "loss": 0.8089, + "num_input_tokens_seen": 675353280, + "step": 3745 + }, + { + "epoch": 0.4100823777333808, + "grad_norm": 1.3148607639383787, + "learning_rate": 3.1966024552741586e-05, + "loss": 0.8599, + "num_input_tokens_seen": 675527552, + "step": 3746 + }, + { + "epoch": 0.41019184980431866, + "grad_norm": 1.1335742888301648, + "learning_rate": 3.1957766064083804e-05, + "loss": 0.5155, + "num_input_tokens_seen": 675685248, + "step": 3747 + }, + { + "epoch": 0.4103013218752566, + "grad_norm": 1.3238335783512087, + "learning_rate": 3.19495067523356e-05, + "loss": 0.7574, + "num_input_tokens_seen": 675873856, + "step": 3748 + }, + { + "epoch": 0.41041079394619445, + "grad_norm": 1.3268699101095733, + "learning_rate": 3.194124661847403e-05, + "loss": 0.7892, + "num_input_tokens_seen": 676078144, + "step": 3749 + }, + { + "epoch": 0.4105202660171324, + "grad_norm": 1.2688358691242079, + "learning_rate": 3.193298566347625e-05, + "loss": 0.6695, + "num_input_tokens_seen": 676262272, + "step": 3750 + }, + { + "epoch": 0.4106297380880703, + "grad_norm": 1.403869903552485, + "learning_rate": 3.192472388831953e-05, + "loss": 0.7248, + "num_input_tokens_seen": 676450880, + "step": 3751 + }, + { + "epoch": 0.41073921015900816, + "grad_norm": 1.3596078519785786, + "learning_rate": 3.19164612939812e-05, + "loss": 0.6391, + "num_input_tokens_seen": 676593568, + "step": 3752 + }, + { + "epoch": 0.4108486822299461, + "grad_norm": 1.4216867678623575, + "learning_rate": 3.1908197881438727e-05, + "loss": 0.7536, + "num_input_tokens_seen": 676772096, + "step": 3753 + }, + { + "epoch": 0.410958154300884, + "grad_norm": 1.1508172738522093, + "learning_rate": 3.1899933651669656e-05, + "loss": 0.5998, + "num_input_tokens_seen": 676950848, + "step": 3754 + }, + { + "epoch": 0.41106762637182187, + "grad_norm": 1.3081985560121252, + "learning_rate": 3.1891668605651614e-05, + "loss": 0.6238, + "num_input_tokens_seen": 677095104, + "step": 3755 + }, + { + "epoch": 0.4111770984427598, + "grad_norm": 1.2416495351868317, + "learning_rate": 3.1883402744362355e-05, + "loss": 0.6728, + "num_input_tokens_seen": 677263328, + "step": 3756 + }, + { + "epoch": 0.4112865705136977, + "grad_norm": 1.2541749375198394, + "learning_rate": 3.1875136068779706e-05, + "loss": 0.5951, + "num_input_tokens_seen": 677422368, + "step": 3757 + }, + { + "epoch": 0.4113960425846356, + "grad_norm": 1.2753282330193825, + "learning_rate": 3.186686857988161e-05, + "loss": 0.8697, + "num_input_tokens_seen": 677618816, + "step": 3758 + }, + { + "epoch": 0.4115055146555735, + "grad_norm": 1.2943971240972438, + "learning_rate": 3.1858600278646084e-05, + "loss": 0.698, + "num_input_tokens_seen": 677778304, + "step": 3759 + }, + { + "epoch": 0.4116149867265114, + "grad_norm": 1.373061287208767, + "learning_rate": 3.185033116605126e-05, + "loss": 0.7378, + "num_input_tokens_seen": 677944736, + "step": 3760 + }, + { + "epoch": 0.4117244587974493, + "grad_norm": 1.2180029707059665, + "learning_rate": 3.1842061243075353e-05, + "loss": 0.5998, + "num_input_tokens_seen": 678141408, + "step": 3761 + }, + { + "epoch": 0.4118339308683872, + "grad_norm": 1.3001079019369015, + "learning_rate": 3.183379051069668e-05, + "loss": 0.7681, + "num_input_tokens_seen": 678320160, + "step": 3762 + }, + { + "epoch": 0.41194340293932513, + "grad_norm": 1.3777353648524762, + "learning_rate": 3.182551896989365e-05, + "loss": 0.8954, + "num_input_tokens_seen": 678518848, + "step": 3763 + }, + { + "epoch": 0.412052875010263, + "grad_norm": 1.2650076691014316, + "learning_rate": 3.181724662164478e-05, + "loss": 0.845, + "num_input_tokens_seen": 678717312, + "step": 3764 + }, + { + "epoch": 0.4121623470812009, + "grad_norm": 1.1963249356866241, + "learning_rate": 3.180897346692867e-05, + "loss": 0.6463, + "num_input_tokens_seen": 678915104, + "step": 3765 + }, + { + "epoch": 0.4122718191521388, + "grad_norm": 1.2951604240792942, + "learning_rate": 3.180069950672401e-05, + "loss": 0.6726, + "num_input_tokens_seen": 679072352, + "step": 3766 + }, + { + "epoch": 0.4123812912230767, + "grad_norm": 1.4345772095807228, + "learning_rate": 3.17924247420096e-05, + "loss": 0.8163, + "num_input_tokens_seen": 679246400, + "step": 3767 + }, + { + "epoch": 0.41249076329401463, + "grad_norm": 1.4259837385644152, + "learning_rate": 3.178414917376433e-05, + "loss": 0.8238, + "num_input_tokens_seen": 679406112, + "step": 3768 + }, + { + "epoch": 0.4126002353649525, + "grad_norm": 1.178477089479149, + "learning_rate": 3.1775872802967175e-05, + "loss": 0.6491, + "num_input_tokens_seen": 679558656, + "step": 3769 + }, + { + "epoch": 0.4127097074358904, + "grad_norm": 1.2404213111148943, + "learning_rate": 3.176759563059722e-05, + "loss": 0.6192, + "num_input_tokens_seen": 679711424, + "step": 3770 + }, + { + "epoch": 0.41281917950682834, + "grad_norm": 1.308207714019226, + "learning_rate": 3.175931765763365e-05, + "loss": 0.8721, + "num_input_tokens_seen": 679914368, + "step": 3771 + }, + { + "epoch": 0.4129286515777662, + "grad_norm": 1.368821132675364, + "learning_rate": 3.175103888505572e-05, + "loss": 0.7846, + "num_input_tokens_seen": 680093344, + "step": 3772 + }, + { + "epoch": 0.4130381236487041, + "grad_norm": 1.2374501613252615, + "learning_rate": 3.174275931384279e-05, + "loss": 0.7348, + "num_input_tokens_seen": 680287776, + "step": 3773 + }, + { + "epoch": 0.41314759571964205, + "grad_norm": 1.4302662162801627, + "learning_rate": 3.173447894497433e-05, + "loss": 0.8077, + "num_input_tokens_seen": 680461600, + "step": 3774 + }, + { + "epoch": 0.4132570677905799, + "grad_norm": 1.2892069880108843, + "learning_rate": 3.172619777942988e-05, + "loss": 0.5736, + "num_input_tokens_seen": 680650208, + "step": 3775 + }, + { + "epoch": 0.41336653986151783, + "grad_norm": 1.297205365856764, + "learning_rate": 3.1717915818189095e-05, + "loss": 0.8898, + "num_input_tokens_seen": 680848000, + "step": 3776 + }, + { + "epoch": 0.41347601193245576, + "grad_norm": 1.4356139705833806, + "learning_rate": 3.17096330622317e-05, + "loss": 0.6552, + "num_input_tokens_seen": 681023168, + "step": 3777 + }, + { + "epoch": 0.4135854840033936, + "grad_norm": 1.1979643383768332, + "learning_rate": 3.170134951253755e-05, + "loss": 0.5491, + "num_input_tokens_seen": 681224096, + "step": 3778 + }, + { + "epoch": 0.41369495607433154, + "grad_norm": 1.3344274095163873, + "learning_rate": 3.169306517008656e-05, + "loss": 0.7226, + "num_input_tokens_seen": 681381120, + "step": 3779 + }, + { + "epoch": 0.41380442814526947, + "grad_norm": 1.2532811968862487, + "learning_rate": 3.168478003585876e-05, + "loss": 0.7341, + "num_input_tokens_seen": 681590336, + "step": 3780 + }, + { + "epoch": 0.41391390021620733, + "grad_norm": 1.3379898368637635, + "learning_rate": 3.167649411083425e-05, + "loss": 0.6995, + "num_input_tokens_seen": 681790144, + "step": 3781 + }, + { + "epoch": 0.41402337228714525, + "grad_norm": 1.1666173356843759, + "learning_rate": 3.1668207395993265e-05, + "loss": 0.7427, + "num_input_tokens_seen": 681982112, + "step": 3782 + }, + { + "epoch": 0.4141328443580831, + "grad_norm": 1.3434834770433088, + "learning_rate": 3.1659919892316084e-05, + "loss": 0.8465, + "num_input_tokens_seen": 682182816, + "step": 3783 + }, + { + "epoch": 0.41424231642902104, + "grad_norm": 1.278565828367422, + "learning_rate": 3.1651631600783114e-05, + "loss": 0.7725, + "num_input_tokens_seen": 682361792, + "step": 3784 + }, + { + "epoch": 0.41435178849995896, + "grad_norm": 1.2817793427202584, + "learning_rate": 3.164334252237484e-05, + "loss": 0.8713, + "num_input_tokens_seen": 682547712, + "step": 3785 + }, + { + "epoch": 0.41446126057089683, + "grad_norm": 1.2672581908126306, + "learning_rate": 3.163505265807185e-05, + "loss": 0.6638, + "num_input_tokens_seen": 682734976, + "step": 3786 + }, + { + "epoch": 0.41457073264183475, + "grad_norm": 1.232367367755732, + "learning_rate": 3.162676200885481e-05, + "loss": 0.584, + "num_input_tokens_seen": 682907680, + "step": 3787 + }, + { + "epoch": 0.41468020471277267, + "grad_norm": 1.222483321078053, + "learning_rate": 3.161847057570449e-05, + "loss": 0.7333, + "num_input_tokens_seen": 683095392, + "step": 3788 + }, + { + "epoch": 0.41478967678371054, + "grad_norm": 1.2031435779276642, + "learning_rate": 3.161017835960176e-05, + "loss": 0.6066, + "num_input_tokens_seen": 683264736, + "step": 3789 + }, + { + "epoch": 0.41489914885464846, + "grad_norm": 1.374412497806458, + "learning_rate": 3.160188536152756e-05, + "loss": 0.6242, + "num_input_tokens_seen": 683442592, + "step": 3790 + }, + { + "epoch": 0.4150086209255864, + "grad_norm": 1.2672925724442494, + "learning_rate": 3.159359158246294e-05, + "loss": 0.7106, + "num_input_tokens_seen": 683619776, + "step": 3791 + }, + { + "epoch": 0.41511809299652425, + "grad_norm": 1.2280992248004658, + "learning_rate": 3.158529702338905e-05, + "loss": 0.7449, + "num_input_tokens_seen": 683815328, + "step": 3792 + }, + { + "epoch": 0.41522756506746217, + "grad_norm": 1.353922112708214, + "learning_rate": 3.157700168528711e-05, + "loss": 0.6822, + "num_input_tokens_seen": 683987136, + "step": 3793 + }, + { + "epoch": 0.4153370371384001, + "grad_norm": 1.3544057146012667, + "learning_rate": 3.156870556913844e-05, + "loss": 0.7657, + "num_input_tokens_seen": 684166560, + "step": 3794 + }, + { + "epoch": 0.41544650920933796, + "grad_norm": 1.3371173935297884, + "learning_rate": 3.156040867592446e-05, + "loss": 0.7369, + "num_input_tokens_seen": 684366368, + "step": 3795 + }, + { + "epoch": 0.4155559812802759, + "grad_norm": 1.3591258818266474, + "learning_rate": 3.155211100662668e-05, + "loss": 0.7325, + "num_input_tokens_seen": 684557888, + "step": 3796 + }, + { + "epoch": 0.4156654533512138, + "grad_norm": 1.2405412909951854, + "learning_rate": 3.1543812562226685e-05, + "loss": 0.6887, + "num_input_tokens_seen": 684743808, + "step": 3797 + }, + { + "epoch": 0.41577492542215166, + "grad_norm": 1.2211601683454714, + "learning_rate": 3.153551334370617e-05, + "loss": 0.7659, + "num_input_tokens_seen": 684955264, + "step": 3798 + }, + { + "epoch": 0.4158843974930896, + "grad_norm": 1.2243563498633907, + "learning_rate": 3.152721335204693e-05, + "loss": 0.8113, + "num_input_tokens_seen": 685160448, + "step": 3799 + }, + { + "epoch": 0.41599386956402745, + "grad_norm": 1.2952961330507178, + "learning_rate": 3.151891258823082e-05, + "loss": 0.8624, + "num_input_tokens_seen": 685354880, + "step": 3800 + }, + { + "epoch": 0.4161033416349654, + "grad_norm": 1.2315551321626708, + "learning_rate": 3.151061105323982e-05, + "loss": 0.6928, + "num_input_tokens_seen": 685527808, + "step": 3801 + }, + { + "epoch": 0.4162128137059033, + "grad_norm": 1.1343309420182002, + "learning_rate": 3.1502308748055975e-05, + "loss": 0.4996, + "num_input_tokens_seen": 685706112, + "step": 3802 + }, + { + "epoch": 0.41632228577684116, + "grad_norm": 1.3676365578095673, + "learning_rate": 3.1494005673661445e-05, + "loss": 0.9452, + "num_input_tokens_seen": 685876800, + "step": 3803 + }, + { + "epoch": 0.4164317578477791, + "grad_norm": 1.268061940828319, + "learning_rate": 3.1485701831038436e-05, + "loss": 0.885, + "num_input_tokens_seen": 686070112, + "step": 3804 + }, + { + "epoch": 0.416541229918717, + "grad_norm": 1.311615964078412, + "learning_rate": 3.147739722116932e-05, + "loss": 0.6188, + "num_input_tokens_seen": 686239456, + "step": 3805 + }, + { + "epoch": 0.41665070198965487, + "grad_norm": 1.1701517277336169, + "learning_rate": 3.1469091845036486e-05, + "loss": 0.6699, + "num_input_tokens_seen": 686429632, + "step": 3806 + }, + { + "epoch": 0.4167601740605928, + "grad_norm": 1.2355745016380864, + "learning_rate": 3.146078570362246e-05, + "loss": 0.6324, + "num_input_tokens_seen": 686583296, + "step": 3807 + }, + { + "epoch": 0.4168696461315307, + "grad_norm": 1.3562656665371793, + "learning_rate": 3.145247879790983e-05, + "loss": 0.6235, + "num_input_tokens_seen": 686768544, + "step": 3808 + }, + { + "epoch": 0.4169791182024686, + "grad_norm": 1.3393884891857848, + "learning_rate": 3.1444171128881294e-05, + "loss": 0.6675, + "num_input_tokens_seen": 686935424, + "step": 3809 + }, + { + "epoch": 0.4170885902734065, + "grad_norm": 1.2700907721332393, + "learning_rate": 3.1435862697519636e-05, + "loss": 0.7779, + "num_input_tokens_seen": 687126272, + "step": 3810 + }, + { + "epoch": 0.4171980623443444, + "grad_norm": 1.466209730943331, + "learning_rate": 3.142755350480772e-05, + "loss": 0.949, + "num_input_tokens_seen": 687296064, + "step": 3811 + }, + { + "epoch": 0.4173075344152823, + "grad_norm": 1.1916737740755983, + "learning_rate": 3.1419243551728513e-05, + "loss": 0.5468, + "num_input_tokens_seen": 687445472, + "step": 3812 + }, + { + "epoch": 0.4174170064862202, + "grad_norm": 1.140336561965795, + "learning_rate": 3.141093283926506e-05, + "loss": 0.5563, + "num_input_tokens_seen": 687612352, + "step": 3813 + }, + { + "epoch": 0.41752647855715813, + "grad_norm": 1.1990870798001045, + "learning_rate": 3.140262136840052e-05, + "loss": 0.6637, + "num_input_tokens_seen": 687798720, + "step": 3814 + }, + { + "epoch": 0.417635950628096, + "grad_norm": 1.2610962457009458, + "learning_rate": 3.1394309140118104e-05, + "loss": 0.6875, + "num_input_tokens_seen": 688012416, + "step": 3815 + }, + { + "epoch": 0.4177454226990339, + "grad_norm": 1.443080271163599, + "learning_rate": 3.138599615540114e-05, + "loss": 0.783, + "num_input_tokens_seen": 688170336, + "step": 3816 + }, + { + "epoch": 0.4178548947699718, + "grad_norm": 1.3905374306527847, + "learning_rate": 3.137768241523305e-05, + "loss": 0.7636, + "num_input_tokens_seen": 688346624, + "step": 3817 + }, + { + "epoch": 0.4179643668409097, + "grad_norm": 1.3571593545326874, + "learning_rate": 3.1369367920597306e-05, + "loss": 0.7487, + "num_input_tokens_seen": 688548000, + "step": 3818 + }, + { + "epoch": 0.41807383891184763, + "grad_norm": 1.1211724336771742, + "learning_rate": 3.136105267247752e-05, + "loss": 0.6415, + "num_input_tokens_seen": 688703232, + "step": 3819 + }, + { + "epoch": 0.4181833109827855, + "grad_norm": 1.3394508786765302, + "learning_rate": 3.1352736671857366e-05, + "loss": 0.6731, + "num_input_tokens_seen": 688902816, + "step": 3820 + }, + { + "epoch": 0.4182927830537234, + "grad_norm": 1.2851878802920358, + "learning_rate": 3.13444199197206e-05, + "loss": 0.7449, + "num_input_tokens_seen": 689065440, + "step": 3821 + }, + { + "epoch": 0.41840225512466134, + "grad_norm": 1.1792893400959237, + "learning_rate": 3.13361024170511e-05, + "loss": 0.7958, + "num_input_tokens_seen": 689287424, + "step": 3822 + }, + { + "epoch": 0.4185117271955992, + "grad_norm": 1.3128896770570346, + "learning_rate": 3.1327784164832786e-05, + "loss": 0.7449, + "num_input_tokens_seen": 689496864, + "step": 3823 + }, + { + "epoch": 0.4186211992665371, + "grad_norm": 1.0455960104526916, + "learning_rate": 3.13194651640497e-05, + "loss": 0.4806, + "num_input_tokens_seen": 689688160, + "step": 3824 + }, + { + "epoch": 0.41873067133747505, + "grad_norm": 1.306634101794372, + "learning_rate": 3.1311145415685975e-05, + "loss": 0.7647, + "num_input_tokens_seen": 689836672, + "step": 3825 + }, + { + "epoch": 0.4188401434084129, + "grad_norm": 1.2205734794435803, + "learning_rate": 3.13028249207258e-05, + "loss": 0.6736, + "num_input_tokens_seen": 690042080, + "step": 3826 + }, + { + "epoch": 0.41894961547935083, + "grad_norm": 1.180235830033271, + "learning_rate": 3.1294503680153496e-05, + "loss": 0.6007, + "num_input_tokens_seen": 690237408, + "step": 3827 + }, + { + "epoch": 0.41905908755028876, + "grad_norm": 1.3371235451377905, + "learning_rate": 3.128618169495344e-05, + "loss": 0.7314, + "num_input_tokens_seen": 690433408, + "step": 3828 + }, + { + "epoch": 0.4191685596212266, + "grad_norm": 1.5299680364922623, + "learning_rate": 3.1277858966110105e-05, + "loss": 0.7765, + "num_input_tokens_seen": 690605440, + "step": 3829 + }, + { + "epoch": 0.41927803169216454, + "grad_norm": 1.1924908313358764, + "learning_rate": 3.126953549460805e-05, + "loss": 0.7089, + "num_input_tokens_seen": 690805920, + "step": 3830 + }, + { + "epoch": 0.41938750376310246, + "grad_norm": 1.2325115382832499, + "learning_rate": 3.126121128143194e-05, + "loss": 0.7027, + "num_input_tokens_seen": 690992960, + "step": 3831 + }, + { + "epoch": 0.41949697583404033, + "grad_norm": 1.3997969599356561, + "learning_rate": 3.1252886327566494e-05, + "loss": 0.9494, + "num_input_tokens_seen": 691183136, + "step": 3832 + }, + { + "epoch": 0.41960644790497825, + "grad_norm": 1.2079369070079573, + "learning_rate": 3.124456063399656e-05, + "loss": 0.5754, + "num_input_tokens_seen": 691346656, + "step": 3833 + }, + { + "epoch": 0.4197159199759161, + "grad_norm": 1.1382923573388612, + "learning_rate": 3.123623420170703e-05, + "loss": 0.7342, + "num_input_tokens_seen": 691525184, + "step": 3834 + }, + { + "epoch": 0.41982539204685404, + "grad_norm": 1.312186112790375, + "learning_rate": 3.122790703168292e-05, + "loss": 0.8273, + "num_input_tokens_seen": 691719392, + "step": 3835 + }, + { + "epoch": 0.41993486411779196, + "grad_norm": 1.2802455034853026, + "learning_rate": 3.1219579124909324e-05, + "loss": 0.6077, + "num_input_tokens_seen": 691919424, + "step": 3836 + }, + { + "epoch": 0.4200443361887298, + "grad_norm": 1.2586547685225138, + "learning_rate": 3.121125048237139e-05, + "loss": 0.6405, + "num_input_tokens_seen": 692089664, + "step": 3837 + }, + { + "epoch": 0.42015380825966775, + "grad_norm": 1.3402249430694508, + "learning_rate": 3.120292110505441e-05, + "loss": 0.7296, + "num_input_tokens_seen": 692274464, + "step": 3838 + }, + { + "epoch": 0.42026328033060567, + "grad_norm": 1.3637396361519276, + "learning_rate": 3.119459099394372e-05, + "loss": 0.8796, + "num_input_tokens_seen": 692481216, + "step": 3839 + }, + { + "epoch": 0.42037275240154354, + "grad_norm": 1.173645360749116, + "learning_rate": 3.1186260150024755e-05, + "loss": 0.5522, + "num_input_tokens_seen": 692651456, + "step": 3840 + }, + { + "epoch": 0.42048222447248146, + "grad_norm": 1.172178610572475, + "learning_rate": 3.117792857428304e-05, + "loss": 0.5941, + "num_input_tokens_seen": 692840960, + "step": 3841 + }, + { + "epoch": 0.4205916965434194, + "grad_norm": 1.342365971557708, + "learning_rate": 3.116959626770418e-05, + "loss": 0.5999, + "num_input_tokens_seen": 693013664, + "step": 3842 + }, + { + "epoch": 0.42070116861435725, + "grad_norm": 1.2732565815663928, + "learning_rate": 3.1161263231273884e-05, + "loss": 0.7621, + "num_input_tokens_seen": 693204960, + "step": 3843 + }, + { + "epoch": 0.42081064068529517, + "grad_norm": 1.2694574422655487, + "learning_rate": 3.115292946597793e-05, + "loss": 0.7009, + "num_input_tokens_seen": 693371392, + "step": 3844 + }, + { + "epoch": 0.4209201127562331, + "grad_norm": 1.242370256139509, + "learning_rate": 3.1144594972802165e-05, + "loss": 0.6409, + "num_input_tokens_seen": 693544096, + "step": 3845 + }, + { + "epoch": 0.42102958482717096, + "grad_norm": 1.308850755647869, + "learning_rate": 3.1136259752732576e-05, + "loss": 0.9093, + "num_input_tokens_seen": 693732704, + "step": 3846 + }, + { + "epoch": 0.4211390568981089, + "grad_norm": 1.4479777737170936, + "learning_rate": 3.112792380675519e-05, + "loss": 0.6878, + "num_input_tokens_seen": 693868000, + "step": 3847 + }, + { + "epoch": 0.4212485289690468, + "grad_norm": 1.1986026734641673, + "learning_rate": 3.111958713585612e-05, + "loss": 0.5781, + "num_input_tokens_seen": 694052800, + "step": 3848 + }, + { + "epoch": 0.42135800103998466, + "grad_norm": 1.2274633610340349, + "learning_rate": 3.1111249741021606e-05, + "loss": 0.6043, + "num_input_tokens_seen": 694238272, + "step": 3849 + }, + { + "epoch": 0.4214674731109226, + "grad_norm": 1.159104920878861, + "learning_rate": 3.110291162323792e-05, + "loss": 0.5407, + "num_input_tokens_seen": 694416128, + "step": 3850 + }, + { + "epoch": 0.42157694518186045, + "grad_norm": 1.1414796423113907, + "learning_rate": 3.109457278349145e-05, + "loss": 0.678, + "num_input_tokens_seen": 694615040, + "step": 3851 + }, + { + "epoch": 0.4216864172527984, + "grad_norm": 1.2230867840503652, + "learning_rate": 3.108623322276868e-05, + "loss": 0.6687, + "num_input_tokens_seen": 694802080, + "step": 3852 + }, + { + "epoch": 0.4217958893237363, + "grad_norm": 1.292391507005782, + "learning_rate": 3.1077892942056153e-05, + "loss": 0.657, + "num_input_tokens_seen": 694962912, + "step": 3853 + }, + { + "epoch": 0.42190536139467416, + "grad_norm": 1.2402713319573686, + "learning_rate": 3.106955194234051e-05, + "loss": 0.7479, + "num_input_tokens_seen": 695140544, + "step": 3854 + }, + { + "epoch": 0.4220148334656121, + "grad_norm": 1.191557977739342, + "learning_rate": 3.106121022460847e-05, + "loss": 0.6215, + "num_input_tokens_seen": 695310336, + "step": 3855 + }, + { + "epoch": 0.42212430553655, + "grad_norm": 1.2234410701227045, + "learning_rate": 3.105286778984686e-05, + "loss": 0.7586, + "num_input_tokens_seen": 695505664, + "step": 3856 + }, + { + "epoch": 0.42223377760748787, + "grad_norm": 1.3287449007124856, + "learning_rate": 3.104452463904255e-05, + "loss": 0.8825, + "num_input_tokens_seen": 695699424, + "step": 3857 + }, + { + "epoch": 0.4223432496784258, + "grad_norm": 1.258828315685132, + "learning_rate": 3.1036180773182535e-05, + "loss": 0.676, + "num_input_tokens_seen": 695893632, + "step": 3858 + }, + { + "epoch": 0.4224527217493637, + "grad_norm": 1.3428931498482986, + "learning_rate": 3.1027836193253874e-05, + "loss": 0.6713, + "num_input_tokens_seen": 696073504, + "step": 3859 + }, + { + "epoch": 0.4225621938203016, + "grad_norm": 1.4104294868836968, + "learning_rate": 3.1019490900243716e-05, + "loss": 0.7296, + "num_input_tokens_seen": 696268832, + "step": 3860 + }, + { + "epoch": 0.4226716658912395, + "grad_norm": 1.314871280810328, + "learning_rate": 3.101114489513929e-05, + "loss": 0.8261, + "num_input_tokens_seen": 696430784, + "step": 3861 + }, + { + "epoch": 0.4227811379621774, + "grad_norm": 1.3064779456898048, + "learning_rate": 3.100279817892792e-05, + "loss": 0.7216, + "num_input_tokens_seen": 696604160, + "step": 3862 + }, + { + "epoch": 0.4228906100331153, + "grad_norm": 1.3049590233694466, + "learning_rate": 3.099445075259698e-05, + "loss": 0.7855, + "num_input_tokens_seen": 696777984, + "step": 3863 + }, + { + "epoch": 0.4230000821040532, + "grad_norm": 1.3827774129607975, + "learning_rate": 3.098610261713399e-05, + "loss": 0.7956, + "num_input_tokens_seen": 696976448, + "step": 3864 + }, + { + "epoch": 0.42310955417499113, + "grad_norm": 1.3228366357019483, + "learning_rate": 3.0977753773526505e-05, + "loss": 0.7307, + "num_input_tokens_seen": 697142880, + "step": 3865 + }, + { + "epoch": 0.423219026245929, + "grad_norm": 1.4476937138361865, + "learning_rate": 3.096940422276218e-05, + "loss": 0.9007, + "num_input_tokens_seen": 697370464, + "step": 3866 + }, + { + "epoch": 0.4233284983168669, + "grad_norm": 1.2323842957294346, + "learning_rate": 3.096105396582874e-05, + "loss": 0.6706, + "num_input_tokens_seen": 697506880, + "step": 3867 + }, + { + "epoch": 0.4234379703878048, + "grad_norm": 1.361494888752769, + "learning_rate": 3.095270300371401e-05, + "loss": 0.8648, + "num_input_tokens_seen": 697676672, + "step": 3868 + }, + { + "epoch": 0.4235474424587427, + "grad_norm": 1.416646377567785, + "learning_rate": 3.0944351337405906e-05, + "loss": 0.899, + "num_input_tokens_seen": 697852064, + "step": 3869 + }, + { + "epoch": 0.42365691452968063, + "grad_norm": 1.3564654376938017, + "learning_rate": 3.09359989678924e-05, + "loss": 0.7229, + "num_input_tokens_seen": 698024992, + "step": 3870 + }, + { + "epoch": 0.4237663866006185, + "grad_norm": 1.4151870723845523, + "learning_rate": 3.092764589616155e-05, + "loss": 0.797, + "num_input_tokens_seen": 698227264, + "step": 3871 + }, + { + "epoch": 0.4238758586715564, + "grad_norm": 1.3073091493124018, + "learning_rate": 3.0919292123201524e-05, + "loss": 0.7225, + "num_input_tokens_seen": 698429312, + "step": 3872 + }, + { + "epoch": 0.42398533074249434, + "grad_norm": 1.3138804669118107, + "learning_rate": 3.0910937650000565e-05, + "loss": 0.9548, + "num_input_tokens_seen": 698615904, + "step": 3873 + }, + { + "epoch": 0.4240948028134322, + "grad_norm": 1.3220714612480466, + "learning_rate": 3.090258247754698e-05, + "loss": 0.7437, + "num_input_tokens_seen": 698752992, + "step": 3874 + }, + { + "epoch": 0.4242042748843701, + "grad_norm": 1.2446880483662026, + "learning_rate": 3.0894226606829166e-05, + "loss": 0.6731, + "num_input_tokens_seen": 698941824, + "step": 3875 + }, + { + "epoch": 0.42431374695530805, + "grad_norm": 1.3153223391813134, + "learning_rate": 3.088587003883562e-05, + "loss": 0.783, + "num_input_tokens_seen": 699101088, + "step": 3876 + }, + { + "epoch": 0.4244232190262459, + "grad_norm": 1.3044033540605964, + "learning_rate": 3.08775127745549e-05, + "loss": 0.6962, + "num_input_tokens_seen": 699287680, + "step": 3877 + }, + { + "epoch": 0.42453269109718383, + "grad_norm": 1.2418716796922766, + "learning_rate": 3.086915481497565e-05, + "loss": 0.7134, + "num_input_tokens_seen": 699472480, + "step": 3878 + }, + { + "epoch": 0.42464216316812176, + "grad_norm": 1.3093686670409665, + "learning_rate": 3.08607961610866e-05, + "loss": 0.9097, + "num_input_tokens_seen": 699663776, + "step": 3879 + }, + { + "epoch": 0.4247516352390596, + "grad_norm": 1.1960626380108235, + "learning_rate": 3.0852436813876576e-05, + "loss": 0.7816, + "num_input_tokens_seen": 699862016, + "step": 3880 + }, + { + "epoch": 0.42486110730999754, + "grad_norm": 1.217117193931393, + "learning_rate": 3.084407677433447e-05, + "loss": 0.5213, + "num_input_tokens_seen": 700039872, + "step": 3881 + }, + { + "epoch": 0.42497057938093546, + "grad_norm": 1.3225526033406803, + "learning_rate": 3.083571604344925e-05, + "loss": 0.8626, + "num_input_tokens_seen": 700217952, + "step": 3882 + }, + { + "epoch": 0.42508005145187333, + "grad_norm": 1.3837132510549357, + "learning_rate": 3.0827354622209976e-05, + "loss": 0.6128, + "num_input_tokens_seen": 700375648, + "step": 3883 + }, + { + "epoch": 0.42518952352281125, + "grad_norm": 1.3587993850085618, + "learning_rate": 3.081899251160578e-05, + "loss": 0.8852, + "num_input_tokens_seen": 700551936, + "step": 3884 + }, + { + "epoch": 0.4252989955937491, + "grad_norm": 1.3215726899020062, + "learning_rate": 3.081062971262591e-05, + "loss": 0.9995, + "num_input_tokens_seen": 700739200, + "step": 3885 + }, + { + "epoch": 0.42540846766468704, + "grad_norm": 1.203041519637667, + "learning_rate": 3.080226622625964e-05, + "loss": 0.765, + "num_input_tokens_seen": 700926912, + "step": 3886 + }, + { + "epoch": 0.42551793973562496, + "grad_norm": 1.1605860880387218, + "learning_rate": 3.0793902053496374e-05, + "loss": 0.5215, + "num_input_tokens_seen": 701108576, + "step": 3887 + }, + { + "epoch": 0.4256274118065628, + "grad_norm": 1.380093287825461, + "learning_rate": 3.0785537195325574e-05, + "loss": 0.8893, + "num_input_tokens_seen": 701314208, + "step": 3888 + }, + { + "epoch": 0.42573688387750075, + "grad_norm": 1.1941389918043195, + "learning_rate": 3.0777171652736784e-05, + "loss": 0.7126, + "num_input_tokens_seen": 701495872, + "step": 3889 + }, + { + "epoch": 0.42584635594843867, + "grad_norm": 1.084369652814626, + "learning_rate": 3.076880542671963e-05, + "loss": 0.5699, + "num_input_tokens_seen": 701705088, + "step": 3890 + }, + { + "epoch": 0.42595582801937654, + "grad_norm": 1.1709889431829674, + "learning_rate": 3.0760438518263826e-05, + "loss": 0.7596, + "num_input_tokens_seen": 701872416, + "step": 3891 + }, + { + "epoch": 0.42606530009031446, + "grad_norm": 1.240098840711944, + "learning_rate": 3.0752070928359147e-05, + "loss": 0.7499, + "num_input_tokens_seen": 702043104, + "step": 3892 + }, + { + "epoch": 0.4261747721612524, + "grad_norm": 1.1884910062289942, + "learning_rate": 3.0743702657995475e-05, + "loss": 0.7803, + "num_input_tokens_seen": 702218272, + "step": 3893 + }, + { + "epoch": 0.42628424423219025, + "grad_norm": 1.0305728423070255, + "learning_rate": 3.0735333708162763e-05, + "loss": 0.462, + "num_input_tokens_seen": 702406432, + "step": 3894 + }, + { + "epoch": 0.42639371630312817, + "grad_norm": 1.2433499349366985, + "learning_rate": 3.0726964079851037e-05, + "loss": 0.568, + "num_input_tokens_seen": 702590112, + "step": 3895 + }, + { + "epoch": 0.4265031883740661, + "grad_norm": 1.292951186138612, + "learning_rate": 3.071859377405041e-05, + "loss": 0.8496, + "num_input_tokens_seen": 702793280, + "step": 3896 + }, + { + "epoch": 0.42661266044500396, + "grad_norm": 1.30482687748171, + "learning_rate": 3.071022279175107e-05, + "loss": 0.8331, + "num_input_tokens_seen": 702982560, + "step": 3897 + }, + { + "epoch": 0.4267221325159419, + "grad_norm": 1.17940335925463, + "learning_rate": 3.070185113394329e-05, + "loss": 0.6637, + "num_input_tokens_seen": 703158400, + "step": 3898 + }, + { + "epoch": 0.4268316045868798, + "grad_norm": 1.1938862488170474, + "learning_rate": 3.069347880161741e-05, + "loss": 0.5959, + "num_input_tokens_seen": 703340288, + "step": 3899 + }, + { + "epoch": 0.42694107665781766, + "grad_norm": 1.1990373207055325, + "learning_rate": 3.068510579576389e-05, + "loss": 0.6411, + "num_input_tokens_seen": 703516576, + "step": 3900 + }, + { + "epoch": 0.4270505487287556, + "grad_norm": 1.274046910040627, + "learning_rate": 3.067673211737321e-05, + "loss": 0.6541, + "num_input_tokens_seen": 703659488, + "step": 3901 + }, + { + "epoch": 0.42716002079969345, + "grad_norm": 1.370556196163567, + "learning_rate": 3.066835776743598e-05, + "loss": 0.7273, + "num_input_tokens_seen": 703854368, + "step": 3902 + }, + { + "epoch": 0.4272694928706314, + "grad_norm": 1.2683866065003602, + "learning_rate": 3.0659982746942864e-05, + "loss": 0.5986, + "num_input_tokens_seen": 704032672, + "step": 3903 + }, + { + "epoch": 0.4273789649415693, + "grad_norm": 1.2345633061347028, + "learning_rate": 3.065160705688461e-05, + "loss": 0.7118, + "num_input_tokens_seen": 704210080, + "step": 3904 + }, + { + "epoch": 0.42748843701250716, + "grad_norm": 1.2798492403952346, + "learning_rate": 3.064323069825203e-05, + "loss": 0.7934, + "num_input_tokens_seen": 704389728, + "step": 3905 + }, + { + "epoch": 0.4275979090834451, + "grad_norm": 1.2411552794888476, + "learning_rate": 3.0634853672036054e-05, + "loss": 0.764, + "num_input_tokens_seen": 704595360, + "step": 3906 + }, + { + "epoch": 0.427707381154383, + "grad_norm": 1.2932542560595113, + "learning_rate": 3.0626475979227665e-05, + "loss": 0.7848, + "num_input_tokens_seen": 704781504, + "step": 3907 + }, + { + "epoch": 0.42781685322532087, + "grad_norm": 1.3517610701736817, + "learning_rate": 3.061809762081792e-05, + "loss": 0.923, + "num_input_tokens_seen": 704993184, + "step": 3908 + }, + { + "epoch": 0.4279263252962588, + "grad_norm": 1.1514581413031917, + "learning_rate": 3.060971859779797e-05, + "loss": 0.7028, + "num_input_tokens_seen": 705187168, + "step": 3909 + }, + { + "epoch": 0.4280357973671967, + "grad_norm": 1.1675227407654498, + "learning_rate": 3.060133891115903e-05, + "loss": 0.8451, + "num_input_tokens_seen": 705384960, + "step": 3910 + }, + { + "epoch": 0.4281452694381346, + "grad_norm": 1.1889117031245584, + "learning_rate": 3.059295856189241e-05, + "loss": 0.5343, + "num_input_tokens_seen": 705570432, + "step": 3911 + }, + { + "epoch": 0.4282547415090725, + "grad_norm": 1.3304009178910843, + "learning_rate": 3.058457755098948e-05, + "loss": 0.9712, + "num_input_tokens_seen": 705762176, + "step": 3912 + }, + { + "epoch": 0.4283642135800104, + "grad_norm": 1.2746928761399963, + "learning_rate": 3.05761958794417e-05, + "loss": 0.5657, + "num_input_tokens_seen": 705937120, + "step": 3913 + }, + { + "epoch": 0.4284736856509483, + "grad_norm": 1.1906548804190689, + "learning_rate": 3.056781354824061e-05, + "loss": 0.6916, + "num_input_tokens_seen": 706093920, + "step": 3914 + }, + { + "epoch": 0.4285831577218862, + "grad_norm": 1.2578004635063162, + "learning_rate": 3.055943055837782e-05, + "loss": 0.6702, + "num_input_tokens_seen": 706283872, + "step": 3915 + }, + { + "epoch": 0.42869262979282413, + "grad_norm": 1.3830000324566465, + "learning_rate": 3.055104691084502e-05, + "loss": 0.7949, + "num_input_tokens_seen": 706478752, + "step": 3916 + }, + { + "epoch": 0.428802101863762, + "grad_norm": 1.3795574692898094, + "learning_rate": 3.054266260663399e-05, + "loss": 0.7404, + "num_input_tokens_seen": 706641152, + "step": 3917 + }, + { + "epoch": 0.4289115739346999, + "grad_norm": 1.1760592718076537, + "learning_rate": 3.0534277646736564e-05, + "loss": 0.5111, + "num_input_tokens_seen": 706818336, + "step": 3918 + }, + { + "epoch": 0.4290210460056378, + "grad_norm": 1.3898727010849778, + "learning_rate": 3.052589203214467e-05, + "loss": 0.7979, + "num_input_tokens_seen": 706994624, + "step": 3919 + }, + { + "epoch": 0.4291305180765757, + "grad_norm": 1.4798766388902178, + "learning_rate": 3.0517505763850318e-05, + "loss": 0.7122, + "num_input_tokens_seen": 707141568, + "step": 3920 + }, + { + "epoch": 0.42923999014751363, + "grad_norm": 1.3202052326756917, + "learning_rate": 3.050911884284558e-05, + "loss": 0.5327, + "num_input_tokens_seen": 707289632, + "step": 3921 + }, + { + "epoch": 0.4293494622184515, + "grad_norm": 1.1790054572790416, + "learning_rate": 3.050073127012261e-05, + "loss": 0.4822, + "num_input_tokens_seen": 707471968, + "step": 3922 + }, + { + "epoch": 0.4294589342893894, + "grad_norm": 1.3675632941262599, + "learning_rate": 3.0492343046673654e-05, + "loss": 0.7077, + "num_input_tokens_seen": 707663936, + "step": 3923 + }, + { + "epoch": 0.42956840636032734, + "grad_norm": 1.3425297186490632, + "learning_rate": 3.0483954173491015e-05, + "loss": 0.6574, + "num_input_tokens_seen": 707855904, + "step": 3924 + }, + { + "epoch": 0.4296778784312652, + "grad_norm": 1.3368602042429405, + "learning_rate": 3.047556465156708e-05, + "loss": 0.6901, + "num_input_tokens_seen": 708031744, + "step": 3925 + }, + { + "epoch": 0.4297873505022031, + "grad_norm": 1.2286517578775606, + "learning_rate": 3.04671744818943e-05, + "loss": 0.612, + "num_input_tokens_seen": 708167264, + "step": 3926 + }, + { + "epoch": 0.42989682257314105, + "grad_norm": 1.3186673631132684, + "learning_rate": 3.045878366546524e-05, + "loss": 0.7541, + "num_input_tokens_seen": 708370208, + "step": 3927 + }, + { + "epoch": 0.4300062946440789, + "grad_norm": 1.3961021060191885, + "learning_rate": 3.045039220327251e-05, + "loss": 0.7001, + "num_input_tokens_seen": 708533504, + "step": 3928 + }, + { + "epoch": 0.43011576671501683, + "grad_norm": 1.3270857840395127, + "learning_rate": 3.0442000096308802e-05, + "loss": 0.7834, + "num_input_tokens_seen": 708739360, + "step": 3929 + }, + { + "epoch": 0.43022523878595476, + "grad_norm": 1.2565871720926527, + "learning_rate": 3.043360734556689e-05, + "loss": 0.6669, + "num_input_tokens_seen": 708942304, + "step": 3930 + }, + { + "epoch": 0.4303347108568926, + "grad_norm": 1.2872356069252457, + "learning_rate": 3.0425213952039612e-05, + "loss": 0.6135, + "num_input_tokens_seen": 709099776, + "step": 3931 + }, + { + "epoch": 0.43044418292783054, + "grad_norm": 1.3332777260072461, + "learning_rate": 3.0416819916719895e-05, + "loss": 0.8763, + "num_input_tokens_seen": 709296672, + "step": 3932 + }, + { + "epoch": 0.43055365499876846, + "grad_norm": 1.1299411029995607, + "learning_rate": 3.040842524060073e-05, + "loss": 0.5249, + "num_input_tokens_seen": 709486176, + "step": 3933 + }, + { + "epoch": 0.43066312706970633, + "grad_norm": 1.2767504792967899, + "learning_rate": 3.0400029924675206e-05, + "loss": 0.6727, + "num_input_tokens_seen": 709683072, + "step": 3934 + }, + { + "epoch": 0.43077259914064425, + "grad_norm": 1.1416916236562815, + "learning_rate": 3.0391633969936468e-05, + "loss": 0.5945, + "num_input_tokens_seen": 709862048, + "step": 3935 + }, + { + "epoch": 0.4308820712115821, + "grad_norm": 1.279601129358216, + "learning_rate": 3.0383237377377734e-05, + "loss": 0.6451, + "num_input_tokens_seen": 710036320, + "step": 3936 + }, + { + "epoch": 0.43099154328252004, + "grad_norm": 1.3504628023958702, + "learning_rate": 3.03748401479923e-05, + "loss": 0.8141, + "num_input_tokens_seen": 710179232, + "step": 3937 + }, + { + "epoch": 0.43110101535345796, + "grad_norm": 1.2109207152157044, + "learning_rate": 3.0366442282773567e-05, + "loss": 0.9026, + "num_input_tokens_seen": 710389568, + "step": 3938 + }, + { + "epoch": 0.4312104874243958, + "grad_norm": 1.2064863536709556, + "learning_rate": 3.035804378271496e-05, + "loss": 0.5485, + "num_input_tokens_seen": 710545024, + "step": 3939 + }, + { + "epoch": 0.43131995949533375, + "grad_norm": 1.2818808514458224, + "learning_rate": 3.034964464881002e-05, + "loss": 0.8002, + "num_input_tokens_seen": 710726688, + "step": 3940 + }, + { + "epoch": 0.43142943156627167, + "grad_norm": 1.2918622217529123, + "learning_rate": 3.0341244882052346e-05, + "loss": 0.656, + "num_input_tokens_seen": 710879904, + "step": 3941 + }, + { + "epoch": 0.43153890363720954, + "grad_norm": 1.2882012929445863, + "learning_rate": 3.0332844483435614e-05, + "loss": 0.7227, + "num_input_tokens_seen": 711061344, + "step": 3942 + }, + { + "epoch": 0.43164837570814746, + "grad_norm": 1.4125377379920974, + "learning_rate": 3.0324443453953578e-05, + "loss": 0.977, + "num_input_tokens_seen": 711240096, + "step": 3943 + }, + { + "epoch": 0.4317578477790854, + "grad_norm": 1.4793664054153581, + "learning_rate": 3.0316041794600054e-05, + "loss": 0.9692, + "num_input_tokens_seen": 711425120, + "step": 3944 + }, + { + "epoch": 0.43186731985002325, + "grad_norm": 1.2020494186315547, + "learning_rate": 3.030763950636895e-05, + "loss": 0.6132, + "num_input_tokens_seen": 711595584, + "step": 3945 + }, + { + "epoch": 0.43197679192096117, + "grad_norm": 1.3662041751893268, + "learning_rate": 3.0299236590254236e-05, + "loss": 0.8728, + "num_input_tokens_seen": 711773216, + "step": 3946 + }, + { + "epoch": 0.4320862639918991, + "grad_norm": 1.3623970902802178, + "learning_rate": 3.0290833047249966e-05, + "loss": 0.6938, + "num_input_tokens_seen": 711964288, + "step": 3947 + }, + { + "epoch": 0.43219573606283695, + "grad_norm": 1.0566541632104058, + "learning_rate": 3.0282428878350256e-05, + "loss": 0.4894, + "num_input_tokens_seen": 712151776, + "step": 3948 + }, + { + "epoch": 0.4323052081337749, + "grad_norm": 1.1303285924447781, + "learning_rate": 3.0274024084549312e-05, + "loss": 0.6961, + "num_input_tokens_seen": 712328736, + "step": 3949 + }, + { + "epoch": 0.4324146802047128, + "grad_norm": 1.3231160121318741, + "learning_rate": 3.0265618666841405e-05, + "loss": 0.8065, + "num_input_tokens_seen": 712503008, + "step": 3950 + }, + { + "epoch": 0.43252415227565066, + "grad_norm": 1.1544412951803307, + "learning_rate": 3.0257212626220872e-05, + "loss": 0.6454, + "num_input_tokens_seen": 712692512, + "step": 3951 + }, + { + "epoch": 0.4326336243465886, + "grad_norm": 1.1249527921308566, + "learning_rate": 3.0248805963682135e-05, + "loss": 0.5226, + "num_input_tokens_seen": 712815488, + "step": 3952 + }, + { + "epoch": 0.43274309641752645, + "grad_norm": 1.1741554451309884, + "learning_rate": 3.0240398680219685e-05, + "loss": 0.7462, + "num_input_tokens_seen": 713035232, + "step": 3953 + }, + { + "epoch": 0.4328525684884644, + "grad_norm": 1.2578686529858225, + "learning_rate": 3.0231990776828096e-05, + "loss": 0.5986, + "num_input_tokens_seen": 713219808, + "step": 3954 + }, + { + "epoch": 0.4329620405594023, + "grad_norm": 1.224789206215821, + "learning_rate": 3.0223582254501993e-05, + "loss": 0.5568, + "num_input_tokens_seen": 713352192, + "step": 3955 + }, + { + "epoch": 0.43307151263034016, + "grad_norm": 1.1844744537387681, + "learning_rate": 3.02151731142361e-05, + "loss": 0.704, + "num_input_tokens_seen": 713520416, + "step": 3956 + }, + { + "epoch": 0.4331809847012781, + "grad_norm": 1.2376359286937793, + "learning_rate": 3.0206763357025196e-05, + "loss": 0.7014, + "num_input_tokens_seen": 713694240, + "step": 3957 + }, + { + "epoch": 0.433290456772216, + "grad_norm": 1.334230190879838, + "learning_rate": 3.0198352983864138e-05, + "loss": 0.8019, + "num_input_tokens_seen": 713876576, + "step": 3958 + }, + { + "epoch": 0.43339992884315387, + "grad_norm": 1.4056140733347608, + "learning_rate": 3.0189941995747863e-05, + "loss": 0.7661, + "num_input_tokens_seen": 714059360, + "step": 3959 + }, + { + "epoch": 0.4335094009140918, + "grad_norm": 1.1768359652957425, + "learning_rate": 3.0181530393671364e-05, + "loss": 0.7001, + "num_input_tokens_seen": 714263648, + "step": 3960 + }, + { + "epoch": 0.4336188729850297, + "grad_norm": 1.2974557150925587, + "learning_rate": 3.0173118178629728e-05, + "loss": 0.7705, + "num_input_tokens_seen": 714441952, + "step": 3961 + }, + { + "epoch": 0.4337283450559676, + "grad_norm": 1.2088399625198722, + "learning_rate": 3.0164705351618104e-05, + "loss": 0.8372, + "num_input_tokens_seen": 714646688, + "step": 3962 + }, + { + "epoch": 0.4338378171269055, + "grad_norm": 1.2808441356649327, + "learning_rate": 3.0156291913631712e-05, + "loss": 0.9654, + "num_input_tokens_seen": 714853216, + "step": 3963 + }, + { + "epoch": 0.4339472891978434, + "grad_norm": 1.3362868895046496, + "learning_rate": 3.0147877865665843e-05, + "loss": 0.8553, + "num_input_tokens_seen": 715050336, + "step": 3964 + }, + { + "epoch": 0.4340567612687813, + "grad_norm": 1.0807315994909397, + "learning_rate": 3.013946320871586e-05, + "loss": 0.5535, + "num_input_tokens_seen": 715235808, + "step": 3965 + }, + { + "epoch": 0.4341662333397192, + "grad_norm": 1.261680248133646, + "learning_rate": 3.0131047943777207e-05, + "loss": 0.6924, + "num_input_tokens_seen": 715426656, + "step": 3966 + }, + { + "epoch": 0.43427570541065713, + "grad_norm": 1.173841479538088, + "learning_rate": 3.012263207184539e-05, + "loss": 0.6858, + "num_input_tokens_seen": 715608320, + "step": 3967 + }, + { + "epoch": 0.434385177481595, + "grad_norm": 1.1605100252868128, + "learning_rate": 3.011421559391599e-05, + "loss": 0.5542, + "num_input_tokens_seen": 715787744, + "step": 3968 + }, + { + "epoch": 0.4344946495525329, + "grad_norm": 1.3686608794997832, + "learning_rate": 3.010579851098466e-05, + "loss": 0.8148, + "num_input_tokens_seen": 715988448, + "step": 3969 + }, + { + "epoch": 0.4346041216234708, + "grad_norm": 1.1842946410282318, + "learning_rate": 3.0097380824047132e-05, + "loss": 0.5349, + "num_input_tokens_seen": 716166528, + "step": 3970 + }, + { + "epoch": 0.4347135936944087, + "grad_norm": 1.1873234065406781, + "learning_rate": 3.0088962534099195e-05, + "loss": 0.614, + "num_input_tokens_seen": 716331840, + "step": 3971 + }, + { + "epoch": 0.43482306576534663, + "grad_norm": 1.1611449784009678, + "learning_rate": 3.0080543642136723e-05, + "loss": 0.5818, + "num_input_tokens_seen": 716518432, + "step": 3972 + }, + { + "epoch": 0.4349325378362845, + "grad_norm": 1.357818984530892, + "learning_rate": 3.007212414915565e-05, + "loss": 0.6332, + "num_input_tokens_seen": 716684416, + "step": 3973 + }, + { + "epoch": 0.4350420099072224, + "grad_norm": 1.36567949234335, + "learning_rate": 3.0063704056151975e-05, + "loss": 0.7219, + "num_input_tokens_seen": 716889824, + "step": 3974 + }, + { + "epoch": 0.43515148197816034, + "grad_norm": 1.3164042577530362, + "learning_rate": 3.00552833641218e-05, + "loss": 0.6136, + "num_input_tokens_seen": 717082016, + "step": 3975 + }, + { + "epoch": 0.4352609540490982, + "grad_norm": 1.2027071933332352, + "learning_rate": 3.0046862074061266e-05, + "loss": 0.5752, + "num_input_tokens_seen": 717262560, + "step": 3976 + }, + { + "epoch": 0.4353704261200361, + "grad_norm": 1.3563478021472626, + "learning_rate": 3.00384401869666e-05, + "loss": 0.734, + "num_input_tokens_seen": 717441984, + "step": 3977 + }, + { + "epoch": 0.43547989819097405, + "grad_norm": 1.2492539563684144, + "learning_rate": 3.003001770383409e-05, + "loss": 0.5059, + "num_input_tokens_seen": 717621856, + "step": 3978 + }, + { + "epoch": 0.4355893702619119, + "grad_norm": 1.279508383934754, + "learning_rate": 3.0021594625660095e-05, + "loss": 0.7149, + "num_input_tokens_seen": 717817408, + "step": 3979 + }, + { + "epoch": 0.43569884233284983, + "grad_norm": 1.3201148884077802, + "learning_rate": 3.0013170953441062e-05, + "loss": 0.8263, + "num_input_tokens_seen": 717993024, + "step": 3980 + }, + { + "epoch": 0.43580831440378776, + "grad_norm": 1.3740046973668723, + "learning_rate": 3.000474668817348e-05, + "loss": 0.7118, + "num_input_tokens_seen": 718182752, + "step": 3981 + }, + { + "epoch": 0.4359177864747256, + "grad_norm": 1.2244005040330668, + "learning_rate": 2.999632183085394e-05, + "loss": 0.5441, + "num_input_tokens_seen": 718364640, + "step": 3982 + }, + { + "epoch": 0.43602725854566354, + "grad_norm": 1.161756958939208, + "learning_rate": 2.998789638247908e-05, + "loss": 0.659, + "num_input_tokens_seen": 718579904, + "step": 3983 + }, + { + "epoch": 0.43613673061660146, + "grad_norm": 1.2066480898788294, + "learning_rate": 2.9979470344045614e-05, + "loss": 0.5388, + "num_input_tokens_seen": 718762912, + "step": 3984 + }, + { + "epoch": 0.43624620268753933, + "grad_norm": 1.1850024109023152, + "learning_rate": 2.9971043716550316e-05, + "loss": 0.6563, + "num_input_tokens_seen": 718926432, + "step": 3985 + }, + { + "epoch": 0.43635567475847725, + "grad_norm": 1.1508672533185778, + "learning_rate": 2.9962616500990058e-05, + "loss": 0.7023, + "num_input_tokens_seen": 719123552, + "step": 3986 + }, + { + "epoch": 0.4364651468294151, + "grad_norm": 1.2562726943969522, + "learning_rate": 2.995418869836175e-05, + "loss": 0.8936, + "num_input_tokens_seen": 719313952, + "step": 3987 + }, + { + "epoch": 0.43657461890035304, + "grad_norm": 1.2810190969677069, + "learning_rate": 2.9945760309662395e-05, + "loss": 0.7088, + "num_input_tokens_seen": 719485760, + "step": 3988 + }, + { + "epoch": 0.43668409097129096, + "grad_norm": 1.3246780544195325, + "learning_rate": 2.9937331335889045e-05, + "loss": 0.8447, + "num_input_tokens_seen": 719684000, + "step": 3989 + }, + { + "epoch": 0.4367935630422288, + "grad_norm": 1.2332317514513567, + "learning_rate": 2.9928901778038837e-05, + "loss": 0.752, + "num_input_tokens_seen": 719898368, + "step": 3990 + }, + { + "epoch": 0.43690303511316675, + "grad_norm": 1.259681734352478, + "learning_rate": 2.9920471637108977e-05, + "loss": 0.8068, + "num_input_tokens_seen": 720093472, + "step": 3991 + }, + { + "epoch": 0.43701250718410467, + "grad_norm": 1.198716397018349, + "learning_rate": 2.9912040914096724e-05, + "loss": 0.9269, + "num_input_tokens_seen": 720280064, + "step": 3992 + }, + { + "epoch": 0.43712197925504254, + "grad_norm": 1.2925666575702741, + "learning_rate": 2.990360960999942e-05, + "loss": 0.7803, + "num_input_tokens_seen": 720468896, + "step": 3993 + }, + { + "epoch": 0.43723145132598046, + "grad_norm": 1.3139772276363884, + "learning_rate": 2.989517772581447e-05, + "loss": 0.7404, + "num_input_tokens_seen": 720658400, + "step": 3994 + }, + { + "epoch": 0.4373409233969184, + "grad_norm": 1.2955700847141107, + "learning_rate": 2.9886745262539362e-05, + "loss": 1.0339, + "num_input_tokens_seen": 720862016, + "step": 3995 + }, + { + "epoch": 0.43745039546785625, + "grad_norm": 1.2784724949998696, + "learning_rate": 2.9878312221171627e-05, + "loss": 0.9618, + "num_input_tokens_seen": 721046592, + "step": 3996 + }, + { + "epoch": 0.43755986753879417, + "grad_norm": 1.3215662855020585, + "learning_rate": 2.9869878602708885e-05, + "loss": 0.6566, + "num_input_tokens_seen": 721203168, + "step": 3997 + }, + { + "epoch": 0.4376693396097321, + "grad_norm": 1.214386685364751, + "learning_rate": 2.9861444408148815e-05, + "loss": 0.6509, + "num_input_tokens_seen": 721359296, + "step": 3998 + }, + { + "epoch": 0.43777881168066995, + "grad_norm": 1.4381156929706345, + "learning_rate": 2.985300963848916e-05, + "loss": 0.6289, + "num_input_tokens_seen": 721518560, + "step": 3999 + }, + { + "epoch": 0.4378882837516079, + "grad_norm": 1.3830991113914297, + "learning_rate": 2.984457429472774e-05, + "loss": 0.8155, + "num_input_tokens_seen": 721687008, + "step": 4000 + }, + { + "epoch": 0.4379977558225458, + "grad_norm": 1.193965177465918, + "learning_rate": 2.9836138377862442e-05, + "loss": 0.5141, + "num_input_tokens_seen": 721846048, + "step": 4001 + }, + { + "epoch": 0.43810722789348366, + "grad_norm": 1.1116826408778882, + "learning_rate": 2.9827701888891223e-05, + "loss": 0.5326, + "num_input_tokens_seen": 722020096, + "step": 4002 + }, + { + "epoch": 0.4382166999644216, + "grad_norm": 1.4322946259439127, + "learning_rate": 2.98192648288121e-05, + "loss": 0.6692, + "num_input_tokens_seen": 722201984, + "step": 4003 + }, + { + "epoch": 0.43832617203535945, + "grad_norm": 1.4367129617712386, + "learning_rate": 2.9810827198623158e-05, + "loss": 0.7519, + "num_input_tokens_seen": 722371776, + "step": 4004 + }, + { + "epoch": 0.4384356441062974, + "grad_norm": 1.3566686942033044, + "learning_rate": 2.980238899932256e-05, + "loss": 0.6478, + "num_input_tokens_seen": 722530368, + "step": 4005 + }, + { + "epoch": 0.4385451161772353, + "grad_norm": 1.39612349535608, + "learning_rate": 2.9793950231908523e-05, + "loss": 0.8284, + "num_input_tokens_seen": 722701728, + "step": 4006 + }, + { + "epoch": 0.43865458824817316, + "grad_norm": 1.2508975144722896, + "learning_rate": 2.9785510897379337e-05, + "loss": 0.7222, + "num_input_tokens_seen": 722875776, + "step": 4007 + }, + { + "epoch": 0.4387640603191111, + "grad_norm": 1.251478989156913, + "learning_rate": 2.9777070996733354e-05, + "loss": 0.8009, + "num_input_tokens_seen": 723069312, + "step": 4008 + }, + { + "epoch": 0.438873532390049, + "grad_norm": 1.3728280686699008, + "learning_rate": 2.976863053096901e-05, + "loss": 0.7669, + "num_input_tokens_seen": 723256128, + "step": 4009 + }, + { + "epoch": 0.43898300446098687, + "grad_norm": 1.3805724281535345, + "learning_rate": 2.976018950108479e-05, + "loss": 0.9514, + "num_input_tokens_seen": 723463104, + "step": 4010 + }, + { + "epoch": 0.4390924765319248, + "grad_norm": 1.2329994939786022, + "learning_rate": 2.9751747908079246e-05, + "loss": 0.7664, + "num_input_tokens_seen": 723652832, + "step": 4011 + }, + { + "epoch": 0.4392019486028627, + "grad_norm": 1.3060977646307015, + "learning_rate": 2.9743305752951016e-05, + "loss": 0.8466, + "num_input_tokens_seen": 723823744, + "step": 4012 + }, + { + "epoch": 0.4393114206738006, + "grad_norm": 1.2208550686686146, + "learning_rate": 2.9734863036698784e-05, + "loss": 0.6108, + "num_input_tokens_seen": 724015040, + "step": 4013 + }, + { + "epoch": 0.4394208927447385, + "grad_norm": 1.1856249511750914, + "learning_rate": 2.97264197603213e-05, + "loss": 0.6806, + "num_input_tokens_seen": 724216192, + "step": 4014 + }, + { + "epoch": 0.4395303648156764, + "grad_norm": 1.2287898644532234, + "learning_rate": 2.97179759248174e-05, + "loss": 0.8554, + "num_input_tokens_seen": 724410176, + "step": 4015 + }, + { + "epoch": 0.4396398368866143, + "grad_norm": 1.2653369340259168, + "learning_rate": 2.9709531531185964e-05, + "loss": 0.7449, + "num_input_tokens_seen": 724615136, + "step": 4016 + }, + { + "epoch": 0.4397493089575522, + "grad_norm": 1.2425827741011295, + "learning_rate": 2.9701086580425954e-05, + "loss": 0.7408, + "num_input_tokens_seen": 724784704, + "step": 4017 + }, + { + "epoch": 0.43985878102849013, + "grad_norm": 1.345311787568545, + "learning_rate": 2.969264107353638e-05, + "loss": 0.6664, + "num_input_tokens_seen": 724959200, + "step": 4018 + }, + { + "epoch": 0.439968253099428, + "grad_norm": 1.1464721055162252, + "learning_rate": 2.9684195011516347e-05, + "loss": 0.5817, + "num_input_tokens_seen": 725153408, + "step": 4019 + }, + { + "epoch": 0.4400777251703659, + "grad_norm": 1.3688614978404794, + "learning_rate": 2.9675748395365e-05, + "loss": 0.7197, + "num_input_tokens_seen": 725315584, + "step": 4020 + }, + { + "epoch": 0.4401871972413038, + "grad_norm": 1.3616624200039544, + "learning_rate": 2.9667301226081546e-05, + "loss": 0.6023, + "num_input_tokens_seen": 725476864, + "step": 4021 + }, + { + "epoch": 0.4402966693122417, + "grad_norm": 1.224400601394416, + "learning_rate": 2.9658853504665286e-05, + "loss": 0.5936, + "num_input_tokens_seen": 725659648, + "step": 4022 + }, + { + "epoch": 0.44040614138317963, + "grad_norm": 1.238125667966598, + "learning_rate": 2.965040523211556e-05, + "loss": 0.5967, + "num_input_tokens_seen": 725884544, + "step": 4023 + }, + { + "epoch": 0.4405156134541175, + "grad_norm": 1.3325804979153708, + "learning_rate": 2.964195640943178e-05, + "loss": 0.7454, + "num_input_tokens_seen": 726042016, + "step": 4024 + }, + { + "epoch": 0.4406250855250554, + "grad_norm": 1.2557793053335122, + "learning_rate": 2.9633507037613446e-05, + "loss": 0.8161, + "num_input_tokens_seen": 726232416, + "step": 4025 + }, + { + "epoch": 0.44073455759599334, + "grad_norm": 1.4501943260728969, + "learning_rate": 2.9625057117660077e-05, + "loss": 0.7472, + "num_input_tokens_seen": 726436480, + "step": 4026 + }, + { + "epoch": 0.4408440296669312, + "grad_norm": 1.324021046151403, + "learning_rate": 2.9616606650571292e-05, + "loss": 0.659, + "num_input_tokens_seen": 726621504, + "step": 4027 + }, + { + "epoch": 0.4409535017378691, + "grad_norm": 1.284470581062292, + "learning_rate": 2.960815563734677e-05, + "loss": 0.6183, + "num_input_tokens_seen": 726790624, + "step": 4028 + }, + { + "epoch": 0.44106297380880705, + "grad_norm": 1.3700487985760712, + "learning_rate": 2.959970407898624e-05, + "loss": 0.7642, + "num_input_tokens_seen": 726954816, + "step": 4029 + }, + { + "epoch": 0.4411724458797449, + "grad_norm": 1.2158911994583594, + "learning_rate": 2.9591251976489514e-05, + "loss": 0.6425, + "num_input_tokens_seen": 727126176, + "step": 4030 + }, + { + "epoch": 0.44128191795068283, + "grad_norm": 1.2297841423089255, + "learning_rate": 2.9582799330856458e-05, + "loss": 0.53, + "num_input_tokens_seen": 727312544, + "step": 4031 + }, + { + "epoch": 0.44139139002162076, + "grad_norm": 1.2446239257329734, + "learning_rate": 2.9574346143086994e-05, + "loss": 0.6328, + "num_input_tokens_seen": 727469792, + "step": 4032 + }, + { + "epoch": 0.4415008620925586, + "grad_norm": 1.4211209152502537, + "learning_rate": 2.9565892414181133e-05, + "loss": 0.8226, + "num_input_tokens_seen": 727636224, + "step": 4033 + }, + { + "epoch": 0.44161033416349654, + "grad_norm": 1.3541143505065374, + "learning_rate": 2.9557438145138933e-05, + "loss": 0.7232, + "num_input_tokens_seen": 727848128, + "step": 4034 + }, + { + "epoch": 0.44171980623443446, + "grad_norm": 1.303311244544896, + "learning_rate": 2.9548983336960502e-05, + "loss": 0.601, + "num_input_tokens_seen": 728020608, + "step": 4035 + }, + { + "epoch": 0.44182927830537233, + "grad_norm": 1.3180792164708213, + "learning_rate": 2.9540527990646045e-05, + "loss": 0.7301, + "num_input_tokens_seen": 728214592, + "step": 4036 + }, + { + "epoch": 0.44193875037631025, + "grad_norm": 1.3219207809853306, + "learning_rate": 2.953207210719581e-05, + "loss": 0.7351, + "num_input_tokens_seen": 728406560, + "step": 4037 + }, + { + "epoch": 0.4420482224472481, + "grad_norm": 1.4690681275318984, + "learning_rate": 2.9523615687610102e-05, + "loss": 0.8148, + "num_input_tokens_seen": 728570080, + "step": 4038 + }, + { + "epoch": 0.44215769451818604, + "grad_norm": 1.4644535613142022, + "learning_rate": 2.9515158732889305e-05, + "loss": 0.7182, + "num_input_tokens_seen": 728728672, + "step": 4039 + }, + { + "epoch": 0.44226716658912396, + "grad_norm": 1.2941687970329152, + "learning_rate": 2.9506701244033864e-05, + "loss": 0.6441, + "num_input_tokens_seen": 728919296, + "step": 4040 + }, + { + "epoch": 0.4423766386600618, + "grad_norm": 1.0650933536394158, + "learning_rate": 2.9498243222044282e-05, + "loss": 0.6377, + "num_input_tokens_seen": 729089312, + "step": 4041 + }, + { + "epoch": 0.44248611073099975, + "grad_norm": 1.1531474785823537, + "learning_rate": 2.9489784667921122e-05, + "loss": 0.619, + "num_input_tokens_seen": 729277696, + "step": 4042 + }, + { + "epoch": 0.44259558280193767, + "grad_norm": 1.358073345804802, + "learning_rate": 2.9481325582665013e-05, + "loss": 0.7955, + "num_input_tokens_seen": 729449280, + "step": 4043 + }, + { + "epoch": 0.44270505487287554, + "grad_norm": 1.0597346165272348, + "learning_rate": 2.9472865967276668e-05, + "loss": 0.5415, + "num_input_tokens_seen": 729620416, + "step": 4044 + }, + { + "epoch": 0.44281452694381346, + "grad_norm": 1.3720617979606806, + "learning_rate": 2.9464405822756823e-05, + "loss": 0.8005, + "num_input_tokens_seen": 729805216, + "step": 4045 + }, + { + "epoch": 0.4429239990147514, + "grad_norm": 1.2944686349610361, + "learning_rate": 2.9455945150106314e-05, + "loss": 1.1538, + "num_input_tokens_seen": 730016896, + "step": 4046 + }, + { + "epoch": 0.44303347108568925, + "grad_norm": 1.2469644404482803, + "learning_rate": 2.9447483950326e-05, + "loss": 0.7595, + "num_input_tokens_seen": 730171456, + "step": 4047 + }, + { + "epoch": 0.44314294315662717, + "grad_norm": 1.2260323611899973, + "learning_rate": 2.9439022224416833e-05, + "loss": 0.7009, + "num_input_tokens_seen": 730342144, + "step": 4048 + }, + { + "epoch": 0.4432524152275651, + "grad_norm": 1.2468246660135682, + "learning_rate": 2.9430559973379834e-05, + "loss": 0.5701, + "num_input_tokens_seen": 730490656, + "step": 4049 + }, + { + "epoch": 0.44336188729850295, + "grad_norm": 1.2386600143745365, + "learning_rate": 2.942209719821606e-05, + "loss": 0.642, + "num_input_tokens_seen": 730642304, + "step": 4050 + }, + { + "epoch": 0.4434713593694409, + "grad_norm": 1.1608298045093568, + "learning_rate": 2.9413633899926634e-05, + "loss": 0.6884, + "num_input_tokens_seen": 730853312, + "step": 4051 + }, + { + "epoch": 0.4435808314403788, + "grad_norm": 1.2473906939672093, + "learning_rate": 2.940517007951276e-05, + "loss": 0.619, + "num_input_tokens_seen": 730983008, + "step": 4052 + }, + { + "epoch": 0.44369030351131666, + "grad_norm": 1.187142217846443, + "learning_rate": 2.9396705737975683e-05, + "loss": 0.6903, + "num_input_tokens_seen": 731190432, + "step": 4053 + }, + { + "epoch": 0.4437997755822546, + "grad_norm": 1.1946770967573885, + "learning_rate": 2.9388240876316727e-05, + "loss": 0.4793, + "num_input_tokens_seen": 731336480, + "step": 4054 + }, + { + "epoch": 0.4439092476531925, + "grad_norm": 1.3235800674194196, + "learning_rate": 2.9379775495537254e-05, + "loss": 0.8048, + "num_input_tokens_seen": 731510528, + "step": 4055 + }, + { + "epoch": 0.4440187197241304, + "grad_norm": 1.165554447442243, + "learning_rate": 2.9371309596638725e-05, + "loss": 0.5982, + "num_input_tokens_seen": 731703392, + "step": 4056 + }, + { + "epoch": 0.4441281917950683, + "grad_norm": 1.2320602055583747, + "learning_rate": 2.9362843180622624e-05, + "loss": 0.9146, + "num_input_tokens_seen": 731908576, + "step": 4057 + }, + { + "epoch": 0.44423766386600616, + "grad_norm": 1.25465237769045, + "learning_rate": 2.935437624849051e-05, + "loss": 0.5755, + "num_input_tokens_seen": 732071200, + "step": 4058 + }, + { + "epoch": 0.4443471359369441, + "grad_norm": 1.182891234343626, + "learning_rate": 2.9345908801244015e-05, + "loss": 0.5786, + "num_input_tokens_seen": 732257120, + "step": 4059 + }, + { + "epoch": 0.444456608007882, + "grad_norm": 1.1735369278274679, + "learning_rate": 2.9337440839884817e-05, + "loss": 0.5651, + "num_input_tokens_seen": 732438336, + "step": 4060 + }, + { + "epoch": 0.44456608007881987, + "grad_norm": 1.2259250612427666, + "learning_rate": 2.932897236541466e-05, + "loss": 0.7945, + "num_input_tokens_seen": 732654048, + "step": 4061 + }, + { + "epoch": 0.4446755521497578, + "grad_norm": 1.2558756541002403, + "learning_rate": 2.932050337883534e-05, + "loss": 0.5544, + "num_input_tokens_seen": 732783296, + "step": 4062 + }, + { + "epoch": 0.4447850242206957, + "grad_norm": 1.4274767683714198, + "learning_rate": 2.9312033881148738e-05, + "loss": 0.9232, + "num_input_tokens_seen": 732972352, + "step": 4063 + }, + { + "epoch": 0.4448944962916336, + "grad_norm": 1.223572603953184, + "learning_rate": 2.9303563873356767e-05, + "loss": 0.6307, + "num_input_tokens_seen": 733154240, + "step": 4064 + }, + { + "epoch": 0.4450039683625715, + "grad_norm": 1.3803650916947023, + "learning_rate": 2.9295093356461416e-05, + "loss": 0.9087, + "num_input_tokens_seen": 733333664, + "step": 4065 + }, + { + "epoch": 0.4451134404335094, + "grad_norm": 1.4661535485326072, + "learning_rate": 2.9286622331464736e-05, + "loss": 0.8427, + "num_input_tokens_seen": 733531456, + "step": 4066 + }, + { + "epoch": 0.4452229125044473, + "grad_norm": 1.2373655766391212, + "learning_rate": 2.9278150799368825e-05, + "loss": 0.7133, + "num_input_tokens_seen": 733751200, + "step": 4067 + }, + { + "epoch": 0.4453323845753852, + "grad_norm": 1.1547552189565533, + "learning_rate": 2.9269678761175857e-05, + "loss": 0.7294, + "num_input_tokens_seen": 733933088, + "step": 4068 + }, + { + "epoch": 0.44544185664632313, + "grad_norm": 1.1725641385980905, + "learning_rate": 2.9261206217888048e-05, + "loss": 0.7485, + "num_input_tokens_seen": 734102656, + "step": 4069 + }, + { + "epoch": 0.445551328717261, + "grad_norm": 1.278460000305677, + "learning_rate": 2.925273317050769e-05, + "loss": 0.7672, + "num_input_tokens_seen": 734280960, + "step": 4070 + }, + { + "epoch": 0.4456608007881989, + "grad_norm": 1.2618105356000002, + "learning_rate": 2.9244259620037135e-05, + "loss": 0.772, + "num_input_tokens_seen": 734458592, + "step": 4071 + }, + { + "epoch": 0.44577027285913684, + "grad_norm": 1.2347700718115069, + "learning_rate": 2.9235785567478774e-05, + "loss": 0.7832, + "num_input_tokens_seen": 734637792, + "step": 4072 + }, + { + "epoch": 0.4458797449300747, + "grad_norm": 1.2862165071892402, + "learning_rate": 2.9227311013835084e-05, + "loss": 0.7481, + "num_input_tokens_seen": 734814304, + "step": 4073 + }, + { + "epoch": 0.44598921700101263, + "grad_norm": 1.2608668991801315, + "learning_rate": 2.921883596010857e-05, + "loss": 0.5067, + "num_input_tokens_seen": 734980288, + "step": 4074 + }, + { + "epoch": 0.4460986890719505, + "grad_norm": 1.2633542069863783, + "learning_rate": 2.921036040730184e-05, + "loss": 0.6887, + "num_input_tokens_seen": 735175392, + "step": 4075 + }, + { + "epoch": 0.4462081611428884, + "grad_norm": 1.2768462259165945, + "learning_rate": 2.9201884356417514e-05, + "loss": 0.6144, + "num_input_tokens_seen": 735339584, + "step": 4076 + }, + { + "epoch": 0.44631763321382634, + "grad_norm": 1.4016273389977245, + "learning_rate": 2.9193407808458308e-05, + "loss": 0.7013, + "num_input_tokens_seen": 735512960, + "step": 4077 + }, + { + "epoch": 0.4464271052847642, + "grad_norm": 1.471532661166251, + "learning_rate": 2.918493076442697e-05, + "loss": 0.8481, + "num_input_tokens_seen": 735680288, + "step": 4078 + }, + { + "epoch": 0.4465365773557021, + "grad_norm": 1.1817536607382582, + "learning_rate": 2.9176453225326328e-05, + "loss": 0.6095, + "num_input_tokens_seen": 735845152, + "step": 4079 + }, + { + "epoch": 0.44664604942664005, + "grad_norm": 1.3994284587154222, + "learning_rate": 2.9167975192159247e-05, + "loss": 0.7467, + "num_input_tokens_seen": 736051232, + "step": 4080 + }, + { + "epoch": 0.4467555214975779, + "grad_norm": 1.3940830739360235, + "learning_rate": 2.9159496665928677e-05, + "loss": 0.9203, + "num_input_tokens_seen": 736245888, + "step": 4081 + }, + { + "epoch": 0.44686499356851583, + "grad_norm": 1.5489792816175338, + "learning_rate": 2.915101764763759e-05, + "loss": 0.8306, + "num_input_tokens_seen": 736385216, + "step": 4082 + }, + { + "epoch": 0.44697446563945376, + "grad_norm": 1.2287368939031584, + "learning_rate": 2.914253813828906e-05, + "loss": 0.7493, + "num_input_tokens_seen": 736585248, + "step": 4083 + }, + { + "epoch": 0.4470839377103916, + "grad_norm": 1.22745937917278, + "learning_rate": 2.9134058138886188e-05, + "loss": 0.7806, + "num_input_tokens_seen": 736779008, + "step": 4084 + }, + { + "epoch": 0.44719340978132954, + "grad_norm": 1.1735412450205331, + "learning_rate": 2.9125577650432133e-05, + "loss": 0.5471, + "num_input_tokens_seen": 736956416, + "step": 4085 + }, + { + "epoch": 0.44730288185226746, + "grad_norm": 1.3759993042971825, + "learning_rate": 2.9117096673930138e-05, + "loss": 0.7088, + "num_input_tokens_seen": 737128672, + "step": 4086 + }, + { + "epoch": 0.44741235392320533, + "grad_norm": 1.261927489974003, + "learning_rate": 2.910861521038347e-05, + "loss": 0.6709, + "num_input_tokens_seen": 737310560, + "step": 4087 + }, + { + "epoch": 0.44752182599414325, + "grad_norm": 1.288659928779005, + "learning_rate": 2.9100133260795488e-05, + "loss": 0.6609, + "num_input_tokens_seen": 737491776, + "step": 4088 + }, + { + "epoch": 0.4476312980650812, + "grad_norm": 1.2724677764170114, + "learning_rate": 2.9091650826169565e-05, + "loss": 0.5555, + "num_input_tokens_seen": 737637600, + "step": 4089 + }, + { + "epoch": 0.44774077013601904, + "grad_norm": 1.2536590427558039, + "learning_rate": 2.9083167907509178e-05, + "loss": 0.6714, + "num_input_tokens_seen": 737818592, + "step": 4090 + }, + { + "epoch": 0.44785024220695696, + "grad_norm": 1.288997440430295, + "learning_rate": 2.9074684505817835e-05, + "loss": 0.7428, + "num_input_tokens_seen": 738006528, + "step": 4091 + }, + { + "epoch": 0.4479597142778948, + "grad_norm": 1.158843508324918, + "learning_rate": 2.9066200622099106e-05, + "loss": 0.8535, + "num_input_tokens_seen": 738213728, + "step": 4092 + }, + { + "epoch": 0.44806918634883275, + "grad_norm": 1.2840817026098286, + "learning_rate": 2.9057716257356614e-05, + "loss": 0.7829, + "num_input_tokens_seen": 738416896, + "step": 4093 + }, + { + "epoch": 0.44817865841977067, + "grad_norm": 1.2356151194121927, + "learning_rate": 2.9049231412594046e-05, + "loss": 0.6918, + "num_input_tokens_seen": 738621408, + "step": 4094 + }, + { + "epoch": 0.44828813049070854, + "grad_norm": 1.2169699752413936, + "learning_rate": 2.9040746088815142e-05, + "loss": 0.638, + "num_input_tokens_seen": 738798368, + "step": 4095 + }, + { + "epoch": 0.44839760256164646, + "grad_norm": 1.4399932303534435, + "learning_rate": 2.9032260287023698e-05, + "loss": 0.8713, + "num_input_tokens_seen": 738984960, + "step": 4096 + }, + { + "epoch": 0.4485070746325844, + "grad_norm": 1.160369236979566, + "learning_rate": 2.902377400822357e-05, + "loss": 0.7486, + "num_input_tokens_seen": 739185216, + "step": 4097 + }, + { + "epoch": 0.44861654670352225, + "grad_norm": 1.4759522645410927, + "learning_rate": 2.9015287253418672e-05, + "loss": 0.7816, + "num_input_tokens_seen": 739343360, + "step": 4098 + }, + { + "epoch": 0.44872601877446017, + "grad_norm": 1.398811162543762, + "learning_rate": 2.900680002361297e-05, + "loss": 0.6913, + "num_input_tokens_seen": 739489856, + "step": 4099 + }, + { + "epoch": 0.4488354908453981, + "grad_norm": 1.2779772292990483, + "learning_rate": 2.8998312319810482e-05, + "loss": 0.5013, + "num_input_tokens_seen": 739663008, + "step": 4100 + }, + { + "epoch": 0.44894496291633595, + "grad_norm": 1.1950622994850828, + "learning_rate": 2.8989824143015286e-05, + "loss": 0.7365, + "num_input_tokens_seen": 739844672, + "step": 4101 + }, + { + "epoch": 0.4490544349872739, + "grad_norm": 1.1050409358628106, + "learning_rate": 2.8981335494231533e-05, + "loss": 0.7275, + "num_input_tokens_seen": 740044256, + "step": 4102 + }, + { + "epoch": 0.4491639070582118, + "grad_norm": 1.3463005434003565, + "learning_rate": 2.8972846374463387e-05, + "loss": 0.899, + "num_input_tokens_seen": 740218976, + "step": 4103 + }, + { + "epoch": 0.44927337912914966, + "grad_norm": 1.2114171708587844, + "learning_rate": 2.896435678471512e-05, + "loss": 0.6348, + "num_input_tokens_seen": 740425056, + "step": 4104 + }, + { + "epoch": 0.4493828512000876, + "grad_norm": 1.1941238676672963, + "learning_rate": 2.895586672599102e-05, + "loss": 0.7896, + "num_input_tokens_seen": 740608288, + "step": 4105 + }, + { + "epoch": 0.4494923232710255, + "grad_norm": 1.338892295818998, + "learning_rate": 2.894737619929545e-05, + "loss": 0.7365, + "num_input_tokens_seen": 740798240, + "step": 4106 + }, + { + "epoch": 0.4496017953419634, + "grad_norm": 1.3675617250811911, + "learning_rate": 2.893888520563282e-05, + "loss": 0.9102, + "num_input_tokens_seen": 740975424, + "step": 4107 + }, + { + "epoch": 0.4497112674129013, + "grad_norm": 1.5579151789306218, + "learning_rate": 2.8930393746007606e-05, + "loss": 0.6858, + "num_input_tokens_seen": 741119232, + "step": 4108 + }, + { + "epoch": 0.44982073948383916, + "grad_norm": 1.2900748237082962, + "learning_rate": 2.8921901821424313e-05, + "loss": 0.6842, + "num_input_tokens_seen": 741298656, + "step": 4109 + }, + { + "epoch": 0.4499302115547771, + "grad_norm": 1.2854552689273502, + "learning_rate": 2.8913409432887546e-05, + "loss": 0.8468, + "num_input_tokens_seen": 741493088, + "step": 4110 + }, + { + "epoch": 0.450039683625715, + "grad_norm": 1.1859438889724665, + "learning_rate": 2.8904916581401913e-05, + "loss": 0.7091, + "num_input_tokens_seen": 741676992, + "step": 4111 + }, + { + "epoch": 0.45014915569665287, + "grad_norm": 1.1854907655714266, + "learning_rate": 2.8896423267972123e-05, + "loss": 0.7575, + "num_input_tokens_seen": 741867168, + "step": 4112 + }, + { + "epoch": 0.4502586277675908, + "grad_norm": 1.277587447411482, + "learning_rate": 2.8887929493602905e-05, + "loss": 0.7856, + "num_input_tokens_seen": 742041664, + "step": 4113 + }, + { + "epoch": 0.4503680998385287, + "grad_norm": 1.4362878249643234, + "learning_rate": 2.8879435259299065e-05, + "loss": 0.7802, + "num_input_tokens_seen": 742257600, + "step": 4114 + }, + { + "epoch": 0.4504775719094666, + "grad_norm": 1.0975896197421802, + "learning_rate": 2.8870940566065442e-05, + "loss": 0.6139, + "num_input_tokens_seen": 742423136, + "step": 4115 + }, + { + "epoch": 0.4505870439804045, + "grad_norm": 1.2801853501709182, + "learning_rate": 2.8862445414906953e-05, + "loss": 0.6623, + "num_input_tokens_seen": 742599648, + "step": 4116 + }, + { + "epoch": 0.4506965160513424, + "grad_norm": 1.3315052325081085, + "learning_rate": 2.8853949806828558e-05, + "loss": 0.8687, + "num_input_tokens_seen": 742803488, + "step": 4117 + }, + { + "epoch": 0.4508059881222803, + "grad_norm": 1.2328272906867812, + "learning_rate": 2.884545374283526e-05, + "loss": 0.6805, + "num_input_tokens_seen": 743003968, + "step": 4118 + }, + { + "epoch": 0.4509154601932182, + "grad_norm": 1.3638032717138413, + "learning_rate": 2.8836957223932137e-05, + "loss": 0.7523, + "num_input_tokens_seen": 743171744, + "step": 4119 + }, + { + "epoch": 0.45102493226415613, + "grad_norm": 1.2615258509330385, + "learning_rate": 2.8828460251124317e-05, + "loss": 0.7095, + "num_input_tokens_seen": 743366624, + "step": 4120 + }, + { + "epoch": 0.451134404335094, + "grad_norm": 1.2202959281782806, + "learning_rate": 2.881996282541697e-05, + "loss": 0.7463, + "num_input_tokens_seen": 743542688, + "step": 4121 + }, + { + "epoch": 0.4512438764060319, + "grad_norm": 1.2708850391479563, + "learning_rate": 2.8811464947815314e-05, + "loss": 0.5331, + "num_input_tokens_seen": 743706208, + "step": 4122 + }, + { + "epoch": 0.45135334847696984, + "grad_norm": 1.2381112737149915, + "learning_rate": 2.8802966619324645e-05, + "loss": 0.794, + "num_input_tokens_seen": 743927968, + "step": 4123 + }, + { + "epoch": 0.4514628205479077, + "grad_norm": 1.2563007344491615, + "learning_rate": 2.8794467840950295e-05, + "loss": 0.6574, + "num_input_tokens_seen": 744106720, + "step": 4124 + }, + { + "epoch": 0.4515722926188456, + "grad_norm": 1.1973303424214459, + "learning_rate": 2.8785968613697655e-05, + "loss": 0.5533, + "num_input_tokens_seen": 744302496, + "step": 4125 + }, + { + "epoch": 0.4516817646897835, + "grad_norm": 1.370714531864896, + "learning_rate": 2.877746893857216e-05, + "loss": 0.8233, + "num_input_tokens_seen": 744490432, + "step": 4126 + }, + { + "epoch": 0.4517912367607214, + "grad_norm": 1.3939716915474405, + "learning_rate": 2.8768968816579312e-05, + "loss": 0.7654, + "num_input_tokens_seen": 744665152, + "step": 4127 + }, + { + "epoch": 0.45190070883165934, + "grad_norm": 1.183035589871053, + "learning_rate": 2.8760468248724665e-05, + "loss": 0.6564, + "num_input_tokens_seen": 744849728, + "step": 4128 + }, + { + "epoch": 0.4520101809025972, + "grad_norm": 1.0883759489571196, + "learning_rate": 2.875196723601381e-05, + "loss": 0.5281, + "num_input_tokens_seen": 745029600, + "step": 4129 + }, + { + "epoch": 0.4521196529735351, + "grad_norm": 1.2730548494363714, + "learning_rate": 2.8743465779452394e-05, + "loss": 0.6389, + "num_input_tokens_seen": 745204096, + "step": 4130 + }, + { + "epoch": 0.45222912504447305, + "grad_norm": 1.314996162234896, + "learning_rate": 2.8734963880046145e-05, + "loss": 0.5771, + "num_input_tokens_seen": 745374336, + "step": 4131 + }, + { + "epoch": 0.4523385971154109, + "grad_norm": 1.2735180507529396, + "learning_rate": 2.8726461538800802e-05, + "loss": 0.5602, + "num_input_tokens_seen": 745508288, + "step": 4132 + }, + { + "epoch": 0.45244806918634883, + "grad_norm": 1.2086670787342069, + "learning_rate": 2.871795875672219e-05, + "loss": 0.5364, + "num_input_tokens_seen": 745704512, + "step": 4133 + }, + { + "epoch": 0.45255754125728676, + "grad_norm": 1.2813025673107785, + "learning_rate": 2.870945553481616e-05, + "loss": 0.6495, + "num_input_tokens_seen": 745868032, + "step": 4134 + }, + { + "epoch": 0.4526670133282246, + "grad_norm": 1.4160204236179468, + "learning_rate": 2.8700951874088634e-05, + "loss": 0.5812, + "num_input_tokens_seen": 746040960, + "step": 4135 + }, + { + "epoch": 0.45277648539916254, + "grad_norm": 1.3833179304498664, + "learning_rate": 2.869244777554557e-05, + "loss": 0.7929, + "num_input_tokens_seen": 746244576, + "step": 4136 + }, + { + "epoch": 0.45288595747010046, + "grad_norm": 1.394552278760829, + "learning_rate": 2.8683943240192997e-05, + "loss": 0.6314, + "num_input_tokens_seen": 746431840, + "step": 4137 + }, + { + "epoch": 0.45299542954103833, + "grad_norm": 1.4350726323902898, + "learning_rate": 2.867543826903698e-05, + "loss": 0.8459, + "num_input_tokens_seen": 746588416, + "step": 4138 + }, + { + "epoch": 0.45310490161197625, + "grad_norm": 1.5085164536794846, + "learning_rate": 2.866693286308364e-05, + "loss": 0.8173, + "num_input_tokens_seen": 746765152, + "step": 4139 + }, + { + "epoch": 0.4532143736829142, + "grad_norm": 1.3490650277183662, + "learning_rate": 2.8658427023339156e-05, + "loss": 0.755, + "num_input_tokens_seen": 746951072, + "step": 4140 + }, + { + "epoch": 0.45332384575385204, + "grad_norm": 1.3978569487973895, + "learning_rate": 2.864992075080975e-05, + "loss": 1.021, + "num_input_tokens_seen": 747156480, + "step": 4141 + }, + { + "epoch": 0.45343331782478996, + "grad_norm": 1.2348514072123362, + "learning_rate": 2.8641414046501697e-05, + "loss": 0.7432, + "num_input_tokens_seen": 747331648, + "step": 4142 + }, + { + "epoch": 0.4535427898957278, + "grad_norm": 1.1519509890780046, + "learning_rate": 2.8632906911421313e-05, + "loss": 0.7253, + "num_input_tokens_seen": 747502560, + "step": 4143 + }, + { + "epoch": 0.45365226196666575, + "grad_norm": 1.1812612381657872, + "learning_rate": 2.8624399346575e-05, + "loss": 0.7932, + "num_input_tokens_seen": 747707296, + "step": 4144 + }, + { + "epoch": 0.45376173403760367, + "grad_norm": 1.168129236939218, + "learning_rate": 2.861589135296917e-05, + "loss": 0.5306, + "num_input_tokens_seen": 747858944, + "step": 4145 + }, + { + "epoch": 0.45387120610854154, + "grad_norm": 1.2128729826646902, + "learning_rate": 2.8607382931610306e-05, + "loss": 0.6339, + "num_input_tokens_seen": 748054496, + "step": 4146 + }, + { + "epoch": 0.45398067817947946, + "grad_norm": 1.2366602543427279, + "learning_rate": 2.8598874083504933e-05, + "loss": 0.949, + "num_input_tokens_seen": 748261248, + "step": 4147 + }, + { + "epoch": 0.4540901502504174, + "grad_norm": 1.4464824168629569, + "learning_rate": 2.8590364809659632e-05, + "loss": 0.7696, + "num_input_tokens_seen": 748423424, + "step": 4148 + }, + { + "epoch": 0.45419962232135525, + "grad_norm": 1.3309636018392, + "learning_rate": 2.858185511108104e-05, + "loss": 0.7185, + "num_input_tokens_seen": 748609792, + "step": 4149 + }, + { + "epoch": 0.45430909439229317, + "grad_norm": 1.3455315689556806, + "learning_rate": 2.8573344988775834e-05, + "loss": 0.822, + "num_input_tokens_seen": 748801760, + "step": 4150 + }, + { + "epoch": 0.4544185664632311, + "grad_norm": 1.2034541583932745, + "learning_rate": 2.8564834443750753e-05, + "loss": 0.5067, + "num_input_tokens_seen": 748969536, + "step": 4151 + }, + { + "epoch": 0.45452803853416895, + "grad_norm": 1.3145343817752126, + "learning_rate": 2.8556323477012577e-05, + "loss": 0.6684, + "num_input_tokens_seen": 749141344, + "step": 4152 + }, + { + "epoch": 0.4546375106051069, + "grad_norm": 1.2208301203471708, + "learning_rate": 2.8547812089568128e-05, + "loss": 0.6541, + "num_input_tokens_seen": 749343168, + "step": 4153 + }, + { + "epoch": 0.4547469826760448, + "grad_norm": 1.3737802730900543, + "learning_rate": 2.8539300282424288e-05, + "loss": 0.8786, + "num_input_tokens_seen": 749514304, + "step": 4154 + }, + { + "epoch": 0.45485645474698266, + "grad_norm": 1.2000365807997555, + "learning_rate": 2.8530788056587993e-05, + "loss": 0.5559, + "num_input_tokens_seen": 749690592, + "step": 4155 + }, + { + "epoch": 0.4549659268179206, + "grad_norm": 1.3730117554602432, + "learning_rate": 2.852227541306622e-05, + "loss": 0.8524, + "num_input_tokens_seen": 749885248, + "step": 4156 + }, + { + "epoch": 0.4550753988888585, + "grad_norm": 1.4625269765077633, + "learning_rate": 2.851376235286599e-05, + "loss": 1.0456, + "num_input_tokens_seen": 750072736, + "step": 4157 + }, + { + "epoch": 0.4551848709597964, + "grad_norm": 1.3331359578114632, + "learning_rate": 2.85052488769944e-05, + "loss": 0.7398, + "num_input_tokens_seen": 750250816, + "step": 4158 + }, + { + "epoch": 0.4552943430307343, + "grad_norm": 1.242489232298709, + "learning_rate": 2.849673498645857e-05, + "loss": 0.9143, + "num_input_tokens_seen": 750441664, + "step": 4159 + }, + { + "epoch": 0.45540381510167216, + "grad_norm": 1.2360964011032614, + "learning_rate": 2.848822068226567e-05, + "loss": 0.4653, + "num_input_tokens_seen": 750596672, + "step": 4160 + }, + { + "epoch": 0.4555132871726101, + "grad_norm": 1.2439809843492815, + "learning_rate": 2.8479705965422937e-05, + "loss": 0.5955, + "num_input_tokens_seen": 750740480, + "step": 4161 + }, + { + "epoch": 0.455622759243548, + "grad_norm": 1.25067721142082, + "learning_rate": 2.8471190836937638e-05, + "loss": 0.7815, + "num_input_tokens_seen": 750924384, + "step": 4162 + }, + { + "epoch": 0.45573223131448587, + "grad_norm": 1.2171675850606503, + "learning_rate": 2.84626752978171e-05, + "loss": 0.5404, + "num_input_tokens_seen": 751105376, + "step": 4163 + }, + { + "epoch": 0.4558417033854238, + "grad_norm": 1.2678410473421673, + "learning_rate": 2.845415934906869e-05, + "loss": 0.7871, + "num_input_tokens_seen": 751283680, + "step": 4164 + }, + { + "epoch": 0.4559511754563617, + "grad_norm": 1.1273246694788726, + "learning_rate": 2.8445642991699835e-05, + "loss": 0.6632, + "num_input_tokens_seen": 751448992, + "step": 4165 + }, + { + "epoch": 0.4560606475272996, + "grad_norm": 1.4048841413059152, + "learning_rate": 2.8437126226718e-05, + "loss": 0.8513, + "num_input_tokens_seen": 751643648, + "step": 4166 + }, + { + "epoch": 0.4561701195982375, + "grad_norm": 1.3980918755527378, + "learning_rate": 2.8428609055130707e-05, + "loss": 0.9216, + "num_input_tokens_seen": 751836736, + "step": 4167 + }, + { + "epoch": 0.4562795916691754, + "grad_norm": 1.2650215688381976, + "learning_rate": 2.8420091477945514e-05, + "loss": 0.6976, + "num_input_tokens_seen": 752004736, + "step": 4168 + }, + { + "epoch": 0.4563890637401133, + "grad_norm": 1.2477577602914818, + "learning_rate": 2.8411573496170034e-05, + "loss": 0.7811, + "num_input_tokens_seen": 752211712, + "step": 4169 + }, + { + "epoch": 0.4564985358110512, + "grad_norm": 1.3665225782654231, + "learning_rate": 2.840305511081194e-05, + "loss": 0.796, + "num_input_tokens_seen": 752391136, + "step": 4170 + }, + { + "epoch": 0.45660800788198913, + "grad_norm": 1.2057840267523627, + "learning_rate": 2.8394536322878916e-05, + "loss": 0.5451, + "num_input_tokens_seen": 752544128, + "step": 4171 + }, + { + "epoch": 0.456717479952927, + "grad_norm": 1.2595499016635183, + "learning_rate": 2.838601713337875e-05, + "loss": 0.8127, + "num_input_tokens_seen": 752744160, + "step": 4172 + }, + { + "epoch": 0.4568269520238649, + "grad_norm": 1.4750977370645046, + "learning_rate": 2.8377497543319227e-05, + "loss": 0.8205, + "num_input_tokens_seen": 752907008, + "step": 4173 + }, + { + "epoch": 0.45693642409480284, + "grad_norm": 1.2341591368623044, + "learning_rate": 2.8368977553708198e-05, + "loss": 0.8, + "num_input_tokens_seen": 753096288, + "step": 4174 + }, + { + "epoch": 0.4570458961657407, + "grad_norm": 1.1664399823716953, + "learning_rate": 2.836045716555357e-05, + "loss": 0.739, + "num_input_tokens_seen": 753271456, + "step": 4175 + }, + { + "epoch": 0.4571553682366786, + "grad_norm": 1.0743186765490227, + "learning_rate": 2.835193637986328e-05, + "loss": 0.4571, + "num_input_tokens_seen": 753479776, + "step": 4176 + }, + { + "epoch": 0.4572648403076165, + "grad_norm": 1.1095538465301784, + "learning_rate": 2.8343415197645317e-05, + "loss": 0.5259, + "num_input_tokens_seen": 753660320, + "step": 4177 + }, + { + "epoch": 0.4573743123785544, + "grad_norm": 1.1759759988800862, + "learning_rate": 2.8334893619907737e-05, + "loss": 0.5846, + "num_input_tokens_seen": 753830784, + "step": 4178 + }, + { + "epoch": 0.45748378444949234, + "grad_norm": 1.324000427793451, + "learning_rate": 2.8326371647658618e-05, + "loss": 0.8426, + "num_input_tokens_seen": 754016480, + "step": 4179 + }, + { + "epoch": 0.4575932565204302, + "grad_norm": 1.268119943081327, + "learning_rate": 2.831784928190609e-05, + "loss": 0.6645, + "num_input_tokens_seen": 754192768, + "step": 4180 + }, + { + "epoch": 0.4577027285913681, + "grad_norm": 1.1263118618520451, + "learning_rate": 2.8309326523658324e-05, + "loss": 0.5332, + "num_input_tokens_seen": 754377120, + "step": 4181 + }, + { + "epoch": 0.45781220066230605, + "grad_norm": 1.3690212486128337, + "learning_rate": 2.830080337392357e-05, + "loss": 0.7576, + "num_input_tokens_seen": 754573568, + "step": 4182 + }, + { + "epoch": 0.4579216727332439, + "grad_norm": 1.4577628336763313, + "learning_rate": 2.8292279833710084e-05, + "loss": 0.9079, + "num_input_tokens_seen": 754772256, + "step": 4183 + }, + { + "epoch": 0.45803114480418183, + "grad_norm": 1.1385579649284234, + "learning_rate": 2.828375590402618e-05, + "loss": 0.5969, + "num_input_tokens_seen": 754943840, + "step": 4184 + }, + { + "epoch": 0.45814061687511975, + "grad_norm": 1.367009832554737, + "learning_rate": 2.8275231585880236e-05, + "loss": 0.7089, + "num_input_tokens_seen": 755118112, + "step": 4185 + }, + { + "epoch": 0.4582500889460576, + "grad_norm": 1.394065971652379, + "learning_rate": 2.826670688028066e-05, + "loss": 0.7822, + "num_input_tokens_seen": 755281632, + "step": 4186 + }, + { + "epoch": 0.45835956101699554, + "grad_norm": 1.2202012640238256, + "learning_rate": 2.8258181788235906e-05, + "loss": 0.8449, + "num_input_tokens_seen": 755453888, + "step": 4187 + }, + { + "epoch": 0.45846903308793346, + "grad_norm": 1.4534554054602504, + "learning_rate": 2.824965631075447e-05, + "loss": 0.7468, + "num_input_tokens_seen": 755618976, + "step": 4188 + }, + { + "epoch": 0.45857850515887133, + "grad_norm": 1.1984601930063785, + "learning_rate": 2.8241130448844905e-05, + "loss": 0.5656, + "num_input_tokens_seen": 755802432, + "step": 4189 + }, + { + "epoch": 0.45868797722980925, + "grad_norm": 1.3493398571164537, + "learning_rate": 2.82326042035158e-05, + "loss": 0.7551, + "num_input_tokens_seen": 755932352, + "step": 4190 + }, + { + "epoch": 0.4587974493007472, + "grad_norm": 1.2502578946150191, + "learning_rate": 2.8224077575775803e-05, + "loss": 0.6234, + "num_input_tokens_seen": 756128352, + "step": 4191 + }, + { + "epoch": 0.45890692137168504, + "grad_norm": 1.34932223189886, + "learning_rate": 2.8215550566633588e-05, + "loss": 0.8083, + "num_input_tokens_seen": 756295680, + "step": 4192 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 1.456401781022707, + "learning_rate": 2.820702317709789e-05, + "loss": 0.7964, + "num_input_tokens_seen": 756443968, + "step": 4193 + }, + { + "epoch": 0.4591258655135608, + "grad_norm": 1.1897004971146217, + "learning_rate": 2.8198495408177484e-05, + "loss": 0.5165, + "num_input_tokens_seen": 756595840, + "step": 4194 + }, + { + "epoch": 0.45923533758449875, + "grad_norm": 1.2685945312257032, + "learning_rate": 2.8189967260881183e-05, + "loss": 0.6723, + "num_input_tokens_seen": 756772800, + "step": 4195 + }, + { + "epoch": 0.45934480965543667, + "grad_norm": 1.4054728585862248, + "learning_rate": 2.818143873621785e-05, + "loss": 0.6793, + "num_input_tokens_seen": 756963424, + "step": 4196 + }, + { + "epoch": 0.45945428172637454, + "grad_norm": 1.4732242369212494, + "learning_rate": 2.8172909835196404e-05, + "loss": 0.6344, + "num_input_tokens_seen": 757089984, + "step": 4197 + }, + { + "epoch": 0.45956375379731246, + "grad_norm": 1.366679505931432, + "learning_rate": 2.8164380558825782e-05, + "loss": 0.7127, + "num_input_tokens_seen": 757284640, + "step": 4198 + }, + { + "epoch": 0.4596732258682504, + "grad_norm": 1.203219422437522, + "learning_rate": 2.8155850908114996e-05, + "loss": 0.6374, + "num_input_tokens_seen": 757471008, + "step": 4199 + }, + { + "epoch": 0.45978269793918825, + "grad_norm": 1.2543648805005774, + "learning_rate": 2.814732088407308e-05, + "loss": 0.5336, + "num_input_tokens_seen": 757610560, + "step": 4200 + }, + { + "epoch": 0.45989217001012617, + "grad_norm": 1.361253470914705, + "learning_rate": 2.8138790487709115e-05, + "loss": 0.9575, + "num_input_tokens_seen": 757804992, + "step": 4201 + }, + { + "epoch": 0.4600016420810641, + "grad_norm": 1.2054486315686912, + "learning_rate": 2.8130259720032237e-05, + "loss": 0.5293, + "num_input_tokens_seen": 757957312, + "step": 4202 + }, + { + "epoch": 0.46011111415200195, + "grad_norm": 1.2420072124812351, + "learning_rate": 2.812172858205162e-05, + "loss": 0.639, + "num_input_tokens_seen": 758151744, + "step": 4203 + }, + { + "epoch": 0.4602205862229399, + "grad_norm": 1.2818858267012117, + "learning_rate": 2.8113197074776477e-05, + "loss": 0.6157, + "num_input_tokens_seen": 758309888, + "step": 4204 + }, + { + "epoch": 0.4603300582938778, + "grad_norm": 1.4501794473827045, + "learning_rate": 2.8104665199216074e-05, + "loss": 0.9556, + "num_input_tokens_seen": 758483936, + "step": 4205 + }, + { + "epoch": 0.46043953036481566, + "grad_norm": 1.36479059737087, + "learning_rate": 2.809613295637971e-05, + "loss": 0.7719, + "num_input_tokens_seen": 758672320, + "step": 4206 + }, + { + "epoch": 0.4605490024357536, + "grad_norm": 1.2816974044821146, + "learning_rate": 2.8087600347276744e-05, + "loss": 0.7458, + "num_input_tokens_seen": 758860032, + "step": 4207 + }, + { + "epoch": 0.4606584745066915, + "grad_norm": 1.2272843101866802, + "learning_rate": 2.8079067372916555e-05, + "loss": 0.5115, + "num_input_tokens_seen": 759009440, + "step": 4208 + }, + { + "epoch": 0.4607679465776294, + "grad_norm": 1.3232083587263694, + "learning_rate": 2.8070534034308583e-05, + "loss": 0.8742, + "num_input_tokens_seen": 759190656, + "step": 4209 + }, + { + "epoch": 0.4608774186485673, + "grad_norm": 1.1811047560593733, + "learning_rate": 2.8062000332462302e-05, + "loss": 0.5874, + "num_input_tokens_seen": 759398528, + "step": 4210 + }, + { + "epoch": 0.46098689071950516, + "grad_norm": 1.3814402954062233, + "learning_rate": 2.805346626838723e-05, + "loss": 0.7034, + "num_input_tokens_seen": 759572128, + "step": 4211 + }, + { + "epoch": 0.4610963627904431, + "grad_norm": 1.383028821673831, + "learning_rate": 2.8044931843092948e-05, + "loss": 1.0248, + "num_input_tokens_seen": 759758272, + "step": 4212 + }, + { + "epoch": 0.461205834861381, + "grad_norm": 1.3300931820458313, + "learning_rate": 2.8036397057589038e-05, + "loss": 0.7229, + "num_input_tokens_seen": 759958528, + "step": 4213 + }, + { + "epoch": 0.46131530693231887, + "grad_norm": 1.3504369381837977, + "learning_rate": 2.8027861912885168e-05, + "loss": 0.6246, + "num_input_tokens_seen": 760135264, + "step": 4214 + }, + { + "epoch": 0.4614247790032568, + "grad_norm": 1.248714596740037, + "learning_rate": 2.801932640999102e-05, + "loss": 0.7071, + "num_input_tokens_seen": 760295200, + "step": 4215 + }, + { + "epoch": 0.4615342510741947, + "grad_norm": 1.2968828591717578, + "learning_rate": 2.8010790549916333e-05, + "loss": 0.7642, + "num_input_tokens_seen": 760491872, + "step": 4216 + }, + { + "epoch": 0.4616437231451326, + "grad_norm": 1.2382129445776644, + "learning_rate": 2.8002254333670874e-05, + "loss": 0.6831, + "num_input_tokens_seen": 760671520, + "step": 4217 + }, + { + "epoch": 0.4617531952160705, + "grad_norm": 1.187406887620197, + "learning_rate": 2.7993717762264464e-05, + "loss": 0.835, + "num_input_tokens_seen": 760858784, + "step": 4218 + }, + { + "epoch": 0.4618626672870084, + "grad_norm": 1.1704444800648626, + "learning_rate": 2.7985180836706975e-05, + "loss": 0.7763, + "num_input_tokens_seen": 761024096, + "step": 4219 + }, + { + "epoch": 0.4619721393579463, + "grad_norm": 1.2623808927344649, + "learning_rate": 2.7976643558008297e-05, + "loss": 0.5668, + "num_input_tokens_seen": 761196576, + "step": 4220 + }, + { + "epoch": 0.4620816114288842, + "grad_norm": 1.298956108958102, + "learning_rate": 2.7968105927178372e-05, + "loss": 0.6902, + "num_input_tokens_seen": 761399744, + "step": 4221 + }, + { + "epoch": 0.46219108349982213, + "grad_norm": 1.2983750549798236, + "learning_rate": 2.7959567945227195e-05, + "loss": 0.7692, + "num_input_tokens_seen": 761567968, + "step": 4222 + }, + { + "epoch": 0.46230055557076, + "grad_norm": 1.11139967005946, + "learning_rate": 2.7951029613164782e-05, + "loss": 0.603, + "num_input_tokens_seen": 761750528, + "step": 4223 + }, + { + "epoch": 0.4624100276416979, + "grad_norm": 1.3648266708798977, + "learning_rate": 2.7942490932001212e-05, + "loss": 0.8188, + "num_input_tokens_seen": 761936000, + "step": 4224 + }, + { + "epoch": 0.46251949971263584, + "grad_norm": 1.293935869290424, + "learning_rate": 2.7933951902746587e-05, + "loss": 0.5685, + "num_input_tokens_seen": 762116096, + "step": 4225 + }, + { + "epoch": 0.4626289717835737, + "grad_norm": 1.1692557500306755, + "learning_rate": 2.792541252641106e-05, + "loss": 0.7955, + "num_input_tokens_seen": 762314336, + "step": 4226 + }, + { + "epoch": 0.4627384438545116, + "grad_norm": 1.312745343846796, + "learning_rate": 2.791687280400483e-05, + "loss": 0.8521, + "num_input_tokens_seen": 762476736, + "step": 4227 + }, + { + "epoch": 0.4628479159254495, + "grad_norm": 1.235551104808204, + "learning_rate": 2.790833273653812e-05, + "loss": 0.8275, + "num_input_tokens_seen": 762664896, + "step": 4228 + }, + { + "epoch": 0.4629573879963874, + "grad_norm": 1.2783263274095742, + "learning_rate": 2.7899792325021207e-05, + "loss": 0.8029, + "num_input_tokens_seen": 762828192, + "step": 4229 + }, + { + "epoch": 0.46306686006732534, + "grad_norm": 1.3189562089431246, + "learning_rate": 2.7891251570464406e-05, + "loss": 0.8204, + "num_input_tokens_seen": 763015904, + "step": 4230 + }, + { + "epoch": 0.4631763321382632, + "grad_norm": 1.2940059777199875, + "learning_rate": 2.788271047387807e-05, + "loss": 0.7079, + "num_input_tokens_seen": 763202496, + "step": 4231 + }, + { + "epoch": 0.4632858042092011, + "grad_norm": 1.3705357125106108, + "learning_rate": 2.7874169036272597e-05, + "loss": 0.6894, + "num_input_tokens_seen": 763336672, + "step": 4232 + }, + { + "epoch": 0.46339527628013905, + "grad_norm": 1.3336218233734836, + "learning_rate": 2.7865627258658417e-05, + "loss": 0.6564, + "num_input_tokens_seen": 763531776, + "step": 4233 + }, + { + "epoch": 0.4635047483510769, + "grad_norm": 1.3311398563578691, + "learning_rate": 2.7857085142046013e-05, + "loss": 0.676, + "num_input_tokens_seen": 763696416, + "step": 4234 + }, + { + "epoch": 0.46361422042201483, + "grad_norm": 1.1828993973250195, + "learning_rate": 2.78485426874459e-05, + "loss": 0.6123, + "num_input_tokens_seen": 763880768, + "step": 4235 + }, + { + "epoch": 0.46372369249295275, + "grad_norm": 1.0938519021656725, + "learning_rate": 2.783999989586863e-05, + "loss": 0.6461, + "num_input_tokens_seen": 764069152, + "step": 4236 + }, + { + "epoch": 0.4638331645638906, + "grad_norm": 1.215013253018032, + "learning_rate": 2.7831456768324805e-05, + "loss": 0.7703, + "num_input_tokens_seen": 764253504, + "step": 4237 + }, + { + "epoch": 0.46394263663482854, + "grad_norm": 1.2724442147878268, + "learning_rate": 2.7822913305825054e-05, + "loss": 0.8396, + "num_input_tokens_seen": 764451296, + "step": 4238 + }, + { + "epoch": 0.46405210870576646, + "grad_norm": 1.376821005685037, + "learning_rate": 2.7814369509380055e-05, + "loss": 0.8019, + "num_input_tokens_seen": 764630048, + "step": 4239 + }, + { + "epoch": 0.46416158077670433, + "grad_norm": 1.3829602092582203, + "learning_rate": 2.7805825380000528e-05, + "loss": 0.733, + "num_input_tokens_seen": 764801184, + "step": 4240 + }, + { + "epoch": 0.46427105284764225, + "grad_norm": 1.1879337923886486, + "learning_rate": 2.779728091869722e-05, + "loss": 0.7258, + "num_input_tokens_seen": 764970528, + "step": 4241 + }, + { + "epoch": 0.4643805249185802, + "grad_norm": 1.2341935712537793, + "learning_rate": 2.778873612648093e-05, + "loss": 0.8455, + "num_input_tokens_seen": 765174816, + "step": 4242 + }, + { + "epoch": 0.46448999698951804, + "grad_norm": 1.3127963321946052, + "learning_rate": 2.778019100436248e-05, + "loss": 0.7934, + "num_input_tokens_seen": 765373280, + "step": 4243 + }, + { + "epoch": 0.46459946906045596, + "grad_norm": 1.2977076236079679, + "learning_rate": 2.7771645553352753e-05, + "loss": 0.6965, + "num_input_tokens_seen": 765577568, + "step": 4244 + }, + { + "epoch": 0.4647089411313938, + "grad_norm": 1.2633256157779407, + "learning_rate": 2.7763099774462646e-05, + "loss": 0.5751, + "num_input_tokens_seen": 765738624, + "step": 4245 + }, + { + "epoch": 0.46481841320233175, + "grad_norm": 1.1165016042831928, + "learning_rate": 2.775455366870313e-05, + "loss": 0.6169, + "num_input_tokens_seen": 765927008, + "step": 4246 + }, + { + "epoch": 0.46492788527326967, + "grad_norm": 1.1620446451576107, + "learning_rate": 2.774600723708518e-05, + "loss": 0.6674, + "num_input_tokens_seen": 766137792, + "step": 4247 + }, + { + "epoch": 0.46503735734420754, + "grad_norm": 1.3231495279337022, + "learning_rate": 2.7737460480619827e-05, + "loss": 0.6421, + "num_input_tokens_seen": 766287424, + "step": 4248 + }, + { + "epoch": 0.46514682941514546, + "grad_norm": 1.1214201980240028, + "learning_rate": 2.7728913400318125e-05, + "loss": 0.6302, + "num_input_tokens_seen": 766499328, + "step": 4249 + }, + { + "epoch": 0.4652563014860834, + "grad_norm": 1.2061724524203503, + "learning_rate": 2.7720365997191188e-05, + "loss": 0.6673, + "num_input_tokens_seen": 766685920, + "step": 4250 + }, + { + "epoch": 0.46536577355702125, + "grad_norm": 1.3158912443656927, + "learning_rate": 2.7711818272250152e-05, + "loss": 0.8285, + "num_input_tokens_seen": 766883040, + "step": 4251 + }, + { + "epoch": 0.46547524562795917, + "grad_norm": 1.21019540012931, + "learning_rate": 2.7703270226506196e-05, + "loss": 0.6953, + "num_input_tokens_seen": 767028640, + "step": 4252 + }, + { + "epoch": 0.4655847176988971, + "grad_norm": 1.14887431654019, + "learning_rate": 2.769472186097054e-05, + "loss": 0.6327, + "num_input_tokens_seen": 767231360, + "step": 4253 + }, + { + "epoch": 0.46569418976983495, + "grad_norm": 1.1949041828542981, + "learning_rate": 2.7686173176654446e-05, + "loss": 0.6156, + "num_input_tokens_seen": 767412576, + "step": 4254 + }, + { + "epoch": 0.4658036618407729, + "grad_norm": 1.2641014536359547, + "learning_rate": 2.7677624174569187e-05, + "loss": 0.8213, + "num_input_tokens_seen": 767595808, + "step": 4255 + }, + { + "epoch": 0.4659131339117108, + "grad_norm": 1.3712512412191666, + "learning_rate": 2.766907485572612e-05, + "loss": 0.5815, + "num_input_tokens_seen": 767783520, + "step": 4256 + }, + { + "epoch": 0.46602260598264866, + "grad_norm": 1.1528251030538852, + "learning_rate": 2.7660525221136595e-05, + "loss": 0.5881, + "num_input_tokens_seen": 767953984, + "step": 4257 + }, + { + "epoch": 0.4661320780535866, + "grad_norm": 1.3587497720305841, + "learning_rate": 2.7651975271812026e-05, + "loss": 0.9978, + "num_input_tokens_seen": 768112352, + "step": 4258 + }, + { + "epoch": 0.4662415501245245, + "grad_norm": 1.2607331574794014, + "learning_rate": 2.7643425008763845e-05, + "loss": 0.6524, + "num_input_tokens_seen": 768274976, + "step": 4259 + }, + { + "epoch": 0.4663510221954624, + "grad_norm": 1.3087883192571759, + "learning_rate": 2.7634874433003545e-05, + "loss": 0.7458, + "num_input_tokens_seen": 768426848, + "step": 4260 + }, + { + "epoch": 0.4664604942664003, + "grad_norm": 1.3315418944233839, + "learning_rate": 2.762632354554264e-05, + "loss": 0.9385, + "num_input_tokens_seen": 768608288, + "step": 4261 + }, + { + "epoch": 0.46656996633733816, + "grad_norm": 1.293606050748245, + "learning_rate": 2.7617772347392672e-05, + "loss": 0.793, + "num_input_tokens_seen": 768771584, + "step": 4262 + }, + { + "epoch": 0.4666794384082761, + "grad_norm": 1.2724442616304648, + "learning_rate": 2.760922083956525e-05, + "loss": 0.6927, + "num_input_tokens_seen": 768930400, + "step": 4263 + }, + { + "epoch": 0.466788910479214, + "grad_norm": 1.3741831086846297, + "learning_rate": 2.7600669023071978e-05, + "loss": 0.6679, + "num_input_tokens_seen": 769113632, + "step": 4264 + }, + { + "epoch": 0.46689838255015187, + "grad_norm": 1.168181741233421, + "learning_rate": 2.7592116898924537e-05, + "loss": 0.5807, + "num_input_tokens_seen": 769252512, + "step": 4265 + }, + { + "epoch": 0.4670078546210898, + "grad_norm": 1.4159937363609962, + "learning_rate": 2.7583564468134615e-05, + "loss": 0.7771, + "num_input_tokens_seen": 769449408, + "step": 4266 + }, + { + "epoch": 0.4671173266920277, + "grad_norm": 4.224063719890171, + "learning_rate": 2.7575011731713968e-05, + "loss": 1.1046, + "num_input_tokens_seen": 769641600, + "step": 4267 + }, + { + "epoch": 0.4672267987629656, + "grad_norm": 1.2798962768610216, + "learning_rate": 2.756645869067435e-05, + "loss": 0.7965, + "num_input_tokens_seen": 769851488, + "step": 4268 + }, + { + "epoch": 0.4673362708339035, + "grad_norm": 1.249300522600042, + "learning_rate": 2.7557905346027578e-05, + "loss": 0.7947, + "num_input_tokens_seen": 770039648, + "step": 4269 + }, + { + "epoch": 0.4674457429048414, + "grad_norm": 1.1650307232157795, + "learning_rate": 2.7549351698785492e-05, + "loss": 0.6467, + "num_input_tokens_seen": 770225344, + "step": 4270 + }, + { + "epoch": 0.4675552149757793, + "grad_norm": 1.224197391404026, + "learning_rate": 2.7540797749959974e-05, + "loss": 0.552, + "num_input_tokens_seen": 770402976, + "step": 4271 + }, + { + "epoch": 0.4676646870467172, + "grad_norm": 1.2499016723106446, + "learning_rate": 2.753224350056293e-05, + "loss": 0.8069, + "num_input_tokens_seen": 770615104, + "step": 4272 + }, + { + "epoch": 0.46777415911765513, + "grad_norm": 1.1093427626533061, + "learning_rate": 2.7523688951606337e-05, + "loss": 0.6014, + "num_input_tokens_seen": 770809984, + "step": 4273 + }, + { + "epoch": 0.467883631188593, + "grad_norm": 1.0893583686684003, + "learning_rate": 2.751513410410216e-05, + "loss": 0.6194, + "num_input_tokens_seen": 770975968, + "step": 4274 + }, + { + "epoch": 0.4679931032595309, + "grad_norm": 1.2920704282337372, + "learning_rate": 2.7506578959062424e-05, + "loss": 0.8015, + "num_input_tokens_seen": 771149344, + "step": 4275 + }, + { + "epoch": 0.46810257533046884, + "grad_norm": 1.1291570681812686, + "learning_rate": 2.7498023517499183e-05, + "loss": 0.6043, + "num_input_tokens_seen": 771318912, + "step": 4276 + }, + { + "epoch": 0.4682120474014067, + "grad_norm": 1.2988757131936373, + "learning_rate": 2.7489467780424544e-05, + "loss": 0.8178, + "num_input_tokens_seen": 771494304, + "step": 4277 + }, + { + "epoch": 0.4683215194723446, + "grad_norm": 1.2631388133943875, + "learning_rate": 2.7480911748850624e-05, + "loss": 0.7673, + "num_input_tokens_seen": 771662976, + "step": 4278 + }, + { + "epoch": 0.4684309915432825, + "grad_norm": 1.244140193825352, + "learning_rate": 2.7472355423789582e-05, + "loss": 0.6127, + "num_input_tokens_seen": 771844640, + "step": 4279 + }, + { + "epoch": 0.4685404636142204, + "grad_norm": 1.1991988686768105, + "learning_rate": 2.746379880625362e-05, + "loss": 0.5756, + "num_input_tokens_seen": 772007488, + "step": 4280 + }, + { + "epoch": 0.46864993568515834, + "grad_norm": 1.3445637035720015, + "learning_rate": 2.7455241897254974e-05, + "loss": 0.6482, + "num_input_tokens_seen": 772185792, + "step": 4281 + }, + { + "epoch": 0.4687594077560962, + "grad_norm": 1.2877425400509168, + "learning_rate": 2.7446684697805907e-05, + "loss": 0.6445, + "num_input_tokens_seen": 772344160, + "step": 4282 + }, + { + "epoch": 0.4688688798270341, + "grad_norm": 1.3389390832919863, + "learning_rate": 2.743812720891872e-05, + "loss": 0.6815, + "num_input_tokens_seen": 772547776, + "step": 4283 + }, + { + "epoch": 0.46897835189797205, + "grad_norm": 1.072813745148467, + "learning_rate": 2.742956943160574e-05, + "loss": 0.5607, + "num_input_tokens_seen": 772730336, + "step": 4284 + }, + { + "epoch": 0.4690878239689099, + "grad_norm": 1.3123505371005448, + "learning_rate": 2.742101136687934e-05, + "loss": 0.8053, + "num_input_tokens_seen": 772911776, + "step": 4285 + }, + { + "epoch": 0.46919729603984783, + "grad_norm": 1.2796263758182536, + "learning_rate": 2.7412453015751916e-05, + "loss": 0.5849, + "num_input_tokens_seen": 773125696, + "step": 4286 + }, + { + "epoch": 0.46930676811078575, + "grad_norm": 1.440942870713109, + "learning_rate": 2.7403894379235916e-05, + "loss": 1.0683, + "num_input_tokens_seen": 773328864, + "step": 4287 + }, + { + "epoch": 0.4694162401817236, + "grad_norm": 1.1563531984206559, + "learning_rate": 2.7395335458343813e-05, + "loss": 0.5099, + "num_input_tokens_seen": 773491488, + "step": 4288 + }, + { + "epoch": 0.46952571225266154, + "grad_norm": 1.4089035429795789, + "learning_rate": 2.7386776254088103e-05, + "loss": 0.7875, + "num_input_tokens_seen": 773648512, + "step": 4289 + }, + { + "epoch": 0.46963518432359946, + "grad_norm": 1.1837512122433023, + "learning_rate": 2.7378216767481322e-05, + "loss": 0.5539, + "num_input_tokens_seen": 773823680, + "step": 4290 + }, + { + "epoch": 0.46974465639453733, + "grad_norm": 1.4104331634924072, + "learning_rate": 2.736965699953605e-05, + "loss": 0.6998, + "num_input_tokens_seen": 774021248, + "step": 4291 + }, + { + "epoch": 0.46985412846547525, + "grad_norm": 1.2165434498425938, + "learning_rate": 2.7361096951264882e-05, + "loss": 0.8498, + "num_input_tokens_seen": 774221952, + "step": 4292 + }, + { + "epoch": 0.4699636005364132, + "grad_norm": 1.2203163938179364, + "learning_rate": 2.7352536623680454e-05, + "loss": 0.7234, + "num_input_tokens_seen": 774398688, + "step": 4293 + }, + { + "epoch": 0.47007307260735104, + "grad_norm": 1.507228678995574, + "learning_rate": 2.7343976017795443e-05, + "loss": 0.7814, + "num_input_tokens_seen": 774533088, + "step": 4294 + }, + { + "epoch": 0.47018254467828896, + "grad_norm": 1.1859692696197321, + "learning_rate": 2.7335415134622548e-05, + "loss": 0.6258, + "num_input_tokens_seen": 774680928, + "step": 4295 + }, + { + "epoch": 0.4702920167492268, + "grad_norm": 1.3226734698775, + "learning_rate": 2.732685397517451e-05, + "loss": 0.8843, + "num_input_tokens_seen": 774872000, + "step": 4296 + }, + { + "epoch": 0.47040148882016475, + "grad_norm": 1.348949442162786, + "learning_rate": 2.731829254046409e-05, + "loss": 0.8768, + "num_input_tokens_seen": 775069344, + "step": 4297 + }, + { + "epoch": 0.47051096089110267, + "grad_norm": 1.3002838466781275, + "learning_rate": 2.7309730831504105e-05, + "loss": 0.6972, + "num_input_tokens_seen": 775241824, + "step": 4298 + }, + { + "epoch": 0.47062043296204054, + "grad_norm": 1.1763314008771661, + "learning_rate": 2.7301168849307364e-05, + "loss": 0.8387, + "num_input_tokens_seen": 775444768, + "step": 4299 + }, + { + "epoch": 0.47072990503297846, + "grad_norm": 1.261065854484943, + "learning_rate": 2.7292606594886756e-05, + "loss": 0.7989, + "num_input_tokens_seen": 775631360, + "step": 4300 + }, + { + "epoch": 0.4708393771039164, + "grad_norm": 1.094767832594905, + "learning_rate": 2.728404406925517e-05, + "loss": 0.7526, + "num_input_tokens_seen": 775824448, + "step": 4301 + }, + { + "epoch": 0.47094884917485424, + "grad_norm": 1.5190275872634142, + "learning_rate": 2.727548127342554e-05, + "loss": 0.6731, + "num_input_tokens_seen": 776024928, + "step": 4302 + }, + { + "epoch": 0.47105832124579217, + "grad_norm": 1.1692213403412335, + "learning_rate": 2.7266918208410824e-05, + "loss": 0.7333, + "num_input_tokens_seen": 776206592, + "step": 4303 + }, + { + "epoch": 0.4711677933167301, + "grad_norm": 1.1719917747807826, + "learning_rate": 2.7258354875224014e-05, + "loss": 0.6393, + "num_input_tokens_seen": 776357568, + "step": 4304 + }, + { + "epoch": 0.47127726538766795, + "grad_norm": 1.334375550316869, + "learning_rate": 2.7249791274878146e-05, + "loss": 0.778, + "num_input_tokens_seen": 776530944, + "step": 4305 + }, + { + "epoch": 0.4713867374586059, + "grad_norm": 1.3539961145055064, + "learning_rate": 2.724122740838626e-05, + "loss": 0.6682, + "num_input_tokens_seen": 776713952, + "step": 4306 + }, + { + "epoch": 0.4714962095295438, + "grad_norm": 1.4045202000741683, + "learning_rate": 2.723266327676146e-05, + "loss": 0.8789, + "num_input_tokens_seen": 776894496, + "step": 4307 + }, + { + "epoch": 0.47160568160048166, + "grad_norm": 1.3563752235227498, + "learning_rate": 2.722409888101686e-05, + "loss": 0.6702, + "num_input_tokens_seen": 777080416, + "step": 4308 + }, + { + "epoch": 0.4717151536714196, + "grad_norm": 1.1416108038848845, + "learning_rate": 2.7215534222165622e-05, + "loss": 0.8059, + "num_input_tokens_seen": 777285152, + "step": 4309 + }, + { + "epoch": 0.4718246257423575, + "grad_norm": 1.296370948365118, + "learning_rate": 2.720696930122092e-05, + "loss": 0.7218, + "num_input_tokens_seen": 777477792, + "step": 4310 + }, + { + "epoch": 0.4719340978132954, + "grad_norm": 1.187880003510657, + "learning_rate": 2.7198404119195965e-05, + "loss": 0.9108, + "num_input_tokens_seen": 777660800, + "step": 4311 + }, + { + "epoch": 0.4720435698842333, + "grad_norm": 1.2945100753792858, + "learning_rate": 2.718983867710401e-05, + "loss": 0.8167, + "num_input_tokens_seen": 777846048, + "step": 4312 + }, + { + "epoch": 0.47215304195517116, + "grad_norm": 1.1190907731861481, + "learning_rate": 2.7181272975958318e-05, + "loss": 0.5095, + "num_input_tokens_seen": 778036000, + "step": 4313 + }, + { + "epoch": 0.4722625140261091, + "grad_norm": 1.1494707921628642, + "learning_rate": 2.717270701677221e-05, + "loss": 0.6635, + "num_input_tokens_seen": 778228192, + "step": 4314 + }, + { + "epoch": 0.472371986097047, + "grad_norm": 1.2551234151942088, + "learning_rate": 2.7164140800559013e-05, + "loss": 0.7656, + "num_input_tokens_seen": 778402912, + "step": 4315 + }, + { + "epoch": 0.47248145816798487, + "grad_norm": 1.0867902851487667, + "learning_rate": 2.7155574328332095e-05, + "loss": 0.6086, + "num_input_tokens_seen": 778608544, + "step": 4316 + }, + { + "epoch": 0.4725909302389228, + "grad_norm": 1.265050310182358, + "learning_rate": 2.7147007601104858e-05, + "loss": 0.6855, + "num_input_tokens_seen": 778766016, + "step": 4317 + }, + { + "epoch": 0.4727004023098607, + "grad_norm": 1.4518932741733614, + "learning_rate": 2.713844061989072e-05, + "loss": 0.7241, + "num_input_tokens_seen": 778937152, + "step": 4318 + }, + { + "epoch": 0.4728098743807986, + "grad_norm": 1.2339973354836358, + "learning_rate": 2.7129873385703146e-05, + "loss": 0.8103, + "num_input_tokens_seen": 779109856, + "step": 4319 + }, + { + "epoch": 0.4729193464517365, + "grad_norm": 1.271284471492709, + "learning_rate": 2.712130589955562e-05, + "loss": 0.6347, + "num_input_tokens_seen": 779272256, + "step": 4320 + }, + { + "epoch": 0.4730288185226744, + "grad_norm": 1.3444300860083989, + "learning_rate": 2.711273816246167e-05, + "loss": 0.6893, + "num_input_tokens_seen": 779438688, + "step": 4321 + }, + { + "epoch": 0.4731382905936123, + "grad_norm": 1.355486339268081, + "learning_rate": 2.710417017543483e-05, + "loss": 0.82, + "num_input_tokens_seen": 779630208, + "step": 4322 + }, + { + "epoch": 0.4732477626645502, + "grad_norm": 1.1399321868871384, + "learning_rate": 2.7095601939488685e-05, + "loss": 0.7439, + "num_input_tokens_seen": 779832928, + "step": 4323 + }, + { + "epoch": 0.47335723473548813, + "grad_norm": 1.1519022985029268, + "learning_rate": 2.7087033455636834e-05, + "loss": 0.677, + "num_input_tokens_seen": 780015488, + "step": 4324 + }, + { + "epoch": 0.473466706806426, + "grad_norm": 1.2510416935590645, + "learning_rate": 2.7078464724892917e-05, + "loss": 0.853, + "num_input_tokens_seen": 780225376, + "step": 4325 + }, + { + "epoch": 0.4735761788773639, + "grad_norm": 1.3989196883729662, + "learning_rate": 2.706989574827059e-05, + "loss": 0.6221, + "num_input_tokens_seen": 780401664, + "step": 4326 + }, + { + "epoch": 0.47368565094830184, + "grad_norm": 1.195828843634459, + "learning_rate": 2.7061326526783555e-05, + "loss": 0.599, + "num_input_tokens_seen": 780583552, + "step": 4327 + }, + { + "epoch": 0.4737951230192397, + "grad_norm": 1.4324244582145373, + "learning_rate": 2.7052757061445534e-05, + "loss": 0.8147, + "num_input_tokens_seen": 780764096, + "step": 4328 + }, + { + "epoch": 0.4739045950901776, + "grad_norm": 1.3614655566248879, + "learning_rate": 2.7044187353270268e-05, + "loss": 0.9329, + "num_input_tokens_seen": 780949568, + "step": 4329 + }, + { + "epoch": 0.4740140671611155, + "grad_norm": 1.308191220447757, + "learning_rate": 2.703561740327156e-05, + "loss": 0.7073, + "num_input_tokens_seen": 781107264, + "step": 4330 + }, + { + "epoch": 0.4741235392320534, + "grad_norm": 1.3019810547395207, + "learning_rate": 2.7027047212463198e-05, + "loss": 0.7422, + "num_input_tokens_seen": 781264960, + "step": 4331 + }, + { + "epoch": 0.47423301130299134, + "grad_norm": 1.2380446921063513, + "learning_rate": 2.7018476781859027e-05, + "loss": 0.7077, + "num_input_tokens_seen": 781456032, + "step": 4332 + }, + { + "epoch": 0.4743424833739292, + "grad_norm": 1.215838647420738, + "learning_rate": 2.7009906112472904e-05, + "loss": 0.6652, + "num_input_tokens_seen": 781644192, + "step": 4333 + }, + { + "epoch": 0.4744519554448671, + "grad_norm": 1.2192101710221919, + "learning_rate": 2.700133520531874e-05, + "loss": 0.8589, + "num_input_tokens_seen": 781852960, + "step": 4334 + }, + { + "epoch": 0.47456142751580505, + "grad_norm": 1.266635256093387, + "learning_rate": 2.6992764061410446e-05, + "loss": 0.7215, + "num_input_tokens_seen": 782022528, + "step": 4335 + }, + { + "epoch": 0.4746708995867429, + "grad_norm": 1.266238593664885, + "learning_rate": 2.6984192681761972e-05, + "loss": 0.7996, + "num_input_tokens_seen": 782194784, + "step": 4336 + }, + { + "epoch": 0.47478037165768083, + "grad_norm": 1.3154649850121087, + "learning_rate": 2.6975621067387296e-05, + "loss": 0.9537, + "num_input_tokens_seen": 782408480, + "step": 4337 + }, + { + "epoch": 0.47488984372861875, + "grad_norm": 1.2027482768212745, + "learning_rate": 2.6967049219300427e-05, + "loss": 0.5328, + "num_input_tokens_seen": 782588352, + "step": 4338 + }, + { + "epoch": 0.4749993157995566, + "grad_norm": 1.1932410947929737, + "learning_rate": 2.6958477138515393e-05, + "loss": 0.9314, + "num_input_tokens_seen": 782750080, + "step": 4339 + }, + { + "epoch": 0.47510878787049454, + "grad_norm": 1.391154670568247, + "learning_rate": 2.6949904826046258e-05, + "loss": 0.6225, + "num_input_tokens_seen": 782889632, + "step": 4340 + }, + { + "epoch": 0.47521825994143246, + "grad_norm": 1.6605500595647318, + "learning_rate": 2.6941332282907107e-05, + "loss": 0.7861, + "num_input_tokens_seen": 783088768, + "step": 4341 + }, + { + "epoch": 0.47532773201237033, + "grad_norm": 1.2476690971272024, + "learning_rate": 2.693275951011206e-05, + "loss": 0.7144, + "num_input_tokens_seen": 783261920, + "step": 4342 + }, + { + "epoch": 0.47543720408330825, + "grad_norm": 1.2042599068856417, + "learning_rate": 2.692418650867526e-05, + "loss": 0.658, + "num_input_tokens_seen": 783433056, + "step": 4343 + }, + { + "epoch": 0.4755466761542462, + "grad_norm": 1.2915715162034973, + "learning_rate": 2.6915613279610874e-05, + "loss": 0.9023, + "num_input_tokens_seen": 783623904, + "step": 4344 + }, + { + "epoch": 0.47565614822518404, + "grad_norm": 1.2821938829404904, + "learning_rate": 2.6907039823933093e-05, + "loss": 0.5613, + "num_input_tokens_seen": 783771520, + "step": 4345 + }, + { + "epoch": 0.47576562029612196, + "grad_norm": 1.0964858579336807, + "learning_rate": 2.6898466142656154e-05, + "loss": 0.6151, + "num_input_tokens_seen": 783918240, + "step": 4346 + }, + { + "epoch": 0.4758750923670598, + "grad_norm": 1.1909571066818716, + "learning_rate": 2.6889892236794294e-05, + "loss": 0.5388, + "num_input_tokens_seen": 784084672, + "step": 4347 + }, + { + "epoch": 0.47598456443799775, + "grad_norm": 1.3339261535181983, + "learning_rate": 2.68813181073618e-05, + "loss": 0.6254, + "num_input_tokens_seen": 784248416, + "step": 4348 + }, + { + "epoch": 0.47609403650893567, + "grad_norm": 1.1534206207867603, + "learning_rate": 2.687274375537297e-05, + "loss": 0.5564, + "num_input_tokens_seen": 784416864, + "step": 4349 + }, + { + "epoch": 0.47620350857987354, + "grad_norm": 1.4026825052060325, + "learning_rate": 2.686416918184213e-05, + "loss": 0.8855, + "num_input_tokens_seen": 784620032, + "step": 4350 + }, + { + "epoch": 0.47631298065081146, + "grad_norm": 1.2786167346054336, + "learning_rate": 2.6855594387783638e-05, + "loss": 0.7481, + "num_input_tokens_seen": 784818048, + "step": 4351 + }, + { + "epoch": 0.4764224527217494, + "grad_norm": 1.3043401564151451, + "learning_rate": 2.6847019374211886e-05, + "loss": 0.8672, + "num_input_tokens_seen": 785028608, + "step": 4352 + }, + { + "epoch": 0.47653192479268724, + "grad_norm": 1.2167720883999817, + "learning_rate": 2.6838444142141267e-05, + "loss": 0.6204, + "num_input_tokens_seen": 785206240, + "step": 4353 + }, + { + "epoch": 0.47664139686362517, + "grad_norm": 1.3053228692868255, + "learning_rate": 2.6829868692586218e-05, + "loss": 0.649, + "num_input_tokens_seen": 785376032, + "step": 4354 + }, + { + "epoch": 0.4767508689345631, + "grad_norm": 1.2870137898403644, + "learning_rate": 2.6821293026561206e-05, + "loss": 0.7916, + "num_input_tokens_seen": 785576064, + "step": 4355 + }, + { + "epoch": 0.47686034100550095, + "grad_norm": 1.2666890416344163, + "learning_rate": 2.6812717145080713e-05, + "loss": 0.7191, + "num_input_tokens_seen": 785761088, + "step": 4356 + }, + { + "epoch": 0.4769698130764389, + "grad_norm": 1.174995641497381, + "learning_rate": 2.6804141049159243e-05, + "loss": 0.5745, + "num_input_tokens_seen": 785963808, + "step": 4357 + }, + { + "epoch": 0.4770792851473768, + "grad_norm": 1.2691628728614923, + "learning_rate": 2.6795564739811335e-05, + "loss": 0.6032, + "num_input_tokens_seen": 786128672, + "step": 4358 + }, + { + "epoch": 0.47718875721831466, + "grad_norm": 1.2199820135698058, + "learning_rate": 2.6786988218051556e-05, + "loss": 0.8609, + "num_input_tokens_seen": 786316832, + "step": 4359 + }, + { + "epoch": 0.4772982292892526, + "grad_norm": 1.1696606887406613, + "learning_rate": 2.6778411484894478e-05, + "loss": 0.5873, + "num_input_tokens_seen": 786475872, + "step": 4360 + }, + { + "epoch": 0.4774077013601905, + "grad_norm": 1.3189414314790235, + "learning_rate": 2.6769834541354727e-05, + "loss": 0.7035, + "num_input_tokens_seen": 786682400, + "step": 4361 + }, + { + "epoch": 0.47751717343112837, + "grad_norm": 1.2947044132425276, + "learning_rate": 2.6761257388446924e-05, + "loss": 0.8495, + "num_input_tokens_seen": 786849952, + "step": 4362 + }, + { + "epoch": 0.4776266455020663, + "grad_norm": 1.332737575987591, + "learning_rate": 2.675268002718575e-05, + "loss": 0.8792, + "num_input_tokens_seen": 787042816, + "step": 4363 + }, + { + "epoch": 0.47773611757300416, + "grad_norm": 1.1524418675952122, + "learning_rate": 2.674410245858588e-05, + "loss": 0.5287, + "num_input_tokens_seen": 787210144, + "step": 4364 + }, + { + "epoch": 0.4778455896439421, + "grad_norm": 1.2503616286743777, + "learning_rate": 2.6735524683662017e-05, + "loss": 0.5813, + "num_input_tokens_seen": 787370080, + "step": 4365 + }, + { + "epoch": 0.47795506171488, + "grad_norm": 1.4101696911961126, + "learning_rate": 2.6726946703428908e-05, + "loss": 0.9046, + "num_input_tokens_seen": 787563168, + "step": 4366 + }, + { + "epoch": 0.47806453378581787, + "grad_norm": 1.203256624626408, + "learning_rate": 2.6718368518901295e-05, + "loss": 0.648, + "num_input_tokens_seen": 787731840, + "step": 4367 + }, + { + "epoch": 0.4781740058567558, + "grad_norm": 1.237420345401547, + "learning_rate": 2.670979013109398e-05, + "loss": 0.7588, + "num_input_tokens_seen": 787943744, + "step": 4368 + }, + { + "epoch": 0.4782834779276937, + "grad_norm": 1.118073705495624, + "learning_rate": 2.6701211541021757e-05, + "loss": 0.715, + "num_input_tokens_seen": 788124736, + "step": 4369 + }, + { + "epoch": 0.4783929499986316, + "grad_norm": 1.3492065358451195, + "learning_rate": 2.6692632749699463e-05, + "loss": 0.9237, + "num_input_tokens_seen": 788331040, + "step": 4370 + }, + { + "epoch": 0.4785024220695695, + "grad_norm": 1.226602298552547, + "learning_rate": 2.6684053758141948e-05, + "loss": 0.5633, + "num_input_tokens_seen": 788517184, + "step": 4371 + }, + { + "epoch": 0.4786118941405074, + "grad_norm": 1.3493959559005582, + "learning_rate": 2.6675474567364096e-05, + "loss": 0.7447, + "num_input_tokens_seen": 788688096, + "step": 4372 + }, + { + "epoch": 0.4787213662114453, + "grad_norm": 1.2108448116146122, + "learning_rate": 2.666689517838081e-05, + "loss": 0.6098, + "num_input_tokens_seen": 788846464, + "step": 4373 + }, + { + "epoch": 0.4788308382823832, + "grad_norm": 1.1037213969169617, + "learning_rate": 2.6658315592206995e-05, + "loss": 0.5732, + "num_input_tokens_seen": 789037088, + "step": 4374 + }, + { + "epoch": 0.47894031035332113, + "grad_norm": 1.2022125945956357, + "learning_rate": 2.664973580985763e-05, + "loss": 0.6179, + "num_input_tokens_seen": 789227264, + "step": 4375 + }, + { + "epoch": 0.479049782424259, + "grad_norm": 1.411478875305167, + "learning_rate": 2.6641155832347668e-05, + "loss": 0.8444, + "num_input_tokens_seen": 789424160, + "step": 4376 + }, + { + "epoch": 0.4791592544951969, + "grad_norm": 1.2571423089735272, + "learning_rate": 2.663257566069211e-05, + "loss": 0.643, + "num_input_tokens_seen": 789584992, + "step": 4377 + }, + { + "epoch": 0.47926872656613484, + "grad_norm": 1.2997142551068595, + "learning_rate": 2.6623995295905974e-05, + "loss": 0.863, + "num_input_tokens_seen": 789774720, + "step": 4378 + }, + { + "epoch": 0.4793781986370727, + "grad_norm": 1.2931769626920606, + "learning_rate": 2.6615414739004297e-05, + "loss": 0.5948, + "num_input_tokens_seen": 789935328, + "step": 4379 + }, + { + "epoch": 0.4794876707080106, + "grad_norm": 1.2997511717271375, + "learning_rate": 2.6606833991002146e-05, + "loss": 0.7567, + "num_input_tokens_seen": 790125056, + "step": 4380 + }, + { + "epoch": 0.4795971427789485, + "grad_norm": 1.2871349835778734, + "learning_rate": 2.6598253052914596e-05, + "loss": 0.6668, + "num_input_tokens_seen": 790312320, + "step": 4381 + }, + { + "epoch": 0.4797066148498864, + "grad_norm": 1.218511851591595, + "learning_rate": 2.6589671925756777e-05, + "loss": 0.7412, + "num_input_tokens_seen": 790492640, + "step": 4382 + }, + { + "epoch": 0.47981608692082434, + "grad_norm": 1.3942395073391725, + "learning_rate": 2.6581090610543796e-05, + "loss": 0.6251, + "num_input_tokens_seen": 790665568, + "step": 4383 + }, + { + "epoch": 0.4799255589917622, + "grad_norm": 1.285629834512347, + "learning_rate": 2.6572509108290826e-05, + "loss": 0.6585, + "num_input_tokens_seen": 790838272, + "step": 4384 + }, + { + "epoch": 0.4800350310627001, + "grad_norm": 1.297632846242284, + "learning_rate": 2.6563927420013036e-05, + "loss": 0.7041, + "num_input_tokens_seen": 791025536, + "step": 4385 + }, + { + "epoch": 0.48014450313363805, + "grad_norm": 1.2224006263737546, + "learning_rate": 2.6555345546725625e-05, + "loss": 0.7529, + "num_input_tokens_seen": 791225792, + "step": 4386 + }, + { + "epoch": 0.4802539752045759, + "grad_norm": 1.2233133226225348, + "learning_rate": 2.6546763489443806e-05, + "loss": 0.6692, + "num_input_tokens_seen": 791427616, + "step": 4387 + }, + { + "epoch": 0.48036344727551383, + "grad_norm": 1.3688510474264861, + "learning_rate": 2.6538181249182813e-05, + "loss": 0.7764, + "num_input_tokens_seen": 791612416, + "step": 4388 + }, + { + "epoch": 0.48047291934645175, + "grad_norm": 1.275024501714708, + "learning_rate": 2.652959882695793e-05, + "loss": 0.6676, + "num_input_tokens_seen": 791777952, + "step": 4389 + }, + { + "epoch": 0.4805823914173896, + "grad_norm": 1.0914354084245845, + "learning_rate": 2.6521016223784427e-05, + "loss": 0.5715, + "num_input_tokens_seen": 791954240, + "step": 4390 + }, + { + "epoch": 0.48069186348832754, + "grad_norm": 1.3224354667278355, + "learning_rate": 2.6512433440677613e-05, + "loss": 0.6608, + "num_input_tokens_seen": 792132992, + "step": 4391 + }, + { + "epoch": 0.48080133555926546, + "grad_norm": 1.1836136920117664, + "learning_rate": 2.6503850478652815e-05, + "loss": 0.5923, + "num_input_tokens_seen": 792313536, + "step": 4392 + }, + { + "epoch": 0.48091080763020333, + "grad_norm": 1.2689560746330601, + "learning_rate": 2.6495267338725375e-05, + "loss": 0.6078, + "num_input_tokens_seen": 792500576, + "step": 4393 + }, + { + "epoch": 0.48102027970114125, + "grad_norm": 1.0164211380591677, + "learning_rate": 2.6486684021910667e-05, + "loss": 0.5514, + "num_input_tokens_seen": 792689632, + "step": 4394 + }, + { + "epoch": 0.4811297517720792, + "grad_norm": 1.3368397393197782, + "learning_rate": 2.647810052922409e-05, + "loss": 0.6569, + "num_input_tokens_seen": 792885184, + "step": 4395 + }, + { + "epoch": 0.48123922384301704, + "grad_norm": 1.1165450055950938, + "learning_rate": 2.6469516861681042e-05, + "loss": 0.5656, + "num_input_tokens_seen": 793107840, + "step": 4396 + }, + { + "epoch": 0.48134869591395496, + "grad_norm": 1.216737356919299, + "learning_rate": 2.6460933020296962e-05, + "loss": 0.6172, + "num_input_tokens_seen": 793283232, + "step": 4397 + }, + { + "epoch": 0.4814581679848929, + "grad_norm": 1.206465999258555, + "learning_rate": 2.6452349006087295e-05, + "loss": 0.6068, + "num_input_tokens_seen": 793481696, + "step": 4398 + }, + { + "epoch": 0.48156764005583075, + "grad_norm": 1.2832244453226682, + "learning_rate": 2.644376482006752e-05, + "loss": 0.6319, + "num_input_tokens_seen": 793630880, + "step": 4399 + }, + { + "epoch": 0.48167711212676867, + "grad_norm": 1.4106762210063284, + "learning_rate": 2.6435180463253123e-05, + "loss": 0.7313, + "num_input_tokens_seen": 793814784, + "step": 4400 + }, + { + "epoch": 0.48178658419770654, + "grad_norm": 1.3832573040594087, + "learning_rate": 2.6426595936659616e-05, + "loss": 0.9118, + "num_input_tokens_seen": 793982336, + "step": 4401 + }, + { + "epoch": 0.48189605626864446, + "grad_norm": 1.2218573644589787, + "learning_rate": 2.6418011241302543e-05, + "loss": 0.8365, + "num_input_tokens_seen": 794196480, + "step": 4402 + }, + { + "epoch": 0.4820055283395824, + "grad_norm": 1.3881759079791853, + "learning_rate": 2.6409426378197456e-05, + "loss": 0.6519, + "num_input_tokens_seen": 794353504, + "step": 4403 + }, + { + "epoch": 0.48211500041052024, + "grad_norm": 1.2866074736885833, + "learning_rate": 2.6400841348359913e-05, + "loss": 0.7739, + "num_input_tokens_seen": 794543232, + "step": 4404 + }, + { + "epoch": 0.48222447248145817, + "grad_norm": 1.21560316482978, + "learning_rate": 2.6392256152805517e-05, + "loss": 0.6613, + "num_input_tokens_seen": 794742592, + "step": 4405 + }, + { + "epoch": 0.4823339445523961, + "grad_norm": 1.1821237128300055, + "learning_rate": 2.6383670792549885e-05, + "loss": 0.6293, + "num_input_tokens_seen": 794908800, + "step": 4406 + }, + { + "epoch": 0.48244341662333395, + "grad_norm": 1.1757847478567416, + "learning_rate": 2.6375085268608645e-05, + "loss": 0.769, + "num_input_tokens_seen": 795058432, + "step": 4407 + }, + { + "epoch": 0.4825528886942719, + "grad_norm": 1.2007016137544424, + "learning_rate": 2.636649958199744e-05, + "loss": 0.9351, + "num_input_tokens_seen": 795263392, + "step": 4408 + }, + { + "epoch": 0.4826623607652098, + "grad_norm": 1.1254793311567943, + "learning_rate": 2.635791373373195e-05, + "loss": 0.62, + "num_input_tokens_seen": 795431616, + "step": 4409 + }, + { + "epoch": 0.48277183283614766, + "grad_norm": 1.281124062280147, + "learning_rate": 2.634932772482786e-05, + "loss": 0.8784, + "num_input_tokens_seen": 795633440, + "step": 4410 + }, + { + "epoch": 0.4828813049070856, + "grad_norm": 1.1951160643488603, + "learning_rate": 2.634074155630088e-05, + "loss": 0.6956, + "num_input_tokens_seen": 795817120, + "step": 4411 + }, + { + "epoch": 0.4829907769780235, + "grad_norm": 1.2187421749548475, + "learning_rate": 2.6332155229166738e-05, + "loss": 0.7658, + "num_input_tokens_seen": 796006848, + "step": 4412 + }, + { + "epoch": 0.48310024904896137, + "grad_norm": 1.3450913942051106, + "learning_rate": 2.6323568744441173e-05, + "loss": 0.9806, + "num_input_tokens_seen": 796193440, + "step": 4413 + }, + { + "epoch": 0.4832097211198993, + "grad_norm": 1.8922198796041556, + "learning_rate": 2.631498210313997e-05, + "loss": 1.08, + "num_input_tokens_seen": 796393696, + "step": 4414 + }, + { + "epoch": 0.4833191931908372, + "grad_norm": 1.2065818962508221, + "learning_rate": 2.630639530627888e-05, + "loss": 0.8097, + "num_input_tokens_seen": 796585440, + "step": 4415 + }, + { + "epoch": 0.4834286652617751, + "grad_norm": 1.2974899856092716, + "learning_rate": 2.6297808354873733e-05, + "loss": 0.7643, + "num_input_tokens_seen": 796793760, + "step": 4416 + }, + { + "epoch": 0.483538137332713, + "grad_norm": 1.0828927807132496, + "learning_rate": 2.6289221249940337e-05, + "loss": 0.5665, + "num_input_tokens_seen": 796984384, + "step": 4417 + }, + { + "epoch": 0.48364760940365087, + "grad_norm": 1.1645110629364128, + "learning_rate": 2.6280633992494536e-05, + "loss": 0.4882, + "num_input_tokens_seen": 797142080, + "step": 4418 + }, + { + "epoch": 0.4837570814745888, + "grad_norm": 1.354566114150572, + "learning_rate": 2.627204658355218e-05, + "loss": 0.8295, + "num_input_tokens_seen": 797341216, + "step": 4419 + }, + { + "epoch": 0.4838665535455267, + "grad_norm": 1.185838088541869, + "learning_rate": 2.6263459024129144e-05, + "loss": 0.5711, + "num_input_tokens_seen": 797519072, + "step": 4420 + }, + { + "epoch": 0.4839760256164646, + "grad_norm": 1.1749995475118862, + "learning_rate": 2.6254871315241318e-05, + "loss": 0.6282, + "num_input_tokens_seen": 797736352, + "step": 4421 + }, + { + "epoch": 0.4840854976874025, + "grad_norm": 1.313454417215731, + "learning_rate": 2.6246283457904612e-05, + "loss": 0.6268, + "num_input_tokens_seen": 797910400, + "step": 4422 + }, + { + "epoch": 0.4841949697583404, + "grad_norm": 1.2434597098338827, + "learning_rate": 2.6237695453134964e-05, + "loss": 0.7688, + "num_input_tokens_seen": 798082880, + "step": 4423 + }, + { + "epoch": 0.4843044418292783, + "grad_norm": 1.3370079971650646, + "learning_rate": 2.6229107301948308e-05, + "loss": 0.7146, + "num_input_tokens_seen": 798269920, + "step": 4424 + }, + { + "epoch": 0.4844139139002162, + "grad_norm": 1.517151683066381, + "learning_rate": 2.62205190053606e-05, + "loss": 0.7889, + "num_input_tokens_seen": 798445536, + "step": 4425 + }, + { + "epoch": 0.48452338597115413, + "grad_norm": 1.3503893131937195, + "learning_rate": 2.6211930564387832e-05, + "loss": 0.8621, + "num_input_tokens_seen": 798624512, + "step": 4426 + }, + { + "epoch": 0.484632858042092, + "grad_norm": 1.1744614097228405, + "learning_rate": 2.6203341980045996e-05, + "loss": 0.5768, + "num_input_tokens_seen": 798803264, + "step": 4427 + }, + { + "epoch": 0.4847423301130299, + "grad_norm": 1.2681933116609538, + "learning_rate": 2.6194753253351102e-05, + "loss": 0.6256, + "num_input_tokens_seen": 798996128, + "step": 4428 + }, + { + "epoch": 0.48485180218396784, + "grad_norm": 1.2964352701007318, + "learning_rate": 2.6186164385319186e-05, + "loss": 0.7118, + "num_input_tokens_seen": 799178688, + "step": 4429 + }, + { + "epoch": 0.4849612742549057, + "grad_norm": 1.2641536024387134, + "learning_rate": 2.6177575376966284e-05, + "loss": 0.6527, + "num_input_tokens_seen": 799344224, + "step": 4430 + }, + { + "epoch": 0.4850707463258436, + "grad_norm": 1.3665159483508298, + "learning_rate": 2.6168986229308473e-05, + "loss": 0.8343, + "num_input_tokens_seen": 799533728, + "step": 4431 + }, + { + "epoch": 0.48518021839678155, + "grad_norm": 1.326222392090584, + "learning_rate": 2.6160396943361827e-05, + "loss": 0.7435, + "num_input_tokens_seen": 799706432, + "step": 4432 + }, + { + "epoch": 0.4852896904677194, + "grad_norm": 1.3648577212845212, + "learning_rate": 2.6151807520142436e-05, + "loss": 0.6554, + "num_input_tokens_seen": 799876672, + "step": 4433 + }, + { + "epoch": 0.48539916253865734, + "grad_norm": 1.2112641909056372, + "learning_rate": 2.6143217960666416e-05, + "loss": 0.7752, + "num_input_tokens_seen": 800091936, + "step": 4434 + }, + { + "epoch": 0.4855086346095952, + "grad_norm": 1.2009844874159166, + "learning_rate": 2.6134628265949903e-05, + "loss": 0.9503, + "num_input_tokens_seen": 800296448, + "step": 4435 + }, + { + "epoch": 0.4856181066805331, + "grad_norm": 1.4932857286721972, + "learning_rate": 2.6126038437009025e-05, + "loss": 0.8758, + "num_input_tokens_seen": 800480352, + "step": 4436 + }, + { + "epoch": 0.48572757875147105, + "grad_norm": 1.1747311182998958, + "learning_rate": 2.6117448474859958e-05, + "loss": 0.5211, + "num_input_tokens_seen": 800652832, + "step": 4437 + }, + { + "epoch": 0.4858370508224089, + "grad_norm": 1.3072980244799826, + "learning_rate": 2.6108858380518874e-05, + "loss": 0.6511, + "num_input_tokens_seen": 800797312, + "step": 4438 + }, + { + "epoch": 0.48594652289334683, + "grad_norm": 1.32313623883966, + "learning_rate": 2.6100268155001968e-05, + "loss": 0.6788, + "num_input_tokens_seen": 800938432, + "step": 4439 + }, + { + "epoch": 0.48605599496428475, + "grad_norm": 1.3428984760674318, + "learning_rate": 2.6091677799325436e-05, + "loss": 0.7295, + "num_input_tokens_seen": 801115616, + "step": 4440 + }, + { + "epoch": 0.4861654670352226, + "grad_norm": 1.2327648719319104, + "learning_rate": 2.608308731450551e-05, + "loss": 0.701, + "num_input_tokens_seen": 801276896, + "step": 4441 + }, + { + "epoch": 0.48627493910616054, + "grad_norm": 1.3238768459707362, + "learning_rate": 2.607449670155842e-05, + "loss": 0.8499, + "num_input_tokens_seen": 801458560, + "step": 4442 + }, + { + "epoch": 0.48638441117709846, + "grad_norm": 1.2204123188764897, + "learning_rate": 2.6065905961500432e-05, + "loss": 0.5845, + "num_input_tokens_seen": 801611776, + "step": 4443 + }, + { + "epoch": 0.48649388324803633, + "grad_norm": 1.3153308131745975, + "learning_rate": 2.60573150953478e-05, + "loss": 0.7975, + "num_input_tokens_seen": 801803296, + "step": 4444 + }, + { + "epoch": 0.48660335531897425, + "grad_norm": 1.1824728817078327, + "learning_rate": 2.6048724104116818e-05, + "loss": 0.7043, + "num_input_tokens_seen": 801974656, + "step": 4445 + }, + { + "epoch": 0.4867128273899122, + "grad_norm": 1.2057292545502578, + "learning_rate": 2.6040132988823775e-05, + "loss": 0.7022, + "num_input_tokens_seen": 802158336, + "step": 4446 + }, + { + "epoch": 0.48682229946085004, + "grad_norm": 1.4058485941726597, + "learning_rate": 2.603154175048499e-05, + "loss": 0.887, + "num_input_tokens_seen": 802326560, + "step": 4447 + }, + { + "epoch": 0.48693177153178796, + "grad_norm": 1.3472211052320815, + "learning_rate": 2.602295039011679e-05, + "loss": 0.7277, + "num_input_tokens_seen": 802533312, + "step": 4448 + }, + { + "epoch": 0.4870412436027259, + "grad_norm": 1.3328691906516663, + "learning_rate": 2.6014358908735504e-05, + "loss": 0.5252, + "num_input_tokens_seen": 802682720, + "step": 4449 + }, + { + "epoch": 0.48715071567366375, + "grad_norm": 1.3599142999855005, + "learning_rate": 2.600576730735751e-05, + "loss": 0.667, + "num_input_tokens_seen": 802874912, + "step": 4450 + }, + { + "epoch": 0.48726018774460167, + "grad_norm": 1.1965288941672183, + "learning_rate": 2.5997175586999163e-05, + "loss": 0.7728, + "num_input_tokens_seen": 803060384, + "step": 4451 + }, + { + "epoch": 0.48736965981553954, + "grad_norm": 1.397647655923349, + "learning_rate": 2.5988583748676854e-05, + "loss": 0.9502, + "num_input_tokens_seen": 803274304, + "step": 4452 + }, + { + "epoch": 0.48747913188647746, + "grad_norm": 1.1878902898183976, + "learning_rate": 2.597999179340697e-05, + "loss": 0.6277, + "num_input_tokens_seen": 803426848, + "step": 4453 + }, + { + "epoch": 0.4875886039574154, + "grad_norm": 1.2759738307230535, + "learning_rate": 2.5971399722205936e-05, + "loss": 0.8243, + "num_input_tokens_seen": 803614784, + "step": 4454 + }, + { + "epoch": 0.48769807602835324, + "grad_norm": 1.2278093451665484, + "learning_rate": 2.596280753609017e-05, + "loss": 0.7554, + "num_input_tokens_seen": 803776512, + "step": 4455 + }, + { + "epoch": 0.48780754809929117, + "grad_norm": 1.4316079837257765, + "learning_rate": 2.5954215236076113e-05, + "loss": 0.887, + "num_input_tokens_seen": 803953920, + "step": 4456 + }, + { + "epoch": 0.4879170201702291, + "grad_norm": 1.2404785396498594, + "learning_rate": 2.594562282318021e-05, + "loss": 0.8002, + "num_input_tokens_seen": 804149472, + "step": 4457 + }, + { + "epoch": 0.48802649224116695, + "grad_norm": 1.079876955140308, + "learning_rate": 2.5937030298418945e-05, + "loss": 0.7096, + "num_input_tokens_seen": 804347936, + "step": 4458 + }, + { + "epoch": 0.4881359643121049, + "grad_norm": 1.5053430604752756, + "learning_rate": 2.5928437662808785e-05, + "loss": 0.8319, + "num_input_tokens_seen": 804512352, + "step": 4459 + }, + { + "epoch": 0.4882454363830428, + "grad_norm": 1.3584071693167192, + "learning_rate": 2.5919844917366225e-05, + "loss": 0.8599, + "num_input_tokens_seen": 804707008, + "step": 4460 + }, + { + "epoch": 0.48835490845398066, + "grad_norm": 1.1634302150222615, + "learning_rate": 2.5911252063107772e-05, + "loss": 0.6703, + "num_input_tokens_seen": 804879936, + "step": 4461 + }, + { + "epoch": 0.4884643805249186, + "grad_norm": 1.2121628007292593, + "learning_rate": 2.5902659101049933e-05, + "loss": 0.961, + "num_input_tokens_seen": 805064736, + "step": 4462 + }, + { + "epoch": 0.4885738525958565, + "grad_norm": 1.1800688575600446, + "learning_rate": 2.589406603220925e-05, + "loss": 0.6585, + "num_input_tokens_seen": 805259392, + "step": 4463 + }, + { + "epoch": 0.48868332466679437, + "grad_norm": 1.3468302973787896, + "learning_rate": 2.5885472857602273e-05, + "loss": 0.9244, + "num_input_tokens_seen": 805432992, + "step": 4464 + }, + { + "epoch": 0.4887927967377323, + "grad_norm": 1.3231252921297367, + "learning_rate": 2.5876879578245543e-05, + "loss": 0.7632, + "num_input_tokens_seen": 805600320, + "step": 4465 + }, + { + "epoch": 0.4889022688086702, + "grad_norm": 1.3018338180526112, + "learning_rate": 2.5868286195155638e-05, + "loss": 0.7125, + "num_input_tokens_seen": 805756896, + "step": 4466 + }, + { + "epoch": 0.4890117408796081, + "grad_norm": 1.3985322888178917, + "learning_rate": 2.585969270934913e-05, + "loss": 0.6898, + "num_input_tokens_seen": 805904288, + "step": 4467 + }, + { + "epoch": 0.489121212950546, + "grad_norm": 1.3297287178150954, + "learning_rate": 2.5851099121842624e-05, + "loss": 0.6502, + "num_input_tokens_seen": 806088864, + "step": 4468 + }, + { + "epoch": 0.48923068502148387, + "grad_norm": 1.0852610175666013, + "learning_rate": 2.5842505433652713e-05, + "loss": 0.5917, + "num_input_tokens_seen": 806281280, + "step": 4469 + }, + { + "epoch": 0.4893401570924218, + "grad_norm": 1.2513179030462034, + "learning_rate": 2.583391164579603e-05, + "loss": 0.8224, + "num_input_tokens_seen": 806451072, + "step": 4470 + }, + { + "epoch": 0.4894496291633597, + "grad_norm": 1.1744518685675145, + "learning_rate": 2.5825317759289185e-05, + "loss": 0.6617, + "num_input_tokens_seen": 806652448, + "step": 4471 + }, + { + "epoch": 0.4895591012342976, + "grad_norm": 1.22622734398867, + "learning_rate": 2.581672377514883e-05, + "loss": 0.7852, + "num_input_tokens_seen": 806852704, + "step": 4472 + }, + { + "epoch": 0.4896685733052355, + "grad_norm": 1.2184578961223087, + "learning_rate": 2.580812969439162e-05, + "loss": 0.7199, + "num_input_tokens_seen": 807056544, + "step": 4473 + }, + { + "epoch": 0.4897780453761734, + "grad_norm": 1.2638530336028713, + "learning_rate": 2.5799535518034205e-05, + "loss": 0.7812, + "num_input_tokens_seen": 807243584, + "step": 4474 + }, + { + "epoch": 0.4898875174471113, + "grad_norm": 1.3513437876940744, + "learning_rate": 2.579094124709327e-05, + "loss": 0.6906, + "num_input_tokens_seen": 807424352, + "step": 4475 + }, + { + "epoch": 0.4899969895180492, + "grad_norm": 1.3102541282456304, + "learning_rate": 2.578234688258549e-05, + "loss": 0.6976, + "num_input_tokens_seen": 807598176, + "step": 4476 + }, + { + "epoch": 0.49010646158898713, + "grad_norm": 1.3291703990457362, + "learning_rate": 2.5773752425527576e-05, + "loss": 0.6548, + "num_input_tokens_seen": 807757440, + "step": 4477 + }, + { + "epoch": 0.490215933659925, + "grad_norm": 1.2660291050161228, + "learning_rate": 2.576515787693622e-05, + "loss": 0.6487, + "num_input_tokens_seen": 807940896, + "step": 4478 + }, + { + "epoch": 0.4903254057308629, + "grad_norm": 1.3810888636571985, + "learning_rate": 2.5756563237828158e-05, + "loss": 0.793, + "num_input_tokens_seen": 808070368, + "step": 4479 + }, + { + "epoch": 0.49043487780180084, + "grad_norm": 1.2639098142398448, + "learning_rate": 2.574796850922011e-05, + "loss": 0.7771, + "num_input_tokens_seen": 808270624, + "step": 4480 + }, + { + "epoch": 0.4905443498727387, + "grad_norm": 1.1680945899560509, + "learning_rate": 2.573937369212882e-05, + "loss": 0.6574, + "num_input_tokens_seen": 808464832, + "step": 4481 + }, + { + "epoch": 0.4906538219436766, + "grad_norm": 1.0879174165833916, + "learning_rate": 2.5730778787571035e-05, + "loss": 0.702, + "num_input_tokens_seen": 808650528, + "step": 4482 + }, + { + "epoch": 0.49076329401461455, + "grad_norm": 1.164683438788627, + "learning_rate": 2.5722183796563508e-05, + "loss": 0.6131, + "num_input_tokens_seen": 808821664, + "step": 4483 + }, + { + "epoch": 0.4908727660855524, + "grad_norm": 1.3588262743979234, + "learning_rate": 2.571358872012303e-05, + "loss": 0.9082, + "num_input_tokens_seen": 809001984, + "step": 4484 + }, + { + "epoch": 0.49098223815649034, + "grad_norm": 1.2052897510157063, + "learning_rate": 2.5704993559266364e-05, + "loss": 0.7736, + "num_input_tokens_seen": 809177824, + "step": 4485 + }, + { + "epoch": 0.4910917102274282, + "grad_norm": 1.259452177574701, + "learning_rate": 2.569639831501031e-05, + "loss": 0.8074, + "num_input_tokens_seen": 809369120, + "step": 4486 + }, + { + "epoch": 0.4912011822983661, + "grad_norm": 1.3487571752885898, + "learning_rate": 2.5687802988371667e-05, + "loss": 0.8023, + "num_input_tokens_seen": 809534880, + "step": 4487 + }, + { + "epoch": 0.49131065436930405, + "grad_norm": 1.2181148830441906, + "learning_rate": 2.5679207580367242e-05, + "loss": 0.7795, + "num_input_tokens_seen": 809707584, + "step": 4488 + }, + { + "epoch": 0.4914201264402419, + "grad_norm": 1.2999114373191147, + "learning_rate": 2.5670612092013857e-05, + "loss": 0.8149, + "num_input_tokens_seen": 809882528, + "step": 4489 + }, + { + "epoch": 0.49152959851117983, + "grad_norm": 1.169791900598509, + "learning_rate": 2.5662016524328346e-05, + "loss": 0.557, + "num_input_tokens_seen": 810067776, + "step": 4490 + }, + { + "epoch": 0.49163907058211775, + "grad_norm": 1.1927368238397347, + "learning_rate": 2.565342087832755e-05, + "loss": 0.682, + "num_input_tokens_seen": 810261536, + "step": 4491 + }, + { + "epoch": 0.4917485426530556, + "grad_norm": 1.2546408809680274, + "learning_rate": 2.5644825155028314e-05, + "loss": 0.7489, + "num_input_tokens_seen": 810443648, + "step": 4492 + }, + { + "epoch": 0.49185801472399354, + "grad_norm": 1.2167478401505076, + "learning_rate": 2.563622935544749e-05, + "loss": 0.6142, + "num_input_tokens_seen": 810632032, + "step": 4493 + }, + { + "epoch": 0.49196748679493146, + "grad_norm": 1.1629782113537717, + "learning_rate": 2.5627633480601953e-05, + "loss": 0.6247, + "num_input_tokens_seen": 810838112, + "step": 4494 + }, + { + "epoch": 0.49207695886586933, + "grad_norm": 1.2598367833585131, + "learning_rate": 2.561903753150857e-05, + "loss": 0.7199, + "num_input_tokens_seen": 811036128, + "step": 4495 + }, + { + "epoch": 0.49218643093680725, + "grad_norm": 1.332301942946247, + "learning_rate": 2.561044150918423e-05, + "loss": 0.7065, + "num_input_tokens_seen": 811214208, + "step": 4496 + }, + { + "epoch": 0.4922959030077452, + "grad_norm": 1.3260104680892375, + "learning_rate": 2.5601845414645832e-05, + "loss": 0.6394, + "num_input_tokens_seen": 811364064, + "step": 4497 + }, + { + "epoch": 0.49240537507868304, + "grad_norm": 1.369039229748376, + "learning_rate": 2.5593249248910272e-05, + "loss": 0.8001, + "num_input_tokens_seen": 811551328, + "step": 4498 + }, + { + "epoch": 0.49251484714962096, + "grad_norm": 1.227859540209589, + "learning_rate": 2.558465301299445e-05, + "loss": 0.5506, + "num_input_tokens_seen": 811716640, + "step": 4499 + }, + { + "epoch": 0.4926243192205589, + "grad_norm": 1.277706549681737, + "learning_rate": 2.5576056707915308e-05, + "loss": 0.7206, + "num_input_tokens_seen": 811911520, + "step": 4500 + }, + { + "epoch": 0.49273379129149675, + "grad_norm": 1.3343197581092765, + "learning_rate": 2.556746033468975e-05, + "loss": 0.8743, + "num_input_tokens_seen": 812097440, + "step": 4501 + }, + { + "epoch": 0.49284326336243467, + "grad_norm": 1.3625798648210485, + "learning_rate": 2.5558863894334722e-05, + "loss": 0.8324, + "num_input_tokens_seen": 812271040, + "step": 4502 + }, + { + "epoch": 0.49295273543337254, + "grad_norm": 1.2362281795076413, + "learning_rate": 2.5550267387867162e-05, + "loss": 0.6287, + "num_input_tokens_seen": 812446432, + "step": 4503 + }, + { + "epoch": 0.49306220750431046, + "grad_norm": 1.2112488869267282, + "learning_rate": 2.5541670816304026e-05, + "loss": 0.6071, + "num_input_tokens_seen": 812626528, + "step": 4504 + }, + { + "epoch": 0.4931716795752484, + "grad_norm": 1.1709942368871318, + "learning_rate": 2.5533074180662265e-05, + "loss": 0.555, + "num_input_tokens_seen": 812805728, + "step": 4505 + }, + { + "epoch": 0.49328115164618624, + "grad_norm": 1.1981102607309966, + "learning_rate": 2.5524477481958846e-05, + "loss": 0.6928, + "num_input_tokens_seen": 812972608, + "step": 4506 + }, + { + "epoch": 0.49339062371712417, + "grad_norm": 1.311741200857602, + "learning_rate": 2.551588072121075e-05, + "loss": 0.7494, + "num_input_tokens_seen": 813116192, + "step": 4507 + }, + { + "epoch": 0.4935000957880621, + "grad_norm": 1.2468403460053812, + "learning_rate": 2.5507283899434948e-05, + "loss": 0.7713, + "num_input_tokens_seen": 813316448, + "step": 4508 + }, + { + "epoch": 0.49360956785899995, + "grad_norm": 1.165386394388972, + "learning_rate": 2.549868701764842e-05, + "loss": 0.6535, + "num_input_tokens_seen": 813518496, + "step": 4509 + }, + { + "epoch": 0.4937190399299379, + "grad_norm": 1.237728537061952, + "learning_rate": 2.5490090076868177e-05, + "loss": 0.6622, + "num_input_tokens_seen": 813707104, + "step": 4510 + }, + { + "epoch": 0.4938285120008758, + "grad_norm": 1.2404180435789798, + "learning_rate": 2.5481493078111218e-05, + "loss": 0.791, + "num_input_tokens_seen": 813908928, + "step": 4511 + }, + { + "epoch": 0.49393798407181366, + "grad_norm": 1.2287376700449184, + "learning_rate": 2.547289602239455e-05, + "loss": 0.7506, + "num_input_tokens_seen": 814108288, + "step": 4512 + }, + { + "epoch": 0.4940474561427516, + "grad_norm": 1.2418240189323093, + "learning_rate": 2.5464298910735186e-05, + "loss": 0.7561, + "num_input_tokens_seen": 814300032, + "step": 4513 + }, + { + "epoch": 0.4941569282136895, + "grad_norm": 1.1918356700023103, + "learning_rate": 2.545570174415015e-05, + "loss": 0.5613, + "num_input_tokens_seen": 814487520, + "step": 4514 + }, + { + "epoch": 0.49426640028462737, + "grad_norm": 1.3383293824634375, + "learning_rate": 2.5447104523656466e-05, + "loss": 0.8227, + "num_input_tokens_seen": 814686208, + "step": 4515 + }, + { + "epoch": 0.4943758723555653, + "grad_norm": 1.270457050502787, + "learning_rate": 2.543850725027117e-05, + "loss": 0.7865, + "num_input_tokens_seen": 814892512, + "step": 4516 + }, + { + "epoch": 0.4944853444265032, + "grad_norm": 1.2889216490579551, + "learning_rate": 2.54299099250113e-05, + "loss": 0.9123, + "num_input_tokens_seen": 815059616, + "step": 4517 + }, + { + "epoch": 0.4945948164974411, + "grad_norm": 1.1211555277852938, + "learning_rate": 2.5421312548893917e-05, + "loss": 0.6339, + "num_input_tokens_seen": 815235904, + "step": 4518 + }, + { + "epoch": 0.494704288568379, + "grad_norm": 1.2767608432462452, + "learning_rate": 2.5412715122936064e-05, + "loss": 0.6855, + "num_input_tokens_seen": 815434592, + "step": 4519 + }, + { + "epoch": 0.49481376063931687, + "grad_norm": 1.3511107467053436, + "learning_rate": 2.5404117648154794e-05, + "loss": 0.7103, + "num_input_tokens_seen": 815609984, + "step": 4520 + }, + { + "epoch": 0.4949232327102548, + "grad_norm": 1.1600615080607282, + "learning_rate": 2.5395520125567185e-05, + "loss": 0.5366, + "num_input_tokens_seen": 815789408, + "step": 4521 + }, + { + "epoch": 0.4950327047811927, + "grad_norm": 1.2713469212737245, + "learning_rate": 2.53869225561903e-05, + "loss": 0.8518, + "num_input_tokens_seen": 815966368, + "step": 4522 + }, + { + "epoch": 0.4951421768521306, + "grad_norm": 1.2312213836287773, + "learning_rate": 2.537832494104121e-05, + "loss": 0.9355, + "num_input_tokens_seen": 816182528, + "step": 4523 + }, + { + "epoch": 0.4952516489230685, + "grad_norm": 1.2007478291136293, + "learning_rate": 2.536972728113701e-05, + "loss": 0.8543, + "num_input_tokens_seen": 816363296, + "step": 4524 + }, + { + "epoch": 0.4953611209940064, + "grad_norm": 1.3096649340553956, + "learning_rate": 2.536112957749478e-05, + "loss": 0.8236, + "num_input_tokens_seen": 816545184, + "step": 4525 + }, + { + "epoch": 0.4954705930649443, + "grad_norm": 1.3253846114134915, + "learning_rate": 2.535253183113161e-05, + "loss": 0.8922, + "num_input_tokens_seen": 816747680, + "step": 4526 + }, + { + "epoch": 0.4955800651358822, + "grad_norm": 1.3786658058502, + "learning_rate": 2.5343934043064598e-05, + "loss": 0.7905, + "num_input_tokens_seen": 816949952, + "step": 4527 + }, + { + "epoch": 0.49568953720682013, + "grad_norm": 1.3621393356332017, + "learning_rate": 2.5335336214310845e-05, + "loss": 0.7685, + "num_input_tokens_seen": 817106304, + "step": 4528 + }, + { + "epoch": 0.495799009277758, + "grad_norm": 1.34960005628623, + "learning_rate": 2.5326738345887462e-05, + "loss": 0.7375, + "num_input_tokens_seen": 817296256, + "step": 4529 + }, + { + "epoch": 0.4959084813486959, + "grad_norm": 1.3628995519527747, + "learning_rate": 2.531814043881155e-05, + "loss": 0.7674, + "num_input_tokens_seen": 817505024, + "step": 4530 + }, + { + "epoch": 0.49601795341963384, + "grad_norm": 1.0944765130350897, + "learning_rate": 2.5309542494100234e-05, + "loss": 0.5189, + "num_input_tokens_seen": 817702144, + "step": 4531 + }, + { + "epoch": 0.4961274254905717, + "grad_norm": 1.361461397540675, + "learning_rate": 2.5300944512770636e-05, + "loss": 0.9222, + "num_input_tokens_seen": 817887168, + "step": 4532 + }, + { + "epoch": 0.4962368975615096, + "grad_norm": 1.2761128879315717, + "learning_rate": 2.529234649583988e-05, + "loss": 0.7377, + "num_input_tokens_seen": 818062784, + "step": 4533 + }, + { + "epoch": 0.49634636963244755, + "grad_norm": 1.309619148817333, + "learning_rate": 2.528374844432509e-05, + "loss": 0.8108, + "num_input_tokens_seen": 818240864, + "step": 4534 + }, + { + "epoch": 0.4964558417033854, + "grad_norm": 1.250556774118098, + "learning_rate": 2.5275150359243405e-05, + "loss": 0.6954, + "num_input_tokens_seen": 818429696, + "step": 4535 + }, + { + "epoch": 0.49656531377432334, + "grad_norm": 1.30955174238791, + "learning_rate": 2.526655224161196e-05, + "loss": 0.7044, + "num_input_tokens_seen": 818623232, + "step": 4536 + }, + { + "epoch": 0.4966747858452612, + "grad_norm": 1.3488416682672868, + "learning_rate": 2.5257954092447878e-05, + "loss": 0.7814, + "num_input_tokens_seen": 818797504, + "step": 4537 + }, + { + "epoch": 0.4967842579161991, + "grad_norm": 1.2213751568105564, + "learning_rate": 2.5249355912768334e-05, + "loss": 0.7245, + "num_input_tokens_seen": 818972224, + "step": 4538 + }, + { + "epoch": 0.49689372998713704, + "grad_norm": 1.3219314671353013, + "learning_rate": 2.5240757703590462e-05, + "loss": 0.6803, + "num_input_tokens_seen": 819165536, + "step": 4539 + }, + { + "epoch": 0.4970032020580749, + "grad_norm": 1.3443872581926848, + "learning_rate": 2.523215946593141e-05, + "loss": 0.9152, + "num_input_tokens_seen": 819351680, + "step": 4540 + }, + { + "epoch": 0.49711267412901283, + "grad_norm": 1.1649004077500345, + "learning_rate": 2.5223561200808334e-05, + "loss": 0.5306, + "num_input_tokens_seen": 819517888, + "step": 4541 + }, + { + "epoch": 0.49722214619995075, + "grad_norm": 1.1040397547115415, + "learning_rate": 2.5214962909238397e-05, + "loss": 0.593, + "num_input_tokens_seen": 819709856, + "step": 4542 + }, + { + "epoch": 0.4973316182708886, + "grad_norm": 1.430168196116579, + "learning_rate": 2.5206364592238767e-05, + "loss": 0.857, + "num_input_tokens_seen": 819889728, + "step": 4543 + }, + { + "epoch": 0.49744109034182654, + "grad_norm": 1.2920868508050043, + "learning_rate": 2.5197766250826586e-05, + "loss": 0.8211, + "num_input_tokens_seen": 820062208, + "step": 4544 + }, + { + "epoch": 0.49755056241276446, + "grad_norm": 1.263505597502579, + "learning_rate": 2.518916788601905e-05, + "loss": 0.6393, + "num_input_tokens_seen": 820259328, + "step": 4545 + }, + { + "epoch": 0.49766003448370233, + "grad_norm": 1.3303764557648128, + "learning_rate": 2.5180569498833308e-05, + "loss": 0.7956, + "num_input_tokens_seen": 820425312, + "step": 4546 + }, + { + "epoch": 0.49776950655464025, + "grad_norm": 1.2822148946447662, + "learning_rate": 2.5171971090286546e-05, + "loss": 0.6533, + "num_input_tokens_seen": 820572704, + "step": 4547 + }, + { + "epoch": 0.4978789786255782, + "grad_norm": 1.3538028472842272, + "learning_rate": 2.5163372661395923e-05, + "loss": 0.6965, + "num_input_tokens_seen": 820751680, + "step": 4548 + }, + { + "epoch": 0.49798845069651604, + "grad_norm": 1.2103596446820228, + "learning_rate": 2.5154774213178634e-05, + "loss": 0.7379, + "num_input_tokens_seen": 820959776, + "step": 4549 + }, + { + "epoch": 0.49809792276745396, + "grad_norm": 1.547755520605742, + "learning_rate": 2.5146175746651847e-05, + "loss": 0.9381, + "num_input_tokens_seen": 821163168, + "step": 4550 + }, + { + "epoch": 0.4982073948383919, + "grad_norm": 1.2905850752966148, + "learning_rate": 2.5137577262832746e-05, + "loss": 0.7848, + "num_input_tokens_seen": 821354912, + "step": 4551 + }, + { + "epoch": 0.49831686690932975, + "grad_norm": 1.4535796571942523, + "learning_rate": 2.512897876273852e-05, + "loss": 0.6358, + "num_input_tokens_seen": 821543968, + "step": 4552 + }, + { + "epoch": 0.49842633898026767, + "grad_norm": 1.198835578862967, + "learning_rate": 2.5120380247386356e-05, + "loss": 0.6543, + "num_input_tokens_seen": 821714208, + "step": 4553 + }, + { + "epoch": 0.49853581105120554, + "grad_norm": 1.339431966334711, + "learning_rate": 2.5111781717793436e-05, + "loss": 0.9203, + "num_input_tokens_seen": 821902816, + "step": 4554 + }, + { + "epoch": 0.49864528312214346, + "grad_norm": 1.2616659808860404, + "learning_rate": 2.5103183174976953e-05, + "loss": 0.7813, + "num_input_tokens_seen": 822076416, + "step": 4555 + }, + { + "epoch": 0.4987547551930814, + "grad_norm": 1.183201136167057, + "learning_rate": 2.5094584619954105e-05, + "loss": 0.6934, + "num_input_tokens_seen": 822256288, + "step": 4556 + }, + { + "epoch": 0.49886422726401924, + "grad_norm": 1.2309685070648384, + "learning_rate": 2.508598605374206e-05, + "loss": 0.7223, + "num_input_tokens_seen": 822462368, + "step": 4557 + }, + { + "epoch": 0.49897369933495717, + "grad_norm": 1.282325200443707, + "learning_rate": 2.5077387477358044e-05, + "loss": 0.7381, + "num_input_tokens_seen": 822635296, + "step": 4558 + }, + { + "epoch": 0.4990831714058951, + "grad_norm": 1.4584740252975543, + "learning_rate": 2.5068788891819235e-05, + "loss": 0.8709, + "num_input_tokens_seen": 822812032, + "step": 4559 + }, + { + "epoch": 0.49919264347683295, + "grad_norm": 1.4305823188837843, + "learning_rate": 2.5060190298142828e-05, + "loss": 0.7359, + "num_input_tokens_seen": 822985632, + "step": 4560 + }, + { + "epoch": 0.4993021155477709, + "grad_norm": 1.145059717445122, + "learning_rate": 2.5051591697346028e-05, + "loss": 0.5459, + "num_input_tokens_seen": 823168192, + "step": 4561 + }, + { + "epoch": 0.4994115876187088, + "grad_norm": 1.1744066992801199, + "learning_rate": 2.504299309044602e-05, + "loss": 0.5383, + "num_input_tokens_seen": 823329472, + "step": 4562 + }, + { + "epoch": 0.49952105968964666, + "grad_norm": 1.3122132760169232, + "learning_rate": 2.503439447846002e-05, + "loss": 0.6406, + "num_input_tokens_seen": 823516736, + "step": 4563 + }, + { + "epoch": 0.4996305317605846, + "grad_norm": 1.2508221306872063, + "learning_rate": 2.5025795862405216e-05, + "loss": 0.6389, + "num_input_tokens_seen": 823698624, + "step": 4564 + }, + { + "epoch": 0.4997400038315225, + "grad_norm": 1.173610267162333, + "learning_rate": 2.5017197243298813e-05, + "loss": 0.7223, + "num_input_tokens_seen": 823886560, + "step": 4565 + }, + { + "epoch": 0.49984947590246037, + "grad_norm": 1.1686426980439866, + "learning_rate": 2.500859862215801e-05, + "loss": 0.6084, + "num_input_tokens_seen": 824049408, + "step": 4566 + }, + { + "epoch": 0.4999589479733983, + "grad_norm": 1.316971428322993, + "learning_rate": 2.5e-05, + "loss": 0.7948, + "num_input_tokens_seen": 824223904, + "step": 4567 + }, + { + "epoch": 0.5000684200443362, + "grad_norm": 1.2865894523653159, + "learning_rate": 2.4991401377841993e-05, + "loss": 0.7047, + "num_input_tokens_seen": 824387872, + "step": 4568 + }, + { + "epoch": 0.5001778921152741, + "grad_norm": 1.3054328662408186, + "learning_rate": 2.4982802756701193e-05, + "loss": 0.7034, + "num_input_tokens_seen": 824582080, + "step": 4569 + }, + { + "epoch": 0.500287364186212, + "grad_norm": 1.2730784934027772, + "learning_rate": 2.4974204137594786e-05, + "loss": 0.5718, + "num_input_tokens_seen": 824740672, + "step": 4570 + }, + { + "epoch": 0.5003968362571499, + "grad_norm": 1.2619497365297343, + "learning_rate": 2.4965605521539982e-05, + "loss": 0.6985, + "num_input_tokens_seen": 824934880, + "step": 4571 + }, + { + "epoch": 0.5005063083280878, + "grad_norm": 1.4405692133166366, + "learning_rate": 2.4957006909553982e-05, + "loss": 0.7332, + "num_input_tokens_seen": 825149696, + "step": 4572 + }, + { + "epoch": 0.5006157803990257, + "grad_norm": 1.353344662921129, + "learning_rate": 2.4948408302653985e-05, + "loss": 0.7051, + "num_input_tokens_seen": 825298208, + "step": 4573 + }, + { + "epoch": 0.5007252524699636, + "grad_norm": 1.4885172018608304, + "learning_rate": 2.4939809701857174e-05, + "loss": 0.8565, + "num_input_tokens_seen": 825473824, + "step": 4574 + }, + { + "epoch": 0.5008347245409015, + "grad_norm": 1.3740171474375171, + "learning_rate": 2.4931211108180767e-05, + "loss": 0.8587, + "num_input_tokens_seen": 825658176, + "step": 4575 + }, + { + "epoch": 0.5009441966118394, + "grad_norm": 1.1885575053000808, + "learning_rate": 2.492261252264196e-05, + "loss": 0.739, + "num_input_tokens_seen": 825855072, + "step": 4576 + }, + { + "epoch": 0.5010536686827773, + "grad_norm": 1.2751477698170455, + "learning_rate": 2.491401394625794e-05, + "loss": 1.0133, + "num_input_tokens_seen": 826050400, + "step": 4577 + }, + { + "epoch": 0.5011631407537153, + "grad_norm": 1.1164068949479309, + "learning_rate": 2.490541538004591e-05, + "loss": 0.5579, + "num_input_tokens_seen": 826219744, + "step": 4578 + }, + { + "epoch": 0.5012726128246531, + "grad_norm": 1.094644997762891, + "learning_rate": 2.489681682502305e-05, + "loss": 0.6077, + "num_input_tokens_seen": 826419104, + "step": 4579 + }, + { + "epoch": 0.501382084895591, + "grad_norm": 1.1413039707988804, + "learning_rate": 2.4888218282206573e-05, + "loss": 0.6397, + "num_input_tokens_seen": 826569408, + "step": 4580 + }, + { + "epoch": 0.5014915569665289, + "grad_norm": 1.3304252003627923, + "learning_rate": 2.4879619752613653e-05, + "loss": 0.6157, + "num_input_tokens_seen": 826747936, + "step": 4581 + }, + { + "epoch": 0.5016010290374668, + "grad_norm": 1.1733618903658578, + "learning_rate": 2.487102123726148e-05, + "loss": 0.7546, + "num_input_tokens_seen": 826927584, + "step": 4582 + }, + { + "epoch": 0.5017105011084048, + "grad_norm": 1.3321004273470556, + "learning_rate": 2.486242273716726e-05, + "loss": 0.7435, + "num_input_tokens_seen": 827114176, + "step": 4583 + }, + { + "epoch": 0.5018199731793426, + "grad_norm": 1.4391263800576064, + "learning_rate": 2.4853824253348162e-05, + "loss": 0.742, + "num_input_tokens_seen": 827277920, + "step": 4584 + }, + { + "epoch": 0.5019294452502805, + "grad_norm": 1.1880538050608156, + "learning_rate": 2.484522578682138e-05, + "loss": 0.8854, + "num_input_tokens_seen": 827485792, + "step": 4585 + }, + { + "epoch": 0.5020389173212184, + "grad_norm": 1.2700093938825519, + "learning_rate": 2.4836627338604083e-05, + "loss": 0.7862, + "num_input_tokens_seen": 827667008, + "step": 4586 + }, + { + "epoch": 0.5021483893921563, + "grad_norm": 1.2725195355217958, + "learning_rate": 2.482802890971347e-05, + "loss": 0.7606, + "num_input_tokens_seen": 827844192, + "step": 4587 + }, + { + "epoch": 0.5022578614630943, + "grad_norm": 1.2970717407273844, + "learning_rate": 2.4819430501166695e-05, + "loss": 0.7711, + "num_input_tokens_seen": 828021376, + "step": 4588 + }, + { + "epoch": 0.5023673335340322, + "grad_norm": 1.585673718995236, + "learning_rate": 2.4810832113980952e-05, + "loss": 0.5647, + "num_input_tokens_seen": 828230368, + "step": 4589 + }, + { + "epoch": 0.50247680560497, + "grad_norm": 1.304835328276038, + "learning_rate": 2.4802233749173416e-05, + "loss": 0.815, + "num_input_tokens_seen": 828409344, + "step": 4590 + }, + { + "epoch": 0.5025862776759079, + "grad_norm": 1.3231741236675065, + "learning_rate": 2.479363540776124e-05, + "loss": 0.7442, + "num_input_tokens_seen": 828594816, + "step": 4591 + }, + { + "epoch": 0.5026957497468458, + "grad_norm": 1.3180402807554976, + "learning_rate": 2.4785037090761605e-05, + "loss": 0.8391, + "num_input_tokens_seen": 828758112, + "step": 4592 + }, + { + "epoch": 0.5028052218177838, + "grad_norm": 1.431661774816064, + "learning_rate": 2.477643879919167e-05, + "loss": 0.8676, + "num_input_tokens_seen": 828955904, + "step": 4593 + }, + { + "epoch": 0.5029146938887217, + "grad_norm": 1.3799109756392611, + "learning_rate": 2.47678405340686e-05, + "loss": 0.7869, + "num_input_tokens_seen": 829134656, + "step": 4594 + }, + { + "epoch": 0.5030241659596596, + "grad_norm": 1.319502542566276, + "learning_rate": 2.4759242296409547e-05, + "loss": 0.5883, + "num_input_tokens_seen": 829305568, + "step": 4595 + }, + { + "epoch": 0.5031336380305974, + "grad_norm": 1.3982146101479518, + "learning_rate": 2.4750644087231668e-05, + "loss": 0.9888, + "num_input_tokens_seen": 829496864, + "step": 4596 + }, + { + "epoch": 0.5032431101015353, + "grad_norm": 1.4107608080430842, + "learning_rate": 2.4742045907552124e-05, + "loss": 0.9745, + "num_input_tokens_seen": 829668448, + "step": 4597 + }, + { + "epoch": 0.5033525821724733, + "grad_norm": 1.5174972139486915, + "learning_rate": 2.473344775838805e-05, + "loss": 0.8008, + "num_input_tokens_seen": 829842496, + "step": 4598 + }, + { + "epoch": 0.5034620542434112, + "grad_norm": 1.2536030340344217, + "learning_rate": 2.4724849640756608e-05, + "loss": 0.6628, + "num_input_tokens_seen": 830023264, + "step": 4599 + }, + { + "epoch": 0.5035715263143491, + "grad_norm": 1.4094131075492482, + "learning_rate": 2.4716251555674913e-05, + "loss": 0.6408, + "num_input_tokens_seen": 830164384, + "step": 4600 + }, + { + "epoch": 0.5036809983852869, + "grad_norm": 1.246928398881897, + "learning_rate": 2.4707653504160118e-05, + "loss": 0.8331, + "num_input_tokens_seen": 830370912, + "step": 4601 + }, + { + "epoch": 0.5037904704562248, + "grad_norm": 1.1761544481276789, + "learning_rate": 2.4699055487229366e-05, + "loss": 0.454, + "num_input_tokens_seen": 830553472, + "step": 4602 + }, + { + "epoch": 0.5038999425271627, + "grad_norm": 1.3009060636557501, + "learning_rate": 2.4690457505899765e-05, + "loss": 0.6247, + "num_input_tokens_seen": 830729312, + "step": 4603 + }, + { + "epoch": 0.5040094145981007, + "grad_norm": 1.324225456648416, + "learning_rate": 2.468185956118845e-05, + "loss": 0.6947, + "num_input_tokens_seen": 830910528, + "step": 4604 + }, + { + "epoch": 0.5041188866690386, + "grad_norm": 1.31045040679498, + "learning_rate": 2.4673261654112543e-05, + "loss": 0.7686, + "num_input_tokens_seen": 831076512, + "step": 4605 + }, + { + "epoch": 0.5042283587399765, + "grad_norm": 1.1198732863850709, + "learning_rate": 2.466466378568916e-05, + "loss": 0.673, + "num_input_tokens_seen": 831275200, + "step": 4606 + }, + { + "epoch": 0.5043378308109143, + "grad_norm": 1.3962148979917834, + "learning_rate": 2.4656065956935408e-05, + "loss": 0.8783, + "num_input_tokens_seen": 831425504, + "step": 4607 + }, + { + "epoch": 0.5044473028818522, + "grad_norm": 1.2124143393234605, + "learning_rate": 2.464746816886839e-05, + "loss": 0.8276, + "num_input_tokens_seen": 831625088, + "step": 4608 + }, + { + "epoch": 0.5045567749527902, + "grad_norm": 1.3702861439726832, + "learning_rate": 2.4638870422505225e-05, + "loss": 0.8796, + "num_input_tokens_seen": 831803840, + "step": 4609 + }, + { + "epoch": 0.5046662470237281, + "grad_norm": 1.3741690986169162, + "learning_rate": 2.4630272718862992e-05, + "loss": 0.8266, + "num_input_tokens_seen": 831984832, + "step": 4610 + }, + { + "epoch": 0.504775719094666, + "grad_norm": 1.353772864179474, + "learning_rate": 2.4621675058958792e-05, + "loss": 0.7756, + "num_input_tokens_seen": 832184192, + "step": 4611 + }, + { + "epoch": 0.5048851911656039, + "grad_norm": 1.4875589855904747, + "learning_rate": 2.4613077443809706e-05, + "loss": 0.6661, + "num_input_tokens_seen": 832330464, + "step": 4612 + }, + { + "epoch": 0.5049946632365417, + "grad_norm": 1.3244177294647888, + "learning_rate": 2.460447987443282e-05, + "loss": 0.7282, + "num_input_tokens_seen": 832506528, + "step": 4613 + }, + { + "epoch": 0.5051041353074797, + "grad_norm": 1.3990547480228903, + "learning_rate": 2.459588235184521e-05, + "loss": 0.8362, + "num_input_tokens_seen": 832690432, + "step": 4614 + }, + { + "epoch": 0.5052136073784176, + "grad_norm": 1.3047998345720524, + "learning_rate": 2.4587284877063942e-05, + "loss": 0.7608, + "num_input_tokens_seen": 832862688, + "step": 4615 + }, + { + "epoch": 0.5053230794493555, + "grad_norm": 1.313920569267549, + "learning_rate": 2.4578687451106085e-05, + "loss": 0.8531, + "num_input_tokens_seen": 833037632, + "step": 4616 + }, + { + "epoch": 0.5054325515202934, + "grad_norm": 1.4393898524297828, + "learning_rate": 2.4570090074988697e-05, + "loss": 0.8333, + "num_input_tokens_seen": 833191072, + "step": 4617 + }, + { + "epoch": 0.5055420235912312, + "grad_norm": 1.2772339006605093, + "learning_rate": 2.456149274972884e-05, + "loss": 0.7134, + "num_input_tokens_seen": 833361312, + "step": 4618 + }, + { + "epoch": 0.5056514956621692, + "grad_norm": 1.2092721624293425, + "learning_rate": 2.455289547634354e-05, + "loss": 0.7392, + "num_input_tokens_seen": 833536704, + "step": 4619 + }, + { + "epoch": 0.5057609677331071, + "grad_norm": 1.3312976864436123, + "learning_rate": 2.4544298255849862e-05, + "loss": 0.8226, + "num_input_tokens_seen": 833730240, + "step": 4620 + }, + { + "epoch": 0.505870439804045, + "grad_norm": 1.2941585264340891, + "learning_rate": 2.453570108926482e-05, + "loss": 0.7372, + "num_input_tokens_seen": 833907424, + "step": 4621 + }, + { + "epoch": 0.5059799118749829, + "grad_norm": 1.3427296690309891, + "learning_rate": 2.4527103977605447e-05, + "loss": 0.8083, + "num_input_tokens_seen": 834112160, + "step": 4622 + }, + { + "epoch": 0.5060893839459208, + "grad_norm": 1.232650082795562, + "learning_rate": 2.4518506921888788e-05, + "loss": 0.6943, + "num_input_tokens_seen": 834284640, + "step": 4623 + }, + { + "epoch": 0.5061988560168587, + "grad_norm": 1.3553504562428387, + "learning_rate": 2.4509909923131822e-05, + "loss": 0.8215, + "num_input_tokens_seen": 834460256, + "step": 4624 + }, + { + "epoch": 0.5063083280877966, + "grad_norm": 1.3848639320160572, + "learning_rate": 2.450131298235158e-05, + "loss": 0.7743, + "num_input_tokens_seen": 834666560, + "step": 4625 + }, + { + "epoch": 0.5064178001587345, + "grad_norm": 1.4245783499918503, + "learning_rate": 2.449271610056506e-05, + "loss": 0.6635, + "num_input_tokens_seen": 834838144, + "step": 4626 + }, + { + "epoch": 0.5065272722296724, + "grad_norm": 1.3386492951856728, + "learning_rate": 2.448411927878926e-05, + "loss": 0.8605, + "num_input_tokens_seen": 835026752, + "step": 4627 + }, + { + "epoch": 0.5066367443006103, + "grad_norm": 1.364414214314655, + "learning_rate": 2.447552251804116e-05, + "loss": 0.8306, + "num_input_tokens_seen": 835210656, + "step": 4628 + }, + { + "epoch": 0.5067462163715483, + "grad_norm": 1.1632584220537077, + "learning_rate": 2.4466925819337734e-05, + "loss": 0.7278, + "num_input_tokens_seen": 835398368, + "step": 4629 + }, + { + "epoch": 0.5068556884424861, + "grad_norm": 1.2421484407246002, + "learning_rate": 2.4458329183695983e-05, + "loss": 0.5558, + "num_input_tokens_seen": 835576448, + "step": 4630 + }, + { + "epoch": 0.506965160513424, + "grad_norm": 1.388412945049651, + "learning_rate": 2.444973261213284e-05, + "loss": 0.8294, + "num_input_tokens_seen": 835769312, + "step": 4631 + }, + { + "epoch": 0.5070746325843619, + "grad_norm": 1.211621294462031, + "learning_rate": 2.4441136105665284e-05, + "loss": 0.6983, + "num_input_tokens_seen": 835947168, + "step": 4632 + }, + { + "epoch": 0.5071841046552998, + "grad_norm": 1.3643482918250547, + "learning_rate": 2.4432539665310252e-05, + "loss": 0.8351, + "num_input_tokens_seen": 836126816, + "step": 4633 + }, + { + "epoch": 0.5072935767262378, + "grad_norm": 1.1952115683234492, + "learning_rate": 2.44239432920847e-05, + "loss": 0.6348, + "num_input_tokens_seen": 836296608, + "step": 4634 + }, + { + "epoch": 0.5074030487971756, + "grad_norm": 1.4269380240237861, + "learning_rate": 2.441534698700555e-05, + "loss": 0.947, + "num_input_tokens_seen": 836493280, + "step": 4635 + }, + { + "epoch": 0.5075125208681135, + "grad_norm": 1.2846490386646907, + "learning_rate": 2.4406750751089737e-05, + "loss": 0.6459, + "num_input_tokens_seen": 836655456, + "step": 4636 + }, + { + "epoch": 0.5076219929390514, + "grad_norm": 1.296442120476383, + "learning_rate": 2.4398154585354174e-05, + "loss": 0.8388, + "num_input_tokens_seen": 836819200, + "step": 4637 + }, + { + "epoch": 0.5077314650099893, + "grad_norm": 1.4268152119646145, + "learning_rate": 2.438955849081577e-05, + "loss": 0.8457, + "num_input_tokens_seen": 836976000, + "step": 4638 + }, + { + "epoch": 0.5078409370809273, + "grad_norm": 1.3416972338509545, + "learning_rate": 2.4380962468491438e-05, + "loss": 0.6062, + "num_input_tokens_seen": 837168192, + "step": 4639 + }, + { + "epoch": 0.5079504091518652, + "grad_norm": 1.178923049716051, + "learning_rate": 2.4372366519398053e-05, + "loss": 0.8231, + "num_input_tokens_seen": 837378976, + "step": 4640 + }, + { + "epoch": 0.508059881222803, + "grad_norm": 1.2384943733564935, + "learning_rate": 2.4363770644552523e-05, + "loss": 0.7786, + "num_input_tokens_seen": 837553920, + "step": 4641 + }, + { + "epoch": 0.5081693532937409, + "grad_norm": 1.3435864459905162, + "learning_rate": 2.4355174844971695e-05, + "loss": 0.6899, + "num_input_tokens_seen": 837756192, + "step": 4642 + }, + { + "epoch": 0.5082788253646788, + "grad_norm": 1.3161732725003663, + "learning_rate": 2.434657912167245e-05, + "loss": 0.7296, + "num_input_tokens_seen": 837910304, + "step": 4643 + }, + { + "epoch": 0.5083882974356168, + "grad_norm": 1.35444294483139, + "learning_rate": 2.4337983475671657e-05, + "loss": 0.6771, + "num_input_tokens_seen": 838068896, + "step": 4644 + }, + { + "epoch": 0.5084977695065547, + "grad_norm": 1.206927989505634, + "learning_rate": 2.4329387907986145e-05, + "loss": 0.6447, + "num_input_tokens_seen": 838261536, + "step": 4645 + }, + { + "epoch": 0.5086072415774926, + "grad_norm": 1.1229395010660657, + "learning_rate": 2.4320792419632764e-05, + "loss": 0.5491, + "num_input_tokens_seen": 838451040, + "step": 4646 + }, + { + "epoch": 0.5087167136484304, + "grad_norm": 1.358294046258983, + "learning_rate": 2.431219701162834e-05, + "loss": 0.762, + "num_input_tokens_seen": 838617024, + "step": 4647 + }, + { + "epoch": 0.5088261857193683, + "grad_norm": 1.1874619528046026, + "learning_rate": 2.4303601684989698e-05, + "loss": 0.7074, + "num_input_tokens_seen": 838774048, + "step": 4648 + }, + { + "epoch": 0.5089356577903063, + "grad_norm": 1.3408790347693913, + "learning_rate": 2.429500644073364e-05, + "loss": 0.8319, + "num_input_tokens_seen": 838955488, + "step": 4649 + }, + { + "epoch": 0.5090451298612442, + "grad_norm": 1.2332834656832143, + "learning_rate": 2.428641127987697e-05, + "loss": 0.854, + "num_input_tokens_seen": 839140512, + "step": 4650 + }, + { + "epoch": 0.5091546019321821, + "grad_norm": 1.200578198610444, + "learning_rate": 2.4277816203436498e-05, + "loss": 0.7887, + "num_input_tokens_seen": 839327776, + "step": 4651 + }, + { + "epoch": 0.5092640740031199, + "grad_norm": 1.4310298100638086, + "learning_rate": 2.426922121242897e-05, + "loss": 0.8172, + "num_input_tokens_seen": 839476512, + "step": 4652 + }, + { + "epoch": 0.5093735460740578, + "grad_norm": 1.2830370094390622, + "learning_rate": 2.426062630787119e-05, + "loss": 0.7144, + "num_input_tokens_seen": 839655264, + "step": 4653 + }, + { + "epoch": 0.5094830181449957, + "grad_norm": 1.294146367413521, + "learning_rate": 2.4252031490779894e-05, + "loss": 0.8514, + "num_input_tokens_seen": 839833792, + "step": 4654 + }, + { + "epoch": 0.5095924902159337, + "grad_norm": 1.2464445092817522, + "learning_rate": 2.4243436762171848e-05, + "loss": 0.6819, + "num_input_tokens_seen": 839998432, + "step": 4655 + }, + { + "epoch": 0.5097019622868716, + "grad_norm": 1.2656766150981646, + "learning_rate": 2.4234842123063786e-05, + "loss": 0.9939, + "num_input_tokens_seen": 840208544, + "step": 4656 + }, + { + "epoch": 0.5098114343578095, + "grad_norm": 1.1919942936760692, + "learning_rate": 2.4226247574472433e-05, + "loss": 0.635, + "num_input_tokens_seen": 840393344, + "step": 4657 + }, + { + "epoch": 0.5099209064287473, + "grad_norm": 1.1211641934158072, + "learning_rate": 2.421765311741452e-05, + "loss": 0.7471, + "num_input_tokens_seen": 840566944, + "step": 4658 + }, + { + "epoch": 0.5100303784996852, + "grad_norm": 1.1885314778262603, + "learning_rate": 2.420905875290674e-05, + "loss": 0.5094, + "num_input_tokens_seen": 840736288, + "step": 4659 + }, + { + "epoch": 0.5101398505706232, + "grad_norm": 1.3687908253612528, + "learning_rate": 2.4200464481965807e-05, + "loss": 0.7844, + "num_input_tokens_seen": 840937664, + "step": 4660 + }, + { + "epoch": 0.5102493226415611, + "grad_norm": 1.2569745038526323, + "learning_rate": 2.419187030560839e-05, + "loss": 0.7691, + "num_input_tokens_seen": 841136128, + "step": 4661 + }, + { + "epoch": 0.510358794712499, + "grad_norm": 1.2216068920033514, + "learning_rate": 2.4183276224851178e-05, + "loss": 0.6193, + "num_input_tokens_seen": 841324288, + "step": 4662 + }, + { + "epoch": 0.5104682667834369, + "grad_norm": 1.4192671282891958, + "learning_rate": 2.417468224071082e-05, + "loss": 0.6841, + "num_input_tokens_seen": 841469888, + "step": 4663 + }, + { + "epoch": 0.5105777388543747, + "grad_norm": 1.3784413621182887, + "learning_rate": 2.4166088354203974e-05, + "loss": 0.9889, + "num_input_tokens_seen": 841637888, + "step": 4664 + }, + { + "epoch": 0.5106872109253127, + "grad_norm": 1.1226863381874017, + "learning_rate": 2.415749456634729e-05, + "loss": 0.5168, + "num_input_tokens_seen": 841817760, + "step": 4665 + }, + { + "epoch": 0.5107966829962506, + "grad_norm": 1.3353268304007215, + "learning_rate": 2.4148900878157378e-05, + "loss": 0.864, + "num_input_tokens_seen": 842032576, + "step": 4666 + }, + { + "epoch": 0.5109061550671885, + "grad_norm": 1.1305093964899464, + "learning_rate": 2.4140307290650874e-05, + "loss": 0.4939, + "num_input_tokens_seen": 842226560, + "step": 4667 + }, + { + "epoch": 0.5110156271381264, + "grad_norm": 1.3594812373598915, + "learning_rate": 2.4131713804844364e-05, + "loss": 0.8269, + "num_input_tokens_seen": 842414496, + "step": 4668 + }, + { + "epoch": 0.5111250992090642, + "grad_norm": 1.2018365277159855, + "learning_rate": 2.412312042175446e-05, + "loss": 0.5688, + "num_input_tokens_seen": 842608704, + "step": 4669 + }, + { + "epoch": 0.5112345712800022, + "grad_norm": 1.3975817656780143, + "learning_rate": 2.4114527142397732e-05, + "loss": 0.7895, + "num_input_tokens_seen": 842780064, + "step": 4670 + }, + { + "epoch": 0.5113440433509401, + "grad_norm": 1.3330140725013266, + "learning_rate": 2.4105933967790744e-05, + "loss": 0.6927, + "num_input_tokens_seen": 842981440, + "step": 4671 + }, + { + "epoch": 0.511453515421878, + "grad_norm": 1.3212760253623952, + "learning_rate": 2.4097340898950073e-05, + "loss": 0.772, + "num_input_tokens_seen": 843179456, + "step": 4672 + }, + { + "epoch": 0.5115629874928159, + "grad_norm": 1.419832712178625, + "learning_rate": 2.4088747936892237e-05, + "loss": 1.0709, + "num_input_tokens_seen": 843370528, + "step": 4673 + }, + { + "epoch": 0.5116724595637538, + "grad_norm": 1.1643464426991215, + "learning_rate": 2.4080155082633784e-05, + "loss": 0.7427, + "num_input_tokens_seen": 843543232, + "step": 4674 + }, + { + "epoch": 0.5117819316346917, + "grad_norm": 1.2382730670261461, + "learning_rate": 2.407156233719122e-05, + "loss": 0.7398, + "num_input_tokens_seen": 843756480, + "step": 4675 + }, + { + "epoch": 0.5118914037056296, + "grad_norm": 1.2433424085297793, + "learning_rate": 2.4062969701581054e-05, + "loss": 0.7332, + "num_input_tokens_seen": 843955392, + "step": 4676 + }, + { + "epoch": 0.5120008757765675, + "grad_norm": 1.3228913802886666, + "learning_rate": 2.4054377176819793e-05, + "loss": 0.5909, + "num_input_tokens_seen": 844163488, + "step": 4677 + }, + { + "epoch": 0.5121103478475054, + "grad_norm": 1.2310026433499526, + "learning_rate": 2.4045784763923893e-05, + "loss": 0.7168, + "num_input_tokens_seen": 844345824, + "step": 4678 + }, + { + "epoch": 0.5122198199184433, + "grad_norm": 1.2186174076042193, + "learning_rate": 2.4037192463909837e-05, + "loss": 0.8186, + "num_input_tokens_seen": 844507776, + "step": 4679 + }, + { + "epoch": 0.5123292919893813, + "grad_norm": 1.1722373910698887, + "learning_rate": 2.4028600277794066e-05, + "loss": 0.9017, + "num_input_tokens_seen": 844713856, + "step": 4680 + }, + { + "epoch": 0.5124387640603191, + "grad_norm": 1.1198994725105276, + "learning_rate": 2.4020008206593036e-05, + "loss": 0.6588, + "num_input_tokens_seen": 844884096, + "step": 4681 + }, + { + "epoch": 0.512548236131257, + "grad_norm": 1.2310455907781814, + "learning_rate": 2.401141625132315e-05, + "loss": 0.6475, + "num_input_tokens_seen": 845043360, + "step": 4682 + }, + { + "epoch": 0.5126577082021949, + "grad_norm": 1.2792762347005022, + "learning_rate": 2.4002824413000836e-05, + "loss": 0.6861, + "num_input_tokens_seen": 845185824, + "step": 4683 + }, + { + "epoch": 0.5127671802731328, + "grad_norm": 1.3503503380179278, + "learning_rate": 2.3994232692642496e-05, + "loss": 0.6432, + "num_input_tokens_seen": 845354944, + "step": 4684 + }, + { + "epoch": 0.5128766523440708, + "grad_norm": 1.1989756284914093, + "learning_rate": 2.3985641091264495e-05, + "loss": 0.8197, + "num_input_tokens_seen": 845546912, + "step": 4685 + }, + { + "epoch": 0.5129861244150086, + "grad_norm": 1.2903602771963052, + "learning_rate": 2.3977049609883222e-05, + "loss": 0.7045, + "num_input_tokens_seen": 845696768, + "step": 4686 + }, + { + "epoch": 0.5130955964859465, + "grad_norm": 1.3370563216832754, + "learning_rate": 2.3968458249515016e-05, + "loss": 0.7707, + "num_input_tokens_seen": 845863200, + "step": 4687 + }, + { + "epoch": 0.5132050685568844, + "grad_norm": 1.4749293908497971, + "learning_rate": 2.395986701117623e-05, + "loss": 1.0119, + "num_input_tokens_seen": 846036576, + "step": 4688 + }, + { + "epoch": 0.5133145406278223, + "grad_norm": 1.3514203123590893, + "learning_rate": 2.3951275895883188e-05, + "loss": 0.8651, + "num_input_tokens_seen": 846219136, + "step": 4689 + }, + { + "epoch": 0.5134240126987603, + "grad_norm": 1.3188438599075312, + "learning_rate": 2.3942684904652204e-05, + "loss": 0.7929, + "num_input_tokens_seen": 846402368, + "step": 4690 + }, + { + "epoch": 0.5135334847696982, + "grad_norm": 1.4133939901567194, + "learning_rate": 2.3934094038499577e-05, + "loss": 0.8649, + "num_input_tokens_seen": 846574400, + "step": 4691 + }, + { + "epoch": 0.513642956840636, + "grad_norm": 1.439092003336612, + "learning_rate": 2.392550329844158e-05, + "loss": 0.7526, + "num_input_tokens_seen": 846742624, + "step": 4692 + }, + { + "epoch": 0.5137524289115739, + "grad_norm": 1.2371959081461865, + "learning_rate": 2.3916912685494502e-05, + "loss": 0.6734, + "num_input_tokens_seen": 846930336, + "step": 4693 + }, + { + "epoch": 0.5138619009825118, + "grad_norm": 1.2808490684120206, + "learning_rate": 2.390832220067457e-05, + "loss": 0.7444, + "num_input_tokens_seen": 847123648, + "step": 4694 + }, + { + "epoch": 0.5139713730534498, + "grad_norm": 1.4416859259816557, + "learning_rate": 2.3899731844998048e-05, + "loss": 0.8045, + "num_input_tokens_seen": 847290752, + "step": 4695 + }, + { + "epoch": 0.5140808451243877, + "grad_norm": 1.2806551878439694, + "learning_rate": 2.3891141619481132e-05, + "loss": 0.7587, + "num_input_tokens_seen": 847467488, + "step": 4696 + }, + { + "epoch": 0.5141903171953256, + "grad_norm": 1.2543306197073647, + "learning_rate": 2.388255152514004e-05, + "loss": 0.682, + "num_input_tokens_seen": 847657664, + "step": 4697 + }, + { + "epoch": 0.5142997892662634, + "grad_norm": 1.2565809108217383, + "learning_rate": 2.387396156299098e-05, + "loss": 0.4822, + "num_input_tokens_seen": 847830144, + "step": 4698 + }, + { + "epoch": 0.5144092613372013, + "grad_norm": 1.3025256766133249, + "learning_rate": 2.3865371734050107e-05, + "loss": 0.6122, + "num_input_tokens_seen": 847996352, + "step": 4699 + }, + { + "epoch": 0.5145187334081393, + "grad_norm": 1.3375404066595602, + "learning_rate": 2.385678203933359e-05, + "loss": 0.687, + "num_input_tokens_seen": 848172416, + "step": 4700 + }, + { + "epoch": 0.5146282054790772, + "grad_norm": 1.0770043339832487, + "learning_rate": 2.3848192479857566e-05, + "loss": 0.6665, + "num_input_tokens_seen": 848379392, + "step": 4701 + }, + { + "epoch": 0.5147376775500151, + "grad_norm": 1.2323074408602095, + "learning_rate": 2.3839603056638186e-05, + "loss": 0.7487, + "num_input_tokens_seen": 848538208, + "step": 4702 + }, + { + "epoch": 0.5148471496209529, + "grad_norm": 1.180613728878892, + "learning_rate": 2.3831013770691533e-05, + "loss": 0.6476, + "num_input_tokens_seen": 848689408, + "step": 4703 + }, + { + "epoch": 0.5149566216918908, + "grad_norm": 1.3587708281576214, + "learning_rate": 2.3822424623033712e-05, + "loss": 0.9179, + "num_input_tokens_seen": 848866592, + "step": 4704 + }, + { + "epoch": 0.5150660937628287, + "grad_norm": 1.2238085499551632, + "learning_rate": 2.381383561468082e-05, + "loss": 0.6197, + "num_input_tokens_seen": 849069088, + "step": 4705 + }, + { + "epoch": 0.5151755658337667, + "grad_norm": 1.3361607861333349, + "learning_rate": 2.38052467466489e-05, + "loss": 0.8342, + "num_input_tokens_seen": 849207072, + "step": 4706 + }, + { + "epoch": 0.5152850379047046, + "grad_norm": 1.3067798853905692, + "learning_rate": 2.3796658019954013e-05, + "loss": 0.8924, + "num_input_tokens_seen": 849402400, + "step": 4707 + }, + { + "epoch": 0.5153945099756425, + "grad_norm": 1.1663359900007566, + "learning_rate": 2.3788069435612174e-05, + "loss": 0.6358, + "num_input_tokens_seen": 849567712, + "step": 4708 + }, + { + "epoch": 0.5155039820465803, + "grad_norm": 1.306507052449617, + "learning_rate": 2.3779480994639406e-05, + "loss": 0.8459, + "num_input_tokens_seen": 849753184, + "step": 4709 + }, + { + "epoch": 0.5156134541175182, + "grad_norm": 1.2026574848967133, + "learning_rate": 2.37708926980517e-05, + "loss": 0.6796, + "num_input_tokens_seen": 849909984, + "step": 4710 + }, + { + "epoch": 0.5157229261884562, + "grad_norm": 1.218358784021428, + "learning_rate": 2.3762304546865042e-05, + "loss": 0.707, + "num_input_tokens_seen": 850046400, + "step": 4711 + }, + { + "epoch": 0.5158323982593941, + "grad_norm": 1.1207536453980846, + "learning_rate": 2.375371654209539e-05, + "loss": 0.7439, + "num_input_tokens_seen": 850255392, + "step": 4712 + }, + { + "epoch": 0.515941870330332, + "grad_norm": 1.2363857840400456, + "learning_rate": 2.3745128684758684e-05, + "loss": 0.8206, + "num_input_tokens_seen": 850433696, + "step": 4713 + }, + { + "epoch": 0.5160513424012699, + "grad_norm": 1.2761566058443123, + "learning_rate": 2.3736540975870865e-05, + "loss": 0.9746, + "num_input_tokens_seen": 850622304, + "step": 4714 + }, + { + "epoch": 0.5161608144722077, + "grad_norm": 1.2111224125607998, + "learning_rate": 2.3727953416447826e-05, + "loss": 0.6455, + "num_input_tokens_seen": 850818976, + "step": 4715 + }, + { + "epoch": 0.5162702865431457, + "grad_norm": 1.10475270296048, + "learning_rate": 2.3719366007505477e-05, + "loss": 0.5936, + "num_input_tokens_seen": 850986976, + "step": 4716 + }, + { + "epoch": 0.5163797586140836, + "grad_norm": 1.364918160552754, + "learning_rate": 2.3710778750059665e-05, + "loss": 0.8448, + "num_input_tokens_seen": 851166176, + "step": 4717 + }, + { + "epoch": 0.5164892306850215, + "grad_norm": 1.3010518018919768, + "learning_rate": 2.3702191645126266e-05, + "loss": 0.7961, + "num_input_tokens_seen": 851374272, + "step": 4718 + }, + { + "epoch": 0.5165987027559594, + "grad_norm": 1.3351917527748052, + "learning_rate": 2.3693604693721126e-05, + "loss": 0.6708, + "num_input_tokens_seen": 851537792, + "step": 4719 + }, + { + "epoch": 0.5167081748268972, + "grad_norm": 1.2334355509558723, + "learning_rate": 2.368501789686004e-05, + "loss": 0.6459, + "num_input_tokens_seen": 851719456, + "step": 4720 + }, + { + "epoch": 0.5168176468978352, + "grad_norm": 1.1451665787940857, + "learning_rate": 2.367643125555883e-05, + "loss": 0.5755, + "num_input_tokens_seen": 851915680, + "step": 4721 + }, + { + "epoch": 0.5169271189687731, + "grad_norm": 1.2266296562713659, + "learning_rate": 2.3667844770833265e-05, + "loss": 0.8051, + "num_input_tokens_seen": 852133408, + "step": 4722 + }, + { + "epoch": 0.517036591039711, + "grad_norm": 1.2243696400952013, + "learning_rate": 2.3659258443699128e-05, + "loss": 0.665, + "num_input_tokens_seen": 852343968, + "step": 4723 + }, + { + "epoch": 0.5171460631106489, + "grad_norm": 1.2160718787808442, + "learning_rate": 2.3650672275172145e-05, + "loss": 0.6773, + "num_input_tokens_seen": 852535712, + "step": 4724 + }, + { + "epoch": 0.5172555351815868, + "grad_norm": 1.2912486179193003, + "learning_rate": 2.3642086266268053e-05, + "loss": 0.6971, + "num_input_tokens_seen": 852701024, + "step": 4725 + }, + { + "epoch": 0.5173650072525247, + "grad_norm": 1.350096199352175, + "learning_rate": 2.363350041800257e-05, + "loss": 0.8445, + "num_input_tokens_seen": 852890080, + "step": 4726 + }, + { + "epoch": 0.5174744793234626, + "grad_norm": 1.2717712828690861, + "learning_rate": 2.362491473139136e-05, + "loss": 0.8064, + "num_input_tokens_seen": 853063680, + "step": 4727 + }, + { + "epoch": 0.5175839513944005, + "grad_norm": 1.275618575143965, + "learning_rate": 2.3616329207450124e-05, + "loss": 0.8204, + "num_input_tokens_seen": 853249824, + "step": 4728 + }, + { + "epoch": 0.5176934234653384, + "grad_norm": 1.3664096472356058, + "learning_rate": 2.360774384719449e-05, + "loss": 0.6753, + "num_input_tokens_seen": 853431040, + "step": 4729 + }, + { + "epoch": 0.5178028955362763, + "grad_norm": 1.372077827840294, + "learning_rate": 2.359915865164009e-05, + "loss": 0.8935, + "num_input_tokens_seen": 853635104, + "step": 4730 + }, + { + "epoch": 0.5179123676072143, + "grad_norm": 1.2865195883969434, + "learning_rate": 2.3590573621802553e-05, + "loss": 0.7694, + "num_input_tokens_seen": 853842976, + "step": 4731 + }, + { + "epoch": 0.5180218396781521, + "grad_norm": 1.2225217897622445, + "learning_rate": 2.3581988758697463e-05, + "loss": 0.7688, + "num_input_tokens_seen": 854019488, + "step": 4732 + }, + { + "epoch": 0.51813131174909, + "grad_norm": 1.2322259375400866, + "learning_rate": 2.3573404063340386e-05, + "loss": 0.905, + "num_input_tokens_seen": 854198688, + "step": 4733 + }, + { + "epoch": 0.5182407838200279, + "grad_norm": 1.1384613208859766, + "learning_rate": 2.3564819536746883e-05, + "loss": 0.593, + "num_input_tokens_seen": 854389984, + "step": 4734 + }, + { + "epoch": 0.5183502558909658, + "grad_norm": 1.3413904898977944, + "learning_rate": 2.3556235179932494e-05, + "loss": 0.7793, + "num_input_tokens_seen": 854586208, + "step": 4735 + }, + { + "epoch": 0.5184597279619038, + "grad_norm": 1.1832865703548283, + "learning_rate": 2.354765099391271e-05, + "loss": 0.7146, + "num_input_tokens_seen": 854773920, + "step": 4736 + }, + { + "epoch": 0.5185692000328416, + "grad_norm": 1.2303377596447316, + "learning_rate": 2.353906697970304e-05, + "loss": 0.7044, + "num_input_tokens_seen": 854964320, + "step": 4737 + }, + { + "epoch": 0.5186786721037795, + "grad_norm": 1.3564509809799892, + "learning_rate": 2.3530483138318964e-05, + "loss": 0.8837, + "num_input_tokens_seen": 855109024, + "step": 4738 + }, + { + "epoch": 0.5187881441747174, + "grad_norm": 1.2257461260789422, + "learning_rate": 2.352189947077591e-05, + "loss": 0.6832, + "num_input_tokens_seen": 855285312, + "step": 4739 + }, + { + "epoch": 0.5188976162456553, + "grad_norm": 1.305789550594191, + "learning_rate": 2.3513315978089336e-05, + "loss": 0.652, + "num_input_tokens_seen": 855423968, + "step": 4740 + }, + { + "epoch": 0.5190070883165933, + "grad_norm": 1.3412042944551836, + "learning_rate": 2.3504732661274627e-05, + "loss": 0.7119, + "num_input_tokens_seen": 855593312, + "step": 4741 + }, + { + "epoch": 0.5191165603875312, + "grad_norm": 1.2882129528478865, + "learning_rate": 2.3496149521347194e-05, + "loss": 0.8052, + "num_input_tokens_seen": 855771616, + "step": 4742 + }, + { + "epoch": 0.519226032458469, + "grad_norm": 1.3213527576688122, + "learning_rate": 2.348756655932239e-05, + "loss": 0.8435, + "num_input_tokens_seen": 855980384, + "step": 4743 + }, + { + "epoch": 0.5193355045294069, + "grad_norm": 1.188346109620297, + "learning_rate": 2.3478983776215572e-05, + "loss": 0.7079, + "num_input_tokens_seen": 856150848, + "step": 4744 + }, + { + "epoch": 0.5194449766003448, + "grad_norm": 1.2877415680413933, + "learning_rate": 2.3470401173042076e-05, + "loss": 0.647, + "num_input_tokens_seen": 856319296, + "step": 4745 + }, + { + "epoch": 0.5195544486712828, + "grad_norm": 1.2976115329853148, + "learning_rate": 2.3461818750817186e-05, + "loss": 0.6906, + "num_input_tokens_seen": 856516864, + "step": 4746 + }, + { + "epoch": 0.5196639207422207, + "grad_norm": 1.2102819329461552, + "learning_rate": 2.3453236510556207e-05, + "loss": 0.6833, + "num_input_tokens_seen": 856678144, + "step": 4747 + }, + { + "epoch": 0.5197733928131586, + "grad_norm": 1.3193119478903585, + "learning_rate": 2.344465445327438e-05, + "loss": 0.7495, + "num_input_tokens_seen": 856839424, + "step": 4748 + }, + { + "epoch": 0.5198828648840964, + "grad_norm": 1.133802362934072, + "learning_rate": 2.3436072579986973e-05, + "loss": 0.5976, + "num_input_tokens_seen": 857019072, + "step": 4749 + }, + { + "epoch": 0.5199923369550343, + "grad_norm": 1.2103744182118432, + "learning_rate": 2.3427490891709176e-05, + "loss": 0.6886, + "num_input_tokens_seen": 857234336, + "step": 4750 + }, + { + "epoch": 0.5201018090259723, + "grad_norm": 1.2780422435727647, + "learning_rate": 2.3418909389456203e-05, + "loss": 0.5229, + "num_input_tokens_seen": 857394496, + "step": 4751 + }, + { + "epoch": 0.5202112810969102, + "grad_norm": 1.1926976443593282, + "learning_rate": 2.3410328074243232e-05, + "loss": 0.6878, + "num_input_tokens_seen": 857587136, + "step": 4752 + }, + { + "epoch": 0.5203207531678481, + "grad_norm": 1.1328624845855402, + "learning_rate": 2.340174694708541e-05, + "loss": 0.6391, + "num_input_tokens_seen": 857772832, + "step": 4753 + }, + { + "epoch": 0.5204302252387859, + "grad_norm": 1.34683516547471, + "learning_rate": 2.339316600899787e-05, + "loss": 0.6634, + "num_input_tokens_seen": 857929856, + "step": 4754 + }, + { + "epoch": 0.5205396973097238, + "grad_norm": 1.4055272258237257, + "learning_rate": 2.338458526099571e-05, + "loss": 0.885, + "num_input_tokens_seen": 858113088, + "step": 4755 + }, + { + "epoch": 0.5206491693806617, + "grad_norm": 1.246316632753639, + "learning_rate": 2.337600470409404e-05, + "loss": 0.7307, + "num_input_tokens_seen": 858268768, + "step": 4756 + }, + { + "epoch": 0.5207586414515997, + "grad_norm": 1.1202806454948733, + "learning_rate": 2.3367424339307895e-05, + "loss": 0.7408, + "num_input_tokens_seen": 858467456, + "step": 4757 + }, + { + "epoch": 0.5208681135225376, + "grad_norm": 1.3495067331062023, + "learning_rate": 2.3358844167652334e-05, + "loss": 0.7521, + "num_input_tokens_seen": 858660320, + "step": 4758 + }, + { + "epoch": 0.5209775855934755, + "grad_norm": 1.3132504179885056, + "learning_rate": 2.3350264190142377e-05, + "loss": 0.7516, + "num_input_tokens_seen": 858864160, + "step": 4759 + }, + { + "epoch": 0.5210870576644133, + "grad_norm": 1.5470700381846667, + "learning_rate": 2.3341684407793004e-05, + "loss": 0.8894, + "num_input_tokens_seen": 859019168, + "step": 4760 + }, + { + "epoch": 0.5211965297353512, + "grad_norm": 1.0780031992346404, + "learning_rate": 2.3333104821619207e-05, + "loss": 0.5534, + "num_input_tokens_seen": 859194112, + "step": 4761 + }, + { + "epoch": 0.5213060018062892, + "grad_norm": 1.24871760390285, + "learning_rate": 2.332452543263591e-05, + "loss": 0.6153, + "num_input_tokens_seen": 859324256, + "step": 4762 + }, + { + "epoch": 0.5214154738772271, + "grad_norm": 1.4115814868459942, + "learning_rate": 2.3315946241858058e-05, + "loss": 0.7163, + "num_input_tokens_seen": 859509952, + "step": 4763 + }, + { + "epoch": 0.521524945948165, + "grad_norm": 1.2219863860344367, + "learning_rate": 2.330736725030054e-05, + "loss": 0.7455, + "num_input_tokens_seen": 859683328, + "step": 4764 + }, + { + "epoch": 0.5216344180191029, + "grad_norm": 1.2322617803170326, + "learning_rate": 2.329878845897824e-05, + "loss": 0.7198, + "num_input_tokens_seen": 859834528, + "step": 4765 + }, + { + "epoch": 0.5217438900900407, + "grad_norm": 1.2164940617985163, + "learning_rate": 2.3290209868906025e-05, + "loss": 0.7367, + "num_input_tokens_seen": 860001632, + "step": 4766 + }, + { + "epoch": 0.5218533621609787, + "grad_norm": 1.1055721598538564, + "learning_rate": 2.3281631481098704e-05, + "loss": 0.4768, + "num_input_tokens_seen": 860206368, + "step": 4767 + }, + { + "epoch": 0.5219628342319166, + "grad_norm": 1.3042052057115252, + "learning_rate": 2.3273053296571104e-05, + "loss": 0.7879, + "num_input_tokens_seen": 860373696, + "step": 4768 + }, + { + "epoch": 0.5220723063028545, + "grad_norm": 1.3323893135719895, + "learning_rate": 2.326447531633799e-05, + "loss": 0.9015, + "num_input_tokens_seen": 860584704, + "step": 4769 + }, + { + "epoch": 0.5221817783737924, + "grad_norm": 1.4549036932727226, + "learning_rate": 2.3255897541414133e-05, + "loss": 1.005, + "num_input_tokens_seen": 860787872, + "step": 4770 + }, + { + "epoch": 0.5222912504447302, + "grad_norm": 1.314669133614114, + "learning_rate": 2.3247319972814256e-05, + "loss": 0.8123, + "num_input_tokens_seen": 860964384, + "step": 4771 + }, + { + "epoch": 0.5224007225156682, + "grad_norm": 1.3239305120626872, + "learning_rate": 2.3238742611553075e-05, + "loss": 0.7913, + "num_input_tokens_seen": 861138656, + "step": 4772 + }, + { + "epoch": 0.5225101945866061, + "grad_norm": 1.2043010858838916, + "learning_rate": 2.3230165458645282e-05, + "loss": 0.646, + "num_input_tokens_seen": 861280224, + "step": 4773 + }, + { + "epoch": 0.522619666657544, + "grad_norm": 1.208214386752775, + "learning_rate": 2.322158851510553e-05, + "loss": 0.7289, + "num_input_tokens_seen": 861469504, + "step": 4774 + }, + { + "epoch": 0.5227291387284819, + "grad_norm": 1.2181125832439108, + "learning_rate": 2.3213011781948456e-05, + "loss": 0.8554, + "num_input_tokens_seen": 861674912, + "step": 4775 + }, + { + "epoch": 0.5228386107994198, + "grad_norm": 1.290238370089419, + "learning_rate": 2.320443526018867e-05, + "loss": 0.7139, + "num_input_tokens_seen": 861871136, + "step": 4776 + }, + { + "epoch": 0.5229480828703577, + "grad_norm": 1.265392376278563, + "learning_rate": 2.319585895084077e-05, + "loss": 0.7043, + "num_input_tokens_seen": 862021664, + "step": 4777 + }, + { + "epoch": 0.5230575549412956, + "grad_norm": 1.2623477466291924, + "learning_rate": 2.3187282854919296e-05, + "loss": 0.686, + "num_input_tokens_seen": 862212960, + "step": 4778 + }, + { + "epoch": 0.5231670270122335, + "grad_norm": 1.0760331246159158, + "learning_rate": 2.3178706973438793e-05, + "loss": 0.5277, + "num_input_tokens_seen": 862412768, + "step": 4779 + }, + { + "epoch": 0.5232764990831714, + "grad_norm": 1.4287690928494228, + "learning_rate": 2.3170131307413788e-05, + "loss": 0.7344, + "num_input_tokens_seen": 862595552, + "step": 4780 + }, + { + "epoch": 0.5233859711541093, + "grad_norm": 1.1822028721922444, + "learning_rate": 2.3161555857858735e-05, + "loss": 0.6211, + "num_input_tokens_seen": 862802976, + "step": 4781 + }, + { + "epoch": 0.5234954432250473, + "grad_norm": 1.154908200698924, + "learning_rate": 2.3152980625788126e-05, + "loss": 0.6523, + "num_input_tokens_seen": 863005024, + "step": 4782 + }, + { + "epoch": 0.5236049152959851, + "grad_norm": 1.2238042639760163, + "learning_rate": 2.3144405612216365e-05, + "loss": 0.6374, + "num_input_tokens_seen": 863190048, + "step": 4783 + }, + { + "epoch": 0.523714387366923, + "grad_norm": 1.3170803619290103, + "learning_rate": 2.3135830818157877e-05, + "loss": 0.6081, + "num_input_tokens_seen": 863408224, + "step": 4784 + }, + { + "epoch": 0.5238238594378609, + "grad_norm": 1.3468712773459692, + "learning_rate": 2.3127256244627036e-05, + "loss": 0.8334, + "num_input_tokens_seen": 863591680, + "step": 4785 + }, + { + "epoch": 0.5239333315087988, + "grad_norm": 1.3357008339573921, + "learning_rate": 2.31186818926382e-05, + "loss": 0.747, + "num_input_tokens_seen": 863731680, + "step": 4786 + }, + { + "epoch": 0.5240428035797368, + "grad_norm": 1.177606072720258, + "learning_rate": 2.311010776320571e-05, + "loss": 0.6623, + "num_input_tokens_seen": 863892960, + "step": 4787 + }, + { + "epoch": 0.5241522756506746, + "grad_norm": 1.2877445303538422, + "learning_rate": 2.3101533857343848e-05, + "loss": 0.8344, + "num_input_tokens_seen": 864073504, + "step": 4788 + }, + { + "epoch": 0.5242617477216125, + "grad_norm": 1.3544217774147216, + "learning_rate": 2.3092960176066912e-05, + "loss": 0.6367, + "num_input_tokens_seen": 864219552, + "step": 4789 + }, + { + "epoch": 0.5243712197925504, + "grad_norm": 1.3107158705425963, + "learning_rate": 2.3084386720389135e-05, + "loss": 0.597, + "num_input_tokens_seen": 864410176, + "step": 4790 + }, + { + "epoch": 0.5244806918634883, + "grad_norm": 1.2923140697924482, + "learning_rate": 2.3075813491324743e-05, + "loss": 0.6825, + "num_input_tokens_seen": 864601920, + "step": 4791 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 1.2152865666873955, + "learning_rate": 2.3067240489887947e-05, + "loss": 0.8041, + "num_input_tokens_seen": 864798592, + "step": 4792 + }, + { + "epoch": 0.5246996360053642, + "grad_norm": 1.3429041573445566, + "learning_rate": 2.3058667717092895e-05, + "loss": 0.6823, + "num_input_tokens_seen": 864965024, + "step": 4793 + }, + { + "epoch": 0.524809108076302, + "grad_norm": 1.283280647525718, + "learning_rate": 2.305009517395375e-05, + "loss": 0.7255, + "num_input_tokens_seen": 865144896, + "step": 4794 + }, + { + "epoch": 0.5249185801472399, + "grad_norm": 1.2608047339823725, + "learning_rate": 2.3041522861484617e-05, + "loss": 0.7192, + "num_input_tokens_seen": 865338880, + "step": 4795 + }, + { + "epoch": 0.5250280522181778, + "grad_norm": 1.1846833705804976, + "learning_rate": 2.3032950780699582e-05, + "loss": 0.683, + "num_input_tokens_seen": 865540928, + "step": 4796 + }, + { + "epoch": 0.5251375242891158, + "grad_norm": 1.2379296221719083, + "learning_rate": 2.302437893261271e-05, + "loss": 0.8557, + "num_input_tokens_seen": 865735584, + "step": 4797 + }, + { + "epoch": 0.5252469963600537, + "grad_norm": 1.281138205884599, + "learning_rate": 2.3015807318238027e-05, + "loss": 0.5805, + "num_input_tokens_seen": 865913888, + "step": 4798 + }, + { + "epoch": 0.5253564684309916, + "grad_norm": 1.2789074487045533, + "learning_rate": 2.300723593858956e-05, + "loss": 0.7204, + "num_input_tokens_seen": 866098912, + "step": 4799 + }, + { + "epoch": 0.5254659405019294, + "grad_norm": 1.384065308386063, + "learning_rate": 2.299866479468126e-05, + "loss": 0.8575, + "num_input_tokens_seen": 866262880, + "step": 4800 + }, + { + "epoch": 0.5255754125728673, + "grad_norm": 1.1749425792867376, + "learning_rate": 2.29900938875271e-05, + "loss": 0.937, + "num_input_tokens_seen": 866459328, + "step": 4801 + }, + { + "epoch": 0.5256848846438053, + "grad_norm": 1.2899132435479652, + "learning_rate": 2.298152321814098e-05, + "loss": 0.7265, + "num_input_tokens_seen": 866625536, + "step": 4802 + }, + { + "epoch": 0.5257943567147432, + "grad_norm": 1.0389844248414755, + "learning_rate": 2.2972952787536815e-05, + "loss": 0.417, + "num_input_tokens_seen": 866801152, + "step": 4803 + }, + { + "epoch": 0.5259038287856811, + "grad_norm": 1.303159950208466, + "learning_rate": 2.2964382596728447e-05, + "loss": 0.7237, + "num_input_tokens_seen": 866957504, + "step": 4804 + }, + { + "epoch": 0.5260133008566189, + "grad_norm": 1.2484586272284164, + "learning_rate": 2.295581264672973e-05, + "loss": 0.5086, + "num_input_tokens_seen": 867122144, + "step": 4805 + }, + { + "epoch": 0.5261227729275568, + "grad_norm": 1.0590012670370705, + "learning_rate": 2.2947242938554475e-05, + "loss": 0.5052, + "num_input_tokens_seen": 867315008, + "step": 4806 + }, + { + "epoch": 0.5262322449984947, + "grad_norm": 1.3989766110011836, + "learning_rate": 2.2938673473216448e-05, + "loss": 0.8607, + "num_input_tokens_seen": 867491296, + "step": 4807 + }, + { + "epoch": 0.5263417170694327, + "grad_norm": 1.429336504766044, + "learning_rate": 2.293010425172942e-05, + "loss": 0.6455, + "num_input_tokens_seen": 867647872, + "step": 4808 + }, + { + "epoch": 0.5264511891403706, + "grad_norm": 1.3134622679432433, + "learning_rate": 2.2921535275107092e-05, + "loss": 0.6339, + "num_input_tokens_seen": 867824832, + "step": 4809 + }, + { + "epoch": 0.5265606612113085, + "grad_norm": 1.2305126848400163, + "learning_rate": 2.291296654436318e-05, + "loss": 0.6224, + "num_input_tokens_seen": 868034944, + "step": 4810 + }, + { + "epoch": 0.5266701332822463, + "grad_norm": 1.2780820247075275, + "learning_rate": 2.2904398060511324e-05, + "loss": 0.8253, + "num_input_tokens_seen": 868213696, + "step": 4811 + }, + { + "epoch": 0.5267796053531842, + "grad_norm": 1.219994276611786, + "learning_rate": 2.289582982456517e-05, + "loss": 0.6109, + "num_input_tokens_seen": 868419776, + "step": 4812 + }, + { + "epoch": 0.5268890774241222, + "grad_norm": 1.2195948093622888, + "learning_rate": 2.2887261837538335e-05, + "loss": 0.6017, + "num_input_tokens_seen": 868614880, + "step": 4813 + }, + { + "epoch": 0.5269985494950601, + "grad_norm": 1.3239625666100718, + "learning_rate": 2.2878694100444377e-05, + "loss": 0.7352, + "num_input_tokens_seen": 868771680, + "step": 4814 + }, + { + "epoch": 0.527108021565998, + "grad_norm": 1.2748933672725924, + "learning_rate": 2.2870126614296856e-05, + "loss": 0.7668, + "num_input_tokens_seen": 868944608, + "step": 4815 + }, + { + "epoch": 0.5272174936369359, + "grad_norm": 1.2785638237762795, + "learning_rate": 2.2861559380109287e-05, + "loss": 0.78, + "num_input_tokens_seen": 869149792, + "step": 4816 + }, + { + "epoch": 0.5273269657078737, + "grad_norm": 1.2664352460738642, + "learning_rate": 2.2852992398895155e-05, + "loss": 0.6638, + "num_input_tokens_seen": 869318912, + "step": 4817 + }, + { + "epoch": 0.5274364377788117, + "grad_norm": 1.208892079221343, + "learning_rate": 2.284442567166791e-05, + "loss": 0.7344, + "num_input_tokens_seen": 869535072, + "step": 4818 + }, + { + "epoch": 0.5275459098497496, + "grad_norm": 1.293210931783312, + "learning_rate": 2.283585919944099e-05, + "loss": 0.8286, + "num_input_tokens_seen": 869713600, + "step": 4819 + }, + { + "epoch": 0.5276553819206875, + "grad_norm": 1.3626463977661827, + "learning_rate": 2.28272929832278e-05, + "loss": 0.7133, + "num_input_tokens_seen": 869864800, + "step": 4820 + }, + { + "epoch": 0.5277648539916254, + "grad_norm": 1.298072813304688, + "learning_rate": 2.2818727024041685e-05, + "loss": 0.7637, + "num_input_tokens_seen": 870032352, + "step": 4821 + }, + { + "epoch": 0.5278743260625632, + "grad_norm": 1.279019810616355, + "learning_rate": 2.2810161322896e-05, + "loss": 0.7537, + "num_input_tokens_seen": 870229024, + "step": 4822 + }, + { + "epoch": 0.5279837981335012, + "grad_norm": 1.2956275915625086, + "learning_rate": 2.2801595880804037e-05, + "loss": 0.7722, + "num_input_tokens_seen": 870419648, + "step": 4823 + }, + { + "epoch": 0.5280932702044391, + "grad_norm": 1.2038958668493127, + "learning_rate": 2.279303069877909e-05, + "loss": 0.7127, + "num_input_tokens_seen": 870605120, + "step": 4824 + }, + { + "epoch": 0.528202742275377, + "grad_norm": 1.2099232517682155, + "learning_rate": 2.2784465777834383e-05, + "loss": 0.8827, + "num_input_tokens_seen": 870809408, + "step": 4825 + }, + { + "epoch": 0.5283122143463149, + "grad_norm": 1.1987927702273657, + "learning_rate": 2.2775901118983138e-05, + "loss": 0.6874, + "num_input_tokens_seen": 871003392, + "step": 4826 + }, + { + "epoch": 0.5284216864172528, + "grad_norm": 1.1822452227955143, + "learning_rate": 2.2767336723238543e-05, + "loss": 0.7421, + "num_input_tokens_seen": 871181472, + "step": 4827 + }, + { + "epoch": 0.5285311584881907, + "grad_norm": 1.2705127853777103, + "learning_rate": 2.2758772591613743e-05, + "loss": 0.71, + "num_input_tokens_seen": 871355744, + "step": 4828 + }, + { + "epoch": 0.5286406305591286, + "grad_norm": 1.3279651770076313, + "learning_rate": 2.275020872512187e-05, + "loss": 0.8326, + "num_input_tokens_seen": 871553536, + "step": 4829 + }, + { + "epoch": 0.5287501026300665, + "grad_norm": 1.4227898138572308, + "learning_rate": 2.274164512477599e-05, + "loss": 0.9164, + "num_input_tokens_seen": 871702496, + "step": 4830 + }, + { + "epoch": 0.5288595747010044, + "grad_norm": 1.265617747344912, + "learning_rate": 2.2733081791589188e-05, + "loss": 0.7647, + "num_input_tokens_seen": 871857280, + "step": 4831 + }, + { + "epoch": 0.5289690467719423, + "grad_norm": 1.2573876463067055, + "learning_rate": 2.272451872657447e-05, + "loss": 0.6724, + "num_input_tokens_seen": 871994816, + "step": 4832 + }, + { + "epoch": 0.5290785188428803, + "grad_norm": 1.2117526848355806, + "learning_rate": 2.2715955930744828e-05, + "loss": 0.6445, + "num_input_tokens_seen": 872168640, + "step": 4833 + }, + { + "epoch": 0.5291879909138181, + "grad_norm": 1.259784787054483, + "learning_rate": 2.2707393405113246e-05, + "loss": 0.8357, + "num_input_tokens_seen": 872334624, + "step": 4834 + }, + { + "epoch": 0.529297462984756, + "grad_norm": 1.123935831933024, + "learning_rate": 2.2698831150692635e-05, + "loss": 0.654, + "num_input_tokens_seen": 872512704, + "step": 4835 + }, + { + "epoch": 0.5294069350556939, + "grad_norm": 1.2643885754220883, + "learning_rate": 2.2690269168495904e-05, + "loss": 0.7207, + "num_input_tokens_seen": 872695712, + "step": 4836 + }, + { + "epoch": 0.5295164071266318, + "grad_norm": 1.3174895405218443, + "learning_rate": 2.2681707459535912e-05, + "loss": 0.8417, + "num_input_tokens_seen": 872890816, + "step": 4837 + }, + { + "epoch": 0.5296258791975698, + "grad_norm": 1.266984327329244, + "learning_rate": 2.2673146024825496e-05, + "loss": 0.5884, + "num_input_tokens_seen": 873069344, + "step": 4838 + }, + { + "epoch": 0.5297353512685076, + "grad_norm": 1.2588285050823345, + "learning_rate": 2.2664584865377454e-05, + "loss": 0.6925, + "num_input_tokens_seen": 873238240, + "step": 4839 + }, + { + "epoch": 0.5298448233394455, + "grad_norm": 1.2071301071028935, + "learning_rate": 2.2656023982204556e-05, + "loss": 0.6294, + "num_input_tokens_seen": 873401984, + "step": 4840 + }, + { + "epoch": 0.5299542954103834, + "grad_norm": 1.1226106808306373, + "learning_rate": 2.2647463376319556e-05, + "loss": 0.6402, + "num_input_tokens_seen": 873573120, + "step": 4841 + }, + { + "epoch": 0.5300637674813213, + "grad_norm": 1.3237957123958568, + "learning_rate": 2.2638903048735124e-05, + "loss": 0.8164, + "num_input_tokens_seen": 873760832, + "step": 4842 + }, + { + "epoch": 0.5301732395522593, + "grad_norm": 1.2283478881632692, + "learning_rate": 2.263034300046396e-05, + "loss": 0.5049, + "num_input_tokens_seen": 873896352, + "step": 4843 + }, + { + "epoch": 0.5302827116231972, + "grad_norm": 1.191172092127735, + "learning_rate": 2.262178323251868e-05, + "loss": 0.7309, + "num_input_tokens_seen": 874062112, + "step": 4844 + }, + { + "epoch": 0.530392183694135, + "grad_norm": 1.260097915235735, + "learning_rate": 2.2613223745911906e-05, + "loss": 0.5604, + "num_input_tokens_seen": 874224512, + "step": 4845 + }, + { + "epoch": 0.5305016557650729, + "grad_norm": 1.206304337458642, + "learning_rate": 2.2604664541656193e-05, + "loss": 0.6768, + "num_input_tokens_seen": 874417152, + "step": 4846 + }, + { + "epoch": 0.5306111278360108, + "grad_norm": 1.3506146674089732, + "learning_rate": 2.2596105620764083e-05, + "loss": 0.8151, + "num_input_tokens_seen": 874611584, + "step": 4847 + }, + { + "epoch": 0.5307205999069488, + "grad_norm": 1.3417313516712943, + "learning_rate": 2.2587546984248086e-05, + "loss": 0.7907, + "num_input_tokens_seen": 874769952, + "step": 4848 + }, + { + "epoch": 0.5308300719778867, + "grad_norm": 1.239538763618081, + "learning_rate": 2.2578988633120667e-05, + "loss": 0.6988, + "num_input_tokens_seen": 874949376, + "step": 4849 + }, + { + "epoch": 0.5309395440488246, + "grad_norm": 1.2869266733445277, + "learning_rate": 2.2570430568394275e-05, + "loss": 0.7918, + "num_input_tokens_seen": 875129472, + "step": 4850 + }, + { + "epoch": 0.5310490161197624, + "grad_norm": 1.2744374474027862, + "learning_rate": 2.256187279108129e-05, + "loss": 0.8634, + "num_input_tokens_seen": 875325248, + "step": 4851 + }, + { + "epoch": 0.5311584881907003, + "grad_norm": 1.417848187830464, + "learning_rate": 2.2553315302194102e-05, + "loss": 0.8175, + "num_input_tokens_seen": 875504672, + "step": 4852 + }, + { + "epoch": 0.5312679602616383, + "grad_norm": 1.307312614404803, + "learning_rate": 2.254475810274503e-05, + "loss": 0.7567, + "num_input_tokens_seen": 875701792, + "step": 4853 + }, + { + "epoch": 0.5313774323325762, + "grad_norm": 1.30678330627465, + "learning_rate": 2.2536201193746375e-05, + "loss": 0.7238, + "num_input_tokens_seen": 875894880, + "step": 4854 + }, + { + "epoch": 0.5314869044035141, + "grad_norm": 1.393646557993517, + "learning_rate": 2.2527644576210423e-05, + "loss": 0.7429, + "num_input_tokens_seen": 876098272, + "step": 4855 + }, + { + "epoch": 0.5315963764744519, + "grad_norm": 1.1929394471692294, + "learning_rate": 2.2519088251149385e-05, + "loss": 0.548, + "num_input_tokens_seen": 876279264, + "step": 4856 + }, + { + "epoch": 0.5317058485453898, + "grad_norm": 1.3415651523893533, + "learning_rate": 2.2510532219575465e-05, + "loss": 0.845, + "num_input_tokens_seen": 876488256, + "step": 4857 + }, + { + "epoch": 0.5318153206163277, + "grad_norm": 1.310954228181923, + "learning_rate": 2.2501976482500823e-05, + "loss": 0.8563, + "num_input_tokens_seen": 876711360, + "step": 4858 + }, + { + "epoch": 0.5319247926872657, + "grad_norm": 1.2465624271678737, + "learning_rate": 2.2493421040937585e-05, + "loss": 0.8203, + "num_input_tokens_seen": 876921472, + "step": 4859 + }, + { + "epoch": 0.5320342647582036, + "grad_norm": 1.2339158954801297, + "learning_rate": 2.248486589589785e-05, + "loss": 0.7309, + "num_input_tokens_seen": 877076928, + "step": 4860 + }, + { + "epoch": 0.5321437368291415, + "grad_norm": 1.3927827874622616, + "learning_rate": 2.2476311048393666e-05, + "loss": 0.8316, + "num_input_tokens_seen": 877270240, + "step": 4861 + }, + { + "epoch": 0.5322532089000793, + "grad_norm": 1.4873963776727321, + "learning_rate": 2.246775649943707e-05, + "loss": 0.8748, + "num_input_tokens_seen": 877451680, + "step": 4862 + }, + { + "epoch": 0.5323626809710172, + "grad_norm": 1.187831179966979, + "learning_rate": 2.2459202250040032e-05, + "loss": 0.7787, + "num_input_tokens_seen": 877605344, + "step": 4863 + }, + { + "epoch": 0.5324721530419552, + "grad_norm": 1.1990050084993367, + "learning_rate": 2.2450648301214517e-05, + "loss": 0.7077, + "num_input_tokens_seen": 877791040, + "step": 4864 + }, + { + "epoch": 0.5325816251128931, + "grad_norm": 1.2106977656154065, + "learning_rate": 2.2442094653972428e-05, + "loss": 0.8287, + "num_input_tokens_seen": 877963296, + "step": 4865 + }, + { + "epoch": 0.532691097183831, + "grad_norm": 1.2426062304026362, + "learning_rate": 2.243354130932565e-05, + "loss": 0.6878, + "num_input_tokens_seen": 878135104, + "step": 4866 + }, + { + "epoch": 0.5328005692547689, + "grad_norm": 1.2911502923717906, + "learning_rate": 2.242498826828604e-05, + "loss": 0.7541, + "num_input_tokens_seen": 878321920, + "step": 4867 + }, + { + "epoch": 0.5329100413257067, + "grad_norm": 1.088600625566299, + "learning_rate": 2.241643553186538e-05, + "loss": 0.7641, + "num_input_tokens_seen": 878513440, + "step": 4868 + }, + { + "epoch": 0.5330195133966447, + "grad_norm": 1.0897620932775938, + "learning_rate": 2.240788310107547e-05, + "loss": 0.4879, + "num_input_tokens_seen": 878692864, + "step": 4869 + }, + { + "epoch": 0.5331289854675826, + "grad_norm": 1.389025550582567, + "learning_rate": 2.2399330976928028e-05, + "loss": 0.8938, + "num_input_tokens_seen": 878879456, + "step": 4870 + }, + { + "epoch": 0.5332384575385205, + "grad_norm": 1.1905967588905064, + "learning_rate": 2.2390779160434767e-05, + "loss": 0.5721, + "num_input_tokens_seen": 879064928, + "step": 4871 + }, + { + "epoch": 0.5333479296094584, + "grad_norm": 1.2630696814879967, + "learning_rate": 2.2382227652607333e-05, + "loss": 0.7968, + "num_input_tokens_seen": 879269664, + "step": 4872 + }, + { + "epoch": 0.5334574016803962, + "grad_norm": 1.4085812636344066, + "learning_rate": 2.2373676454457364e-05, + "loss": 0.6469, + "num_input_tokens_seen": 879455360, + "step": 4873 + }, + { + "epoch": 0.5335668737513342, + "grad_norm": 1.2052246697103635, + "learning_rate": 2.2365125566996457e-05, + "loss": 0.7378, + "num_input_tokens_seen": 879658976, + "step": 4874 + }, + { + "epoch": 0.5336763458222721, + "grad_norm": 1.5566011086381681, + "learning_rate": 2.2356574991236154e-05, + "loss": 0.8755, + "num_input_tokens_seen": 879803008, + "step": 4875 + }, + { + "epoch": 0.53378581789321, + "grad_norm": 1.5573730277387225, + "learning_rate": 2.2348024728187983e-05, + "loss": 0.7691, + "num_input_tokens_seen": 880011776, + "step": 4876 + }, + { + "epoch": 0.5338952899641479, + "grad_norm": 1.1231680365178267, + "learning_rate": 2.2339474778863408e-05, + "loss": 0.5424, + "num_input_tokens_seen": 880214720, + "step": 4877 + }, + { + "epoch": 0.5340047620350858, + "grad_norm": 1.3837148879368257, + "learning_rate": 2.2330925144273884e-05, + "loss": 0.7147, + "num_input_tokens_seen": 880385856, + "step": 4878 + }, + { + "epoch": 0.5341142341060237, + "grad_norm": 1.3794588273178938, + "learning_rate": 2.2322375825430815e-05, + "loss": 0.6893, + "num_input_tokens_seen": 880555200, + "step": 4879 + }, + { + "epoch": 0.5342237061769616, + "grad_norm": 1.467401900383721, + "learning_rate": 2.231382682334556e-05, + "loss": 0.6847, + "num_input_tokens_seen": 880750752, + "step": 4880 + }, + { + "epoch": 0.5343331782478995, + "grad_norm": 1.2382956422757605, + "learning_rate": 2.2305278139029465e-05, + "loss": 0.7432, + "num_input_tokens_seen": 880914272, + "step": 4881 + }, + { + "epoch": 0.5344426503188374, + "grad_norm": 1.5022119266880143, + "learning_rate": 2.2296729773493806e-05, + "loss": 0.9886, + "num_input_tokens_seen": 881082720, + "step": 4882 + }, + { + "epoch": 0.5345521223897753, + "grad_norm": 1.312595454786674, + "learning_rate": 2.228818172774986e-05, + "loss": 0.8082, + "num_input_tokens_seen": 881295968, + "step": 4883 + }, + { + "epoch": 0.5346615944607133, + "grad_norm": 1.1856068025679551, + "learning_rate": 2.2279634002808818e-05, + "loss": 0.5442, + "num_input_tokens_seen": 881448064, + "step": 4884 + }, + { + "epoch": 0.5347710665316511, + "grad_norm": 1.2758387293951703, + "learning_rate": 2.2271086599681887e-05, + "loss": 0.6449, + "num_input_tokens_seen": 881614944, + "step": 4885 + }, + { + "epoch": 0.534880538602589, + "grad_norm": 1.1167580906370165, + "learning_rate": 2.2262539519380182e-05, + "loss": 0.7576, + "num_input_tokens_seen": 881781152, + "step": 4886 + }, + { + "epoch": 0.5349900106735269, + "grad_norm": 1.296371867926355, + "learning_rate": 2.225399276291482e-05, + "loss": 0.8247, + "num_input_tokens_seen": 881989472, + "step": 4887 + }, + { + "epoch": 0.5350994827444648, + "grad_norm": 1.2309257506376101, + "learning_rate": 2.2245446331296874e-05, + "loss": 0.6711, + "num_input_tokens_seen": 882151872, + "step": 4888 + }, + { + "epoch": 0.5352089548154028, + "grad_norm": 1.0745299287393781, + "learning_rate": 2.223690022553735e-05, + "loss": 0.6098, + "num_input_tokens_seen": 882358176, + "step": 4889 + }, + { + "epoch": 0.5353184268863406, + "grad_norm": 1.37233706016087, + "learning_rate": 2.2228354446647252e-05, + "loss": 0.6655, + "num_input_tokens_seen": 882541632, + "step": 4890 + }, + { + "epoch": 0.5354278989572785, + "grad_norm": 1.2667839489564667, + "learning_rate": 2.2219808995637524e-05, + "loss": 0.6633, + "num_input_tokens_seen": 882725760, + "step": 4891 + }, + { + "epoch": 0.5355373710282164, + "grad_norm": 1.2616579968260813, + "learning_rate": 2.2211263873519082e-05, + "loss": 0.9358, + "num_input_tokens_seen": 882919968, + "step": 4892 + }, + { + "epoch": 0.5356468430991543, + "grad_norm": 1.2701298637526022, + "learning_rate": 2.2202719081302785e-05, + "loss": 0.6534, + "num_input_tokens_seen": 883092224, + "step": 4893 + }, + { + "epoch": 0.5357563151700923, + "grad_norm": 1.1998743011284998, + "learning_rate": 2.219417461999947e-05, + "loss": 0.8227, + "num_input_tokens_seen": 883288448, + "step": 4894 + }, + { + "epoch": 0.5358657872410302, + "grad_norm": 1.2450470549335548, + "learning_rate": 2.218563049061995e-05, + "loss": 0.9085, + "num_input_tokens_seen": 883475488, + "step": 4895 + }, + { + "epoch": 0.535975259311968, + "grad_norm": 1.326897311784455, + "learning_rate": 2.217708669417495e-05, + "loss": 0.6109, + "num_input_tokens_seen": 883618400, + "step": 4896 + }, + { + "epoch": 0.5360847313829059, + "grad_norm": 1.2862397701942836, + "learning_rate": 2.2168543231675204e-05, + "loss": 0.7116, + "num_input_tokens_seen": 883790432, + "step": 4897 + }, + { + "epoch": 0.5361942034538438, + "grad_norm": 1.2651699802350431, + "learning_rate": 2.2160000104131372e-05, + "loss": 0.6061, + "num_input_tokens_seen": 883917440, + "step": 4898 + }, + { + "epoch": 0.5363036755247818, + "grad_norm": 1.2408945804456388, + "learning_rate": 2.215145731255411e-05, + "loss": 0.8024, + "num_input_tokens_seen": 884096640, + "step": 4899 + }, + { + "epoch": 0.5364131475957197, + "grad_norm": 1.3663412908573782, + "learning_rate": 2.2142914857953993e-05, + "loss": 0.6452, + "num_input_tokens_seen": 884271136, + "step": 4900 + }, + { + "epoch": 0.5365226196666576, + "grad_norm": 1.2370925637184877, + "learning_rate": 2.2134372741341585e-05, + "loss": 0.6526, + "num_input_tokens_seen": 884481696, + "step": 4901 + }, + { + "epoch": 0.5366320917375954, + "grad_norm": 1.2855322847144017, + "learning_rate": 2.2125830963727412e-05, + "loss": 0.858, + "num_input_tokens_seen": 884645216, + "step": 4902 + }, + { + "epoch": 0.5367415638085333, + "grad_norm": 1.1548748602513503, + "learning_rate": 2.2117289526121934e-05, + "loss": 0.685, + "num_input_tokens_seen": 884837184, + "step": 4903 + }, + { + "epoch": 0.5368510358794713, + "grad_norm": 1.169902412876778, + "learning_rate": 2.2108748429535603e-05, + "loss": 0.8191, + "num_input_tokens_seen": 885007872, + "step": 4904 + }, + { + "epoch": 0.5369605079504092, + "grad_norm": 1.2719964140505597, + "learning_rate": 2.21002076749788e-05, + "loss": 0.7797, + "num_input_tokens_seen": 885215072, + "step": 4905 + }, + { + "epoch": 0.5370699800213471, + "grad_norm": 1.345419799920753, + "learning_rate": 2.209166726346189e-05, + "loss": 0.7178, + "num_input_tokens_seen": 885391136, + "step": 4906 + }, + { + "epoch": 0.5371794520922849, + "grad_norm": 1.4551814305033455, + "learning_rate": 2.2083127195995176e-05, + "loss": 0.8455, + "num_input_tokens_seen": 885553536, + "step": 4907 + }, + { + "epoch": 0.5372889241632228, + "grad_norm": 1.3620752722516323, + "learning_rate": 2.2074587473588936e-05, + "loss": 0.718, + "num_input_tokens_seen": 885725792, + "step": 4908 + }, + { + "epoch": 0.5373983962341607, + "grad_norm": 1.3231188501963818, + "learning_rate": 2.206604809725342e-05, + "loss": 0.7993, + "num_input_tokens_seen": 885928960, + "step": 4909 + }, + { + "epoch": 0.5375078683050987, + "grad_norm": 1.2196389404294625, + "learning_rate": 2.205750906799879e-05, + "loss": 0.7634, + "num_input_tokens_seen": 886114208, + "step": 4910 + }, + { + "epoch": 0.5376173403760366, + "grad_norm": 1.1107705163806, + "learning_rate": 2.204897038683522e-05, + "loss": 0.5729, + "num_input_tokens_seen": 886288032, + "step": 4911 + }, + { + "epoch": 0.5377268124469745, + "grad_norm": 1.2001893033279947, + "learning_rate": 2.2040432054772807e-05, + "loss": 0.8703, + "num_input_tokens_seen": 886492992, + "step": 4912 + }, + { + "epoch": 0.5378362845179123, + "grad_norm": 1.3537487803530284, + "learning_rate": 2.2031894072821633e-05, + "loss": 0.6991, + "num_input_tokens_seen": 886663904, + "step": 4913 + }, + { + "epoch": 0.5379457565888502, + "grad_norm": 1.2112296459586116, + "learning_rate": 2.2023356441991712e-05, + "loss": 0.5344, + "num_input_tokens_seen": 886837056, + "step": 4914 + }, + { + "epoch": 0.5380552286597882, + "grad_norm": 1.200786894845197, + "learning_rate": 2.2014819163293028e-05, + "loss": 0.6827, + "num_input_tokens_seen": 886992960, + "step": 4915 + }, + { + "epoch": 0.5381647007307261, + "grad_norm": 1.2442444379244164, + "learning_rate": 2.200628223773554e-05, + "loss": 0.7791, + "num_input_tokens_seen": 887167904, + "step": 4916 + }, + { + "epoch": 0.538274172801664, + "grad_norm": 1.2769373442619079, + "learning_rate": 2.199774566632913e-05, + "loss": 0.8381, + "num_input_tokens_seen": 887362560, + "step": 4917 + }, + { + "epoch": 0.5383836448726019, + "grad_norm": 1.3248905064668464, + "learning_rate": 2.198920945008368e-05, + "loss": 1.0244, + "num_input_tokens_seen": 887569088, + "step": 4918 + }, + { + "epoch": 0.5384931169435397, + "grad_norm": 1.1621043485644973, + "learning_rate": 2.198067359000899e-05, + "loss": 0.5345, + "num_input_tokens_seen": 887761504, + "step": 4919 + }, + { + "epoch": 0.5386025890144777, + "grad_norm": 1.2379096884687548, + "learning_rate": 2.1972138087114835e-05, + "loss": 0.6864, + "num_input_tokens_seen": 887918752, + "step": 4920 + }, + { + "epoch": 0.5387120610854156, + "grad_norm": 1.2587389170244934, + "learning_rate": 2.1963602942410968e-05, + "loss": 0.5941, + "num_input_tokens_seen": 888087200, + "step": 4921 + }, + { + "epoch": 0.5388215331563535, + "grad_norm": 1.226690346439173, + "learning_rate": 2.195506815690706e-05, + "loss": 0.7483, + "num_input_tokens_seen": 888303808, + "step": 4922 + }, + { + "epoch": 0.5389310052272914, + "grad_norm": 1.2577610360747378, + "learning_rate": 2.1946533731612773e-05, + "loss": 0.5528, + "num_input_tokens_seen": 888461952, + "step": 4923 + }, + { + "epoch": 0.5390404772982292, + "grad_norm": 1.1504808270723152, + "learning_rate": 2.1937999667537704e-05, + "loss": 0.7557, + "num_input_tokens_seen": 888637120, + "step": 4924 + }, + { + "epoch": 0.5391499493691672, + "grad_norm": 1.2170944707183071, + "learning_rate": 2.192946596569143e-05, + "loss": 0.7313, + "num_input_tokens_seen": 888831776, + "step": 4925 + }, + { + "epoch": 0.5392594214401051, + "grad_norm": 1.2778103877250917, + "learning_rate": 2.192093262708345e-05, + "loss": 0.622, + "num_input_tokens_seen": 889020384, + "step": 4926 + }, + { + "epoch": 0.539368893511043, + "grad_norm": 1.2391666171380251, + "learning_rate": 2.1912399652723255e-05, + "loss": 0.668, + "num_input_tokens_seen": 889217056, + "step": 4927 + }, + { + "epoch": 0.5394783655819809, + "grad_norm": 1.4062064270050616, + "learning_rate": 2.190386704362029e-05, + "loss": 0.78, + "num_input_tokens_seen": 889398720, + "step": 4928 + }, + { + "epoch": 0.5395878376529188, + "grad_norm": 1.2832233305433316, + "learning_rate": 2.1895334800783925e-05, + "loss": 0.7584, + "num_input_tokens_seen": 889598976, + "step": 4929 + }, + { + "epoch": 0.5396973097238567, + "grad_norm": 1.4613957629831364, + "learning_rate": 2.188680292522353e-05, + "loss": 0.7702, + "num_input_tokens_seen": 889806400, + "step": 4930 + }, + { + "epoch": 0.5398067817947946, + "grad_norm": 1.2150526449783852, + "learning_rate": 2.1878271417948385e-05, + "loss": 0.6694, + "num_input_tokens_seen": 890024128, + "step": 4931 + }, + { + "epoch": 0.5399162538657325, + "grad_norm": 1.277906614640715, + "learning_rate": 2.1869740279967768e-05, + "loss": 0.7419, + "num_input_tokens_seen": 890206912, + "step": 4932 + }, + { + "epoch": 0.5400257259366704, + "grad_norm": 1.3461954796819038, + "learning_rate": 2.1861209512290888e-05, + "loss": 1.0243, + "num_input_tokens_seen": 890378272, + "step": 4933 + }, + { + "epoch": 0.5401351980076083, + "grad_norm": 1.2089302901340107, + "learning_rate": 2.1852679115926926e-05, + "loss": 0.7983, + "num_input_tokens_seen": 890549856, + "step": 4934 + }, + { + "epoch": 0.5402446700785463, + "grad_norm": 1.443457407841944, + "learning_rate": 2.184414909188501e-05, + "loss": 0.7991, + "num_input_tokens_seen": 890693440, + "step": 4935 + }, + { + "epoch": 0.5403541421494841, + "grad_norm": 1.2263035105968807, + "learning_rate": 2.1835619441174214e-05, + "loss": 0.851, + "num_input_tokens_seen": 890868832, + "step": 4936 + }, + { + "epoch": 0.540463614220422, + "grad_norm": 1.3252511740080046, + "learning_rate": 2.1827090164803605e-05, + "loss": 0.8924, + "num_input_tokens_seen": 891052512, + "step": 4937 + }, + { + "epoch": 0.5405730862913599, + "grad_norm": 1.3257566091195419, + "learning_rate": 2.181856126378215e-05, + "loss": 0.5744, + "num_input_tokens_seen": 891258592, + "step": 4938 + }, + { + "epoch": 0.5406825583622978, + "grad_norm": 1.22639955057819, + "learning_rate": 2.181003273911883e-05, + "loss": 0.7564, + "num_input_tokens_seen": 891467136, + "step": 4939 + }, + { + "epoch": 0.5407920304332358, + "grad_norm": 1.3604510203177003, + "learning_rate": 2.1801504591822526e-05, + "loss": 0.6313, + "num_input_tokens_seen": 891658656, + "step": 4940 + }, + { + "epoch": 0.5409015025041736, + "grad_norm": 1.458758764428595, + "learning_rate": 2.179297682290211e-05, + "loss": 0.8119, + "num_input_tokens_seen": 891823520, + "step": 4941 + }, + { + "epoch": 0.5410109745751115, + "grad_norm": 1.2357293919491246, + "learning_rate": 2.178444943336642e-05, + "loss": 0.6588, + "num_input_tokens_seen": 892011456, + "step": 4942 + }, + { + "epoch": 0.5411204466460494, + "grad_norm": 1.3240874909020297, + "learning_rate": 2.1775922424224203e-05, + "loss": 0.5618, + "num_input_tokens_seen": 892196928, + "step": 4943 + }, + { + "epoch": 0.5412299187169873, + "grad_norm": 1.3283335802436194, + "learning_rate": 2.1767395796484207e-05, + "loss": 0.7244, + "num_input_tokens_seen": 892331776, + "step": 4944 + }, + { + "epoch": 0.5413393907879253, + "grad_norm": 1.1435830416531798, + "learning_rate": 2.17588695511551e-05, + "loss": 0.5291, + "num_input_tokens_seen": 892524192, + "step": 4945 + }, + { + "epoch": 0.5414488628588632, + "grad_norm": 1.1830938309535177, + "learning_rate": 2.1750343689245544e-05, + "loss": 0.7808, + "num_input_tokens_seen": 892695104, + "step": 4946 + }, + { + "epoch": 0.541558334929801, + "grad_norm": 1.2624135186063812, + "learning_rate": 2.1741818211764103e-05, + "loss": 0.6706, + "num_input_tokens_seen": 892873632, + "step": 4947 + }, + { + "epoch": 0.5416678070007389, + "grad_norm": 1.2905745914396585, + "learning_rate": 2.173329311971934e-05, + "loss": 0.7458, + "num_input_tokens_seen": 893013856, + "step": 4948 + }, + { + "epoch": 0.5417772790716768, + "grad_norm": 1.2122643369557335, + "learning_rate": 2.1724768414119766e-05, + "loss": 0.5248, + "num_input_tokens_seen": 893171328, + "step": 4949 + }, + { + "epoch": 0.5418867511426148, + "grad_norm": 1.1659617962704625, + "learning_rate": 2.171624409597382e-05, + "loss": 0.6172, + "num_input_tokens_seen": 893381888, + "step": 4950 + }, + { + "epoch": 0.5419962232135527, + "grad_norm": 1.2681362057976158, + "learning_rate": 2.170772016628993e-05, + "loss": 0.9146, + "num_input_tokens_seen": 893562208, + "step": 4951 + }, + { + "epoch": 0.5421056952844906, + "grad_norm": 1.2687055044405224, + "learning_rate": 2.1699196626076437e-05, + "loss": 0.7089, + "num_input_tokens_seen": 893741856, + "step": 4952 + }, + { + "epoch": 0.5422151673554284, + "grad_norm": 1.3272833457346258, + "learning_rate": 2.169067347634168e-05, + "loss": 0.6406, + "num_input_tokens_seen": 893917472, + "step": 4953 + }, + { + "epoch": 0.5423246394263663, + "grad_norm": 1.2390635813916406, + "learning_rate": 2.168215071809392e-05, + "loss": 0.6338, + "num_input_tokens_seen": 894091968, + "step": 4954 + }, + { + "epoch": 0.5424341114973042, + "grad_norm": 1.2803731569610026, + "learning_rate": 2.167362835234139e-05, + "loss": 0.7927, + "num_input_tokens_seen": 894268256, + "step": 4955 + }, + { + "epoch": 0.5425435835682422, + "grad_norm": 1.2039542869970303, + "learning_rate": 2.166510638009227e-05, + "loss": 0.5165, + "num_input_tokens_seen": 894431776, + "step": 4956 + }, + { + "epoch": 0.5426530556391801, + "grad_norm": 1.1871114898391668, + "learning_rate": 2.1656584802354678e-05, + "loss": 0.6557, + "num_input_tokens_seen": 894613216, + "step": 4957 + }, + { + "epoch": 0.5427625277101179, + "grad_norm": 1.2410985624164101, + "learning_rate": 2.1648063620136733e-05, + "loss": 0.7383, + "num_input_tokens_seen": 894808096, + "step": 4958 + }, + { + "epoch": 0.5428719997810558, + "grad_norm": 1.1149492049514576, + "learning_rate": 2.1639542834446434e-05, + "loss": 0.4883, + "num_input_tokens_seen": 894985280, + "step": 4959 + }, + { + "epoch": 0.5429814718519937, + "grad_norm": 1.1263660508294453, + "learning_rate": 2.163102244629181e-05, + "loss": 0.7986, + "num_input_tokens_seen": 895175232, + "step": 4960 + }, + { + "epoch": 0.5430909439229317, + "grad_norm": 1.1184955222253776, + "learning_rate": 2.162250245668078e-05, + "loss": 0.5528, + "num_input_tokens_seen": 895342560, + "step": 4961 + }, + { + "epoch": 0.5432004159938696, + "grad_norm": 1.3579051522797498, + "learning_rate": 2.1613982866621252e-05, + "loss": 0.5849, + "num_input_tokens_seen": 895472256, + "step": 4962 + }, + { + "epoch": 0.5433098880648075, + "grad_norm": 1.3277315791236146, + "learning_rate": 2.1605463677121086e-05, + "loss": 0.5903, + "num_input_tokens_seen": 895659520, + "step": 4963 + }, + { + "epoch": 0.5434193601357453, + "grad_norm": 1.3134445243246453, + "learning_rate": 2.159694488918807e-05, + "loss": 0.805, + "num_input_tokens_seen": 895858208, + "step": 4964 + }, + { + "epoch": 0.5435288322066832, + "grad_norm": 1.1542640849198682, + "learning_rate": 2.158842650382997e-05, + "loss": 0.7356, + "num_input_tokens_seen": 896044800, + "step": 4965 + }, + { + "epoch": 0.5436383042776212, + "grad_norm": 1.6006465618762133, + "learning_rate": 2.157990852205449e-05, + "loss": 0.8172, + "num_input_tokens_seen": 896234304, + "step": 4966 + }, + { + "epoch": 0.5437477763485591, + "grad_norm": 1.3948711333690864, + "learning_rate": 2.1571390944869306e-05, + "loss": 0.9159, + "num_input_tokens_seen": 896397376, + "step": 4967 + }, + { + "epoch": 0.543857248419497, + "grad_norm": 1.2911132683237132, + "learning_rate": 2.1562873773282005e-05, + "loss": 1.0406, + "num_input_tokens_seen": 896594944, + "step": 4968 + }, + { + "epoch": 0.5439667204904349, + "grad_norm": 1.2623366505163174, + "learning_rate": 2.1554357008300164e-05, + "loss": 0.6399, + "num_input_tokens_seen": 896740992, + "step": 4969 + }, + { + "epoch": 0.5440761925613727, + "grad_norm": 1.2482483511664977, + "learning_rate": 2.1545840650931317e-05, + "loss": 0.7745, + "num_input_tokens_seen": 896937888, + "step": 4970 + }, + { + "epoch": 0.5441856646323107, + "grad_norm": 1.3343034533006772, + "learning_rate": 2.1537324702182907e-05, + "loss": 0.8711, + "num_input_tokens_seen": 897123136, + "step": 4971 + }, + { + "epoch": 0.5442951367032486, + "grad_norm": 1.2086028204135537, + "learning_rate": 2.1528809163062375e-05, + "loss": 0.6281, + "num_input_tokens_seen": 897322272, + "step": 4972 + }, + { + "epoch": 0.5444046087741865, + "grad_norm": 1.2408899692106312, + "learning_rate": 2.1520294034577072e-05, + "loss": 0.6747, + "num_input_tokens_seen": 897513120, + "step": 4973 + }, + { + "epoch": 0.5445140808451244, + "grad_norm": 1.3232875914719568, + "learning_rate": 2.1511779317734336e-05, + "loss": 0.8038, + "num_input_tokens_seen": 897697472, + "step": 4974 + }, + { + "epoch": 0.5446235529160622, + "grad_norm": 1.2736207795039036, + "learning_rate": 2.1503265013541433e-05, + "loss": 0.6547, + "num_input_tokens_seen": 897860320, + "step": 4975 + }, + { + "epoch": 0.5447330249870002, + "grad_norm": 1.2654831889259386, + "learning_rate": 2.1494751123005605e-05, + "loss": 0.8771, + "num_input_tokens_seen": 898075360, + "step": 4976 + }, + { + "epoch": 0.5448424970579381, + "grad_norm": 1.298523280572905, + "learning_rate": 2.1486237647134014e-05, + "loss": 0.6275, + "num_input_tokens_seen": 898273376, + "step": 4977 + }, + { + "epoch": 0.544951969128876, + "grad_norm": 1.2864613964131861, + "learning_rate": 2.147772458693379e-05, + "loss": 0.6879, + "num_input_tokens_seen": 898481920, + "step": 4978 + }, + { + "epoch": 0.5450614411998139, + "grad_norm": 1.3487688419981008, + "learning_rate": 2.146921194341202e-05, + "loss": 0.6392, + "num_input_tokens_seen": 898649472, + "step": 4979 + }, + { + "epoch": 0.5451709132707518, + "grad_norm": 1.1488833956635285, + "learning_rate": 2.1460699717575718e-05, + "loss": 0.5415, + "num_input_tokens_seen": 898834048, + "step": 4980 + }, + { + "epoch": 0.5452803853416897, + "grad_norm": 1.1583531814814063, + "learning_rate": 2.1452187910431875e-05, + "loss": 0.8757, + "num_input_tokens_seen": 899041696, + "step": 4981 + }, + { + "epoch": 0.5453898574126276, + "grad_norm": 1.2712442432074418, + "learning_rate": 2.1443676522987432e-05, + "loss": 0.8687, + "num_input_tokens_seen": 899218880, + "step": 4982 + }, + { + "epoch": 0.5454993294835655, + "grad_norm": 1.37382235814615, + "learning_rate": 2.1435165556249246e-05, + "loss": 0.728, + "num_input_tokens_seen": 899391808, + "step": 4983 + }, + { + "epoch": 0.5456088015545034, + "grad_norm": 1.3908272071156433, + "learning_rate": 2.142665501122417e-05, + "loss": 0.769, + "num_input_tokens_seen": 899596096, + "step": 4984 + }, + { + "epoch": 0.5457182736254413, + "grad_norm": 1.191168689494597, + "learning_rate": 2.141814488891896e-05, + "loss": 0.6841, + "num_input_tokens_seen": 899767456, + "step": 4985 + }, + { + "epoch": 0.5458277456963793, + "grad_norm": 1.2286435592822216, + "learning_rate": 2.1409635190340373e-05, + "loss": 0.6675, + "num_input_tokens_seen": 899954720, + "step": 4986 + }, + { + "epoch": 0.5459372177673171, + "grad_norm": 1.1165036863034694, + "learning_rate": 2.1401125916495072e-05, + "loss": 0.5039, + "num_input_tokens_seen": 900154528, + "step": 4987 + }, + { + "epoch": 0.546046689838255, + "grad_norm": 1.1694273755380087, + "learning_rate": 2.1392617068389697e-05, + "loss": 0.5491, + "num_input_tokens_seen": 900317824, + "step": 4988 + }, + { + "epoch": 0.5461561619091929, + "grad_norm": 1.3792440632811118, + "learning_rate": 2.1384108647030836e-05, + "loss": 0.8438, + "num_input_tokens_seen": 900527488, + "step": 4989 + }, + { + "epoch": 0.5462656339801308, + "grad_norm": 1.2340486795323373, + "learning_rate": 2.1375600653425003e-05, + "loss": 0.7102, + "num_input_tokens_seen": 900698176, + "step": 4990 + }, + { + "epoch": 0.5463751060510688, + "grad_norm": 1.3574311839271633, + "learning_rate": 2.136709308857869e-05, + "loss": 0.7787, + "num_input_tokens_seen": 900909856, + "step": 4991 + }, + { + "epoch": 0.5464845781220066, + "grad_norm": 1.2510408359665643, + "learning_rate": 2.135858595349831e-05, + "loss": 0.7027, + "num_input_tokens_seen": 901063744, + "step": 4992 + }, + { + "epoch": 0.5465940501929445, + "grad_norm": 1.1856164047742062, + "learning_rate": 2.135007924919026e-05, + "loss": 0.5379, + "num_input_tokens_seen": 901246976, + "step": 4993 + }, + { + "epoch": 0.5467035222638824, + "grad_norm": 1.4209694126925678, + "learning_rate": 2.134157297666085e-05, + "loss": 0.6421, + "num_input_tokens_seen": 901387872, + "step": 4994 + }, + { + "epoch": 0.5468129943348203, + "grad_norm": 1.5261214650960149, + "learning_rate": 2.133306713691636e-05, + "loss": 0.7068, + "num_input_tokens_seen": 901588352, + "step": 4995 + }, + { + "epoch": 0.5469224664057583, + "grad_norm": 1.2687415850881754, + "learning_rate": 2.1324561730963025e-05, + "loss": 0.7041, + "num_input_tokens_seen": 901753216, + "step": 4996 + }, + { + "epoch": 0.5470319384766962, + "grad_norm": 1.1995685079775231, + "learning_rate": 2.1316056759807006e-05, + "loss": 0.5395, + "num_input_tokens_seen": 901928160, + "step": 4997 + }, + { + "epoch": 0.547141410547634, + "grad_norm": 1.2632427164876006, + "learning_rate": 2.1307552224454435e-05, + "loss": 0.7918, + "num_input_tokens_seen": 902123936, + "step": 4998 + }, + { + "epoch": 0.5472508826185719, + "grad_norm": 1.5421431638456984, + "learning_rate": 2.129904812591137e-05, + "loss": 0.791, + "num_input_tokens_seen": 902269088, + "step": 4999 + }, + { + "epoch": 0.5473603546895098, + "grad_norm": 1.3400665631771513, + "learning_rate": 2.129054446518385e-05, + "loss": 0.989, + "num_input_tokens_seen": 902477632, + "step": 5000 + }, + { + "epoch": 0.5474698267604478, + "grad_norm": 1.4204002529518978, + "learning_rate": 2.1282041243277816e-05, + "loss": 0.8198, + "num_input_tokens_seen": 902657504, + "step": 5001 + }, + { + "epoch": 0.5475792988313857, + "grad_norm": 1.2502888822529727, + "learning_rate": 2.1273538461199194e-05, + "loss": 0.8118, + "num_input_tokens_seen": 902853728, + "step": 5002 + }, + { + "epoch": 0.5476887709023236, + "grad_norm": 1.2650252439973044, + "learning_rate": 2.1265036119953864e-05, + "loss": 0.6153, + "num_input_tokens_seen": 903033376, + "step": 5003 + }, + { + "epoch": 0.5477982429732614, + "grad_norm": 1.3317203998029552, + "learning_rate": 2.12565342205476e-05, + "loss": 0.7814, + "num_input_tokens_seen": 903221088, + "step": 5004 + }, + { + "epoch": 0.5479077150441993, + "grad_norm": 1.3905553478842472, + "learning_rate": 2.1248032763986203e-05, + "loss": 0.6778, + "num_input_tokens_seen": 903366240, + "step": 5005 + }, + { + "epoch": 0.5480171871151372, + "grad_norm": 1.2395147683978416, + "learning_rate": 2.1239531751275344e-05, + "loss": 0.8157, + "num_input_tokens_seen": 903549696, + "step": 5006 + }, + { + "epoch": 0.5481266591860752, + "grad_norm": 1.2170807582292975, + "learning_rate": 2.123103118342069e-05, + "loss": 0.8017, + "num_input_tokens_seen": 903746144, + "step": 5007 + }, + { + "epoch": 0.5482361312570131, + "grad_norm": 1.343180202949068, + "learning_rate": 2.1222531061427843e-05, + "loss": 0.8181, + "num_input_tokens_seen": 903936768, + "step": 5008 + }, + { + "epoch": 0.5483456033279509, + "grad_norm": 1.3433785923290131, + "learning_rate": 2.1214031386302347e-05, + "loss": 0.6677, + "num_input_tokens_seen": 904072064, + "step": 5009 + }, + { + "epoch": 0.5484550753988888, + "grad_norm": 1.35317178517571, + "learning_rate": 2.1205532159049714e-05, + "loss": 0.8803, + "num_input_tokens_seen": 904252832, + "step": 5010 + }, + { + "epoch": 0.5485645474698267, + "grad_norm": 1.4038915566517718, + "learning_rate": 2.1197033380675357e-05, + "loss": 0.8263, + "num_input_tokens_seen": 904433600, + "step": 5011 + }, + { + "epoch": 0.5486740195407647, + "grad_norm": 1.1902610903509492, + "learning_rate": 2.1188535052184695e-05, + "loss": 0.8637, + "num_input_tokens_seen": 904634528, + "step": 5012 + }, + { + "epoch": 0.5487834916117026, + "grad_norm": 1.1820791896621496, + "learning_rate": 2.118003717458304e-05, + "loss": 0.5752, + "num_input_tokens_seen": 904804768, + "step": 5013 + }, + { + "epoch": 0.5488929636826405, + "grad_norm": 1.083735586948317, + "learning_rate": 2.1171539748875692e-05, + "loss": 0.6304, + "num_input_tokens_seen": 904981728, + "step": 5014 + }, + { + "epoch": 0.5490024357535783, + "grad_norm": 1.236575422819575, + "learning_rate": 2.1163042776067865e-05, + "loss": 1.0495, + "num_input_tokens_seen": 905188704, + "step": 5015 + }, + { + "epoch": 0.5491119078245162, + "grad_norm": 1.2263481292969145, + "learning_rate": 2.1154546257164744e-05, + "loss": 0.6804, + "num_input_tokens_seen": 905377088, + "step": 5016 + }, + { + "epoch": 0.5492213798954542, + "grad_norm": 1.422642594952229, + "learning_rate": 2.114605019317145e-05, + "loss": 0.694, + "num_input_tokens_seen": 905549568, + "step": 5017 + }, + { + "epoch": 0.5493308519663921, + "grad_norm": 1.1707434722698857, + "learning_rate": 2.1137554585093056e-05, + "loss": 0.8062, + "num_input_tokens_seen": 905728992, + "step": 5018 + }, + { + "epoch": 0.54944032403733, + "grad_norm": 1.2008432822749602, + "learning_rate": 2.1129059433934567e-05, + "loss": 0.5581, + "num_input_tokens_seen": 905901696, + "step": 5019 + }, + { + "epoch": 0.5495497961082679, + "grad_norm": 1.3353971314253599, + "learning_rate": 2.1120564740700945e-05, + "loss": 0.7647, + "num_input_tokens_seen": 906082688, + "step": 5020 + }, + { + "epoch": 0.5496592681792057, + "grad_norm": 1.1398190302845035, + "learning_rate": 2.1112070506397105e-05, + "loss": 0.6781, + "num_input_tokens_seen": 906252704, + "step": 5021 + }, + { + "epoch": 0.5497687402501437, + "grad_norm": 1.3965764795620723, + "learning_rate": 2.1103576732027882e-05, + "loss": 0.715, + "num_input_tokens_seen": 906422720, + "step": 5022 + }, + { + "epoch": 0.5498782123210816, + "grad_norm": 1.3258467487796977, + "learning_rate": 2.1095083418598083e-05, + "loss": 0.7709, + "num_input_tokens_seen": 906594080, + "step": 5023 + }, + { + "epoch": 0.5499876843920195, + "grad_norm": 1.1666419106082146, + "learning_rate": 2.1086590567112463e-05, + "loss": 0.7461, + "num_input_tokens_seen": 906786272, + "step": 5024 + }, + { + "epoch": 0.5500971564629574, + "grad_norm": 1.1920991977372473, + "learning_rate": 2.1078098178575686e-05, + "loss": 0.5154, + "num_input_tokens_seen": 906967040, + "step": 5025 + }, + { + "epoch": 0.5502066285338952, + "grad_norm": 1.2662353456784592, + "learning_rate": 2.1069606253992406e-05, + "loss": 0.8948, + "num_input_tokens_seen": 907167968, + "step": 5026 + }, + { + "epoch": 0.5503161006048332, + "grad_norm": 1.2063230640637992, + "learning_rate": 2.1061114794367185e-05, + "loss": 0.7384, + "num_input_tokens_seen": 907356352, + "step": 5027 + }, + { + "epoch": 0.5504255726757711, + "grad_norm": 1.3240952785847115, + "learning_rate": 2.1052623800704557e-05, + "loss": 0.9204, + "num_input_tokens_seen": 907557056, + "step": 5028 + }, + { + "epoch": 0.550535044746709, + "grad_norm": 1.3671721103347005, + "learning_rate": 2.1044133274008983e-05, + "loss": 0.8109, + "num_input_tokens_seen": 907734688, + "step": 5029 + }, + { + "epoch": 0.5506445168176469, + "grad_norm": 1.2088568255017105, + "learning_rate": 2.1035643215284882e-05, + "loss": 0.7335, + "num_input_tokens_seen": 907937184, + "step": 5030 + }, + { + "epoch": 0.5507539888885848, + "grad_norm": 1.2399653586040318, + "learning_rate": 2.1027153625536616e-05, + "loss": 0.5591, + "num_input_tokens_seen": 908108544, + "step": 5031 + }, + { + "epoch": 0.5508634609595227, + "grad_norm": 1.3818279576155408, + "learning_rate": 2.1018664505768476e-05, + "loss": 0.6129, + "num_input_tokens_seen": 908271840, + "step": 5032 + }, + { + "epoch": 0.5509729330304606, + "grad_norm": 1.3599826287815089, + "learning_rate": 2.101017585698472e-05, + "loss": 0.66, + "num_input_tokens_seen": 908423040, + "step": 5033 + }, + { + "epoch": 0.5510824051013985, + "grad_norm": 1.2446140125291942, + "learning_rate": 2.1001687680189524e-05, + "loss": 0.777, + "num_input_tokens_seen": 908625088, + "step": 5034 + }, + { + "epoch": 0.5511918771723364, + "grad_norm": 1.3149521037904999, + "learning_rate": 2.0993199976387043e-05, + "loss": 0.6753, + "num_input_tokens_seen": 908813024, + "step": 5035 + }, + { + "epoch": 0.5513013492432743, + "grad_norm": 1.4274712566682006, + "learning_rate": 2.0984712746581337e-05, + "loss": 0.6891, + "num_input_tokens_seen": 908989088, + "step": 5036 + }, + { + "epoch": 0.5514108213142123, + "grad_norm": 1.3019256598056121, + "learning_rate": 2.0976225991776434e-05, + "loss": 0.6448, + "num_input_tokens_seen": 909151040, + "step": 5037 + }, + { + "epoch": 0.5515202933851501, + "grad_norm": 1.330150002933487, + "learning_rate": 2.0967739712976308e-05, + "loss": 0.7178, + "num_input_tokens_seen": 909338976, + "step": 5038 + }, + { + "epoch": 0.551629765456088, + "grad_norm": 1.1746611979209678, + "learning_rate": 2.0959253911184867e-05, + "loss": 0.6415, + "num_input_tokens_seen": 909522208, + "step": 5039 + }, + { + "epoch": 0.5517392375270259, + "grad_norm": 1.2120819098520077, + "learning_rate": 2.0950768587405963e-05, + "loss": 0.7438, + "num_input_tokens_seen": 909708352, + "step": 5040 + }, + { + "epoch": 0.5518487095979638, + "grad_norm": 1.2838416057021207, + "learning_rate": 2.0942283742643392e-05, + "loss": 0.7387, + "num_input_tokens_seen": 909893376, + "step": 5041 + }, + { + "epoch": 0.5519581816689018, + "grad_norm": 1.2117647360284824, + "learning_rate": 2.0933799377900907e-05, + "loss": 0.6153, + "num_input_tokens_seen": 910082432, + "step": 5042 + }, + { + "epoch": 0.5520676537398396, + "grad_norm": 1.3656287326368914, + "learning_rate": 2.0925315494182168e-05, + "loss": 0.7827, + "num_input_tokens_seen": 910248416, + "step": 5043 + }, + { + "epoch": 0.5521771258107775, + "grad_norm": 1.2655305709462141, + "learning_rate": 2.091683209249082e-05, + "loss": 0.7814, + "num_input_tokens_seen": 910433216, + "step": 5044 + }, + { + "epoch": 0.5522865978817154, + "grad_norm": 1.2962369153384095, + "learning_rate": 2.090834917383044e-05, + "loss": 0.6318, + "num_input_tokens_seen": 910613312, + "step": 5045 + }, + { + "epoch": 0.5523960699526533, + "grad_norm": 1.2595987845249315, + "learning_rate": 2.089986673920452e-05, + "loss": 0.6972, + "num_input_tokens_seen": 910800128, + "step": 5046 + }, + { + "epoch": 0.5525055420235913, + "grad_norm": 1.184090739739239, + "learning_rate": 2.0891384789616535e-05, + "loss": 0.7616, + "num_input_tokens_seen": 911003296, + "step": 5047 + }, + { + "epoch": 0.5526150140945292, + "grad_norm": 1.2167470563606442, + "learning_rate": 2.088290332606987e-05, + "loss": 0.7694, + "num_input_tokens_seen": 911171520, + "step": 5048 + }, + { + "epoch": 0.552724486165467, + "grad_norm": 1.4091816331793134, + "learning_rate": 2.0874422349567866e-05, + "loss": 0.6619, + "num_input_tokens_seen": 911342432, + "step": 5049 + }, + { + "epoch": 0.5528339582364049, + "grad_norm": 1.4352628465830704, + "learning_rate": 2.0865941861113818e-05, + "loss": 0.9613, + "num_input_tokens_seen": 911553664, + "step": 5050 + }, + { + "epoch": 0.5529434303073428, + "grad_norm": 1.5795345998357386, + "learning_rate": 2.085746186171094e-05, + "loss": 0.6672, + "num_input_tokens_seen": 911740928, + "step": 5051 + }, + { + "epoch": 0.5530529023782808, + "grad_norm": 1.2687362764114285, + "learning_rate": 2.0848982352362413e-05, + "loss": 0.8157, + "num_input_tokens_seen": 911899968, + "step": 5052 + }, + { + "epoch": 0.5531623744492187, + "grad_norm": 1.3854873132499976, + "learning_rate": 2.0840503334071332e-05, + "loss": 0.8, + "num_input_tokens_seen": 912071104, + "step": 5053 + }, + { + "epoch": 0.5532718465201566, + "grad_norm": 1.3909357988033892, + "learning_rate": 2.0832024807840762e-05, + "loss": 0.812, + "num_input_tokens_seen": 912278528, + "step": 5054 + }, + { + "epoch": 0.5533813185910944, + "grad_norm": 1.2773305909765136, + "learning_rate": 2.082354677467368e-05, + "loss": 0.767, + "num_input_tokens_seen": 912476544, + "step": 5055 + }, + { + "epoch": 0.5534907906620323, + "grad_norm": 1.1668715921443138, + "learning_rate": 2.081506923557303e-05, + "loss": 0.699, + "num_input_tokens_seen": 912651936, + "step": 5056 + }, + { + "epoch": 0.5536002627329702, + "grad_norm": 1.135722769442504, + "learning_rate": 2.08065921915417e-05, + "loss": 0.5955, + "num_input_tokens_seen": 912847712, + "step": 5057 + }, + { + "epoch": 0.5537097348039082, + "grad_norm": 1.1648255989610088, + "learning_rate": 2.079811564358249e-05, + "loss": 0.8396, + "num_input_tokens_seen": 913037440, + "step": 5058 + }, + { + "epoch": 0.5538192068748461, + "grad_norm": 1.251466320214087, + "learning_rate": 2.0789639592698164e-05, + "loss": 0.7926, + "num_input_tokens_seen": 913226048, + "step": 5059 + }, + { + "epoch": 0.5539286789457839, + "grad_norm": 1.4373908623439564, + "learning_rate": 2.0781164039891432e-05, + "loss": 0.8552, + "num_input_tokens_seen": 913391584, + "step": 5060 + }, + { + "epoch": 0.5540381510167218, + "grad_norm": 1.2120524042370282, + "learning_rate": 2.0772688986164928e-05, + "loss": 0.7468, + "num_input_tokens_seen": 913563392, + "step": 5061 + }, + { + "epoch": 0.5541476230876597, + "grad_norm": 1.3006390101472418, + "learning_rate": 2.076421443252123e-05, + "loss": 0.5846, + "num_input_tokens_seen": 913742368, + "step": 5062 + }, + { + "epoch": 0.5542570951585977, + "grad_norm": 1.401417035535466, + "learning_rate": 2.0755740379962864e-05, + "loss": 0.8922, + "num_input_tokens_seen": 913942624, + "step": 5063 + }, + { + "epoch": 0.5543665672295356, + "grad_norm": 1.2147022913176246, + "learning_rate": 2.0747266829492312e-05, + "loss": 0.7978, + "num_input_tokens_seen": 914136160, + "step": 5064 + }, + { + "epoch": 0.5544760393004735, + "grad_norm": 1.182020848159758, + "learning_rate": 2.0738793782111954e-05, + "loss": 0.8024, + "num_input_tokens_seen": 914324096, + "step": 5065 + }, + { + "epoch": 0.5545855113714113, + "grad_norm": 1.2587058170750616, + "learning_rate": 2.0730321238824156e-05, + "loss": 0.6321, + "num_input_tokens_seen": 914487392, + "step": 5066 + }, + { + "epoch": 0.5546949834423492, + "grad_norm": 1.1858838276535035, + "learning_rate": 2.072184920063118e-05, + "loss": 0.6425, + "num_input_tokens_seen": 914703328, + "step": 5067 + }, + { + "epoch": 0.5548044555132872, + "grad_norm": 1.0842146345391115, + "learning_rate": 2.0713377668535276e-05, + "loss": 0.7127, + "num_input_tokens_seen": 914892608, + "step": 5068 + }, + { + "epoch": 0.5549139275842251, + "grad_norm": 1.2430783322388768, + "learning_rate": 2.070490664353859e-05, + "loss": 0.7138, + "num_input_tokens_seen": 915070464, + "step": 5069 + }, + { + "epoch": 0.555023399655163, + "grad_norm": 1.2659315985902497, + "learning_rate": 2.0696436126643236e-05, + "loss": 0.9494, + "num_input_tokens_seen": 915258400, + "step": 5070 + }, + { + "epoch": 0.5551328717261009, + "grad_norm": 1.1646635820631446, + "learning_rate": 2.0687966118851268e-05, + "loss": 0.8068, + "num_input_tokens_seen": 915457536, + "step": 5071 + }, + { + "epoch": 0.5552423437970387, + "grad_norm": 1.3033481050143678, + "learning_rate": 2.067949662116466e-05, + "loss": 0.7889, + "num_input_tokens_seen": 915602912, + "step": 5072 + }, + { + "epoch": 0.5553518158679767, + "grad_norm": 1.3238156135442862, + "learning_rate": 2.067102763458535e-05, + "loss": 0.6716, + "num_input_tokens_seen": 915756352, + "step": 5073 + }, + { + "epoch": 0.5554612879389146, + "grad_norm": 1.246997613064853, + "learning_rate": 2.0662559160115186e-05, + "loss": 0.6023, + "num_input_tokens_seen": 915930848, + "step": 5074 + }, + { + "epoch": 0.5555707600098525, + "grad_norm": 1.2410749335401232, + "learning_rate": 2.065409119875599e-05, + "loss": 0.823, + "num_input_tokens_seen": 916127968, + "step": 5075 + }, + { + "epoch": 0.5556802320807904, + "grad_norm": 1.4161608391327205, + "learning_rate": 2.0645623751509495e-05, + "loss": 0.7421, + "num_input_tokens_seen": 916274688, + "step": 5076 + }, + { + "epoch": 0.5557897041517282, + "grad_norm": 1.2184551077874386, + "learning_rate": 2.0637156819377378e-05, + "loss": 0.6162, + "num_input_tokens_seen": 916468896, + "step": 5077 + }, + { + "epoch": 0.5558991762226662, + "grad_norm": 1.2970756926997238, + "learning_rate": 2.0628690403361285e-05, + "loss": 0.9308, + "num_input_tokens_seen": 916642944, + "step": 5078 + }, + { + "epoch": 0.5560086482936041, + "grad_norm": 1.4171919783718088, + "learning_rate": 2.0620224504462742e-05, + "loss": 0.7521, + "num_input_tokens_seen": 916838496, + "step": 5079 + }, + { + "epoch": 0.556118120364542, + "grad_norm": 1.298353111196432, + "learning_rate": 2.061175912368328e-05, + "loss": 0.5736, + "num_input_tokens_seen": 917018368, + "step": 5080 + }, + { + "epoch": 0.5562275924354799, + "grad_norm": 1.2501536751696658, + "learning_rate": 2.0603294262024323e-05, + "loss": 0.65, + "num_input_tokens_seen": 917200928, + "step": 5081 + }, + { + "epoch": 0.5563370645064178, + "grad_norm": 1.160996976948695, + "learning_rate": 2.059482992048725e-05, + "loss": 0.7929, + "num_input_tokens_seen": 917408128, + "step": 5082 + }, + { + "epoch": 0.5564465365773557, + "grad_norm": 1.3643115940292827, + "learning_rate": 2.058636610007337e-05, + "loss": 0.8905, + "num_input_tokens_seen": 917584416, + "step": 5083 + }, + { + "epoch": 0.5565560086482936, + "grad_norm": 1.219644022974743, + "learning_rate": 2.057790280178394e-05, + "loss": 0.6044, + "num_input_tokens_seen": 917769664, + "step": 5084 + }, + { + "epoch": 0.5566654807192315, + "grad_norm": 1.2334240497699498, + "learning_rate": 2.056944002662017e-05, + "loss": 0.8594, + "num_input_tokens_seen": 917963424, + "step": 5085 + }, + { + "epoch": 0.5567749527901694, + "grad_norm": 1.4013287794850986, + "learning_rate": 2.0560977775583162e-05, + "loss": 0.8714, + "num_input_tokens_seen": 918172416, + "step": 5086 + }, + { + "epoch": 0.5568844248611073, + "grad_norm": 1.3189530003946544, + "learning_rate": 2.0552516049674007e-05, + "loss": 0.6985, + "num_input_tokens_seen": 918320032, + "step": 5087 + }, + { + "epoch": 0.5569938969320453, + "grad_norm": 1.290488084823069, + "learning_rate": 2.0544054849893696e-05, + "loss": 0.6979, + "num_input_tokens_seen": 918512000, + "step": 5088 + }, + { + "epoch": 0.5571033690029831, + "grad_norm": 1.084231621665431, + "learning_rate": 2.0535594177243183e-05, + "loss": 0.7105, + "num_input_tokens_seen": 918718080, + "step": 5089 + }, + { + "epoch": 0.557212841073921, + "grad_norm": 1.282125546034542, + "learning_rate": 2.0527134032723337e-05, + "loss": 0.9929, + "num_input_tokens_seen": 918926400, + "step": 5090 + }, + { + "epoch": 0.5573223131448589, + "grad_norm": 1.3551065362243009, + "learning_rate": 2.0518674417334982e-05, + "loss": 0.7281, + "num_input_tokens_seen": 919061472, + "step": 5091 + }, + { + "epoch": 0.5574317852157968, + "grad_norm": 1.1970163306713322, + "learning_rate": 2.0510215332078884e-05, + "loss": 0.6037, + "num_input_tokens_seen": 919263744, + "step": 5092 + }, + { + "epoch": 0.5575412572867348, + "grad_norm": 1.4942085200821893, + "learning_rate": 2.050175677795572e-05, + "loss": 0.7978, + "num_input_tokens_seen": 919453920, + "step": 5093 + }, + { + "epoch": 0.5576507293576726, + "grad_norm": 1.2771407966484378, + "learning_rate": 2.0493298755966145e-05, + "loss": 0.7985, + "num_input_tokens_seen": 919602880, + "step": 5094 + }, + { + "epoch": 0.5577602014286105, + "grad_norm": 1.2601931770528678, + "learning_rate": 2.0484841267110698e-05, + "loss": 0.8073, + "num_input_tokens_seen": 919786112, + "step": 5095 + }, + { + "epoch": 0.5578696734995484, + "grad_norm": 1.2347140751872536, + "learning_rate": 2.0476384312389914e-05, + "loss": 0.6233, + "num_input_tokens_seen": 919982112, + "step": 5096 + }, + { + "epoch": 0.5579791455704863, + "grad_norm": 1.3606158326400446, + "learning_rate": 2.04679278928042e-05, + "loss": 0.6873, + "num_input_tokens_seen": 920157952, + "step": 5097 + }, + { + "epoch": 0.5580886176414243, + "grad_norm": 1.3929435175708669, + "learning_rate": 2.0459472009353957e-05, + "loss": 0.8741, + "num_input_tokens_seen": 920362016, + "step": 5098 + }, + { + "epoch": 0.5581980897123622, + "grad_norm": 1.3946243810677101, + "learning_rate": 2.0451016663039503e-05, + "loss": 0.8312, + "num_input_tokens_seen": 920548608, + "step": 5099 + }, + { + "epoch": 0.5583075617833, + "grad_norm": 1.234784263656013, + "learning_rate": 2.0442561854861076e-05, + "loss": 0.6009, + "num_input_tokens_seen": 920718400, + "step": 5100 + }, + { + "epoch": 0.5584170338542379, + "grad_norm": 1.2224455338028866, + "learning_rate": 2.043410758581887e-05, + "loss": 0.7294, + "num_input_tokens_seen": 920903648, + "step": 5101 + }, + { + "epoch": 0.5585265059251758, + "grad_norm": 1.0793248562632083, + "learning_rate": 2.042565385691301e-05, + "loss": 0.5218, + "num_input_tokens_seen": 921050144, + "step": 5102 + }, + { + "epoch": 0.5586359779961138, + "grad_norm": 1.2509549784529217, + "learning_rate": 2.041720066914355e-05, + "loss": 0.7643, + "num_input_tokens_seen": 921245696, + "step": 5103 + }, + { + "epoch": 0.5587454500670517, + "grad_norm": 1.1205250962633082, + "learning_rate": 2.040874802351049e-05, + "loss": 0.7074, + "num_input_tokens_seen": 921428704, + "step": 5104 + }, + { + "epoch": 0.5588549221379896, + "grad_norm": 1.3700597184106296, + "learning_rate": 2.040029592101376e-05, + "loss": 0.8414, + "num_input_tokens_seen": 921607904, + "step": 5105 + }, + { + "epoch": 0.5589643942089274, + "grad_norm": 1.307929117889458, + "learning_rate": 2.039184436265324e-05, + "loss": 0.8857, + "num_input_tokens_seen": 921800768, + "step": 5106 + }, + { + "epoch": 0.5590738662798653, + "grad_norm": 1.3589184312207583, + "learning_rate": 2.038339334942871e-05, + "loss": 0.7862, + "num_input_tokens_seen": 921958464, + "step": 5107 + }, + { + "epoch": 0.5591833383508032, + "grad_norm": 1.1809699559149174, + "learning_rate": 2.0374942882339935e-05, + "loss": 0.882, + "num_input_tokens_seen": 922137664, + "step": 5108 + }, + { + "epoch": 0.5592928104217412, + "grad_norm": 1.1507248708636102, + "learning_rate": 2.0366492962386563e-05, + "loss": 0.6233, + "num_input_tokens_seen": 922345312, + "step": 5109 + }, + { + "epoch": 0.5594022824926791, + "grad_norm": 1.2843248640610025, + "learning_rate": 2.0358043590568215e-05, + "loss": 0.6916, + "num_input_tokens_seen": 922530560, + "step": 5110 + }, + { + "epoch": 0.5595117545636169, + "grad_norm": 1.3343007730390999, + "learning_rate": 2.034959476788445e-05, + "loss": 0.6966, + "num_input_tokens_seen": 922694752, + "step": 5111 + }, + { + "epoch": 0.5596212266345548, + "grad_norm": 1.1553456661962416, + "learning_rate": 2.034114649533472e-05, + "loss": 0.9519, + "num_input_tokens_seen": 922889632, + "step": 5112 + }, + { + "epoch": 0.5597306987054927, + "grad_norm": 1.3822892055571014, + "learning_rate": 2.033269877391846e-05, + "loss": 0.7289, + "num_input_tokens_seen": 923084736, + "step": 5113 + }, + { + "epoch": 0.5598401707764307, + "grad_norm": 1.204379084549982, + "learning_rate": 2.032425160463501e-05, + "loss": 0.6876, + "num_input_tokens_seen": 923232800, + "step": 5114 + }, + { + "epoch": 0.5599496428473686, + "grad_norm": 1.2258528582126398, + "learning_rate": 2.0315804988483665e-05, + "loss": 0.7089, + "num_input_tokens_seen": 923392288, + "step": 5115 + }, + { + "epoch": 0.5600591149183065, + "grad_norm": 1.1675856864737952, + "learning_rate": 2.030735892646362e-05, + "loss": 0.5511, + "num_input_tokens_seen": 923554240, + "step": 5116 + }, + { + "epoch": 0.5601685869892443, + "grad_norm": 1.4479146266712482, + "learning_rate": 2.029891341957405e-05, + "loss": 0.9287, + "num_input_tokens_seen": 923715296, + "step": 5117 + }, + { + "epoch": 0.5602780590601822, + "grad_norm": 1.2702138151568065, + "learning_rate": 2.0290468468814045e-05, + "loss": 0.8578, + "num_input_tokens_seen": 923896736, + "step": 5118 + }, + { + "epoch": 0.5603875311311202, + "grad_norm": 1.293007564826426, + "learning_rate": 2.0282024075182603e-05, + "loss": 0.7242, + "num_input_tokens_seen": 924071680, + "step": 5119 + }, + { + "epoch": 0.5604970032020581, + "grad_norm": 1.1942826865077016, + "learning_rate": 2.0273580239678706e-05, + "loss": 0.6784, + "num_input_tokens_seen": 924244608, + "step": 5120 + }, + { + "epoch": 0.560606475272996, + "grad_norm": 1.2853839058104264, + "learning_rate": 2.0265136963301225e-05, + "loss": 0.7525, + "num_input_tokens_seen": 924404544, + "step": 5121 + }, + { + "epoch": 0.5607159473439339, + "grad_norm": 1.3411522084303755, + "learning_rate": 2.025669424704899e-05, + "loss": 0.72, + "num_input_tokens_seen": 924576800, + "step": 5122 + }, + { + "epoch": 0.5608254194148717, + "grad_norm": 1.2725929314754711, + "learning_rate": 2.0248252091920757e-05, + "loss": 0.6505, + "num_input_tokens_seen": 924776608, + "step": 5123 + }, + { + "epoch": 0.5609348914858097, + "grad_norm": 1.4460894171898724, + "learning_rate": 2.0239810498915213e-05, + "loss": 0.6875, + "num_input_tokens_seen": 924922880, + "step": 5124 + }, + { + "epoch": 0.5610443635567476, + "grad_norm": 1.2321295776994237, + "learning_rate": 2.0231369469030996e-05, + "loss": 0.6832, + "num_input_tokens_seen": 925113280, + "step": 5125 + }, + { + "epoch": 0.5611538356276855, + "grad_norm": 1.1781559279857179, + "learning_rate": 2.0222929003266645e-05, + "loss": 0.6811, + "num_input_tokens_seen": 925296064, + "step": 5126 + }, + { + "epoch": 0.5612633076986234, + "grad_norm": 1.262043866765946, + "learning_rate": 2.0214489102620675e-05, + "loss": 0.6185, + "num_input_tokens_seen": 925482656, + "step": 5127 + }, + { + "epoch": 0.5613727797695612, + "grad_norm": 1.4029207875382246, + "learning_rate": 2.0206049768091482e-05, + "loss": 0.8353, + "num_input_tokens_seen": 925655136, + "step": 5128 + }, + { + "epoch": 0.5614822518404992, + "grad_norm": 1.4281830924007297, + "learning_rate": 2.019761100067745e-05, + "loss": 0.8278, + "num_input_tokens_seen": 925840384, + "step": 5129 + }, + { + "epoch": 0.5615917239114371, + "grad_norm": 1.2857505096903308, + "learning_rate": 2.0189172801376845e-05, + "loss": 0.6623, + "num_input_tokens_seen": 926042656, + "step": 5130 + }, + { + "epoch": 0.561701195982375, + "grad_norm": 1.2753348228783556, + "learning_rate": 2.01807351711879e-05, + "loss": 0.7527, + "num_input_tokens_seen": 926250080, + "step": 5131 + }, + { + "epoch": 0.5618106680533129, + "grad_norm": 1.299143587967349, + "learning_rate": 2.0172298111108782e-05, + "loss": 0.6741, + "num_input_tokens_seen": 926436000, + "step": 5132 + }, + { + "epoch": 0.5619201401242508, + "grad_norm": 1.3152029724269654, + "learning_rate": 2.016386162213756e-05, + "loss": 0.657, + "num_input_tokens_seen": 926636928, + "step": 5133 + }, + { + "epoch": 0.5620296121951887, + "grad_norm": 1.2493905012000173, + "learning_rate": 2.0155425705272268e-05, + "loss": 0.8948, + "num_input_tokens_seen": 926828672, + "step": 5134 + }, + { + "epoch": 0.5621390842661266, + "grad_norm": 1.2971805993771695, + "learning_rate": 2.0146990361510844e-05, + "loss": 0.6581, + "num_input_tokens_seen": 926991520, + "step": 5135 + }, + { + "epoch": 0.5622485563370645, + "grad_norm": 1.3424615227212966, + "learning_rate": 2.0138555591851198e-05, + "loss": 0.7163, + "num_input_tokens_seen": 927157056, + "step": 5136 + }, + { + "epoch": 0.5623580284080024, + "grad_norm": 1.3165630847088439, + "learning_rate": 2.013012139729112e-05, + "loss": 0.7373, + "num_input_tokens_seen": 927351040, + "step": 5137 + }, + { + "epoch": 0.5624675004789403, + "grad_norm": 1.2173985544353116, + "learning_rate": 2.0121687778828372e-05, + "loss": 0.7935, + "num_input_tokens_seen": 927555104, + "step": 5138 + }, + { + "epoch": 0.5625769725498783, + "grad_norm": 1.436393809742391, + "learning_rate": 2.0113254737460643e-05, + "loss": 0.7021, + "num_input_tokens_seen": 927690176, + "step": 5139 + }, + { + "epoch": 0.5626864446208161, + "grad_norm": 1.3745194809255739, + "learning_rate": 2.0104822274185525e-05, + "loss": 0.7872, + "num_input_tokens_seen": 927858400, + "step": 5140 + }, + { + "epoch": 0.562795916691754, + "grad_norm": 1.4596894180785056, + "learning_rate": 2.009639039000059e-05, + "loss": 0.7356, + "num_input_tokens_seen": 928032000, + "step": 5141 + }, + { + "epoch": 0.5629053887626919, + "grad_norm": 1.3291236264585775, + "learning_rate": 2.0087959085903282e-05, + "loss": 0.6888, + "num_input_tokens_seen": 928218816, + "step": 5142 + }, + { + "epoch": 0.5630148608336298, + "grad_norm": 1.2461024556109985, + "learning_rate": 2.0079528362891032e-05, + "loss": 0.8143, + "num_input_tokens_seen": 928408096, + "step": 5143 + }, + { + "epoch": 0.5631243329045678, + "grad_norm": 1.3483433395520563, + "learning_rate": 2.0071098221961168e-05, + "loss": 0.8208, + "num_input_tokens_seen": 928609472, + "step": 5144 + }, + { + "epoch": 0.5632338049755056, + "grad_norm": 1.3347553131089225, + "learning_rate": 2.0062668664110957e-05, + "loss": 0.7201, + "num_input_tokens_seen": 928823168, + "step": 5145 + }, + { + "epoch": 0.5633432770464435, + "grad_norm": 1.12118492686115, + "learning_rate": 2.005423969033761e-05, + "loss": 0.5968, + "num_input_tokens_seen": 928981536, + "step": 5146 + }, + { + "epoch": 0.5634527491173814, + "grad_norm": 1.228447358822812, + "learning_rate": 2.004581130163825e-05, + "loss": 0.6834, + "num_input_tokens_seen": 929173280, + "step": 5147 + }, + { + "epoch": 0.5635622211883193, + "grad_norm": 1.389965801229968, + "learning_rate": 2.0037383499009948e-05, + "loss": 0.6464, + "num_input_tokens_seen": 929341952, + "step": 5148 + }, + { + "epoch": 0.5636716932592573, + "grad_norm": 1.2304003984146812, + "learning_rate": 2.0028956283449686e-05, + "loss": 0.6408, + "num_input_tokens_seen": 929548256, + "step": 5149 + }, + { + "epoch": 0.5637811653301952, + "grad_norm": 1.2116769317356544, + "learning_rate": 2.00205296559544e-05, + "loss": 0.91, + "num_input_tokens_seen": 929753216, + "step": 5150 + }, + { + "epoch": 0.563890637401133, + "grad_norm": 1.2797846439364913, + "learning_rate": 2.0012103617520926e-05, + "loss": 0.525, + "num_input_tokens_seen": 929905984, + "step": 5151 + }, + { + "epoch": 0.5640001094720709, + "grad_norm": 1.3159793874813377, + "learning_rate": 2.000367816914606e-05, + "loss": 0.6845, + "num_input_tokens_seen": 930061216, + "step": 5152 + }, + { + "epoch": 0.5641095815430088, + "grad_norm": 0.9789362020697663, + "learning_rate": 1.9995253311826526e-05, + "loss": 0.5558, + "num_input_tokens_seen": 930281184, + "step": 5153 + }, + { + "epoch": 0.5642190536139468, + "grad_norm": 1.1629193216255047, + "learning_rate": 1.9986829046558944e-05, + "loss": 0.56, + "num_input_tokens_seen": 930460832, + "step": 5154 + }, + { + "epoch": 0.5643285256848847, + "grad_norm": 1.2215255530296558, + "learning_rate": 1.997840537433991e-05, + "loss": 0.9095, + "num_input_tokens_seen": 930661088, + "step": 5155 + }, + { + "epoch": 0.5644379977558226, + "grad_norm": 1.1711131353218043, + "learning_rate": 1.9969982296165915e-05, + "loss": 0.5955, + "num_input_tokens_seen": 930841856, + "step": 5156 + }, + { + "epoch": 0.5645474698267604, + "grad_norm": 1.3170531633142348, + "learning_rate": 1.996155981303341e-05, + "loss": 0.639, + "num_input_tokens_seen": 930995520, + "step": 5157 + }, + { + "epoch": 0.5646569418976983, + "grad_norm": 1.2366260332353671, + "learning_rate": 1.9953137925938737e-05, + "loss": 0.6926, + "num_input_tokens_seen": 931182112, + "step": 5158 + }, + { + "epoch": 0.5647664139686362, + "grad_norm": 1.281973332242283, + "learning_rate": 1.9944716635878197e-05, + "loss": 0.7113, + "num_input_tokens_seen": 931346752, + "step": 5159 + }, + { + "epoch": 0.5648758860395742, + "grad_norm": 1.2712420864102723, + "learning_rate": 1.9936295943848028e-05, + "loss": 0.6718, + "num_input_tokens_seen": 931512064, + "step": 5160 + }, + { + "epoch": 0.5649853581105121, + "grad_norm": 1.2808459040104128, + "learning_rate": 1.9927875850844356e-05, + "loss": 0.6866, + "num_input_tokens_seen": 931686784, + "step": 5161 + }, + { + "epoch": 0.5650948301814499, + "grad_norm": 1.235431894698487, + "learning_rate": 1.9919456357863286e-05, + "loss": 0.6033, + "num_input_tokens_seen": 931836640, + "step": 5162 + }, + { + "epoch": 0.5652043022523878, + "grad_norm": 1.204786812613481, + "learning_rate": 1.9911037465900807e-05, + "loss": 0.6833, + "num_input_tokens_seen": 932000608, + "step": 5163 + }, + { + "epoch": 0.5653137743233257, + "grad_norm": 1.3125824675310211, + "learning_rate": 1.990261917595287e-05, + "loss": 0.7289, + "num_input_tokens_seen": 932180928, + "step": 5164 + }, + { + "epoch": 0.5654232463942637, + "grad_norm": 1.2447633726769638, + "learning_rate": 1.9894201489015342e-05, + "loss": 0.6289, + "num_input_tokens_seen": 932348480, + "step": 5165 + }, + { + "epoch": 0.5655327184652016, + "grad_norm": 1.1508913877034523, + "learning_rate": 1.9885784406084012e-05, + "loss": 0.7782, + "num_input_tokens_seen": 932545824, + "step": 5166 + }, + { + "epoch": 0.5656421905361395, + "grad_norm": 1.2571840789619417, + "learning_rate": 1.9877367928154618e-05, + "loss": 0.8401, + "num_input_tokens_seen": 932760192, + "step": 5167 + }, + { + "epoch": 0.5657516626070773, + "grad_norm": 1.2721784955959117, + "learning_rate": 1.9868952056222795e-05, + "loss": 0.6959, + "num_input_tokens_seen": 932947456, + "step": 5168 + }, + { + "epoch": 0.5658611346780152, + "grad_norm": 1.3999943937461863, + "learning_rate": 1.9860536791284148e-05, + "loss": 0.7142, + "num_input_tokens_seen": 933128896, + "step": 5169 + }, + { + "epoch": 0.5659706067489532, + "grad_norm": 1.2219814107857654, + "learning_rate": 1.985212213433416e-05, + "loss": 0.7648, + "num_input_tokens_seen": 933336768, + "step": 5170 + }, + { + "epoch": 0.5660800788198911, + "grad_norm": 1.3738377167011937, + "learning_rate": 1.9843708086368287e-05, + "loss": 0.7751, + "num_input_tokens_seen": 933525376, + "step": 5171 + }, + { + "epoch": 0.566189550890829, + "grad_norm": 1.216811717327089, + "learning_rate": 1.9835294648381898e-05, + "loss": 0.8413, + "num_input_tokens_seen": 933713536, + "step": 5172 + }, + { + "epoch": 0.5662990229617669, + "grad_norm": 1.2211979953934342, + "learning_rate": 1.9826881821370268e-05, + "loss": 0.7625, + "num_input_tokens_seen": 933899904, + "step": 5173 + }, + { + "epoch": 0.5664084950327047, + "grad_norm": 1.2398533513659475, + "learning_rate": 1.9818469606328642e-05, + "loss": 0.7137, + "num_input_tokens_seen": 934067008, + "step": 5174 + }, + { + "epoch": 0.5665179671036427, + "grad_norm": 1.51150345562973, + "learning_rate": 1.9810058004252146e-05, + "loss": 0.749, + "num_input_tokens_seen": 934240384, + "step": 5175 + }, + { + "epoch": 0.5666274391745806, + "grad_norm": 1.4867914708243601, + "learning_rate": 1.9801647016135868e-05, + "loss": 0.8713, + "num_input_tokens_seen": 934399200, + "step": 5176 + }, + { + "epoch": 0.5667369112455185, + "grad_norm": 1.437880258437315, + "learning_rate": 1.9793236642974806e-05, + "loss": 0.8931, + "num_input_tokens_seen": 934586240, + "step": 5177 + }, + { + "epoch": 0.5668463833164564, + "grad_norm": 1.2670395563690462, + "learning_rate": 1.9784826885763903e-05, + "loss": 0.5831, + "num_input_tokens_seen": 934777088, + "step": 5178 + }, + { + "epoch": 0.5669558553873942, + "grad_norm": 1.3032066026930162, + "learning_rate": 1.977641774549801e-05, + "loss": 0.6634, + "num_input_tokens_seen": 934931872, + "step": 5179 + }, + { + "epoch": 0.5670653274583322, + "grad_norm": 1.4159762252061725, + "learning_rate": 1.9768009223171907e-05, + "loss": 0.813, + "num_input_tokens_seen": 935108384, + "step": 5180 + }, + { + "epoch": 0.5671747995292701, + "grad_norm": 1.264582404691972, + "learning_rate": 1.9759601319780317e-05, + "loss": 0.5376, + "num_input_tokens_seen": 935251744, + "step": 5181 + }, + { + "epoch": 0.567284271600208, + "grad_norm": 1.214618870568069, + "learning_rate": 1.9751194036317868e-05, + "loss": 0.6096, + "num_input_tokens_seen": 935409888, + "step": 5182 + }, + { + "epoch": 0.5673937436711459, + "grad_norm": 1.2502480737571775, + "learning_rate": 1.9742787373779137e-05, + "loss": 0.7021, + "num_input_tokens_seen": 935602752, + "step": 5183 + }, + { + "epoch": 0.5675032157420838, + "grad_norm": 1.3684419816652214, + "learning_rate": 1.9734381333158604e-05, + "loss": 0.8124, + "num_input_tokens_seen": 935778368, + "step": 5184 + }, + { + "epoch": 0.5676126878130217, + "grad_norm": 1.316971066251946, + "learning_rate": 1.9725975915450687e-05, + "loss": 0.7234, + "num_input_tokens_seen": 935953312, + "step": 5185 + }, + { + "epoch": 0.5677221598839596, + "grad_norm": 1.3209139565652275, + "learning_rate": 1.971757112164975e-05, + "loss": 0.8517, + "num_input_tokens_seen": 936139456, + "step": 5186 + }, + { + "epoch": 0.5678316319548975, + "grad_norm": 1.1557055557587466, + "learning_rate": 1.970916695275004e-05, + "loss": 0.5732, + "num_input_tokens_seen": 936308128, + "step": 5187 + }, + { + "epoch": 0.5679411040258354, + "grad_norm": 1.247804573429419, + "learning_rate": 1.9700763409745773e-05, + "loss": 0.5845, + "num_input_tokens_seen": 936423488, + "step": 5188 + }, + { + "epoch": 0.5680505760967733, + "grad_norm": 1.322864030807133, + "learning_rate": 1.9692360493631058e-05, + "loss": 0.6367, + "num_input_tokens_seen": 936588800, + "step": 5189 + }, + { + "epoch": 0.5681600481677113, + "grad_norm": 1.391811893027727, + "learning_rate": 1.968395820539996e-05, + "loss": 0.9478, + "num_input_tokens_seen": 936772480, + "step": 5190 + }, + { + "epoch": 0.5682695202386491, + "grad_norm": 1.142432413835847, + "learning_rate": 1.967555654604643e-05, + "loss": 0.7164, + "num_input_tokens_seen": 936951680, + "step": 5191 + }, + { + "epoch": 0.568378992309587, + "grad_norm": 1.3467658155754338, + "learning_rate": 1.9667155516564385e-05, + "loss": 0.7618, + "num_input_tokens_seen": 937154176, + "step": 5192 + }, + { + "epoch": 0.5684884643805249, + "grad_norm": 1.2486546906383875, + "learning_rate": 1.9658755117947657e-05, + "loss": 0.9073, + "num_input_tokens_seen": 937346144, + "step": 5193 + }, + { + "epoch": 0.5685979364514628, + "grad_norm": 1.2975995900589734, + "learning_rate": 1.965035535118998e-05, + "loss": 0.8494, + "num_input_tokens_seen": 937540128, + "step": 5194 + }, + { + "epoch": 0.5687074085224008, + "grad_norm": 1.2278395401095408, + "learning_rate": 1.9641956217285048e-05, + "loss": 0.866, + "num_input_tokens_seen": 937719328, + "step": 5195 + }, + { + "epoch": 0.5688168805933386, + "grad_norm": 1.3096700768300553, + "learning_rate": 1.9633557717226443e-05, + "loss": 0.8548, + "num_input_tokens_seen": 937892256, + "step": 5196 + }, + { + "epoch": 0.5689263526642765, + "grad_norm": 1.364087016965718, + "learning_rate": 1.96251598520077e-05, + "loss": 0.6422, + "num_input_tokens_seen": 938082208, + "step": 5197 + }, + { + "epoch": 0.5690358247352144, + "grad_norm": 1.1702915429147498, + "learning_rate": 1.9616762622622272e-05, + "loss": 0.7881, + "num_input_tokens_seen": 938291648, + "step": 5198 + }, + { + "epoch": 0.5691452968061523, + "grad_norm": 1.307932535767733, + "learning_rate": 1.960836603006354e-05, + "loss": 0.8533, + "num_input_tokens_seen": 938441728, + "step": 5199 + }, + { + "epoch": 0.5692547688770903, + "grad_norm": 1.173001828441837, + "learning_rate": 1.9599970075324797e-05, + "loss": 0.5863, + "num_input_tokens_seen": 938616448, + "step": 5200 + }, + { + "epoch": 0.5693642409480282, + "grad_norm": 1.2281832343896075, + "learning_rate": 1.959157475939927e-05, + "loss": 0.6881, + "num_input_tokens_seen": 938810656, + "step": 5201 + }, + { + "epoch": 0.569473713018966, + "grad_norm": 1.3837547755654347, + "learning_rate": 1.9583180083280118e-05, + "loss": 0.7814, + "num_input_tokens_seen": 939003520, + "step": 5202 + }, + { + "epoch": 0.5695831850899039, + "grad_norm": 1.4158492628566912, + "learning_rate": 1.9574786047960394e-05, + "loss": 0.948, + "num_input_tokens_seen": 939188992, + "step": 5203 + }, + { + "epoch": 0.5696926571608418, + "grad_norm": 1.1572251203181942, + "learning_rate": 1.9566392654433123e-05, + "loss": 0.581, + "num_input_tokens_seen": 939354304, + "step": 5204 + }, + { + "epoch": 0.5698021292317798, + "grad_norm": 1.2424876012509702, + "learning_rate": 1.95579999036912e-05, + "loss": 0.5946, + "num_input_tokens_seen": 939508864, + "step": 5205 + }, + { + "epoch": 0.5699116013027177, + "grad_norm": 1.2444995500127731, + "learning_rate": 1.9549607796727487e-05, + "loss": 0.8921, + "num_input_tokens_seen": 939702400, + "step": 5206 + }, + { + "epoch": 0.5700210733736556, + "grad_norm": 1.4123012048369494, + "learning_rate": 1.9541216334534764e-05, + "loss": 0.6433, + "num_input_tokens_seen": 939870624, + "step": 5207 + }, + { + "epoch": 0.5701305454445934, + "grad_norm": 1.2029933114250362, + "learning_rate": 1.9532825518105702e-05, + "loss": 0.6541, + "num_input_tokens_seen": 940038848, + "step": 5208 + }, + { + "epoch": 0.5702400175155313, + "grad_norm": 1.232875299244264, + "learning_rate": 1.9524435348432933e-05, + "loss": 0.6135, + "num_input_tokens_seen": 940228800, + "step": 5209 + }, + { + "epoch": 0.5703494895864692, + "grad_norm": 1.22651356399009, + "learning_rate": 1.9516045826508994e-05, + "loss": 0.6384, + "num_input_tokens_seen": 940396576, + "step": 5210 + }, + { + "epoch": 0.5704589616574072, + "grad_norm": 1.2246611126400164, + "learning_rate": 1.9507656953326355e-05, + "loss": 0.839, + "num_input_tokens_seen": 940591680, + "step": 5211 + }, + { + "epoch": 0.5705684337283451, + "grad_norm": 1.2721238644978958, + "learning_rate": 1.949926872987739e-05, + "loss": 0.7935, + "num_input_tokens_seen": 940755872, + "step": 5212 + }, + { + "epoch": 0.5706779057992829, + "grad_norm": 1.3259402537716247, + "learning_rate": 1.9490881157154422e-05, + "loss": 0.7336, + "num_input_tokens_seen": 940936864, + "step": 5213 + }, + { + "epoch": 0.5707873778702208, + "grad_norm": 1.2970954983332048, + "learning_rate": 1.948249423614969e-05, + "loss": 0.6243, + "num_input_tokens_seen": 941108896, + "step": 5214 + }, + { + "epoch": 0.5708968499411587, + "grad_norm": 1.2606663995090595, + "learning_rate": 1.9474107967855332e-05, + "loss": 0.6991, + "num_input_tokens_seen": 941273760, + "step": 5215 + }, + { + "epoch": 0.5710063220120967, + "grad_norm": 1.3148917703365055, + "learning_rate": 1.9465722353263445e-05, + "loss": 0.7685, + "num_input_tokens_seen": 941485888, + "step": 5216 + }, + { + "epoch": 0.5711157940830346, + "grad_norm": 1.3955586386114704, + "learning_rate": 1.945733739336602e-05, + "loss": 0.839, + "num_input_tokens_seen": 941657248, + "step": 5217 + }, + { + "epoch": 0.5712252661539725, + "grad_norm": 1.226996521180809, + "learning_rate": 1.9448953089154982e-05, + "loss": 0.6621, + "num_input_tokens_seen": 941825696, + "step": 5218 + }, + { + "epoch": 0.5713347382249103, + "grad_norm": 1.3320715666008518, + "learning_rate": 1.9440569441622182e-05, + "loss": 0.8085, + "num_input_tokens_seen": 942024160, + "step": 5219 + }, + { + "epoch": 0.5714442102958482, + "grad_norm": 1.173037549978025, + "learning_rate": 1.9432186451759397e-05, + "loss": 0.682, + "num_input_tokens_seen": 942222624, + "step": 5220 + }, + { + "epoch": 0.5715536823667862, + "grad_norm": 1.1725106613390235, + "learning_rate": 1.9423804120558307e-05, + "loss": 0.6697, + "num_input_tokens_seen": 942443264, + "step": 5221 + }, + { + "epoch": 0.5716631544377241, + "grad_norm": 1.4050159974052903, + "learning_rate": 1.9415422449010523e-05, + "loss": 0.7611, + "num_input_tokens_seen": 942636576, + "step": 5222 + }, + { + "epoch": 0.571772626508662, + "grad_norm": 1.351981346190728, + "learning_rate": 1.94070414381076e-05, + "loss": 0.7642, + "num_input_tokens_seen": 942819584, + "step": 5223 + }, + { + "epoch": 0.5718820985795999, + "grad_norm": 1.1707900047204016, + "learning_rate": 1.9398661088840974e-05, + "loss": 0.6555, + "num_input_tokens_seen": 943009536, + "step": 5224 + }, + { + "epoch": 0.5719915706505377, + "grad_norm": 1.2309126280387634, + "learning_rate": 1.9390281402202043e-05, + "loss": 0.7859, + "num_input_tokens_seen": 943229952, + "step": 5225 + }, + { + "epoch": 0.5721010427214757, + "grad_norm": 1.1816397674809647, + "learning_rate": 1.9381902379182085e-05, + "loss": 0.6229, + "num_input_tokens_seen": 943406912, + "step": 5226 + }, + { + "epoch": 0.5722105147924136, + "grad_norm": 1.1631435376821335, + "learning_rate": 1.9373524020772337e-05, + "loss": 0.9615, + "num_input_tokens_seen": 943615008, + "step": 5227 + }, + { + "epoch": 0.5723199868633515, + "grad_norm": 1.245222546476818, + "learning_rate": 1.9365146327963955e-05, + "loss": 0.8624, + "num_input_tokens_seen": 943813472, + "step": 5228 + }, + { + "epoch": 0.5724294589342894, + "grad_norm": 1.1869220331001913, + "learning_rate": 1.9356769301747972e-05, + "loss": 0.6925, + "num_input_tokens_seen": 944006784, + "step": 5229 + }, + { + "epoch": 0.5725389310052272, + "grad_norm": 1.1227218555667946, + "learning_rate": 1.9348392943115405e-05, + "loss": 0.61, + "num_input_tokens_seen": 944207712, + "step": 5230 + }, + { + "epoch": 0.5726484030761652, + "grad_norm": 1.2557282327432007, + "learning_rate": 1.9340017253057142e-05, + "loss": 0.6085, + "num_input_tokens_seen": 944378848, + "step": 5231 + }, + { + "epoch": 0.5727578751471031, + "grad_norm": 1.1505020165390518, + "learning_rate": 1.933164223256403e-05, + "loss": 0.6213, + "num_input_tokens_seen": 944577760, + "step": 5232 + }, + { + "epoch": 0.572867347218041, + "grad_norm": 1.223973792228741, + "learning_rate": 1.932326788262679e-05, + "loss": 0.6846, + "num_input_tokens_seen": 944750464, + "step": 5233 + }, + { + "epoch": 0.5729768192889789, + "grad_norm": 1.3195698923852375, + "learning_rate": 1.931489420423611e-05, + "loss": 0.7816, + "num_input_tokens_seen": 944905248, + "step": 5234 + }, + { + "epoch": 0.5730862913599168, + "grad_norm": 1.306791790028527, + "learning_rate": 1.930652119838259e-05, + "loss": 0.6278, + "num_input_tokens_seen": 945085344, + "step": 5235 + }, + { + "epoch": 0.5731957634308547, + "grad_norm": 1.6302004276689257, + "learning_rate": 1.9298148866056716e-05, + "loss": 0.7355, + "num_input_tokens_seen": 945249088, + "step": 5236 + }, + { + "epoch": 0.5733052355017926, + "grad_norm": 1.1537778045511105, + "learning_rate": 1.9289777208248942e-05, + "loss": 0.7272, + "num_input_tokens_seen": 945391552, + "step": 5237 + }, + { + "epoch": 0.5734147075727305, + "grad_norm": 1.0983411071070022, + "learning_rate": 1.92814062259496e-05, + "loss": 0.6367, + "num_input_tokens_seen": 945579040, + "step": 5238 + }, + { + "epoch": 0.5735241796436684, + "grad_norm": 1.5070808493449854, + "learning_rate": 1.9273035920148966e-05, + "loss": 0.783, + "num_input_tokens_seen": 945723968, + "step": 5239 + }, + { + "epoch": 0.5736336517146063, + "grad_norm": 1.255076257660964, + "learning_rate": 1.9264666291837242e-05, + "loss": 0.7364, + "num_input_tokens_seen": 945894656, + "step": 5240 + }, + { + "epoch": 0.5737431237855443, + "grad_norm": 1.3275509210651526, + "learning_rate": 1.9256297342004527e-05, + "loss": 0.8578, + "num_input_tokens_seen": 946075200, + "step": 5241 + }, + { + "epoch": 0.5738525958564821, + "grad_norm": 1.3192076264169639, + "learning_rate": 1.924792907164086e-05, + "loss": 0.9042, + "num_input_tokens_seen": 946239840, + "step": 5242 + }, + { + "epoch": 0.57396206792742, + "grad_norm": 1.248637505887521, + "learning_rate": 1.9239561481736183e-05, + "loss": 0.6339, + "num_input_tokens_seen": 946408512, + "step": 5243 + }, + { + "epoch": 0.5740715399983579, + "grad_norm": 1.2099363556982627, + "learning_rate": 1.9231194573280383e-05, + "loss": 0.8207, + "num_input_tokens_seen": 946589728, + "step": 5244 + }, + { + "epoch": 0.5741810120692958, + "grad_norm": 1.3051319393239824, + "learning_rate": 1.9222828347263222e-05, + "loss": 0.6888, + "num_input_tokens_seen": 946794688, + "step": 5245 + }, + { + "epoch": 0.5742904841402338, + "grad_norm": 1.276779190031564, + "learning_rate": 1.9214462804674425e-05, + "loss": 0.673, + "num_input_tokens_seen": 946992032, + "step": 5246 + }, + { + "epoch": 0.5743999562111716, + "grad_norm": 1.2261898179213424, + "learning_rate": 1.9206097946503625e-05, + "loss": 0.8339, + "num_input_tokens_seen": 947158688, + "step": 5247 + }, + { + "epoch": 0.5745094282821095, + "grad_norm": 1.2158412456625396, + "learning_rate": 1.9197733773740356e-05, + "loss": 0.7893, + "num_input_tokens_seen": 947330048, + "step": 5248 + }, + { + "epoch": 0.5746189003530474, + "grad_norm": 1.281903309688496, + "learning_rate": 1.91893702873741e-05, + "loss": 0.6079, + "num_input_tokens_seen": 947504320, + "step": 5249 + }, + { + "epoch": 0.5747283724239853, + "grad_norm": 1.3135698590239244, + "learning_rate": 1.918100748839422e-05, + "loss": 0.7136, + "num_input_tokens_seen": 947674336, + "step": 5250 + }, + { + "epoch": 0.5748378444949233, + "grad_norm": 1.3016958601132689, + "learning_rate": 1.9172645377790037e-05, + "loss": 0.7487, + "num_input_tokens_seen": 947841216, + "step": 5251 + }, + { + "epoch": 0.5749473165658612, + "grad_norm": 1.403615773102633, + "learning_rate": 1.916428395655076e-05, + "loss": 0.9854, + "num_input_tokens_seen": 948018848, + "step": 5252 + }, + { + "epoch": 0.575056788636799, + "grad_norm": 1.328121993117575, + "learning_rate": 1.915592322566553e-05, + "loss": 0.7018, + "num_input_tokens_seen": 948181248, + "step": 5253 + }, + { + "epoch": 0.5751662607077369, + "grad_norm": 1.2965652831354104, + "learning_rate": 1.9147563186123423e-05, + "loss": 0.8095, + "num_input_tokens_seen": 948360672, + "step": 5254 + }, + { + "epoch": 0.5752757327786748, + "grad_norm": 1.2146938022914233, + "learning_rate": 1.9139203838913394e-05, + "loss": 0.6982, + "num_input_tokens_seen": 948560032, + "step": 5255 + }, + { + "epoch": 0.5753852048496128, + "grad_norm": 1.1903871273603515, + "learning_rate": 1.913084518502436e-05, + "loss": 0.7193, + "num_input_tokens_seen": 948714368, + "step": 5256 + }, + { + "epoch": 0.5754946769205507, + "grad_norm": 1.1717394941187382, + "learning_rate": 1.9122487225445107e-05, + "loss": 0.7993, + "num_input_tokens_seen": 948903648, + "step": 5257 + }, + { + "epoch": 0.5756041489914886, + "grad_norm": 1.2769986774375, + "learning_rate": 1.911412996116439e-05, + "loss": 0.9461, + "num_input_tokens_seen": 949118464, + "step": 5258 + }, + { + "epoch": 0.5757136210624264, + "grad_norm": 1.4394152777366287, + "learning_rate": 1.9105773393170836e-05, + "loss": 0.8147, + "num_input_tokens_seen": 949303712, + "step": 5259 + }, + { + "epoch": 0.5758230931333643, + "grad_norm": 1.3309360151781953, + "learning_rate": 1.9097417522453023e-05, + "loss": 0.5687, + "num_input_tokens_seen": 949504416, + "step": 5260 + }, + { + "epoch": 0.5759325652043022, + "grad_norm": 1.1665987891932703, + "learning_rate": 1.9089062349999437e-05, + "loss": 0.6587, + "num_input_tokens_seen": 949696608, + "step": 5261 + }, + { + "epoch": 0.5760420372752402, + "grad_norm": 1.426042423104122, + "learning_rate": 1.9080707876798475e-05, + "loss": 0.7386, + "num_input_tokens_seen": 949838848, + "step": 5262 + }, + { + "epoch": 0.5761515093461781, + "grad_norm": 1.280791596076171, + "learning_rate": 1.9072354103838458e-05, + "loss": 0.5829, + "num_input_tokens_seen": 950006848, + "step": 5263 + }, + { + "epoch": 0.5762609814171159, + "grad_norm": 1.3763871132079697, + "learning_rate": 1.9064001032107612e-05, + "loss": 0.7869, + "num_input_tokens_seen": 950194784, + "step": 5264 + }, + { + "epoch": 0.5763704534880538, + "grad_norm": 1.3561273563324805, + "learning_rate": 1.9055648662594107e-05, + "loss": 0.6898, + "num_input_tokens_seen": 950376224, + "step": 5265 + }, + { + "epoch": 0.5764799255589917, + "grad_norm": 1.2870385204025083, + "learning_rate": 1.904729699628599e-05, + "loss": 0.7049, + "num_input_tokens_seen": 950536832, + "step": 5266 + }, + { + "epoch": 0.5765893976299297, + "grad_norm": 1.2162725258727969, + "learning_rate": 1.9038946034171258e-05, + "loss": 0.7944, + "num_input_tokens_seen": 950711328, + "step": 5267 + }, + { + "epoch": 0.5766988697008676, + "grad_norm": 1.2134987109962914, + "learning_rate": 1.903059577723783e-05, + "loss": 0.6113, + "num_input_tokens_seen": 950870816, + "step": 5268 + }, + { + "epoch": 0.5768083417718055, + "grad_norm": 1.2041532407933935, + "learning_rate": 1.9022246226473494e-05, + "loss": 0.6373, + "num_input_tokens_seen": 951029184, + "step": 5269 + }, + { + "epoch": 0.5769178138427433, + "grad_norm": 1.2743357199489294, + "learning_rate": 1.9013897382866013e-05, + "loss": 0.6858, + "num_input_tokens_seen": 951206816, + "step": 5270 + }, + { + "epoch": 0.5770272859136812, + "grad_norm": 1.2433151309152795, + "learning_rate": 1.900554924740302e-05, + "loss": 0.7019, + "num_input_tokens_seen": 951358016, + "step": 5271 + }, + { + "epoch": 0.5771367579846192, + "grad_norm": 1.2792104910302833, + "learning_rate": 1.8997201821072097e-05, + "loss": 0.6708, + "num_input_tokens_seen": 951534528, + "step": 5272 + }, + { + "epoch": 0.5772462300555571, + "grad_norm": 1.4785398131071315, + "learning_rate": 1.8988855104860718e-05, + "loss": 0.9943, + "num_input_tokens_seen": 951704320, + "step": 5273 + }, + { + "epoch": 0.577355702126495, + "grad_norm": 1.2350342354295711, + "learning_rate": 1.8980509099756287e-05, + "loss": 0.783, + "num_input_tokens_seen": 951898752, + "step": 5274 + }, + { + "epoch": 0.5774651741974329, + "grad_norm": 1.2809441364206688, + "learning_rate": 1.8972163806746132e-05, + "loss": 0.7669, + "num_input_tokens_seen": 952101920, + "step": 5275 + }, + { + "epoch": 0.5775746462683707, + "grad_norm": 1.4111479761701495, + "learning_rate": 1.8963819226817468e-05, + "loss": 0.5525, + "num_input_tokens_seen": 952306880, + "step": 5276 + }, + { + "epoch": 0.5776841183393087, + "grad_norm": 1.123609584314299, + "learning_rate": 1.895547536095746e-05, + "loss": 0.5393, + "num_input_tokens_seen": 952478464, + "step": 5277 + }, + { + "epoch": 0.5777935904102466, + "grad_norm": 1.374859932787868, + "learning_rate": 1.8947132210153144e-05, + "loss": 0.8122, + "num_input_tokens_seen": 952644672, + "step": 5278 + }, + { + "epoch": 0.5779030624811845, + "grad_norm": 1.2568765795226868, + "learning_rate": 1.8938789775391536e-05, + "loss": 0.6977, + "num_input_tokens_seen": 952846048, + "step": 5279 + }, + { + "epoch": 0.5780125345521224, + "grad_norm": 1.1908488486213338, + "learning_rate": 1.8930448057659497e-05, + "loss": 0.8444, + "num_input_tokens_seen": 953036896, + "step": 5280 + }, + { + "epoch": 0.5781220066230603, + "grad_norm": 1.4301438567354865, + "learning_rate": 1.892210705794385e-05, + "loss": 0.7312, + "num_input_tokens_seen": 953207584, + "step": 5281 + }, + { + "epoch": 0.5782314786939982, + "grad_norm": 1.2090011373661267, + "learning_rate": 1.8913766777231322e-05, + "loss": 0.7561, + "num_input_tokens_seen": 953407616, + "step": 5282 + }, + { + "epoch": 0.5783409507649361, + "grad_norm": 1.3871546100619738, + "learning_rate": 1.8905427216508554e-05, + "loss": 0.7085, + "num_input_tokens_seen": 953551424, + "step": 5283 + }, + { + "epoch": 0.578450422835874, + "grad_norm": 1.2069083833349408, + "learning_rate": 1.8897088376762094e-05, + "loss": 0.7052, + "num_input_tokens_seen": 953737792, + "step": 5284 + }, + { + "epoch": 0.5785598949068119, + "grad_norm": 1.2982064731323093, + "learning_rate": 1.8888750258978404e-05, + "loss": 0.8348, + "num_input_tokens_seen": 953907808, + "step": 5285 + }, + { + "epoch": 0.5786693669777498, + "grad_norm": 1.1238939888501598, + "learning_rate": 1.8880412864143886e-05, + "loss": 0.852, + "num_input_tokens_seen": 954133824, + "step": 5286 + }, + { + "epoch": 0.5787788390486877, + "grad_norm": 1.165471549071032, + "learning_rate": 1.887207619324482e-05, + "loss": 0.6797, + "num_input_tokens_seen": 954309664, + "step": 5287 + }, + { + "epoch": 0.5788883111196256, + "grad_norm": 1.1371518912404934, + "learning_rate": 1.8863740247267426e-05, + "loss": 0.6149, + "num_input_tokens_seen": 954479904, + "step": 5288 + }, + { + "epoch": 0.5789977831905635, + "grad_norm": 1.4534513045618922, + "learning_rate": 1.8855405027197838e-05, + "loss": 0.9868, + "num_input_tokens_seen": 954656640, + "step": 5289 + }, + { + "epoch": 0.5791072552615014, + "grad_norm": 1.1980839930338831, + "learning_rate": 1.884707053402208e-05, + "loss": 0.5796, + "num_input_tokens_seen": 954838976, + "step": 5290 + }, + { + "epoch": 0.5792167273324393, + "grad_norm": 1.4264590807694053, + "learning_rate": 1.8838736768726125e-05, + "loss": 0.7819, + "num_input_tokens_seen": 955006752, + "step": 5291 + }, + { + "epoch": 0.5793261994033773, + "grad_norm": 1.286444299701126, + "learning_rate": 1.8830403732295823e-05, + "loss": 0.9763, + "num_input_tokens_seen": 955198048, + "step": 5292 + }, + { + "epoch": 0.5794356714743151, + "grad_norm": 1.133657574319275, + "learning_rate": 1.8822071425716968e-05, + "loss": 0.85, + "num_input_tokens_seen": 955400768, + "step": 5293 + }, + { + "epoch": 0.579545143545253, + "grad_norm": 1.2793236183690848, + "learning_rate": 1.881373984997525e-05, + "loss": 0.9957, + "num_input_tokens_seen": 955584896, + "step": 5294 + }, + { + "epoch": 0.5796546156161909, + "grad_norm": 1.353687225957503, + "learning_rate": 1.880540900605628e-05, + "loss": 0.9051, + "num_input_tokens_seen": 955756480, + "step": 5295 + }, + { + "epoch": 0.5797640876871288, + "grad_norm": 1.2526252358846248, + "learning_rate": 1.8797078894945596e-05, + "loss": 0.7964, + "num_input_tokens_seen": 955926272, + "step": 5296 + }, + { + "epoch": 0.5798735597580668, + "grad_norm": 1.3286560791928523, + "learning_rate": 1.8788749517628606e-05, + "loss": 0.8012, + "num_input_tokens_seen": 956122944, + "step": 5297 + }, + { + "epoch": 0.5799830318290047, + "grad_norm": 1.3033292633165452, + "learning_rate": 1.878042087509069e-05, + "loss": 0.6837, + "num_input_tokens_seen": 956307968, + "step": 5298 + }, + { + "epoch": 0.5800925038999425, + "grad_norm": 1.1389982563839394, + "learning_rate": 1.877209296831708e-05, + "loss": 0.6519, + "num_input_tokens_seen": 956492544, + "step": 5299 + }, + { + "epoch": 0.5802019759708804, + "grad_norm": 1.1855684935155677, + "learning_rate": 1.8763765798292966e-05, + "loss": 0.6671, + "num_input_tokens_seen": 956669280, + "step": 5300 + }, + { + "epoch": 0.5803114480418183, + "grad_norm": 1.0968014792992309, + "learning_rate": 1.8755439366003448e-05, + "loss": 0.71, + "num_input_tokens_seen": 956855872, + "step": 5301 + }, + { + "epoch": 0.5804209201127563, + "grad_norm": 1.227830802099495, + "learning_rate": 1.8747113672433505e-05, + "loss": 0.7924, + "num_input_tokens_seen": 957044928, + "step": 5302 + }, + { + "epoch": 0.5805303921836942, + "grad_norm": 1.1060434202428326, + "learning_rate": 1.8738788718568066e-05, + "loss": 0.7529, + "num_input_tokens_seen": 957248320, + "step": 5303 + }, + { + "epoch": 0.580639864254632, + "grad_norm": 1.08002271734293, + "learning_rate": 1.8730464505391953e-05, + "loss": 0.72, + "num_input_tokens_seen": 957406688, + "step": 5304 + }, + { + "epoch": 0.5807493363255699, + "grad_norm": 1.309041598534021, + "learning_rate": 1.8722141033889904e-05, + "loss": 0.7287, + "num_input_tokens_seen": 957584320, + "step": 5305 + }, + { + "epoch": 0.5808588083965078, + "grad_norm": 1.2654778665802273, + "learning_rate": 1.8713818305046566e-05, + "loss": 0.8282, + "num_input_tokens_seen": 957754560, + "step": 5306 + }, + { + "epoch": 0.5809682804674458, + "grad_norm": 1.2623946325314932, + "learning_rate": 1.87054963198465e-05, + "loss": 0.8704, + "num_input_tokens_seen": 957944064, + "step": 5307 + }, + { + "epoch": 0.5810777525383837, + "grad_norm": 1.1258960440333334, + "learning_rate": 1.86971750792742e-05, + "loss": 0.7101, + "num_input_tokens_seen": 958129760, + "step": 5308 + }, + { + "epoch": 0.5811872246093216, + "grad_norm": 1.2306948499486725, + "learning_rate": 1.8688854584314028e-05, + "loss": 0.8386, + "num_input_tokens_seen": 958335392, + "step": 5309 + }, + { + "epoch": 0.5812966966802594, + "grad_norm": 1.1643687619806462, + "learning_rate": 1.8680534835950302e-05, + "loss": 0.6684, + "num_input_tokens_seen": 958512352, + "step": 5310 + }, + { + "epoch": 0.5814061687511973, + "grad_norm": 1.2482457248791021, + "learning_rate": 1.8672215835167217e-05, + "loss": 0.7258, + "num_input_tokens_seen": 958689312, + "step": 5311 + }, + { + "epoch": 0.5815156408221352, + "grad_norm": 1.1098470623746193, + "learning_rate": 1.8663897582948912e-05, + "loss": 0.6262, + "num_input_tokens_seen": 958854624, + "step": 5312 + }, + { + "epoch": 0.5816251128930732, + "grad_norm": 1.36563440664443, + "learning_rate": 1.86555800802794e-05, + "loss": 0.9159, + "num_input_tokens_seen": 959038976, + "step": 5313 + }, + { + "epoch": 0.5817345849640111, + "grad_norm": 1.2183926376298169, + "learning_rate": 1.864726332814264e-05, + "loss": 0.7262, + "num_input_tokens_seen": 959190624, + "step": 5314 + }, + { + "epoch": 0.581844057034949, + "grad_norm": 1.246220358034045, + "learning_rate": 1.863894732752248e-05, + "loss": 0.635, + "num_input_tokens_seen": 959370048, + "step": 5315 + }, + { + "epoch": 0.5819535291058868, + "grad_norm": 1.317997454662499, + "learning_rate": 1.8630632079402693e-05, + "loss": 0.5877, + "num_input_tokens_seen": 959542304, + "step": 5316 + }, + { + "epoch": 0.5820630011768247, + "grad_norm": 1.1692265400925883, + "learning_rate": 1.8622317584766962e-05, + "loss": 0.6888, + "num_input_tokens_seen": 959735616, + "step": 5317 + }, + { + "epoch": 0.5821724732477627, + "grad_norm": 1.2245193277985535, + "learning_rate": 1.861400384459886e-05, + "loss": 0.682, + "num_input_tokens_seen": 959901376, + "step": 5318 + }, + { + "epoch": 0.5822819453187006, + "grad_norm": 1.4632774014888243, + "learning_rate": 1.86056908598819e-05, + "loss": 1.0569, + "num_input_tokens_seen": 960116416, + "step": 5319 + }, + { + "epoch": 0.5823914173896385, + "grad_norm": 1.2799088041034776, + "learning_rate": 1.8597378631599484e-05, + "loss": 0.8573, + "num_input_tokens_seen": 960311520, + "step": 5320 + }, + { + "epoch": 0.5825008894605763, + "grad_norm": 1.3119162214907982, + "learning_rate": 1.8589067160734935e-05, + "loss": 0.6862, + "num_input_tokens_seen": 960470336, + "step": 5321 + }, + { + "epoch": 0.5826103615315142, + "grad_norm": 1.1827014543419396, + "learning_rate": 1.8580756448271496e-05, + "loss": 0.7469, + "num_input_tokens_seen": 960672384, + "step": 5322 + }, + { + "epoch": 0.5827198336024522, + "grad_norm": 1.2754661455914087, + "learning_rate": 1.8572446495192288e-05, + "loss": 0.8786, + "num_input_tokens_seen": 960886304, + "step": 5323 + }, + { + "epoch": 0.5828293056733901, + "grad_norm": 1.2216929092728523, + "learning_rate": 1.8564137302480373e-05, + "loss": 0.711, + "num_input_tokens_seen": 961071776, + "step": 5324 + }, + { + "epoch": 0.582938777744328, + "grad_norm": 1.2394147913283022, + "learning_rate": 1.8555828871118715e-05, + "loss": 0.8633, + "num_input_tokens_seen": 961231040, + "step": 5325 + }, + { + "epoch": 0.5830482498152659, + "grad_norm": 1.2604881875840084, + "learning_rate": 1.8547521202090178e-05, + "loss": 0.8133, + "num_input_tokens_seen": 961409792, + "step": 5326 + }, + { + "epoch": 0.5831577218862037, + "grad_norm": 1.3160320621776296, + "learning_rate": 1.8539214296377545e-05, + "loss": 0.7706, + "num_input_tokens_seen": 961573760, + "step": 5327 + }, + { + "epoch": 0.5832671939571417, + "grad_norm": 1.101472404529114, + "learning_rate": 1.853090815496351e-05, + "loss": 0.702, + "num_input_tokens_seen": 961764384, + "step": 5328 + }, + { + "epoch": 0.5833766660280796, + "grad_norm": 1.276172485911886, + "learning_rate": 1.8522602778830688e-05, + "loss": 0.9349, + "num_input_tokens_seen": 961950080, + "step": 5329 + }, + { + "epoch": 0.5834861380990175, + "grad_norm": 1.2062317011734909, + "learning_rate": 1.851429816896156e-05, + "loss": 0.6777, + "num_input_tokens_seen": 962140256, + "step": 5330 + }, + { + "epoch": 0.5835956101699554, + "grad_norm": 1.289236022367485, + "learning_rate": 1.850599432633857e-05, + "loss": 0.6275, + "num_input_tokens_seen": 962322368, + "step": 5331 + }, + { + "epoch": 0.5837050822408933, + "grad_norm": 1.1207432747340638, + "learning_rate": 1.8497691251944027e-05, + "loss": 0.6795, + "num_input_tokens_seen": 962516576, + "step": 5332 + }, + { + "epoch": 0.5838145543118312, + "grad_norm": 1.1480374320603324, + "learning_rate": 1.848938894676019e-05, + "loss": 0.7046, + "num_input_tokens_seen": 962704288, + "step": 5333 + }, + { + "epoch": 0.5839240263827691, + "grad_norm": 1.1798182029822721, + "learning_rate": 1.8481087411769187e-05, + "loss": 0.5481, + "num_input_tokens_seen": 962881248, + "step": 5334 + }, + { + "epoch": 0.584033498453707, + "grad_norm": 1.3313446065033516, + "learning_rate": 1.8472786647953078e-05, + "loss": 0.6449, + "num_input_tokens_seen": 963031552, + "step": 5335 + }, + { + "epoch": 0.5841429705246449, + "grad_norm": 1.3301253569474953, + "learning_rate": 1.8464486656293834e-05, + "loss": 1.0847, + "num_input_tokens_seen": 963210528, + "step": 5336 + }, + { + "epoch": 0.5842524425955828, + "grad_norm": 1.3787370529327776, + "learning_rate": 1.845618743777332e-05, + "loss": 0.6754, + "num_input_tokens_seen": 963368672, + "step": 5337 + }, + { + "epoch": 0.5843619146665207, + "grad_norm": 1.1893620197752501, + "learning_rate": 1.8447888993373336e-05, + "loss": 0.8798, + "num_input_tokens_seen": 963577440, + "step": 5338 + }, + { + "epoch": 0.5844713867374586, + "grad_norm": 1.0913295120862903, + "learning_rate": 1.8439591324075545e-05, + "loss": 0.6739, + "num_input_tokens_seen": 963768064, + "step": 5339 + }, + { + "epoch": 0.5845808588083965, + "grad_norm": 1.226415783313757, + "learning_rate": 1.8431294430861572e-05, + "loss": 0.6251, + "num_input_tokens_seen": 963930464, + "step": 5340 + }, + { + "epoch": 0.5846903308793344, + "grad_norm": 1.205037219869601, + "learning_rate": 1.8422998314712897e-05, + "loss": 0.8172, + "num_input_tokens_seen": 964114816, + "step": 5341 + }, + { + "epoch": 0.5847998029502723, + "grad_norm": 1.3453925096775423, + "learning_rate": 1.841470297661095e-05, + "loss": 0.885, + "num_input_tokens_seen": 964317984, + "step": 5342 + }, + { + "epoch": 0.5849092750212103, + "grad_norm": 1.1153679281726852, + "learning_rate": 1.8406408417537064e-05, + "loss": 0.9545, + "num_input_tokens_seen": 964534816, + "step": 5343 + }, + { + "epoch": 0.5850187470921481, + "grad_norm": 1.3104453580512192, + "learning_rate": 1.8398114638472444e-05, + "loss": 0.5781, + "num_input_tokens_seen": 964693632, + "step": 5344 + }, + { + "epoch": 0.585128219163086, + "grad_norm": 1.2689390238999025, + "learning_rate": 1.8389821640398245e-05, + "loss": 0.9383, + "num_input_tokens_seen": 964883808, + "step": 5345 + }, + { + "epoch": 0.5852376912340239, + "grad_norm": 1.2978241905385794, + "learning_rate": 1.8381529424295516e-05, + "loss": 0.7277, + "num_input_tokens_seen": 965054048, + "step": 5346 + }, + { + "epoch": 0.5853471633049618, + "grad_norm": 1.3334305201317156, + "learning_rate": 1.8373237991145202e-05, + "loss": 0.8436, + "num_input_tokens_seen": 965228768, + "step": 5347 + }, + { + "epoch": 0.5854566353758998, + "grad_norm": 1.2536117350441067, + "learning_rate": 1.8364947341928156e-05, + "loss": 0.8186, + "num_input_tokens_seen": 965400576, + "step": 5348 + }, + { + "epoch": 0.5855661074468377, + "grad_norm": 1.2737070746109806, + "learning_rate": 1.8356657477625157e-05, + "loss": 0.7624, + "num_input_tokens_seen": 965569024, + "step": 5349 + }, + { + "epoch": 0.5856755795177755, + "grad_norm": 1.186809590060422, + "learning_rate": 1.8348368399216892e-05, + "loss": 0.9171, + "num_input_tokens_seen": 965762560, + "step": 5350 + }, + { + "epoch": 0.5857850515887134, + "grad_norm": 1.3539560545462757, + "learning_rate": 1.8340080107683915e-05, + "loss": 0.7223, + "num_input_tokens_seen": 965944000, + "step": 5351 + }, + { + "epoch": 0.5858945236596513, + "grad_norm": 1.2065553189814593, + "learning_rate": 1.833179260400674e-05, + "loss": 0.6672, + "num_input_tokens_seen": 966109984, + "step": 5352 + }, + { + "epoch": 0.5860039957305893, + "grad_norm": 1.372949241325063, + "learning_rate": 1.8323505889165747e-05, + "loss": 0.8184, + "num_input_tokens_seen": 966275072, + "step": 5353 + }, + { + "epoch": 0.5861134678015272, + "grad_norm": 1.4776396682484938, + "learning_rate": 1.831521996414125e-05, + "loss": 0.7877, + "num_input_tokens_seen": 966441056, + "step": 5354 + }, + { + "epoch": 0.586222939872465, + "grad_norm": 1.227554843166888, + "learning_rate": 1.8306934829913448e-05, + "loss": 0.6966, + "num_input_tokens_seen": 966613312, + "step": 5355 + }, + { + "epoch": 0.5863324119434029, + "grad_norm": 1.1674308431265967, + "learning_rate": 1.8298650487462455e-05, + "loss": 0.7099, + "num_input_tokens_seen": 966833504, + "step": 5356 + }, + { + "epoch": 0.5864418840143408, + "grad_norm": 1.3328603809695847, + "learning_rate": 1.8290366937768306e-05, + "loss": 1.0304, + "num_input_tokens_seen": 967043392, + "step": 5357 + }, + { + "epoch": 0.5865513560852788, + "grad_norm": 1.2706780052106141, + "learning_rate": 1.8282084181810915e-05, + "loss": 0.8782, + "num_input_tokens_seen": 967259104, + "step": 5358 + }, + { + "epoch": 0.5866608281562167, + "grad_norm": 1.2746307773412393, + "learning_rate": 1.827380222057013e-05, + "loss": 0.6825, + "num_input_tokens_seen": 967461600, + "step": 5359 + }, + { + "epoch": 0.5867703002271546, + "grad_norm": 1.1461794186059726, + "learning_rate": 1.8265521055025677e-05, + "loss": 0.5396, + "num_input_tokens_seen": 967644832, + "step": 5360 + }, + { + "epoch": 0.5868797722980924, + "grad_norm": 1.330068579820169, + "learning_rate": 1.825724068615721e-05, + "loss": 0.9616, + "num_input_tokens_seen": 967839264, + "step": 5361 + }, + { + "epoch": 0.5869892443690303, + "grad_norm": 1.1331111645787582, + "learning_rate": 1.824896111494429e-05, + "loss": 0.6433, + "num_input_tokens_seen": 967995616, + "step": 5362 + }, + { + "epoch": 0.5870987164399682, + "grad_norm": 1.3319753148480733, + "learning_rate": 1.8240682342366354e-05, + "loss": 0.7917, + "num_input_tokens_seen": 968177056, + "step": 5363 + }, + { + "epoch": 0.5872081885109062, + "grad_norm": 1.2787904159110388, + "learning_rate": 1.8232404369402784e-05, + "loss": 0.6896, + "num_input_tokens_seen": 968325344, + "step": 5364 + }, + { + "epoch": 0.5873176605818441, + "grad_norm": 1.1877088864966492, + "learning_rate": 1.822412719703283e-05, + "loss": 0.84, + "num_input_tokens_seen": 968509248, + "step": 5365 + }, + { + "epoch": 0.587427132652782, + "grad_norm": 1.2458330318148618, + "learning_rate": 1.8215850826235682e-05, + "loss": 0.7646, + "num_input_tokens_seen": 968724288, + "step": 5366 + }, + { + "epoch": 0.5875366047237198, + "grad_norm": 1.2944926245204118, + "learning_rate": 1.820757525799041e-05, + "loss": 0.8142, + "num_input_tokens_seen": 968907968, + "step": 5367 + }, + { + "epoch": 0.5876460767946577, + "grad_norm": 1.3990487835185934, + "learning_rate": 1.8199300493275993e-05, + "loss": 0.8325, + "num_input_tokens_seen": 969105088, + "step": 5368 + }, + { + "epoch": 0.5877555488655957, + "grad_norm": 1.2744844497594028, + "learning_rate": 1.8191026533071336e-05, + "loss": 0.8287, + "num_input_tokens_seen": 969294592, + "step": 5369 + }, + { + "epoch": 0.5878650209365336, + "grad_norm": 1.3787252507275383, + "learning_rate": 1.8182753378355218e-05, + "loss": 0.884, + "num_input_tokens_seen": 969442880, + "step": 5370 + }, + { + "epoch": 0.5879744930074715, + "grad_norm": 1.357960765544333, + "learning_rate": 1.817448103010635e-05, + "loss": 1.0454, + "num_input_tokens_seen": 969619392, + "step": 5371 + }, + { + "epoch": 0.5880839650784093, + "grad_norm": 1.322678201564231, + "learning_rate": 1.816620948930332e-05, + "loss": 0.9974, + "num_input_tokens_seen": 969809568, + "step": 5372 + }, + { + "epoch": 0.5881934371493472, + "grad_norm": 1.2358826230835418, + "learning_rate": 1.8157938756924656e-05, + "loss": 0.7218, + "num_input_tokens_seen": 969978912, + "step": 5373 + }, + { + "epoch": 0.5883029092202852, + "grad_norm": 1.2282156039685574, + "learning_rate": 1.8149668833948747e-05, + "loss": 0.8229, + "num_input_tokens_seen": 970169760, + "step": 5374 + }, + { + "epoch": 0.5884123812912231, + "grad_norm": 1.2773891055952786, + "learning_rate": 1.8141399721353915e-05, + "loss": 0.9436, + "num_input_tokens_seen": 970359040, + "step": 5375 + }, + { + "epoch": 0.588521853362161, + "grad_norm": 1.303055707886829, + "learning_rate": 1.81331314201184e-05, + "loss": 0.7799, + "num_input_tokens_seen": 970564448, + "step": 5376 + }, + { + "epoch": 0.5886313254330989, + "grad_norm": 1.1902345492716837, + "learning_rate": 1.8124863931220293e-05, + "loss": 0.7778, + "num_input_tokens_seen": 970747232, + "step": 5377 + }, + { + "epoch": 0.5887407975040367, + "grad_norm": 1.3968569385291103, + "learning_rate": 1.811659725563765e-05, + "loss": 0.7139, + "num_input_tokens_seen": 970927552, + "step": 5378 + }, + { + "epoch": 0.5888502695749747, + "grad_norm": 1.302483484452695, + "learning_rate": 1.8108331394348388e-05, + "loss": 0.7163, + "num_input_tokens_seen": 971126464, + "step": 5379 + }, + { + "epoch": 0.5889597416459126, + "grad_norm": 1.2196526730271473, + "learning_rate": 1.8100066348330356e-05, + "loss": 0.8249, + "num_input_tokens_seen": 971299840, + "step": 5380 + }, + { + "epoch": 0.5890692137168505, + "grad_norm": 1.228284222713649, + "learning_rate": 1.8091802118561272e-05, + "loss": 0.6595, + "num_input_tokens_seen": 971497632, + "step": 5381 + }, + { + "epoch": 0.5891786857877884, + "grad_norm": 1.12934977652624, + "learning_rate": 1.80835387060188e-05, + "loss": 0.7831, + "num_input_tokens_seen": 971697664, + "step": 5382 + }, + { + "epoch": 0.5892881578587263, + "grad_norm": 1.2670904552774296, + "learning_rate": 1.8075276111680478e-05, + "loss": 0.8302, + "num_input_tokens_seen": 971884928, + "step": 5383 + }, + { + "epoch": 0.5893976299296642, + "grad_norm": 1.2096103409658152, + "learning_rate": 1.806701433652375e-05, + "loss": 0.7442, + "num_input_tokens_seen": 972069728, + "step": 5384 + }, + { + "epoch": 0.5895071020006021, + "grad_norm": 1.2617615967080251, + "learning_rate": 1.805875338152598e-05, + "loss": 0.673, + "num_input_tokens_seen": 972274688, + "step": 5385 + }, + { + "epoch": 0.58961657407154, + "grad_norm": 1.3112798196112927, + "learning_rate": 1.8050493247664404e-05, + "loss": 0.8682, + "num_input_tokens_seen": 972455456, + "step": 5386 + }, + { + "epoch": 0.5897260461424779, + "grad_norm": 1.5071253816860881, + "learning_rate": 1.80422339359162e-05, + "loss": 1.0582, + "num_input_tokens_seen": 972633760, + "step": 5387 + }, + { + "epoch": 0.5898355182134158, + "grad_norm": 1.3773545133329181, + "learning_rate": 1.8033975447258416e-05, + "loss": 0.77, + "num_input_tokens_seen": 972775104, + "step": 5388 + }, + { + "epoch": 0.5899449902843537, + "grad_norm": 1.1572601957302615, + "learning_rate": 1.802571778266802e-05, + "loss": 0.5463, + "num_input_tokens_seen": 972933472, + "step": 5389 + }, + { + "epoch": 0.5900544623552916, + "grad_norm": 1.3083366483091567, + "learning_rate": 1.8017460943121878e-05, + "loss": 0.7448, + "num_input_tokens_seen": 973131488, + "step": 5390 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 1.1750470781025952, + "learning_rate": 1.8009204929596757e-05, + "loss": 0.6134, + "num_input_tokens_seen": 973299712, + "step": 5391 + }, + { + "epoch": 0.5902734064971674, + "grad_norm": 1.3222240627411357, + "learning_rate": 1.8000949743069336e-05, + "loss": 0.8962, + "num_input_tokens_seen": 973499072, + "step": 5392 + }, + { + "epoch": 0.5903828785681053, + "grad_norm": 1.258306370797878, + "learning_rate": 1.7992695384516174e-05, + "loss": 0.713, + "num_input_tokens_seen": 973654304, + "step": 5393 + }, + { + "epoch": 0.5904923506390433, + "grad_norm": 1.3926842262542187, + "learning_rate": 1.7984441854913758e-05, + "loss": 0.9427, + "num_input_tokens_seen": 973814016, + "step": 5394 + }, + { + "epoch": 0.5906018227099811, + "grad_norm": 1.26842537471204, + "learning_rate": 1.7976189155238448e-05, + "loss": 0.6623, + "num_input_tokens_seen": 973995008, + "step": 5395 + }, + { + "epoch": 0.590711294780919, + "grad_norm": 1.4504918678263492, + "learning_rate": 1.796793728646654e-05, + "loss": 0.9194, + "num_input_tokens_seen": 974191904, + "step": 5396 + }, + { + "epoch": 0.5908207668518569, + "grad_norm": 1.2949833229640704, + "learning_rate": 1.7959686249574214e-05, + "loss": 0.8079, + "num_input_tokens_seen": 974402688, + "step": 5397 + }, + { + "epoch": 0.5909302389227948, + "grad_norm": 1.200644872164357, + "learning_rate": 1.7951436045537536e-05, + "loss": 0.5146, + "num_input_tokens_seen": 974560160, + "step": 5398 + }, + { + "epoch": 0.5910397109937328, + "grad_norm": 1.4589748152454125, + "learning_rate": 1.7943186675332504e-05, + "loss": 0.9154, + "num_input_tokens_seen": 974739360, + "step": 5399 + }, + { + "epoch": 0.5911491830646707, + "grad_norm": 1.346490814928414, + "learning_rate": 1.7934938139934993e-05, + "loss": 0.7965, + "num_input_tokens_seen": 974869504, + "step": 5400 + }, + { + "epoch": 0.5912586551356085, + "grad_norm": 1.1140994628937826, + "learning_rate": 1.7926690440320803e-05, + "loss": 0.6232, + "num_input_tokens_seen": 975040640, + "step": 5401 + }, + { + "epoch": 0.5913681272065464, + "grad_norm": 1.2371654115975685, + "learning_rate": 1.7918443577465605e-05, + "loss": 0.7882, + "num_input_tokens_seen": 975214464, + "step": 5402 + }, + { + "epoch": 0.5914775992774843, + "grad_norm": 1.2632156798567928, + "learning_rate": 1.7910197552344993e-05, + "loss": 0.8397, + "num_input_tokens_seen": 975398368, + "step": 5403 + }, + { + "epoch": 0.5915870713484223, + "grad_norm": 1.2127734141451618, + "learning_rate": 1.7901952365934467e-05, + "loss": 0.854, + "num_input_tokens_seen": 975589216, + "step": 5404 + }, + { + "epoch": 0.5916965434193602, + "grad_norm": 1.326306343283991, + "learning_rate": 1.78937080192094e-05, + "loss": 0.7249, + "num_input_tokens_seen": 975723168, + "step": 5405 + }, + { + "epoch": 0.591806015490298, + "grad_norm": 1.2897409214410844, + "learning_rate": 1.7885464513145094e-05, + "loss": 0.8784, + "num_input_tokens_seen": 975884896, + "step": 5406 + }, + { + "epoch": 0.5919154875612359, + "grad_norm": 1.2183238042327433, + "learning_rate": 1.7877221848716726e-05, + "loss": 0.7616, + "num_input_tokens_seen": 976095232, + "step": 5407 + }, + { + "epoch": 0.5920249596321738, + "grad_norm": 1.3297619324772894, + "learning_rate": 1.7868980026899405e-05, + "loss": 0.8733, + "num_input_tokens_seen": 976256512, + "step": 5408 + }, + { + "epoch": 0.5921344317031118, + "grad_norm": 1.3481374130420327, + "learning_rate": 1.786073904866811e-05, + "loss": 0.7395, + "num_input_tokens_seen": 976448480, + "step": 5409 + }, + { + "epoch": 0.5922439037740497, + "grad_norm": 1.2611379319941194, + "learning_rate": 1.785249891499774e-05, + "loss": 0.9165, + "num_input_tokens_seen": 976637536, + "step": 5410 + }, + { + "epoch": 0.5923533758449876, + "grad_norm": 1.1836187781781637, + "learning_rate": 1.7844259626863083e-05, + "loss": 0.7528, + "num_input_tokens_seen": 976831744, + "step": 5411 + }, + { + "epoch": 0.5924628479159254, + "grad_norm": 1.3355271004626792, + "learning_rate": 1.783602118523883e-05, + "loss": 0.8915, + "num_input_tokens_seen": 977008928, + "step": 5412 + }, + { + "epoch": 0.5925723199868633, + "grad_norm": 1.3647817753748936, + "learning_rate": 1.7827783591099583e-05, + "loss": 0.6286, + "num_input_tokens_seen": 977186784, + "step": 5413 + }, + { + "epoch": 0.5926817920578012, + "grad_norm": 1.1193726517116467, + "learning_rate": 1.781954684541982e-05, + "loss": 0.6468, + "num_input_tokens_seen": 977389952, + "step": 5414 + }, + { + "epoch": 0.5927912641287392, + "grad_norm": 1.2960362307229052, + "learning_rate": 1.7811310949173947e-05, + "loss": 0.8473, + "num_input_tokens_seen": 977569824, + "step": 5415 + }, + { + "epoch": 0.5929007361996771, + "grad_norm": 1.3328274224039611, + "learning_rate": 1.7803075903336238e-05, + "loss": 0.8091, + "num_input_tokens_seen": 977736928, + "step": 5416 + }, + { + "epoch": 0.593010208270615, + "grad_norm": 1.194430505809234, + "learning_rate": 1.7794841708880888e-05, + "loss": 0.8207, + "num_input_tokens_seen": 977954432, + "step": 5417 + }, + { + "epoch": 0.5931196803415528, + "grad_norm": 1.302610788960219, + "learning_rate": 1.7786608366782002e-05, + "loss": 0.9478, + "num_input_tokens_seen": 978171040, + "step": 5418 + }, + { + "epoch": 0.5932291524124907, + "grad_norm": 1.145369551461972, + "learning_rate": 1.7778375878013547e-05, + "loss": 0.6887, + "num_input_tokens_seen": 978349120, + "step": 5419 + }, + { + "epoch": 0.5933386244834287, + "grad_norm": 1.225464882514412, + "learning_rate": 1.7770144243549425e-05, + "loss": 0.5265, + "num_input_tokens_seen": 978545568, + "step": 5420 + }, + { + "epoch": 0.5934480965543666, + "grad_norm": 1.313596676010661, + "learning_rate": 1.7761913464363413e-05, + "loss": 0.7012, + "num_input_tokens_seen": 978732384, + "step": 5421 + }, + { + "epoch": 0.5935575686253045, + "grad_norm": 1.237809485451317, + "learning_rate": 1.7753683541429212e-05, + "loss": 0.8515, + "num_input_tokens_seen": 978922112, + "step": 5422 + }, + { + "epoch": 0.5936670406962423, + "grad_norm": 1.313673675381609, + "learning_rate": 1.7745454475720387e-05, + "loss": 0.6774, + "num_input_tokens_seen": 979108256, + "step": 5423 + }, + { + "epoch": 0.5937765127671802, + "grad_norm": 1.231142035461054, + "learning_rate": 1.7737226268210423e-05, + "loss": 0.6143, + "num_input_tokens_seen": 979285888, + "step": 5424 + }, + { + "epoch": 0.5938859848381182, + "grad_norm": 1.2834968869327943, + "learning_rate": 1.7728998919872723e-05, + "loss": 0.9674, + "num_input_tokens_seen": 979485248, + "step": 5425 + }, + { + "epoch": 0.5939954569090561, + "grad_norm": 1.296417845156575, + "learning_rate": 1.7720772431680538e-05, + "loss": 0.8077, + "num_input_tokens_seen": 979658176, + "step": 5426 + }, + { + "epoch": 0.594104928979994, + "grad_norm": 1.2731693664221202, + "learning_rate": 1.771254680460707e-05, + "loss": 0.6688, + "num_input_tokens_seen": 979855520, + "step": 5427 + }, + { + "epoch": 0.5942144010509319, + "grad_norm": 1.4557112781970094, + "learning_rate": 1.770432203962537e-05, + "loss": 0.8152, + "num_input_tokens_seen": 980028672, + "step": 5428 + }, + { + "epoch": 0.5943238731218697, + "grad_norm": 1.344499645050104, + "learning_rate": 1.7696098137708434e-05, + "loss": 0.91, + "num_input_tokens_seen": 980241024, + "step": 5429 + }, + { + "epoch": 0.5944333451928077, + "grad_norm": 1.2113246175892969, + "learning_rate": 1.7687875099829127e-05, + "loss": 0.6866, + "num_input_tokens_seen": 980414624, + "step": 5430 + }, + { + "epoch": 0.5945428172637456, + "grad_norm": 1.2928345947738131, + "learning_rate": 1.767965292696021e-05, + "loss": 0.6133, + "num_input_tokens_seen": 980602784, + "step": 5431 + }, + { + "epoch": 0.5946522893346835, + "grad_norm": 1.4099363541273129, + "learning_rate": 1.7671431620074365e-05, + "loss": 0.789, + "num_input_tokens_seen": 980753984, + "step": 5432 + }, + { + "epoch": 0.5947617614056214, + "grad_norm": 1.4191329002432636, + "learning_rate": 1.7663211180144146e-05, + "loss": 1.0414, + "num_input_tokens_seen": 980930048, + "step": 5433 + }, + { + "epoch": 0.5948712334765593, + "grad_norm": 1.4314685005082675, + "learning_rate": 1.7654991608142024e-05, + "loss": 0.7856, + "num_input_tokens_seen": 981098496, + "step": 5434 + }, + { + "epoch": 0.5949807055474972, + "grad_norm": 1.405777364354256, + "learning_rate": 1.7646772905040347e-05, + "loss": 0.825, + "num_input_tokens_seen": 981279264, + "step": 5435 + }, + { + "epoch": 0.5950901776184351, + "grad_norm": 1.407046495150635, + "learning_rate": 1.7638555071811373e-05, + "loss": 0.8719, + "num_input_tokens_seen": 981448384, + "step": 5436 + }, + { + "epoch": 0.595199649689373, + "grad_norm": 1.1789759327775375, + "learning_rate": 1.763033810942728e-05, + "loss": 0.6106, + "num_input_tokens_seen": 981624672, + "step": 5437 + }, + { + "epoch": 0.5953091217603109, + "grad_norm": 1.2374633590976554, + "learning_rate": 1.7622122018860082e-05, + "loss": 0.7378, + "num_input_tokens_seen": 981779008, + "step": 5438 + }, + { + "epoch": 0.5954185938312488, + "grad_norm": 1.175013345277361, + "learning_rate": 1.761390680108176e-05, + "loss": 0.6691, + "num_input_tokens_seen": 981948128, + "step": 5439 + }, + { + "epoch": 0.5955280659021867, + "grad_norm": 1.1829554785076084, + "learning_rate": 1.7605692457064133e-05, + "loss": 0.6119, + "num_input_tokens_seen": 982133600, + "step": 5440 + }, + { + "epoch": 0.5956375379731246, + "grad_norm": 1.211177875902799, + "learning_rate": 1.7597478987778956e-05, + "loss": 0.7315, + "num_input_tokens_seen": 982314816, + "step": 5441 + }, + { + "epoch": 0.5957470100440625, + "grad_norm": 1.2671036736058843, + "learning_rate": 1.758926639419786e-05, + "loss": 0.8267, + "num_input_tokens_seen": 982511040, + "step": 5442 + }, + { + "epoch": 0.5958564821150004, + "grad_norm": 1.17799614927702, + "learning_rate": 1.7581054677292387e-05, + "loss": 0.7293, + "num_input_tokens_seen": 982694272, + "step": 5443 + }, + { + "epoch": 0.5959659541859383, + "grad_norm": 1.2439224794496102, + "learning_rate": 1.7572843838033964e-05, + "loss": 0.8918, + "num_input_tokens_seen": 982891840, + "step": 5444 + }, + { + "epoch": 0.5960754262568763, + "grad_norm": 1.2652471943906902, + "learning_rate": 1.756463387739391e-05, + "loss": 0.8333, + "num_input_tokens_seen": 983074176, + "step": 5445 + }, + { + "epoch": 0.5961848983278141, + "grad_norm": 1.1944803071121242, + "learning_rate": 1.7556424796343462e-05, + "loss": 0.6789, + "num_input_tokens_seen": 983230976, + "step": 5446 + }, + { + "epoch": 0.596294370398752, + "grad_norm": 1.1465991091140701, + "learning_rate": 1.754821659585373e-05, + "loss": 0.8542, + "num_input_tokens_seen": 983404800, + "step": 5447 + }, + { + "epoch": 0.5964038424696899, + "grad_norm": 1.241701570869405, + "learning_rate": 1.754000927689573e-05, + "loss": 0.7352, + "num_input_tokens_seen": 983607296, + "step": 5448 + }, + { + "epoch": 0.5965133145406278, + "grad_norm": 1.2268784717388266, + "learning_rate": 1.7531802840440364e-05, + "loss": 0.5752, + "num_input_tokens_seen": 983796352, + "step": 5449 + }, + { + "epoch": 0.5966227866115658, + "grad_norm": 1.3983588169914543, + "learning_rate": 1.7523597287458447e-05, + "loss": 0.8487, + "num_input_tokens_seen": 983955616, + "step": 5450 + }, + { + "epoch": 0.5967322586825037, + "grad_norm": 1.4307817117795067, + "learning_rate": 1.7515392618920686e-05, + "loss": 0.7848, + "num_input_tokens_seen": 984128768, + "step": 5451 + }, + { + "epoch": 0.5968417307534415, + "grad_norm": 1.227641317619757, + "learning_rate": 1.750718883579766e-05, + "loss": 0.751, + "num_input_tokens_seen": 984304832, + "step": 5452 + }, + { + "epoch": 0.5969512028243794, + "grad_norm": 1.0891941007463486, + "learning_rate": 1.749898593905988e-05, + "loss": 0.5144, + "num_input_tokens_seen": 984487616, + "step": 5453 + }, + { + "epoch": 0.5970606748953173, + "grad_norm": 1.276368868650863, + "learning_rate": 1.7490783929677714e-05, + "loss": 0.6408, + "num_input_tokens_seen": 984659424, + "step": 5454 + }, + { + "epoch": 0.5971701469662553, + "grad_norm": 1.2986434461043814, + "learning_rate": 1.748258280862147e-05, + "loss": 0.6533, + "num_input_tokens_seen": 984848928, + "step": 5455 + }, + { + "epoch": 0.5972796190371932, + "grad_norm": 1.2941939435361756, + "learning_rate": 1.7474382576861293e-05, + "loss": 0.7597, + "num_input_tokens_seen": 984977280, + "step": 5456 + }, + { + "epoch": 0.597389091108131, + "grad_norm": 1.2324229390038492, + "learning_rate": 1.7466183235367274e-05, + "loss": 0.8465, + "num_input_tokens_seen": 985152000, + "step": 5457 + }, + { + "epoch": 0.5974985631790689, + "grad_norm": 1.4234935226051877, + "learning_rate": 1.745798478510938e-05, + "loss": 0.9797, + "num_input_tokens_seen": 985338592, + "step": 5458 + }, + { + "epoch": 0.5976080352500068, + "grad_norm": 1.3422027373864505, + "learning_rate": 1.744978722705747e-05, + "loss": 0.9235, + "num_input_tokens_seen": 985530784, + "step": 5459 + }, + { + "epoch": 0.5977175073209448, + "grad_norm": 1.1192552863961793, + "learning_rate": 1.74415905621813e-05, + "loss": 0.6718, + "num_input_tokens_seen": 985720288, + "step": 5460 + }, + { + "epoch": 0.5978269793918827, + "grad_norm": 1.3054595764312682, + "learning_rate": 1.74333947914505e-05, + "loss": 0.7027, + "num_input_tokens_seen": 985897920, + "step": 5461 + }, + { + "epoch": 0.5979364514628206, + "grad_norm": 1.3268349609227654, + "learning_rate": 1.7425199915834646e-05, + "loss": 0.7748, + "num_input_tokens_seen": 986072416, + "step": 5462 + }, + { + "epoch": 0.5980459235337584, + "grad_norm": 1.249807247559253, + "learning_rate": 1.7417005936303155e-05, + "loss": 0.7614, + "num_input_tokens_seen": 986280064, + "step": 5463 + }, + { + "epoch": 0.5981553956046963, + "grad_norm": 1.0890127315670262, + "learning_rate": 1.740881285382537e-05, + "loss": 0.5321, + "num_input_tokens_seen": 986455008, + "step": 5464 + }, + { + "epoch": 0.5982648676756342, + "grad_norm": 1.168231590065531, + "learning_rate": 1.7400620669370513e-05, + "loss": 0.7024, + "num_input_tokens_seen": 986648768, + "step": 5465 + }, + { + "epoch": 0.5983743397465722, + "grad_norm": 1.1546578141501556, + "learning_rate": 1.7392429383907698e-05, + "loss": 0.7324, + "num_input_tokens_seen": 986835808, + "step": 5466 + }, + { + "epoch": 0.5984838118175101, + "grad_norm": 1.3318515331934437, + "learning_rate": 1.7384238998405954e-05, + "loss": 0.8376, + "num_input_tokens_seen": 987027104, + "step": 5467 + }, + { + "epoch": 0.598593283888448, + "grad_norm": 1.2778371155593589, + "learning_rate": 1.7376049513834165e-05, + "loss": 0.9533, + "num_input_tokens_seen": 987226912, + "step": 5468 + }, + { + "epoch": 0.5987027559593858, + "grad_norm": 1.0914414702557713, + "learning_rate": 1.7367860931161155e-05, + "loss": 0.5967, + "num_input_tokens_seen": 987399616, + "step": 5469 + }, + { + "epoch": 0.5988122280303237, + "grad_norm": 1.4023396696493562, + "learning_rate": 1.73596732513556e-05, + "loss": 0.8264, + "num_input_tokens_seen": 987573664, + "step": 5470 + }, + { + "epoch": 0.5989217001012617, + "grad_norm": 1.062767051346414, + "learning_rate": 1.7351486475386088e-05, + "loss": 0.6711, + "num_input_tokens_seen": 987790720, + "step": 5471 + }, + { + "epoch": 0.5990311721721996, + "grad_norm": 1.297456587945371, + "learning_rate": 1.7343300604221118e-05, + "loss": 0.8188, + "num_input_tokens_seen": 987949984, + "step": 5472 + }, + { + "epoch": 0.5991406442431375, + "grad_norm": 1.1963908498629037, + "learning_rate": 1.733511563882904e-05, + "loss": 0.7175, + "num_input_tokens_seen": 988148000, + "step": 5473 + }, + { + "epoch": 0.5992501163140753, + "grad_norm": 1.2154909721958798, + "learning_rate": 1.732693158017813e-05, + "loss": 0.8653, + "num_input_tokens_seen": 988364608, + "step": 5474 + }, + { + "epoch": 0.5993595883850132, + "grad_norm": 1.1248055925877316, + "learning_rate": 1.7318748429236547e-05, + "loss": 0.5839, + "num_input_tokens_seen": 988550976, + "step": 5475 + }, + { + "epoch": 0.5994690604559512, + "grad_norm": 1.296363132068269, + "learning_rate": 1.7310566186972345e-05, + "loss": 0.83, + "num_input_tokens_seen": 988704416, + "step": 5476 + }, + { + "epoch": 0.5995785325268891, + "grad_norm": 1.3052715433356141, + "learning_rate": 1.7302384854353455e-05, + "loss": 1.0223, + "num_input_tokens_seen": 988870848, + "step": 5477 + }, + { + "epoch": 0.599688004597827, + "grad_norm": 1.220945679027226, + "learning_rate": 1.7294204432347723e-05, + "loss": 0.7546, + "num_input_tokens_seen": 989056992, + "step": 5478 + }, + { + "epoch": 0.5997974766687649, + "grad_norm": 1.3476951261800934, + "learning_rate": 1.7286024921922887e-05, + "loss": 0.7647, + "num_input_tokens_seen": 989236864, + "step": 5479 + }, + { + "epoch": 0.5999069487397027, + "grad_norm": 1.3806349424916842, + "learning_rate": 1.7277846324046544e-05, + "loss": 0.8436, + "num_input_tokens_seen": 989414272, + "step": 5480 + }, + { + "epoch": 0.6000164208106407, + "grad_norm": 1.4535051074284386, + "learning_rate": 1.7269668639686225e-05, + "loss": 1.1026, + "num_input_tokens_seen": 989620800, + "step": 5481 + }, + { + "epoch": 0.6001258928815786, + "grad_norm": 1.477002598000258, + "learning_rate": 1.7261491869809327e-05, + "loss": 1.0973, + "num_input_tokens_seen": 989802688, + "step": 5482 + }, + { + "epoch": 0.6002353649525165, + "grad_norm": 1.25193955626849, + "learning_rate": 1.7253316015383145e-05, + "loss": 0.6267, + "num_input_tokens_seen": 989968000, + "step": 5483 + }, + { + "epoch": 0.6003448370234544, + "grad_norm": 1.3491772458087985, + "learning_rate": 1.7245141077374865e-05, + "loss": 0.842, + "num_input_tokens_seen": 990164448, + "step": 5484 + }, + { + "epoch": 0.6004543090943923, + "grad_norm": 1.3357805304860564, + "learning_rate": 1.723696705675158e-05, + "loss": 0.7046, + "num_input_tokens_seen": 990334464, + "step": 5485 + }, + { + "epoch": 0.6005637811653302, + "grad_norm": 1.3396759817893156, + "learning_rate": 1.7228793954480246e-05, + "loss": 0.7015, + "num_input_tokens_seen": 990489696, + "step": 5486 + }, + { + "epoch": 0.6006732532362681, + "grad_norm": 1.2240998152537081, + "learning_rate": 1.722062177152773e-05, + "loss": 0.7332, + "num_input_tokens_seen": 990666656, + "step": 5487 + }, + { + "epoch": 0.600782725307206, + "grad_norm": 1.292305122013121, + "learning_rate": 1.7212450508860794e-05, + "loss": 0.6961, + "num_input_tokens_seen": 990806656, + "step": 5488 + }, + { + "epoch": 0.6008921973781439, + "grad_norm": 1.2058175907983544, + "learning_rate": 1.720428016744607e-05, + "loss": 0.9414, + "num_input_tokens_seen": 990993696, + "step": 5489 + }, + { + "epoch": 0.6010016694490818, + "grad_norm": 1.41750729621207, + "learning_rate": 1.7196110748250095e-05, + "loss": 0.8922, + "num_input_tokens_seen": 991158784, + "step": 5490 + }, + { + "epoch": 0.6011111415200197, + "grad_norm": 1.200472992821638, + "learning_rate": 1.718794225223931e-05, + "loss": 0.7054, + "num_input_tokens_seen": 991330592, + "step": 5491 + }, + { + "epoch": 0.6012206135909576, + "grad_norm": 1.3093351626652086, + "learning_rate": 1.7179774680380014e-05, + "loss": 0.8146, + "num_input_tokens_seen": 991511584, + "step": 5492 + }, + { + "epoch": 0.6013300856618955, + "grad_norm": 1.323650451965912, + "learning_rate": 1.7171608033638435e-05, + "loss": 0.7095, + "num_input_tokens_seen": 991679808, + "step": 5493 + }, + { + "epoch": 0.6014395577328334, + "grad_norm": 1.196266043305316, + "learning_rate": 1.716344231298065e-05, + "loss": 0.9056, + "num_input_tokens_seen": 991882304, + "step": 5494 + }, + { + "epoch": 0.6015490298037713, + "grad_norm": 1.4802714596342672, + "learning_rate": 1.715527751937266e-05, + "loss": 0.9579, + "num_input_tokens_seen": 992036416, + "step": 5495 + }, + { + "epoch": 0.6016585018747093, + "grad_norm": 1.4152572389925275, + "learning_rate": 1.714711365378034e-05, + "loss": 0.9317, + "num_input_tokens_seen": 992193888, + "step": 5496 + }, + { + "epoch": 0.6017679739456471, + "grad_norm": 1.1793796567112333, + "learning_rate": 1.713895071716946e-05, + "loss": 0.6919, + "num_input_tokens_seen": 992388096, + "step": 5497 + }, + { + "epoch": 0.601877446016585, + "grad_norm": 1.1934191100408496, + "learning_rate": 1.7130788710505696e-05, + "loss": 0.6749, + "num_input_tokens_seen": 992584992, + "step": 5498 + }, + { + "epoch": 0.6019869180875229, + "grad_norm": 1.2315813635286748, + "learning_rate": 1.7122627634754573e-05, + "loss": 0.6773, + "num_input_tokens_seen": 992744704, + "step": 5499 + }, + { + "epoch": 0.6020963901584608, + "grad_norm": 1.393859616061718, + "learning_rate": 1.7114467490881553e-05, + "loss": 0.8476, + "num_input_tokens_seen": 992923008, + "step": 5500 + }, + { + "epoch": 0.6022058622293988, + "grad_norm": 1.2088860639740937, + "learning_rate": 1.7106308279851945e-05, + "loss": 0.6068, + "num_input_tokens_seen": 993128864, + "step": 5501 + }, + { + "epoch": 0.6023153343003367, + "grad_norm": 1.3915235209750785, + "learning_rate": 1.7098150002630988e-05, + "loss": 0.6729, + "num_input_tokens_seen": 993310976, + "step": 5502 + }, + { + "epoch": 0.6024248063712745, + "grad_norm": 1.2951748285883595, + "learning_rate": 1.7089992660183766e-05, + "loss": 0.6883, + "num_input_tokens_seen": 993510336, + "step": 5503 + }, + { + "epoch": 0.6025342784422124, + "grad_norm": 1.18269933766464, + "learning_rate": 1.7081836253475292e-05, + "loss": 0.7234, + "num_input_tokens_seen": 993709248, + "step": 5504 + }, + { + "epoch": 0.6026437505131503, + "grad_norm": 1.1426583547219373, + "learning_rate": 1.7073680783470457e-05, + "loss": 0.6737, + "num_input_tokens_seen": 993902560, + "step": 5505 + }, + { + "epoch": 0.6027532225840883, + "grad_norm": 1.2360146171730158, + "learning_rate": 1.7065526251134033e-05, + "loss": 0.6561, + "num_input_tokens_seen": 994063392, + "step": 5506 + }, + { + "epoch": 0.6028626946550262, + "grad_norm": 1.36329287985839, + "learning_rate": 1.7057372657430687e-05, + "loss": 0.8774, + "num_input_tokens_seen": 994213696, + "step": 5507 + }, + { + "epoch": 0.602972166725964, + "grad_norm": 1.214889917998964, + "learning_rate": 1.7049220003324964e-05, + "loss": 0.7296, + "num_input_tokens_seen": 994381472, + "step": 5508 + }, + { + "epoch": 0.6030816387969019, + "grad_norm": 1.390310830620938, + "learning_rate": 1.7041068289781326e-05, + "loss": 0.6512, + "num_input_tokens_seen": 994562912, + "step": 5509 + }, + { + "epoch": 0.6031911108678398, + "grad_norm": 1.2407467241891557, + "learning_rate": 1.7032917517764085e-05, + "loss": 1.0969, + "num_input_tokens_seen": 994788928, + "step": 5510 + }, + { + "epoch": 0.6033005829387778, + "grad_norm": 1.229063945723827, + "learning_rate": 1.7024767688237475e-05, + "loss": 0.6186, + "num_input_tokens_seen": 994947520, + "step": 5511 + }, + { + "epoch": 0.6034100550097157, + "grad_norm": 1.4021450741434363, + "learning_rate": 1.7016618802165607e-05, + "loss": 0.8869, + "num_input_tokens_seen": 995099392, + "step": 5512 + }, + { + "epoch": 0.6035195270806536, + "grad_norm": 1.1884281145437912, + "learning_rate": 1.7008470860512466e-05, + "loss": 0.6334, + "num_input_tokens_seen": 995279936, + "step": 5513 + }, + { + "epoch": 0.6036289991515914, + "grad_norm": 1.2241448064210816, + "learning_rate": 1.7000323864241953e-05, + "loss": 0.5778, + "num_input_tokens_seen": 995445472, + "step": 5514 + }, + { + "epoch": 0.6037384712225293, + "grad_norm": 1.1965963910878086, + "learning_rate": 1.699217781431782e-05, + "loss": 0.7222, + "num_input_tokens_seen": 995621312, + "step": 5515 + }, + { + "epoch": 0.6038479432934672, + "grad_norm": 1.3034460591611612, + "learning_rate": 1.6984032711703753e-05, + "loss": 0.9345, + "num_input_tokens_seen": 995824480, + "step": 5516 + }, + { + "epoch": 0.6039574153644052, + "grad_norm": 1.2014992368634578, + "learning_rate": 1.697588855736329e-05, + "loss": 0.6892, + "num_input_tokens_seen": 996008832, + "step": 5517 + }, + { + "epoch": 0.6040668874353431, + "grad_norm": 1.4279203906909093, + "learning_rate": 1.6967745352259868e-05, + "loss": 0.9348, + "num_input_tokens_seen": 996161152, + "step": 5518 + }, + { + "epoch": 0.604176359506281, + "grad_norm": 1.2830109938687912, + "learning_rate": 1.6959603097356823e-05, + "loss": 0.7976, + "num_input_tokens_seen": 996343264, + "step": 5519 + }, + { + "epoch": 0.6042858315772188, + "grad_norm": 1.2917056641793148, + "learning_rate": 1.6951461793617354e-05, + "loss": 0.6922, + "num_input_tokens_seen": 996519552, + "step": 5520 + }, + { + "epoch": 0.6043953036481567, + "grad_norm": 1.421980717679831, + "learning_rate": 1.694332144200458e-05, + "loss": 0.6488, + "num_input_tokens_seen": 996677248, + "step": 5521 + }, + { + "epoch": 0.6045047757190947, + "grad_norm": 1.31602286803601, + "learning_rate": 1.693518204348146e-05, + "loss": 1.0386, + "num_input_tokens_seen": 996856224, + "step": 5522 + }, + { + "epoch": 0.6046142477900326, + "grad_norm": 1.4426259408644804, + "learning_rate": 1.6927043599010906e-05, + "loss": 0.8737, + "num_input_tokens_seen": 996998912, + "step": 5523 + }, + { + "epoch": 0.6047237198609705, + "grad_norm": 1.263637960883903, + "learning_rate": 1.691890610955565e-05, + "loss": 0.8438, + "num_input_tokens_seen": 997210368, + "step": 5524 + }, + { + "epoch": 0.6048331919319083, + "grad_norm": 1.2695297945454398, + "learning_rate": 1.691076957607835e-05, + "loss": 0.6916, + "num_input_tokens_seen": 997382400, + "step": 5525 + }, + { + "epoch": 0.6049426640028462, + "grad_norm": 1.3382853350379358, + "learning_rate": 1.690263399954155e-05, + "loss": 0.6764, + "num_input_tokens_seen": 997560256, + "step": 5526 + }, + { + "epoch": 0.6050521360737842, + "grad_norm": 1.3368967638923763, + "learning_rate": 1.689449938090767e-05, + "loss": 1.0718, + "num_input_tokens_seen": 997747968, + "step": 5527 + }, + { + "epoch": 0.6051616081447221, + "grad_norm": 1.3030328365822827, + "learning_rate": 1.688636572113902e-05, + "loss": 0.8101, + "num_input_tokens_seen": 997946208, + "step": 5528 + }, + { + "epoch": 0.60527108021566, + "grad_norm": 1.125413130032348, + "learning_rate": 1.6878233021197783e-05, + "loss": 0.5713, + "num_input_tokens_seen": 998106816, + "step": 5529 + }, + { + "epoch": 0.6053805522865979, + "grad_norm": 1.3076984588232408, + "learning_rate": 1.687010128204607e-05, + "loss": 0.7715, + "num_input_tokens_seen": 998271904, + "step": 5530 + }, + { + "epoch": 0.6054900243575357, + "grad_norm": 1.2258599571502335, + "learning_rate": 1.6861970504645818e-05, + "loss": 0.6131, + "num_input_tokens_seen": 998465440, + "step": 5531 + }, + { + "epoch": 0.6055994964284737, + "grad_norm": 1.4064222442256573, + "learning_rate": 1.6853840689958904e-05, + "loss": 0.7166, + "num_input_tokens_seen": 998603424, + "step": 5532 + }, + { + "epoch": 0.6057089684994116, + "grad_norm": 1.121442202336825, + "learning_rate": 1.684571183894707e-05, + "loss": 0.5986, + "num_input_tokens_seen": 998796960, + "step": 5533 + }, + { + "epoch": 0.6058184405703495, + "grad_norm": 1.1958123452253226, + "learning_rate": 1.6837583952571927e-05, + "loss": 0.7069, + "num_input_tokens_seen": 998978848, + "step": 5534 + }, + { + "epoch": 0.6059279126412874, + "grad_norm": 1.1669731872799487, + "learning_rate": 1.682945703179501e-05, + "loss": 0.6636, + "num_input_tokens_seen": 999136096, + "step": 5535 + }, + { + "epoch": 0.6060373847122253, + "grad_norm": 1.4265291943375584, + "learning_rate": 1.6821331077577697e-05, + "loss": 0.716, + "num_input_tokens_seen": 999311488, + "step": 5536 + }, + { + "epoch": 0.6061468567831632, + "grad_norm": 1.2856473593095794, + "learning_rate": 1.6813206090881285e-05, + "loss": 0.8667, + "num_input_tokens_seen": 999493824, + "step": 5537 + }, + { + "epoch": 0.6062563288541011, + "grad_norm": 1.4598075862709874, + "learning_rate": 1.680508207266694e-05, + "loss": 0.8186, + "num_input_tokens_seen": 999650176, + "step": 5538 + }, + { + "epoch": 0.606365800925039, + "grad_norm": 1.2566736882786838, + "learning_rate": 1.6796959023895717e-05, + "loss": 0.6468, + "num_input_tokens_seen": 999826912, + "step": 5539 + }, + { + "epoch": 0.6064752729959769, + "grad_norm": 1.3997875069391157, + "learning_rate": 1.678883694552857e-05, + "loss": 0.8095, + "num_input_tokens_seen": 999996928, + "step": 5540 + }, + { + "epoch": 0.6065847450669148, + "grad_norm": 1.143861802687084, + "learning_rate": 1.678071583852631e-05, + "loss": 0.8928, + "num_input_tokens_seen": 1000215104, + "step": 5541 + }, + { + "epoch": 0.6066942171378527, + "grad_norm": 1.4138839356028177, + "learning_rate": 1.677259570384966e-05, + "loss": 1.0532, + "num_input_tokens_seen": 1000430144, + "step": 5542 + }, + { + "epoch": 0.6068036892087906, + "grad_norm": 1.1887660555552522, + "learning_rate": 1.67644765424592e-05, + "loss": 0.5697, + "num_input_tokens_seen": 1000591424, + "step": 5543 + }, + { + "epoch": 0.6069131612797285, + "grad_norm": 1.3410701199489432, + "learning_rate": 1.6756358355315433e-05, + "loss": 0.7783, + "num_input_tokens_seen": 1000786528, + "step": 5544 + }, + { + "epoch": 0.6070226333506664, + "grad_norm": 1.2441660640397414, + "learning_rate": 1.6748241143378702e-05, + "loss": 0.8278, + "num_input_tokens_seen": 1000958112, + "step": 5545 + }, + { + "epoch": 0.6071321054216043, + "grad_norm": 1.3475924711753662, + "learning_rate": 1.6740124907609266e-05, + "loss": 0.8167, + "num_input_tokens_seen": 1001154784, + "step": 5546 + }, + { + "epoch": 0.6072415774925423, + "grad_norm": 1.1727385327508986, + "learning_rate": 1.6732009648967272e-05, + "loss": 0.7418, + "num_input_tokens_seen": 1001326592, + "step": 5547 + }, + { + "epoch": 0.6073510495634801, + "grad_norm": 1.237287489629271, + "learning_rate": 1.6723895368412732e-05, + "loss": 0.6757, + "num_input_tokens_seen": 1001510496, + "step": 5548 + }, + { + "epoch": 0.607460521634418, + "grad_norm": 1.2392730110745507, + "learning_rate": 1.6715782066905544e-05, + "loss": 0.7962, + "num_input_tokens_seen": 1001681856, + "step": 5549 + }, + { + "epoch": 0.6075699937053559, + "grad_norm": 1.165297192662006, + "learning_rate": 1.6707669745405502e-05, + "loss": 0.9985, + "num_input_tokens_seen": 1001895104, + "step": 5550 + }, + { + "epoch": 0.6076794657762938, + "grad_norm": 1.1112708963632878, + "learning_rate": 1.6699558404872272e-05, + "loss": 0.9074, + "num_input_tokens_seen": 1002098048, + "step": 5551 + }, + { + "epoch": 0.6077889378472318, + "grad_norm": 1.2547264387613377, + "learning_rate": 1.6691448046265426e-05, + "loss": 0.7959, + "num_input_tokens_seen": 1002267840, + "step": 5552 + }, + { + "epoch": 0.6078984099181697, + "grad_norm": 1.120082279316553, + "learning_rate": 1.6683338670544384e-05, + "loss": 0.6694, + "num_input_tokens_seen": 1002453984, + "step": 5553 + }, + { + "epoch": 0.6080078819891075, + "grad_norm": 1.2206290993179734, + "learning_rate": 1.6675230278668484e-05, + "loss": 0.9039, + "num_input_tokens_seen": 1002635648, + "step": 5554 + }, + { + "epoch": 0.6081173540600454, + "grad_norm": 1.3098709933262167, + "learning_rate": 1.6667122871596925e-05, + "loss": 0.5707, + "num_input_tokens_seen": 1002845760, + "step": 5555 + }, + { + "epoch": 0.6082268261309833, + "grad_norm": 1.3461936200732159, + "learning_rate": 1.6659016450288805e-05, + "loss": 0.8132, + "num_input_tokens_seen": 1003031904, + "step": 5556 + }, + { + "epoch": 0.6083362982019213, + "grad_norm": 1.1559307069932054, + "learning_rate": 1.665091101570309e-05, + "loss": 0.6512, + "num_input_tokens_seen": 1003211776, + "step": 5557 + }, + { + "epoch": 0.6084457702728592, + "grad_norm": 1.2653541275077083, + "learning_rate": 1.6642806568798635e-05, + "loss": 0.5813, + "num_input_tokens_seen": 1003377536, + "step": 5558 + }, + { + "epoch": 0.608555242343797, + "grad_norm": 1.208230025161257, + "learning_rate": 1.6634703110534195e-05, + "loss": 0.7503, + "num_input_tokens_seen": 1003571072, + "step": 5559 + }, + { + "epoch": 0.6086647144147349, + "grad_norm": 1.2228088771143453, + "learning_rate": 1.662660064186838e-05, + "loss": 0.5809, + "num_input_tokens_seen": 1003759904, + "step": 5560 + }, + { + "epoch": 0.6087741864856728, + "grad_norm": 1.3501901439698603, + "learning_rate": 1.661849916375971e-05, + "loss": 0.7881, + "num_input_tokens_seen": 1003934624, + "step": 5561 + }, + { + "epoch": 0.6088836585566108, + "grad_norm": 1.2823650346348883, + "learning_rate": 1.6610398677166555e-05, + "loss": 0.554, + "num_input_tokens_seen": 1004079104, + "step": 5562 + }, + { + "epoch": 0.6089931306275487, + "grad_norm": 1.2520718094247294, + "learning_rate": 1.660229918304721e-05, + "loss": 0.6521, + "num_input_tokens_seen": 1004269504, + "step": 5563 + }, + { + "epoch": 0.6091026026984866, + "grad_norm": 1.1919972439164424, + "learning_rate": 1.659420068235981e-05, + "loss": 0.705, + "num_input_tokens_seen": 1004470880, + "step": 5564 + }, + { + "epoch": 0.6092120747694244, + "grad_norm": 1.1559764950979468, + "learning_rate": 1.6586103176062397e-05, + "loss": 0.5571, + "num_input_tokens_seen": 1004621184, + "step": 5565 + }, + { + "epoch": 0.6093215468403623, + "grad_norm": 1.3032866398300058, + "learning_rate": 1.6578006665112906e-05, + "loss": 0.8197, + "num_input_tokens_seen": 1004827712, + "step": 5566 + }, + { + "epoch": 0.6094310189113002, + "grad_norm": 1.1640509790452054, + "learning_rate": 1.6569911150469113e-05, + "loss": 0.5778, + "num_input_tokens_seen": 1005021024, + "step": 5567 + }, + { + "epoch": 0.6095404909822382, + "grad_norm": 1.3685230380493023, + "learning_rate": 1.6561816633088723e-05, + "loss": 0.8989, + "num_input_tokens_seen": 1005199552, + "step": 5568 + }, + { + "epoch": 0.6096499630531761, + "grad_norm": 1.2864687168870892, + "learning_rate": 1.6553723113929296e-05, + "loss": 0.9195, + "num_input_tokens_seen": 1005363968, + "step": 5569 + }, + { + "epoch": 0.609759435124114, + "grad_norm": 1.3362492426271495, + "learning_rate": 1.6545630593948276e-05, + "loss": 0.7277, + "num_input_tokens_seen": 1005554368, + "step": 5570 + }, + { + "epoch": 0.6098689071950518, + "grad_norm": 1.1472120429032033, + "learning_rate": 1.6537539074102988e-05, + "loss": 0.6542, + "num_input_tokens_seen": 1005733568, + "step": 5571 + }, + { + "epoch": 0.6099783792659897, + "grad_norm": 1.2393284650521972, + "learning_rate": 1.6529448555350653e-05, + "loss": 0.6329, + "num_input_tokens_seen": 1005929344, + "step": 5572 + }, + { + "epoch": 0.6100878513369277, + "grad_norm": 1.1863476030335243, + "learning_rate": 1.652135903864837e-05, + "loss": 0.6468, + "num_input_tokens_seen": 1006120192, + "step": 5573 + }, + { + "epoch": 0.6101973234078656, + "grad_norm": 1.2807552033526615, + "learning_rate": 1.6513270524953092e-05, + "loss": 0.8085, + "num_input_tokens_seen": 1006292672, + "step": 5574 + }, + { + "epoch": 0.6103067954788035, + "grad_norm": 1.2225325159222484, + "learning_rate": 1.65051830152217e-05, + "loss": 0.9803, + "num_input_tokens_seen": 1006487104, + "step": 5575 + }, + { + "epoch": 0.6104162675497413, + "grad_norm": 1.229648910362894, + "learning_rate": 1.6497096510410908e-05, + "loss": 0.7258, + "num_input_tokens_seen": 1006673696, + "step": 5576 + }, + { + "epoch": 0.6105257396206792, + "grad_norm": 1.4313375821032017, + "learning_rate": 1.648901101147735e-05, + "loss": 0.9891, + "num_input_tokens_seen": 1006853344, + "step": 5577 + }, + { + "epoch": 0.6106352116916172, + "grad_norm": 1.3220490992542329, + "learning_rate": 1.6480926519377514e-05, + "loss": 0.7626, + "num_input_tokens_seen": 1007018656, + "step": 5578 + }, + { + "epoch": 0.6107446837625551, + "grad_norm": 1.2875026165833783, + "learning_rate": 1.6472843035067784e-05, + "loss": 1.0454, + "num_input_tokens_seen": 1007218240, + "step": 5579 + }, + { + "epoch": 0.610854155833493, + "grad_norm": 1.1678846450173848, + "learning_rate": 1.6464760559504424e-05, + "loss": 0.7578, + "num_input_tokens_seen": 1007417152, + "step": 5580 + }, + { + "epoch": 0.6109636279044309, + "grad_norm": 1.342485897835484, + "learning_rate": 1.6456679093643572e-05, + "loss": 0.8691, + "num_input_tokens_seen": 1007611584, + "step": 5581 + }, + { + "epoch": 0.6110730999753687, + "grad_norm": 1.3088618672826586, + "learning_rate": 1.644859863844126e-05, + "loss": 0.6419, + "num_input_tokens_seen": 1007767712, + "step": 5582 + }, + { + "epoch": 0.6111825720463067, + "grad_norm": 1.0986845690933733, + "learning_rate": 1.644051919485337e-05, + "loss": 0.6541, + "num_input_tokens_seen": 1007960800, + "step": 5583 + }, + { + "epoch": 0.6112920441172446, + "grad_norm": 1.17228452520442, + "learning_rate": 1.643244076383571e-05, + "loss": 0.6812, + "num_input_tokens_seen": 1008135072, + "step": 5584 + }, + { + "epoch": 0.6114015161881825, + "grad_norm": 1.4457958341164094, + "learning_rate": 1.6424363346343912e-05, + "loss": 0.9242, + "num_input_tokens_seen": 1008317408, + "step": 5585 + }, + { + "epoch": 0.6115109882591204, + "grad_norm": 1.3247774009138462, + "learning_rate": 1.641628694333354e-05, + "loss": 0.7242, + "num_input_tokens_seen": 1008475776, + "step": 5586 + }, + { + "epoch": 0.6116204603300583, + "grad_norm": 1.1809544107680447, + "learning_rate": 1.6408211555760028e-05, + "loss": 0.656, + "num_input_tokens_seen": 1008647136, + "step": 5587 + }, + { + "epoch": 0.6117299324009962, + "grad_norm": 1.1537372504095043, + "learning_rate": 1.6400137184578647e-05, + "loss": 0.6507, + "num_input_tokens_seen": 1008812896, + "step": 5588 + }, + { + "epoch": 0.6118394044719341, + "grad_norm": 1.2008885987743658, + "learning_rate": 1.639206383074461e-05, + "loss": 0.7057, + "num_input_tokens_seen": 1008995680, + "step": 5589 + }, + { + "epoch": 0.611948876542872, + "grad_norm": 1.114416888396981, + "learning_rate": 1.6383991495212957e-05, + "loss": 0.8946, + "num_input_tokens_seen": 1009209600, + "step": 5590 + }, + { + "epoch": 0.6120583486138099, + "grad_norm": 1.2602041501462278, + "learning_rate": 1.6375920178938646e-05, + "loss": 0.6025, + "num_input_tokens_seen": 1009361696, + "step": 5591 + }, + { + "epoch": 0.6121678206847478, + "grad_norm": 1.3082032033670241, + "learning_rate": 1.6367849882876485e-05, + "loss": 0.7432, + "num_input_tokens_seen": 1009551200, + "step": 5592 + }, + { + "epoch": 0.6122772927556857, + "grad_norm": 1.206706426177961, + "learning_rate": 1.635978060798118e-05, + "loss": 0.937, + "num_input_tokens_seen": 1009746976, + "step": 5593 + }, + { + "epoch": 0.6123867648266236, + "grad_norm": 1.2226579075031196, + "learning_rate": 1.6351712355207323e-05, + "loss": 0.8917, + "num_input_tokens_seen": 1009928192, + "step": 5594 + }, + { + "epoch": 0.6124962368975615, + "grad_norm": 1.311239636492072, + "learning_rate": 1.6343645125509348e-05, + "loss": 0.8546, + "num_input_tokens_seen": 1010113216, + "step": 5595 + }, + { + "epoch": 0.6126057089684994, + "grad_norm": 1.3037401517530829, + "learning_rate": 1.633557891984162e-05, + "loss": 0.7644, + "num_input_tokens_seen": 1010279648, + "step": 5596 + }, + { + "epoch": 0.6127151810394373, + "grad_norm": 1.0758226662325383, + "learning_rate": 1.632751373915833e-05, + "loss": 0.551, + "num_input_tokens_seen": 1010474976, + "step": 5597 + }, + { + "epoch": 0.6128246531103753, + "grad_norm": 1.1968217185488803, + "learning_rate": 1.6319449584413596e-05, + "loss": 0.7171, + "num_input_tokens_seen": 1010668960, + "step": 5598 + }, + { + "epoch": 0.6129341251813131, + "grad_norm": 1.2759633669542414, + "learning_rate": 1.6311386456561373e-05, + "loss": 0.7969, + "num_input_tokens_seen": 1010881760, + "step": 5599 + }, + { + "epoch": 0.613043597252251, + "grad_norm": 1.4288095582311264, + "learning_rate": 1.6303324356555523e-05, + "loss": 0.8477, + "num_input_tokens_seen": 1011052224, + "step": 5600 + }, + { + "epoch": 0.6131530693231889, + "grad_norm": 1.3012495143130933, + "learning_rate": 1.6295263285349776e-05, + "loss": 0.7495, + "num_input_tokens_seen": 1011230752, + "step": 5601 + }, + { + "epoch": 0.6132625413941268, + "grad_norm": 1.307876891598468, + "learning_rate": 1.628720324389774e-05, + "loss": 0.5814, + "num_input_tokens_seen": 1011392704, + "step": 5602 + }, + { + "epoch": 0.6133720134650648, + "grad_norm": 1.400366747026097, + "learning_rate": 1.6279144233152922e-05, + "loss": 0.9052, + "num_input_tokens_seen": 1011561376, + "step": 5603 + }, + { + "epoch": 0.6134814855360027, + "grad_norm": 1.164410942206802, + "learning_rate": 1.6271086254068653e-05, + "loss": 0.7543, + "num_input_tokens_seen": 1011760512, + "step": 5604 + }, + { + "epoch": 0.6135909576069405, + "grad_norm": 1.420829556293259, + "learning_rate": 1.6263029307598198e-05, + "loss": 0.8816, + "num_input_tokens_seen": 1011946208, + "step": 5605 + }, + { + "epoch": 0.6137004296778784, + "grad_norm": 1.2391932164785355, + "learning_rate": 1.6254973394694672e-05, + "loss": 0.8905, + "num_input_tokens_seen": 1012138624, + "step": 5606 + }, + { + "epoch": 0.6138099017488163, + "grad_norm": 1.1225338502125657, + "learning_rate": 1.6246918516311072e-05, + "loss": 0.633, + "num_input_tokens_seen": 1012319840, + "step": 5607 + }, + { + "epoch": 0.6139193738197543, + "grad_norm": 1.2878454763350873, + "learning_rate": 1.623886467340029e-05, + "loss": 0.9574, + "num_input_tokens_seen": 1012523456, + "step": 5608 + }, + { + "epoch": 0.6140288458906922, + "grad_norm": 1.2966108397811393, + "learning_rate": 1.6230811866915057e-05, + "loss": 0.9073, + "num_input_tokens_seen": 1012695264, + "step": 5609 + }, + { + "epoch": 0.61413831796163, + "grad_norm": 1.3307152565106546, + "learning_rate": 1.622276009780802e-05, + "loss": 0.5939, + "num_input_tokens_seen": 1012852288, + "step": 5610 + }, + { + "epoch": 0.6142477900325679, + "grad_norm": 1.438106989211026, + "learning_rate": 1.621470936703169e-05, + "loss": 0.8305, + "num_input_tokens_seen": 1013010432, + "step": 5611 + }, + { + "epoch": 0.6143572621035058, + "grad_norm": 1.2315991250404552, + "learning_rate": 1.6206659675538445e-05, + "loss": 0.6938, + "num_input_tokens_seen": 1013190976, + "step": 5612 + }, + { + "epoch": 0.6144667341744438, + "grad_norm": 1.453244522265524, + "learning_rate": 1.6198611024280543e-05, + "loss": 0.8629, + "num_input_tokens_seen": 1013346432, + "step": 5613 + }, + { + "epoch": 0.6145762062453817, + "grad_norm": 1.2766626624566637, + "learning_rate": 1.6190563414210132e-05, + "loss": 0.5833, + "num_input_tokens_seen": 1013514432, + "step": 5614 + }, + { + "epoch": 0.6146856783163196, + "grad_norm": 1.3852517978785286, + "learning_rate": 1.6182516846279237e-05, + "loss": 1.0842, + "num_input_tokens_seen": 1013679296, + "step": 5615 + }, + { + "epoch": 0.6147951503872574, + "grad_norm": 1.2978310335830556, + "learning_rate": 1.6174471321439737e-05, + "loss": 0.8542, + "num_input_tokens_seen": 1013856928, + "step": 5616 + }, + { + "epoch": 0.6149046224581953, + "grad_norm": 1.0875989430459243, + "learning_rate": 1.6166426840643415e-05, + "loss": 0.7794, + "num_input_tokens_seen": 1014039040, + "step": 5617 + }, + { + "epoch": 0.6150140945291332, + "grad_norm": 1.1013320248871605, + "learning_rate": 1.6158383404841902e-05, + "loss": 0.8752, + "num_input_tokens_seen": 1014261472, + "step": 5618 + }, + { + "epoch": 0.6151235666000712, + "grad_norm": 1.3257749522502988, + "learning_rate": 1.615034101498673e-05, + "loss": 0.6257, + "num_input_tokens_seen": 1014436192, + "step": 5619 + }, + { + "epoch": 0.6152330386710091, + "grad_norm": 1.7451288685036805, + "learning_rate": 1.6142299672029307e-05, + "loss": 0.7707, + "num_input_tokens_seen": 1014618752, + "step": 5620 + }, + { + "epoch": 0.615342510741947, + "grad_norm": 1.3244298806010248, + "learning_rate": 1.613425937692089e-05, + "loss": 0.9942, + "num_input_tokens_seen": 1014798848, + "step": 5621 + }, + { + "epoch": 0.6154519828128848, + "grad_norm": 1.2437106696576385, + "learning_rate": 1.6126220130612646e-05, + "loss": 0.788, + "num_input_tokens_seen": 1014986560, + "step": 5622 + }, + { + "epoch": 0.6155614548838227, + "grad_norm": 1.2222606274803454, + "learning_rate": 1.6118181934055593e-05, + "loss": 0.5454, + "num_input_tokens_seen": 1015159264, + "step": 5623 + }, + { + "epoch": 0.6156709269547607, + "grad_norm": 1.2214364496533117, + "learning_rate": 1.611014478820064e-05, + "loss": 0.8409, + "num_input_tokens_seen": 1015347872, + "step": 5624 + }, + { + "epoch": 0.6157803990256986, + "grad_norm": 1.3503667580479182, + "learning_rate": 1.6102108693998568e-05, + "loss": 0.7603, + "num_input_tokens_seen": 1015498400, + "step": 5625 + }, + { + "epoch": 0.6158898710966365, + "grad_norm": 1.1959899777595318, + "learning_rate": 1.6094073652400014e-05, + "loss": 0.652, + "num_input_tokens_seen": 1015688576, + "step": 5626 + }, + { + "epoch": 0.6159993431675743, + "grad_norm": 1.3067928390910404, + "learning_rate": 1.608603966435554e-05, + "loss": 0.8421, + "num_input_tokens_seen": 1015868448, + "step": 5627 + }, + { + "epoch": 0.6161088152385122, + "grad_norm": 1.1101403618466894, + "learning_rate": 1.607800673081552e-05, + "loss": 0.6916, + "num_input_tokens_seen": 1016054816, + "step": 5628 + }, + { + "epoch": 0.6162182873094502, + "grad_norm": 1.2855190704130015, + "learning_rate": 1.6069974852730263e-05, + "loss": 0.7484, + "num_input_tokens_seen": 1016232224, + "step": 5629 + }, + { + "epoch": 0.6163277593803881, + "grad_norm": 1.1758515599922337, + "learning_rate": 1.6061944031049893e-05, + "loss": 0.6238, + "num_input_tokens_seen": 1016420384, + "step": 5630 + }, + { + "epoch": 0.616437231451326, + "grad_norm": 1.1796645610675918, + "learning_rate": 1.605391426672447e-05, + "loss": 0.8591, + "num_input_tokens_seen": 1016639904, + "step": 5631 + }, + { + "epoch": 0.6165467035222639, + "grad_norm": 1.330345586249244, + "learning_rate": 1.604588556070388e-05, + "loss": 0.8722, + "num_input_tokens_seen": 1016802976, + "step": 5632 + }, + { + "epoch": 0.6166561755932017, + "grad_norm": 1.2591786991220237, + "learning_rate": 1.6037857913937908e-05, + "loss": 0.7539, + "num_input_tokens_seen": 1016962688, + "step": 5633 + }, + { + "epoch": 0.6167656476641397, + "grad_norm": 1.331177916388552, + "learning_rate": 1.6029831327376217e-05, + "loss": 0.7009, + "num_input_tokens_seen": 1017139424, + "step": 5634 + }, + { + "epoch": 0.6168751197350776, + "grad_norm": 1.347543684086774, + "learning_rate": 1.6021805801968325e-05, + "loss": 0.8175, + "num_input_tokens_seen": 1017316384, + "step": 5635 + }, + { + "epoch": 0.6169845918060155, + "grad_norm": 1.2696744397914344, + "learning_rate": 1.6013781338663654e-05, + "loss": 0.7916, + "num_input_tokens_seen": 1017518656, + "step": 5636 + }, + { + "epoch": 0.6170940638769534, + "grad_norm": 1.2527305819457262, + "learning_rate": 1.6005757938411466e-05, + "loss": 0.6901, + "num_input_tokens_seen": 1017689792, + "step": 5637 + }, + { + "epoch": 0.6172035359478913, + "grad_norm": 1.3430425866618685, + "learning_rate": 1.5997735602160923e-05, + "loss": 0.7235, + "num_input_tokens_seen": 1017823296, + "step": 5638 + }, + { + "epoch": 0.6173130080188292, + "grad_norm": 1.2247758718710309, + "learning_rate": 1.5989714330861043e-05, + "loss": 0.7091, + "num_input_tokens_seen": 1018016832, + "step": 5639 + }, + { + "epoch": 0.6174224800897671, + "grad_norm": 1.2242365365358205, + "learning_rate": 1.5981694125460735e-05, + "loss": 0.6641, + "num_input_tokens_seen": 1018210816, + "step": 5640 + }, + { + "epoch": 0.617531952160705, + "grad_norm": 1.1379898148535106, + "learning_rate": 1.5973674986908778e-05, + "loss": 0.7093, + "num_input_tokens_seen": 1018406144, + "step": 5641 + }, + { + "epoch": 0.6176414242316429, + "grad_norm": 1.150733003038729, + "learning_rate": 1.596565691615381e-05, + "loss": 0.5434, + "num_input_tokens_seen": 1018583776, + "step": 5642 + }, + { + "epoch": 0.6177508963025808, + "grad_norm": 1.2202523581793283, + "learning_rate": 1.5957639914144358e-05, + "loss": 1.1702, + "num_input_tokens_seen": 1018783808, + "step": 5643 + }, + { + "epoch": 0.6178603683735187, + "grad_norm": 1.1769086432446945, + "learning_rate": 1.5949623981828815e-05, + "loss": 0.6512, + "num_input_tokens_seen": 1018968608, + "step": 5644 + }, + { + "epoch": 0.6179698404444566, + "grad_norm": 1.4329593928427473, + "learning_rate": 1.594160912015546e-05, + "loss": 0.866, + "num_input_tokens_seen": 1019151392, + "step": 5645 + }, + { + "epoch": 0.6180793125153945, + "grad_norm": 1.2180546953514093, + "learning_rate": 1.5933595330072425e-05, + "loss": 0.755, + "num_input_tokens_seen": 1019284896, + "step": 5646 + }, + { + "epoch": 0.6181887845863324, + "grad_norm": 1.2684179970951688, + "learning_rate": 1.5925582612527728e-05, + "loss": 0.8724, + "num_input_tokens_seen": 1019443936, + "step": 5647 + }, + { + "epoch": 0.6182982566572703, + "grad_norm": 1.1644969871780122, + "learning_rate": 1.591757096846927e-05, + "loss": 0.588, + "num_input_tokens_seen": 1019630528, + "step": 5648 + }, + { + "epoch": 0.6184077287282083, + "grad_norm": 1.16623025065849, + "learning_rate": 1.590956039884479e-05, + "loss": 0.6737, + "num_input_tokens_seen": 1019815552, + "step": 5649 + }, + { + "epoch": 0.6185172007991461, + "grad_norm": 1.317604854978732, + "learning_rate": 1.5901550904601952e-05, + "loss": 1.056, + "num_input_tokens_seen": 1019979744, + "step": 5650 + }, + { + "epoch": 0.618626672870084, + "grad_norm": 1.495365214440103, + "learning_rate": 1.589354248668824e-05, + "loss": 0.8429, + "num_input_tokens_seen": 1020164992, + "step": 5651 + }, + { + "epoch": 0.6187361449410219, + "grad_norm": 1.2477653078345157, + "learning_rate": 1.5885535146051046e-05, + "loss": 0.6074, + "num_input_tokens_seen": 1020344416, + "step": 5652 + }, + { + "epoch": 0.6188456170119598, + "grad_norm": 1.3226250705097073, + "learning_rate": 1.587752888363762e-05, + "loss": 1.0089, + "num_input_tokens_seen": 1020546688, + "step": 5653 + }, + { + "epoch": 0.6189550890828978, + "grad_norm": 1.343326457497454, + "learning_rate": 1.5869523700395085e-05, + "loss": 0.9002, + "num_input_tokens_seen": 1020694080, + "step": 5654 + }, + { + "epoch": 0.6190645611538357, + "grad_norm": 1.1860029421153895, + "learning_rate": 1.5861519597270442e-05, + "loss": 0.7451, + "num_input_tokens_seen": 1020870368, + "step": 5655 + }, + { + "epoch": 0.6191740332247735, + "grad_norm": 1.2542232694980453, + "learning_rate": 1.5853516575210558e-05, + "loss": 0.8295, + "num_input_tokens_seen": 1021055392, + "step": 5656 + }, + { + "epoch": 0.6192835052957114, + "grad_norm": 1.1180707734351123, + "learning_rate": 1.5845514635162188e-05, + "loss": 0.5985, + "num_input_tokens_seen": 1021213088, + "step": 5657 + }, + { + "epoch": 0.6193929773666493, + "grad_norm": 1.3517051577389008, + "learning_rate": 1.5837513778071927e-05, + "loss": 0.7283, + "num_input_tokens_seen": 1021382880, + "step": 5658 + }, + { + "epoch": 0.6195024494375873, + "grad_norm": 1.3045640532974545, + "learning_rate": 1.5829514004886282e-05, + "loss": 0.6655, + "num_input_tokens_seen": 1021566112, + "step": 5659 + }, + { + "epoch": 0.6196119215085252, + "grad_norm": 1.2733153302149953, + "learning_rate": 1.582151531655159e-05, + "loss": 1.0023, + "num_input_tokens_seen": 1021722464, + "step": 5660 + }, + { + "epoch": 0.619721393579463, + "grad_norm": 1.248479347339285, + "learning_rate": 1.5813517714014087e-05, + "loss": 0.9385, + "num_input_tokens_seen": 1021934816, + "step": 5661 + }, + { + "epoch": 0.6198308656504009, + "grad_norm": 1.3466773859547954, + "learning_rate": 1.5805521198219886e-05, + "loss": 0.8711, + "num_input_tokens_seen": 1022080192, + "step": 5662 + }, + { + "epoch": 0.6199403377213388, + "grad_norm": 1.2031408036420204, + "learning_rate": 1.579752577011494e-05, + "loss": 0.6519, + "num_input_tokens_seen": 1022259840, + "step": 5663 + }, + { + "epoch": 0.6200498097922768, + "grad_norm": 1.3286860459045782, + "learning_rate": 1.578953143064511e-05, + "loss": 0.9084, + "num_input_tokens_seen": 1022446208, + "step": 5664 + }, + { + "epoch": 0.6201592818632147, + "grad_norm": 1.222703585454879, + "learning_rate": 1.57815381807561e-05, + "loss": 0.7223, + "num_input_tokens_seen": 1022652064, + "step": 5665 + }, + { + "epoch": 0.6202687539341526, + "grad_norm": 1.2374627329291532, + "learning_rate": 1.577354602139351e-05, + "loss": 0.6946, + "num_input_tokens_seen": 1022787808, + "step": 5666 + }, + { + "epoch": 0.6203782260050904, + "grad_norm": 1.1542719339687302, + "learning_rate": 1.5765554953502777e-05, + "loss": 0.7203, + "num_input_tokens_seen": 1022960512, + "step": 5667 + }, + { + "epoch": 0.6204876980760283, + "grad_norm": 1.2303476909849407, + "learning_rate": 1.575756497802924e-05, + "loss": 0.726, + "num_input_tokens_seen": 1023137248, + "step": 5668 + }, + { + "epoch": 0.6205971701469662, + "grad_norm": 1.321539494929196, + "learning_rate": 1.574957609591811e-05, + "loss": 0.8754, + "num_input_tokens_seen": 1023316224, + "step": 5669 + }, + { + "epoch": 0.6207066422179042, + "grad_norm": 1.1611615587274273, + "learning_rate": 1.574158830811443e-05, + "loss": 0.7545, + "num_input_tokens_seen": 1023516480, + "step": 5670 + }, + { + "epoch": 0.6208161142888421, + "grad_norm": 1.3066213289897386, + "learning_rate": 1.5733601615563163e-05, + "loss": 0.7692, + "num_input_tokens_seen": 1023724128, + "step": 5671 + }, + { + "epoch": 0.62092558635978, + "grad_norm": 1.300732860867979, + "learning_rate": 1.5725616019209106e-05, + "loss": 0.7723, + "num_input_tokens_seen": 1023901984, + "step": 5672 + }, + { + "epoch": 0.6210350584307178, + "grad_norm": 1.3241290396851388, + "learning_rate": 1.5717631519996947e-05, + "loss": 0.8368, + "num_input_tokens_seen": 1024076928, + "step": 5673 + }, + { + "epoch": 0.6211445305016557, + "grad_norm": 1.3538305403402628, + "learning_rate": 1.5709648118871232e-05, + "loss": 0.8955, + "num_input_tokens_seen": 1024263968, + "step": 5674 + }, + { + "epoch": 0.6212540025725937, + "grad_norm": 1.2472319472079394, + "learning_rate": 1.5701665816776385e-05, + "loss": 0.7904, + "num_input_tokens_seen": 1024433760, + "step": 5675 + }, + { + "epoch": 0.6213634746435316, + "grad_norm": 1.35942042483147, + "learning_rate": 1.5693684614656697e-05, + "loss": 0.8252, + "num_input_tokens_seen": 1024613856, + "step": 5676 + }, + { + "epoch": 0.6214729467144695, + "grad_norm": 1.2600731762572404, + "learning_rate": 1.568570451345632e-05, + "loss": 0.6227, + "num_input_tokens_seen": 1024800672, + "step": 5677 + }, + { + "epoch": 0.6215824187854073, + "grad_norm": 1.431806734240239, + "learning_rate": 1.567772551411931e-05, + "loss": 0.9248, + "num_input_tokens_seen": 1024983232, + "step": 5678 + }, + { + "epoch": 0.6216918908563452, + "grad_norm": 1.2515179953597753, + "learning_rate": 1.5669747617589535e-05, + "loss": 0.7289, + "num_input_tokens_seen": 1025174528, + "step": 5679 + }, + { + "epoch": 0.6218013629272832, + "grad_norm": 1.292188011086304, + "learning_rate": 1.5661770824810785e-05, + "loss": 0.6663, + "num_input_tokens_seen": 1025353056, + "step": 5680 + }, + { + "epoch": 0.6219108349982211, + "grad_norm": 1.1719218944067757, + "learning_rate": 1.5653795136726705e-05, + "loss": 0.6899, + "num_input_tokens_seen": 1025542784, + "step": 5681 + }, + { + "epoch": 0.622020307069159, + "grad_norm": 1.2631692019226293, + "learning_rate": 1.5645820554280783e-05, + "loss": 0.8408, + "num_input_tokens_seen": 1025711456, + "step": 5682 + }, + { + "epoch": 0.6221297791400969, + "grad_norm": 1.3060164393373217, + "learning_rate": 1.5637847078416413e-05, + "loss": 0.8344, + "num_input_tokens_seen": 1025916640, + "step": 5683 + }, + { + "epoch": 0.6222392512110347, + "grad_norm": 1.3769400088720394, + "learning_rate": 1.562987471007683e-05, + "loss": 0.9935, + "num_input_tokens_seen": 1026105696, + "step": 5684 + }, + { + "epoch": 0.6223487232819727, + "grad_norm": 1.409583020136467, + "learning_rate": 1.5621903450205162e-05, + "loss": 0.8621, + "num_input_tokens_seen": 1026285792, + "step": 5685 + }, + { + "epoch": 0.6224581953529106, + "grad_norm": 1.3653619840302416, + "learning_rate": 1.561393329974438e-05, + "loss": 0.8591, + "num_input_tokens_seen": 1026441472, + "step": 5686 + }, + { + "epoch": 0.6225676674238485, + "grad_norm": 1.3965716568140254, + "learning_rate": 1.560596425963735e-05, + "loss": 0.8509, + "num_input_tokens_seen": 1026596032, + "step": 5687 + }, + { + "epoch": 0.6226771394947864, + "grad_norm": 1.3614220826222085, + "learning_rate": 1.559799633082679e-05, + "loss": 0.9961, + "num_input_tokens_seen": 1026760448, + "step": 5688 + }, + { + "epoch": 0.6227866115657243, + "grad_norm": 1.1771511575969018, + "learning_rate": 1.559002951425529e-05, + "loss": 0.6539, + "num_input_tokens_seen": 1026942336, + "step": 5689 + }, + { + "epoch": 0.6228960836366622, + "grad_norm": 1.2553220937417835, + "learning_rate": 1.5582063810865315e-05, + "loss": 0.6516, + "num_input_tokens_seen": 1027104736, + "step": 5690 + }, + { + "epoch": 0.6230055557076001, + "grad_norm": 1.3364964258479413, + "learning_rate": 1.557409922159918e-05, + "loss": 0.752, + "num_input_tokens_seen": 1027262208, + "step": 5691 + }, + { + "epoch": 0.623115027778538, + "grad_norm": 1.301251254928526, + "learning_rate": 1.5566135747399097e-05, + "loss": 0.7543, + "num_input_tokens_seen": 1027408032, + "step": 5692 + }, + { + "epoch": 0.6232244998494759, + "grad_norm": 1.2823097219469144, + "learning_rate": 1.555817338920711e-05, + "loss": 0.7811, + "num_input_tokens_seen": 1027578720, + "step": 5693 + }, + { + "epoch": 0.6233339719204138, + "grad_norm": 1.1901143558436083, + "learning_rate": 1.555021214796516e-05, + "loss": 0.6918, + "num_input_tokens_seen": 1027769792, + "step": 5694 + }, + { + "epoch": 0.6234434439913517, + "grad_norm": 1.2240317410579558, + "learning_rate": 1.5542252024615056e-05, + "loss": 0.9691, + "num_input_tokens_seen": 1027987968, + "step": 5695 + }, + { + "epoch": 0.6235529160622896, + "grad_norm": 1.135150522426325, + "learning_rate": 1.5534293020098454e-05, + "loss": 0.7457, + "num_input_tokens_seen": 1028158656, + "step": 5696 + }, + { + "epoch": 0.6236623881332275, + "grad_norm": 1.1830101966676436, + "learning_rate": 1.5526335135356895e-05, + "loss": 0.8696, + "num_input_tokens_seen": 1028331136, + "step": 5697 + }, + { + "epoch": 0.6237718602041654, + "grad_norm": 1.1715979184962804, + "learning_rate": 1.551837837133177e-05, + "loss": 0.8051, + "num_input_tokens_seen": 1028536096, + "step": 5698 + }, + { + "epoch": 0.6238813322751033, + "grad_norm": 1.2058881266249106, + "learning_rate": 1.5510422728964374e-05, + "loss": 0.6651, + "num_input_tokens_seen": 1028689536, + "step": 5699 + }, + { + "epoch": 0.6239908043460413, + "grad_norm": 1.0963023785873702, + "learning_rate": 1.5502468209195815e-05, + "loss": 0.6841, + "num_input_tokens_seen": 1028858880, + "step": 5700 + }, + { + "epoch": 0.6241002764169791, + "grad_norm": 1.2976080879216894, + "learning_rate": 1.549451481296711e-05, + "loss": 1.0211, + "num_input_tokens_seen": 1029048608, + "step": 5701 + }, + { + "epoch": 0.624209748487917, + "grad_norm": 1.1683644928815315, + "learning_rate": 1.548656254121914e-05, + "loss": 0.8262, + "num_input_tokens_seen": 1029253120, + "step": 5702 + }, + { + "epoch": 0.6243192205588549, + "grad_norm": 1.3334867518295759, + "learning_rate": 1.547861139489263e-05, + "loss": 0.6913, + "num_input_tokens_seen": 1029417312, + "step": 5703 + }, + { + "epoch": 0.6244286926297928, + "grad_norm": 1.2753728657956704, + "learning_rate": 1.5470661374928198e-05, + "loss": 0.8695, + "num_input_tokens_seen": 1029605472, + "step": 5704 + }, + { + "epoch": 0.6245381647007308, + "grad_norm": 1.1599070991114033, + "learning_rate": 1.5462712482266296e-05, + "loss": 0.8171, + "num_input_tokens_seen": 1029813120, + "step": 5705 + }, + { + "epoch": 0.6246476367716687, + "grad_norm": 1.0599622151100188, + "learning_rate": 1.545476471784728e-05, + "loss": 0.7275, + "num_input_tokens_seen": 1029999264, + "step": 5706 + }, + { + "epoch": 0.6247571088426065, + "grad_norm": 1.209540761386151, + "learning_rate": 1.544681808261135e-05, + "loss": 0.8385, + "num_input_tokens_seen": 1030182048, + "step": 5707 + }, + { + "epoch": 0.6248665809135444, + "grad_norm": 1.268417057268282, + "learning_rate": 1.5438872577498575e-05, + "loss": 0.6031, + "num_input_tokens_seen": 1030331680, + "step": 5708 + }, + { + "epoch": 0.6249760529844823, + "grad_norm": 1.2888162551114073, + "learning_rate": 1.5430928203448903e-05, + "loss": 0.6648, + "num_input_tokens_seen": 1030484000, + "step": 5709 + }, + { + "epoch": 0.6250855250554203, + "grad_norm": 1.3448980882742112, + "learning_rate": 1.5422984961402125e-05, + "loss": 0.8431, + "num_input_tokens_seen": 1030670592, + "step": 5710 + }, + { + "epoch": 0.6251949971263582, + "grad_norm": 1.1913047872395433, + "learning_rate": 1.541504285229793e-05, + "loss": 0.8321, + "num_input_tokens_seen": 1030849344, + "step": 5711 + }, + { + "epoch": 0.625304469197296, + "grad_norm": 1.3850322945475666, + "learning_rate": 1.5407101877075827e-05, + "loss": 0.8123, + "num_input_tokens_seen": 1031015552, + "step": 5712 + }, + { + "epoch": 0.6254139412682339, + "grad_norm": 1.236460360766674, + "learning_rate": 1.5399162036675245e-05, + "loss": 0.7077, + "num_input_tokens_seen": 1031200800, + "step": 5713 + }, + { + "epoch": 0.6255234133391718, + "grad_norm": 1.2723433585894204, + "learning_rate": 1.5391223332035434e-05, + "loss": 0.6698, + "num_input_tokens_seen": 1031360064, + "step": 5714 + }, + { + "epoch": 0.6256328854101098, + "grad_norm": 1.266649420311285, + "learning_rate": 1.5383285764095534e-05, + "loss": 0.8364, + "num_input_tokens_seen": 1031548896, + "step": 5715 + }, + { + "epoch": 0.6257423574810477, + "grad_norm": 1.4520976218674926, + "learning_rate": 1.5375349333794545e-05, + "loss": 0.739, + "num_input_tokens_seen": 1031703456, + "step": 5716 + }, + { + "epoch": 0.6258518295519856, + "grad_norm": 1.3347263311419233, + "learning_rate": 1.5367414042071333e-05, + "loss": 0.8619, + "num_input_tokens_seen": 1031851968, + "step": 5717 + }, + { + "epoch": 0.6259613016229234, + "grad_norm": 1.3469440734569769, + "learning_rate": 1.5359479889864625e-05, + "loss": 0.9245, + "num_input_tokens_seen": 1032032512, + "step": 5718 + }, + { + "epoch": 0.6260707736938613, + "grad_norm": 1.2442398870086668, + "learning_rate": 1.535154687811301e-05, + "loss": 0.7887, + "num_input_tokens_seen": 1032238592, + "step": 5719 + }, + { + "epoch": 0.6261802457647992, + "grad_norm": 1.0741516092299104, + "learning_rate": 1.534361500775497e-05, + "loss": 0.6371, + "num_input_tokens_seen": 1032438848, + "step": 5720 + }, + { + "epoch": 0.6262897178357372, + "grad_norm": 1.2238363110475206, + "learning_rate": 1.5335684279728798e-05, + "loss": 0.8898, + "num_input_tokens_seen": 1032620736, + "step": 5721 + }, + { + "epoch": 0.6263991899066751, + "grad_norm": 1.3786629956652787, + "learning_rate": 1.5327754694972705e-05, + "loss": 0.8825, + "num_input_tokens_seen": 1032786720, + "step": 5722 + }, + { + "epoch": 0.626508661977613, + "grad_norm": 1.3167755333442097, + "learning_rate": 1.531982625442475e-05, + "loss": 0.8472, + "num_input_tokens_seen": 1032952032, + "step": 5723 + }, + { + "epoch": 0.6266181340485508, + "grad_norm": 1.2592627652800958, + "learning_rate": 1.5311898959022832e-05, + "loss": 0.7449, + "num_input_tokens_seen": 1033103904, + "step": 5724 + }, + { + "epoch": 0.6267276061194887, + "grad_norm": 1.1943248083885867, + "learning_rate": 1.530397280970476e-05, + "loss": 0.6532, + "num_input_tokens_seen": 1033247936, + "step": 5725 + }, + { + "epoch": 0.6268370781904267, + "grad_norm": 1.1640874360761142, + "learning_rate": 1.5296047807408152e-05, + "loss": 0.5987, + "num_input_tokens_seen": 1033415936, + "step": 5726 + }, + { + "epoch": 0.6269465502613646, + "grad_norm": 1.3474916661700154, + "learning_rate": 1.5288123953070552e-05, + "loss": 0.626, + "num_input_tokens_seen": 1033545184, + "step": 5727 + }, + { + "epoch": 0.6270560223323025, + "grad_norm": 1.1700987005580112, + "learning_rate": 1.5280201247629312e-05, + "loss": 0.5331, + "num_input_tokens_seen": 1033710496, + "step": 5728 + }, + { + "epoch": 0.6271654944032403, + "grad_norm": 1.301689403708195, + "learning_rate": 1.527227969202169e-05, + "loss": 0.8359, + "num_input_tokens_seen": 1033872224, + "step": 5729 + }, + { + "epoch": 0.6272749664741782, + "grad_norm": 1.3277369212715084, + "learning_rate": 1.5264359287184783e-05, + "loss": 0.7496, + "num_input_tokens_seen": 1034061952, + "step": 5730 + }, + { + "epoch": 0.6273844385451162, + "grad_norm": 1.2020000301081801, + "learning_rate": 1.5256440034055557e-05, + "loss": 0.7454, + "num_input_tokens_seen": 1034254816, + "step": 5731 + }, + { + "epoch": 0.6274939106160541, + "grad_norm": 1.2030225933138157, + "learning_rate": 1.5248521933570858e-05, + "loss": 0.7045, + "num_input_tokens_seen": 1034429536, + "step": 5732 + }, + { + "epoch": 0.627603382686992, + "grad_norm": 1.1003586704542292, + "learning_rate": 1.5240604986667362e-05, + "loss": 0.6308, + "num_input_tokens_seen": 1034622624, + "step": 5733 + }, + { + "epoch": 0.6277128547579299, + "grad_norm": 1.12461645157904, + "learning_rate": 1.5232689194281652e-05, + "loss": 0.5376, + "num_input_tokens_seen": 1034785024, + "step": 5734 + }, + { + "epoch": 0.6278223268288677, + "grad_norm": 1.3329885851692536, + "learning_rate": 1.5224774557350125e-05, + "loss": 0.8973, + "num_input_tokens_seen": 1034974304, + "step": 5735 + }, + { + "epoch": 0.6279317988998057, + "grad_norm": 1.2214915910635158, + "learning_rate": 1.5216861076809083e-05, + "loss": 0.7951, + "num_input_tokens_seen": 1035155520, + "step": 5736 + }, + { + "epoch": 0.6280412709707436, + "grad_norm": 1.3059796085401443, + "learning_rate": 1.5208948753594677e-05, + "loss": 0.7006, + "num_input_tokens_seen": 1035318368, + "step": 5737 + }, + { + "epoch": 0.6281507430416815, + "grad_norm": 1.337809049249452, + "learning_rate": 1.5201037588642916e-05, + "loss": 0.7764, + "num_input_tokens_seen": 1035498240, + "step": 5738 + }, + { + "epoch": 0.6282602151126194, + "grad_norm": 1.2910365854659567, + "learning_rate": 1.5193127582889677e-05, + "loss": 0.8466, + "num_input_tokens_seen": 1035694240, + "step": 5739 + }, + { + "epoch": 0.6283696871835573, + "grad_norm": 1.2153850956110763, + "learning_rate": 1.5185218737270694e-05, + "loss": 0.7353, + "num_input_tokens_seen": 1035879936, + "step": 5740 + }, + { + "epoch": 0.6284791592544952, + "grad_norm": 1.2735324660387535, + "learning_rate": 1.5177311052721568e-05, + "loss": 0.9621, + "num_input_tokens_seen": 1036058464, + "step": 5741 + }, + { + "epoch": 0.6285886313254331, + "grad_norm": 1.374741790108759, + "learning_rate": 1.5169404530177778e-05, + "loss": 0.8621, + "num_input_tokens_seen": 1036263200, + "step": 5742 + }, + { + "epoch": 0.628698103396371, + "grad_norm": 1.0878406013205844, + "learning_rate": 1.5161499170574629e-05, + "loss": 0.7314, + "num_input_tokens_seen": 1036439264, + "step": 5743 + }, + { + "epoch": 0.6288075754673089, + "grad_norm": 1.3508266584751922, + "learning_rate": 1.515359497484733e-05, + "loss": 0.8541, + "num_input_tokens_seen": 1036611968, + "step": 5744 + }, + { + "epoch": 0.6289170475382468, + "grad_norm": 1.3381798201925832, + "learning_rate": 1.5145691943930914e-05, + "loss": 0.6829, + "num_input_tokens_seen": 1036769216, + "step": 5745 + }, + { + "epoch": 0.6290265196091847, + "grad_norm": 1.278960112297949, + "learning_rate": 1.513779007876031e-05, + "loss": 0.8594, + "num_input_tokens_seen": 1036967008, + "step": 5746 + }, + { + "epoch": 0.6291359916801226, + "grad_norm": 1.272063983120745, + "learning_rate": 1.5129889380270279e-05, + "loss": 0.9295, + "num_input_tokens_seen": 1037176672, + "step": 5747 + }, + { + "epoch": 0.6292454637510605, + "grad_norm": 1.2119297023504834, + "learning_rate": 1.5121989849395465e-05, + "loss": 0.8273, + "num_input_tokens_seen": 1037380288, + "step": 5748 + }, + { + "epoch": 0.6293549358219984, + "grad_norm": 1.2761253588991037, + "learning_rate": 1.5114091487070376e-05, + "loss": 0.8229, + "num_input_tokens_seen": 1037576512, + "step": 5749 + }, + { + "epoch": 0.6294644078929363, + "grad_norm": 1.1979947880136412, + "learning_rate": 1.5106194294229359e-05, + "loss": 0.6341, + "num_input_tokens_seen": 1037746304, + "step": 5750 + }, + { + "epoch": 0.6295738799638743, + "grad_norm": 1.432966297677067, + "learning_rate": 1.5098298271806649e-05, + "loss": 1.0341, + "num_input_tokens_seen": 1037949920, + "step": 5751 + }, + { + "epoch": 0.6296833520348121, + "grad_norm": 1.0815905282820508, + "learning_rate": 1.5090403420736315e-05, + "loss": 0.6469, + "num_input_tokens_seen": 1038119936, + "step": 5752 + }, + { + "epoch": 0.62979282410575, + "grad_norm": 1.2195787302044814, + "learning_rate": 1.5082509741952328e-05, + "loss": 0.6648, + "num_input_tokens_seen": 1038300480, + "step": 5753 + }, + { + "epoch": 0.6299022961766879, + "grad_norm": 1.3760573050076945, + "learning_rate": 1.5074617236388467e-05, + "loss": 1.0703, + "num_input_tokens_seen": 1038505440, + "step": 5754 + }, + { + "epoch": 0.6300117682476258, + "grad_norm": 1.1074613802959659, + "learning_rate": 1.506672590497841e-05, + "loss": 0.8052, + "num_input_tokens_seen": 1038677696, + "step": 5755 + }, + { + "epoch": 0.6301212403185638, + "grad_norm": 1.1727908815159538, + "learning_rate": 1.5058835748655703e-05, + "loss": 0.8284, + "num_input_tokens_seen": 1038899456, + "step": 5756 + }, + { + "epoch": 0.6302307123895017, + "grad_norm": 1.3196900384559218, + "learning_rate": 1.5050946768353708e-05, + "loss": 0.852, + "num_input_tokens_seen": 1039074624, + "step": 5757 + }, + { + "epoch": 0.6303401844604395, + "grad_norm": 1.2745783089801332, + "learning_rate": 1.5043058965005702e-05, + "loss": 0.8474, + "num_input_tokens_seen": 1039276896, + "step": 5758 + }, + { + "epoch": 0.6304496565313774, + "grad_norm": 1.3230040163453511, + "learning_rate": 1.5035172339544781e-05, + "loss": 0.8027, + "num_input_tokens_seen": 1039453856, + "step": 5759 + }, + { + "epoch": 0.6305591286023153, + "grad_norm": 1.1795084735360621, + "learning_rate": 1.5027286892903924e-05, + "loss": 0.7073, + "num_input_tokens_seen": 1039662176, + "step": 5760 + }, + { + "epoch": 0.6306686006732533, + "grad_norm": 0.9986811225735712, + "learning_rate": 1.501940262601596e-05, + "loss": 0.5238, + "num_input_tokens_seen": 1039833536, + "step": 5761 + }, + { + "epoch": 0.6307780727441912, + "grad_norm": 1.2683641438912936, + "learning_rate": 1.5011519539813584e-05, + "loss": 0.8213, + "num_input_tokens_seen": 1040011168, + "step": 5762 + }, + { + "epoch": 0.630887544815129, + "grad_norm": 1.0404222239288745, + "learning_rate": 1.5003637635229361e-05, + "loss": 0.5296, + "num_input_tokens_seen": 1040223072, + "step": 5763 + }, + { + "epoch": 0.6309970168860669, + "grad_norm": 1.3316541121077643, + "learning_rate": 1.4995756913195688e-05, + "loss": 0.9224, + "num_input_tokens_seen": 1040389952, + "step": 5764 + }, + { + "epoch": 0.6311064889570048, + "grad_norm": 1.0409270458217548, + "learning_rate": 1.4987877374644858e-05, + "loss": 0.5188, + "num_input_tokens_seen": 1040558400, + "step": 5765 + }, + { + "epoch": 0.6312159610279428, + "grad_norm": 1.272915458158345, + "learning_rate": 1.4979999020508983e-05, + "loss": 0.7875, + "num_input_tokens_seen": 1040736480, + "step": 5766 + }, + { + "epoch": 0.6313254330988807, + "grad_norm": 1.1756783373839013, + "learning_rate": 1.4972121851720078e-05, + "loss": 0.6848, + "num_input_tokens_seen": 1040884544, + "step": 5767 + }, + { + "epoch": 0.6314349051698186, + "grad_norm": 1.221890486987376, + "learning_rate": 1.4964245869209979e-05, + "loss": 0.8038, + "num_input_tokens_seen": 1041052768, + "step": 5768 + }, + { + "epoch": 0.6315443772407564, + "grad_norm": 1.1000320950074078, + "learning_rate": 1.4956371073910408e-05, + "loss": 0.8989, + "num_input_tokens_seen": 1041249888, + "step": 5769 + }, + { + "epoch": 0.6316538493116943, + "grad_norm": 1.2850486919068842, + "learning_rate": 1.4948497466752943e-05, + "loss": 0.7042, + "num_input_tokens_seen": 1041427296, + "step": 5770 + }, + { + "epoch": 0.6317633213826322, + "grad_norm": 1.390889860497381, + "learning_rate": 1.494062504866901e-05, + "loss": 0.7241, + "num_input_tokens_seen": 1041585888, + "step": 5771 + }, + { + "epoch": 0.6318727934535702, + "grad_norm": 1.334578151559829, + "learning_rate": 1.4932753820589912e-05, + "loss": 1.0199, + "num_input_tokens_seen": 1041787936, + "step": 5772 + }, + { + "epoch": 0.6319822655245081, + "grad_norm": 1.3016995233075823, + "learning_rate": 1.492488378344678e-05, + "loss": 0.6415, + "num_input_tokens_seen": 1041982816, + "step": 5773 + }, + { + "epoch": 0.632091737595446, + "grad_norm": 1.3517083767351905, + "learning_rate": 1.4917014938170648e-05, + "loss": 0.7796, + "num_input_tokens_seen": 1042157984, + "step": 5774 + }, + { + "epoch": 0.6322012096663838, + "grad_norm": 1.10702965197182, + "learning_rate": 1.4909147285692366e-05, + "loss": 0.5959, + "num_input_tokens_seen": 1042354432, + "step": 5775 + }, + { + "epoch": 0.6323106817373217, + "grad_norm": 1.1727011248232095, + "learning_rate": 1.4901280826942665e-05, + "loss": 0.7966, + "num_input_tokens_seen": 1042562752, + "step": 5776 + }, + { + "epoch": 0.6324201538082597, + "grad_norm": 1.15339007962007, + "learning_rate": 1.4893415562852148e-05, + "loss": 0.6455, + "num_input_tokens_seen": 1042744416, + "step": 5777 + }, + { + "epoch": 0.6325296258791976, + "grad_norm": 1.3195625748651256, + "learning_rate": 1.4885551494351242e-05, + "loss": 0.6854, + "num_input_tokens_seen": 1042935264, + "step": 5778 + }, + { + "epoch": 0.6326390979501355, + "grad_norm": 1.1578494685110245, + "learning_rate": 1.4877688622370262e-05, + "loss": 0.8252, + "num_input_tokens_seen": 1043127904, + "step": 5779 + }, + { + "epoch": 0.6327485700210733, + "grad_norm": 1.2302959017981996, + "learning_rate": 1.4869826947839366e-05, + "loss": 0.7803, + "num_input_tokens_seen": 1043299936, + "step": 5780 + }, + { + "epoch": 0.6328580420920112, + "grad_norm": 1.416175317638567, + "learning_rate": 1.4861966471688577e-05, + "loss": 0.7851, + "num_input_tokens_seen": 1043512288, + "step": 5781 + }, + { + "epoch": 0.6329675141629492, + "grad_norm": 1.2843106627302874, + "learning_rate": 1.4854107194847771e-05, + "loss": 0.863, + "num_input_tokens_seen": 1043714560, + "step": 5782 + }, + { + "epoch": 0.6330769862338871, + "grad_norm": 1.086660021272088, + "learning_rate": 1.4846249118246686e-05, + "loss": 0.6618, + "num_input_tokens_seen": 1043872256, + "step": 5783 + }, + { + "epoch": 0.633186458304825, + "grad_norm": 1.219614504819915, + "learning_rate": 1.483839224281493e-05, + "loss": 0.6614, + "num_input_tokens_seen": 1044055936, + "step": 5784 + }, + { + "epoch": 0.6332959303757629, + "grad_norm": 1.142634724576353, + "learning_rate": 1.4830536569481934e-05, + "loss": 0.5432, + "num_input_tokens_seen": 1044238944, + "step": 5785 + }, + { + "epoch": 0.6334054024467007, + "grad_norm": 1.2833354539151858, + "learning_rate": 1.4822682099177035e-05, + "loss": 0.7651, + "num_input_tokens_seen": 1044412320, + "step": 5786 + }, + { + "epoch": 0.6335148745176387, + "grad_norm": 1.2872770023740285, + "learning_rate": 1.4814828832829374e-05, + "loss": 0.6066, + "num_input_tokens_seen": 1044565536, + "step": 5787 + }, + { + "epoch": 0.6336243465885766, + "grad_norm": 1.233370118408811, + "learning_rate": 1.4806976771368006e-05, + "loss": 0.6904, + "num_input_tokens_seen": 1044763328, + "step": 5788 + }, + { + "epoch": 0.6337338186595145, + "grad_norm": 1.3210854609594287, + "learning_rate": 1.4799125915721787e-05, + "loss": 0.6534, + "num_input_tokens_seen": 1044920576, + "step": 5789 + }, + { + "epoch": 0.6338432907304524, + "grad_norm": 1.1935566490930023, + "learning_rate": 1.479127626681947e-05, + "loss": 0.692, + "num_input_tokens_seen": 1045115904, + "step": 5790 + }, + { + "epoch": 0.6339527628013903, + "grad_norm": 1.2455421590174423, + "learning_rate": 1.4783427825589663e-05, + "loss": 0.8748, + "num_input_tokens_seen": 1045308320, + "step": 5791 + }, + { + "epoch": 0.6340622348723282, + "grad_norm": 1.2699042608564368, + "learning_rate": 1.4775580592960808e-05, + "loss": 0.7778, + "num_input_tokens_seen": 1045513280, + "step": 5792 + }, + { + "epoch": 0.6341717069432661, + "grad_norm": 1.265852778153351, + "learning_rate": 1.4767734569861233e-05, + "loss": 0.8257, + "num_input_tokens_seen": 1045706816, + "step": 5793 + }, + { + "epoch": 0.634281179014204, + "grad_norm": 1.0582952324824835, + "learning_rate": 1.4759889757219087e-05, + "loss": 0.6331, + "num_input_tokens_seen": 1045925664, + "step": 5794 + }, + { + "epoch": 0.6343906510851419, + "grad_norm": 1.2811511397019888, + "learning_rate": 1.4752046155962418e-05, + "loss": 0.6771, + "num_input_tokens_seen": 1046105312, + "step": 5795 + }, + { + "epoch": 0.6345001231560798, + "grad_norm": 1.1953043220589323, + "learning_rate": 1.4744203767019088e-05, + "loss": 0.8011, + "num_input_tokens_seen": 1046269280, + "step": 5796 + }, + { + "epoch": 0.6346095952270177, + "grad_norm": 1.2706147719917753, + "learning_rate": 1.4736362591316844e-05, + "loss": 0.8938, + "num_input_tokens_seen": 1046483424, + "step": 5797 + }, + { + "epoch": 0.6347190672979556, + "grad_norm": 1.2486643330894562, + "learning_rate": 1.4728522629783297e-05, + "loss": 0.7271, + "num_input_tokens_seen": 1046633280, + "step": 5798 + }, + { + "epoch": 0.6348285393688935, + "grad_norm": 1.1537809041720848, + "learning_rate": 1.4720683883345876e-05, + "loss": 0.6378, + "num_input_tokens_seen": 1046832416, + "step": 5799 + }, + { + "epoch": 0.6349380114398314, + "grad_norm": 1.2104233665521626, + "learning_rate": 1.4712846352931909e-05, + "loss": 0.8368, + "num_input_tokens_seen": 1047003552, + "step": 5800 + }, + { + "epoch": 0.6350474835107693, + "grad_norm": 1.3654129282659893, + "learning_rate": 1.4705010039468547e-05, + "loss": 0.7684, + "num_input_tokens_seen": 1047169984, + "step": 5801 + }, + { + "epoch": 0.6351569555817073, + "grad_norm": 1.2037786219314586, + "learning_rate": 1.4697174943882821e-05, + "loss": 0.6879, + "num_input_tokens_seen": 1047364864, + "step": 5802 + }, + { + "epoch": 0.6352664276526451, + "grad_norm": 1.3226019968170917, + "learning_rate": 1.4689341067101597e-05, + "loss": 0.9187, + "num_input_tokens_seen": 1047541600, + "step": 5803 + }, + { + "epoch": 0.635375899723583, + "grad_norm": 1.2263820053738796, + "learning_rate": 1.4681508410051615e-05, + "loss": 0.6108, + "num_input_tokens_seen": 1047731776, + "step": 5804 + }, + { + "epoch": 0.6354853717945209, + "grad_norm": 1.180166589324723, + "learning_rate": 1.4673676973659473e-05, + "loss": 0.7885, + "num_input_tokens_seen": 1047918368, + "step": 5805 + }, + { + "epoch": 0.6355948438654588, + "grad_norm": 1.3408606315490883, + "learning_rate": 1.4665846758851593e-05, + "loss": 0.6892, + "num_input_tokens_seen": 1048073600, + "step": 5806 + }, + { + "epoch": 0.6357043159363968, + "grad_norm": 1.293285918734895, + "learning_rate": 1.4658017766554295e-05, + "loss": 0.7689, + "num_input_tokens_seen": 1048244288, + "step": 5807 + }, + { + "epoch": 0.6358137880073347, + "grad_norm": 1.324533025756812, + "learning_rate": 1.4650189997693717e-05, + "loss": 0.6985, + "num_input_tokens_seen": 1048424832, + "step": 5808 + }, + { + "epoch": 0.6359232600782725, + "grad_norm": 1.1692489191506648, + "learning_rate": 1.4642363453195874e-05, + "loss": 0.6583, + "num_input_tokens_seen": 1048605600, + "step": 5809 + }, + { + "epoch": 0.6360327321492104, + "grad_norm": 1.1897010482205026, + "learning_rate": 1.4634538133986647e-05, + "loss": 0.6551, + "num_input_tokens_seen": 1048767776, + "step": 5810 + }, + { + "epoch": 0.6361422042201483, + "grad_norm": 1.1199880324303337, + "learning_rate": 1.4626714040991733e-05, + "loss": 0.7421, + "num_input_tokens_seen": 1048955264, + "step": 5811 + }, + { + "epoch": 0.6362516762910863, + "grad_norm": 1.2682078814694924, + "learning_rate": 1.4618891175136724e-05, + "loss": 1.0403, + "num_input_tokens_seen": 1049141632, + "step": 5812 + }, + { + "epoch": 0.6363611483620242, + "grad_norm": 1.1741851923672117, + "learning_rate": 1.4611069537347032e-05, + "loss": 0.7462, + "num_input_tokens_seen": 1049318592, + "step": 5813 + }, + { + "epoch": 0.636470620432962, + "grad_norm": 1.2143894830410427, + "learning_rate": 1.4603249128547968e-05, + "loss": 0.7804, + "num_input_tokens_seen": 1049490848, + "step": 5814 + }, + { + "epoch": 0.6365800925038999, + "grad_norm": 1.1664550055644705, + "learning_rate": 1.4595429949664647e-05, + "loss": 0.5823, + "num_input_tokens_seen": 1049665344, + "step": 5815 + }, + { + "epoch": 0.6366895645748378, + "grad_norm": 1.1274527726358345, + "learning_rate": 1.4587612001622078e-05, + "loss": 0.8098, + "num_input_tokens_seen": 1049844544, + "step": 5816 + }, + { + "epoch": 0.6367990366457758, + "grad_norm": 1.3101317974940974, + "learning_rate": 1.4579795285345105e-05, + "loss": 0.8392, + "num_input_tokens_seen": 1050024416, + "step": 5817 + }, + { + "epoch": 0.6369085087167137, + "grad_norm": 1.097170033132515, + "learning_rate": 1.457197980175843e-05, + "loss": 0.6096, + "num_input_tokens_seen": 1050204064, + "step": 5818 + }, + { + "epoch": 0.6370179807876516, + "grad_norm": 1.3812906544083585, + "learning_rate": 1.4564165551786608e-05, + "loss": 1.0148, + "num_input_tokens_seen": 1050365120, + "step": 5819 + }, + { + "epoch": 0.6371274528585894, + "grad_norm": 1.3156371544210814, + "learning_rate": 1.455635253635404e-05, + "loss": 0.9983, + "num_input_tokens_seen": 1050573440, + "step": 5820 + }, + { + "epoch": 0.6372369249295273, + "grad_norm": 1.212679489850876, + "learning_rate": 1.454854075638502e-05, + "loss": 0.6563, + "num_input_tokens_seen": 1050714560, + "step": 5821 + }, + { + "epoch": 0.6373463970004652, + "grad_norm": 1.2563746034687795, + "learning_rate": 1.4540730212803633e-05, + "loss": 0.7213, + "num_input_tokens_seen": 1050869568, + "step": 5822 + }, + { + "epoch": 0.6374558690714032, + "grad_norm": 1.222837392045252, + "learning_rate": 1.4532920906533875e-05, + "loss": 0.5928, + "num_input_tokens_seen": 1051051456, + "step": 5823 + }, + { + "epoch": 0.6375653411423411, + "grad_norm": 1.0475135108604474, + "learning_rate": 1.4525112838499567e-05, + "loss": 0.6504, + "num_input_tokens_seen": 1051251488, + "step": 5824 + }, + { + "epoch": 0.637674813213279, + "grad_norm": 1.3203796730785815, + "learning_rate": 1.4517306009624382e-05, + "loss": 0.7007, + "num_input_tokens_seen": 1051432928, + "step": 5825 + }, + { + "epoch": 0.6377842852842168, + "grad_norm": 1.2940517626466395, + "learning_rate": 1.450950042083186e-05, + "loss": 0.7974, + "num_input_tokens_seen": 1051589280, + "step": 5826 + }, + { + "epoch": 0.6378937573551547, + "grad_norm": 1.398779076145336, + "learning_rate": 1.4501696073045382e-05, + "loss": 0.9087, + "num_input_tokens_seen": 1051791776, + "step": 5827 + }, + { + "epoch": 0.6380032294260927, + "grad_norm": 1.1576376140678744, + "learning_rate": 1.4493892967188188e-05, + "loss": 0.67, + "num_input_tokens_seen": 1052000096, + "step": 5828 + }, + { + "epoch": 0.6381127014970306, + "grad_norm": 1.2179919599728115, + "learning_rate": 1.4486091104183364e-05, + "loss": 0.7275, + "num_input_tokens_seen": 1052157344, + "step": 5829 + }, + { + "epoch": 0.6382221735679685, + "grad_norm": 1.2370246263148175, + "learning_rate": 1.4478290484953871e-05, + "loss": 0.6996, + "num_input_tokens_seen": 1052323776, + "step": 5830 + }, + { + "epoch": 0.6383316456389063, + "grad_norm": 1.3053246957959437, + "learning_rate": 1.4470491110422502e-05, + "loss": 0.6478, + "num_input_tokens_seen": 1052471392, + "step": 5831 + }, + { + "epoch": 0.6384411177098442, + "grad_norm": 1.1521939455063646, + "learning_rate": 1.4462692981511906e-05, + "loss": 0.6081, + "num_input_tokens_seen": 1052624608, + "step": 5832 + }, + { + "epoch": 0.6385505897807822, + "grad_norm": 1.1667311627861967, + "learning_rate": 1.4454896099144583e-05, + "loss": 0.8397, + "num_input_tokens_seen": 1052821952, + "step": 5833 + }, + { + "epoch": 0.6386600618517201, + "grad_norm": 1.1435961239390484, + "learning_rate": 1.4447100464242894e-05, + "loss": 0.8159, + "num_input_tokens_seen": 1053036544, + "step": 5834 + }, + { + "epoch": 0.638769533922658, + "grad_norm": 1.265745510439188, + "learning_rate": 1.4439306077729048e-05, + "loss": 0.8028, + "num_input_tokens_seen": 1053226720, + "step": 5835 + }, + { + "epoch": 0.6388790059935959, + "grad_norm": 1.2972605028886994, + "learning_rate": 1.4431512940525102e-05, + "loss": 0.8351, + "num_input_tokens_seen": 1053427424, + "step": 5836 + }, + { + "epoch": 0.6389884780645337, + "grad_norm": 1.4747386345199864, + "learning_rate": 1.4423721053552963e-05, + "loss": 0.8371, + "num_input_tokens_seen": 1053582432, + "step": 5837 + }, + { + "epoch": 0.6390979501354717, + "grad_norm": 1.3678554973855628, + "learning_rate": 1.4415930417734414e-05, + "loss": 0.8339, + "num_input_tokens_seen": 1053770592, + "step": 5838 + }, + { + "epoch": 0.6392074222064096, + "grad_norm": 1.1669471891657912, + "learning_rate": 1.4408141033991064e-05, + "loss": 0.808, + "num_input_tokens_seen": 1053956288, + "step": 5839 + }, + { + "epoch": 0.6393168942773475, + "grad_norm": 1.1809726308708546, + "learning_rate": 1.4400352903244382e-05, + "loss": 0.8097, + "num_input_tokens_seen": 1054167072, + "step": 5840 + }, + { + "epoch": 0.6394263663482854, + "grad_norm": 1.374148321813891, + "learning_rate": 1.4392566026415688e-05, + "loss": 0.7692, + "num_input_tokens_seen": 1054323200, + "step": 5841 + }, + { + "epoch": 0.6395358384192233, + "grad_norm": 1.3075974044841727, + "learning_rate": 1.4384780404426157e-05, + "loss": 0.7929, + "num_input_tokens_seen": 1054516064, + "step": 5842 + }, + { + "epoch": 0.6396453104901612, + "grad_norm": 1.2488080063369567, + "learning_rate": 1.4376996038196807e-05, + "loss": 0.6091, + "num_input_tokens_seen": 1054682272, + "step": 5843 + }, + { + "epoch": 0.6397547825610991, + "grad_norm": 1.3214064360572564, + "learning_rate": 1.4369212928648513e-05, + "loss": 0.8387, + "num_input_tokens_seen": 1054862816, + "step": 5844 + }, + { + "epoch": 0.639864254632037, + "grad_norm": 1.356919669141288, + "learning_rate": 1.4361431076702019e-05, + "loss": 0.8011, + "num_input_tokens_seen": 1055047616, + "step": 5845 + }, + { + "epoch": 0.6399737267029749, + "grad_norm": 1.2311079998719174, + "learning_rate": 1.4353650483277881e-05, + "loss": 0.7982, + "num_input_tokens_seen": 1055242272, + "step": 5846 + }, + { + "epoch": 0.6400831987739128, + "grad_norm": 1.2069080376313406, + "learning_rate": 1.4345871149296552e-05, + "loss": 0.5904, + "num_input_tokens_seen": 1055424384, + "step": 5847 + }, + { + "epoch": 0.6401926708448507, + "grad_norm": 1.4556038335667398, + "learning_rate": 1.433809307567828e-05, + "loss": 0.8858, + "num_input_tokens_seen": 1055622400, + "step": 5848 + }, + { + "epoch": 0.6403021429157886, + "grad_norm": 1.2168957714469093, + "learning_rate": 1.4330316263343224e-05, + "loss": 0.6447, + "num_input_tokens_seen": 1055786144, + "step": 5849 + }, + { + "epoch": 0.6404116149867265, + "grad_norm": 1.1622020011339764, + "learning_rate": 1.432254071321136e-05, + "loss": 0.6204, + "num_input_tokens_seen": 1055974976, + "step": 5850 + }, + { + "epoch": 0.6405210870576644, + "grad_norm": 1.3624539638756312, + "learning_rate": 1.4314766426202507e-05, + "loss": 0.7833, + "num_input_tokens_seen": 1056128416, + "step": 5851 + }, + { + "epoch": 0.6406305591286023, + "grad_norm": 1.1557488773149884, + "learning_rate": 1.430699340323638e-05, + "loss": 0.753, + "num_input_tokens_seen": 1056310976, + "step": 5852 + }, + { + "epoch": 0.6407400311995403, + "grad_norm": 1.2608778191379235, + "learning_rate": 1.429922164523247e-05, + "loss": 0.6965, + "num_input_tokens_seen": 1056462176, + "step": 5853 + }, + { + "epoch": 0.6408495032704781, + "grad_norm": 1.2169816319279232, + "learning_rate": 1.4291451153110202e-05, + "loss": 0.6152, + "num_input_tokens_seen": 1056637344, + "step": 5854 + }, + { + "epoch": 0.640958975341416, + "grad_norm": 1.3473165775086426, + "learning_rate": 1.4283681927788772e-05, + "loss": 0.7361, + "num_input_tokens_seen": 1056802432, + "step": 5855 + }, + { + "epoch": 0.6410684474123539, + "grad_norm": 1.420469406743769, + "learning_rate": 1.4275913970187305e-05, + "loss": 0.9905, + "num_input_tokens_seen": 1057023072, + "step": 5856 + }, + { + "epoch": 0.6411779194832918, + "grad_norm": 1.3142218875269298, + "learning_rate": 1.4268147281224695e-05, + "loss": 0.7575, + "num_input_tokens_seen": 1057219296, + "step": 5857 + }, + { + "epoch": 0.6412873915542298, + "grad_norm": 1.1142880889496984, + "learning_rate": 1.4260381861819755e-05, + "loss": 0.7153, + "num_input_tokens_seen": 1057403872, + "step": 5858 + }, + { + "epoch": 0.6413968636251677, + "grad_norm": 1.2423060138715583, + "learning_rate": 1.4252617712891109e-05, + "loss": 0.6355, + "num_input_tokens_seen": 1057575232, + "step": 5859 + }, + { + "epoch": 0.6415063356961055, + "grad_norm": 1.2629043158306188, + "learning_rate": 1.424485483535724e-05, + "loss": 0.8774, + "num_input_tokens_seen": 1057750176, + "step": 5860 + }, + { + "epoch": 0.6416158077670434, + "grad_norm": 1.2548644780700997, + "learning_rate": 1.4237093230136489e-05, + "loss": 0.7405, + "num_input_tokens_seen": 1057904736, + "step": 5861 + }, + { + "epoch": 0.6417252798379813, + "grad_norm": 1.2981082613604615, + "learning_rate": 1.4229332898147022e-05, + "loss": 0.7912, + "num_input_tokens_seen": 1058098272, + "step": 5862 + }, + { + "epoch": 0.6418347519089193, + "grad_norm": 1.354545168675875, + "learning_rate": 1.4221573840306902e-05, + "loss": 1.1416, + "num_input_tokens_seen": 1058305248, + "step": 5863 + }, + { + "epoch": 0.6419442239798572, + "grad_norm": 1.3110154702752697, + "learning_rate": 1.421381605753397e-05, + "loss": 0.8632, + "num_input_tokens_seen": 1058452416, + "step": 5864 + }, + { + "epoch": 0.642053696050795, + "grad_norm": 1.2927381880409965, + "learning_rate": 1.4206059550745993e-05, + "loss": 0.6574, + "num_input_tokens_seen": 1058604736, + "step": 5865 + }, + { + "epoch": 0.6421631681217329, + "grad_norm": 1.2601703791999863, + "learning_rate": 1.4198304320860534e-05, + "loss": 0.6998, + "num_input_tokens_seen": 1058768256, + "step": 5866 + }, + { + "epoch": 0.6422726401926708, + "grad_norm": 1.2613370808235882, + "learning_rate": 1.4190550368795024e-05, + "loss": 0.6786, + "num_input_tokens_seen": 1058956864, + "step": 5867 + }, + { + "epoch": 0.6423821122636088, + "grad_norm": 1.3043195468317945, + "learning_rate": 1.4182797695466743e-05, + "loss": 0.9602, + "num_input_tokens_seen": 1059140768, + "step": 5868 + }, + { + "epoch": 0.6424915843345467, + "grad_norm": 1.2381866611491157, + "learning_rate": 1.4175046301792816e-05, + "loss": 0.7441, + "num_input_tokens_seen": 1059326240, + "step": 5869 + }, + { + "epoch": 0.6426010564054846, + "grad_norm": 1.3656541345487008, + "learning_rate": 1.4167296188690204e-05, + "loss": 0.8584, + "num_input_tokens_seen": 1059486624, + "step": 5870 + }, + { + "epoch": 0.6427105284764224, + "grad_norm": 1.3963358766789578, + "learning_rate": 1.4159547357075759e-05, + "loss": 0.8884, + "num_input_tokens_seen": 1059651040, + "step": 5871 + }, + { + "epoch": 0.6428200005473603, + "grad_norm": 1.264570008427125, + "learning_rate": 1.4151799807866135e-05, + "loss": 0.8569, + "num_input_tokens_seen": 1059816352, + "step": 5872 + }, + { + "epoch": 0.6429294726182982, + "grad_norm": 1.1195246119279116, + "learning_rate": 1.4144053541977855e-05, + "loss": 0.5537, + "num_input_tokens_seen": 1059978752, + "step": 5873 + }, + { + "epoch": 0.6430389446892362, + "grad_norm": 1.21660479013747, + "learning_rate": 1.4136308560327288e-05, + "loss": 0.6876, + "num_input_tokens_seen": 1060146304, + "step": 5874 + }, + { + "epoch": 0.6431484167601741, + "grad_norm": 1.4759353839924523, + "learning_rate": 1.4128564863830655e-05, + "loss": 0.8544, + "num_input_tokens_seen": 1060300416, + "step": 5875 + }, + { + "epoch": 0.643257888831112, + "grad_norm": 1.3719527985013233, + "learning_rate": 1.4120822453404011e-05, + "loss": 0.8103, + "num_input_tokens_seen": 1060483648, + "step": 5876 + }, + { + "epoch": 0.6433673609020498, + "grad_norm": 1.2986114091697165, + "learning_rate": 1.4113081329963265e-05, + "loss": 0.6578, + "num_input_tokens_seen": 1060632160, + "step": 5877 + }, + { + "epoch": 0.6434768329729877, + "grad_norm": 1.447200058083773, + "learning_rate": 1.4105341494424206e-05, + "loss": 0.9817, + "num_input_tokens_seen": 1060810464, + "step": 5878 + }, + { + "epoch": 0.6435863050439257, + "grad_norm": 1.348138872056666, + "learning_rate": 1.40976029477024e-05, + "loss": 0.8496, + "num_input_tokens_seen": 1061007136, + "step": 5879 + }, + { + "epoch": 0.6436957771148636, + "grad_norm": 1.2685781804141982, + "learning_rate": 1.4089865690713337e-05, + "loss": 0.7507, + "num_input_tokens_seen": 1061191488, + "step": 5880 + }, + { + "epoch": 0.6438052491858015, + "grad_norm": 1.1787382942823974, + "learning_rate": 1.40821297243723e-05, + "loss": 0.8594, + "num_input_tokens_seen": 1061390176, + "step": 5881 + }, + { + "epoch": 0.6439147212567393, + "grad_norm": 1.182270985337023, + "learning_rate": 1.407439504959445e-05, + "loss": 0.8612, + "num_input_tokens_seen": 1061596704, + "step": 5882 + }, + { + "epoch": 0.6440241933276772, + "grad_norm": 1.1214451255826088, + "learning_rate": 1.4066661667294779e-05, + "loss": 0.7493, + "num_input_tokens_seen": 1061791136, + "step": 5883 + }, + { + "epoch": 0.6441336653986152, + "grad_norm": 1.4437190163990976, + "learning_rate": 1.405892957838812e-05, + "loss": 0.9063, + "num_input_tokens_seen": 1061930688, + "step": 5884 + }, + { + "epoch": 0.6442431374695531, + "grad_norm": 1.1778708611348525, + "learning_rate": 1.4051198783789196e-05, + "loss": 0.6925, + "num_input_tokens_seen": 1062122432, + "step": 5885 + }, + { + "epoch": 0.644352609540491, + "grad_norm": 1.203821302779941, + "learning_rate": 1.4043469284412509e-05, + "loss": 1.0481, + "num_input_tokens_seen": 1062326496, + "step": 5886 + }, + { + "epoch": 0.6444620816114289, + "grad_norm": 1.176006561692415, + "learning_rate": 1.4035741081172476e-05, + "loss": 0.6791, + "num_input_tokens_seen": 1062526976, + "step": 5887 + }, + { + "epoch": 0.6445715536823667, + "grad_norm": 1.266718827395015, + "learning_rate": 1.4028014174983295e-05, + "loss": 0.7345, + "num_input_tokens_seen": 1062695424, + "step": 5888 + }, + { + "epoch": 0.6446810257533047, + "grad_norm": 1.3524908945478709, + "learning_rate": 1.402028856675908e-05, + "loss": 0.8422, + "num_input_tokens_seen": 1062898592, + "step": 5889 + }, + { + "epoch": 0.6447904978242426, + "grad_norm": 1.059218106425093, + "learning_rate": 1.4012564257413718e-05, + "loss": 0.693, + "num_input_tokens_seen": 1063102656, + "step": 5890 + }, + { + "epoch": 0.6448999698951805, + "grad_norm": 1.2034618352045674, + "learning_rate": 1.4004841247861011e-05, + "loss": 0.6337, + "num_input_tokens_seen": 1063289248, + "step": 5891 + }, + { + "epoch": 0.6450094419661184, + "grad_norm": 1.242593231157317, + "learning_rate": 1.3997119539014566e-05, + "loss": 0.6774, + "num_input_tokens_seen": 1063457248, + "step": 5892 + }, + { + "epoch": 0.6451189140370563, + "grad_norm": 1.216334957907517, + "learning_rate": 1.3989399131787836e-05, + "loss": 0.8856, + "num_input_tokens_seen": 1063633536, + "step": 5893 + }, + { + "epoch": 0.6452283861079942, + "grad_norm": 1.3076395228033184, + "learning_rate": 1.398168002709416e-05, + "loss": 0.7736, + "num_input_tokens_seen": 1063819904, + "step": 5894 + }, + { + "epoch": 0.6453378581789321, + "grad_norm": 1.2830074631436006, + "learning_rate": 1.3973962225846654e-05, + "loss": 0.8029, + "num_input_tokens_seen": 1063976032, + "step": 5895 + }, + { + "epoch": 0.64544733024987, + "grad_norm": 1.1229348300934583, + "learning_rate": 1.3966245728958355e-05, + "loss": 0.805, + "num_input_tokens_seen": 1064201376, + "step": 5896 + }, + { + "epoch": 0.6455568023208079, + "grad_norm": 1.2358290883971192, + "learning_rate": 1.3958530537342075e-05, + "loss": 1.0171, + "num_input_tokens_seen": 1064413056, + "step": 5897 + }, + { + "epoch": 0.6456662743917458, + "grad_norm": 1.0234463523889572, + "learning_rate": 1.3950816651910537e-05, + "loss": 0.6528, + "num_input_tokens_seen": 1064597184, + "step": 5898 + }, + { + "epoch": 0.6457757464626837, + "grad_norm": 1.2488346866916145, + "learning_rate": 1.3943104073576263e-05, + "loss": 0.7325, + "num_input_tokens_seen": 1064786688, + "step": 5899 + }, + { + "epoch": 0.6458852185336216, + "grad_norm": 1.2229151832445928, + "learning_rate": 1.393539280325164e-05, + "loss": 0.5231, + "num_input_tokens_seen": 1064958944, + "step": 5900 + }, + { + "epoch": 0.6459946906045595, + "grad_norm": 1.2216737840053955, + "learning_rate": 1.3927682841848899e-05, + "loss": 0.7512, + "num_input_tokens_seen": 1065131872, + "step": 5901 + }, + { + "epoch": 0.6461041626754974, + "grad_norm": 1.3330871781976554, + "learning_rate": 1.391997419028011e-05, + "loss": 0.7698, + "num_input_tokens_seen": 1065284192, + "step": 5902 + }, + { + "epoch": 0.6462136347464353, + "grad_norm": 1.1618325809411933, + "learning_rate": 1.3912266849457195e-05, + "loss": 0.7973, + "num_input_tokens_seen": 1065424640, + "step": 5903 + }, + { + "epoch": 0.6463231068173733, + "grad_norm": 1.1213008676002851, + "learning_rate": 1.3904560820291902e-05, + "loss": 0.7326, + "num_input_tokens_seen": 1065624224, + "step": 5904 + }, + { + "epoch": 0.6464325788883111, + "grad_norm": 1.0976819887231954, + "learning_rate": 1.3896856103695866e-05, + "loss": 0.5482, + "num_input_tokens_seen": 1065808576, + "step": 5905 + }, + { + "epoch": 0.646542050959249, + "grad_norm": 1.0995455735230302, + "learning_rate": 1.3889152700580527e-05, + "loss": 0.6962, + "num_input_tokens_seen": 1065998304, + "step": 5906 + }, + { + "epoch": 0.6466515230301869, + "grad_norm": 1.2281005837534695, + "learning_rate": 1.3881450611857181e-05, + "loss": 0.9826, + "num_input_tokens_seen": 1066199232, + "step": 5907 + }, + { + "epoch": 0.6467609951011248, + "grad_norm": 1.222586047804861, + "learning_rate": 1.3873749838436972e-05, + "loss": 0.7421, + "num_input_tokens_seen": 1066403296, + "step": 5908 + }, + { + "epoch": 0.6468704671720628, + "grad_norm": 1.3765223485349722, + "learning_rate": 1.386605038123089e-05, + "loss": 0.8185, + "num_input_tokens_seen": 1066569728, + "step": 5909 + }, + { + "epoch": 0.6469799392430007, + "grad_norm": 1.1870975816397575, + "learning_rate": 1.3858352241149763e-05, + "loss": 0.6906, + "num_input_tokens_seen": 1066757216, + "step": 5910 + }, + { + "epoch": 0.6470894113139385, + "grad_norm": 1.2475045567584095, + "learning_rate": 1.3850655419104267e-05, + "loss": 1.0453, + "num_input_tokens_seen": 1066946944, + "step": 5911 + }, + { + "epoch": 0.6471988833848764, + "grad_norm": 1.2485838020166145, + "learning_rate": 1.3842959916004911e-05, + "loss": 0.7292, + "num_input_tokens_seen": 1067116512, + "step": 5912 + }, + { + "epoch": 0.6473083554558143, + "grad_norm": 1.3344675643941002, + "learning_rate": 1.3835265732762076e-05, + "loss": 0.7697, + "num_input_tokens_seen": 1067283392, + "step": 5913 + }, + { + "epoch": 0.6474178275267523, + "grad_norm": 1.4170309701629675, + "learning_rate": 1.3827572870285963e-05, + "loss": 0.8917, + "num_input_tokens_seen": 1067448928, + "step": 5914 + }, + { + "epoch": 0.6475272995976902, + "grad_norm": 1.120173910977921, + "learning_rate": 1.3819881329486622e-05, + "loss": 0.6908, + "num_input_tokens_seen": 1067642912, + "step": 5915 + }, + { + "epoch": 0.647636771668628, + "grad_norm": 1.0576467747657647, + "learning_rate": 1.3812191111273944e-05, + "loss": 0.5815, + "num_input_tokens_seen": 1067812704, + "step": 5916 + }, + { + "epoch": 0.6477462437395659, + "grad_norm": 1.360968740741929, + "learning_rate": 1.3804502216557675e-05, + "loss": 0.8752, + "num_input_tokens_seen": 1068014080, + "step": 5917 + }, + { + "epoch": 0.6478557158105038, + "grad_norm": 6.305811898748058, + "learning_rate": 1.3796814646247385e-05, + "loss": 0.7348, + "num_input_tokens_seen": 1068196192, + "step": 5918 + }, + { + "epoch": 0.6479651878814418, + "grad_norm": 1.2116503186458882, + "learning_rate": 1.3789128401252502e-05, + "loss": 0.772, + "num_input_tokens_seen": 1068367104, + "step": 5919 + }, + { + "epoch": 0.6480746599523797, + "grad_norm": 1.298242652127404, + "learning_rate": 1.3781443482482314e-05, + "loss": 0.8221, + "num_input_tokens_seen": 1068547424, + "step": 5920 + }, + { + "epoch": 0.6481841320233176, + "grad_norm": 1.2091423757222104, + "learning_rate": 1.37737598908459e-05, + "loss": 0.8589, + "num_input_tokens_seen": 1068746560, + "step": 5921 + }, + { + "epoch": 0.6482936040942554, + "grad_norm": 1.2384358980378796, + "learning_rate": 1.3766077627252233e-05, + "loss": 0.7561, + "num_input_tokens_seen": 1068929568, + "step": 5922 + }, + { + "epoch": 0.6484030761651933, + "grad_norm": 1.2456174316788822, + "learning_rate": 1.3758396692610112e-05, + "loss": 0.7756, + "num_input_tokens_seen": 1069092864, + "step": 5923 + }, + { + "epoch": 0.6485125482361312, + "grad_norm": 1.1999059441581452, + "learning_rate": 1.3750717087828172e-05, + "loss": 0.633, + "num_input_tokens_seen": 1069268928, + "step": 5924 + }, + { + "epoch": 0.6486220203070692, + "grad_norm": 1.1296489872573785, + "learning_rate": 1.3743038813814896e-05, + "loss": 0.6591, + "num_input_tokens_seen": 1069491584, + "step": 5925 + }, + { + "epoch": 0.6487314923780071, + "grad_norm": 1.2634659235245869, + "learning_rate": 1.3735361871478597e-05, + "loss": 0.8802, + "num_input_tokens_seen": 1069716256, + "step": 5926 + }, + { + "epoch": 0.648840964448945, + "grad_norm": 1.2033891264079482, + "learning_rate": 1.3727686261727474e-05, + "loss": 0.9045, + "num_input_tokens_seen": 1069909120, + "step": 5927 + }, + { + "epoch": 0.6489504365198828, + "grad_norm": 1.1823822468524356, + "learning_rate": 1.3720011985469494e-05, + "loss": 0.8696, + "num_input_tokens_seen": 1070080704, + "step": 5928 + }, + { + "epoch": 0.6490599085908207, + "grad_norm": 1.3032929053869877, + "learning_rate": 1.371233904361256e-05, + "loss": 0.897, + "num_input_tokens_seen": 1070278496, + "step": 5929 + }, + { + "epoch": 0.6491693806617587, + "grad_norm": 1.280996157924063, + "learning_rate": 1.370466743706431e-05, + "loss": 0.8333, + "num_input_tokens_seen": 1070482560, + "step": 5930 + }, + { + "epoch": 0.6492788527326966, + "grad_norm": 1.2545811628265677, + "learning_rate": 1.3696997166732328e-05, + "loss": 0.6972, + "num_input_tokens_seen": 1070661312, + "step": 5931 + }, + { + "epoch": 0.6493883248036345, + "grad_norm": 1.2103260096065147, + "learning_rate": 1.3689328233523968e-05, + "loss": 0.8837, + "num_input_tokens_seen": 1070833120, + "step": 5932 + }, + { + "epoch": 0.6494977968745723, + "grad_norm": 1.2130865408742815, + "learning_rate": 1.3681660638346455e-05, + "loss": 0.724, + "num_input_tokens_seen": 1070995296, + "step": 5933 + }, + { + "epoch": 0.6496072689455102, + "grad_norm": 1.2038207581382743, + "learning_rate": 1.3673994382106856e-05, + "loss": 0.8572, + "num_input_tokens_seen": 1071181216, + "step": 5934 + }, + { + "epoch": 0.6497167410164482, + "grad_norm": 1.3730453556318274, + "learning_rate": 1.3666329465712058e-05, + "loss": 0.9961, + "num_input_tokens_seen": 1071372960, + "step": 5935 + }, + { + "epoch": 0.6498262130873861, + "grad_norm": 1.3205470491901286, + "learning_rate": 1.3658665890068836e-05, + "loss": 0.9336, + "num_input_tokens_seen": 1071568960, + "step": 5936 + }, + { + "epoch": 0.649935685158324, + "grad_norm": 1.1788969106538096, + "learning_rate": 1.3651003656083742e-05, + "loss": 0.8987, + "num_input_tokens_seen": 1071787136, + "step": 5937 + }, + { + "epoch": 0.6500451572292619, + "grad_norm": 1.2233069397697598, + "learning_rate": 1.3643342764663225e-05, + "loss": 0.906, + "num_input_tokens_seen": 1071986048, + "step": 5938 + }, + { + "epoch": 0.6501546293001997, + "grad_norm": 1.2274848725732794, + "learning_rate": 1.3635683216713551e-05, + "loss": 0.9414, + "num_input_tokens_seen": 1072163456, + "step": 5939 + }, + { + "epoch": 0.6502641013711377, + "grad_norm": 1.1856706985088352, + "learning_rate": 1.362802501314083e-05, + "loss": 0.6553, + "num_input_tokens_seen": 1072376928, + "step": 5940 + }, + { + "epoch": 0.6503735734420756, + "grad_norm": 1.257969106082584, + "learning_rate": 1.3620368154851008e-05, + "loss": 0.6666, + "num_input_tokens_seen": 1072534400, + "step": 5941 + }, + { + "epoch": 0.6504830455130135, + "grad_norm": 1.2695914386273115, + "learning_rate": 1.361271264274988e-05, + "loss": 0.7073, + "num_input_tokens_seen": 1072709120, + "step": 5942 + }, + { + "epoch": 0.6505925175839514, + "grad_norm": 1.1739252592954657, + "learning_rate": 1.3605058477743077e-05, + "loss": 0.809, + "num_input_tokens_seen": 1072937824, + "step": 5943 + }, + { + "epoch": 0.6507019896548893, + "grad_norm": 1.2123793355146661, + "learning_rate": 1.3597405660736074e-05, + "loss": 0.6617, + "num_input_tokens_seen": 1073138304, + "step": 5944 + }, + { + "epoch": 0.6508114617258272, + "grad_norm": 1.160956161574939, + "learning_rate": 1.3589754192634168e-05, + "loss": 0.6744, + "num_input_tokens_seen": 1073324224, + "step": 5945 + }, + { + "epoch": 0.6509209337967651, + "grad_norm": 1.6682580344133715, + "learning_rate": 1.3582104074342544e-05, + "loss": 1.1437, + "num_input_tokens_seen": 1073459744, + "step": 5946 + }, + { + "epoch": 0.651030405867703, + "grad_norm": 1.281672245322704, + "learning_rate": 1.3574455306766179e-05, + "loss": 0.8746, + "num_input_tokens_seen": 1073674112, + "step": 5947 + }, + { + "epoch": 0.6511398779386409, + "grad_norm": 1.322783375831054, + "learning_rate": 1.3566807890809907e-05, + "loss": 0.7593, + "num_input_tokens_seen": 1073821952, + "step": 5948 + }, + { + "epoch": 0.6512493500095788, + "grad_norm": 1.3140587859946076, + "learning_rate": 1.3559161827378409e-05, + "loss": 0.9731, + "num_input_tokens_seen": 1074013248, + "step": 5949 + }, + { + "epoch": 0.6513588220805167, + "grad_norm": 1.3432212055271626, + "learning_rate": 1.3551517117376195e-05, + "loss": 0.736, + "num_input_tokens_seen": 1074175648, + "step": 5950 + }, + { + "epoch": 0.6514682941514546, + "grad_norm": 1.3684095316326623, + "learning_rate": 1.3543873761707617e-05, + "loss": 0.7391, + "num_input_tokens_seen": 1074358880, + "step": 5951 + }, + { + "epoch": 0.6515777662223925, + "grad_norm": 1.350854191925958, + "learning_rate": 1.3536231761276866e-05, + "loss": 0.795, + "num_input_tokens_seen": 1074542784, + "step": 5952 + }, + { + "epoch": 0.6516872382933304, + "grad_norm": 1.2889979490821686, + "learning_rate": 1.3528591116988e-05, + "loss": 0.7072, + "num_input_tokens_seen": 1074717056, + "step": 5953 + }, + { + "epoch": 0.6517967103642683, + "grad_norm": 1.1720125753392046, + "learning_rate": 1.3520951829744857e-05, + "loss": 0.8962, + "num_input_tokens_seen": 1074873856, + "step": 5954 + }, + { + "epoch": 0.6519061824352063, + "grad_norm": 1.1682661819659739, + "learning_rate": 1.351331390045118e-05, + "loss": 0.592, + "num_input_tokens_seen": 1075040512, + "step": 5955 + }, + { + "epoch": 0.6520156545061441, + "grad_norm": 1.3259784180969119, + "learning_rate": 1.3505677330010505e-05, + "loss": 0.5933, + "num_input_tokens_seen": 1075179616, + "step": 5956 + }, + { + "epoch": 0.652125126577082, + "grad_norm": 1.2187199222081038, + "learning_rate": 1.3498042119326232e-05, + "loss": 0.726, + "num_input_tokens_seen": 1075324096, + "step": 5957 + }, + { + "epoch": 0.6522345986480199, + "grad_norm": 1.2246651035995213, + "learning_rate": 1.3490408269301585e-05, + "loss": 0.8596, + "num_input_tokens_seen": 1075498368, + "step": 5958 + }, + { + "epoch": 0.6523440707189578, + "grad_norm": 1.2972771354332346, + "learning_rate": 1.3482775780839632e-05, + "loss": 0.8312, + "num_input_tokens_seen": 1075679360, + "step": 5959 + }, + { + "epoch": 0.6524535427898958, + "grad_norm": 1.3919338110775155, + "learning_rate": 1.3475144654843302e-05, + "loss": 1.0448, + "num_input_tokens_seen": 1075882304, + "step": 5960 + }, + { + "epoch": 0.6525630148608337, + "grad_norm": 1.3021138098646876, + "learning_rate": 1.346751489221531e-05, + "loss": 1.1132, + "num_input_tokens_seen": 1076095552, + "step": 5961 + }, + { + "epoch": 0.6526724869317715, + "grad_norm": 1.2784051713979727, + "learning_rate": 1.3459886493858282e-05, + "loss": 0.8572, + "num_input_tokens_seen": 1076272512, + "step": 5962 + }, + { + "epoch": 0.6527819590027094, + "grad_norm": 1.1876787753805234, + "learning_rate": 1.3452259460674599e-05, + "loss": 0.8037, + "num_input_tokens_seen": 1076486656, + "step": 5963 + }, + { + "epoch": 0.6528914310736473, + "grad_norm": 1.2842319492395569, + "learning_rate": 1.3444633793566556e-05, + "loss": 0.6937, + "num_input_tokens_seen": 1076696320, + "step": 5964 + }, + { + "epoch": 0.6530009031445853, + "grad_norm": 1.2545451500355749, + "learning_rate": 1.3437009493436243e-05, + "loss": 1.0418, + "num_input_tokens_seen": 1076897920, + "step": 5965 + }, + { + "epoch": 0.6531103752155232, + "grad_norm": 1.218899839922536, + "learning_rate": 1.3429386561185606e-05, + "loss": 0.8658, + "num_input_tokens_seen": 1077070176, + "step": 5966 + }, + { + "epoch": 0.653219847286461, + "grad_norm": 1.225006379869936, + "learning_rate": 1.3421764997716418e-05, + "loss": 0.6483, + "num_input_tokens_seen": 1077226080, + "step": 5967 + }, + { + "epoch": 0.6533293193573989, + "grad_norm": 1.2097366285190154, + "learning_rate": 1.3414144803930284e-05, + "loss": 0.6413, + "num_input_tokens_seen": 1077416256, + "step": 5968 + }, + { + "epoch": 0.6534387914283368, + "grad_norm": 1.186655999458959, + "learning_rate": 1.3406525980728697e-05, + "loss": 0.6472, + "num_input_tokens_seen": 1077597920, + "step": 5969 + }, + { + "epoch": 0.6535482634992748, + "grad_norm": 1.11462706483036, + "learning_rate": 1.3398908529012899e-05, + "loss": 0.6306, + "num_input_tokens_seen": 1077780928, + "step": 5970 + }, + { + "epoch": 0.6536577355702127, + "grad_norm": 1.303789023586133, + "learning_rate": 1.3391292449684067e-05, + "loss": 0.9286, + "num_input_tokens_seen": 1077981856, + "step": 5971 + }, + { + "epoch": 0.6537672076411506, + "grad_norm": 1.2487877212493823, + "learning_rate": 1.3383677743643126e-05, + "loss": 0.6376, + "num_input_tokens_seen": 1078189728, + "step": 5972 + }, + { + "epoch": 0.6538766797120884, + "grad_norm": 1.1479216474550795, + "learning_rate": 1.3376064411790909e-05, + "loss": 0.8157, + "num_input_tokens_seen": 1078378784, + "step": 5973 + }, + { + "epoch": 0.6539861517830263, + "grad_norm": 1.1378708077828816, + "learning_rate": 1.3368452455028052e-05, + "loss": 0.5301, + "num_input_tokens_seen": 1078562016, + "step": 5974 + }, + { + "epoch": 0.6540956238539642, + "grad_norm": 1.194519278427022, + "learning_rate": 1.3360841874255034e-05, + "loss": 0.7936, + "num_input_tokens_seen": 1078744128, + "step": 5975 + }, + { + "epoch": 0.6542050959249022, + "grad_norm": 1.159357431695064, + "learning_rate": 1.3353232670372173e-05, + "loss": 0.6272, + "num_input_tokens_seen": 1078921984, + "step": 5976 + }, + { + "epoch": 0.6543145679958401, + "grad_norm": 1.3200698719894353, + "learning_rate": 1.3345624844279611e-05, + "loss": 0.9028, + "num_input_tokens_seen": 1079100960, + "step": 5977 + }, + { + "epoch": 0.654424040066778, + "grad_norm": 1.2203271393624393, + "learning_rate": 1.3338018396877371e-05, + "loss": 0.7619, + "num_input_tokens_seen": 1079241184, + "step": 5978 + }, + { + "epoch": 0.6545335121377158, + "grad_norm": 1.2776019567425672, + "learning_rate": 1.3330413329065238e-05, + "loss": 0.8663, + "num_input_tokens_seen": 1079394400, + "step": 5979 + }, + { + "epoch": 0.6546429842086537, + "grad_norm": 1.2492438413427347, + "learning_rate": 1.3322809641742917e-05, + "loss": 0.6069, + "num_input_tokens_seen": 1079540672, + "step": 5980 + }, + { + "epoch": 0.6547524562795917, + "grad_norm": 1.341391289726997, + "learning_rate": 1.3315207335809888e-05, + "loss": 0.9535, + "num_input_tokens_seen": 1079689856, + "step": 5981 + }, + { + "epoch": 0.6548619283505296, + "grad_norm": 1.3518613804568962, + "learning_rate": 1.3307606412165491e-05, + "loss": 0.8527, + "num_input_tokens_seen": 1079883616, + "step": 5982 + }, + { + "epoch": 0.6549714004214675, + "grad_norm": 1.4806991035593193, + "learning_rate": 1.3300006871708905e-05, + "loss": 0.8021, + "num_input_tokens_seen": 1080059904, + "step": 5983 + }, + { + "epoch": 0.6550808724924054, + "grad_norm": 1.3216547717168075, + "learning_rate": 1.3292408715339141e-05, + "loss": 0.7427, + "num_input_tokens_seen": 1080251648, + "step": 5984 + }, + { + "epoch": 0.6551903445633432, + "grad_norm": 1.318891132630994, + "learning_rate": 1.3284811943955045e-05, + "loss": 0.7652, + "num_input_tokens_seen": 1080447424, + "step": 5985 + }, + { + "epoch": 0.6552998166342812, + "grad_norm": 1.298322765730102, + "learning_rate": 1.32772165584553e-05, + "loss": 0.8264, + "num_input_tokens_seen": 1080598176, + "step": 5986 + }, + { + "epoch": 0.6554092887052191, + "grad_norm": 1.2640510002150682, + "learning_rate": 1.3269622559738416e-05, + "loss": 0.9441, + "num_input_tokens_seen": 1080771552, + "step": 5987 + }, + { + "epoch": 0.655518760776157, + "grad_norm": 1.21174245352468, + "learning_rate": 1.3262029948702766e-05, + "loss": 0.7342, + "num_input_tokens_seen": 1080969120, + "step": 5988 + }, + { + "epoch": 0.6556282328470949, + "grad_norm": 1.3455618242953753, + "learning_rate": 1.3254438726246537e-05, + "loss": 0.8244, + "num_input_tokens_seen": 1081140704, + "step": 5989 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 1.1570772356351837, + "learning_rate": 1.3246848893267749e-05, + "loss": 0.6652, + "num_input_tokens_seen": 1081303104, + "step": 5990 + }, + { + "epoch": 0.6558471769889707, + "grad_norm": 1.2416212602714767, + "learning_rate": 1.3239260450664275e-05, + "loss": 0.7169, + "num_input_tokens_seen": 1081493280, + "step": 5991 + }, + { + "epoch": 0.6559566490599086, + "grad_norm": 1.2303407148323913, + "learning_rate": 1.3231673399333802e-05, + "loss": 0.6045, + "num_input_tokens_seen": 1081690848, + "step": 5992 + }, + { + "epoch": 0.6560661211308465, + "grad_norm": 1.3573983828480884, + "learning_rate": 1.3224087740173871e-05, + "loss": 0.7478, + "num_input_tokens_seen": 1081861088, + "step": 5993 + }, + { + "epoch": 0.6561755932017844, + "grad_norm": 1.320639846220033, + "learning_rate": 1.3216503474081838e-05, + "loss": 0.6801, + "num_input_tokens_seen": 1081991232, + "step": 5994 + }, + { + "epoch": 0.6562850652727223, + "grad_norm": 1.1987102312605609, + "learning_rate": 1.3208920601954938e-05, + "loss": 0.7745, + "num_input_tokens_seen": 1082214112, + "step": 5995 + }, + { + "epoch": 0.6563945373436602, + "grad_norm": 1.077475033977544, + "learning_rate": 1.320133912469017e-05, + "loss": 0.88, + "num_input_tokens_seen": 1082385472, + "step": 5996 + }, + { + "epoch": 0.6565040094145981, + "grad_norm": 1.227582276766441, + "learning_rate": 1.3193759043184437e-05, + "loss": 0.7173, + "num_input_tokens_seen": 1082553472, + "step": 5997 + }, + { + "epoch": 0.656613481485536, + "grad_norm": 1.3032151097307496, + "learning_rate": 1.318618035833444e-05, + "loss": 0.8104, + "num_input_tokens_seen": 1082733568, + "step": 5998 + }, + { + "epoch": 0.6567229535564739, + "grad_norm": 1.3838130542341724, + "learning_rate": 1.317860307103672e-05, + "loss": 0.8061, + "num_input_tokens_seen": 1082915904, + "step": 5999 + }, + { + "epoch": 0.6568324256274118, + "grad_norm": 1.0571408031049678, + "learning_rate": 1.3171027182187665e-05, + "loss": 0.6742, + "num_input_tokens_seen": 1083104288, + "step": 6000 + }, + { + "epoch": 0.6569418976983498, + "grad_norm": 1.3140549758169555, + "learning_rate": 1.3163452692683465e-05, + "loss": 0.6619, + "num_input_tokens_seen": 1083305888, + "step": 6001 + }, + { + "epoch": 0.6570513697692876, + "grad_norm": 1.2318447109700497, + "learning_rate": 1.3155879603420207e-05, + "loss": 0.6062, + "num_input_tokens_seen": 1083458208, + "step": 6002 + }, + { + "epoch": 0.6571608418402255, + "grad_norm": 1.2162525312626717, + "learning_rate": 1.3148307915293728e-05, + "loss": 0.6019, + "num_input_tokens_seen": 1083650176, + "step": 6003 + }, + { + "epoch": 0.6572703139111634, + "grad_norm": 1.3725396164837806, + "learning_rate": 1.3140737629199787e-05, + "loss": 0.7772, + "num_input_tokens_seen": 1083837216, + "step": 6004 + }, + { + "epoch": 0.6573797859821013, + "grad_norm": 1.2959097061874647, + "learning_rate": 1.3133168746033895e-05, + "loss": 0.8285, + "num_input_tokens_seen": 1083997152, + "step": 6005 + }, + { + "epoch": 0.6574892580530393, + "grad_norm": 1.2703406937251736, + "learning_rate": 1.3125601266691462e-05, + "loss": 0.8265, + "num_input_tokens_seen": 1084179264, + "step": 6006 + }, + { + "epoch": 0.6575987301239771, + "grad_norm": 1.3769394461310809, + "learning_rate": 1.3118035192067702e-05, + "loss": 0.8466, + "num_input_tokens_seen": 1084351968, + "step": 6007 + }, + { + "epoch": 0.657708202194915, + "grad_norm": 1.3415270316080232, + "learning_rate": 1.311047052305766e-05, + "loss": 0.7496, + "num_input_tokens_seen": 1084554464, + "step": 6008 + }, + { + "epoch": 0.6578176742658529, + "grad_norm": 1.0897471067176212, + "learning_rate": 1.310290726055623e-05, + "loss": 0.5124, + "num_input_tokens_seen": 1084725152, + "step": 6009 + }, + { + "epoch": 0.6579271463367908, + "grad_norm": 1.3452282248937468, + "learning_rate": 1.3095345405458115e-05, + "loss": 0.6117, + "num_input_tokens_seen": 1084877248, + "step": 6010 + }, + { + "epoch": 0.6580366184077288, + "grad_norm": 1.3123560781134873, + "learning_rate": 1.30877849586579e-05, + "loss": 0.7874, + "num_input_tokens_seen": 1085038752, + "step": 6011 + }, + { + "epoch": 0.6581460904786667, + "grad_norm": 1.453314573986411, + "learning_rate": 1.3080225921049938e-05, + "loss": 0.6582, + "num_input_tokens_seen": 1085184800, + "step": 6012 + }, + { + "epoch": 0.6582555625496045, + "grad_norm": 1.331459884933135, + "learning_rate": 1.3072668293528467e-05, + "loss": 1.0005, + "num_input_tokens_seen": 1085387968, + "step": 6013 + }, + { + "epoch": 0.6583650346205424, + "grad_norm": 1.2052094868638241, + "learning_rate": 1.306511207698754e-05, + "loss": 0.7605, + "num_input_tokens_seen": 1085574112, + "step": 6014 + }, + { + "epoch": 0.6584745066914803, + "grad_norm": 1.2179147841736666, + "learning_rate": 1.3057557272321036e-05, + "loss": 0.7655, + "num_input_tokens_seen": 1085742112, + "step": 6015 + }, + { + "epoch": 0.6585839787624183, + "grad_norm": 1.2426153441835748, + "learning_rate": 1.3050003880422676e-05, + "loss": 0.7383, + "num_input_tokens_seen": 1085933408, + "step": 6016 + }, + { + "epoch": 0.6586934508333562, + "grad_norm": 1.338027167487777, + "learning_rate": 1.3042451902186012e-05, + "loss": 0.9374, + "num_input_tokens_seen": 1086089312, + "step": 6017 + }, + { + "epoch": 0.6588029229042941, + "grad_norm": 1.3662619809587777, + "learning_rate": 1.303490133850443e-05, + "loss": 0.7916, + "num_input_tokens_seen": 1086225952, + "step": 6018 + }, + { + "epoch": 0.6589123949752319, + "grad_norm": 1.3161560635860379, + "learning_rate": 1.3027352190271136e-05, + "loss": 0.8897, + "num_input_tokens_seen": 1086415008, + "step": 6019 + }, + { + "epoch": 0.6590218670461698, + "grad_norm": 1.2335113691154702, + "learning_rate": 1.3019804458379204e-05, + "loss": 0.6153, + "num_input_tokens_seen": 1086553888, + "step": 6020 + }, + { + "epoch": 0.6591313391171078, + "grad_norm": 1.2128122891055098, + "learning_rate": 1.3012258143721499e-05, + "loss": 0.7156, + "num_input_tokens_seen": 1086725248, + "step": 6021 + }, + { + "epoch": 0.6592408111880457, + "grad_norm": 1.2286341478094742, + "learning_rate": 1.3004713247190736e-05, + "loss": 0.8381, + "num_input_tokens_seen": 1086923936, + "step": 6022 + }, + { + "epoch": 0.6593502832589836, + "grad_norm": 1.2545974110033422, + "learning_rate": 1.2997169769679468e-05, + "loss": 0.6207, + "num_input_tokens_seen": 1087081408, + "step": 6023 + }, + { + "epoch": 0.6594597553299214, + "grad_norm": 1.4368485342859687, + "learning_rate": 1.298962771208007e-05, + "loss": 0.8838, + "num_input_tokens_seen": 1087266208, + "step": 6024 + }, + { + "epoch": 0.6595692274008593, + "grad_norm": 1.3622240049472558, + "learning_rate": 1.2982087075284754e-05, + "loss": 0.8405, + "num_input_tokens_seen": 1087421440, + "step": 6025 + }, + { + "epoch": 0.6596786994717972, + "grad_norm": 1.1599400893722582, + "learning_rate": 1.2974547860185566e-05, + "loss": 0.6128, + "num_input_tokens_seen": 1087608928, + "step": 6026 + }, + { + "epoch": 0.6597881715427352, + "grad_norm": 1.2293342325251415, + "learning_rate": 1.2967010067674362e-05, + "loss": 0.888, + "num_input_tokens_seen": 1087778048, + "step": 6027 + }, + { + "epoch": 0.6598976436136731, + "grad_norm": 1.155568875919702, + "learning_rate": 1.2959473698642888e-05, + "loss": 0.7787, + "num_input_tokens_seen": 1088002944, + "step": 6028 + }, + { + "epoch": 0.660007115684611, + "grad_norm": 1.2506664406898906, + "learning_rate": 1.2951938753982634e-05, + "loss": 0.8233, + "num_input_tokens_seen": 1088193344, + "step": 6029 + }, + { + "epoch": 0.6601165877555488, + "grad_norm": 1.2627910863868066, + "learning_rate": 1.2944405234585005e-05, + "loss": 0.7897, + "num_input_tokens_seen": 1088376352, + "step": 6030 + }, + { + "epoch": 0.6602260598264867, + "grad_norm": 1.3035288708602268, + "learning_rate": 1.293687314134119e-05, + "loss": 0.9002, + "num_input_tokens_seen": 1088548832, + "step": 6031 + }, + { + "epoch": 0.6603355318974247, + "grad_norm": 1.177714738318365, + "learning_rate": 1.2929342475142225e-05, + "loss": 0.7395, + "num_input_tokens_seen": 1088729824, + "step": 6032 + }, + { + "epoch": 0.6604450039683626, + "grad_norm": 1.1640131895130796, + "learning_rate": 1.2921813236878965e-05, + "loss": 0.5672, + "num_input_tokens_seen": 1088920896, + "step": 6033 + }, + { + "epoch": 0.6605544760393005, + "grad_norm": 1.2242642879232675, + "learning_rate": 1.2914285427442102e-05, + "loss": 0.6098, + "num_input_tokens_seen": 1089099424, + "step": 6034 + }, + { + "epoch": 0.6606639481102384, + "grad_norm": 1.3217232745596312, + "learning_rate": 1.290675904772219e-05, + "loss": 0.6151, + "num_input_tokens_seen": 1089247712, + "step": 6035 + }, + { + "epoch": 0.6607734201811762, + "grad_norm": 1.272570074733817, + "learning_rate": 1.2899234098609541e-05, + "loss": 0.8356, + "num_input_tokens_seen": 1089447072, + "step": 6036 + }, + { + "epoch": 0.6608828922521142, + "grad_norm": 1.2325674898963355, + "learning_rate": 1.2891710580994387e-05, + "loss": 0.8525, + "num_input_tokens_seen": 1089637472, + "step": 6037 + }, + { + "epoch": 0.6609923643230521, + "grad_norm": 1.2938333714845642, + "learning_rate": 1.2884188495766709e-05, + "loss": 0.7664, + "num_input_tokens_seen": 1089835488, + "step": 6038 + }, + { + "epoch": 0.66110183639399, + "grad_norm": 1.2951644279045993, + "learning_rate": 1.2876667843816373e-05, + "loss": 1.2275, + "num_input_tokens_seen": 1090055680, + "step": 6039 + }, + { + "epoch": 0.6612113084649279, + "grad_norm": 1.268398824488884, + "learning_rate": 1.2869148626033059e-05, + "loss": 0.8112, + "num_input_tokens_seen": 1090246752, + "step": 6040 + }, + { + "epoch": 0.6613207805358657, + "grad_norm": 1.2608088469076317, + "learning_rate": 1.2861630843306271e-05, + "loss": 0.7249, + "num_input_tokens_seen": 1090412960, + "step": 6041 + }, + { + "epoch": 0.6614302526068037, + "grad_norm": 1.2658563567263716, + "learning_rate": 1.285411449652535e-05, + "loss": 0.7935, + "num_input_tokens_seen": 1090557216, + "step": 6042 + }, + { + "epoch": 0.6615397246777416, + "grad_norm": 1.2173631063965886, + "learning_rate": 1.2846599586579456e-05, + "loss": 0.9119, + "num_input_tokens_seen": 1090734176, + "step": 6043 + }, + { + "epoch": 0.6616491967486795, + "grad_norm": 1.2584106255260357, + "learning_rate": 1.2839086114357617e-05, + "loss": 0.7617, + "num_input_tokens_seen": 1090902848, + "step": 6044 + }, + { + "epoch": 0.6617586688196174, + "grad_norm": 1.3422373752408812, + "learning_rate": 1.2831574080748621e-05, + "loss": 0.9162, + "num_input_tokens_seen": 1091081824, + "step": 6045 + }, + { + "epoch": 0.6618681408905553, + "grad_norm": 1.3405929561600103, + "learning_rate": 1.2824063486641172e-05, + "loss": 0.7773, + "num_input_tokens_seen": 1091254528, + "step": 6046 + }, + { + "epoch": 0.6619776129614932, + "grad_norm": 1.1417445084331395, + "learning_rate": 1.2816554332923714e-05, + "loss": 0.6688, + "num_input_tokens_seen": 1091445152, + "step": 6047 + }, + { + "epoch": 0.6620870850324311, + "grad_norm": 1.3374588986792952, + "learning_rate": 1.28090466204846e-05, + "loss": 0.7595, + "num_input_tokens_seen": 1091625024, + "step": 6048 + }, + { + "epoch": 0.662196557103369, + "grad_norm": 1.2764603946337714, + "learning_rate": 1.2801540350211963e-05, + "loss": 0.8138, + "num_input_tokens_seen": 1091793248, + "step": 6049 + }, + { + "epoch": 0.6623060291743069, + "grad_norm": 1.2365253887724494, + "learning_rate": 1.2794035522993785e-05, + "loss": 0.7339, + "num_input_tokens_seen": 1091982752, + "step": 6050 + }, + { + "epoch": 0.6624155012452448, + "grad_norm": 1.265494210349426, + "learning_rate": 1.2786532139717872e-05, + "loss": 0.7657, + "num_input_tokens_seen": 1092163072, + "step": 6051 + }, + { + "epoch": 0.6625249733161828, + "grad_norm": 1.1508625921129716, + "learning_rate": 1.2779030201271846e-05, + "loss": 0.6465, + "num_input_tokens_seen": 1092331744, + "step": 6052 + }, + { + "epoch": 0.6626344453871206, + "grad_norm": 1.3471504920967918, + "learning_rate": 1.2771529708543203e-05, + "loss": 0.7728, + "num_input_tokens_seen": 1092510496, + "step": 6053 + }, + { + "epoch": 0.6627439174580585, + "grad_norm": 1.246154305352926, + "learning_rate": 1.2764030662419201e-05, + "loss": 0.8976, + "num_input_tokens_seen": 1092694400, + "step": 6054 + }, + { + "epoch": 0.6628533895289964, + "grad_norm": 1.2367911533890596, + "learning_rate": 1.2756533063786991e-05, + "loss": 0.6845, + "num_input_tokens_seen": 1092833280, + "step": 6055 + }, + { + "epoch": 0.6629628615999343, + "grad_norm": 1.2199880718210905, + "learning_rate": 1.2749036913533514e-05, + "loss": 0.8025, + "num_input_tokens_seen": 1092973728, + "step": 6056 + }, + { + "epoch": 0.6630723336708723, + "grad_norm": 1.265387477485768, + "learning_rate": 1.2741542212545549e-05, + "loss": 0.6519, + "num_input_tokens_seen": 1093153152, + "step": 6057 + }, + { + "epoch": 0.6631818057418101, + "grad_norm": 1.2105664084558172, + "learning_rate": 1.2734048961709707e-05, + "loss": 1.0345, + "num_input_tokens_seen": 1093359232, + "step": 6058 + }, + { + "epoch": 0.663291277812748, + "grad_norm": 1.3233054733626715, + "learning_rate": 1.2726557161912425e-05, + "loss": 0.6562, + "num_input_tokens_seen": 1093529248, + "step": 6059 + }, + { + "epoch": 0.6634007498836859, + "grad_norm": 1.1867138619538846, + "learning_rate": 1.2719066814039957e-05, + "loss": 0.8941, + "num_input_tokens_seen": 1093752352, + "step": 6060 + }, + { + "epoch": 0.6635102219546238, + "grad_norm": 1.279845048943113, + "learning_rate": 1.2711577918978417e-05, + "loss": 0.7756, + "num_input_tokens_seen": 1093933792, + "step": 6061 + }, + { + "epoch": 0.6636196940255618, + "grad_norm": 1.2804736599877502, + "learning_rate": 1.270409047761372e-05, + "loss": 0.7306, + "num_input_tokens_seen": 1094094624, + "step": 6062 + }, + { + "epoch": 0.6637291660964997, + "grad_norm": 1.2777162061359895, + "learning_rate": 1.2696604490831609e-05, + "loss": 0.7176, + "num_input_tokens_seen": 1094254336, + "step": 6063 + }, + { + "epoch": 0.6638386381674375, + "grad_norm": 1.1581343168819567, + "learning_rate": 1.268911995951767e-05, + "loss": 0.6973, + "num_input_tokens_seen": 1094463552, + "step": 6064 + }, + { + "epoch": 0.6639481102383754, + "grad_norm": 1.2507721900016138, + "learning_rate": 1.2681636884557307e-05, + "loss": 0.9368, + "num_input_tokens_seen": 1094674784, + "step": 6065 + }, + { + "epoch": 0.6640575823093133, + "grad_norm": 1.1992057277705523, + "learning_rate": 1.2674155266835753e-05, + "loss": 0.5613, + "num_input_tokens_seen": 1094836960, + "step": 6066 + }, + { + "epoch": 0.6641670543802513, + "grad_norm": 1.2821273126126653, + "learning_rate": 1.2666675107238052e-05, + "loss": 0.9068, + "num_input_tokens_seen": 1094999136, + "step": 6067 + }, + { + "epoch": 0.6642765264511892, + "grad_norm": 1.2077264851369727, + "learning_rate": 1.2659196406649132e-05, + "loss": 0.904, + "num_input_tokens_seen": 1095162432, + "step": 6068 + }, + { + "epoch": 0.6643859985221271, + "grad_norm": 1.3221289875424456, + "learning_rate": 1.2651719165953666e-05, + "loss": 0.7755, + "num_input_tokens_seen": 1095314528, + "step": 6069 + }, + { + "epoch": 0.6644954705930649, + "grad_norm": 1.3138527483917222, + "learning_rate": 1.2644243386036234e-05, + "loss": 0.9319, + "num_input_tokens_seen": 1095510080, + "step": 6070 + }, + { + "epoch": 0.6646049426640028, + "grad_norm": 1.208977768591855, + "learning_rate": 1.2636769067781173e-05, + "loss": 0.6669, + "num_input_tokens_seen": 1095724896, + "step": 6071 + }, + { + "epoch": 0.6647144147349408, + "grad_norm": 1.181055450565375, + "learning_rate": 1.2629296212072703e-05, + "loss": 0.7786, + "num_input_tokens_seen": 1095908128, + "step": 6072 + }, + { + "epoch": 0.6648238868058787, + "grad_norm": 1.4493012520474828, + "learning_rate": 1.2621824819794845e-05, + "loss": 0.9956, + "num_input_tokens_seen": 1096087328, + "step": 6073 + }, + { + "epoch": 0.6649333588768166, + "grad_norm": 1.150592054468335, + "learning_rate": 1.2614354891831437e-05, + "loss": 0.6956, + "num_input_tokens_seen": 1096290272, + "step": 6074 + }, + { + "epoch": 0.6650428309477544, + "grad_norm": 1.3019414544691221, + "learning_rate": 1.2606886429066186e-05, + "loss": 0.7308, + "num_input_tokens_seen": 1096457152, + "step": 6075 + }, + { + "epoch": 0.6651523030186923, + "grad_norm": 1.310183296993549, + "learning_rate": 1.2599419432382561e-05, + "loss": 1.0721, + "num_input_tokens_seen": 1096669952, + "step": 6076 + }, + { + "epoch": 0.6652617750896302, + "grad_norm": 1.1720009799634201, + "learning_rate": 1.259195390266393e-05, + "loss": 0.8268, + "num_input_tokens_seen": 1096871776, + "step": 6077 + }, + { + "epoch": 0.6653712471605682, + "grad_norm": 1.1663363477299642, + "learning_rate": 1.2584489840793414e-05, + "loss": 0.7313, + "num_input_tokens_seen": 1097084128, + "step": 6078 + }, + { + "epoch": 0.6654807192315061, + "grad_norm": 1.429739112130851, + "learning_rate": 1.2577027247654033e-05, + "loss": 0.8143, + "num_input_tokens_seen": 1097262880, + "step": 6079 + }, + { + "epoch": 0.665590191302444, + "grad_norm": 1.3119700133804588, + "learning_rate": 1.2569566124128563e-05, + "loss": 0.8541, + "num_input_tokens_seen": 1097426400, + "step": 6080 + }, + { + "epoch": 0.6656996633733818, + "grad_norm": 1.2070427557634518, + "learning_rate": 1.2562106471099667e-05, + "loss": 0.7082, + "num_input_tokens_seen": 1097608064, + "step": 6081 + }, + { + "epoch": 0.6658091354443197, + "grad_norm": 1.0871439756588546, + "learning_rate": 1.2554648289449798e-05, + "loss": 0.6727, + "num_input_tokens_seen": 1097815264, + "step": 6082 + }, + { + "epoch": 0.6659186075152577, + "grad_norm": 1.1854105184403434, + "learning_rate": 1.254719158006124e-05, + "loss": 0.653, + "num_input_tokens_seen": 1097969824, + "step": 6083 + }, + { + "epoch": 0.6660280795861956, + "grad_norm": 1.3791215852743386, + "learning_rate": 1.253973634381612e-05, + "loss": 0.8874, + "num_input_tokens_seen": 1098132224, + "step": 6084 + }, + { + "epoch": 0.6661375516571335, + "grad_norm": 1.058126874495061, + "learning_rate": 1.2532282581596355e-05, + "loss": 0.5278, + "num_input_tokens_seen": 1098320832, + "step": 6085 + }, + { + "epoch": 0.6662470237280714, + "grad_norm": 1.2422933473415203, + "learning_rate": 1.2524830294283748e-05, + "loss": 0.9085, + "num_input_tokens_seen": 1098487040, + "step": 6086 + }, + { + "epoch": 0.6663564957990092, + "grad_norm": 1.2297062524628357, + "learning_rate": 1.251737948275985e-05, + "loss": 0.8951, + "num_input_tokens_seen": 1098690208, + "step": 6087 + }, + { + "epoch": 0.6664659678699472, + "grad_norm": 1.375952780675759, + "learning_rate": 1.2509930147906107e-05, + "loss": 0.9142, + "num_input_tokens_seen": 1098881728, + "step": 6088 + }, + { + "epoch": 0.6665754399408851, + "grad_norm": 1.233174379948385, + "learning_rate": 1.2502482290603748e-05, + "loss": 0.8937, + "num_input_tokens_seen": 1099098336, + "step": 6089 + }, + { + "epoch": 0.666684912011823, + "grad_norm": 1.2842881537635071, + "learning_rate": 1.2495035911733844e-05, + "loss": 0.9336, + "num_input_tokens_seen": 1099292544, + "step": 6090 + }, + { + "epoch": 0.6667943840827609, + "grad_norm": 1.2520973253446854, + "learning_rate": 1.2487591012177285e-05, + "loss": 0.911, + "num_input_tokens_seen": 1099497952, + "step": 6091 + }, + { + "epoch": 0.6669038561536987, + "grad_norm": 1.3112685920990044, + "learning_rate": 1.2480147592814791e-05, + "loss": 0.8022, + "num_input_tokens_seen": 1099647584, + "step": 6092 + }, + { + "epoch": 0.6670133282246367, + "grad_norm": 1.2437883534101022, + "learning_rate": 1.2472705654526904e-05, + "loss": 0.635, + "num_input_tokens_seen": 1099802144, + "step": 6093 + }, + { + "epoch": 0.6671228002955746, + "grad_norm": 1.4404262115953823, + "learning_rate": 1.2465265198193977e-05, + "loss": 1.0026, + "num_input_tokens_seen": 1099975744, + "step": 6094 + }, + { + "epoch": 0.6672322723665125, + "grad_norm": 1.29005920717854, + "learning_rate": 1.2457826224696225e-05, + "loss": 0.7759, + "num_input_tokens_seen": 1100165696, + "step": 6095 + }, + { + "epoch": 0.6673417444374504, + "grad_norm": 1.2079423841984729, + "learning_rate": 1.2450388734913657e-05, + "loss": 0.983, + "num_input_tokens_seen": 1100363040, + "step": 6096 + }, + { + "epoch": 0.6674512165083883, + "grad_norm": 1.2023154668722298, + "learning_rate": 1.2442952729726109e-05, + "loss": 0.6779, + "num_input_tokens_seen": 1100553888, + "step": 6097 + }, + { + "epoch": 0.6675606885793262, + "grad_norm": 1.2394430684890834, + "learning_rate": 1.2435518210013248e-05, + "loss": 0.7319, + "num_input_tokens_seen": 1100712032, + "step": 6098 + }, + { + "epoch": 0.6676701606502641, + "grad_norm": 1.2238368954848018, + "learning_rate": 1.2428085176654563e-05, + "loss": 0.7459, + "num_input_tokens_seen": 1100894144, + "step": 6099 + }, + { + "epoch": 0.667779632721202, + "grad_norm": 1.2999024042761846, + "learning_rate": 1.2420653630529369e-05, + "loss": 0.9332, + "num_input_tokens_seen": 1101069088, + "step": 6100 + }, + { + "epoch": 0.6678891047921399, + "grad_norm": 1.1566557816553, + "learning_rate": 1.2413223572516802e-05, + "loss": 0.6245, + "num_input_tokens_seen": 1101256352, + "step": 6101 + }, + { + "epoch": 0.6679985768630778, + "grad_norm": 1.436623845328877, + "learning_rate": 1.2405795003495819e-05, + "loss": 0.7321, + "num_input_tokens_seen": 1101434208, + "step": 6102 + }, + { + "epoch": 0.6681080489340158, + "grad_norm": 1.2254703786407002, + "learning_rate": 1.2398367924345213e-05, + "loss": 0.8827, + "num_input_tokens_seen": 1101591232, + "step": 6103 + }, + { + "epoch": 0.6682175210049536, + "grad_norm": 1.3029770289143312, + "learning_rate": 1.2390942335943597e-05, + "loss": 0.7978, + "num_input_tokens_seen": 1101766624, + "step": 6104 + }, + { + "epoch": 0.6683269930758915, + "grad_norm": 1.1296791677840412, + "learning_rate": 1.2383518239169397e-05, + "loss": 0.7489, + "num_input_tokens_seen": 1101974496, + "step": 6105 + }, + { + "epoch": 0.6684364651468294, + "grad_norm": 1.1989404311888758, + "learning_rate": 1.2376095634900872e-05, + "loss": 0.7039, + "num_input_tokens_seen": 1102181696, + "step": 6106 + }, + { + "epoch": 0.6685459372177673, + "grad_norm": 1.1530387205187973, + "learning_rate": 1.2368674524016099e-05, + "loss": 0.6993, + "num_input_tokens_seen": 1102364032, + "step": 6107 + }, + { + "epoch": 0.6686554092887053, + "grad_norm": 1.4089332412650626, + "learning_rate": 1.2361254907392983e-05, + "loss": 0.8255, + "num_input_tokens_seen": 1102528000, + "step": 6108 + }, + { + "epoch": 0.6687648813596431, + "grad_norm": 1.2422048219636639, + "learning_rate": 1.235383678590924e-05, + "loss": 0.7664, + "num_input_tokens_seen": 1102719296, + "step": 6109 + }, + { + "epoch": 0.668874353430581, + "grad_norm": 1.222399602406622, + "learning_rate": 1.2346420160442449e-05, + "loss": 0.8007, + "num_input_tokens_seen": 1102883040, + "step": 6110 + }, + { + "epoch": 0.6689838255015189, + "grad_norm": 1.3788797521881913, + "learning_rate": 1.2339005031869947e-05, + "loss": 0.7223, + "num_input_tokens_seen": 1103021248, + "step": 6111 + }, + { + "epoch": 0.6690932975724568, + "grad_norm": 1.382804094709293, + "learning_rate": 1.2331591401068961e-05, + "loss": 0.8199, + "num_input_tokens_seen": 1103195296, + "step": 6112 + }, + { + "epoch": 0.6692027696433948, + "grad_norm": 1.2870598235020279, + "learning_rate": 1.2324179268916478e-05, + "loss": 0.7567, + "num_input_tokens_seen": 1103384576, + "step": 6113 + }, + { + "epoch": 0.6693122417143327, + "grad_norm": 1.3276172340481278, + "learning_rate": 1.2316768636289364e-05, + "loss": 0.8462, + "num_input_tokens_seen": 1103582368, + "step": 6114 + }, + { + "epoch": 0.6694217137852705, + "grad_norm": 1.1556458828134262, + "learning_rate": 1.2309359504064274e-05, + "loss": 0.7676, + "num_input_tokens_seen": 1103787552, + "step": 6115 + }, + { + "epoch": 0.6695311858562084, + "grad_norm": 1.245642505605047, + "learning_rate": 1.2301951873117687e-05, + "loss": 0.7054, + "num_input_tokens_seen": 1103959360, + "step": 6116 + }, + { + "epoch": 0.6696406579271463, + "grad_norm": 1.2175831711107388, + "learning_rate": 1.2294545744325935e-05, + "loss": 0.8366, + "num_input_tokens_seen": 1104164992, + "step": 6117 + }, + { + "epoch": 0.6697501299980843, + "grad_norm": 1.2701466169354518, + "learning_rate": 1.2287141118565116e-05, + "loss": 0.7889, + "num_input_tokens_seen": 1104357408, + "step": 6118 + }, + { + "epoch": 0.6698596020690222, + "grad_norm": 1.269734574688587, + "learning_rate": 1.2279737996711216e-05, + "loss": 0.7905, + "num_input_tokens_seen": 1104539744, + "step": 6119 + }, + { + "epoch": 0.6699690741399601, + "grad_norm": 1.2012623822609099, + "learning_rate": 1.2272336379639978e-05, + "loss": 0.6631, + "num_input_tokens_seen": 1104711776, + "step": 6120 + }, + { + "epoch": 0.6700785462108979, + "grad_norm": 1.2144473002414644, + "learning_rate": 1.2264936268227032e-05, + "loss": 0.9891, + "num_input_tokens_seen": 1104939584, + "step": 6121 + }, + { + "epoch": 0.6701880182818358, + "grad_norm": 1.3759603614476053, + "learning_rate": 1.2257537663347763e-05, + "loss": 0.8852, + "num_input_tokens_seen": 1105110720, + "step": 6122 + }, + { + "epoch": 0.6702974903527738, + "grad_norm": 1.0801706671087177, + "learning_rate": 1.2250140565877438e-05, + "loss": 0.6681, + "num_input_tokens_seen": 1105286112, + "step": 6123 + }, + { + "epoch": 0.6704069624237117, + "grad_norm": 1.3071673462053024, + "learning_rate": 1.2242744976691109e-05, + "loss": 0.9507, + "num_input_tokens_seen": 1105461728, + "step": 6124 + }, + { + "epoch": 0.6705164344946496, + "grad_norm": 1.2096618331910023, + "learning_rate": 1.2235350896663662e-05, + "loss": 0.5057, + "num_input_tokens_seen": 1105647648, + "step": 6125 + }, + { + "epoch": 0.6706259065655874, + "grad_norm": 1.2688967483666438, + "learning_rate": 1.2227958326669803e-05, + "loss": 1.0572, + "num_input_tokens_seen": 1105846784, + "step": 6126 + }, + { + "epoch": 0.6707353786365253, + "grad_norm": 1.1178981414825881, + "learning_rate": 1.2220567267584048e-05, + "loss": 0.7698, + "num_input_tokens_seen": 1106015904, + "step": 6127 + }, + { + "epoch": 0.6708448507074632, + "grad_norm": 1.2246389187742843, + "learning_rate": 1.2213177720280764e-05, + "loss": 0.7536, + "num_input_tokens_seen": 1106208992, + "step": 6128 + }, + { + "epoch": 0.6709543227784012, + "grad_norm": 1.0438742832039314, + "learning_rate": 1.2205789685634112e-05, + "loss": 0.7414, + "num_input_tokens_seen": 1106425152, + "step": 6129 + }, + { + "epoch": 0.6710637948493391, + "grad_norm": 1.1813661871389558, + "learning_rate": 1.2198403164518083e-05, + "loss": 0.8358, + "num_input_tokens_seen": 1106597408, + "step": 6130 + }, + { + "epoch": 0.671173266920277, + "grad_norm": 1.1537885498682185, + "learning_rate": 1.2191018157806485e-05, + "loss": 0.807, + "num_input_tokens_seen": 1106776832, + "step": 6131 + }, + { + "epoch": 0.6712827389912148, + "grad_norm": 1.3581503251405278, + "learning_rate": 1.2183634666372954e-05, + "loss": 0.9014, + "num_input_tokens_seen": 1106939680, + "step": 6132 + }, + { + "epoch": 0.6713922110621527, + "grad_norm": 1.333169445257339, + "learning_rate": 1.2176252691090939e-05, + "loss": 0.676, + "num_input_tokens_seen": 1107104544, + "step": 6133 + }, + { + "epoch": 0.6715016831330907, + "grad_norm": 1.2134284210847848, + "learning_rate": 1.2168872232833717e-05, + "loss": 0.6751, + "num_input_tokens_seen": 1107294720, + "step": 6134 + }, + { + "epoch": 0.6716111552040286, + "grad_norm": 1.2483512017856084, + "learning_rate": 1.216149329247437e-05, + "loss": 0.7461, + "num_input_tokens_seen": 1107487584, + "step": 6135 + }, + { + "epoch": 0.6717206272749665, + "grad_norm": 1.2494515646392443, + "learning_rate": 1.2154115870885838e-05, + "loss": 0.7174, + "num_input_tokens_seen": 1107647072, + "step": 6136 + }, + { + "epoch": 0.6718300993459044, + "grad_norm": 1.2628465932101502, + "learning_rate": 1.2146739968940838e-05, + "loss": 0.6933, + "num_input_tokens_seen": 1107805664, + "step": 6137 + }, + { + "epoch": 0.6719395714168422, + "grad_norm": 1.3051942309535873, + "learning_rate": 1.2139365587511927e-05, + "loss": 0.7058, + "num_input_tokens_seen": 1107973664, + "step": 6138 + }, + { + "epoch": 0.6720490434877802, + "grad_norm": 1.3453651745770672, + "learning_rate": 1.2131992727471484e-05, + "loss": 0.6256, + "num_input_tokens_seen": 1108133376, + "step": 6139 + }, + { + "epoch": 0.6721585155587181, + "grad_norm": 1.3198000122063376, + "learning_rate": 1.2124621389691702e-05, + "loss": 0.9088, + "num_input_tokens_seen": 1108321760, + "step": 6140 + }, + { + "epoch": 0.672267987629656, + "grad_norm": 1.2859808876254781, + "learning_rate": 1.2117251575044594e-05, + "loss": 0.7775, + "num_input_tokens_seen": 1108490432, + "step": 6141 + }, + { + "epoch": 0.6723774597005939, + "grad_norm": 1.241794644018079, + "learning_rate": 1.2109883284401987e-05, + "loss": 0.7001, + "num_input_tokens_seen": 1108668960, + "step": 6142 + }, + { + "epoch": 0.6724869317715317, + "grad_norm": 1.08463934279056, + "learning_rate": 1.2102516518635568e-05, + "loss": 0.686, + "num_input_tokens_seen": 1108865408, + "step": 6143 + }, + { + "epoch": 0.6725964038424697, + "grad_norm": 1.119086245934941, + "learning_rate": 1.2095151278616768e-05, + "loss": 0.6469, + "num_input_tokens_seen": 1109015936, + "step": 6144 + }, + { + "epoch": 0.6727058759134076, + "grad_norm": 1.2946371510572543, + "learning_rate": 1.208778756521691e-05, + "loss": 1.0076, + "num_input_tokens_seen": 1109195136, + "step": 6145 + }, + { + "epoch": 0.6728153479843455, + "grad_norm": 1.163919119997394, + "learning_rate": 1.20804253793071e-05, + "loss": 0.7697, + "num_input_tokens_seen": 1109403008, + "step": 6146 + }, + { + "epoch": 0.6729248200552834, + "grad_norm": 1.1830473793873622, + "learning_rate": 1.207306472175827e-05, + "loss": 0.6655, + "num_input_tokens_seen": 1109590944, + "step": 6147 + }, + { + "epoch": 0.6730342921262213, + "grad_norm": 1.0754485702561187, + "learning_rate": 1.2065705593441174e-05, + "loss": 0.6112, + "num_input_tokens_seen": 1109775296, + "step": 6148 + }, + { + "epoch": 0.6731437641971592, + "grad_norm": 1.2490038240158878, + "learning_rate": 1.2058347995226365e-05, + "loss": 0.9071, + "num_input_tokens_seen": 1109936576, + "step": 6149 + }, + { + "epoch": 0.6732532362680971, + "grad_norm": 1.3905832948753079, + "learning_rate": 1.2050991927984273e-05, + "loss": 0.9468, + "num_input_tokens_seen": 1110130784, + "step": 6150 + }, + { + "epoch": 0.673362708339035, + "grad_norm": 1.213368983391267, + "learning_rate": 1.2043637392585059e-05, + "loss": 0.6709, + "num_input_tokens_seen": 1110298560, + "step": 6151 + }, + { + "epoch": 0.6734721804099729, + "grad_norm": 1.2657803098729015, + "learning_rate": 1.2036284389898791e-05, + "loss": 0.6217, + "num_input_tokens_seen": 1110473056, + "step": 6152 + }, + { + "epoch": 0.6735816524809108, + "grad_norm": 1.237857252629918, + "learning_rate": 1.2028932920795284e-05, + "loss": 0.6576, + "num_input_tokens_seen": 1110640384, + "step": 6153 + }, + { + "epoch": 0.6736911245518488, + "grad_norm": 1.153129749236626, + "learning_rate": 1.2021582986144229e-05, + "loss": 0.8934, + "num_input_tokens_seen": 1110826304, + "step": 6154 + }, + { + "epoch": 0.6738005966227866, + "grad_norm": 1.2765177818132663, + "learning_rate": 1.2014234586815079e-05, + "loss": 0.8256, + "num_input_tokens_seen": 1111006624, + "step": 6155 + }, + { + "epoch": 0.6739100686937245, + "grad_norm": 1.1327715636124431, + "learning_rate": 1.2006887723677162e-05, + "loss": 0.6484, + "num_input_tokens_seen": 1111193216, + "step": 6156 + }, + { + "epoch": 0.6740195407646624, + "grad_norm": 1.2671176914690339, + "learning_rate": 1.1999542397599589e-05, + "loss": 0.6988, + "num_input_tokens_seen": 1111359424, + "step": 6157 + }, + { + "epoch": 0.6741290128356003, + "grad_norm": 1.463998372598171, + "learning_rate": 1.1992198609451288e-05, + "loss": 0.6204, + "num_input_tokens_seen": 1111534816, + "step": 6158 + }, + { + "epoch": 0.6742384849065383, + "grad_norm": 1.2645784454418827, + "learning_rate": 1.1984856360101045e-05, + "loss": 0.8895, + "num_input_tokens_seen": 1111714464, + "step": 6159 + }, + { + "epoch": 0.6743479569774761, + "grad_norm": 1.1329005766204543, + "learning_rate": 1.1977515650417396e-05, + "loss": 0.7933, + "num_input_tokens_seen": 1111899040, + "step": 6160 + }, + { + "epoch": 0.674457429048414, + "grad_norm": 1.3195571092835068, + "learning_rate": 1.1970176481268766e-05, + "loss": 0.9438, + "num_input_tokens_seen": 1112076448, + "step": 6161 + }, + { + "epoch": 0.6745669011193519, + "grad_norm": 1.293868382884122, + "learning_rate": 1.1962838853523335e-05, + "loss": 0.7082, + "num_input_tokens_seen": 1112230784, + "step": 6162 + }, + { + "epoch": 0.6746763731902898, + "grad_norm": 1.2771058867279652, + "learning_rate": 1.195550276804915e-05, + "loss": 0.698, + "num_input_tokens_seen": 1112420512, + "step": 6163 + }, + { + "epoch": 0.6747858452612278, + "grad_norm": 1.1847258337566329, + "learning_rate": 1.1948168225714051e-05, + "loss": 0.6488, + "num_input_tokens_seen": 1112602400, + "step": 6164 + }, + { + "epoch": 0.6748953173321657, + "grad_norm": 1.3913649401350325, + "learning_rate": 1.1940835227385702e-05, + "loss": 0.7395, + "num_input_tokens_seen": 1112771744, + "step": 6165 + }, + { + "epoch": 0.6750047894031035, + "grad_norm": 1.0816182474154594, + "learning_rate": 1.1933503773931581e-05, + "loss": 0.9476, + "num_input_tokens_seen": 1112982304, + "step": 6166 + }, + { + "epoch": 0.6751142614740414, + "grad_norm": 1.3087060225420395, + "learning_rate": 1.1926173866218984e-05, + "loss": 0.8011, + "num_input_tokens_seen": 1113165984, + "step": 6167 + }, + { + "epoch": 0.6752237335449793, + "grad_norm": 1.2350247278710917, + "learning_rate": 1.1918845505115025e-05, + "loss": 0.947, + "num_input_tokens_seen": 1113344736, + "step": 6168 + }, + { + "epoch": 0.6753332056159173, + "grad_norm": 1.4382088613450128, + "learning_rate": 1.1911518691486626e-05, + "loss": 0.8513, + "num_input_tokens_seen": 1113500416, + "step": 6169 + }, + { + "epoch": 0.6754426776868552, + "grad_norm": 1.237903525326488, + "learning_rate": 1.1904193426200554e-05, + "loss": 0.866, + "num_input_tokens_seen": 1113691040, + "step": 6170 + }, + { + "epoch": 0.6755521497577931, + "grad_norm": 1.1838120363086637, + "learning_rate": 1.1896869710123368e-05, + "loss": 0.9745, + "num_input_tokens_seen": 1113865984, + "step": 6171 + }, + { + "epoch": 0.6756616218287309, + "grad_norm": 1.1875751371203667, + "learning_rate": 1.1889547544121443e-05, + "loss": 0.6319, + "num_input_tokens_seen": 1114042720, + "step": 6172 + }, + { + "epoch": 0.6757710938996688, + "grad_norm": 1.2620300286917512, + "learning_rate": 1.1882226929060982e-05, + "loss": 0.9203, + "num_input_tokens_seen": 1114234688, + "step": 6173 + }, + { + "epoch": 0.6758805659706068, + "grad_norm": 1.4976539544141168, + "learning_rate": 1.1874907865808e-05, + "loss": 0.7321, + "num_input_tokens_seen": 1114382528, + "step": 6174 + }, + { + "epoch": 0.6759900380415447, + "grad_norm": 1.2586318954474378, + "learning_rate": 1.1867590355228326e-05, + "loss": 0.8856, + "num_input_tokens_seen": 1114561056, + "step": 6175 + }, + { + "epoch": 0.6760995101124826, + "grad_norm": 1.099706614990264, + "learning_rate": 1.1860274398187605e-05, + "loss": 0.8121, + "num_input_tokens_seen": 1114727936, + "step": 6176 + }, + { + "epoch": 0.6762089821834204, + "grad_norm": 1.1558496967927654, + "learning_rate": 1.1852959995551297e-05, + "loss": 0.6789, + "num_input_tokens_seen": 1114938048, + "step": 6177 + }, + { + "epoch": 0.6763184542543583, + "grad_norm": 1.1812101730818032, + "learning_rate": 1.1845647148184696e-05, + "loss": 0.9061, + "num_input_tokens_seen": 1115113888, + "step": 6178 + }, + { + "epoch": 0.6764279263252962, + "grad_norm": 1.2257832280976362, + "learning_rate": 1.1838335856952893e-05, + "loss": 0.55, + "num_input_tokens_seen": 1115291296, + "step": 6179 + }, + { + "epoch": 0.6765373983962342, + "grad_norm": 1.196851101629242, + "learning_rate": 1.1831026122720795e-05, + "loss": 0.8134, + "num_input_tokens_seen": 1115462880, + "step": 6180 + }, + { + "epoch": 0.6766468704671721, + "grad_norm": 1.2342427822718713, + "learning_rate": 1.1823717946353134e-05, + "loss": 0.8201, + "num_input_tokens_seen": 1115627968, + "step": 6181 + }, + { + "epoch": 0.67675634253811, + "grad_norm": 1.2074233223873412, + "learning_rate": 1.181641132871445e-05, + "loss": 0.8666, + "num_input_tokens_seen": 1115833824, + "step": 6182 + }, + { + "epoch": 0.6768658146090478, + "grad_norm": 1.1289934170871463, + "learning_rate": 1.1809106270669104e-05, + "loss": 0.8388, + "num_input_tokens_seen": 1116063200, + "step": 6183 + }, + { + "epoch": 0.6769752866799857, + "grad_norm": 1.1555361219345153, + "learning_rate": 1.1801802773081258e-05, + "loss": 0.6171, + "num_input_tokens_seen": 1116205888, + "step": 6184 + }, + { + "epoch": 0.6770847587509237, + "grad_norm": 1.3524083043320467, + "learning_rate": 1.1794500836814933e-05, + "loss": 0.879, + "num_input_tokens_seen": 1116367168, + "step": 6185 + }, + { + "epoch": 0.6771942308218616, + "grad_norm": 1.1894653520127922, + "learning_rate": 1.1787200462733897e-05, + "loss": 0.6222, + "num_input_tokens_seen": 1116566976, + "step": 6186 + }, + { + "epoch": 0.6773037028927995, + "grad_norm": 1.196770122106297, + "learning_rate": 1.1779901651701796e-05, + "loss": 0.7331, + "num_input_tokens_seen": 1116739904, + "step": 6187 + }, + { + "epoch": 0.6774131749637374, + "grad_norm": 1.1758481637157792, + "learning_rate": 1.1772604404582057e-05, + "loss": 0.8141, + "num_input_tokens_seen": 1116943072, + "step": 6188 + }, + { + "epoch": 0.6775226470346752, + "grad_norm": 1.2475906517254634, + "learning_rate": 1.176530872223793e-05, + "loss": 0.6077, + "num_input_tokens_seen": 1117128096, + "step": 6189 + }, + { + "epoch": 0.6776321191056132, + "grad_norm": 1.3573642197035163, + "learning_rate": 1.1758014605532483e-05, + "loss": 0.9152, + "num_input_tokens_seen": 1117299008, + "step": 6190 + }, + { + "epoch": 0.6777415911765511, + "grad_norm": 1.286593482863128, + "learning_rate": 1.1750722055328581e-05, + "loss": 0.6609, + "num_input_tokens_seen": 1117484704, + "step": 6191 + }, + { + "epoch": 0.677851063247489, + "grad_norm": 1.2917396258186222, + "learning_rate": 1.1743431072488952e-05, + "loss": 0.7009, + "num_input_tokens_seen": 1117683840, + "step": 6192 + }, + { + "epoch": 0.6779605353184269, + "grad_norm": 1.3694784969216873, + "learning_rate": 1.1736141657876068e-05, + "loss": 1.1124, + "num_input_tokens_seen": 1117873120, + "step": 6193 + }, + { + "epoch": 0.6780700073893647, + "grad_norm": 1.3298979649805927, + "learning_rate": 1.1728853812352286e-05, + "loss": 0.7856, + "num_input_tokens_seen": 1118009536, + "step": 6194 + }, + { + "epoch": 0.6781794794603027, + "grad_norm": 1.1784302030310632, + "learning_rate": 1.172156753677971e-05, + "loss": 0.6702, + "num_input_tokens_seen": 1118185600, + "step": 6195 + }, + { + "epoch": 0.6782889515312406, + "grad_norm": 1.249137342328086, + "learning_rate": 1.1714282832020318e-05, + "loss": 1.0739, + "num_input_tokens_seen": 1118387872, + "step": 6196 + }, + { + "epoch": 0.6783984236021785, + "grad_norm": 1.2104355294644942, + "learning_rate": 1.170699969893587e-05, + "loss": 0.674, + "num_input_tokens_seen": 1118593504, + "step": 6197 + }, + { + "epoch": 0.6785078956731164, + "grad_norm": 1.2776001839070914, + "learning_rate": 1.1699718138387947e-05, + "loss": 0.8084, + "num_input_tokens_seen": 1118769568, + "step": 6198 + }, + { + "epoch": 0.6786173677440543, + "grad_norm": 1.3135815206235002, + "learning_rate": 1.1692438151237942e-05, + "loss": 0.8557, + "num_input_tokens_seen": 1118948544, + "step": 6199 + }, + { + "epoch": 0.6787268398149922, + "grad_norm": 1.363200975038511, + "learning_rate": 1.1685159738347054e-05, + "loss": 0.9654, + "num_input_tokens_seen": 1119095712, + "step": 6200 + }, + { + "epoch": 0.6788363118859301, + "grad_norm": 1.3156093369953774, + "learning_rate": 1.1677882900576334e-05, + "loss": 0.7937, + "num_input_tokens_seen": 1119273568, + "step": 6201 + }, + { + "epoch": 0.678945783956868, + "grad_norm": 1.2977706390250563, + "learning_rate": 1.1670607638786579e-05, + "loss": 0.925, + "num_input_tokens_seen": 1119467552, + "step": 6202 + }, + { + "epoch": 0.6790552560278059, + "grad_norm": 1.4018538530500528, + "learning_rate": 1.166333395383847e-05, + "loss": 0.7855, + "num_input_tokens_seen": 1119625920, + "step": 6203 + }, + { + "epoch": 0.6791647280987438, + "grad_norm": 1.3064494770090318, + "learning_rate": 1.1656061846592458e-05, + "loss": 0.6556, + "num_input_tokens_seen": 1119830432, + "step": 6204 + }, + { + "epoch": 0.6792742001696818, + "grad_norm": 1.2618042057729717, + "learning_rate": 1.1648791317908822e-05, + "loss": 0.8691, + "num_input_tokens_seen": 1119971552, + "step": 6205 + }, + { + "epoch": 0.6793836722406196, + "grad_norm": 1.3277016357623264, + "learning_rate": 1.164152236864765e-05, + "loss": 0.8292, + "num_input_tokens_seen": 1120157248, + "step": 6206 + }, + { + "epoch": 0.6794931443115575, + "grad_norm": 1.1811805523030665, + "learning_rate": 1.1634254999668842e-05, + "loss": 0.6862, + "num_input_tokens_seen": 1120350112, + "step": 6207 + }, + { + "epoch": 0.6796026163824954, + "grad_norm": 1.2083270083733246, + "learning_rate": 1.162698921183212e-05, + "loss": 0.8896, + "num_input_tokens_seen": 1120559104, + "step": 6208 + }, + { + "epoch": 0.6797120884534333, + "grad_norm": 1.1282324974854532, + "learning_rate": 1.1619725005997007e-05, + "loss": 0.7586, + "num_input_tokens_seen": 1120738304, + "step": 6209 + }, + { + "epoch": 0.6798215605243713, + "grad_norm": 1.3420611125516242, + "learning_rate": 1.1612462383022838e-05, + "loss": 0.9708, + "num_input_tokens_seen": 1120937440, + "step": 6210 + }, + { + "epoch": 0.6799310325953091, + "grad_norm": 1.0344191319942226, + "learning_rate": 1.1605201343768787e-05, + "loss": 0.5127, + "num_input_tokens_seen": 1121113056, + "step": 6211 + }, + { + "epoch": 0.680040504666247, + "grad_norm": 1.1803996986779164, + "learning_rate": 1.1597941889093808e-05, + "loss": 0.7639, + "num_input_tokens_seen": 1121317344, + "step": 6212 + }, + { + "epoch": 0.6801499767371849, + "grad_norm": 1.2346263098513297, + "learning_rate": 1.1590684019856687e-05, + "loss": 0.6956, + "num_input_tokens_seen": 1121484672, + "step": 6213 + }, + { + "epoch": 0.6802594488081228, + "grad_norm": 1.3694662232093315, + "learning_rate": 1.1583427736916008e-05, + "loss": 0.866, + "num_input_tokens_seen": 1121647744, + "step": 6214 + }, + { + "epoch": 0.6803689208790608, + "grad_norm": 1.1350836250822538, + "learning_rate": 1.1576173041130184e-05, + "loss": 0.6814, + "num_input_tokens_seen": 1121851136, + "step": 6215 + }, + { + "epoch": 0.6804783929499987, + "grad_norm": 1.234138417810373, + "learning_rate": 1.1568919933357423e-05, + "loss": 0.8806, + "num_input_tokens_seen": 1122040640, + "step": 6216 + }, + { + "epoch": 0.6805878650209365, + "grad_norm": 1.2636771577845527, + "learning_rate": 1.1561668414455751e-05, + "loss": 0.7274, + "num_input_tokens_seen": 1122231712, + "step": 6217 + }, + { + "epoch": 0.6806973370918744, + "grad_norm": 1.2575164351707004, + "learning_rate": 1.1554418485283033e-05, + "loss": 0.7445, + "num_input_tokens_seen": 1122389632, + "step": 6218 + }, + { + "epoch": 0.6808068091628123, + "grad_norm": 1.3016576249072749, + "learning_rate": 1.1547170146696887e-05, + "loss": 0.6946, + "num_input_tokens_seen": 1122603776, + "step": 6219 + }, + { + "epoch": 0.6809162812337503, + "grad_norm": 1.327869884447717, + "learning_rate": 1.1539923399554805e-05, + "loss": 0.9101, + "num_input_tokens_seen": 1122769760, + "step": 6220 + }, + { + "epoch": 0.6810257533046882, + "grad_norm": 1.244559512443062, + "learning_rate": 1.1532678244714055e-05, + "loss": 0.7528, + "num_input_tokens_seen": 1122940896, + "step": 6221 + }, + { + "epoch": 0.6811352253756261, + "grad_norm": 1.3500705930125463, + "learning_rate": 1.1525434683031718e-05, + "loss": 0.691, + "num_input_tokens_seen": 1123101504, + "step": 6222 + }, + { + "epoch": 0.6812446974465639, + "grad_norm": 1.2328528664752667, + "learning_rate": 1.1518192715364704e-05, + "loss": 0.7209, + "num_input_tokens_seen": 1123290336, + "step": 6223 + }, + { + "epoch": 0.6813541695175018, + "grad_norm": 1.2962650565263418, + "learning_rate": 1.1510952342569708e-05, + "loss": 0.8309, + "num_input_tokens_seen": 1123474912, + "step": 6224 + }, + { + "epoch": 0.6814636415884398, + "grad_norm": 1.2878655627620506, + "learning_rate": 1.1503713565503282e-05, + "loss": 0.7177, + "num_input_tokens_seen": 1123648288, + "step": 6225 + }, + { + "epoch": 0.6815731136593777, + "grad_norm": 1.439112877973551, + "learning_rate": 1.1496476385021723e-05, + "loss": 0.7875, + "num_input_tokens_seen": 1123829728, + "step": 6226 + }, + { + "epoch": 0.6816825857303156, + "grad_norm": 1.2708064321370172, + "learning_rate": 1.148924080198121e-05, + "loss": 0.7414, + "num_input_tokens_seen": 1123987872, + "step": 6227 + }, + { + "epoch": 0.6817920578012534, + "grad_norm": 1.354433835401564, + "learning_rate": 1.1482006817237665e-05, + "loss": 0.7983, + "num_input_tokens_seen": 1124173792, + "step": 6228 + }, + { + "epoch": 0.6819015298721913, + "grad_norm": 1.2103248276834944, + "learning_rate": 1.1474774431646878e-05, + "loss": 0.7431, + "num_input_tokens_seen": 1124341792, + "step": 6229 + }, + { + "epoch": 0.6820110019431292, + "grad_norm": 1.2289847492578523, + "learning_rate": 1.1467543646064424e-05, + "loss": 0.6668, + "num_input_tokens_seen": 1124510912, + "step": 6230 + }, + { + "epoch": 0.6821204740140672, + "grad_norm": 1.1676717016830445, + "learning_rate": 1.1460314461345684e-05, + "loss": 0.843, + "num_input_tokens_seen": 1124711616, + "step": 6231 + }, + { + "epoch": 0.6822299460850051, + "grad_norm": 1.1564947333242754, + "learning_rate": 1.1453086878345862e-05, + "loss": 0.5774, + "num_input_tokens_seen": 1124856992, + "step": 6232 + }, + { + "epoch": 0.682339418155943, + "grad_norm": 1.2962519516373807, + "learning_rate": 1.1445860897919951e-05, + "loss": 0.7167, + "num_input_tokens_seen": 1125007296, + "step": 6233 + }, + { + "epoch": 0.6824488902268808, + "grad_norm": 1.2494155471122668, + "learning_rate": 1.1438636520922807e-05, + "loss": 1.0314, + "num_input_tokens_seen": 1125205088, + "step": 6234 + }, + { + "epoch": 0.6825583622978187, + "grad_norm": 1.0938264002683196, + "learning_rate": 1.1431413748209021e-05, + "loss": 0.594, + "num_input_tokens_seen": 1125390112, + "step": 6235 + }, + { + "epoch": 0.6826678343687567, + "grad_norm": 1.2471503677900453, + "learning_rate": 1.1424192580633067e-05, + "loss": 0.6734, + "num_input_tokens_seen": 1125586560, + "step": 6236 + }, + { + "epoch": 0.6827773064396946, + "grad_norm": 1.1027432870048686, + "learning_rate": 1.1416973019049156e-05, + "loss": 0.6722, + "num_input_tokens_seen": 1125789728, + "step": 6237 + }, + { + "epoch": 0.6828867785106325, + "grad_norm": 1.3034469280025416, + "learning_rate": 1.1409755064311384e-05, + "loss": 0.7834, + "num_input_tokens_seen": 1125974304, + "step": 6238 + }, + { + "epoch": 0.6829962505815704, + "grad_norm": 1.297447262171787, + "learning_rate": 1.1402538717273605e-05, + "loss": 0.6759, + "num_input_tokens_seen": 1126190464, + "step": 6239 + }, + { + "epoch": 0.6831057226525082, + "grad_norm": 1.3187675239881154, + "learning_rate": 1.1395323978789504e-05, + "loss": 0.789, + "num_input_tokens_seen": 1126371232, + "step": 6240 + }, + { + "epoch": 0.6832151947234462, + "grad_norm": 1.2579584451674333, + "learning_rate": 1.138811084971257e-05, + "loss": 0.8276, + "num_input_tokens_seen": 1126569696, + "step": 6241 + }, + { + "epoch": 0.6833246667943841, + "grad_norm": 1.3054054249460965, + "learning_rate": 1.1380899330896086e-05, + "loss": 0.7347, + "num_input_tokens_seen": 1126769280, + "step": 6242 + }, + { + "epoch": 0.683434138865322, + "grad_norm": 1.1681736795029232, + "learning_rate": 1.13736894231932e-05, + "loss": 0.7242, + "num_input_tokens_seen": 1126942880, + "step": 6243 + }, + { + "epoch": 0.6835436109362599, + "grad_norm": 1.1291646166755864, + "learning_rate": 1.1366481127456785e-05, + "loss": 0.8621, + "num_input_tokens_seen": 1127129024, + "step": 6244 + }, + { + "epoch": 0.6836530830071977, + "grad_norm": 1.4827178538981003, + "learning_rate": 1.1359274444539596e-05, + "loss": 0.8514, + "num_input_tokens_seen": 1127304864, + "step": 6245 + }, + { + "epoch": 0.6837625550781357, + "grad_norm": 1.185616756686047, + "learning_rate": 1.1352069375294169e-05, + "loss": 0.7992, + "num_input_tokens_seen": 1127475776, + "step": 6246 + }, + { + "epoch": 0.6838720271490736, + "grad_norm": 1.440421163242786, + "learning_rate": 1.1344865920572842e-05, + "loss": 0.9037, + "num_input_tokens_seen": 1127643552, + "step": 6247 + }, + { + "epoch": 0.6839814992200115, + "grad_norm": 1.1129199596219683, + "learning_rate": 1.1337664081227773e-05, + "loss": 0.6161, + "num_input_tokens_seen": 1127804832, + "step": 6248 + }, + { + "epoch": 0.6840909712909494, + "grad_norm": 1.1741671207572397, + "learning_rate": 1.1330463858110927e-05, + "loss": 0.952, + "num_input_tokens_seen": 1127989856, + "step": 6249 + }, + { + "epoch": 0.6842004433618873, + "grad_norm": 1.2864597747835844, + "learning_rate": 1.132326525207406e-05, + "loss": 0.8606, + "num_input_tokens_seen": 1128178464, + "step": 6250 + }, + { + "epoch": 0.6843099154328252, + "grad_norm": 1.3026776851918846, + "learning_rate": 1.1316068263968793e-05, + "loss": 0.9025, + "num_input_tokens_seen": 1128336384, + "step": 6251 + }, + { + "epoch": 0.6844193875037631, + "grad_norm": 1.0648196088219193, + "learning_rate": 1.130887289464647e-05, + "loss": 0.7587, + "num_input_tokens_seen": 1128537536, + "step": 6252 + }, + { + "epoch": 0.684528859574701, + "grad_norm": 1.1324288606510453, + "learning_rate": 1.1301679144958318e-05, + "loss": 0.6645, + "num_input_tokens_seen": 1128690080, + "step": 6253 + }, + { + "epoch": 0.6846383316456389, + "grad_norm": 1.2369323025576358, + "learning_rate": 1.1294487015755339e-05, + "loss": 0.7183, + "num_input_tokens_seen": 1128860096, + "step": 6254 + }, + { + "epoch": 0.6847478037165768, + "grad_norm": 1.1412220267704205, + "learning_rate": 1.1287296507888345e-05, + "loss": 0.8811, + "num_input_tokens_seen": 1129056096, + "step": 6255 + }, + { + "epoch": 0.6848572757875148, + "grad_norm": 1.267500347460938, + "learning_rate": 1.1280107622207962e-05, + "loss": 0.8225, + "num_input_tokens_seen": 1129250752, + "step": 6256 + }, + { + "epoch": 0.6849667478584526, + "grad_norm": 1.2298493779232622, + "learning_rate": 1.1272920359564607e-05, + "loss": 0.8315, + "num_input_tokens_seen": 1129418528, + "step": 6257 + }, + { + "epoch": 0.6850762199293905, + "grad_norm": 1.4306199831893371, + "learning_rate": 1.1265734720808549e-05, + "loss": 0.8788, + "num_input_tokens_seen": 1129597728, + "step": 6258 + }, + { + "epoch": 0.6851856920003284, + "grad_norm": 1.403800016957495, + "learning_rate": 1.1258550706789803e-05, + "loss": 1.1224, + "num_input_tokens_seen": 1129783872, + "step": 6259 + }, + { + "epoch": 0.6852951640712663, + "grad_norm": 1.2849600505927268, + "learning_rate": 1.1251368318358254e-05, + "loss": 0.7769, + "num_input_tokens_seen": 1129941792, + "step": 6260 + }, + { + "epoch": 0.6854046361422043, + "grad_norm": 1.2529189361325055, + "learning_rate": 1.1244187556363536e-05, + "loss": 0.8395, + "num_input_tokens_seen": 1130145408, + "step": 6261 + }, + { + "epoch": 0.6855141082131421, + "grad_norm": 1.047121275216771, + "learning_rate": 1.123700842165514e-05, + "loss": 0.6023, + "num_input_tokens_seen": 1130351712, + "step": 6262 + }, + { + "epoch": 0.68562358028408, + "grad_norm": 1.245253611551932, + "learning_rate": 1.1229830915082337e-05, + "loss": 0.8637, + "num_input_tokens_seen": 1130546592, + "step": 6263 + }, + { + "epoch": 0.6857330523550179, + "grad_norm": 1.342295413847106, + "learning_rate": 1.1222655037494215e-05, + "loss": 0.838, + "num_input_tokens_seen": 1130727584, + "step": 6264 + }, + { + "epoch": 0.6858425244259558, + "grad_norm": 1.3024235344551796, + "learning_rate": 1.1215480789739662e-05, + "loss": 0.8799, + "num_input_tokens_seen": 1130896480, + "step": 6265 + }, + { + "epoch": 0.6859519964968938, + "grad_norm": 1.261178340861026, + "learning_rate": 1.120830817266737e-05, + "loss": 0.7976, + "num_input_tokens_seen": 1131084640, + "step": 6266 + }, + { + "epoch": 0.6860614685678317, + "grad_norm": 1.1530644636106957, + "learning_rate": 1.1201137187125876e-05, + "loss": 0.8153, + "num_input_tokens_seen": 1131271008, + "step": 6267 + }, + { + "epoch": 0.6861709406387695, + "grad_norm": 1.263779224393657, + "learning_rate": 1.1193967833963455e-05, + "loss": 0.7176, + "num_input_tokens_seen": 1131432960, + "step": 6268 + }, + { + "epoch": 0.6862804127097074, + "grad_norm": 1.2892592308893074, + "learning_rate": 1.1186800114028268e-05, + "loss": 0.7403, + "num_input_tokens_seen": 1131625152, + "step": 6269 + }, + { + "epoch": 0.6863898847806453, + "grad_norm": 1.2653198345232721, + "learning_rate": 1.1179634028168198e-05, + "loss": 0.7302, + "num_input_tokens_seen": 1131808384, + "step": 6270 + }, + { + "epoch": 0.6864993568515833, + "grad_norm": 1.3097604589363243, + "learning_rate": 1.1172469577231016e-05, + "loss": 0.6424, + "num_input_tokens_seen": 1131938080, + "step": 6271 + }, + { + "epoch": 0.6866088289225212, + "grad_norm": 1.0621321546294062, + "learning_rate": 1.1165306762064246e-05, + "loss": 0.6995, + "num_input_tokens_seen": 1132131616, + "step": 6272 + }, + { + "epoch": 0.6867183009934591, + "grad_norm": 1.3317389293315776, + "learning_rate": 1.115814558351524e-05, + "loss": 0.7676, + "num_input_tokens_seen": 1132302752, + "step": 6273 + }, + { + "epoch": 0.6868277730643969, + "grad_norm": 1.449629897535399, + "learning_rate": 1.1150986042431147e-05, + "loss": 0.9329, + "num_input_tokens_seen": 1132481504, + "step": 6274 + }, + { + "epoch": 0.6869372451353348, + "grad_norm": 1.4289044179430146, + "learning_rate": 1.1143828139658924e-05, + "loss": 0.979, + "num_input_tokens_seen": 1132662048, + "step": 6275 + }, + { + "epoch": 0.6870467172062728, + "grad_norm": 1.2824474414667104, + "learning_rate": 1.1136671876045363e-05, + "loss": 0.7348, + "num_input_tokens_seen": 1132854688, + "step": 6276 + }, + { + "epoch": 0.6871561892772107, + "grad_norm": 1.1942059749532379, + "learning_rate": 1.1129517252436996e-05, + "loss": 0.5928, + "num_input_tokens_seen": 1133035456, + "step": 6277 + }, + { + "epoch": 0.6872656613481486, + "grad_norm": 1.2258046232466508, + "learning_rate": 1.1122364269680236e-05, + "loss": 0.8864, + "num_input_tokens_seen": 1133244448, + "step": 6278 + }, + { + "epoch": 0.6873751334190864, + "grad_norm": 1.4249351185534, + "learning_rate": 1.1115212928621249e-05, + "loss": 0.7834, + "num_input_tokens_seen": 1133388480, + "step": 6279 + }, + { + "epoch": 0.6874846054900243, + "grad_norm": 1.3499203888060203, + "learning_rate": 1.1108063230106031e-05, + "loss": 0.7376, + "num_input_tokens_seen": 1133576416, + "step": 6280 + }, + { + "epoch": 0.6875940775609622, + "grad_norm": 1.2496555330576402, + "learning_rate": 1.1100915174980375e-05, + "loss": 0.8915, + "num_input_tokens_seen": 1133771072, + "step": 6281 + }, + { + "epoch": 0.6877035496319002, + "grad_norm": 1.1659272893674788, + "learning_rate": 1.1093768764089881e-05, + "loss": 0.814, + "num_input_tokens_seen": 1133956096, + "step": 6282 + }, + { + "epoch": 0.6878130217028381, + "grad_norm": 1.270530753254767, + "learning_rate": 1.1086623998279963e-05, + "loss": 0.7482, + "num_input_tokens_seen": 1134137088, + "step": 6283 + }, + { + "epoch": 0.687922493773776, + "grad_norm": 1.1375821429765163, + "learning_rate": 1.1079480878395817e-05, + "loss": 0.8329, + "num_input_tokens_seen": 1134311584, + "step": 6284 + }, + { + "epoch": 0.6880319658447138, + "grad_norm": 1.174227223013576, + "learning_rate": 1.107233940528248e-05, + "loss": 0.7921, + "num_input_tokens_seen": 1134512064, + "step": 6285 + }, + { + "epoch": 0.6881414379156517, + "grad_norm": 1.2774911964546147, + "learning_rate": 1.1065199579784767e-05, + "loss": 0.7124, + "num_input_tokens_seen": 1134682080, + "step": 6286 + }, + { + "epoch": 0.6882509099865897, + "grad_norm": 1.215569723877945, + "learning_rate": 1.1058061402747305e-05, + "loss": 0.7193, + "num_input_tokens_seen": 1134857696, + "step": 6287 + }, + { + "epoch": 0.6883603820575276, + "grad_norm": 1.1306778361815437, + "learning_rate": 1.1050924875014527e-05, + "loss": 0.6807, + "num_input_tokens_seen": 1135041824, + "step": 6288 + }, + { + "epoch": 0.6884698541284655, + "grad_norm": 1.1596173919891997, + "learning_rate": 1.1043789997430668e-05, + "loss": 0.7353, + "num_input_tokens_seen": 1135229760, + "step": 6289 + }, + { + "epoch": 0.6885793261994034, + "grad_norm": 1.0499983923763503, + "learning_rate": 1.1036656770839774e-05, + "loss": 0.6339, + "num_input_tokens_seen": 1135401792, + "step": 6290 + }, + { + "epoch": 0.6886887982703412, + "grad_norm": 1.232028952955041, + "learning_rate": 1.1029525196085691e-05, + "loss": 0.7396, + "num_input_tokens_seen": 1135569568, + "step": 6291 + }, + { + "epoch": 0.6887982703412792, + "grad_norm": 1.2619026451615765, + "learning_rate": 1.102239527401206e-05, + "loss": 0.8012, + "num_input_tokens_seen": 1135728384, + "step": 6292 + }, + { + "epoch": 0.6889077424122171, + "grad_norm": 1.235854891440886, + "learning_rate": 1.1015267005462366e-05, + "loss": 0.9246, + "num_input_tokens_seen": 1135907136, + "step": 6293 + }, + { + "epoch": 0.689017214483155, + "grad_norm": 1.234703792772518, + "learning_rate": 1.1008140391279834e-05, + "loss": 0.8184, + "num_input_tokens_seen": 1136100000, + "step": 6294 + }, + { + "epoch": 0.6891266865540929, + "grad_norm": 1.269986209043582, + "learning_rate": 1.1001015432307554e-05, + "loss": 0.8115, + "num_input_tokens_seen": 1136301376, + "step": 6295 + }, + { + "epoch": 0.6892361586250307, + "grad_norm": 1.3984032738615642, + "learning_rate": 1.0993892129388385e-05, + "loss": 0.8367, + "num_input_tokens_seen": 1136484608, + "step": 6296 + }, + { + "epoch": 0.6893456306959687, + "grad_norm": 1.3888855674492009, + "learning_rate": 1.0986770483365005e-05, + "loss": 0.8339, + "num_input_tokens_seen": 1136664480, + "step": 6297 + }, + { + "epoch": 0.6894551027669066, + "grad_norm": 1.3618800879108122, + "learning_rate": 1.0979650495079888e-05, + "loss": 0.8491, + "num_input_tokens_seen": 1136812320, + "step": 6298 + }, + { + "epoch": 0.6895645748378445, + "grad_norm": 1.153166448163536, + "learning_rate": 1.0972532165375305e-05, + "loss": 0.7752, + "num_input_tokens_seen": 1137021312, + "step": 6299 + }, + { + "epoch": 0.6896740469087824, + "grad_norm": 1.1073740792430509, + "learning_rate": 1.0965415495093368e-05, + "loss": 0.8371, + "num_input_tokens_seen": 1137206336, + "step": 6300 + }, + { + "epoch": 0.6897835189797203, + "grad_norm": 1.3367617111392214, + "learning_rate": 1.0958300485075931e-05, + "loss": 0.9483, + "num_input_tokens_seen": 1137359104, + "step": 6301 + }, + { + "epoch": 0.6898929910506582, + "grad_norm": 1.2855123009287905, + "learning_rate": 1.095118713616472e-05, + "loss": 0.6081, + "num_input_tokens_seen": 1137535392, + "step": 6302 + }, + { + "epoch": 0.6900024631215961, + "grad_norm": 1.3657397643167821, + "learning_rate": 1.09440754492012e-05, + "loss": 0.6693, + "num_input_tokens_seen": 1137717280, + "step": 6303 + }, + { + "epoch": 0.690111935192534, + "grad_norm": 1.2946780336233272, + "learning_rate": 1.093696542502669e-05, + "loss": 0.8012, + "num_input_tokens_seen": 1137894016, + "step": 6304 + }, + { + "epoch": 0.6902214072634719, + "grad_norm": 1.2326779832446009, + "learning_rate": 1.0929857064482285e-05, + "loss": 0.8489, + "num_input_tokens_seen": 1138088448, + "step": 6305 + }, + { + "epoch": 0.6903308793344098, + "grad_norm": 1.0727624628058277, + "learning_rate": 1.0922750368408896e-05, + "loss": 0.6417, + "num_input_tokens_seen": 1138257344, + "step": 6306 + }, + { + "epoch": 0.6904403514053478, + "grad_norm": 1.3577537076137707, + "learning_rate": 1.0915645337647224e-05, + "loss": 0.7816, + "num_input_tokens_seen": 1138405632, + "step": 6307 + }, + { + "epoch": 0.6905498234762856, + "grad_norm": 1.2047889894317974, + "learning_rate": 1.0908541973037775e-05, + "loss": 0.762, + "num_input_tokens_seen": 1138571168, + "step": 6308 + }, + { + "epoch": 0.6906592955472235, + "grad_norm": 1.2209297152979655, + "learning_rate": 1.090144027542089e-05, + "loss": 0.7214, + "num_input_tokens_seen": 1138761344, + "step": 6309 + }, + { + "epoch": 0.6907687676181614, + "grad_norm": 1.0738840587448357, + "learning_rate": 1.0894340245636652e-05, + "loss": 0.731, + "num_input_tokens_seen": 1138947712, + "step": 6310 + }, + { + "epoch": 0.6908782396890993, + "grad_norm": 1.421941231662719, + "learning_rate": 1.0887241884525014e-05, + "loss": 0.6828, + "num_input_tokens_seen": 1139087040, + "step": 6311 + }, + { + "epoch": 0.6909877117600373, + "grad_norm": 1.3963739098027372, + "learning_rate": 1.0880145192925666e-05, + "loss": 1.0169, + "num_input_tokens_seen": 1139290432, + "step": 6312 + }, + { + "epoch": 0.6910971838309751, + "grad_norm": 1.2675528736861539, + "learning_rate": 1.087305017167816e-05, + "loss": 1.306, + "num_input_tokens_seen": 1139510624, + "step": 6313 + }, + { + "epoch": 0.691206655901913, + "grad_norm": 1.1727052417867307, + "learning_rate": 1.0865956821621808e-05, + "loss": 0.6743, + "num_input_tokens_seen": 1139683552, + "step": 6314 + }, + { + "epoch": 0.6913161279728509, + "grad_norm": 1.081723827111555, + "learning_rate": 1.0858865143595749e-05, + "loss": 0.6142, + "num_input_tokens_seen": 1139876640, + "step": 6315 + }, + { + "epoch": 0.6914256000437888, + "grad_norm": 1.369298297330477, + "learning_rate": 1.0851775138438906e-05, + "loss": 0.9913, + "num_input_tokens_seen": 1140058528, + "step": 6316 + }, + { + "epoch": 0.6915350721147268, + "grad_norm": 1.0724219258663343, + "learning_rate": 1.084468680699001e-05, + "loss": 0.5566, + "num_input_tokens_seen": 1140255872, + "step": 6317 + }, + { + "epoch": 0.6916445441856647, + "grad_norm": 1.170009858098029, + "learning_rate": 1.0837600150087612e-05, + "loss": 0.7737, + "num_input_tokens_seen": 1140482560, + "step": 6318 + }, + { + "epoch": 0.6917540162566025, + "grad_norm": 1.4034475164028408, + "learning_rate": 1.0830515168570043e-05, + "loss": 0.889, + "num_input_tokens_seen": 1140664224, + "step": 6319 + }, + { + "epoch": 0.6918634883275404, + "grad_norm": 1.2628528234155807, + "learning_rate": 1.0823431863275443e-05, + "loss": 0.8727, + "num_input_tokens_seen": 1140848352, + "step": 6320 + }, + { + "epoch": 0.6919729603984783, + "grad_norm": 1.3197498364080815, + "learning_rate": 1.081635023504175e-05, + "loss": 0.8677, + "num_input_tokens_seen": 1141030912, + "step": 6321 + }, + { + "epoch": 0.6920824324694163, + "grad_norm": 1.1344760650087056, + "learning_rate": 1.0809270284706713e-05, + "loss": 0.6294, + "num_input_tokens_seen": 1141190400, + "step": 6322 + }, + { + "epoch": 0.6921919045403542, + "grad_norm": 1.1226953636369308, + "learning_rate": 1.0802192013107873e-05, + "loss": 0.7092, + "num_input_tokens_seen": 1141365344, + "step": 6323 + }, + { + "epoch": 0.6923013766112921, + "grad_norm": 1.3041506822152904, + "learning_rate": 1.0795115421082574e-05, + "loss": 0.8443, + "num_input_tokens_seen": 1141551040, + "step": 6324 + }, + { + "epoch": 0.6924108486822299, + "grad_norm": 1.250818413796462, + "learning_rate": 1.0788040509467958e-05, + "loss": 0.7903, + "num_input_tokens_seen": 1141755104, + "step": 6325 + }, + { + "epoch": 0.6925203207531678, + "grad_norm": 1.2801832084373381, + "learning_rate": 1.078096727910099e-05, + "loss": 0.8951, + "num_input_tokens_seen": 1141941920, + "step": 6326 + }, + { + "epoch": 0.6926297928241057, + "grad_norm": 1.2857874100190232, + "learning_rate": 1.0773895730818409e-05, + "loss": 0.7337, + "num_input_tokens_seen": 1142111040, + "step": 6327 + }, + { + "epoch": 0.6927392648950437, + "grad_norm": 1.2890517263251249, + "learning_rate": 1.076682586545677e-05, + "loss": 0.8593, + "num_input_tokens_seen": 1142306368, + "step": 6328 + }, + { + "epoch": 0.6928487369659816, + "grad_norm": 1.2800310430636692, + "learning_rate": 1.075975768385242e-05, + "loss": 0.7259, + "num_input_tokens_seen": 1142488928, + "step": 6329 + }, + { + "epoch": 0.6929582090369194, + "grad_norm": 1.1387153211520178, + "learning_rate": 1.0752691186841516e-05, + "loss": 0.6954, + "num_input_tokens_seen": 1142641248, + "step": 6330 + }, + { + "epoch": 0.6930676811078573, + "grad_norm": 1.2000287827377762, + "learning_rate": 1.0745626375260004e-05, + "loss": 0.8957, + "num_input_tokens_seen": 1142809472, + "step": 6331 + }, + { + "epoch": 0.6931771531787952, + "grad_norm": 1.2698177073152672, + "learning_rate": 1.0738563249943637e-05, + "loss": 0.6792, + "num_input_tokens_seen": 1142945888, + "step": 6332 + }, + { + "epoch": 0.6932866252497332, + "grad_norm": 1.2696527041512693, + "learning_rate": 1.073150181172799e-05, + "loss": 0.7611, + "num_input_tokens_seen": 1143118368, + "step": 6333 + }, + { + "epoch": 0.6933960973206711, + "grad_norm": 1.4366166261571482, + "learning_rate": 1.0724442061448383e-05, + "loss": 0.8674, + "num_input_tokens_seen": 1143291744, + "step": 6334 + }, + { + "epoch": 0.693505569391609, + "grad_norm": 1.1938540468259429, + "learning_rate": 1.0717383999940011e-05, + "loss": 0.8049, + "num_input_tokens_seen": 1143481024, + "step": 6335 + }, + { + "epoch": 0.6936150414625468, + "grad_norm": 1.0934369865475146, + "learning_rate": 1.0710327628037788e-05, + "loss": 0.7893, + "num_input_tokens_seen": 1143651488, + "step": 6336 + }, + { + "epoch": 0.6937245135334847, + "grad_norm": 1.1979921510659004, + "learning_rate": 1.0703272946576501e-05, + "loss": 0.8885, + "num_input_tokens_seen": 1143840768, + "step": 6337 + }, + { + "epoch": 0.6938339856044227, + "grad_norm": 1.3749558701802895, + "learning_rate": 1.069621995639069e-05, + "loss": 0.9201, + "num_input_tokens_seen": 1144006752, + "step": 6338 + }, + { + "epoch": 0.6939434576753606, + "grad_norm": 1.1940453987723953, + "learning_rate": 1.0689168658314708e-05, + "loss": 0.7376, + "num_input_tokens_seen": 1144179680, + "step": 6339 + }, + { + "epoch": 0.6940529297462985, + "grad_norm": 1.2611730948707358, + "learning_rate": 1.0682119053182731e-05, + "loss": 0.9221, + "num_input_tokens_seen": 1144360448, + "step": 6340 + }, + { + "epoch": 0.6941624018172364, + "grad_norm": 1.2599323014585855, + "learning_rate": 1.0675071141828682e-05, + "loss": 1.0042, + "num_input_tokens_seen": 1144523968, + "step": 6341 + }, + { + "epoch": 0.6942718738881742, + "grad_norm": 1.261802174553091, + "learning_rate": 1.0668024925086353e-05, + "loss": 1.052, + "num_input_tokens_seen": 1144714368, + "step": 6342 + }, + { + "epoch": 0.6943813459591122, + "grad_norm": 1.1032290466345027, + "learning_rate": 1.0660980403789256e-05, + "loss": 0.5977, + "num_input_tokens_seen": 1144902976, + "step": 6343 + }, + { + "epoch": 0.6944908180300501, + "grad_norm": 1.0149563275417404, + "learning_rate": 1.0653937578770787e-05, + "loss": 0.6321, + "num_input_tokens_seen": 1145082176, + "step": 6344 + }, + { + "epoch": 0.694600290100988, + "grad_norm": 0.9778046299638093, + "learning_rate": 1.0646896450864056e-05, + "loss": 0.6477, + "num_input_tokens_seen": 1145253984, + "step": 6345 + }, + { + "epoch": 0.6947097621719259, + "grad_norm": 1.2187145912699844, + "learning_rate": 1.0639857020902048e-05, + "loss": 0.7436, + "num_input_tokens_seen": 1145411904, + "step": 6346 + }, + { + "epoch": 0.6948192342428637, + "grad_norm": 1.2588697456444902, + "learning_rate": 1.0632819289717499e-05, + "loss": 0.7433, + "num_input_tokens_seen": 1145597152, + "step": 6347 + }, + { + "epoch": 0.6949287063138017, + "grad_norm": 1.1145195749650818, + "learning_rate": 1.0625783258142965e-05, + "loss": 0.7622, + "num_input_tokens_seen": 1145787552, + "step": 6348 + }, + { + "epoch": 0.6950381783847396, + "grad_norm": 1.379195661134378, + "learning_rate": 1.0618748927010794e-05, + "loss": 1.0516, + "num_input_tokens_seen": 1145996544, + "step": 6349 + }, + { + "epoch": 0.6951476504556775, + "grad_norm": 1.187943676833594, + "learning_rate": 1.0611716297153121e-05, + "loss": 0.7735, + "num_input_tokens_seen": 1146165664, + "step": 6350 + }, + { + "epoch": 0.6952571225266154, + "grad_norm": 1.1729432387021297, + "learning_rate": 1.0604685369401923e-05, + "loss": 0.4957, + "num_input_tokens_seen": 1146351808, + "step": 6351 + }, + { + "epoch": 0.6953665945975533, + "grad_norm": 1.3662601922886182, + "learning_rate": 1.059765614458891e-05, + "loss": 0.7416, + "num_input_tokens_seen": 1146524736, + "step": 6352 + }, + { + "epoch": 0.6954760666684912, + "grad_norm": 1.1541282156221173, + "learning_rate": 1.059062862354565e-05, + "loss": 0.7054, + "num_input_tokens_seen": 1146733056, + "step": 6353 + }, + { + "epoch": 0.6955855387394291, + "grad_norm": 1.3091518293851585, + "learning_rate": 1.0583602807103479e-05, + "loss": 1.0509, + "num_input_tokens_seen": 1146908896, + "step": 6354 + }, + { + "epoch": 0.695695010810367, + "grad_norm": 1.1283133774818792, + "learning_rate": 1.0576578696093536e-05, + "loss": 0.7343, + "num_input_tokens_seen": 1147107808, + "step": 6355 + }, + { + "epoch": 0.6958044828813049, + "grad_norm": 1.1441398185604714, + "learning_rate": 1.0569556291346761e-05, + "loss": 0.7191, + "num_input_tokens_seen": 1147302688, + "step": 6356 + }, + { + "epoch": 0.6959139549522428, + "grad_norm": 1.2619563962503983, + "learning_rate": 1.056253559369389e-05, + "loss": 0.7672, + "num_input_tokens_seen": 1147474944, + "step": 6357 + }, + { + "epoch": 0.6960234270231808, + "grad_norm": 1.3844927907737432, + "learning_rate": 1.0555516603965457e-05, + "loss": 0.7266, + "num_input_tokens_seen": 1147623008, + "step": 6358 + }, + { + "epoch": 0.6961328990941186, + "grad_norm": 1.3650461484632626, + "learning_rate": 1.0548499322991789e-05, + "loss": 0.9644, + "num_input_tokens_seen": 1147806464, + "step": 6359 + }, + { + "epoch": 0.6962423711650565, + "grad_norm": 1.4442845168384322, + "learning_rate": 1.0541483751603031e-05, + "loss": 0.6975, + "num_input_tokens_seen": 1147986112, + "step": 6360 + }, + { + "epoch": 0.6963518432359944, + "grad_norm": 1.3702391218767331, + "learning_rate": 1.0534469890629109e-05, + "loss": 0.8517, + "num_input_tokens_seen": 1148167776, + "step": 6361 + }, + { + "epoch": 0.6964613153069323, + "grad_norm": 1.1045538685218081, + "learning_rate": 1.0527457740899744e-05, + "loss": 0.825, + "num_input_tokens_seen": 1148389088, + "step": 6362 + }, + { + "epoch": 0.6965707873778703, + "grad_norm": 1.2725868426294888, + "learning_rate": 1.0520447303244463e-05, + "loss": 0.7819, + "num_input_tokens_seen": 1148592928, + "step": 6363 + }, + { + "epoch": 0.6966802594488081, + "grad_norm": 1.2393794439700696, + "learning_rate": 1.0513438578492582e-05, + "loss": 0.6988, + "num_input_tokens_seen": 1148741216, + "step": 6364 + }, + { + "epoch": 0.696789731519746, + "grad_norm": 1.1734255895505417, + "learning_rate": 1.0506431567473226e-05, + "loss": 0.7086, + "num_input_tokens_seen": 1148943264, + "step": 6365 + }, + { + "epoch": 0.6968992035906839, + "grad_norm": 1.1068147478554176, + "learning_rate": 1.049942627101531e-05, + "loss": 0.5867, + "num_input_tokens_seen": 1149144640, + "step": 6366 + }, + { + "epoch": 0.6970086756616218, + "grad_norm": 1.170498357257562, + "learning_rate": 1.0492422689947534e-05, + "loss": 0.8374, + "num_input_tokens_seen": 1149319808, + "step": 6367 + }, + { + "epoch": 0.6971181477325598, + "grad_norm": 1.3674629152001279, + "learning_rate": 1.048542082509843e-05, + "loss": 0.8275, + "num_input_tokens_seen": 1149503040, + "step": 6368 + }, + { + "epoch": 0.6972276198034977, + "grad_norm": 1.2478893102960045, + "learning_rate": 1.0478420677296297e-05, + "loss": 0.7418, + "num_input_tokens_seen": 1149674400, + "step": 6369 + }, + { + "epoch": 0.6973370918744355, + "grad_norm": 1.3521307043036523, + "learning_rate": 1.0471422247369233e-05, + "loss": 0.8802, + "num_input_tokens_seen": 1149838368, + "step": 6370 + }, + { + "epoch": 0.6974465639453734, + "grad_norm": 1.2463881763622187, + "learning_rate": 1.0464425536145148e-05, + "loss": 0.9948, + "num_input_tokens_seen": 1150051616, + "step": 6371 + }, + { + "epoch": 0.6975560360163113, + "grad_norm": 1.1032629753471797, + "learning_rate": 1.0457430544451733e-05, + "loss": 0.7484, + "num_input_tokens_seen": 1150233728, + "step": 6372 + }, + { + "epoch": 0.6976655080872493, + "grad_norm": 1.3032489086492678, + "learning_rate": 1.0450437273116484e-05, + "loss": 0.857, + "num_input_tokens_seen": 1150407104, + "step": 6373 + }, + { + "epoch": 0.6977749801581872, + "grad_norm": 1.1758733567568729, + "learning_rate": 1.044344572296668e-05, + "loss": 0.6776, + "num_input_tokens_seen": 1150579584, + "step": 6374 + }, + { + "epoch": 0.6978844522291251, + "grad_norm": 1.4150311434927654, + "learning_rate": 1.0436455894829442e-05, + "loss": 0.9439, + "num_input_tokens_seen": 1150759904, + "step": 6375 + }, + { + "epoch": 0.6979939243000629, + "grad_norm": 1.145121399469777, + "learning_rate": 1.0429467789531608e-05, + "loss": 0.8197, + "num_input_tokens_seen": 1150913344, + "step": 6376 + }, + { + "epoch": 0.6981033963710008, + "grad_norm": 1.383912807146284, + "learning_rate": 1.04224814078999e-05, + "loss": 0.6964, + "num_input_tokens_seen": 1151088512, + "step": 6377 + }, + { + "epoch": 0.6982128684419387, + "grad_norm": 1.1752016787224213, + "learning_rate": 1.041549675076076e-05, + "loss": 0.5428, + "num_input_tokens_seen": 1151246432, + "step": 6378 + }, + { + "epoch": 0.6983223405128767, + "grad_norm": 1.13390970697148, + "learning_rate": 1.0408513818940477e-05, + "loss": 0.8796, + "num_input_tokens_seen": 1151439744, + "step": 6379 + }, + { + "epoch": 0.6984318125838146, + "grad_norm": 1.181850294947314, + "learning_rate": 1.040153261326512e-05, + "loss": 0.7313, + "num_input_tokens_seen": 1151627456, + "step": 6380 + }, + { + "epoch": 0.6985412846547524, + "grad_norm": 1.2696667408167412, + "learning_rate": 1.0394553134560533e-05, + "loss": 0.8966, + "num_input_tokens_seen": 1151844736, + "step": 6381 + }, + { + "epoch": 0.6986507567256903, + "grad_norm": 1.1505423220758528, + "learning_rate": 1.0387575383652411e-05, + "loss": 0.7179, + "num_input_tokens_seen": 1152041408, + "step": 6382 + }, + { + "epoch": 0.6987602287966282, + "grad_norm": 1.1801382555221704, + "learning_rate": 1.0380599361366169e-05, + "loss": 0.7517, + "num_input_tokens_seen": 1152227776, + "step": 6383 + }, + { + "epoch": 0.6988697008675662, + "grad_norm": 1.1204986587850023, + "learning_rate": 1.037362506852709e-05, + "loss": 0.6359, + "num_input_tokens_seen": 1152395776, + "step": 6384 + }, + { + "epoch": 0.6989791729385041, + "grad_norm": 1.244309633953272, + "learning_rate": 1.036665250596019e-05, + "loss": 0.8278, + "num_input_tokens_seen": 1152586400, + "step": 6385 + }, + { + "epoch": 0.699088645009442, + "grad_norm": 1.2203199593954654, + "learning_rate": 1.0359681674490332e-05, + "loss": 0.8987, + "num_input_tokens_seen": 1152802336, + "step": 6386 + }, + { + "epoch": 0.6991981170803798, + "grad_norm": 1.21148816785651, + "learning_rate": 1.0352712574942144e-05, + "loss": 0.734, + "num_input_tokens_seen": 1152998112, + "step": 6387 + }, + { + "epoch": 0.6993075891513177, + "grad_norm": 1.208936305161206, + "learning_rate": 1.0345745208140056e-05, + "loss": 0.6399, + "num_input_tokens_seen": 1153192320, + "step": 6388 + }, + { + "epoch": 0.6994170612222557, + "grad_norm": 1.4039427585344544, + "learning_rate": 1.03387795749083e-05, + "loss": 0.8757, + "num_input_tokens_seen": 1153385856, + "step": 6389 + }, + { + "epoch": 0.6995265332931936, + "grad_norm": 1.3540002084785425, + "learning_rate": 1.0331815676070888e-05, + "loss": 0.9226, + "num_input_tokens_seen": 1153544672, + "step": 6390 + }, + { + "epoch": 0.6996360053641315, + "grad_norm": 1.32800723283016, + "learning_rate": 1.0324853512451643e-05, + "loss": 0.9723, + "num_input_tokens_seen": 1153737088, + "step": 6391 + }, + { + "epoch": 0.6997454774350694, + "grad_norm": 1.5248257302940305, + "learning_rate": 1.0317893084874167e-05, + "loss": 1.0053, + "num_input_tokens_seen": 1153938688, + "step": 6392 + }, + { + "epoch": 0.6998549495060072, + "grad_norm": 1.2051556282531488, + "learning_rate": 1.0310934394161875e-05, + "loss": 0.7724, + "num_input_tokens_seen": 1154154400, + "step": 6393 + }, + { + "epoch": 0.6999644215769452, + "grad_norm": 1.2229802004130799, + "learning_rate": 1.0303977441137968e-05, + "loss": 0.7241, + "num_input_tokens_seen": 1154311648, + "step": 6394 + }, + { + "epoch": 0.7000738936478831, + "grad_norm": 1.1955464950607784, + "learning_rate": 1.0297022226625434e-05, + "loss": 0.7457, + "num_input_tokens_seen": 1154482560, + "step": 6395 + }, + { + "epoch": 0.700183365718821, + "grad_norm": 1.2391401134297746, + "learning_rate": 1.0290068751447062e-05, + "loss": 0.8359, + "num_input_tokens_seen": 1154640032, + "step": 6396 + }, + { + "epoch": 0.7002928377897589, + "grad_norm": 1.1946359851695716, + "learning_rate": 1.0283117016425439e-05, + "loss": 0.6436, + "num_input_tokens_seen": 1154833120, + "step": 6397 + }, + { + "epoch": 0.7004023098606967, + "grad_norm": 1.2117238106735675, + "learning_rate": 1.0276167022382937e-05, + "loss": 0.7168, + "num_input_tokens_seen": 1155027104, + "step": 6398 + }, + { + "epoch": 0.7005117819316347, + "grad_norm": 1.1405834033644877, + "learning_rate": 1.0269218770141728e-05, + "loss": 0.628, + "num_input_tokens_seen": 1155222208, + "step": 6399 + }, + { + "epoch": 0.7006212540025726, + "grad_norm": 1.2298963396074523, + "learning_rate": 1.0262272260523772e-05, + "loss": 0.777, + "num_input_tokens_seen": 1155424032, + "step": 6400 + }, + { + "epoch": 0.7007307260735105, + "grad_norm": 1.2746289068485581, + "learning_rate": 1.0255327494350841e-05, + "loss": 0.7518, + "num_input_tokens_seen": 1155594496, + "step": 6401 + }, + { + "epoch": 0.7008401981444484, + "grad_norm": 1.216428648951307, + "learning_rate": 1.0248384472444481e-05, + "loss": 0.6944, + "num_input_tokens_seen": 1155737856, + "step": 6402 + }, + { + "epoch": 0.7009496702153863, + "grad_norm": 1.3237030465477362, + "learning_rate": 1.0241443195626038e-05, + "loss": 0.8583, + "num_input_tokens_seen": 1155930048, + "step": 6403 + }, + { + "epoch": 0.7010591422863242, + "grad_norm": 1.2908948417639121, + "learning_rate": 1.0234503664716649e-05, + "loss": 0.7252, + "num_input_tokens_seen": 1156068704, + "step": 6404 + }, + { + "epoch": 0.7011686143572621, + "grad_norm": 1.466970298866875, + "learning_rate": 1.0227565880537252e-05, + "loss": 0.9761, + "num_input_tokens_seen": 1156262016, + "step": 6405 + }, + { + "epoch": 0.7012780864282, + "grad_norm": 1.1970013424794916, + "learning_rate": 1.0220629843908572e-05, + "loss": 0.704, + "num_input_tokens_seen": 1156424640, + "step": 6406 + }, + { + "epoch": 0.7013875584991379, + "grad_norm": 1.1825488422783135, + "learning_rate": 1.0213695555651118e-05, + "loss": 0.9158, + "num_input_tokens_seen": 1156626464, + "step": 6407 + }, + { + "epoch": 0.7014970305700758, + "grad_norm": 1.3241275992291008, + "learning_rate": 1.020676301658523e-05, + "loss": 0.7378, + "num_input_tokens_seen": 1156800960, + "step": 6408 + }, + { + "epoch": 0.7016065026410138, + "grad_norm": 1.290828119878582, + "learning_rate": 1.0199832227530979e-05, + "loss": 0.9443, + "num_input_tokens_seen": 1156966048, + "step": 6409 + }, + { + "epoch": 0.7017159747119516, + "grad_norm": 1.164154356334118, + "learning_rate": 1.0192903189308293e-05, + "loss": 0.9808, + "num_input_tokens_seen": 1157192288, + "step": 6410 + }, + { + "epoch": 0.7018254467828895, + "grad_norm": 1.2202836681676985, + "learning_rate": 1.0185975902736853e-05, + "loss": 0.9313, + "num_input_tokens_seen": 1157391200, + "step": 6411 + }, + { + "epoch": 0.7019349188538274, + "grad_norm": 1.0346293709987417, + "learning_rate": 1.0179050368636146e-05, + "loss": 0.604, + "num_input_tokens_seen": 1157568608, + "step": 6412 + }, + { + "epoch": 0.7020443909247653, + "grad_norm": 1.2646641321570766, + "learning_rate": 1.0172126587825442e-05, + "loss": 0.8807, + "num_input_tokens_seen": 1157750944, + "step": 6413 + }, + { + "epoch": 0.7021538629957033, + "grad_norm": 1.0881659606200575, + "learning_rate": 1.0165204561123811e-05, + "loss": 0.6767, + "num_input_tokens_seen": 1157974944, + "step": 6414 + }, + { + "epoch": 0.7022633350666411, + "grad_norm": 1.4184305579231387, + "learning_rate": 1.015828428935014e-05, + "loss": 1.1053, + "num_input_tokens_seen": 1158170272, + "step": 6415 + }, + { + "epoch": 0.702372807137579, + "grad_norm": 1.364606546490881, + "learning_rate": 1.0151365773323046e-05, + "loss": 1.0652, + "num_input_tokens_seen": 1158357760, + "step": 6416 + }, + { + "epoch": 0.7024822792085169, + "grad_norm": 1.1859447936385359, + "learning_rate": 1.0144449013861013e-05, + "loss": 0.7868, + "num_input_tokens_seen": 1158538528, + "step": 6417 + }, + { + "epoch": 0.7025917512794548, + "grad_norm": 1.1386033523382906, + "learning_rate": 1.0137534011782246e-05, + "loss": 0.7195, + "num_input_tokens_seen": 1158718848, + "step": 6418 + }, + { + "epoch": 0.7027012233503928, + "grad_norm": 1.2484169472541151, + "learning_rate": 1.013062076790481e-05, + "loss": 0.9056, + "num_input_tokens_seen": 1158915968, + "step": 6419 + }, + { + "epoch": 0.7028106954213307, + "grad_norm": 1.3028132516296518, + "learning_rate": 1.0123709283046495e-05, + "loss": 0.819, + "num_input_tokens_seen": 1159096288, + "step": 6420 + }, + { + "epoch": 0.7029201674922685, + "grad_norm": 1.0870968235729175, + "learning_rate": 1.011679955802494e-05, + "loss": 0.7292, + "num_input_tokens_seen": 1159284000, + "step": 6421 + }, + { + "epoch": 0.7030296395632064, + "grad_norm": 1.1530255902867375, + "learning_rate": 1.0109891593657547e-05, + "loss": 0.7629, + "num_input_tokens_seen": 1159471488, + "step": 6422 + }, + { + "epoch": 0.7031391116341443, + "grad_norm": 1.382172905372466, + "learning_rate": 1.0102985390761505e-05, + "loss": 1.0206, + "num_input_tokens_seen": 1159622688, + "step": 6423 + }, + { + "epoch": 0.7032485837050823, + "grad_norm": 1.1966825624347046, + "learning_rate": 1.009608095015383e-05, + "loss": 0.5952, + "num_input_tokens_seen": 1159820032, + "step": 6424 + }, + { + "epoch": 0.7033580557760202, + "grad_norm": 1.4892421233804733, + "learning_rate": 1.0089178272651267e-05, + "loss": 0.9418, + "num_input_tokens_seen": 1160023872, + "step": 6425 + }, + { + "epoch": 0.7034675278469581, + "grad_norm": 1.0770031717797401, + "learning_rate": 1.008227735907043e-05, + "loss": 0.7461, + "num_input_tokens_seen": 1160201056, + "step": 6426 + }, + { + "epoch": 0.7035769999178959, + "grad_norm": 1.1205116914226863, + "learning_rate": 1.0075378210227645e-05, + "loss": 0.7075, + "num_input_tokens_seen": 1160381600, + "step": 6427 + }, + { + "epoch": 0.7036864719888338, + "grad_norm": 1.1716807395140563, + "learning_rate": 1.0068480826939097e-05, + "loss": 0.689, + "num_input_tokens_seen": 1160564160, + "step": 6428 + }, + { + "epoch": 0.7037959440597717, + "grad_norm": 1.1131554783499058, + "learning_rate": 1.006158521002072e-05, + "loss": 0.751, + "num_input_tokens_seen": 1160759712, + "step": 6429 + }, + { + "epoch": 0.7039054161307097, + "grad_norm": 1.0962348504580137, + "learning_rate": 1.005469136028826e-05, + "loss": 0.7435, + "num_input_tokens_seen": 1160925920, + "step": 6430 + }, + { + "epoch": 0.7040148882016476, + "grad_norm": 1.332484551316934, + "learning_rate": 1.0047799278557238e-05, + "loss": 0.864, + "num_input_tokens_seen": 1161122592, + "step": 6431 + }, + { + "epoch": 0.7041243602725854, + "grad_norm": 1.1724428962515416, + "learning_rate": 1.0040908965642979e-05, + "loss": 0.9053, + "num_input_tokens_seen": 1161308288, + "step": 6432 + }, + { + "epoch": 0.7042338323435233, + "grad_norm": 1.1923247252116145, + "learning_rate": 1.0034020422360591e-05, + "loss": 0.7465, + "num_input_tokens_seen": 1161485024, + "step": 6433 + }, + { + "epoch": 0.7043433044144612, + "grad_norm": 1.254955340118944, + "learning_rate": 1.002713364952497e-05, + "loss": 1.0233, + "num_input_tokens_seen": 1161649888, + "step": 6434 + }, + { + "epoch": 0.7044527764853992, + "grad_norm": 1.1581616965180972, + "learning_rate": 1.0020248647950822e-05, + "loss": 0.7132, + "num_input_tokens_seen": 1161826176, + "step": 6435 + }, + { + "epoch": 0.7045622485563371, + "grad_norm": 1.274705314240541, + "learning_rate": 1.0013365418452625e-05, + "loss": 0.7798, + "num_input_tokens_seen": 1162030464, + "step": 6436 + }, + { + "epoch": 0.704671720627275, + "grad_norm": 1.1963205512437396, + "learning_rate": 1.0006483961844645e-05, + "loss": 0.8542, + "num_input_tokens_seen": 1162197568, + "step": 6437 + }, + { + "epoch": 0.7047811926982128, + "grad_norm": 1.1570551363007475, + "learning_rate": 9.999604278940956e-06, + "loss": 0.6759, + "num_input_tokens_seen": 1162371392, + "step": 6438 + }, + { + "epoch": 0.7048906647691507, + "grad_norm": 1.2254812249060982, + "learning_rate": 9.9927263705554e-06, + "loss": 0.8491, + "num_input_tokens_seen": 1162552384, + "step": 6439 + }, + { + "epoch": 0.7050001368400887, + "grad_norm": 1.2036434889401502, + "learning_rate": 9.985850237501618e-06, + "loss": 0.8174, + "num_input_tokens_seen": 1162740768, + "step": 6440 + }, + { + "epoch": 0.7051096089110266, + "grad_norm": 1.151678222887179, + "learning_rate": 9.978975880593067e-06, + "loss": 0.9848, + "num_input_tokens_seen": 1162915040, + "step": 6441 + }, + { + "epoch": 0.7052190809819645, + "grad_norm": 1.2608323896009128, + "learning_rate": 9.972103300642937e-06, + "loss": 0.6803, + "num_input_tokens_seen": 1163070496, + "step": 6442 + }, + { + "epoch": 0.7053285530529024, + "grad_norm": 1.3007208549523355, + "learning_rate": 9.965232498464266e-06, + "loss": 0.8383, + "num_input_tokens_seen": 1163247904, + "step": 6443 + }, + { + "epoch": 0.7054380251238402, + "grad_norm": 1.2935939326115127, + "learning_rate": 9.958363474869853e-06, + "loss": 0.7422, + "num_input_tokens_seen": 1163432480, + "step": 6444 + }, + { + "epoch": 0.7055474971947782, + "grad_norm": 1.2127145341457857, + "learning_rate": 9.951496230672283e-06, + "loss": 0.932, + "num_input_tokens_seen": 1163645728, + "step": 6445 + }, + { + "epoch": 0.7056569692657161, + "grad_norm": 1.2929467144612563, + "learning_rate": 9.94463076668394e-06, + "loss": 0.8108, + "num_input_tokens_seen": 1163845536, + "step": 6446 + }, + { + "epoch": 0.705766441336654, + "grad_norm": 1.377367495690756, + "learning_rate": 9.937767083716989e-06, + "loss": 0.7338, + "num_input_tokens_seen": 1163994496, + "step": 6447 + }, + { + "epoch": 0.7058759134075919, + "grad_norm": 1.095580884673124, + "learning_rate": 9.930905182583417e-06, + "loss": 0.7519, + "num_input_tokens_seen": 1164164288, + "step": 6448 + }, + { + "epoch": 0.7059853854785297, + "grad_norm": 1.3480476142839268, + "learning_rate": 9.924045064094934e-06, + "loss": 0.6995, + "num_input_tokens_seen": 1164315712, + "step": 6449 + }, + { + "epoch": 0.7060948575494677, + "grad_norm": 1.2611429418186888, + "learning_rate": 9.917186729063118e-06, + "loss": 0.9028, + "num_input_tokens_seen": 1164506336, + "step": 6450 + }, + { + "epoch": 0.7062043296204056, + "grad_norm": 1.168095457418835, + "learning_rate": 9.910330178299262e-06, + "loss": 0.6168, + "num_input_tokens_seen": 1164650816, + "step": 6451 + }, + { + "epoch": 0.7063138016913435, + "grad_norm": 1.2423621960521434, + "learning_rate": 9.903475412614507e-06, + "loss": 0.9615, + "num_input_tokens_seen": 1164866304, + "step": 6452 + }, + { + "epoch": 0.7064232737622814, + "grad_norm": 1.2076916909344755, + "learning_rate": 9.896622432819753e-06, + "loss": 0.8257, + "num_input_tokens_seen": 1165058272, + "step": 6453 + }, + { + "epoch": 0.7065327458332193, + "grad_norm": 1.1940985605145416, + "learning_rate": 9.889771239725693e-06, + "loss": 0.6831, + "num_input_tokens_seen": 1165240384, + "step": 6454 + }, + { + "epoch": 0.7066422179041572, + "grad_norm": 1.269595851721714, + "learning_rate": 9.882921834142806e-06, + "loss": 0.7427, + "num_input_tokens_seen": 1165418016, + "step": 6455 + }, + { + "epoch": 0.7067516899750951, + "grad_norm": 1.1638021497996014, + "learning_rate": 9.876074216881359e-06, + "loss": 0.8651, + "num_input_tokens_seen": 1165595424, + "step": 6456 + }, + { + "epoch": 0.706861162046033, + "grad_norm": 1.2098371859636847, + "learning_rate": 9.86922838875144e-06, + "loss": 0.7917, + "num_input_tokens_seen": 1165781568, + "step": 6457 + }, + { + "epoch": 0.7069706341169709, + "grad_norm": 1.2713431706227134, + "learning_rate": 9.86238435056286e-06, + "loss": 0.9143, + "num_input_tokens_seen": 1165971744, + "step": 6458 + }, + { + "epoch": 0.7070801061879088, + "grad_norm": 1.0098677386185597, + "learning_rate": 9.855542103125286e-06, + "loss": 0.6487, + "num_input_tokens_seen": 1166159904, + "step": 6459 + }, + { + "epoch": 0.7071895782588468, + "grad_norm": 1.1564830854393628, + "learning_rate": 9.848701647248118e-06, + "loss": 0.8435, + "num_input_tokens_seen": 1166335520, + "step": 6460 + }, + { + "epoch": 0.7072990503297846, + "grad_norm": 1.1517617000302693, + "learning_rate": 9.841862983740584e-06, + "loss": 0.8041, + "num_input_tokens_seen": 1166534208, + "step": 6461 + }, + { + "epoch": 0.7074085224007225, + "grad_norm": 1.1495063115959236, + "learning_rate": 9.835026113411685e-06, + "loss": 0.7024, + "num_input_tokens_seen": 1166728416, + "step": 6462 + }, + { + "epoch": 0.7075179944716604, + "grad_norm": 1.3473928442903307, + "learning_rate": 9.828191037070208e-06, + "loss": 0.9265, + "num_input_tokens_seen": 1166874016, + "step": 6463 + }, + { + "epoch": 0.7076274665425983, + "grad_norm": 1.253081623953393, + "learning_rate": 9.821357755524727e-06, + "loss": 0.8755, + "num_input_tokens_seen": 1167041568, + "step": 6464 + }, + { + "epoch": 0.7077369386135363, + "grad_norm": 1.0393772652349602, + "learning_rate": 9.814526269583596e-06, + "loss": 0.5768, + "num_input_tokens_seen": 1167231296, + "step": 6465 + }, + { + "epoch": 0.7078464106844741, + "grad_norm": 1.1037526104076445, + "learning_rate": 9.807696580054994e-06, + "loss": 0.7677, + "num_input_tokens_seen": 1167423936, + "step": 6466 + }, + { + "epoch": 0.707955882755412, + "grad_norm": 1.2091427700819566, + "learning_rate": 9.800868687746832e-06, + "loss": 1.0006, + "num_input_tokens_seen": 1167621728, + "step": 6467 + }, + { + "epoch": 0.7080653548263499, + "grad_norm": 1.3393910702637124, + "learning_rate": 9.794042593466851e-06, + "loss": 0.7862, + "num_input_tokens_seen": 1167780768, + "step": 6468 + }, + { + "epoch": 0.7081748268972878, + "grad_norm": 1.1984925199011858, + "learning_rate": 9.787218298022565e-06, + "loss": 0.791, + "num_input_tokens_seen": 1167939584, + "step": 6469 + }, + { + "epoch": 0.7082842989682258, + "grad_norm": 1.4085476648257234, + "learning_rate": 9.780395802221274e-06, + "loss": 1.1715, + "num_input_tokens_seen": 1168136256, + "step": 6470 + }, + { + "epoch": 0.7083937710391637, + "grad_norm": 1.2967379394170433, + "learning_rate": 9.773575106870061e-06, + "loss": 0.6974, + "num_input_tokens_seen": 1168294624, + "step": 6471 + }, + { + "epoch": 0.7085032431101015, + "grad_norm": 1.1663871441634999, + "learning_rate": 9.766756212775807e-06, + "loss": 0.9761, + "num_input_tokens_seen": 1168475616, + "step": 6472 + }, + { + "epoch": 0.7086127151810394, + "grad_norm": 1.1576302512358405, + "learning_rate": 9.759939120745171e-06, + "loss": 0.7774, + "num_input_tokens_seen": 1168679232, + "step": 6473 + }, + { + "epoch": 0.7087221872519773, + "grad_norm": 1.1102140235766986, + "learning_rate": 9.753123831584604e-06, + "loss": 0.9091, + "num_input_tokens_seen": 1168881952, + "step": 6474 + }, + { + "epoch": 0.7088316593229153, + "grad_norm": 1.0985409575316027, + "learning_rate": 9.746310346100331e-06, + "loss": 0.7007, + "num_input_tokens_seen": 1169085792, + "step": 6475 + }, + { + "epoch": 0.7089411313938532, + "grad_norm": 1.010623353615896, + "learning_rate": 9.739498665098395e-06, + "loss": 0.6447, + "num_input_tokens_seen": 1169272160, + "step": 6476 + }, + { + "epoch": 0.7090506034647911, + "grad_norm": 1.188514426763692, + "learning_rate": 9.732688789384593e-06, + "loss": 0.753, + "num_input_tokens_seen": 1169463680, + "step": 6477 + }, + { + "epoch": 0.7091600755357289, + "grad_norm": 1.1786478779508573, + "learning_rate": 9.725880719764519e-06, + "loss": 0.7734, + "num_input_tokens_seen": 1169657888, + "step": 6478 + }, + { + "epoch": 0.7092695476066668, + "grad_norm": 1.2401249874865932, + "learning_rate": 9.71907445704356e-06, + "loss": 0.9554, + "num_input_tokens_seen": 1169844032, + "step": 6479 + }, + { + "epoch": 0.7093790196776047, + "grad_norm": 1.276823865451988, + "learning_rate": 9.712270002026877e-06, + "loss": 0.9312, + "num_input_tokens_seen": 1170018304, + "step": 6480 + }, + { + "epoch": 0.7094884917485427, + "grad_norm": 1.2314544120773039, + "learning_rate": 9.705467355519428e-06, + "loss": 0.8453, + "num_input_tokens_seen": 1170192352, + "step": 6481 + }, + { + "epoch": 0.7095979638194806, + "grad_norm": 1.2959253442036422, + "learning_rate": 9.698666518325943e-06, + "loss": 0.9522, + "num_input_tokens_seen": 1170342208, + "step": 6482 + }, + { + "epoch": 0.7097074358904184, + "grad_norm": 1.1578378857340579, + "learning_rate": 9.69186749125098e-06, + "loss": 0.7133, + "num_input_tokens_seen": 1170519392, + "step": 6483 + }, + { + "epoch": 0.7098169079613563, + "grad_norm": 1.198762688912707, + "learning_rate": 9.685070275098806e-06, + "loss": 0.7641, + "num_input_tokens_seen": 1170706432, + "step": 6484 + }, + { + "epoch": 0.7099263800322942, + "grad_norm": 1.2656156280547342, + "learning_rate": 9.678274870673555e-06, + "loss": 0.8149, + "num_input_tokens_seen": 1170879808, + "step": 6485 + }, + { + "epoch": 0.7100358521032322, + "grad_norm": 1.1536538645491323, + "learning_rate": 9.671481278779094e-06, + "loss": 0.6595, + "num_input_tokens_seen": 1171057664, + "step": 6486 + }, + { + "epoch": 0.7101453241741701, + "grad_norm": 1.172906955297095, + "learning_rate": 9.664689500219092e-06, + "loss": 0.8191, + "num_input_tokens_seen": 1171255904, + "step": 6487 + }, + { + "epoch": 0.710254796245108, + "grad_norm": 1.2167320662823389, + "learning_rate": 9.65789953579701e-06, + "loss": 0.5721, + "num_input_tokens_seen": 1171434432, + "step": 6488 + }, + { + "epoch": 0.7103642683160458, + "grad_norm": 1.1905259178765761, + "learning_rate": 9.651111386316072e-06, + "loss": 0.6667, + "num_input_tokens_seen": 1171603552, + "step": 6489 + }, + { + "epoch": 0.7104737403869837, + "grad_norm": 1.1714403490810268, + "learning_rate": 9.644325052579333e-06, + "loss": 0.7402, + "num_input_tokens_seen": 1171805600, + "step": 6490 + }, + { + "epoch": 0.7105832124579217, + "grad_norm": 1.348912502182861, + "learning_rate": 9.63754053538957e-06, + "loss": 0.696, + "num_input_tokens_seen": 1171934400, + "step": 6491 + }, + { + "epoch": 0.7106926845288596, + "grad_norm": 1.187585927966356, + "learning_rate": 9.630757835549412e-06, + "loss": 0.791, + "num_input_tokens_seen": 1172109568, + "step": 6492 + }, + { + "epoch": 0.7108021565997975, + "grad_norm": 1.0957063889072376, + "learning_rate": 9.623976953861199e-06, + "loss": 0.6513, + "num_input_tokens_seen": 1172289440, + "step": 6493 + }, + { + "epoch": 0.7109116286707354, + "grad_norm": 1.3727221260915938, + "learning_rate": 9.617197891127131e-06, + "loss": 0.8444, + "num_input_tokens_seen": 1172480960, + "step": 6494 + }, + { + "epoch": 0.7110211007416732, + "grad_norm": 1.088302561708054, + "learning_rate": 9.610420648149144e-06, + "loss": 0.556, + "num_input_tokens_seen": 1172656352, + "step": 6495 + }, + { + "epoch": 0.7111305728126112, + "grad_norm": 1.4014178436374767, + "learning_rate": 9.603645225728975e-06, + "loss": 1.0299, + "num_input_tokens_seen": 1172854368, + "step": 6496 + }, + { + "epoch": 0.7112400448835491, + "grad_norm": 1.2741914635127571, + "learning_rate": 9.59687162466814e-06, + "loss": 0.9438, + "num_input_tokens_seen": 1173058880, + "step": 6497 + }, + { + "epoch": 0.711349516954487, + "grad_norm": 1.3004911631951621, + "learning_rate": 9.590099845767941e-06, + "loss": 0.7718, + "num_input_tokens_seen": 1173198432, + "step": 6498 + }, + { + "epoch": 0.7114589890254249, + "grad_norm": 1.0521760213580993, + "learning_rate": 9.583329889829486e-06, + "loss": 0.6266, + "num_input_tokens_seen": 1173377184, + "step": 6499 + }, + { + "epoch": 0.7115684610963627, + "grad_norm": 1.2398927233037194, + "learning_rate": 9.576561757653618e-06, + "loss": 0.9927, + "num_input_tokens_seen": 1173543616, + "step": 6500 + }, + { + "epoch": 0.7116779331673007, + "grad_norm": 1.3363333221664548, + "learning_rate": 9.569795450041028e-06, + "loss": 0.8954, + "num_input_tokens_seen": 1173708256, + "step": 6501 + }, + { + "epoch": 0.7117874052382386, + "grad_norm": 1.1556136983728358, + "learning_rate": 9.563030967792119e-06, + "loss": 0.8229, + "num_input_tokens_seen": 1173914560, + "step": 6502 + }, + { + "epoch": 0.7118968773091765, + "grad_norm": 1.458694776521053, + "learning_rate": 9.556268311707145e-06, + "loss": 0.8115, + "num_input_tokens_seen": 1174079872, + "step": 6503 + }, + { + "epoch": 0.7120063493801144, + "grad_norm": 1.160862973928569, + "learning_rate": 9.549507482586107e-06, + "loss": 0.6053, + "num_input_tokens_seen": 1174231744, + "step": 6504 + }, + { + "epoch": 0.7121158214510523, + "grad_norm": 1.1863713170875536, + "learning_rate": 9.542748481228796e-06, + "loss": 0.7187, + "num_input_tokens_seen": 1174430656, + "step": 6505 + }, + { + "epoch": 0.7122252935219902, + "grad_norm": 1.2706398686384828, + "learning_rate": 9.535991308434795e-06, + "loss": 1.0123, + "num_input_tokens_seen": 1174636288, + "step": 6506 + }, + { + "epoch": 0.7123347655929281, + "grad_norm": 1.0982699053389129, + "learning_rate": 9.529235965003447e-06, + "loss": 0.8889, + "num_input_tokens_seen": 1174825568, + "step": 6507 + }, + { + "epoch": 0.712444237663866, + "grad_norm": 1.3021940972582406, + "learning_rate": 9.52248245173392e-06, + "loss": 0.8864, + "num_input_tokens_seen": 1174979680, + "step": 6508 + }, + { + "epoch": 0.7125537097348039, + "grad_norm": 1.2457471024557825, + "learning_rate": 9.51573076942513e-06, + "loss": 0.785, + "num_input_tokens_seen": 1175151040, + "step": 6509 + }, + { + "epoch": 0.7126631818057418, + "grad_norm": 1.2678542108329167, + "learning_rate": 9.508980918875787e-06, + "loss": 0.9181, + "num_input_tokens_seen": 1175339424, + "step": 6510 + }, + { + "epoch": 0.7127726538766798, + "grad_norm": 1.151950833850961, + "learning_rate": 9.50223290088439e-06, + "loss": 0.8352, + "num_input_tokens_seen": 1175518176, + "step": 6511 + }, + { + "epoch": 0.7128821259476176, + "grad_norm": 1.1707104300524662, + "learning_rate": 9.495486716249213e-06, + "loss": 0.9036, + "num_input_tokens_seen": 1175688192, + "step": 6512 + }, + { + "epoch": 0.7129915980185555, + "grad_norm": 1.5116888629807665, + "learning_rate": 9.48874236576832e-06, + "loss": 0.9446, + "num_input_tokens_seen": 1175863808, + "step": 6513 + }, + { + "epoch": 0.7131010700894934, + "grad_norm": 1.1614673540024498, + "learning_rate": 9.48199985023955e-06, + "loss": 0.7969, + "num_input_tokens_seen": 1176081088, + "step": 6514 + }, + { + "epoch": 0.7132105421604313, + "grad_norm": 1.0644735631817972, + "learning_rate": 9.475259170460527e-06, + "loss": 0.5672, + "num_input_tokens_seen": 1176278208, + "step": 6515 + }, + { + "epoch": 0.7133200142313693, + "grad_norm": 1.3238657703151762, + "learning_rate": 9.468520327228681e-06, + "loss": 0.6445, + "num_input_tokens_seen": 1176446432, + "step": 6516 + }, + { + "epoch": 0.7134294863023071, + "grad_norm": 1.2037409902296852, + "learning_rate": 9.46178332134117e-06, + "loss": 0.6244, + "num_input_tokens_seen": 1176598976, + "step": 6517 + }, + { + "epoch": 0.713538958373245, + "grad_norm": 1.3206750496128568, + "learning_rate": 9.455048153594998e-06, + "loss": 0.772, + "num_input_tokens_seen": 1176727552, + "step": 6518 + }, + { + "epoch": 0.7136484304441829, + "grad_norm": 1.2502671909870475, + "learning_rate": 9.448314824786913e-06, + "loss": 0.7132, + "num_input_tokens_seen": 1176926016, + "step": 6519 + }, + { + "epoch": 0.7137579025151208, + "grad_norm": 1.30302830801637, + "learning_rate": 9.441583335713455e-06, + "loss": 0.9246, + "num_input_tokens_seen": 1177139264, + "step": 6520 + }, + { + "epoch": 0.7138673745860588, + "grad_norm": 1.157723750575317, + "learning_rate": 9.434853687170947e-06, + "loss": 0.696, + "num_input_tokens_seen": 1177325856, + "step": 6521 + }, + { + "epoch": 0.7139768466569967, + "grad_norm": 1.126674624298329, + "learning_rate": 9.42812587995548e-06, + "loss": 0.8583, + "num_input_tokens_seen": 1177524096, + "step": 6522 + }, + { + "epoch": 0.7140863187279345, + "grad_norm": 1.2208128368666453, + "learning_rate": 9.421399914862975e-06, + "loss": 0.7706, + "num_input_tokens_seen": 1177697024, + "step": 6523 + }, + { + "epoch": 0.7141957907988724, + "grad_norm": 1.3899123691132786, + "learning_rate": 9.414675792689056e-06, + "loss": 0.8262, + "num_input_tokens_seen": 1177872416, + "step": 6524 + }, + { + "epoch": 0.7143052628698103, + "grad_norm": 1.2284166936051246, + "learning_rate": 9.407953514229218e-06, + "loss": 0.751, + "num_input_tokens_seen": 1178070656, + "step": 6525 + }, + { + "epoch": 0.7144147349407483, + "grad_norm": 1.142399909311713, + "learning_rate": 9.401233080278655e-06, + "loss": 0.8409, + "num_input_tokens_seen": 1178272256, + "step": 6526 + }, + { + "epoch": 0.7145242070116862, + "grad_norm": 1.2850011482678934, + "learning_rate": 9.394514491632406e-06, + "loss": 1.0876, + "num_input_tokens_seen": 1178485280, + "step": 6527 + }, + { + "epoch": 0.7146336790826241, + "grad_norm": 1.2383372297178625, + "learning_rate": 9.38779774908526e-06, + "loss": 0.7362, + "num_input_tokens_seen": 1178662240, + "step": 6528 + }, + { + "epoch": 0.7147431511535619, + "grad_norm": 1.2556084698148644, + "learning_rate": 9.381082853431795e-06, + "loss": 0.7825, + "num_input_tokens_seen": 1178811200, + "step": 6529 + }, + { + "epoch": 0.7148526232244998, + "grad_norm": 1.4074738157557192, + "learning_rate": 9.374369805466369e-06, + "loss": 0.847, + "num_input_tokens_seen": 1179000480, + "step": 6530 + }, + { + "epoch": 0.7149620952954377, + "grad_norm": 1.3882752615349219, + "learning_rate": 9.367658605983117e-06, + "loss": 1.0344, + "num_input_tokens_seen": 1179169600, + "step": 6531 + }, + { + "epoch": 0.7150715673663757, + "grad_norm": 1.2889213715950796, + "learning_rate": 9.360949255775986e-06, + "loss": 0.9918, + "num_input_tokens_seen": 1179371648, + "step": 6532 + }, + { + "epoch": 0.7151810394373136, + "grad_norm": 1.2515665251896932, + "learning_rate": 9.354241755638641e-06, + "loss": 0.7314, + "num_input_tokens_seen": 1179583328, + "step": 6533 + }, + { + "epoch": 0.7152905115082514, + "grad_norm": 1.2321913030431708, + "learning_rate": 9.347536106364607e-06, + "loss": 0.7905, + "num_input_tokens_seen": 1179756928, + "step": 6534 + }, + { + "epoch": 0.7153999835791893, + "grad_norm": 1.2982419175385467, + "learning_rate": 9.34083230874711e-06, + "loss": 0.7886, + "num_input_tokens_seen": 1179916416, + "step": 6535 + }, + { + "epoch": 0.7155094556501272, + "grad_norm": 1.26269484045694, + "learning_rate": 9.334130363579224e-06, + "loss": 0.5847, + "num_input_tokens_seen": 1180097184, + "step": 6536 + }, + { + "epoch": 0.7156189277210652, + "grad_norm": 1.2395935325309821, + "learning_rate": 9.32743027165377e-06, + "loss": 0.67, + "num_input_tokens_seen": 1180249952, + "step": 6537 + }, + { + "epoch": 0.7157283997920031, + "grad_norm": 1.1594847203079057, + "learning_rate": 9.320732033763351e-06, + "loss": 0.7916, + "num_input_tokens_seen": 1180444608, + "step": 6538 + }, + { + "epoch": 0.715837871862941, + "grad_norm": 1.262583905443964, + "learning_rate": 9.314035650700361e-06, + "loss": 0.7682, + "num_input_tokens_seen": 1180609024, + "step": 6539 + }, + { + "epoch": 0.7159473439338788, + "grad_norm": 1.236210918445197, + "learning_rate": 9.307341123256957e-06, + "loss": 0.9032, + "num_input_tokens_seen": 1180815552, + "step": 6540 + }, + { + "epoch": 0.7160568160048167, + "grad_norm": 1.1579347656599865, + "learning_rate": 9.300648452225119e-06, + "loss": 0.7888, + "num_input_tokens_seen": 1181021408, + "step": 6541 + }, + { + "epoch": 0.7161662880757547, + "grad_norm": 1.1402062993188595, + "learning_rate": 9.293957638396535e-06, + "loss": 0.6452, + "num_input_tokens_seen": 1181194336, + "step": 6542 + }, + { + "epoch": 0.7162757601466926, + "grad_norm": 1.1383679673067815, + "learning_rate": 9.28726868256275e-06, + "loss": 0.7183, + "num_input_tokens_seen": 1181378688, + "step": 6543 + }, + { + "epoch": 0.7163852322176305, + "grad_norm": 1.1233379484378028, + "learning_rate": 9.280581585515042e-06, + "loss": 1.0053, + "num_input_tokens_seen": 1181557216, + "step": 6544 + }, + { + "epoch": 0.7164947042885684, + "grad_norm": 1.3168635720081654, + "learning_rate": 9.273896348044481e-06, + "loss": 0.9864, + "num_input_tokens_seen": 1181766656, + "step": 6545 + }, + { + "epoch": 0.7166041763595062, + "grad_norm": 1.1252654610135064, + "learning_rate": 9.267212970941919e-06, + "loss": 0.7048, + "num_input_tokens_seen": 1181931296, + "step": 6546 + }, + { + "epoch": 0.7167136484304442, + "grad_norm": 1.1617913844907637, + "learning_rate": 9.260531454997987e-06, + "loss": 0.7509, + "num_input_tokens_seen": 1182099296, + "step": 6547 + }, + { + "epoch": 0.7168231205013821, + "grad_norm": 1.208345654304214, + "learning_rate": 9.253851801003094e-06, + "loss": 0.8319, + "num_input_tokens_seen": 1182291264, + "step": 6548 + }, + { + "epoch": 0.71693259257232, + "grad_norm": 1.1556341747953351, + "learning_rate": 9.247174009747422e-06, + "loss": 1.0581, + "num_input_tokens_seen": 1182467552, + "step": 6549 + }, + { + "epoch": 0.7170420646432579, + "grad_norm": 1.2285858764939512, + "learning_rate": 9.240498082020962e-06, + "loss": 0.7627, + "num_input_tokens_seen": 1182653472, + "step": 6550 + }, + { + "epoch": 0.7171515367141957, + "grad_norm": 1.314281707514703, + "learning_rate": 9.23382401861345e-06, + "loss": 0.6753, + "num_input_tokens_seen": 1182831552, + "step": 6551 + }, + { + "epoch": 0.7172610087851337, + "grad_norm": 1.29290633036524, + "learning_rate": 9.227151820314417e-06, + "loss": 1.2305, + "num_input_tokens_seen": 1183056896, + "step": 6552 + }, + { + "epoch": 0.7173704808560716, + "grad_norm": 1.271380582871264, + "learning_rate": 9.22048148791317e-06, + "loss": 0.768, + "num_input_tokens_seen": 1183232288, + "step": 6553 + }, + { + "epoch": 0.7174799529270095, + "grad_norm": 1.3195810492668143, + "learning_rate": 9.2138130221988e-06, + "loss": 0.8484, + "num_input_tokens_seen": 1183384160, + "step": 6554 + }, + { + "epoch": 0.7175894249979474, + "grad_norm": 1.2333343078420416, + "learning_rate": 9.20714642396017e-06, + "loss": 0.9022, + "num_input_tokens_seen": 1183568736, + "step": 6555 + }, + { + "epoch": 0.7176988970688853, + "grad_norm": 1.3529925856432936, + "learning_rate": 9.200481693985928e-06, + "loss": 0.8375, + "num_input_tokens_seen": 1183730912, + "step": 6556 + }, + { + "epoch": 0.7178083691398232, + "grad_norm": 1.2633205674280503, + "learning_rate": 9.193818833064489e-06, + "loss": 0.757, + "num_input_tokens_seen": 1183927584, + "step": 6557 + }, + { + "epoch": 0.7179178412107611, + "grad_norm": 1.2472124489219836, + "learning_rate": 9.187157841984082e-06, + "loss": 0.6658, + "num_input_tokens_seen": 1184099392, + "step": 6558 + }, + { + "epoch": 0.718027313281699, + "grad_norm": 1.2325711167498894, + "learning_rate": 9.180498721532657e-06, + "loss": 0.7983, + "num_input_tokens_seen": 1184320480, + "step": 6559 + }, + { + "epoch": 0.7181367853526369, + "grad_norm": 1.487031026048354, + "learning_rate": 9.173841472498001e-06, + "loss": 0.9572, + "num_input_tokens_seen": 1184491616, + "step": 6560 + }, + { + "epoch": 0.7182462574235748, + "grad_norm": 1.1557098879874463, + "learning_rate": 9.167186095667643e-06, + "loss": 0.7051, + "num_input_tokens_seen": 1184654464, + "step": 6561 + }, + { + "epoch": 0.7183557294945128, + "grad_norm": 1.181330566080812, + "learning_rate": 9.160532591828902e-06, + "loss": 0.6713, + "num_input_tokens_seen": 1184837472, + "step": 6562 + }, + { + "epoch": 0.7184652015654506, + "grad_norm": 1.157460790180144, + "learning_rate": 9.153880961768877e-06, + "loss": 0.7939, + "num_input_tokens_seen": 1185054080, + "step": 6563 + }, + { + "epoch": 0.7185746736363885, + "grad_norm": 1.3452600821142238, + "learning_rate": 9.147231206274431e-06, + "loss": 0.7731, + "num_input_tokens_seen": 1185227232, + "step": 6564 + }, + { + "epoch": 0.7186841457073264, + "grad_norm": 1.2054602018833243, + "learning_rate": 9.140583326132249e-06, + "loss": 0.7692, + "num_input_tokens_seen": 1185370368, + "step": 6565 + }, + { + "epoch": 0.7187936177782643, + "grad_norm": 1.1817164373116698, + "learning_rate": 9.133937322128722e-06, + "loss": 0.6939, + "num_input_tokens_seen": 1185543744, + "step": 6566 + }, + { + "epoch": 0.7189030898492023, + "grad_norm": 1.3826687409410825, + "learning_rate": 9.127293195050096e-06, + "loss": 1.0002, + "num_input_tokens_seen": 1185722720, + "step": 6567 + }, + { + "epoch": 0.7190125619201401, + "grad_norm": 1.0877336974451133, + "learning_rate": 9.120650945682325e-06, + "loss": 0.5654, + "num_input_tokens_seen": 1185913792, + "step": 6568 + }, + { + "epoch": 0.719122033991078, + "grad_norm": 1.2380590871320665, + "learning_rate": 9.114010574811197e-06, + "loss": 0.6868, + "num_input_tokens_seen": 1186092768, + "step": 6569 + }, + { + "epoch": 0.7192315060620159, + "grad_norm": 1.2451626159757985, + "learning_rate": 9.107372083222251e-06, + "loss": 0.8304, + "num_input_tokens_seen": 1186254720, + "step": 6570 + }, + { + "epoch": 0.7193409781329538, + "grad_norm": 1.2583633543479629, + "learning_rate": 9.100735471700805e-06, + "loss": 0.8053, + "num_input_tokens_seen": 1186450944, + "step": 6571 + }, + { + "epoch": 0.7194504502038918, + "grad_norm": 1.0885486085417886, + "learning_rate": 9.094100741031961e-06, + "loss": 0.5698, + "num_input_tokens_seen": 1186634400, + "step": 6572 + }, + { + "epoch": 0.7195599222748297, + "grad_norm": 1.3494616371036057, + "learning_rate": 9.087467892000582e-06, + "loss": 0.6843, + "num_input_tokens_seen": 1186778432, + "step": 6573 + }, + { + "epoch": 0.7196693943457675, + "grad_norm": 1.120691313603234, + "learning_rate": 9.08083692539135e-06, + "loss": 0.7123, + "num_input_tokens_seen": 1186964128, + "step": 6574 + }, + { + "epoch": 0.7197788664167054, + "grad_norm": 1.2031637408472002, + "learning_rate": 9.07420784198866e-06, + "loss": 0.8055, + "num_input_tokens_seen": 1187123392, + "step": 6575 + }, + { + "epoch": 0.7198883384876433, + "grad_norm": 1.317440543525163, + "learning_rate": 9.067580642576746e-06, + "loss": 0.7627, + "num_input_tokens_seen": 1187283552, + "step": 6576 + }, + { + "epoch": 0.7199978105585813, + "grad_norm": 1.4554506785017725, + "learning_rate": 9.060955327939582e-06, + "loss": 0.9206, + "num_input_tokens_seen": 1187473952, + "step": 6577 + }, + { + "epoch": 0.7201072826295192, + "grad_norm": 1.3321586838786148, + "learning_rate": 9.054331898860935e-06, + "loss": 0.7725, + "num_input_tokens_seen": 1187635456, + "step": 6578 + }, + { + "epoch": 0.7202167547004571, + "grad_norm": 1.225821204230291, + "learning_rate": 9.047710356124342e-06, + "loss": 0.9809, + "num_input_tokens_seen": 1187807712, + "step": 6579 + }, + { + "epoch": 0.7203262267713949, + "grad_norm": 1.1687547469425188, + "learning_rate": 9.041090700513117e-06, + "loss": 0.6504, + "num_input_tokens_seen": 1187987360, + "step": 6580 + }, + { + "epoch": 0.7204356988423328, + "grad_norm": 1.19390731693655, + "learning_rate": 9.034472932810354e-06, + "loss": 0.7909, + "num_input_tokens_seen": 1188194112, + "step": 6581 + }, + { + "epoch": 0.7205451709132707, + "grad_norm": 1.1702826298771225, + "learning_rate": 9.027857053798913e-06, + "loss": 0.9215, + "num_input_tokens_seen": 1188406912, + "step": 6582 + }, + { + "epoch": 0.7206546429842087, + "grad_norm": 1.2674236467978421, + "learning_rate": 9.02124306426146e-06, + "loss": 1.0656, + "num_input_tokens_seen": 1188616576, + "step": 6583 + }, + { + "epoch": 0.7207641150551466, + "grad_norm": 1.217481637851493, + "learning_rate": 9.014630964980404e-06, + "loss": 0.9511, + "num_input_tokens_seen": 1188824448, + "step": 6584 + }, + { + "epoch": 0.7208735871260844, + "grad_norm": 1.1204178530477644, + "learning_rate": 9.008020756737945e-06, + "loss": 0.8529, + "num_input_tokens_seen": 1189011936, + "step": 6585 + }, + { + "epoch": 0.7209830591970223, + "grad_norm": 1.3666433504875901, + "learning_rate": 9.001412440316059e-06, + "loss": 0.9814, + "num_input_tokens_seen": 1189202336, + "step": 6586 + }, + { + "epoch": 0.7210925312679602, + "grad_norm": 1.216096336560989, + "learning_rate": 8.994806016496499e-06, + "loss": 0.7476, + "num_input_tokens_seen": 1189381536, + "step": 6587 + }, + { + "epoch": 0.7212020033388982, + "grad_norm": 1.0799802766870987, + "learning_rate": 8.988201486060791e-06, + "loss": 0.7003, + "num_input_tokens_seen": 1189580224, + "step": 6588 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 1.2473073090404505, + "learning_rate": 8.981598849790238e-06, + "loss": 0.6302, + "num_input_tokens_seen": 1189753376, + "step": 6589 + }, + { + "epoch": 0.721420947480774, + "grad_norm": 1.2307285579508231, + "learning_rate": 8.974998108465907e-06, + "loss": 0.9024, + "num_input_tokens_seen": 1189929664, + "step": 6590 + }, + { + "epoch": 0.7215304195517118, + "grad_norm": 1.2281234430589587, + "learning_rate": 8.968399262868677e-06, + "loss": 0.6359, + "num_input_tokens_seen": 1190118720, + "step": 6591 + }, + { + "epoch": 0.7216398916226497, + "grad_norm": 1.284439675589189, + "learning_rate": 8.961802313779166e-06, + "loss": 0.7358, + "num_input_tokens_seen": 1190302624, + "step": 6592 + }, + { + "epoch": 0.7217493636935877, + "grad_norm": 1.1162100297151547, + "learning_rate": 8.955207261977783e-06, + "loss": 0.768, + "num_input_tokens_seen": 1190510720, + "step": 6593 + }, + { + "epoch": 0.7218588357645256, + "grad_norm": 1.1462461363749143, + "learning_rate": 8.948614108244705e-06, + "loss": 0.83, + "num_input_tokens_seen": 1190721952, + "step": 6594 + }, + { + "epoch": 0.7219683078354635, + "grad_norm": 1.118491099145267, + "learning_rate": 8.942022853359896e-06, + "loss": 0.6181, + "num_input_tokens_seen": 1190887264, + "step": 6595 + }, + { + "epoch": 0.7220777799064014, + "grad_norm": 1.1399304090976121, + "learning_rate": 8.935433498103086e-06, + "loss": 0.837, + "num_input_tokens_seen": 1191047872, + "step": 6596 + }, + { + "epoch": 0.7221872519773392, + "grad_norm": 1.151489096052825, + "learning_rate": 8.928846043253772e-06, + "loss": 0.5566, + "num_input_tokens_seen": 1191261344, + "step": 6597 + }, + { + "epoch": 0.7222967240482772, + "grad_norm": 1.2235145846621203, + "learning_rate": 8.922260489591266e-06, + "loss": 0.7768, + "num_input_tokens_seen": 1191432032, + "step": 6598 + }, + { + "epoch": 0.7224061961192151, + "grad_norm": 1.264475594863898, + "learning_rate": 8.915676837894593e-06, + "loss": 0.7494, + "num_input_tokens_seen": 1191609216, + "step": 6599 + }, + { + "epoch": 0.722515668190153, + "grad_norm": 1.0767910763124209, + "learning_rate": 8.909095088942617e-06, + "loss": 0.624, + "num_input_tokens_seen": 1191799392, + "step": 6600 + }, + { + "epoch": 0.7226251402610909, + "grad_norm": 1.137895479725553, + "learning_rate": 8.902515243513918e-06, + "loss": 0.658, + "num_input_tokens_seen": 1191994048, + "step": 6601 + }, + { + "epoch": 0.7227346123320287, + "grad_norm": 1.2106294301611213, + "learning_rate": 8.895937302386898e-06, + "loss": 0.6592, + "num_input_tokens_seen": 1192150176, + "step": 6602 + }, + { + "epoch": 0.7228440844029667, + "grad_norm": 1.230984582709941, + "learning_rate": 8.88936126633971e-06, + "loss": 0.8872, + "num_input_tokens_seen": 1192355136, + "step": 6603 + }, + { + "epoch": 0.7229535564739046, + "grad_norm": 1.138999459990791, + "learning_rate": 8.882787136150275e-06, + "loss": 0.8755, + "num_input_tokens_seen": 1192540832, + "step": 6604 + }, + { + "epoch": 0.7230630285448425, + "grad_norm": 1.1846559498298073, + "learning_rate": 8.876214912596331e-06, + "loss": 0.6953, + "num_input_tokens_seen": 1192746240, + "step": 6605 + }, + { + "epoch": 0.7231725006157804, + "grad_norm": 1.211233878013625, + "learning_rate": 8.869644596455324e-06, + "loss": 0.7222, + "num_input_tokens_seen": 1192951648, + "step": 6606 + }, + { + "epoch": 0.7232819726867183, + "grad_norm": 1.1030110243577516, + "learning_rate": 8.863076188504537e-06, + "loss": 0.7606, + "num_input_tokens_seen": 1193149440, + "step": 6607 + }, + { + "epoch": 0.7233914447576562, + "grad_norm": 1.1957899648442774, + "learning_rate": 8.856509689520976e-06, + "loss": 0.7701, + "num_input_tokens_seen": 1193323488, + "step": 6608 + }, + { + "epoch": 0.7235009168285941, + "grad_norm": 1.1398362345771291, + "learning_rate": 8.849945100281474e-06, + "loss": 0.6302, + "num_input_tokens_seen": 1193487456, + "step": 6609 + }, + { + "epoch": 0.723610388899532, + "grad_norm": 1.243306789310867, + "learning_rate": 8.843382421562573e-06, + "loss": 0.7829, + "num_input_tokens_seen": 1193662400, + "step": 6610 + }, + { + "epoch": 0.7237198609704699, + "grad_norm": 1.279250934736914, + "learning_rate": 8.836821654140656e-06, + "loss": 1.2094, + "num_input_tokens_seen": 1193868704, + "step": 6611 + }, + { + "epoch": 0.7238293330414078, + "grad_norm": 1.1509247917573668, + "learning_rate": 8.830262798791838e-06, + "loss": 0.8877, + "num_input_tokens_seen": 1194062464, + "step": 6612 + }, + { + "epoch": 0.7239388051123458, + "grad_norm": 1.259115503804588, + "learning_rate": 8.823705856292019e-06, + "loss": 0.644, + "num_input_tokens_seen": 1194226432, + "step": 6613 + }, + { + "epoch": 0.7240482771832836, + "grad_norm": 1.218631298446852, + "learning_rate": 8.817150827416876e-06, + "loss": 0.7796, + "num_input_tokens_seen": 1194406976, + "step": 6614 + }, + { + "epoch": 0.7241577492542215, + "grad_norm": 1.1601765948736962, + "learning_rate": 8.810597712941843e-06, + "loss": 0.8061, + "num_input_tokens_seen": 1194581696, + "step": 6615 + }, + { + "epoch": 0.7242672213251594, + "grad_norm": 1.1959537954942203, + "learning_rate": 8.80404651364217e-06, + "loss": 0.9857, + "num_input_tokens_seen": 1194760000, + "step": 6616 + }, + { + "epoch": 0.7243766933960973, + "grad_norm": 1.2571814239302022, + "learning_rate": 8.797497230292814e-06, + "loss": 0.9998, + "num_input_tokens_seen": 1194905600, + "step": 6617 + }, + { + "epoch": 0.7244861654670353, + "grad_norm": 1.1618379676666954, + "learning_rate": 8.790949863668571e-06, + "loss": 1.0156, + "num_input_tokens_seen": 1195112352, + "step": 6618 + }, + { + "epoch": 0.7245956375379731, + "grad_norm": 1.2097274148515476, + "learning_rate": 8.784404414543973e-06, + "loss": 0.8343, + "num_input_tokens_seen": 1195259296, + "step": 6619 + }, + { + "epoch": 0.724705109608911, + "grad_norm": 1.224734213274696, + "learning_rate": 8.777860883693335e-06, + "loss": 0.9077, + "num_input_tokens_seen": 1195444544, + "step": 6620 + }, + { + "epoch": 0.7248145816798489, + "grad_norm": 1.3274238979846626, + "learning_rate": 8.771319271890741e-06, + "loss": 0.7657, + "num_input_tokens_seen": 1195644576, + "step": 6621 + }, + { + "epoch": 0.7249240537507868, + "grad_norm": 1.2860973124430999, + "learning_rate": 8.764779579910054e-06, + "loss": 0.6545, + "num_input_tokens_seen": 1195815040, + "step": 6622 + }, + { + "epoch": 0.7250335258217248, + "grad_norm": 1.358716257022027, + "learning_rate": 8.758241808524906e-06, + "loss": 0.9177, + "num_input_tokens_seen": 1195992224, + "step": 6623 + }, + { + "epoch": 0.7251429978926627, + "grad_norm": 1.3144597454386118, + "learning_rate": 8.751705958508697e-06, + "loss": 1.0859, + "num_input_tokens_seen": 1196187328, + "step": 6624 + }, + { + "epoch": 0.7252524699636005, + "grad_norm": 1.1752731391302085, + "learning_rate": 8.745172030634616e-06, + "loss": 0.9452, + "num_input_tokens_seen": 1196350624, + "step": 6625 + }, + { + "epoch": 0.7253619420345384, + "grad_norm": 1.244393171851205, + "learning_rate": 8.738640025675612e-06, + "loss": 0.9741, + "num_input_tokens_seen": 1196531616, + "step": 6626 + }, + { + "epoch": 0.7254714141054763, + "grad_norm": 1.163812495273471, + "learning_rate": 8.732109944404408e-06, + "loss": 0.6813, + "num_input_tokens_seen": 1196722240, + "step": 6627 + }, + { + "epoch": 0.7255808861764143, + "grad_norm": 1.2197933132943437, + "learning_rate": 8.725581787593496e-06, + "loss": 0.9786, + "num_input_tokens_seen": 1196941312, + "step": 6628 + }, + { + "epoch": 0.7256903582473522, + "grad_norm": 1.3848204608600092, + "learning_rate": 8.719055556015149e-06, + "loss": 0.9005, + "num_input_tokens_seen": 1197095872, + "step": 6629 + }, + { + "epoch": 0.7257998303182901, + "grad_norm": 1.2048826878374925, + "learning_rate": 8.712531250441394e-06, + "loss": 0.8307, + "num_input_tokens_seen": 1197283808, + "step": 6630 + }, + { + "epoch": 0.7259093023892279, + "grad_norm": 1.2437617564963608, + "learning_rate": 8.706008871644075e-06, + "loss": 0.8498, + "num_input_tokens_seen": 1197463904, + "step": 6631 + }, + { + "epoch": 0.7260187744601658, + "grad_norm": 1.245839921217291, + "learning_rate": 8.699488420394741e-06, + "loss": 0.6923, + "num_input_tokens_seen": 1197631904, + "step": 6632 + }, + { + "epoch": 0.7261282465311037, + "grad_norm": 1.2706097056926802, + "learning_rate": 8.692969897464775e-06, + "loss": 0.7962, + "num_input_tokens_seen": 1197826560, + "step": 6633 + }, + { + "epoch": 0.7262377186020417, + "grad_norm": 1.2774821915103787, + "learning_rate": 8.686453303625294e-06, + "loss": 0.8388, + "num_input_tokens_seen": 1198005088, + "step": 6634 + }, + { + "epoch": 0.7263471906729796, + "grad_norm": 1.375614245804689, + "learning_rate": 8.679938639647203e-06, + "loss": 0.9238, + "num_input_tokens_seen": 1198174656, + "step": 6635 + }, + { + "epoch": 0.7264566627439174, + "grad_norm": 1.1035028777609992, + "learning_rate": 8.673425906301171e-06, + "loss": 0.7076, + "num_input_tokens_seen": 1198376704, + "step": 6636 + }, + { + "epoch": 0.7265661348148553, + "grad_norm": 1.3342351049485617, + "learning_rate": 8.666915104357637e-06, + "loss": 0.8841, + "num_input_tokens_seen": 1198542016, + "step": 6637 + }, + { + "epoch": 0.7266756068857932, + "grad_norm": 1.3761613016537162, + "learning_rate": 8.660406234586838e-06, + "loss": 0.7606, + "num_input_tokens_seen": 1198709568, + "step": 6638 + }, + { + "epoch": 0.7267850789567312, + "grad_norm": 1.0854964973150714, + "learning_rate": 8.653899297758728e-06, + "loss": 0.6159, + "num_input_tokens_seen": 1198902432, + "step": 6639 + }, + { + "epoch": 0.7268945510276691, + "grad_norm": 1.1897486427653683, + "learning_rate": 8.647394294643099e-06, + "loss": 0.8114, + "num_input_tokens_seen": 1199097088, + "step": 6640 + }, + { + "epoch": 0.727004023098607, + "grad_norm": 1.3017380778025158, + "learning_rate": 8.640891226009449e-06, + "loss": 0.7512, + "num_input_tokens_seen": 1199253664, + "step": 6641 + }, + { + "epoch": 0.7271134951695448, + "grad_norm": 1.2244242112929813, + "learning_rate": 8.63439009262711e-06, + "loss": 0.7364, + "num_input_tokens_seen": 1199444064, + "step": 6642 + }, + { + "epoch": 0.7272229672404827, + "grad_norm": 1.369325720537141, + "learning_rate": 8.62789089526512e-06, + "loss": 0.8733, + "num_input_tokens_seen": 1199642304, + "step": 6643 + }, + { + "epoch": 0.7273324393114207, + "grad_norm": 1.2503425128884216, + "learning_rate": 8.621393634692346e-06, + "loss": 0.9894, + "num_input_tokens_seen": 1199855104, + "step": 6644 + }, + { + "epoch": 0.7274419113823586, + "grad_norm": 1.1934375893392277, + "learning_rate": 8.614898311677397e-06, + "loss": 0.7521, + "num_input_tokens_seen": 1200036320, + "step": 6645 + }, + { + "epoch": 0.7275513834532965, + "grad_norm": 1.2593361292441656, + "learning_rate": 8.608404926988644e-06, + "loss": 0.8166, + "num_input_tokens_seen": 1200191776, + "step": 6646 + }, + { + "epoch": 0.7276608555242344, + "grad_norm": 1.174190420899119, + "learning_rate": 8.601913481394273e-06, + "loss": 0.5999, + "num_input_tokens_seen": 1200319904, + "step": 6647 + }, + { + "epoch": 0.7277703275951722, + "grad_norm": 1.2455571852013663, + "learning_rate": 8.595423975662168e-06, + "loss": 0.6633, + "num_input_tokens_seen": 1200505376, + "step": 6648 + }, + { + "epoch": 0.7278797996661102, + "grad_norm": 1.1604605638475447, + "learning_rate": 8.588936410560065e-06, + "loss": 0.7045, + "num_input_tokens_seen": 1200663296, + "step": 6649 + }, + { + "epoch": 0.7279892717370481, + "grad_norm": 1.2133102305452033, + "learning_rate": 8.582450786855394e-06, + "loss": 0.8198, + "num_input_tokens_seen": 1200840704, + "step": 6650 + }, + { + "epoch": 0.728098743807986, + "grad_norm": 1.1293218567023733, + "learning_rate": 8.575967105315422e-06, + "loss": 0.6319, + "num_input_tokens_seen": 1200995488, + "step": 6651 + }, + { + "epoch": 0.7282082158789239, + "grad_norm": 1.2459389521674311, + "learning_rate": 8.569485366707142e-06, + "loss": 0.9258, + "num_input_tokens_seen": 1201162144, + "step": 6652 + }, + { + "epoch": 0.7283176879498617, + "grad_norm": 1.3339295941522369, + "learning_rate": 8.563005571797334e-06, + "loss": 0.7841, + "num_input_tokens_seen": 1201322528, + "step": 6653 + }, + { + "epoch": 0.7284271600207997, + "grad_norm": 1.1364452649188852, + "learning_rate": 8.556527721352542e-06, + "loss": 0.932, + "num_input_tokens_seen": 1201507104, + "step": 6654 + }, + { + "epoch": 0.7285366320917376, + "grad_norm": 1.1523635086693222, + "learning_rate": 8.550051816139088e-06, + "loss": 0.9286, + "num_input_tokens_seen": 1201679808, + "step": 6655 + }, + { + "epoch": 0.7286461041626755, + "grad_norm": 1.3971108478567602, + "learning_rate": 8.543577856923058e-06, + "loss": 0.9688, + "num_input_tokens_seen": 1201852288, + "step": 6656 + }, + { + "epoch": 0.7287555762336134, + "grad_norm": 1.1180903914390574, + "learning_rate": 8.537105844470297e-06, + "loss": 0.8774, + "num_input_tokens_seen": 1202016032, + "step": 6657 + }, + { + "epoch": 0.7288650483045513, + "grad_norm": 1.4452251511031011, + "learning_rate": 8.530635779546453e-06, + "loss": 0.9674, + "num_input_tokens_seen": 1202197920, + "step": 6658 + }, + { + "epoch": 0.7289745203754892, + "grad_norm": 1.327389816553535, + "learning_rate": 8.524167662916913e-06, + "loss": 0.8467, + "num_input_tokens_seen": 1202364128, + "step": 6659 + }, + { + "epoch": 0.7290839924464271, + "grad_norm": 1.3141634254842947, + "learning_rate": 8.517701495346842e-06, + "loss": 0.8004, + "num_input_tokens_seen": 1202526976, + "step": 6660 + }, + { + "epoch": 0.729193464517365, + "grad_norm": 1.1968939797114169, + "learning_rate": 8.511237277601174e-06, + "loss": 0.7838, + "num_input_tokens_seen": 1202679296, + "step": 6661 + }, + { + "epoch": 0.7293029365883029, + "grad_norm": 1.2644506115633567, + "learning_rate": 8.504775010444616e-06, + "loss": 0.6777, + "num_input_tokens_seen": 1202850880, + "step": 6662 + }, + { + "epoch": 0.7294124086592408, + "grad_norm": 1.376322370492135, + "learning_rate": 8.49831469464164e-06, + "loss": 0.9366, + "num_input_tokens_seen": 1203017536, + "step": 6663 + }, + { + "epoch": 0.7295218807301788, + "grad_norm": 1.3642430451453351, + "learning_rate": 8.491856330956491e-06, + "loss": 1.0457, + "num_input_tokens_seen": 1203173888, + "step": 6664 + }, + { + "epoch": 0.7296313528011166, + "grad_norm": 1.302032189954251, + "learning_rate": 8.48539992015317e-06, + "loss": 0.8974, + "num_input_tokens_seen": 1203339872, + "step": 6665 + }, + { + "epoch": 0.7297408248720545, + "grad_norm": 1.2006001918598754, + "learning_rate": 8.478945462995477e-06, + "loss": 0.6084, + "num_input_tokens_seen": 1203505632, + "step": 6666 + }, + { + "epoch": 0.7298502969429924, + "grad_norm": 1.1983055589204588, + "learning_rate": 8.472492960246953e-06, + "loss": 0.8562, + "num_input_tokens_seen": 1203686848, + "step": 6667 + }, + { + "epoch": 0.7299597690139303, + "grad_norm": 1.1954718589517925, + "learning_rate": 8.466042412670916e-06, + "loss": 0.7483, + "num_input_tokens_seen": 1203832448, + "step": 6668 + }, + { + "epoch": 0.7300692410848683, + "grad_norm": 1.0362928376586853, + "learning_rate": 8.459593821030454e-06, + "loss": 0.5752, + "num_input_tokens_seen": 1204020608, + "step": 6669 + }, + { + "epoch": 0.7301787131558061, + "grad_norm": 1.1127672045990413, + "learning_rate": 8.453147186088423e-06, + "loss": 0.7345, + "num_input_tokens_seen": 1204195104, + "step": 6670 + }, + { + "epoch": 0.730288185226744, + "grad_norm": 1.2594275678922904, + "learning_rate": 8.446702508607449e-06, + "loss": 1.0612, + "num_input_tokens_seen": 1204366688, + "step": 6671 + }, + { + "epoch": 0.7303976572976819, + "grad_norm": 1.190658634942217, + "learning_rate": 8.440259789349913e-06, + "loss": 0.6685, + "num_input_tokens_seen": 1204555744, + "step": 6672 + }, + { + "epoch": 0.7305071293686198, + "grad_norm": 1.1772117657207297, + "learning_rate": 8.433819029078005e-06, + "loss": 0.9997, + "num_input_tokens_seen": 1204745920, + "step": 6673 + }, + { + "epoch": 0.7306166014395578, + "grad_norm": 1.185778172662783, + "learning_rate": 8.42738022855362e-06, + "loss": 0.6609, + "num_input_tokens_seen": 1204930496, + "step": 6674 + }, + { + "epoch": 0.7307260735104957, + "grad_norm": 1.2847068487857043, + "learning_rate": 8.42094338853848e-06, + "loss": 0.8714, + "num_input_tokens_seen": 1205102528, + "step": 6675 + }, + { + "epoch": 0.7308355455814335, + "grad_norm": 1.1881529869864926, + "learning_rate": 8.414508509794044e-06, + "loss": 0.9924, + "num_input_tokens_seen": 1205309952, + "step": 6676 + }, + { + "epoch": 0.7309450176523714, + "grad_norm": 1.1711847434088545, + "learning_rate": 8.408075593081546e-06, + "loss": 0.6985, + "num_input_tokens_seen": 1205486240, + "step": 6677 + }, + { + "epoch": 0.7310544897233093, + "grad_norm": 1.328271745540735, + "learning_rate": 8.401644639161987e-06, + "loss": 1.0486, + "num_input_tokens_seen": 1205669024, + "step": 6678 + }, + { + "epoch": 0.7311639617942473, + "grad_norm": 1.1868310349102038, + "learning_rate": 8.39521564879613e-06, + "loss": 0.7749, + "num_input_tokens_seen": 1205823808, + "step": 6679 + }, + { + "epoch": 0.7312734338651852, + "grad_norm": 1.3479984457137206, + "learning_rate": 8.38878862274453e-06, + "loss": 0.8423, + "num_input_tokens_seen": 1205988000, + "step": 6680 + }, + { + "epoch": 0.7313829059361231, + "grad_norm": 1.2239053214206141, + "learning_rate": 8.382363561767467e-06, + "loss": 0.7935, + "num_input_tokens_seen": 1206185792, + "step": 6681 + }, + { + "epoch": 0.7314923780070609, + "grad_norm": 1.4119660237468805, + "learning_rate": 8.375940466625047e-06, + "loss": 0.9047, + "num_input_tokens_seen": 1206380448, + "step": 6682 + }, + { + "epoch": 0.7316018500779988, + "grad_norm": 1.1864415521191294, + "learning_rate": 8.369519338077067e-06, + "loss": 0.7655, + "num_input_tokens_seen": 1206564800, + "step": 6683 + }, + { + "epoch": 0.7317113221489367, + "grad_norm": 1.1595447097324054, + "learning_rate": 8.36310017688318e-06, + "loss": 0.7353, + "num_input_tokens_seen": 1206716448, + "step": 6684 + }, + { + "epoch": 0.7318207942198747, + "grad_norm": 1.1493478916408588, + "learning_rate": 8.356682983802717e-06, + "loss": 0.9517, + "num_input_tokens_seen": 1206926560, + "step": 6685 + }, + { + "epoch": 0.7319302662908126, + "grad_norm": 1.1673057996656946, + "learning_rate": 8.35026775959485e-06, + "loss": 0.7143, + "num_input_tokens_seen": 1207103296, + "step": 6686 + }, + { + "epoch": 0.7320397383617505, + "grad_norm": 1.3634877745272966, + "learning_rate": 8.343854505018477e-06, + "loss": 0.8815, + "num_input_tokens_seen": 1207278912, + "step": 6687 + }, + { + "epoch": 0.7321492104326883, + "grad_norm": 1.158789912239096, + "learning_rate": 8.337443220832267e-06, + "loss": 0.8753, + "num_input_tokens_seen": 1207440864, + "step": 6688 + }, + { + "epoch": 0.7322586825036262, + "grad_norm": 1.1550606461493522, + "learning_rate": 8.331033907794689e-06, + "loss": 0.7774, + "num_input_tokens_seen": 1207619168, + "step": 6689 + }, + { + "epoch": 0.7323681545745642, + "grad_norm": 1.2312416676660527, + "learning_rate": 8.324626566663914e-06, + "loss": 0.9379, + "num_input_tokens_seen": 1207806880, + "step": 6690 + }, + { + "epoch": 0.7324776266455021, + "grad_norm": 1.2900262639972386, + "learning_rate": 8.31822119819796e-06, + "loss": 0.8858, + "num_input_tokens_seen": 1207997952, + "step": 6691 + }, + { + "epoch": 0.73258709871644, + "grad_norm": 1.049204611381202, + "learning_rate": 8.311817803154525e-06, + "loss": 0.7763, + "num_input_tokens_seen": 1208164384, + "step": 6692 + }, + { + "epoch": 0.7326965707873778, + "grad_norm": 1.1116643038490575, + "learning_rate": 8.305416382291157e-06, + "loss": 0.9212, + "num_input_tokens_seen": 1208371360, + "step": 6693 + }, + { + "epoch": 0.7328060428583157, + "grad_norm": 1.2508166506521186, + "learning_rate": 8.299016936365111e-06, + "loss": 0.7534, + "num_input_tokens_seen": 1208562432, + "step": 6694 + }, + { + "epoch": 0.7329155149292537, + "grad_norm": 1.288509828404936, + "learning_rate": 8.292619466133437e-06, + "loss": 0.9269, + "num_input_tokens_seen": 1208761792, + "step": 6695 + }, + { + "epoch": 0.7330249870001916, + "grad_norm": 1.1330320474379996, + "learning_rate": 8.286223972352939e-06, + "loss": 0.7216, + "num_input_tokens_seen": 1208947936, + "step": 6696 + }, + { + "epoch": 0.7331344590711295, + "grad_norm": 1.3580234868486332, + "learning_rate": 8.279830455780196e-06, + "loss": 1.0054, + "num_input_tokens_seen": 1209122656, + "step": 6697 + }, + { + "epoch": 0.7332439311420674, + "grad_norm": 1.3711882555960526, + "learning_rate": 8.273438917171536e-06, + "loss": 0.9168, + "num_input_tokens_seen": 1209277440, + "step": 6698 + }, + { + "epoch": 0.7333534032130052, + "grad_norm": 1.2507018979674356, + "learning_rate": 8.267049357283088e-06, + "loss": 0.739, + "num_input_tokens_seen": 1209453952, + "step": 6699 + }, + { + "epoch": 0.7334628752839432, + "grad_norm": 1.5128073703148577, + "learning_rate": 8.26066177687071e-06, + "loss": 0.8182, + "num_input_tokens_seen": 1209643904, + "step": 6700 + }, + { + "epoch": 0.7335723473548811, + "grad_norm": 1.2659901692609885, + "learning_rate": 8.254276176690045e-06, + "loss": 0.7026, + "num_input_tokens_seen": 1209818624, + "step": 6701 + }, + { + "epoch": 0.733681819425819, + "grad_norm": 1.177525642585559, + "learning_rate": 8.247892557496495e-06, + "loss": 0.6626, + "num_input_tokens_seen": 1209992224, + "step": 6702 + }, + { + "epoch": 0.7337912914967569, + "grad_norm": 1.0599522056241015, + "learning_rate": 8.241510920045232e-06, + "loss": 0.959, + "num_input_tokens_seen": 1210194944, + "step": 6703 + }, + { + "epoch": 0.7339007635676948, + "grad_norm": 1.1399192194175363, + "learning_rate": 8.235131265091189e-06, + "loss": 0.8675, + "num_input_tokens_seen": 1210379072, + "step": 6704 + }, + { + "epoch": 0.7340102356386327, + "grad_norm": 1.2366057893176094, + "learning_rate": 8.22875359338906e-06, + "loss": 0.8466, + "num_input_tokens_seen": 1210559392, + "step": 6705 + }, + { + "epoch": 0.7341197077095706, + "grad_norm": 1.1808244897533233, + "learning_rate": 8.222377905693338e-06, + "loss": 0.5915, + "num_input_tokens_seen": 1210756512, + "step": 6706 + }, + { + "epoch": 0.7342291797805085, + "grad_norm": 1.2781917078942098, + "learning_rate": 8.21600420275822e-06, + "loss": 1.0757, + "num_input_tokens_seen": 1210969312, + "step": 6707 + }, + { + "epoch": 0.7343386518514464, + "grad_norm": 1.235812304071978, + "learning_rate": 8.209632485337727e-06, + "loss": 0.9046, + "num_input_tokens_seen": 1211147840, + "step": 6708 + }, + { + "epoch": 0.7344481239223843, + "grad_norm": 1.1723884983361461, + "learning_rate": 8.203262754185611e-06, + "loss": 0.822, + "num_input_tokens_seen": 1211362208, + "step": 6709 + }, + { + "epoch": 0.7345575959933222, + "grad_norm": 1.1965534027144318, + "learning_rate": 8.196895010055403e-06, + "loss": 0.6619, + "num_input_tokens_seen": 1211544992, + "step": 6710 + }, + { + "epoch": 0.7346670680642601, + "grad_norm": 1.1957297002212544, + "learning_rate": 8.190529253700393e-06, + "loss": 0.6354, + "num_input_tokens_seen": 1211710080, + "step": 6711 + }, + { + "epoch": 0.734776540135198, + "grad_norm": 1.288610066865293, + "learning_rate": 8.184165485873633e-06, + "loss": 1.0866, + "num_input_tokens_seen": 1211886368, + "step": 6712 + }, + { + "epoch": 0.7348860122061359, + "grad_norm": 1.2434058782022859, + "learning_rate": 8.177803707327961e-06, + "loss": 0.8596, + "num_input_tokens_seen": 1212058848, + "step": 6713 + }, + { + "epoch": 0.7349954842770738, + "grad_norm": 1.163171619298882, + "learning_rate": 8.171443918815939e-06, + "loss": 0.8312, + "num_input_tokens_seen": 1212255744, + "step": 6714 + }, + { + "epoch": 0.7351049563480118, + "grad_norm": 1.20577577153457, + "learning_rate": 8.165086121089944e-06, + "loss": 0.7439, + "num_input_tokens_seen": 1212444352, + "step": 6715 + }, + { + "epoch": 0.7352144284189496, + "grad_norm": 1.4364302427972218, + "learning_rate": 8.158730314902063e-06, + "loss": 0.8844, + "num_input_tokens_seen": 1212627136, + "step": 6716 + }, + { + "epoch": 0.7353239004898875, + "grad_norm": 1.2963554076815538, + "learning_rate": 8.152376501004199e-06, + "loss": 0.8864, + "num_input_tokens_seen": 1212820896, + "step": 6717 + }, + { + "epoch": 0.7354333725608254, + "grad_norm": 1.329314563060718, + "learning_rate": 8.146024680147987e-06, + "loss": 0.8577, + "num_input_tokens_seen": 1213006368, + "step": 6718 + }, + { + "epoch": 0.7355428446317633, + "grad_norm": 1.1237947047319115, + "learning_rate": 8.139674853084838e-06, + "loss": 0.9356, + "num_input_tokens_seen": 1213188480, + "step": 6719 + }, + { + "epoch": 0.7356523167027013, + "grad_norm": 1.3230824053813688, + "learning_rate": 8.13332702056592e-06, + "loss": 0.857, + "num_input_tokens_seen": 1213347744, + "step": 6720 + }, + { + "epoch": 0.7357617887736392, + "grad_norm": 1.0636787607654463, + "learning_rate": 8.126981183342167e-06, + "loss": 0.6646, + "num_input_tokens_seen": 1213526496, + "step": 6721 + }, + { + "epoch": 0.735871260844577, + "grad_norm": 1.1246232885551426, + "learning_rate": 8.120637342164298e-06, + "loss": 1.0344, + "num_input_tokens_seen": 1213721600, + "step": 6722 + }, + { + "epoch": 0.7359807329155149, + "grad_norm": 1.2608868953945076, + "learning_rate": 8.114295497782748e-06, + "loss": 0.7786, + "num_input_tokens_seen": 1213906176, + "step": 6723 + }, + { + "epoch": 0.7360902049864528, + "grad_norm": 1.2624522341766344, + "learning_rate": 8.107955650947777e-06, + "loss": 0.8771, + "num_input_tokens_seen": 1214069248, + "step": 6724 + }, + { + "epoch": 0.7361996770573908, + "grad_norm": 1.2209346948322441, + "learning_rate": 8.101617802409343e-06, + "loss": 0.9971, + "num_input_tokens_seen": 1214238368, + "step": 6725 + }, + { + "epoch": 0.7363091491283287, + "grad_norm": 1.3521112639956814, + "learning_rate": 8.095281952917227e-06, + "loss": 0.6737, + "num_input_tokens_seen": 1214379712, + "step": 6726 + }, + { + "epoch": 0.7364186211992665, + "grad_norm": 1.2216657825291561, + "learning_rate": 8.088948103220942e-06, + "loss": 0.758, + "num_input_tokens_seen": 1214520384, + "step": 6727 + }, + { + "epoch": 0.7365280932702044, + "grad_norm": 1.1423514377657242, + "learning_rate": 8.082616254069767e-06, + "loss": 0.9206, + "num_input_tokens_seen": 1214699136, + "step": 6728 + }, + { + "epoch": 0.7366375653411423, + "grad_norm": 1.1493290146158717, + "learning_rate": 8.076286406212747e-06, + "loss": 0.8122, + "num_input_tokens_seen": 1214875648, + "step": 6729 + }, + { + "epoch": 0.7367470374120803, + "grad_norm": 1.155809731044226, + "learning_rate": 8.069958560398686e-06, + "loss": 0.8615, + "num_input_tokens_seen": 1215072544, + "step": 6730 + }, + { + "epoch": 0.7368565094830182, + "grad_norm": 1.0963517443463286, + "learning_rate": 8.063632717376177e-06, + "loss": 0.7409, + "num_input_tokens_seen": 1215232032, + "step": 6731 + }, + { + "epoch": 0.7369659815539561, + "grad_norm": 1.1810144704308518, + "learning_rate": 8.057308877893524e-06, + "loss": 0.7533, + "num_input_tokens_seen": 1215415040, + "step": 6732 + }, + { + "epoch": 0.7370754536248939, + "grad_norm": 1.0565142600039987, + "learning_rate": 8.050987042698852e-06, + "loss": 0.5853, + "num_input_tokens_seen": 1215594016, + "step": 6733 + }, + { + "epoch": 0.7371849256958318, + "grad_norm": 1.1437516926403757, + "learning_rate": 8.04466721254001e-06, + "loss": 0.7491, + "num_input_tokens_seen": 1215788000, + "step": 6734 + }, + { + "epoch": 0.7372943977667697, + "grad_norm": 1.1673104973347088, + "learning_rate": 8.038349388164627e-06, + "loss": 0.8814, + "num_input_tokens_seen": 1215961600, + "step": 6735 + }, + { + "epoch": 0.7374038698377077, + "grad_norm": 1.2747351932417568, + "learning_rate": 8.032033570320083e-06, + "loss": 0.8221, + "num_input_tokens_seen": 1216098688, + "step": 6736 + }, + { + "epoch": 0.7375133419086456, + "grad_norm": 1.1336673010764795, + "learning_rate": 8.025719759753531e-06, + "loss": 0.66, + "num_input_tokens_seen": 1216277216, + "step": 6737 + }, + { + "epoch": 0.7376228139795835, + "grad_norm": 1.24532689142467, + "learning_rate": 8.019407957211883e-06, + "loss": 1.0393, + "num_input_tokens_seen": 1216462688, + "step": 6738 + }, + { + "epoch": 0.7377322860505213, + "grad_norm": 1.2759790158624098, + "learning_rate": 8.013098163441813e-06, + "loss": 0.9223, + "num_input_tokens_seen": 1216653984, + "step": 6739 + }, + { + "epoch": 0.7378417581214592, + "grad_norm": 1.4001894584253496, + "learning_rate": 8.006790379189746e-06, + "loss": 1.2815, + "num_input_tokens_seen": 1216865440, + "step": 6740 + }, + { + "epoch": 0.7379512301923972, + "grad_norm": 1.179972229000394, + "learning_rate": 8.000484605201902e-06, + "loss": 1.0093, + "num_input_tokens_seen": 1217068160, + "step": 6741 + }, + { + "epoch": 0.7380607022633351, + "grad_norm": 1.5316234055484887, + "learning_rate": 7.99418084222423e-06, + "loss": 0.9698, + "num_input_tokens_seen": 1217218912, + "step": 6742 + }, + { + "epoch": 0.738170174334273, + "grad_norm": 1.2296121673581364, + "learning_rate": 7.987879091002456e-06, + "loss": 0.7231, + "num_input_tokens_seen": 1217382656, + "step": 6743 + }, + { + "epoch": 0.7382796464052108, + "grad_norm": 1.3972075183912938, + "learning_rate": 7.981579352282064e-06, + "loss": 0.7381, + "num_input_tokens_seen": 1217549312, + "step": 6744 + }, + { + "epoch": 0.7383891184761487, + "grad_norm": 1.1382353847377222, + "learning_rate": 7.9752816268083e-06, + "loss": 0.7621, + "num_input_tokens_seen": 1217738144, + "step": 6745 + }, + { + "epoch": 0.7384985905470867, + "grad_norm": 1.24983877095889, + "learning_rate": 7.968985915326175e-06, + "loss": 0.978, + "num_input_tokens_seen": 1217935936, + "step": 6746 + }, + { + "epoch": 0.7386080626180246, + "grad_norm": 1.3131698533921772, + "learning_rate": 7.962692218580451e-06, + "loss": 0.8273, + "num_input_tokens_seen": 1218120064, + "step": 6747 + }, + { + "epoch": 0.7387175346889625, + "grad_norm": 1.2170462314566644, + "learning_rate": 7.956400537315681e-06, + "loss": 1.0233, + "num_input_tokens_seen": 1218276864, + "step": 6748 + }, + { + "epoch": 0.7388270067599004, + "grad_norm": 1.1968361611700513, + "learning_rate": 7.950110872276131e-06, + "loss": 0.6925, + "num_input_tokens_seen": 1218446880, + "step": 6749 + }, + { + "epoch": 0.7389364788308382, + "grad_norm": 1.3050939417967862, + "learning_rate": 7.943823224205879e-06, + "loss": 0.7661, + "num_input_tokens_seen": 1218612192, + "step": 6750 + }, + { + "epoch": 0.7390459509017762, + "grad_norm": 1.1550820096117993, + "learning_rate": 7.937537593848734e-06, + "loss": 0.9334, + "num_input_tokens_seen": 1218814240, + "step": 6751 + }, + { + "epoch": 0.7391554229727141, + "grad_norm": 1.2103221191055524, + "learning_rate": 7.931253981948275e-06, + "loss": 0.6535, + "num_input_tokens_seen": 1218999264, + "step": 6752 + }, + { + "epoch": 0.739264895043652, + "grad_norm": 1.1496686499528672, + "learning_rate": 7.924972389247836e-06, + "loss": 0.6463, + "num_input_tokens_seen": 1219163904, + "step": 6753 + }, + { + "epoch": 0.7393743671145899, + "grad_norm": 1.255544715012157, + "learning_rate": 7.918692816490517e-06, + "loss": 1.0891, + "num_input_tokens_seen": 1219355424, + "step": 6754 + }, + { + "epoch": 0.7394838391855278, + "grad_norm": 1.0772247425764885, + "learning_rate": 7.912415264419198e-06, + "loss": 0.8389, + "num_input_tokens_seen": 1219561056, + "step": 6755 + }, + { + "epoch": 0.7395933112564657, + "grad_norm": 1.1728772264606022, + "learning_rate": 7.906139733776474e-06, + "loss": 0.7436, + "num_input_tokens_seen": 1219731520, + "step": 6756 + }, + { + "epoch": 0.7397027833274036, + "grad_norm": 1.15400271209451, + "learning_rate": 7.899866225304756e-06, + "loss": 0.9019, + "num_input_tokens_seen": 1219912736, + "step": 6757 + }, + { + "epoch": 0.7398122553983415, + "grad_norm": 1.0945841197192665, + "learning_rate": 7.893594739746157e-06, + "loss": 0.6783, + "num_input_tokens_seen": 1220117024, + "step": 6758 + }, + { + "epoch": 0.7399217274692794, + "grad_norm": 1.2546205001557287, + "learning_rate": 7.887325277842605e-06, + "loss": 0.7174, + "num_input_tokens_seen": 1220292416, + "step": 6759 + }, + { + "epoch": 0.7400311995402173, + "grad_norm": 1.3410218956231514, + "learning_rate": 7.881057840335762e-06, + "loss": 0.9755, + "num_input_tokens_seen": 1220511264, + "step": 6760 + }, + { + "epoch": 0.7401406716111552, + "grad_norm": 0.9704050876837856, + "learning_rate": 7.874792427967048e-06, + "loss": 0.6291, + "num_input_tokens_seen": 1220711296, + "step": 6761 + }, + { + "epoch": 0.7402501436820931, + "grad_norm": 1.2340439944160901, + "learning_rate": 7.868529041477654e-06, + "loss": 0.8917, + "num_input_tokens_seen": 1220884896, + "step": 6762 + }, + { + "epoch": 0.740359615753031, + "grad_norm": 1.2779764830614997, + "learning_rate": 7.862267681608514e-06, + "loss": 0.7946, + "num_input_tokens_seen": 1221047520, + "step": 6763 + }, + { + "epoch": 0.7404690878239689, + "grad_norm": 1.3414905981473007, + "learning_rate": 7.856008349100366e-06, + "loss": 0.8309, + "num_input_tokens_seen": 1221237696, + "step": 6764 + }, + { + "epoch": 0.7405785598949068, + "grad_norm": 1.2886072453078603, + "learning_rate": 7.849751044693637e-06, + "loss": 0.8358, + "num_input_tokens_seen": 1221433024, + "step": 6765 + }, + { + "epoch": 0.7406880319658448, + "grad_norm": 1.2237309129607765, + "learning_rate": 7.843495769128584e-06, + "loss": 0.73, + "num_input_tokens_seen": 1221616256, + "step": 6766 + }, + { + "epoch": 0.7407975040367826, + "grad_norm": 1.154694361272721, + "learning_rate": 7.83724252314518e-06, + "loss": 0.8665, + "num_input_tokens_seen": 1221827712, + "step": 6767 + }, + { + "epoch": 0.7409069761077205, + "grad_norm": 1.415233232801473, + "learning_rate": 7.830991307483179e-06, + "loss": 1.1668, + "num_input_tokens_seen": 1221996608, + "step": 6768 + }, + { + "epoch": 0.7410164481786584, + "grad_norm": 1.3268024816115513, + "learning_rate": 7.824742122882083e-06, + "loss": 0.73, + "num_input_tokens_seen": 1222167744, + "step": 6769 + }, + { + "epoch": 0.7411259202495963, + "grad_norm": 1.2597127740700307, + "learning_rate": 7.818494970081161e-06, + "loss": 0.8455, + "num_input_tokens_seen": 1222335072, + "step": 6770 + }, + { + "epoch": 0.7412353923205343, + "grad_norm": 1.1968998560304034, + "learning_rate": 7.812249849819439e-06, + "loss": 0.6304, + "num_input_tokens_seen": 1222520544, + "step": 6771 + }, + { + "epoch": 0.7413448643914722, + "grad_norm": 1.2467436814531474, + "learning_rate": 7.806006762835696e-06, + "loss": 0.7753, + "num_input_tokens_seen": 1222695040, + "step": 6772 + }, + { + "epoch": 0.74145433646241, + "grad_norm": 1.4461898202994967, + "learning_rate": 7.79976570986849e-06, + "loss": 0.9039, + "num_input_tokens_seen": 1222873568, + "step": 6773 + }, + { + "epoch": 0.7415638085333479, + "grad_norm": 1.2808307333882405, + "learning_rate": 7.793526691656117e-06, + "loss": 0.8444, + "num_input_tokens_seen": 1223068896, + "step": 6774 + }, + { + "epoch": 0.7416732806042858, + "grad_norm": 1.198815094549785, + "learning_rate": 7.787289708936645e-06, + "loss": 0.7585, + "num_input_tokens_seen": 1223273184, + "step": 6775 + }, + { + "epoch": 0.7417827526752238, + "grad_norm": 1.2317872748645313, + "learning_rate": 7.781054762447898e-06, + "loss": 0.7415, + "num_input_tokens_seen": 1223463584, + "step": 6776 + }, + { + "epoch": 0.7418922247461617, + "grad_norm": 1.188176464348808, + "learning_rate": 7.774821852927453e-06, + "loss": 0.8687, + "num_input_tokens_seen": 1223660032, + "step": 6777 + }, + { + "epoch": 0.7420016968170995, + "grad_norm": 1.222258189182699, + "learning_rate": 7.768590981112654e-06, + "loss": 0.7623, + "num_input_tokens_seen": 1223858496, + "step": 6778 + }, + { + "epoch": 0.7421111688880374, + "grad_norm": 1.316962240739394, + "learning_rate": 7.762362147740601e-06, + "loss": 0.8207, + "num_input_tokens_seen": 1224017088, + "step": 6779 + }, + { + "epoch": 0.7422206409589753, + "grad_norm": 0.9792964964212751, + "learning_rate": 7.756135353548145e-06, + "loss": 0.6315, + "num_input_tokens_seen": 1224211296, + "step": 6780 + }, + { + "epoch": 0.7423301130299133, + "grad_norm": 1.0611575565581168, + "learning_rate": 7.749910599271928e-06, + "loss": 0.7878, + "num_input_tokens_seen": 1224394080, + "step": 6781 + }, + { + "epoch": 0.7424395851008512, + "grad_norm": 1.4328288602570944, + "learning_rate": 7.743687885648293e-06, + "loss": 0.7693, + "num_input_tokens_seen": 1224540800, + "step": 6782 + }, + { + "epoch": 0.7425490571717891, + "grad_norm": 1.1048837468543038, + "learning_rate": 7.737467213413405e-06, + "loss": 0.9423, + "num_input_tokens_seen": 1224720224, + "step": 6783 + }, + { + "epoch": 0.7426585292427269, + "grad_norm": 1.2150530864750624, + "learning_rate": 7.731248583303142e-06, + "loss": 0.8774, + "num_input_tokens_seen": 1224890016, + "step": 6784 + }, + { + "epoch": 0.7427680013136648, + "grad_norm": 1.178048264410132, + "learning_rate": 7.725031996053159e-06, + "loss": 1.0115, + "num_input_tokens_seen": 1225078848, + "step": 6785 + }, + { + "epoch": 0.7428774733846027, + "grad_norm": 1.2717069322669936, + "learning_rate": 7.718817452398869e-06, + "loss": 0.8412, + "num_input_tokens_seen": 1225271936, + "step": 6786 + }, + { + "epoch": 0.7429869454555407, + "grad_norm": 1.2322726635449848, + "learning_rate": 7.712604953075428e-06, + "loss": 0.9265, + "num_input_tokens_seen": 1225457632, + "step": 6787 + }, + { + "epoch": 0.7430964175264786, + "grad_norm": 1.1325834371554422, + "learning_rate": 7.70639449881779e-06, + "loss": 0.621, + "num_input_tokens_seen": 1225619360, + "step": 6788 + }, + { + "epoch": 0.7432058895974165, + "grad_norm": 1.3092468909182444, + "learning_rate": 7.700186090360609e-06, + "loss": 0.8448, + "num_input_tokens_seen": 1225774144, + "step": 6789 + }, + { + "epoch": 0.7433153616683543, + "grad_norm": 1.2479383155757806, + "learning_rate": 7.693979728438355e-06, + "loss": 0.997, + "num_input_tokens_seen": 1225961184, + "step": 6790 + }, + { + "epoch": 0.7434248337392922, + "grad_norm": 1.1050636438881958, + "learning_rate": 7.687775413785201e-06, + "loss": 0.6648, + "num_input_tokens_seen": 1226130304, + "step": 6791 + }, + { + "epoch": 0.7435343058102302, + "grad_norm": 1.3921483300860673, + "learning_rate": 7.681573147135126e-06, + "loss": 0.8546, + "num_input_tokens_seen": 1226317792, + "step": 6792 + }, + { + "epoch": 0.7436437778811681, + "grad_norm": 1.3398481986191635, + "learning_rate": 7.675372929221844e-06, + "loss": 1.0252, + "num_input_tokens_seen": 1226485568, + "step": 6793 + }, + { + "epoch": 0.743753249952106, + "grad_norm": 1.3068974674433533, + "learning_rate": 7.669174760778825e-06, + "loss": 0.6643, + "num_input_tokens_seen": 1226661408, + "step": 6794 + }, + { + "epoch": 0.7438627220230438, + "grad_norm": 1.0661890835818746, + "learning_rate": 7.662978642539298e-06, + "loss": 0.6442, + "num_input_tokens_seen": 1226851584, + "step": 6795 + }, + { + "epoch": 0.7439721940939817, + "grad_norm": 1.215079870306751, + "learning_rate": 7.65678457523625e-06, + "loss": 0.7448, + "num_input_tokens_seen": 1227038400, + "step": 6796 + }, + { + "epoch": 0.7440816661649197, + "grad_norm": 1.3395166470474658, + "learning_rate": 7.650592559602446e-06, + "loss": 0.6665, + "num_input_tokens_seen": 1227194528, + "step": 6797 + }, + { + "epoch": 0.7441911382358576, + "grad_norm": 1.268198763608909, + "learning_rate": 7.644402596370361e-06, + "loss": 1.1128, + "num_input_tokens_seen": 1227379104, + "step": 6798 + }, + { + "epoch": 0.7443006103067955, + "grad_norm": 1.1589429271295053, + "learning_rate": 7.638214686272285e-06, + "loss": 0.6007, + "num_input_tokens_seen": 1227529856, + "step": 6799 + }, + { + "epoch": 0.7444100823777334, + "grad_norm": 1.11620810734481, + "learning_rate": 7.632028830040208e-06, + "loss": 0.8588, + "num_input_tokens_seen": 1227724512, + "step": 6800 + }, + { + "epoch": 0.7445195544486712, + "grad_norm": 1.067116357713611, + "learning_rate": 7.6258450284059255e-06, + "loss": 0.8388, + "num_input_tokens_seen": 1227891168, + "step": 6801 + }, + { + "epoch": 0.7446290265196092, + "grad_norm": 1.0743844199308556, + "learning_rate": 7.619663282100961e-06, + "loss": 0.6731, + "num_input_tokens_seen": 1228069696, + "step": 6802 + }, + { + "epoch": 0.7447384985905471, + "grad_norm": 1.2763063376879296, + "learning_rate": 7.613483591856605e-06, + "loss": 0.8809, + "num_input_tokens_seen": 1228255840, + "step": 6803 + }, + { + "epoch": 0.744847970661485, + "grad_norm": 1.2315830574168696, + "learning_rate": 7.607305958403904e-06, + "loss": 0.7567, + "num_input_tokens_seen": 1228453408, + "step": 6804 + }, + { + "epoch": 0.7449574427324229, + "grad_norm": 1.2341619380127966, + "learning_rate": 7.601130382473651e-06, + "loss": 0.8681, + "num_input_tokens_seen": 1228635296, + "step": 6805 + }, + { + "epoch": 0.7450669148033608, + "grad_norm": 1.1477378738284556, + "learning_rate": 7.5949568647964265e-06, + "loss": 0.8736, + "num_input_tokens_seen": 1228836224, + "step": 6806 + }, + { + "epoch": 0.7451763868742987, + "grad_norm": 1.2785717955038483, + "learning_rate": 7.58878540610252e-06, + "loss": 0.8627, + "num_input_tokens_seen": 1229037152, + "step": 6807 + }, + { + "epoch": 0.7452858589452366, + "grad_norm": 1.097766802776249, + "learning_rate": 7.58261600712202e-06, + "loss": 0.7538, + "num_input_tokens_seen": 1229217248, + "step": 6808 + }, + { + "epoch": 0.7453953310161745, + "grad_norm": 1.1850457679988111, + "learning_rate": 7.576448668584752e-06, + "loss": 0.6557, + "num_input_tokens_seen": 1229405184, + "step": 6809 + }, + { + "epoch": 0.7455048030871124, + "grad_norm": 1.0737487874673697, + "learning_rate": 7.570283391220295e-06, + "loss": 0.8186, + "num_input_tokens_seen": 1229609920, + "step": 6810 + }, + { + "epoch": 0.7456142751580503, + "grad_norm": 1.25482391337943, + "learning_rate": 7.564120175757996e-06, + "loss": 0.7368, + "num_input_tokens_seen": 1229770528, + "step": 6811 + }, + { + "epoch": 0.7457237472289882, + "grad_norm": 1.3579112536094315, + "learning_rate": 7.557959022926947e-06, + "loss": 0.883, + "num_input_tokens_seen": 1229946368, + "step": 6812 + }, + { + "epoch": 0.7458332192999261, + "grad_norm": 1.1994048232067607, + "learning_rate": 7.551799933456003e-06, + "loss": 0.8674, + "num_input_tokens_seen": 1230145728, + "step": 6813 + }, + { + "epoch": 0.745942691370864, + "grad_norm": 1.105292585914517, + "learning_rate": 7.5456429080737635e-06, + "loss": 0.7522, + "num_input_tokens_seen": 1230323136, + "step": 6814 + }, + { + "epoch": 0.7460521634418019, + "grad_norm": 1.1668300116948105, + "learning_rate": 7.5394879475086085e-06, + "loss": 0.5962, + "num_input_tokens_seen": 1230479040, + "step": 6815 + }, + { + "epoch": 0.7461616355127398, + "grad_norm": 1.276226943701683, + "learning_rate": 7.533335052488652e-06, + "loss": 0.6823, + "num_input_tokens_seen": 1230657344, + "step": 6816 + }, + { + "epoch": 0.7462711075836778, + "grad_norm": 1.2122871998723592, + "learning_rate": 7.527184223741765e-06, + "loss": 0.5537, + "num_input_tokens_seen": 1230829152, + "step": 6817 + }, + { + "epoch": 0.7463805796546156, + "grad_norm": 1.3118770574269147, + "learning_rate": 7.521035461995585e-06, + "loss": 0.8846, + "num_input_tokens_seen": 1230977440, + "step": 6818 + }, + { + "epoch": 0.7464900517255535, + "grad_norm": 1.3209251472178931, + "learning_rate": 7.514888767977493e-06, + "loss": 0.9078, + "num_input_tokens_seen": 1231151712, + "step": 6819 + }, + { + "epoch": 0.7465995237964914, + "grad_norm": 1.2876226533409743, + "learning_rate": 7.508744142414629e-06, + "loss": 0.6792, + "num_input_tokens_seen": 1231312768, + "step": 6820 + }, + { + "epoch": 0.7467089958674293, + "grad_norm": 1.2198755495508167, + "learning_rate": 7.502601586033908e-06, + "loss": 0.701, + "num_input_tokens_seen": 1231495776, + "step": 6821 + }, + { + "epoch": 0.7468184679383673, + "grad_norm": 1.3165150945473973, + "learning_rate": 7.496461099561958e-06, + "loss": 0.9869, + "num_input_tokens_seen": 1231663104, + "step": 6822 + }, + { + "epoch": 0.7469279400093052, + "grad_norm": 1.1727438185575074, + "learning_rate": 7.490322683725204e-06, + "loss": 0.8417, + "num_input_tokens_seen": 1231853952, + "step": 6823 + }, + { + "epoch": 0.747037412080243, + "grad_norm": 1.1815759057851074, + "learning_rate": 7.484186339249804e-06, + "loss": 0.9362, + "num_input_tokens_seen": 1232032480, + "step": 6824 + }, + { + "epoch": 0.7471468841511809, + "grad_norm": 1.2469420221428473, + "learning_rate": 7.4780520668616765e-06, + "loss": 0.8023, + "num_input_tokens_seen": 1232213920, + "step": 6825 + }, + { + "epoch": 0.7472563562221188, + "grad_norm": 1.3139315019372746, + "learning_rate": 7.471919867286492e-06, + "loss": 0.7166, + "num_input_tokens_seen": 1232398272, + "step": 6826 + }, + { + "epoch": 0.7473658282930568, + "grad_norm": 1.113141449298581, + "learning_rate": 7.465789741249671e-06, + "loss": 0.8036, + "num_input_tokens_seen": 1232577920, + "step": 6827 + }, + { + "epoch": 0.7474753003639947, + "grad_norm": 1.3251659415355859, + "learning_rate": 7.4596616894764215e-06, + "loss": 0.8175, + "num_input_tokens_seen": 1232743680, + "step": 6828 + }, + { + "epoch": 0.7475847724349325, + "grad_norm": 1.2927151341942058, + "learning_rate": 7.4535357126916446e-06, + "loss": 0.8717, + "num_input_tokens_seen": 1232926688, + "step": 6829 + }, + { + "epoch": 0.7476942445058704, + "grad_norm": 1.4272465948938566, + "learning_rate": 7.447411811620067e-06, + "loss": 0.6184, + "num_input_tokens_seen": 1233110368, + "step": 6830 + }, + { + "epoch": 0.7478037165768083, + "grad_norm": 1.0965011329036205, + "learning_rate": 7.441289986986102e-06, + "loss": 0.6001, + "num_input_tokens_seen": 1233260896, + "step": 6831 + }, + { + "epoch": 0.7479131886477463, + "grad_norm": 1.2308902079244246, + "learning_rate": 7.43517023951398e-06, + "loss": 0.8753, + "num_input_tokens_seen": 1233459808, + "step": 6832 + }, + { + "epoch": 0.7480226607186842, + "grad_norm": 1.3158436373805869, + "learning_rate": 7.429052569927625e-06, + "loss": 1.1509, + "num_input_tokens_seen": 1233681120, + "step": 6833 + }, + { + "epoch": 0.7481321327896221, + "grad_norm": 1.3080153824050447, + "learning_rate": 7.4229369789507706e-06, + "loss": 0.9619, + "num_input_tokens_seen": 1233870624, + "step": 6834 + }, + { + "epoch": 0.7482416048605599, + "grad_norm": 1.3071237990771538, + "learning_rate": 7.416823467306866e-06, + "loss": 1.0941, + "num_input_tokens_seen": 1234074912, + "step": 6835 + }, + { + "epoch": 0.7483510769314978, + "grad_norm": 1.2857539865252845, + "learning_rate": 7.410712035719133e-06, + "loss": 0.9603, + "num_input_tokens_seen": 1234270464, + "step": 6836 + }, + { + "epoch": 0.7484605490024357, + "grad_norm": 1.3768011780358638, + "learning_rate": 7.4046026849105445e-06, + "loss": 1.1083, + "num_input_tokens_seen": 1234458624, + "step": 6837 + }, + { + "epoch": 0.7485700210733737, + "grad_norm": 1.3389531948920292, + "learning_rate": 7.3984954156038095e-06, + "loss": 0.8416, + "num_input_tokens_seen": 1234628416, + "step": 6838 + }, + { + "epoch": 0.7486794931443116, + "grad_norm": 1.3158787426029706, + "learning_rate": 7.392390228521437e-06, + "loss": 0.6634, + "num_input_tokens_seen": 1234793504, + "step": 6839 + }, + { + "epoch": 0.7487889652152495, + "grad_norm": 1.3869580559360046, + "learning_rate": 7.386287124385624e-06, + "loss": 0.8972, + "num_input_tokens_seen": 1234967104, + "step": 6840 + }, + { + "epoch": 0.7488984372861873, + "grad_norm": 1.1983992171358828, + "learning_rate": 7.3801861039183796e-06, + "loss": 0.8314, + "num_input_tokens_seen": 1235159520, + "step": 6841 + }, + { + "epoch": 0.7490079093571252, + "grad_norm": 1.2568029297093386, + "learning_rate": 7.374087167841437e-06, + "loss": 0.9452, + "num_input_tokens_seen": 1235341856, + "step": 6842 + }, + { + "epoch": 0.7491173814280632, + "grad_norm": 1.2174077590058403, + "learning_rate": 7.367990316876286e-06, + "loss": 0.8712, + "num_input_tokens_seen": 1235501120, + "step": 6843 + }, + { + "epoch": 0.7492268534990011, + "grad_norm": 1.1297633207632816, + "learning_rate": 7.361895551744175e-06, + "loss": 0.6424, + "num_input_tokens_seen": 1235711456, + "step": 6844 + }, + { + "epoch": 0.749336325569939, + "grad_norm": 1.2364096471536632, + "learning_rate": 7.355802873166101e-06, + "loss": 1.0514, + "num_input_tokens_seen": 1235903648, + "step": 6845 + }, + { + "epoch": 0.7494457976408768, + "grad_norm": 1.3763768498501219, + "learning_rate": 7.349712281862817e-06, + "loss": 1.1335, + "num_input_tokens_seen": 1236106144, + "step": 6846 + }, + { + "epoch": 0.7495552697118147, + "grad_norm": 1.222513452545602, + "learning_rate": 7.34362377855482e-06, + "loss": 0.6468, + "num_input_tokens_seen": 1236276160, + "step": 6847 + }, + { + "epoch": 0.7496647417827527, + "grad_norm": 1.3341411090780033, + "learning_rate": 7.3375373639623876e-06, + "loss": 0.7023, + "num_input_tokens_seen": 1236434304, + "step": 6848 + }, + { + "epoch": 0.7497742138536906, + "grad_norm": 1.107508687917418, + "learning_rate": 7.331453038805517e-06, + "loss": 0.5547, + "num_input_tokens_seen": 1236612160, + "step": 6849 + }, + { + "epoch": 0.7498836859246285, + "grad_norm": 1.1561298307915324, + "learning_rate": 7.325370803803977e-06, + "loss": 0.8195, + "num_input_tokens_seen": 1236782624, + "step": 6850 + }, + { + "epoch": 0.7499931579955664, + "grad_norm": 1.233240741183828, + "learning_rate": 7.319290659677283e-06, + "loss": 0.7762, + "num_input_tokens_seen": 1236971904, + "step": 6851 + }, + { + "epoch": 0.7501026300665042, + "grad_norm": 1.3132165587659104, + "learning_rate": 7.313212607144704e-06, + "loss": 0.7099, + "num_input_tokens_seen": 1237136992, + "step": 6852 + }, + { + "epoch": 0.7502121021374422, + "grad_norm": 1.1768746598983169, + "learning_rate": 7.307136646925261e-06, + "loss": 0.7497, + "num_input_tokens_seen": 1237317760, + "step": 6853 + }, + { + "epoch": 0.7503215742083801, + "grad_norm": 1.2867218499299757, + "learning_rate": 7.30106277973773e-06, + "loss": 0.6882, + "num_input_tokens_seen": 1237500992, + "step": 6854 + }, + { + "epoch": 0.750431046279318, + "grad_norm": 1.1524356611419113, + "learning_rate": 7.294991006300631e-06, + "loss": 0.6981, + "num_input_tokens_seen": 1237671008, + "step": 6855 + }, + { + "epoch": 0.7505405183502559, + "grad_norm": 1.238311093328766, + "learning_rate": 7.288921327332254e-06, + "loss": 0.7309, + "num_input_tokens_seen": 1237856032, + "step": 6856 + }, + { + "epoch": 0.7506499904211938, + "grad_norm": 1.0656463374801124, + "learning_rate": 7.28285374355063e-06, + "loss": 0.6799, + "num_input_tokens_seen": 1238044864, + "step": 6857 + }, + { + "epoch": 0.7507594624921317, + "grad_norm": 1.3607329678048594, + "learning_rate": 7.276788255673539e-06, + "loss": 0.9884, + "num_input_tokens_seen": 1238226528, + "step": 6858 + }, + { + "epoch": 0.7508689345630696, + "grad_norm": 1.1825160291202321, + "learning_rate": 7.270724864418513e-06, + "loss": 0.7491, + "num_input_tokens_seen": 1238429024, + "step": 6859 + }, + { + "epoch": 0.7509784066340075, + "grad_norm": 1.2806522556753768, + "learning_rate": 7.264663570502844e-06, + "loss": 1.0001, + "num_input_tokens_seen": 1238642720, + "step": 6860 + }, + { + "epoch": 0.7510878787049454, + "grad_norm": 1.1952642792593002, + "learning_rate": 7.258604374643571e-06, + "loss": 1.0472, + "num_input_tokens_seen": 1238817664, + "step": 6861 + }, + { + "epoch": 0.7511973507758833, + "grad_norm": 1.2841519312886798, + "learning_rate": 7.252547277557478e-06, + "loss": 0.8808, + "num_input_tokens_seen": 1238987904, + "step": 6862 + }, + { + "epoch": 0.7513068228468212, + "grad_norm": 1.1343898971882522, + "learning_rate": 7.246492279961129e-06, + "loss": 0.6106, + "num_input_tokens_seen": 1239161280, + "step": 6863 + }, + { + "epoch": 0.7514162949177591, + "grad_norm": 1.199884633638098, + "learning_rate": 7.24043938257079e-06, + "loss": 0.7211, + "num_input_tokens_seen": 1239359520, + "step": 6864 + }, + { + "epoch": 0.751525766988697, + "grad_norm": 1.211601124704468, + "learning_rate": 7.234388586102528e-06, + "loss": 0.7162, + "num_input_tokens_seen": 1239512960, + "step": 6865 + }, + { + "epoch": 0.7516352390596349, + "grad_norm": 1.1977479345932465, + "learning_rate": 7.228339891272135e-06, + "loss": 0.9926, + "num_input_tokens_seen": 1239712992, + "step": 6866 + }, + { + "epoch": 0.7517447111305728, + "grad_norm": 1.391064392280517, + "learning_rate": 7.222293298795158e-06, + "loss": 0.8644, + "num_input_tokens_seen": 1239866208, + "step": 6867 + }, + { + "epoch": 0.7518541832015108, + "grad_norm": 1.1921460465690006, + "learning_rate": 7.216248809386899e-06, + "loss": 1.0343, + "num_input_tokens_seen": 1240059968, + "step": 6868 + }, + { + "epoch": 0.7519636552724486, + "grad_norm": 1.2309011517029593, + "learning_rate": 7.210206423762403e-06, + "loss": 0.7215, + "num_input_tokens_seen": 1240262016, + "step": 6869 + }, + { + "epoch": 0.7520731273433865, + "grad_norm": 1.283706403770203, + "learning_rate": 7.2041661426364925e-06, + "loss": 0.8203, + "num_input_tokens_seen": 1240452640, + "step": 6870 + }, + { + "epoch": 0.7521825994143244, + "grad_norm": 1.1366283996960778, + "learning_rate": 7.198127966723692e-06, + "loss": 0.8787, + "num_input_tokens_seen": 1240643040, + "step": 6871 + }, + { + "epoch": 0.7522920714852623, + "grad_norm": 1.2307133991502992, + "learning_rate": 7.192091896738337e-06, + "loss": 0.7175, + "num_input_tokens_seen": 1240845760, + "step": 6872 + }, + { + "epoch": 0.7524015435562003, + "grad_norm": 1.3220922900230219, + "learning_rate": 7.1860579333944525e-06, + "loss": 0.9158, + "num_input_tokens_seen": 1241020032, + "step": 6873 + }, + { + "epoch": 0.7525110156271382, + "grad_norm": 1.2414748348738287, + "learning_rate": 7.180026077405877e-06, + "loss": 0.738, + "num_input_tokens_seen": 1241201920, + "step": 6874 + }, + { + "epoch": 0.752620487698076, + "grad_norm": 1.1680715254151681, + "learning_rate": 7.1739963294861325e-06, + "loss": 0.9236, + "num_input_tokens_seen": 1241411584, + "step": 6875 + }, + { + "epoch": 0.7527299597690139, + "grad_norm": 1.166446625303928, + "learning_rate": 7.167968690348554e-06, + "loss": 0.8722, + "num_input_tokens_seen": 1241601536, + "step": 6876 + }, + { + "epoch": 0.7528394318399518, + "grad_norm": 1.1262928633413567, + "learning_rate": 7.161943160706189e-06, + "loss": 0.8053, + "num_input_tokens_seen": 1241787232, + "step": 6877 + }, + { + "epoch": 0.7529489039108898, + "grad_norm": 1.2731873904387287, + "learning_rate": 7.155919741271849e-06, + "loss": 0.9984, + "num_input_tokens_seen": 1241959040, + "step": 6878 + }, + { + "epoch": 0.7530583759818277, + "grad_norm": 1.1633190879022444, + "learning_rate": 7.149898432758093e-06, + "loss": 0.6845, + "num_input_tokens_seen": 1242146304, + "step": 6879 + }, + { + "epoch": 0.7531678480527655, + "grad_norm": 1.2993785638287851, + "learning_rate": 7.143879235877218e-06, + "loss": 0.7861, + "num_input_tokens_seen": 1242342976, + "step": 6880 + }, + { + "epoch": 0.7532773201237034, + "grad_norm": 1.17085486196543, + "learning_rate": 7.13786215134131e-06, + "loss": 0.7791, + "num_input_tokens_seen": 1242525536, + "step": 6881 + }, + { + "epoch": 0.7533867921946413, + "grad_norm": 1.2159613470064068, + "learning_rate": 7.131847179862148e-06, + "loss": 1.0009, + "num_input_tokens_seen": 1242710784, + "step": 6882 + }, + { + "epoch": 0.7534962642655793, + "grad_norm": 1.2321386238483503, + "learning_rate": 7.125834322151315e-06, + "loss": 0.8608, + "num_input_tokens_seen": 1242918208, + "step": 6883 + }, + { + "epoch": 0.7536057363365172, + "grad_norm": 1.3269745724763682, + "learning_rate": 7.119823578920112e-06, + "loss": 0.7602, + "num_input_tokens_seen": 1243107936, + "step": 6884 + }, + { + "epoch": 0.7537152084074551, + "grad_norm": 1.1348127398008747, + "learning_rate": 7.113814950879596e-06, + "loss": 0.7868, + "num_input_tokens_seen": 1243276384, + "step": 6885 + }, + { + "epoch": 0.7538246804783929, + "grad_norm": 1.3039738879251475, + "learning_rate": 7.1078084387405815e-06, + "loss": 0.7359, + "num_input_tokens_seen": 1243439456, + "step": 6886 + }, + { + "epoch": 0.7539341525493308, + "grad_norm": 1.133133415279291, + "learning_rate": 7.101804043213625e-06, + "loss": 0.7925, + "num_input_tokens_seen": 1243644416, + "step": 6887 + }, + { + "epoch": 0.7540436246202687, + "grad_norm": 1.323135157687155, + "learning_rate": 7.0958017650090245e-06, + "loss": 0.8946, + "num_input_tokens_seen": 1243808160, + "step": 6888 + }, + { + "epoch": 0.7541530966912067, + "grad_norm": 1.134109964226281, + "learning_rate": 7.089801604836857e-06, + "loss": 0.8432, + "num_input_tokens_seen": 1243992288, + "step": 6889 + }, + { + "epoch": 0.7542625687621446, + "grad_norm": 1.0762042752077334, + "learning_rate": 7.083803563406924e-06, + "loss": 0.709, + "num_input_tokens_seen": 1244214720, + "step": 6890 + }, + { + "epoch": 0.7543720408330825, + "grad_norm": 1.324607049506308, + "learning_rate": 7.077807641428777e-06, + "loss": 0.8934, + "num_input_tokens_seen": 1244378016, + "step": 6891 + }, + { + "epoch": 0.7544815129040203, + "grad_norm": 1.2721363277165232, + "learning_rate": 7.071813839611724e-06, + "loss": 0.7872, + "num_input_tokens_seen": 1244565504, + "step": 6892 + }, + { + "epoch": 0.7545909849749582, + "grad_norm": 1.1948271121172058, + "learning_rate": 7.0658221586648195e-06, + "loss": 0.8506, + "num_input_tokens_seen": 1244752320, + "step": 6893 + }, + { + "epoch": 0.7547004570458962, + "grad_norm": 1.217469055753837, + "learning_rate": 7.059832599296873e-06, + "loss": 0.7866, + "num_input_tokens_seen": 1244909120, + "step": 6894 + }, + { + "epoch": 0.7548099291168341, + "grad_norm": 1.2979976436934515, + "learning_rate": 7.053845162216424e-06, + "loss": 0.6671, + "num_input_tokens_seen": 1245084736, + "step": 6895 + }, + { + "epoch": 0.754919401187772, + "grad_norm": 1.298884340374737, + "learning_rate": 7.047859848131802e-06, + "loss": 0.8998, + "num_input_tokens_seen": 1245269984, + "step": 6896 + }, + { + "epoch": 0.7550288732587098, + "grad_norm": 1.326395546839294, + "learning_rate": 7.041876657751023e-06, + "loss": 0.7954, + "num_input_tokens_seen": 1245409760, + "step": 6897 + }, + { + "epoch": 0.7551383453296477, + "grad_norm": 1.247643395096704, + "learning_rate": 7.035895591781916e-06, + "loss": 1.0313, + "num_input_tokens_seen": 1245605984, + "step": 6898 + }, + { + "epoch": 0.7552478174005857, + "grad_norm": 1.3647373151411535, + "learning_rate": 7.0299166509320194e-06, + "loss": 0.7462, + "num_input_tokens_seen": 1245775328, + "step": 6899 + }, + { + "epoch": 0.7553572894715236, + "grad_norm": 1.248720420128124, + "learning_rate": 7.023939835908627e-06, + "loss": 0.7666, + "num_input_tokens_seen": 1245912192, + "step": 6900 + }, + { + "epoch": 0.7554667615424615, + "grad_norm": 1.2004938956967008, + "learning_rate": 7.0179651474187895e-06, + "loss": 0.8747, + "num_input_tokens_seen": 1246109984, + "step": 6901 + }, + { + "epoch": 0.7555762336133994, + "grad_norm": 1.1673508352361557, + "learning_rate": 7.011992586169291e-06, + "loss": 0.6244, + "num_input_tokens_seen": 1246277536, + "step": 6902 + }, + { + "epoch": 0.7556857056843372, + "grad_norm": 1.314764566921466, + "learning_rate": 7.006022152866698e-06, + "loss": 0.7359, + "num_input_tokens_seen": 1246459424, + "step": 6903 + }, + { + "epoch": 0.7557951777552752, + "grad_norm": 1.293430487942542, + "learning_rate": 7.000053848217272e-06, + "loss": 0.9037, + "num_input_tokens_seen": 1246654528, + "step": 6904 + }, + { + "epoch": 0.7559046498262131, + "grad_norm": 1.0954950308432412, + "learning_rate": 6.99408767292708e-06, + "loss": 0.7057, + "num_input_tokens_seen": 1246832384, + "step": 6905 + }, + { + "epoch": 0.756014121897151, + "grad_norm": 1.2357155968385856, + "learning_rate": 6.988123627701879e-06, + "loss": 0.6909, + "num_input_tokens_seen": 1247007104, + "step": 6906 + }, + { + "epoch": 0.7561235939680889, + "grad_norm": 1.2221812827786835, + "learning_rate": 6.982161713247226e-06, + "loss": 0.708, + "num_input_tokens_seen": 1247153152, + "step": 6907 + }, + { + "epoch": 0.7562330660390268, + "grad_norm": 1.2403371693221121, + "learning_rate": 6.9762019302684e-06, + "loss": 0.7185, + "num_input_tokens_seen": 1247310400, + "step": 6908 + }, + { + "epoch": 0.7563425381099647, + "grad_norm": 1.2019839139360236, + "learning_rate": 6.970244279470431e-06, + "loss": 0.7195, + "num_input_tokens_seen": 1247493184, + "step": 6909 + }, + { + "epoch": 0.7564520101809026, + "grad_norm": 1.2180652161788663, + "learning_rate": 6.964288761558094e-06, + "loss": 0.8429, + "num_input_tokens_seen": 1247666336, + "step": 6910 + }, + { + "epoch": 0.7565614822518405, + "grad_norm": 1.1898155224082372, + "learning_rate": 6.958335377235911e-06, + "loss": 0.7443, + "num_input_tokens_seen": 1247817312, + "step": 6911 + }, + { + "epoch": 0.7566709543227784, + "grad_norm": 1.2126653342244058, + "learning_rate": 6.952384127208181e-06, + "loss": 0.7569, + "num_input_tokens_seen": 1248025408, + "step": 6912 + }, + { + "epoch": 0.7567804263937163, + "grad_norm": 1.263104271508663, + "learning_rate": 6.94643501217889e-06, + "loss": 1.1645, + "num_input_tokens_seen": 1248225216, + "step": 6913 + }, + { + "epoch": 0.7568898984646542, + "grad_norm": 1.1138831937289702, + "learning_rate": 6.940488032851839e-06, + "loss": 0.8654, + "num_input_tokens_seen": 1248390976, + "step": 6914 + }, + { + "epoch": 0.7569993705355921, + "grad_norm": 1.2787751742930595, + "learning_rate": 6.934543189930515e-06, + "loss": 0.9939, + "num_input_tokens_seen": 1248560992, + "step": 6915 + }, + { + "epoch": 0.75710884260653, + "grad_norm": 1.318034537527093, + "learning_rate": 6.928600484118206e-06, + "loss": 1.0641, + "num_input_tokens_seen": 1248731232, + "step": 6916 + }, + { + "epoch": 0.7572183146774679, + "grad_norm": 1.1616919016092546, + "learning_rate": 6.92265991611791e-06, + "loss": 0.7616, + "num_input_tokens_seen": 1248906400, + "step": 6917 + }, + { + "epoch": 0.7573277867484058, + "grad_norm": 1.1325377032338553, + "learning_rate": 6.916721486632391e-06, + "loss": 0.826, + "num_input_tokens_seen": 1249075072, + "step": 6918 + }, + { + "epoch": 0.7574372588193438, + "grad_norm": 1.2699936714164868, + "learning_rate": 6.9107851963641505e-06, + "loss": 0.8093, + "num_input_tokens_seen": 1249240384, + "step": 6919 + }, + { + "epoch": 0.7575467308902816, + "grad_norm": 1.0729278020682136, + "learning_rate": 6.9048510460154315e-06, + "loss": 0.6396, + "num_input_tokens_seen": 1249422944, + "step": 6920 + }, + { + "epoch": 0.7576562029612195, + "grad_norm": 1.2900873445172072, + "learning_rate": 6.8989190362882565e-06, + "loss": 1.0675, + "num_input_tokens_seen": 1249618496, + "step": 6921 + }, + { + "epoch": 0.7577656750321574, + "grad_norm": 1.0023215287704557, + "learning_rate": 6.892989167884342e-06, + "loss": 0.603, + "num_input_tokens_seen": 1249805088, + "step": 6922 + }, + { + "epoch": 0.7578751471030953, + "grad_norm": 1.1081275911510657, + "learning_rate": 6.887061441505202e-06, + "loss": 0.8421, + "num_input_tokens_seen": 1250003776, + "step": 6923 + }, + { + "epoch": 0.7579846191740333, + "grad_norm": 1.0680239079260405, + "learning_rate": 6.881135857852067e-06, + "loss": 0.6441, + "num_input_tokens_seen": 1250176928, + "step": 6924 + }, + { + "epoch": 0.7580940912449712, + "grad_norm": 1.3067074061549866, + "learning_rate": 6.87521241762592e-06, + "loss": 0.9146, + "num_input_tokens_seen": 1250367776, + "step": 6925 + }, + { + "epoch": 0.758203563315909, + "grad_norm": 1.377179196243753, + "learning_rate": 6.869291121527499e-06, + "loss": 0.8882, + "num_input_tokens_seen": 1250518976, + "step": 6926 + }, + { + "epoch": 0.7583130353868469, + "grad_norm": 1.2250846658369914, + "learning_rate": 6.863371970257276e-06, + "loss": 0.9045, + "num_input_tokens_seen": 1250728640, + "step": 6927 + }, + { + "epoch": 0.7584225074577848, + "grad_norm": 1.2154130491024806, + "learning_rate": 6.857454964515481e-06, + "loss": 0.6783, + "num_input_tokens_seen": 1250921952, + "step": 6928 + }, + { + "epoch": 0.7585319795287228, + "grad_norm": 1.4282767417643765, + "learning_rate": 6.851540105002077e-06, + "loss": 0.9814, + "num_input_tokens_seen": 1251121088, + "step": 6929 + }, + { + "epoch": 0.7586414515996607, + "grad_norm": 1.1809571867021367, + "learning_rate": 6.845627392416779e-06, + "loss": 0.745, + "num_input_tokens_seen": 1251289984, + "step": 6930 + }, + { + "epoch": 0.7587509236705985, + "grad_norm": 1.3452016397094715, + "learning_rate": 6.839716827459064e-06, + "loss": 1.0792, + "num_input_tokens_seen": 1251471872, + "step": 6931 + }, + { + "epoch": 0.7588603957415364, + "grad_norm": 1.253418444286772, + "learning_rate": 6.83380841082813e-06, + "loss": 0.7691, + "num_input_tokens_seen": 1251646592, + "step": 6932 + }, + { + "epoch": 0.7589698678124743, + "grad_norm": 1.2811349026225347, + "learning_rate": 6.827902143222933e-06, + "loss": 0.9978, + "num_input_tokens_seen": 1251850208, + "step": 6933 + }, + { + "epoch": 0.7590793398834123, + "grad_norm": 1.2314961337463675, + "learning_rate": 6.821998025342172e-06, + "loss": 0.7411, + "num_input_tokens_seen": 1252013280, + "step": 6934 + }, + { + "epoch": 0.7591888119543502, + "grad_norm": 1.2398253720128707, + "learning_rate": 6.816096057884297e-06, + "loss": 0.8251, + "num_input_tokens_seen": 1252196512, + "step": 6935 + }, + { + "epoch": 0.7592982840252881, + "grad_norm": 1.2364777629625132, + "learning_rate": 6.810196241547495e-06, + "loss": 0.9841, + "num_input_tokens_seen": 1252335392, + "step": 6936 + }, + { + "epoch": 0.7594077560962259, + "grad_norm": 1.2592995897911443, + "learning_rate": 6.804298577029697e-06, + "loss": 0.8199, + "num_input_tokens_seen": 1252529376, + "step": 6937 + }, + { + "epoch": 0.7595172281671638, + "grad_norm": 1.2781887234422722, + "learning_rate": 6.798403065028611e-06, + "loss": 0.8666, + "num_input_tokens_seen": 1252733440, + "step": 6938 + }, + { + "epoch": 0.7596267002381017, + "grad_norm": 1.1816022882469093, + "learning_rate": 6.792509706241629e-06, + "loss": 1.1484, + "num_input_tokens_seen": 1252940416, + "step": 6939 + }, + { + "epoch": 0.7597361723090397, + "grad_norm": 1.3573113046524834, + "learning_rate": 6.786618501365949e-06, + "loss": 1.1212, + "num_input_tokens_seen": 1253130368, + "step": 6940 + }, + { + "epoch": 0.7598456443799776, + "grad_norm": 1.2342401744752007, + "learning_rate": 6.780729451098483e-06, + "loss": 0.8732, + "num_input_tokens_seen": 1253337568, + "step": 6941 + }, + { + "epoch": 0.7599551164509155, + "grad_norm": 1.2046188394298531, + "learning_rate": 6.7748425561358934e-06, + "loss": 0.9424, + "num_input_tokens_seen": 1253526624, + "step": 6942 + }, + { + "epoch": 0.7600645885218533, + "grad_norm": 1.326021301081178, + "learning_rate": 6.76895781717459e-06, + "loss": 0.9154, + "num_input_tokens_seen": 1253709632, + "step": 6943 + }, + { + "epoch": 0.7601740605927912, + "grad_norm": 1.236098619224111, + "learning_rate": 6.763075234910715e-06, + "loss": 0.8865, + "num_input_tokens_seen": 1253914144, + "step": 6944 + }, + { + "epoch": 0.7602835326637292, + "grad_norm": 1.3581436104690918, + "learning_rate": 6.757194810040193e-06, + "loss": 0.9831, + "num_input_tokens_seen": 1254094464, + "step": 6945 + }, + { + "epoch": 0.7603930047346671, + "grad_norm": 1.3008941051568057, + "learning_rate": 6.751316543258637e-06, + "loss": 0.8722, + "num_input_tokens_seen": 1254283744, + "step": 6946 + }, + { + "epoch": 0.760502476805605, + "grad_norm": 1.1629304950146617, + "learning_rate": 6.745440435261463e-06, + "loss": 0.9107, + "num_input_tokens_seen": 1254500128, + "step": 6947 + }, + { + "epoch": 0.7606119488765428, + "grad_norm": 1.2440345515331659, + "learning_rate": 6.739566486743773e-06, + "loss": 0.8474, + "num_input_tokens_seen": 1254681792, + "step": 6948 + }, + { + "epoch": 0.7607214209474807, + "grad_norm": 1.1618260655350703, + "learning_rate": 6.733694698400467e-06, + "loss": 0.8152, + "num_input_tokens_seen": 1254854944, + "step": 6949 + }, + { + "epoch": 0.7608308930184187, + "grad_norm": 1.2860227869426601, + "learning_rate": 6.727825070926158e-06, + "loss": 0.7308, + "num_input_tokens_seen": 1255035040, + "step": 6950 + }, + { + "epoch": 0.7609403650893566, + "grad_norm": 1.2648679564847545, + "learning_rate": 6.721957605015214e-06, + "loss": 0.9711, + "num_input_tokens_seen": 1255208864, + "step": 6951 + }, + { + "epoch": 0.7610498371602945, + "grad_norm": 1.165684229596194, + "learning_rate": 6.716092301361743e-06, + "loss": 0.808, + "num_input_tokens_seen": 1255393664, + "step": 6952 + }, + { + "epoch": 0.7611593092312324, + "grad_norm": 1.3124655764470627, + "learning_rate": 6.710229160659593e-06, + "loss": 0.969, + "num_input_tokens_seen": 1255591680, + "step": 6953 + }, + { + "epoch": 0.7612687813021702, + "grad_norm": 1.1300786672064083, + "learning_rate": 6.704368183602386e-06, + "loss": 0.7616, + "num_input_tokens_seen": 1255765504, + "step": 6954 + }, + { + "epoch": 0.7613782533731082, + "grad_norm": 1.097637027015562, + "learning_rate": 6.698509370883429e-06, + "loss": 0.7269, + "num_input_tokens_seen": 1255965760, + "step": 6955 + }, + { + "epoch": 0.7614877254440461, + "grad_norm": 1.2143096240278226, + "learning_rate": 6.692652723195836e-06, + "loss": 0.9119, + "num_input_tokens_seen": 1256157056, + "step": 6956 + }, + { + "epoch": 0.761597197514984, + "grad_norm": 1.1634559331105325, + "learning_rate": 6.686798241232428e-06, + "loss": 0.8523, + "num_input_tokens_seen": 1256359328, + "step": 6957 + }, + { + "epoch": 0.7617066695859219, + "grad_norm": 1.1661746429433, + "learning_rate": 6.680945925685778e-06, + "loss": 0.665, + "num_input_tokens_seen": 1256545248, + "step": 6958 + }, + { + "epoch": 0.7618161416568598, + "grad_norm": 1.2161233424762476, + "learning_rate": 6.675095777248208e-06, + "loss": 0.7031, + "num_input_tokens_seen": 1256688160, + "step": 6959 + }, + { + "epoch": 0.7619256137277977, + "grad_norm": 1.3170202617433526, + "learning_rate": 6.669247796611774e-06, + "loss": 0.6921, + "num_input_tokens_seen": 1256820768, + "step": 6960 + }, + { + "epoch": 0.7620350857987356, + "grad_norm": 1.2187074018027466, + "learning_rate": 6.663401984468281e-06, + "loss": 0.8674, + "num_input_tokens_seen": 1257016320, + "step": 6961 + }, + { + "epoch": 0.7621445578696735, + "grad_norm": 1.1357689523864245, + "learning_rate": 6.657558341509276e-06, + "loss": 0.8157, + "num_input_tokens_seen": 1257197088, + "step": 6962 + }, + { + "epoch": 0.7622540299406114, + "grad_norm": 1.3966283763987057, + "learning_rate": 6.651716868426061e-06, + "loss": 1.2146, + "num_input_tokens_seen": 1257392864, + "step": 6963 + }, + { + "epoch": 0.7623635020115493, + "grad_norm": 1.2050829721906668, + "learning_rate": 6.645877565909664e-06, + "loss": 0.9292, + "num_input_tokens_seen": 1257574976, + "step": 6964 + }, + { + "epoch": 0.7624729740824872, + "grad_norm": 1.2337851261754278, + "learning_rate": 6.6400404346508625e-06, + "loss": 0.6703, + "num_input_tokens_seen": 1257756416, + "step": 6965 + }, + { + "epoch": 0.7625824461534251, + "grad_norm": 1.2319874428536437, + "learning_rate": 6.634205475340182e-06, + "loss": 0.8921, + "num_input_tokens_seen": 1257921056, + "step": 6966 + }, + { + "epoch": 0.762691918224363, + "grad_norm": 1.1461608534230943, + "learning_rate": 6.628372688667883e-06, + "loss": 0.8477, + "num_input_tokens_seen": 1258098016, + "step": 6967 + }, + { + "epoch": 0.7628013902953009, + "grad_norm": 1.049877643268036, + "learning_rate": 6.622542075323973e-06, + "loss": 0.8869, + "num_input_tokens_seen": 1258296480, + "step": 6968 + }, + { + "epoch": 0.7629108623662388, + "grad_norm": 1.3323299486964943, + "learning_rate": 6.6167136359982064e-06, + "loss": 0.9329, + "num_input_tokens_seen": 1258457312, + "step": 6969 + }, + { + "epoch": 0.7630203344371768, + "grad_norm": 1.1760048384372281, + "learning_rate": 6.610887371380064e-06, + "loss": 0.8995, + "num_input_tokens_seen": 1258649504, + "step": 6970 + }, + { + "epoch": 0.7631298065081146, + "grad_norm": 1.1667830148083767, + "learning_rate": 6.605063282158808e-06, + "loss": 0.8872, + "num_input_tokens_seen": 1258846624, + "step": 6971 + }, + { + "epoch": 0.7632392785790525, + "grad_norm": 1.2292268813734564, + "learning_rate": 6.599241369023385e-06, + "loss": 0.8379, + "num_input_tokens_seen": 1259029632, + "step": 6972 + }, + { + "epoch": 0.7633487506499904, + "grad_norm": 1.2185065197431715, + "learning_rate": 6.593421632662539e-06, + "loss": 0.9019, + "num_input_tokens_seen": 1259207712, + "step": 6973 + }, + { + "epoch": 0.7634582227209283, + "grad_norm": 1.157189940826784, + "learning_rate": 6.587604073764728e-06, + "loss": 0.7225, + "num_input_tokens_seen": 1259388928, + "step": 6974 + }, + { + "epoch": 0.7635676947918663, + "grad_norm": 1.196862555854985, + "learning_rate": 6.581788693018154e-06, + "loss": 0.9872, + "num_input_tokens_seen": 1259575072, + "step": 6975 + }, + { + "epoch": 0.7636771668628042, + "grad_norm": 1.1716624257882333, + "learning_rate": 6.575975491110769e-06, + "loss": 0.7461, + "num_input_tokens_seen": 1259744416, + "step": 6976 + }, + { + "epoch": 0.763786638933742, + "grad_norm": 1.0412056411325092, + "learning_rate": 6.570164468730258e-06, + "loss": 0.7605, + "num_input_tokens_seen": 1259931008, + "step": 6977 + }, + { + "epoch": 0.7638961110046799, + "grad_norm": 1.1698039254927322, + "learning_rate": 6.56435562656407e-06, + "loss": 0.6357, + "num_input_tokens_seen": 1260091616, + "step": 6978 + }, + { + "epoch": 0.7640055830756178, + "grad_norm": 1.2888310080079384, + "learning_rate": 6.558548965299355e-06, + "loss": 1.0362, + "num_input_tokens_seen": 1260286272, + "step": 6979 + }, + { + "epoch": 0.7641150551465558, + "grad_norm": 1.3623058688250926, + "learning_rate": 6.552744485623058e-06, + "loss": 0.967, + "num_input_tokens_seen": 1260465248, + "step": 6980 + }, + { + "epoch": 0.7642245272174937, + "grad_norm": 1.0484787343042046, + "learning_rate": 6.5469421882218075e-06, + "loss": 0.6375, + "num_input_tokens_seen": 1260670208, + "step": 6981 + }, + { + "epoch": 0.7643339992884315, + "grad_norm": 1.5081114793749208, + "learning_rate": 6.541142073782028e-06, + "loss": 0.9057, + "num_input_tokens_seen": 1260809760, + "step": 6982 + }, + { + "epoch": 0.7644434713593694, + "grad_norm": 1.1941210724262794, + "learning_rate": 6.535344142989852e-06, + "loss": 1.0672, + "num_input_tokens_seen": 1260990976, + "step": 6983 + }, + { + "epoch": 0.7645529434303073, + "grad_norm": 1.1383204236748374, + "learning_rate": 6.529548396531168e-06, + "loss": 0.7156, + "num_input_tokens_seen": 1261149344, + "step": 6984 + }, + { + "epoch": 0.7646624155012453, + "grad_norm": 1.1845834956178523, + "learning_rate": 6.523754835091597e-06, + "loss": 0.909, + "num_input_tokens_seen": 1261351840, + "step": 6985 + }, + { + "epoch": 0.7647718875721832, + "grad_norm": 1.2157664336863316, + "learning_rate": 6.517963459356502e-06, + "loss": 0.7079, + "num_input_tokens_seen": 1261511552, + "step": 6986 + }, + { + "epoch": 0.7648813596431211, + "grad_norm": 1.3056900832123401, + "learning_rate": 6.512174270011015e-06, + "loss": 0.9208, + "num_input_tokens_seen": 1261675296, + "step": 6987 + }, + { + "epoch": 0.7649908317140589, + "grad_norm": 1.3306889188867401, + "learning_rate": 6.5063872677399525e-06, + "loss": 1.0308, + "num_input_tokens_seen": 1261855392, + "step": 6988 + }, + { + "epoch": 0.7651003037849968, + "grad_norm": 1.1950721748454798, + "learning_rate": 6.500602453227936e-06, + "loss": 0.8272, + "num_input_tokens_seen": 1262009056, + "step": 6989 + }, + { + "epoch": 0.7652097758559347, + "grad_norm": 1.1952449804446779, + "learning_rate": 6.494819827159271e-06, + "loss": 0.801, + "num_input_tokens_seen": 1262213568, + "step": 6990 + }, + { + "epoch": 0.7653192479268727, + "grad_norm": 1.225874981478472, + "learning_rate": 6.489039390218052e-06, + "loss": 0.6824, + "num_input_tokens_seen": 1262375520, + "step": 6991 + }, + { + "epoch": 0.7654287199978106, + "grad_norm": 1.2916350155715903, + "learning_rate": 6.483261143088084e-06, + "loss": 0.7836, + "num_input_tokens_seen": 1262536352, + "step": 6992 + }, + { + "epoch": 0.7655381920687485, + "grad_norm": 1.3313516354156867, + "learning_rate": 6.477485086452928e-06, + "loss": 1.2439, + "num_input_tokens_seen": 1262738624, + "step": 6993 + }, + { + "epoch": 0.7656476641396863, + "grad_norm": 1.1299885772026457, + "learning_rate": 6.471711220995877e-06, + "loss": 0.7429, + "num_input_tokens_seen": 1262928800, + "step": 6994 + }, + { + "epoch": 0.7657571362106242, + "grad_norm": 1.0650870299262585, + "learning_rate": 6.46593954739996e-06, + "loss": 0.7205, + "num_input_tokens_seen": 1263103520, + "step": 6995 + }, + { + "epoch": 0.7658666082815622, + "grad_norm": 1.1783507902281212, + "learning_rate": 6.460170066347979e-06, + "loss": 1.137, + "num_input_tokens_seen": 1263319904, + "step": 6996 + }, + { + "epoch": 0.7659760803525001, + "grad_norm": 1.3050386789954986, + "learning_rate": 6.4544027785224195e-06, + "loss": 0.7584, + "num_input_tokens_seen": 1263523520, + "step": 6997 + }, + { + "epoch": 0.766085552423438, + "grad_norm": 1.1487695642077593, + "learning_rate": 6.448637684605569e-06, + "loss": 0.7762, + "num_input_tokens_seen": 1263737216, + "step": 6998 + }, + { + "epoch": 0.7661950244943758, + "grad_norm": 1.228062532500933, + "learning_rate": 6.442874785279415e-06, + "loss": 0.8547, + "num_input_tokens_seen": 1263916416, + "step": 6999 + }, + { + "epoch": 0.7663044965653137, + "grad_norm": 1.277207860222012, + "learning_rate": 6.437114081225698e-06, + "loss": 0.7911, + "num_input_tokens_seen": 1264121600, + "step": 7000 + }, + { + "epoch": 0.7664139686362517, + "grad_norm": 1.1454881957472869, + "learning_rate": 6.431355573125899e-06, + "loss": 0.5821, + "num_input_tokens_seen": 1264298560, + "step": 7001 + }, + { + "epoch": 0.7665234407071896, + "grad_norm": 1.1530508167419942, + "learning_rate": 6.4255992616612385e-06, + "loss": 0.8606, + "num_input_tokens_seen": 1264478432, + "step": 7002 + }, + { + "epoch": 0.7666329127781275, + "grad_norm": 1.2410805046165252, + "learning_rate": 6.419845147512679e-06, + "loss": 0.8312, + "num_input_tokens_seen": 1264644640, + "step": 7003 + }, + { + "epoch": 0.7667423848490654, + "grad_norm": 1.0884764924075878, + "learning_rate": 6.4140932313609096e-06, + "loss": 0.8881, + "num_input_tokens_seen": 1264820256, + "step": 7004 + }, + { + "epoch": 0.7668518569200032, + "grad_norm": 1.2405923642110626, + "learning_rate": 6.408343513886389e-06, + "loss": 0.7918, + "num_input_tokens_seen": 1265003264, + "step": 7005 + }, + { + "epoch": 0.7669613289909412, + "grad_norm": 1.1543872881753199, + "learning_rate": 6.402595995769289e-06, + "loss": 0.8011, + "num_input_tokens_seen": 1265193888, + "step": 7006 + }, + { + "epoch": 0.7670708010618791, + "grad_norm": 1.174167882207093, + "learning_rate": 6.396850677689531e-06, + "loss": 0.6789, + "num_input_tokens_seen": 1265360768, + "step": 7007 + }, + { + "epoch": 0.767180273132817, + "grad_norm": 1.1433097909407202, + "learning_rate": 6.391107560326776e-06, + "loss": 0.844, + "num_input_tokens_seen": 1265537952, + "step": 7008 + }, + { + "epoch": 0.7672897452037549, + "grad_norm": 1.3030304122004688, + "learning_rate": 6.385366644360419e-06, + "loss": 0.969, + "num_input_tokens_seen": 1265738880, + "step": 7009 + }, + { + "epoch": 0.7673992172746928, + "grad_norm": 1.1056187935628548, + "learning_rate": 6.379627930469598e-06, + "loss": 0.7987, + "num_input_tokens_seen": 1265912704, + "step": 7010 + }, + { + "epoch": 0.7675086893456307, + "grad_norm": 1.2308274972286999, + "learning_rate": 6.373891419333211e-06, + "loss": 0.7798, + "num_input_tokens_seen": 1266097952, + "step": 7011 + }, + { + "epoch": 0.7676181614165686, + "grad_norm": 1.1554606552820594, + "learning_rate": 6.368157111629846e-06, + "loss": 0.6739, + "num_input_tokens_seen": 1266272896, + "step": 7012 + }, + { + "epoch": 0.7677276334875065, + "grad_norm": 1.033305172126309, + "learning_rate": 6.362425008037895e-06, + "loss": 0.6608, + "num_input_tokens_seen": 1266457024, + "step": 7013 + }, + { + "epoch": 0.7678371055584444, + "grad_norm": 1.4252050620319705, + "learning_rate": 6.35669510923542e-06, + "loss": 0.9824, + "num_input_tokens_seen": 1266645632, + "step": 7014 + }, + { + "epoch": 0.7679465776293823, + "grad_norm": 1.3566540639152924, + "learning_rate": 6.35096741590028e-06, + "loss": 0.819, + "num_input_tokens_seen": 1266784960, + "step": 7015 + }, + { + "epoch": 0.7680560497003202, + "grad_norm": 1.1452512071557752, + "learning_rate": 6.345241928710044e-06, + "loss": 0.8471, + "num_input_tokens_seen": 1266954080, + "step": 7016 + }, + { + "epoch": 0.7681655217712581, + "grad_norm": 1.1043227223413208, + "learning_rate": 6.339518648342019e-06, + "loss": 0.6178, + "num_input_tokens_seen": 1267128576, + "step": 7017 + }, + { + "epoch": 0.768274993842196, + "grad_norm": 1.5018900249109053, + "learning_rate": 6.33379757547328e-06, + "loss": 1.0199, + "num_input_tokens_seen": 1267295456, + "step": 7018 + }, + { + "epoch": 0.7683844659131339, + "grad_norm": 1.3495639291004125, + "learning_rate": 6.328078710780588e-06, + "loss": 1.1263, + "num_input_tokens_seen": 1267465024, + "step": 7019 + }, + { + "epoch": 0.7684939379840718, + "grad_norm": 1.158361620294622, + "learning_rate": 6.322362054940506e-06, + "loss": 0.7976, + "num_input_tokens_seen": 1267633472, + "step": 7020 + }, + { + "epoch": 0.7686034100550098, + "grad_norm": 1.0885946026696471, + "learning_rate": 6.316647608629272e-06, + "loss": 0.7009, + "num_input_tokens_seen": 1267799232, + "step": 7021 + }, + { + "epoch": 0.7687128821259476, + "grad_norm": 1.1638114197581937, + "learning_rate": 6.310935372522925e-06, + "loss": 0.596, + "num_input_tokens_seen": 1267963872, + "step": 7022 + }, + { + "epoch": 0.7688223541968855, + "grad_norm": 1.1263198527733331, + "learning_rate": 6.305225347297181e-06, + "loss": 0.811, + "num_input_tokens_seen": 1268163456, + "step": 7023 + }, + { + "epoch": 0.7689318262678234, + "grad_norm": 1.2502468819002486, + "learning_rate": 6.299517533627547e-06, + "loss": 0.8374, + "num_input_tokens_seen": 1268354752, + "step": 7024 + }, + { + "epoch": 0.7690412983387613, + "grad_norm": 1.2542371937046854, + "learning_rate": 6.293811932189239e-06, + "loss": 1.1124, + "num_input_tokens_seen": 1268526112, + "step": 7025 + }, + { + "epoch": 0.7691507704096993, + "grad_norm": 1.1294474640880057, + "learning_rate": 6.28810854365722e-06, + "loss": 0.707, + "num_input_tokens_seen": 1268702400, + "step": 7026 + }, + { + "epoch": 0.7692602424806372, + "grad_norm": 1.2661245796473322, + "learning_rate": 6.282407368706189e-06, + "loss": 1.0515, + "num_input_tokens_seen": 1268887200, + "step": 7027 + }, + { + "epoch": 0.769369714551575, + "grad_norm": 1.1864946024338308, + "learning_rate": 6.276708408010576e-06, + "loss": 0.6614, + "num_input_tokens_seen": 1269044000, + "step": 7028 + }, + { + "epoch": 0.7694791866225129, + "grad_norm": 1.2832201719966183, + "learning_rate": 6.27101166224458e-06, + "loss": 0.9788, + "num_input_tokens_seen": 1269220512, + "step": 7029 + }, + { + "epoch": 0.7695886586934508, + "grad_norm": 1.2762445509118931, + "learning_rate": 6.265317132082088e-06, + "loss": 0.7799, + "num_input_tokens_seen": 1269404192, + "step": 7030 + }, + { + "epoch": 0.7696981307643888, + "grad_norm": 1.1166841133140193, + "learning_rate": 6.259624818196772e-06, + "loss": 0.8535, + "num_input_tokens_seen": 1269595040, + "step": 7031 + }, + { + "epoch": 0.7698076028353267, + "grad_norm": 0.9651365260658894, + "learning_rate": 6.253934721262014e-06, + "loss": 0.5934, + "num_input_tokens_seen": 1269758336, + "step": 7032 + }, + { + "epoch": 0.7699170749062645, + "grad_norm": 1.2679405693452916, + "learning_rate": 6.248246841950942e-06, + "loss": 0.9774, + "num_input_tokens_seen": 1269940000, + "step": 7033 + }, + { + "epoch": 0.7700265469772024, + "grad_norm": 1.2354311710083044, + "learning_rate": 6.242561180936421e-06, + "loss": 0.7631, + "num_input_tokens_seen": 1270098144, + "step": 7034 + }, + { + "epoch": 0.7701360190481403, + "grad_norm": 1.342938155735062, + "learning_rate": 6.236877738891053e-06, + "loss": 0.8519, + "num_input_tokens_seen": 1270312288, + "step": 7035 + }, + { + "epoch": 0.7702454911190783, + "grad_norm": 1.326385211218262, + "learning_rate": 6.231196516487181e-06, + "loss": 0.9846, + "num_input_tokens_seen": 1270495520, + "step": 7036 + }, + { + "epoch": 0.7703549631900162, + "grad_norm": 1.141064389397076, + "learning_rate": 6.225517514396873e-06, + "loss": 0.8724, + "num_input_tokens_seen": 1270685248, + "step": 7037 + }, + { + "epoch": 0.7704644352609541, + "grad_norm": 1.0125795921784202, + "learning_rate": 6.219840733291959e-06, + "loss": 0.5854, + "num_input_tokens_seen": 1270857952, + "step": 7038 + }, + { + "epoch": 0.7705739073318919, + "grad_norm": 1.0653924839366928, + "learning_rate": 6.2141661738439884e-06, + "loss": 0.8448, + "num_input_tokens_seen": 1271030208, + "step": 7039 + }, + { + "epoch": 0.7706833794028298, + "grad_norm": 1.1659289763953686, + "learning_rate": 6.208493836724244e-06, + "loss": 0.6917, + "num_input_tokens_seen": 1271230016, + "step": 7040 + }, + { + "epoch": 0.7707928514737677, + "grad_norm": 1.1376439160701486, + "learning_rate": 6.202823722603757e-06, + "loss": 0.6913, + "num_input_tokens_seen": 1271426240, + "step": 7041 + }, + { + "epoch": 0.7709023235447057, + "grad_norm": 1.1277462500491042, + "learning_rate": 6.197155832153287e-06, + "loss": 1.1706, + "num_input_tokens_seen": 1271620448, + "step": 7042 + }, + { + "epoch": 0.7710117956156436, + "grad_norm": 1.0874347930300958, + "learning_rate": 6.191490166043337e-06, + "loss": 1.0093, + "num_input_tokens_seen": 1271820928, + "step": 7043 + }, + { + "epoch": 0.7711212676865815, + "grad_norm": 1.1824174329550121, + "learning_rate": 6.185826724944146e-06, + "loss": 0.6871, + "num_input_tokens_seen": 1271972576, + "step": 7044 + }, + { + "epoch": 0.7712307397575193, + "grad_norm": 1.2114412305985345, + "learning_rate": 6.180165509525682e-06, + "loss": 0.7507, + "num_input_tokens_seen": 1272135872, + "step": 7045 + }, + { + "epoch": 0.7713402118284572, + "grad_norm": 1.0534884383187724, + "learning_rate": 6.174506520457665e-06, + "loss": 0.7419, + "num_input_tokens_seen": 1272308352, + "step": 7046 + }, + { + "epoch": 0.7714496838993952, + "grad_norm": 1.2466010612592215, + "learning_rate": 6.168849758409539e-06, + "loss": 0.7314, + "num_input_tokens_seen": 1272460224, + "step": 7047 + }, + { + "epoch": 0.7715591559703331, + "grad_norm": 1.198652748960276, + "learning_rate": 6.163195224050488e-06, + "loss": 0.997, + "num_input_tokens_seen": 1272653984, + "step": 7048 + }, + { + "epoch": 0.771668628041271, + "grad_norm": 1.1594454453445775, + "learning_rate": 6.157542918049433e-06, + "loss": 0.8242, + "num_input_tokens_seen": 1272813248, + "step": 7049 + }, + { + "epoch": 0.7717781001122088, + "grad_norm": 1.204167397486763, + "learning_rate": 6.151892841075027e-06, + "loss": 0.9673, + "num_input_tokens_seen": 1273024032, + "step": 7050 + }, + { + "epoch": 0.7718875721831467, + "grad_norm": 1.0747445224321568, + "learning_rate": 6.146244993795669e-06, + "loss": 0.7719, + "num_input_tokens_seen": 1273196736, + "step": 7051 + }, + { + "epoch": 0.7719970442540847, + "grad_norm": 1.1557246380729136, + "learning_rate": 6.14059937687948e-06, + "loss": 0.8508, + "num_input_tokens_seen": 1273373920, + "step": 7052 + }, + { + "epoch": 0.7721065163250226, + "grad_norm": 1.288896353446969, + "learning_rate": 6.1349559909943425e-06, + "loss": 1.0682, + "num_input_tokens_seen": 1273554240, + "step": 7053 + }, + { + "epoch": 0.7722159883959605, + "grad_norm": 1.4451893522018668, + "learning_rate": 6.129314836807834e-06, + "loss": 0.7695, + "num_input_tokens_seen": 1273749344, + "step": 7054 + }, + { + "epoch": 0.7723254604668984, + "grad_norm": 1.1389076682354202, + "learning_rate": 6.123675914987323e-06, + "loss": 0.7262, + "num_input_tokens_seen": 1273951840, + "step": 7055 + }, + { + "epoch": 0.7724349325378362, + "grad_norm": 1.2676376540396261, + "learning_rate": 6.1180392261998484e-06, + "loss": 0.9653, + "num_input_tokens_seen": 1274122080, + "step": 7056 + }, + { + "epoch": 0.7725444046087742, + "grad_norm": 1.2455100483161679, + "learning_rate": 6.112404771112246e-06, + "loss": 0.7795, + "num_input_tokens_seen": 1274315392, + "step": 7057 + }, + { + "epoch": 0.7726538766797121, + "grad_norm": 1.2979126421446068, + "learning_rate": 6.106772550391052e-06, + "loss": 1.023, + "num_input_tokens_seen": 1274510944, + "step": 7058 + }, + { + "epoch": 0.77276334875065, + "grad_norm": 1.2132181655980248, + "learning_rate": 6.101142564702539e-06, + "loss": 0.7885, + "num_input_tokens_seen": 1274704032, + "step": 7059 + }, + { + "epoch": 0.7728728208215879, + "grad_norm": 1.2115810038068877, + "learning_rate": 6.095514814712747e-06, + "loss": 0.7571, + "num_input_tokens_seen": 1274912128, + "step": 7060 + }, + { + "epoch": 0.7729822928925258, + "grad_norm": 1.284983892970683, + "learning_rate": 6.089889301087398e-06, + "loss": 0.8818, + "num_input_tokens_seen": 1275083040, + "step": 7061 + }, + { + "epoch": 0.7730917649634637, + "grad_norm": 1.1864593865331587, + "learning_rate": 6.084266024492011e-06, + "loss": 0.9021, + "num_input_tokens_seen": 1275254848, + "step": 7062 + }, + { + "epoch": 0.7732012370344016, + "grad_norm": 1.2066307514511618, + "learning_rate": 6.078644985591778e-06, + "loss": 0.6819, + "num_input_tokens_seen": 1275415232, + "step": 7063 + }, + { + "epoch": 0.7733107091053395, + "grad_norm": 1.073057899932271, + "learning_rate": 6.0730261850516865e-06, + "loss": 0.6285, + "num_input_tokens_seen": 1275602720, + "step": 7064 + }, + { + "epoch": 0.7734201811762774, + "grad_norm": 1.1312032911236596, + "learning_rate": 6.067409623536399e-06, + "loss": 0.8252, + "num_input_tokens_seen": 1275782816, + "step": 7065 + }, + { + "epoch": 0.7735296532472153, + "grad_norm": 1.2534810709031907, + "learning_rate": 6.061795301710368e-06, + "loss": 0.7356, + "num_input_tokens_seen": 1275976800, + "step": 7066 + }, + { + "epoch": 0.7736391253181532, + "grad_norm": 1.2162426318487471, + "learning_rate": 6.056183220237749e-06, + "loss": 1.1747, + "num_input_tokens_seen": 1276187360, + "step": 7067 + }, + { + "epoch": 0.7737485973890911, + "grad_norm": 1.26806245787795, + "learning_rate": 6.05057337978244e-06, + "loss": 0.9811, + "num_input_tokens_seen": 1276390752, + "step": 7068 + }, + { + "epoch": 0.773858069460029, + "grad_norm": 1.2655334439540162, + "learning_rate": 6.044965781008077e-06, + "loss": 0.7985, + "num_input_tokens_seen": 1276588544, + "step": 7069 + }, + { + "epoch": 0.7739675415309669, + "grad_norm": 1.1765349244309586, + "learning_rate": 6.039360424578017e-06, + "loss": 0.7445, + "num_input_tokens_seen": 1276759008, + "step": 7070 + }, + { + "epoch": 0.7740770136019048, + "grad_norm": 1.3554684861592412, + "learning_rate": 6.033757311155386e-06, + "loss": 1.134, + "num_input_tokens_seen": 1276952992, + "step": 7071 + }, + { + "epoch": 0.7741864856728428, + "grad_norm": 1.2299333164313164, + "learning_rate": 6.028156441402996e-06, + "loss": 0.7267, + "num_input_tokens_seen": 1277110016, + "step": 7072 + }, + { + "epoch": 0.7742959577437806, + "grad_norm": 1.2207396967177888, + "learning_rate": 6.022557815983437e-06, + "loss": 0.7452, + "num_input_tokens_seen": 1277310048, + "step": 7073 + }, + { + "epoch": 0.7744054298147185, + "grad_norm": 1.1748898190723414, + "learning_rate": 6.01696143555901e-06, + "loss": 0.7344, + "num_input_tokens_seen": 1277491936, + "step": 7074 + }, + { + "epoch": 0.7745149018856564, + "grad_norm": 1.275192549134747, + "learning_rate": 6.011367300791754e-06, + "loss": 0.8633, + "num_input_tokens_seen": 1277665088, + "step": 7075 + }, + { + "epoch": 0.7746243739565943, + "grad_norm": 1.3043473765469205, + "learning_rate": 6.005775412343448e-06, + "loss": 1.0539, + "num_input_tokens_seen": 1277852800, + "step": 7076 + }, + { + "epoch": 0.7747338460275323, + "grad_norm": 1.1537785277967492, + "learning_rate": 6.0001857708755996e-06, + "loss": 0.8265, + "num_input_tokens_seen": 1278046784, + "step": 7077 + }, + { + "epoch": 0.7748433180984702, + "grad_norm": 1.0949119390498885, + "learning_rate": 5.994598377049446e-06, + "loss": 0.5865, + "num_input_tokens_seen": 1278240768, + "step": 7078 + }, + { + "epoch": 0.774952790169408, + "grad_norm": 1.2708817092359714, + "learning_rate": 5.989013231525978e-06, + "loss": 0.7526, + "num_input_tokens_seen": 1278422880, + "step": 7079 + }, + { + "epoch": 0.7750622622403459, + "grad_norm": 1.0849631899201697, + "learning_rate": 5.983430334965903e-06, + "loss": 0.7515, + "num_input_tokens_seen": 1278627392, + "step": 7080 + }, + { + "epoch": 0.7751717343112838, + "grad_norm": 1.2051042402457965, + "learning_rate": 5.977849688029666e-06, + "loss": 0.804, + "num_input_tokens_seen": 1278802336, + "step": 7081 + }, + { + "epoch": 0.7752812063822218, + "grad_norm": 1.1516238275188877, + "learning_rate": 5.972271291377446e-06, + "loss": 0.9395, + "num_input_tokens_seen": 1278980192, + "step": 7082 + }, + { + "epoch": 0.7753906784531597, + "grad_norm": 1.3492462950580162, + "learning_rate": 5.96669514566916e-06, + "loss": 1.0355, + "num_input_tokens_seen": 1279157376, + "step": 7083 + }, + { + "epoch": 0.7755001505240975, + "grad_norm": 1.1861372707816449, + "learning_rate": 5.96112125156445e-06, + "loss": 0.9748, + "num_input_tokens_seen": 1279368832, + "step": 7084 + }, + { + "epoch": 0.7756096225950354, + "grad_norm": 1.2026595168870364, + "learning_rate": 5.9555496097226934e-06, + "loss": 0.8729, + "num_input_tokens_seen": 1279528768, + "step": 7085 + }, + { + "epoch": 0.7757190946659733, + "grad_norm": 1.108677765859701, + "learning_rate": 5.949980220803025e-06, + "loss": 0.871, + "num_input_tokens_seen": 1279718944, + "step": 7086 + }, + { + "epoch": 0.7758285667369113, + "grad_norm": 1.2159838953237945, + "learning_rate": 5.944413085464265e-06, + "loss": 1.0069, + "num_input_tokens_seen": 1279898368, + "step": 7087 + }, + { + "epoch": 0.7759380388078492, + "grad_norm": 1.2337772515561791, + "learning_rate": 5.938848204365016e-06, + "loss": 0.894, + "num_input_tokens_seen": 1280114528, + "step": 7088 + }, + { + "epoch": 0.7760475108787871, + "grad_norm": 1.0914794241992458, + "learning_rate": 5.933285578163586e-06, + "loss": 0.7342, + "num_input_tokens_seen": 1280276480, + "step": 7089 + }, + { + "epoch": 0.7761569829497249, + "grad_norm": 1.071171715011401, + "learning_rate": 5.927725207518023e-06, + "loss": 0.7912, + "num_input_tokens_seen": 1280450528, + "step": 7090 + }, + { + "epoch": 0.7762664550206628, + "grad_norm": 1.1421807013647216, + "learning_rate": 5.922167093086107e-06, + "loss": 0.7341, + "num_input_tokens_seen": 1280652576, + "step": 7091 + }, + { + "epoch": 0.7763759270916007, + "grad_norm": 1.1265345809376381, + "learning_rate": 5.916611235525346e-06, + "loss": 0.6909, + "num_input_tokens_seen": 1280861120, + "step": 7092 + }, + { + "epoch": 0.7764853991625387, + "grad_norm": 1.2048701720518455, + "learning_rate": 5.9110576354930085e-06, + "loss": 0.7855, + "num_input_tokens_seen": 1281064512, + "step": 7093 + }, + { + "epoch": 0.7765948712334766, + "grad_norm": 1.229646971445492, + "learning_rate": 5.9055062936460484e-06, + "loss": 0.8154, + "num_input_tokens_seen": 1281263872, + "step": 7094 + }, + { + "epoch": 0.7767043433044145, + "grad_norm": 1.1850187580217468, + "learning_rate": 5.899957210641205e-06, + "loss": 0.9646, + "num_input_tokens_seen": 1281441280, + "step": 7095 + }, + { + "epoch": 0.7768138153753523, + "grad_norm": 1.3770973941625693, + "learning_rate": 5.894410387134896e-06, + "loss": 0.757, + "num_input_tokens_seen": 1281597184, + "step": 7096 + }, + { + "epoch": 0.7769232874462902, + "grad_norm": 1.1558716644737324, + "learning_rate": 5.888865823783329e-06, + "loss": 0.7303, + "num_input_tokens_seen": 1281771456, + "step": 7097 + }, + { + "epoch": 0.7770327595172282, + "grad_norm": 1.1975097905119136, + "learning_rate": 5.883323521242387e-06, + "loss": 0.9309, + "num_input_tokens_seen": 1281940800, + "step": 7098 + }, + { + "epoch": 0.7771422315881661, + "grad_norm": 1.273916049275303, + "learning_rate": 5.877783480167734e-06, + "loss": 0.8703, + "num_input_tokens_seen": 1282097152, + "step": 7099 + }, + { + "epoch": 0.777251703659104, + "grad_norm": 1.2430515283321637, + "learning_rate": 5.872245701214741e-06, + "loss": 0.869, + "num_input_tokens_seen": 1282274560, + "step": 7100 + }, + { + "epoch": 0.7773611757300418, + "grad_norm": 1.0899735787015463, + "learning_rate": 5.8667101850385045e-06, + "loss": 0.7411, + "num_input_tokens_seen": 1282458912, + "step": 7101 + }, + { + "epoch": 0.7774706478009797, + "grad_norm": 1.2463887980467014, + "learning_rate": 5.861176932293894e-06, + "loss": 0.7192, + "num_input_tokens_seen": 1282643040, + "step": 7102 + }, + { + "epoch": 0.7775801198719177, + "grad_norm": 1.1485511567842184, + "learning_rate": 5.855645943635449e-06, + "loss": 0.9403, + "num_input_tokens_seen": 1282827392, + "step": 7103 + }, + { + "epoch": 0.7776895919428556, + "grad_norm": 1.1990404027474029, + "learning_rate": 5.850117219717507e-06, + "loss": 0.7367, + "num_input_tokens_seen": 1282984640, + "step": 7104 + }, + { + "epoch": 0.7777990640137935, + "grad_norm": 1.3780098397716125, + "learning_rate": 5.8445907611940745e-06, + "loss": 0.9722, + "num_input_tokens_seen": 1283152864, + "step": 7105 + }, + { + "epoch": 0.7779085360847314, + "grad_norm": 1.1600772304147398, + "learning_rate": 5.839066568718946e-06, + "loss": 0.7742, + "num_input_tokens_seen": 1283334080, + "step": 7106 + }, + { + "epoch": 0.7780180081556692, + "grad_norm": 1.2877019464572736, + "learning_rate": 5.83354464294561e-06, + "loss": 0.8517, + "num_input_tokens_seen": 1283527616, + "step": 7107 + }, + { + "epoch": 0.7781274802266072, + "grad_norm": 1.2991194090079188, + "learning_rate": 5.8280249845273025e-06, + "loss": 1.0476, + "num_input_tokens_seen": 1283724288, + "step": 7108 + }, + { + "epoch": 0.7782369522975451, + "grad_norm": 1.1442738526758367, + "learning_rate": 5.822507594116988e-06, + "loss": 0.7596, + "num_input_tokens_seen": 1283916704, + "step": 7109 + }, + { + "epoch": 0.778346424368483, + "grad_norm": 1.1345756227247858, + "learning_rate": 5.816992472367366e-06, + "loss": 0.659, + "num_input_tokens_seen": 1284100832, + "step": 7110 + }, + { + "epoch": 0.7784558964394209, + "grad_norm": 1.2776396988931125, + "learning_rate": 5.811479619930862e-06, + "loss": 0.8332, + "num_input_tokens_seen": 1284288320, + "step": 7111 + }, + { + "epoch": 0.7785653685103588, + "grad_norm": 1.207150746575288, + "learning_rate": 5.80596903745963e-06, + "loss": 1.1182, + "num_input_tokens_seen": 1284474464, + "step": 7112 + }, + { + "epoch": 0.7786748405812967, + "grad_norm": 1.2678198914456997, + "learning_rate": 5.800460725605575e-06, + "loss": 0.8664, + "num_input_tokens_seen": 1284657920, + "step": 7113 + }, + { + "epoch": 0.7787843126522346, + "grad_norm": 1.1681665871750273, + "learning_rate": 5.794954685020312e-06, + "loss": 0.8599, + "num_input_tokens_seen": 1284817184, + "step": 7114 + }, + { + "epoch": 0.7788937847231725, + "grad_norm": 1.2244374034297987, + "learning_rate": 5.7894509163551995e-06, + "loss": 0.8686, + "num_input_tokens_seen": 1284976896, + "step": 7115 + }, + { + "epoch": 0.7790032567941104, + "grad_norm": 1.017904101270918, + "learning_rate": 5.783949420261315e-06, + "loss": 0.9102, + "num_input_tokens_seen": 1285169088, + "step": 7116 + }, + { + "epoch": 0.7791127288650483, + "grad_norm": 1.2582938653343605, + "learning_rate": 5.778450197389481e-06, + "loss": 0.9065, + "num_input_tokens_seen": 1285357248, + "step": 7117 + }, + { + "epoch": 0.7792222009359862, + "grad_norm": 1.0363221134912985, + "learning_rate": 5.772953248390242e-06, + "loss": 0.6077, + "num_input_tokens_seen": 1285534432, + "step": 7118 + }, + { + "epoch": 0.7793316730069241, + "grad_norm": 1.1797639430459554, + "learning_rate": 5.767458573913881e-06, + "loss": 0.8168, + "num_input_tokens_seen": 1285741856, + "step": 7119 + }, + { + "epoch": 0.779441145077862, + "grad_norm": 1.280352115032192, + "learning_rate": 5.761966174610395e-06, + "loss": 0.8357, + "num_input_tokens_seen": 1285939872, + "step": 7120 + }, + { + "epoch": 0.7795506171487999, + "grad_norm": 1.1848633758235507, + "learning_rate": 5.756476051129542e-06, + "loss": 0.6239, + "num_input_tokens_seen": 1286115936, + "step": 7121 + }, + { + "epoch": 0.7796600892197378, + "grad_norm": 1.3258827579732022, + "learning_rate": 5.750988204120783e-06, + "loss": 1.2033, + "num_input_tokens_seen": 1286327840, + "step": 7122 + }, + { + "epoch": 0.7797695612906758, + "grad_norm": 1.0981083274969514, + "learning_rate": 5.7455026342333215e-06, + "loss": 0.796, + "num_input_tokens_seen": 1286504576, + "step": 7123 + }, + { + "epoch": 0.7798790333616136, + "grad_norm": 1.193770217724301, + "learning_rate": 5.74001934211609e-06, + "loss": 0.69, + "num_input_tokens_seen": 1286691616, + "step": 7124 + }, + { + "epoch": 0.7799885054325515, + "grad_norm": 1.1112293809623432, + "learning_rate": 5.734538328417754e-06, + "loss": 0.6984, + "num_input_tokens_seen": 1286876192, + "step": 7125 + }, + { + "epoch": 0.7800979775034894, + "grad_norm": 1.261442455356672, + "learning_rate": 5.729059593786701e-06, + "loss": 0.824, + "num_input_tokens_seen": 1287077792, + "step": 7126 + }, + { + "epoch": 0.7802074495744273, + "grad_norm": 1.172147998800198, + "learning_rate": 5.7235831388710524e-06, + "loss": 1.001, + "num_input_tokens_seen": 1287267520, + "step": 7127 + }, + { + "epoch": 0.7803169216453653, + "grad_norm": 1.0617691218965122, + "learning_rate": 5.718108964318683e-06, + "loss": 0.6067, + "num_input_tokens_seen": 1287458592, + "step": 7128 + }, + { + "epoch": 0.7804263937163032, + "grad_norm": 1.1524235584618354, + "learning_rate": 5.7126370707771495e-06, + "loss": 0.888, + "num_input_tokens_seen": 1287629056, + "step": 7129 + }, + { + "epoch": 0.780535865787241, + "grad_norm": 1.1265634163735059, + "learning_rate": 5.707167458893786e-06, + "loss": 0.8089, + "num_input_tokens_seen": 1287772640, + "step": 7130 + }, + { + "epoch": 0.7806453378581789, + "grad_norm": 1.1409709875980218, + "learning_rate": 5.701700129315629e-06, + "loss": 0.7197, + "num_input_tokens_seen": 1287960576, + "step": 7131 + }, + { + "epoch": 0.7807548099291168, + "grad_norm": 1.1981154843521133, + "learning_rate": 5.696235082689455e-06, + "loss": 0.7743, + "num_input_tokens_seen": 1288129472, + "step": 7132 + }, + { + "epoch": 0.7808642820000548, + "grad_norm": 1.4542238736126432, + "learning_rate": 5.690772319661769e-06, + "loss": 1.0666, + "num_input_tokens_seen": 1288305312, + "step": 7133 + }, + { + "epoch": 0.7809737540709927, + "grad_norm": 1.13373496545078, + "learning_rate": 5.685311840878796e-06, + "loss": 0.8718, + "num_input_tokens_seen": 1288510720, + "step": 7134 + }, + { + "epoch": 0.7810832261419305, + "grad_norm": 1.1351571384329315, + "learning_rate": 5.679853646986524e-06, + "loss": 0.7753, + "num_input_tokens_seen": 1288684096, + "step": 7135 + }, + { + "epoch": 0.7811926982128684, + "grad_norm": 1.2084045992037982, + "learning_rate": 5.674397738630619e-06, + "loss": 0.9181, + "num_input_tokens_seen": 1288864864, + "step": 7136 + }, + { + "epoch": 0.7813021702838063, + "grad_norm": 1.1613635322416023, + "learning_rate": 5.668944116456529e-06, + "loss": 1.0108, + "num_input_tokens_seen": 1289059968, + "step": 7137 + }, + { + "epoch": 0.7814116423547443, + "grad_norm": 1.271232474552308, + "learning_rate": 5.663492781109381e-06, + "loss": 1.027, + "num_input_tokens_seen": 1289279264, + "step": 7138 + }, + { + "epoch": 0.7815211144256822, + "grad_norm": 1.3479004127351815, + "learning_rate": 5.658043733234081e-06, + "loss": 0.7495, + "num_input_tokens_seen": 1289470336, + "step": 7139 + }, + { + "epoch": 0.7816305864966201, + "grad_norm": 1.3306826479470695, + "learning_rate": 5.65259697347523e-06, + "loss": 1.0545, + "num_input_tokens_seen": 1289658048, + "step": 7140 + }, + { + "epoch": 0.7817400585675579, + "grad_norm": 1.1381057719199006, + "learning_rate": 5.647152502477171e-06, + "loss": 0.7799, + "num_input_tokens_seen": 1289839488, + "step": 7141 + }, + { + "epoch": 0.7818495306384958, + "grad_norm": 1.1959106843412555, + "learning_rate": 5.641710320883975e-06, + "loss": 0.8366, + "num_input_tokens_seen": 1290011744, + "step": 7142 + }, + { + "epoch": 0.7819590027094337, + "grad_norm": 1.1607599710549301, + "learning_rate": 5.636270429339436e-06, + "loss": 0.8165, + "num_input_tokens_seen": 1290148832, + "step": 7143 + }, + { + "epoch": 0.7820684747803717, + "grad_norm": 1.068569908807634, + "learning_rate": 5.630832828487101e-06, + "loss": 0.7619, + "num_input_tokens_seen": 1290331168, + "step": 7144 + }, + { + "epoch": 0.7821779468513096, + "grad_norm": 1.1192189666862868, + "learning_rate": 5.625397518970199e-06, + "loss": 0.9793, + "num_input_tokens_seen": 1290560544, + "step": 7145 + }, + { + "epoch": 0.7822874189222475, + "grad_norm": 1.2029838974684406, + "learning_rate": 5.619964501431743e-06, + "loss": 0.8817, + "num_input_tokens_seen": 1290773344, + "step": 7146 + }, + { + "epoch": 0.7823968909931853, + "grad_norm": 1.27232433882837, + "learning_rate": 5.614533776514436e-06, + "loss": 0.7341, + "num_input_tokens_seen": 1290940224, + "step": 7147 + }, + { + "epoch": 0.7825063630641232, + "grad_norm": 1.1058961291523364, + "learning_rate": 5.609105344860724e-06, + "loss": 1.0079, + "num_input_tokens_seen": 1291160416, + "step": 7148 + }, + { + "epoch": 0.7826158351350612, + "grad_norm": 1.0933251236882613, + "learning_rate": 5.603679207112781e-06, + "loss": 0.8004, + "num_input_tokens_seen": 1291334016, + "step": 7149 + }, + { + "epoch": 0.7827253072059991, + "grad_norm": 1.2036035750115732, + "learning_rate": 5.598255363912508e-06, + "loss": 0.8687, + "num_input_tokens_seen": 1291542560, + "step": 7150 + }, + { + "epoch": 0.782834779276937, + "grad_norm": 1.1414240298495422, + "learning_rate": 5.592833815901538e-06, + "loss": 0.8457, + "num_input_tokens_seen": 1291735648, + "step": 7151 + }, + { + "epoch": 0.7829442513478748, + "grad_norm": 1.1772216389290424, + "learning_rate": 5.5874145637212245e-06, + "loss": 0.8753, + "num_input_tokens_seen": 1291952480, + "step": 7152 + }, + { + "epoch": 0.7830537234188127, + "grad_norm": 1.1352415678933006, + "learning_rate": 5.581997608012651e-06, + "loss": 0.7886, + "num_input_tokens_seen": 1292151616, + "step": 7153 + }, + { + "epoch": 0.7831631954897507, + "grad_norm": 1.1389741839798042, + "learning_rate": 5.576582949416648e-06, + "loss": 0.9653, + "num_input_tokens_seen": 1292345824, + "step": 7154 + }, + { + "epoch": 0.7832726675606886, + "grad_norm": 1.2715438149148817, + "learning_rate": 5.571170588573751e-06, + "loss": 0.8671, + "num_input_tokens_seen": 1292519872, + "step": 7155 + }, + { + "epoch": 0.7833821396316265, + "grad_norm": 1.086383810197007, + "learning_rate": 5.56576052612423e-06, + "loss": 0.9059, + "num_input_tokens_seen": 1292719680, + "step": 7156 + }, + { + "epoch": 0.7834916117025644, + "grad_norm": 1.251749721434882, + "learning_rate": 5.560352762708088e-06, + "loss": 0.9734, + "num_input_tokens_seen": 1292913440, + "step": 7157 + }, + { + "epoch": 0.7836010837735022, + "grad_norm": 1.3495325268121818, + "learning_rate": 5.554947298965052e-06, + "loss": 0.8488, + "num_input_tokens_seen": 1293044480, + "step": 7158 + }, + { + "epoch": 0.7837105558444402, + "grad_norm": 1.1819476272643752, + "learning_rate": 5.5495441355345766e-06, + "loss": 0.8337, + "num_input_tokens_seen": 1293223232, + "step": 7159 + }, + { + "epoch": 0.7838200279153781, + "grad_norm": 1.3272023754934443, + "learning_rate": 5.54414327305584e-06, + "loss": 0.7503, + "num_input_tokens_seen": 1293377568, + "step": 7160 + }, + { + "epoch": 0.783929499986316, + "grad_norm": 1.095754040845748, + "learning_rate": 5.538744712167776e-06, + "loss": 0.605, + "num_input_tokens_seen": 1293554528, + "step": 7161 + }, + { + "epoch": 0.7840389720572539, + "grad_norm": 1.2472474309237613, + "learning_rate": 5.533348453508996e-06, + "loss": 0.8616, + "num_input_tokens_seen": 1293730816, + "step": 7162 + }, + { + "epoch": 0.7841484441281918, + "grad_norm": 1.1643618000488354, + "learning_rate": 5.527954497717886e-06, + "loss": 0.9879, + "num_input_tokens_seen": 1293921216, + "step": 7163 + }, + { + "epoch": 0.7842579161991297, + "grad_norm": 1.221311615919138, + "learning_rate": 5.522562845432533e-06, + "loss": 0.9222, + "num_input_tokens_seen": 1294078016, + "step": 7164 + }, + { + "epoch": 0.7843673882700676, + "grad_norm": 1.2682917721104974, + "learning_rate": 5.517173497290762e-06, + "loss": 0.9044, + "num_input_tokens_seen": 1294277376, + "step": 7165 + }, + { + "epoch": 0.7844768603410055, + "grad_norm": 1.1879729784612174, + "learning_rate": 5.511786453930124e-06, + "loss": 0.8362, + "num_input_tokens_seen": 1294487712, + "step": 7166 + }, + { + "epoch": 0.7845863324119434, + "grad_norm": 1.214318999267036, + "learning_rate": 5.5064017159878826e-06, + "loss": 0.9422, + "num_input_tokens_seen": 1294690432, + "step": 7167 + }, + { + "epoch": 0.7846958044828813, + "grad_norm": 1.2723898761665509, + "learning_rate": 5.501019284101067e-06, + "loss": 0.9463, + "num_input_tokens_seen": 1294843648, + "step": 7168 + }, + { + "epoch": 0.7848052765538192, + "grad_norm": 1.277674267672023, + "learning_rate": 5.495639158906382e-06, + "loss": 0.8343, + "num_input_tokens_seen": 1294992832, + "step": 7169 + }, + { + "epoch": 0.7849147486247571, + "grad_norm": 1.3052061043872762, + "learning_rate": 5.490261341040312e-06, + "loss": 1.1909, + "num_input_tokens_seen": 1295171584, + "step": 7170 + }, + { + "epoch": 0.785024220695695, + "grad_norm": 1.2557331217520646, + "learning_rate": 5.4848858311390165e-06, + "loss": 0.8948, + "num_input_tokens_seen": 1295358400, + "step": 7171 + }, + { + "epoch": 0.7851336927666329, + "grad_norm": 1.3391514538270985, + "learning_rate": 5.479512629838426e-06, + "loss": 0.9567, + "num_input_tokens_seen": 1295507360, + "step": 7172 + }, + { + "epoch": 0.7852431648375708, + "grad_norm": 1.150036690997404, + "learning_rate": 5.4741417377741745e-06, + "loss": 0.9231, + "num_input_tokens_seen": 1295699104, + "step": 7173 + }, + { + "epoch": 0.7853526369085088, + "grad_norm": 1.2708347143363745, + "learning_rate": 5.468773155581627e-06, + "loss": 0.6667, + "num_input_tokens_seen": 1295878752, + "step": 7174 + }, + { + "epoch": 0.7854621089794466, + "grad_norm": 1.0782321931928387, + "learning_rate": 5.46340688389588e-06, + "loss": 0.8825, + "num_input_tokens_seen": 1296097376, + "step": 7175 + }, + { + "epoch": 0.7855715810503845, + "grad_norm": 1.2602187177259978, + "learning_rate": 5.458042923351744e-06, + "loss": 0.8169, + "num_input_tokens_seen": 1296296736, + "step": 7176 + }, + { + "epoch": 0.7856810531213224, + "grad_norm": 1.0374733381407473, + "learning_rate": 5.452681274583784e-06, + "loss": 0.7495, + "num_input_tokens_seen": 1296479520, + "step": 7177 + }, + { + "epoch": 0.7857905251922603, + "grad_norm": 1.0416221927049703, + "learning_rate": 5.447321938226249e-06, + "loss": 0.5854, + "num_input_tokens_seen": 1296658496, + "step": 7178 + }, + { + "epoch": 0.7858999972631983, + "grad_norm": 1.2913108868936665, + "learning_rate": 5.441964914913164e-06, + "loss": 0.8077, + "num_input_tokens_seen": 1296829408, + "step": 7179 + }, + { + "epoch": 0.7860094693341362, + "grad_norm": 1.0083231851892032, + "learning_rate": 5.436610205278228e-06, + "loss": 0.6957, + "num_input_tokens_seen": 1297037504, + "step": 7180 + }, + { + "epoch": 0.786118941405074, + "grad_norm": 1.2049787038290423, + "learning_rate": 5.4312578099549125e-06, + "loss": 0.7868, + "num_input_tokens_seen": 1297217824, + "step": 7181 + }, + { + "epoch": 0.7862284134760119, + "grad_norm": 1.154985612886997, + "learning_rate": 5.425907729576388e-06, + "loss": 0.7377, + "num_input_tokens_seen": 1297409792, + "step": 7182 + }, + { + "epoch": 0.7863378855469498, + "grad_norm": 1.2170565161296636, + "learning_rate": 5.42055996477556e-06, + "loss": 1.0815, + "num_input_tokens_seen": 1297616096, + "step": 7183 + }, + { + "epoch": 0.7864473576178878, + "grad_norm": 1.115908123602869, + "learning_rate": 5.415214516185061e-06, + "loss": 0.7193, + "num_input_tokens_seen": 1297822848, + "step": 7184 + }, + { + "epoch": 0.7865568296888257, + "grad_norm": 1.251070517853934, + "learning_rate": 5.409871384437234e-06, + "loss": 0.7032, + "num_input_tokens_seen": 1298005184, + "step": 7185 + }, + { + "epoch": 0.7866663017597635, + "grad_norm": 1.1934889802619775, + "learning_rate": 5.404530570164187e-06, + "loss": 0.6884, + "num_input_tokens_seen": 1298186624, + "step": 7186 + }, + { + "epoch": 0.7867757738307014, + "grad_norm": 1.239396805179474, + "learning_rate": 5.399192073997703e-06, + "loss": 0.7827, + "num_input_tokens_seen": 1298337376, + "step": 7187 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 1.1817922951957631, + "learning_rate": 5.39385589656933e-06, + "loss": 0.6941, + "num_input_tokens_seen": 1298523520, + "step": 7188 + }, + { + "epoch": 0.7869947179725773, + "grad_norm": 1.1723398939514398, + "learning_rate": 5.3885220385103245e-06, + "loss": 0.7276, + "num_input_tokens_seen": 1298726464, + "step": 7189 + }, + { + "epoch": 0.7871041900435152, + "grad_norm": 1.1332503952271407, + "learning_rate": 5.383190500451671e-06, + "loss": 0.9199, + "num_input_tokens_seen": 1298916192, + "step": 7190 + }, + { + "epoch": 0.7872136621144531, + "grad_norm": 1.2270858522830792, + "learning_rate": 5.3778612830240795e-06, + "loss": 0.6858, + "num_input_tokens_seen": 1299055296, + "step": 7191 + }, + { + "epoch": 0.7873231341853909, + "grad_norm": 1.2024162979497302, + "learning_rate": 5.372534386857988e-06, + "loss": 0.8372, + "num_input_tokens_seen": 1299217920, + "step": 7192 + }, + { + "epoch": 0.7874326062563288, + "grad_norm": 1.1896891242362644, + "learning_rate": 5.367209812583557e-06, + "loss": 0.7408, + "num_input_tokens_seen": 1299400032, + "step": 7193 + }, + { + "epoch": 0.7875420783272667, + "grad_norm": 1.0831185396929606, + "learning_rate": 5.361887560830675e-06, + "loss": 0.9779, + "num_input_tokens_seen": 1299587296, + "step": 7194 + }, + { + "epoch": 0.7876515503982047, + "grad_norm": 1.1307599644952495, + "learning_rate": 5.356567632228943e-06, + "loss": 0.782, + "num_input_tokens_seen": 1299755520, + "step": 7195 + }, + { + "epoch": 0.7877610224691426, + "grad_norm": 1.3175658147685234, + "learning_rate": 5.351250027407717e-06, + "loss": 1.1653, + "num_input_tokens_seen": 1299918144, + "step": 7196 + }, + { + "epoch": 0.7878704945400805, + "grad_norm": 1.2238662143968955, + "learning_rate": 5.345934746996051e-06, + "loss": 0.8483, + "num_input_tokens_seen": 1300093088, + "step": 7197 + }, + { + "epoch": 0.7879799666110183, + "grad_norm": 1.1520348603649782, + "learning_rate": 5.340621791622733e-06, + "loss": 0.7346, + "num_input_tokens_seen": 1300265792, + "step": 7198 + }, + { + "epoch": 0.7880894386819562, + "grad_norm": 1.1835515987331133, + "learning_rate": 5.335311161916273e-06, + "loss": 0.6802, + "num_input_tokens_seen": 1300451712, + "step": 7199 + }, + { + "epoch": 0.7881989107528942, + "grad_norm": 1.0792470428306107, + "learning_rate": 5.330002858504904e-06, + "loss": 0.6344, + "num_input_tokens_seen": 1300640768, + "step": 7200 + }, + { + "epoch": 0.7883083828238321, + "grad_norm": 1.2088956784943186, + "learning_rate": 5.324696882016606e-06, + "loss": 1.0618, + "num_input_tokens_seen": 1300811680, + "step": 7201 + }, + { + "epoch": 0.78841785489477, + "grad_norm": 1.177455230072366, + "learning_rate": 5.319393233079042e-06, + "loss": 0.768, + "num_input_tokens_seen": 1300986848, + "step": 7202 + }, + { + "epoch": 0.7885273269657078, + "grad_norm": 1.3539761286959797, + "learning_rate": 5.314091912319649e-06, + "loss": 0.8251, + "num_input_tokens_seen": 1301154400, + "step": 7203 + }, + { + "epoch": 0.7886367990366457, + "grad_norm": 1.2505148304269962, + "learning_rate": 5.3087929203655375e-06, + "loss": 0.9199, + "num_input_tokens_seen": 1301341216, + "step": 7204 + }, + { + "epoch": 0.7887462711075837, + "grad_norm": 1.0358972164739366, + "learning_rate": 5.303496257843585e-06, + "loss": 0.5308, + "num_input_tokens_seen": 1301513472, + "step": 7205 + }, + { + "epoch": 0.7888557431785216, + "grad_norm": 1.122998523500581, + "learning_rate": 5.2982019253803725e-06, + "loss": 0.812, + "num_input_tokens_seen": 1301702752, + "step": 7206 + }, + { + "epoch": 0.7889652152494595, + "grad_norm": 1.090720750163412, + "learning_rate": 5.29290992360221e-06, + "loss": 0.8791, + "num_input_tokens_seen": 1301909504, + "step": 7207 + }, + { + "epoch": 0.7890746873203974, + "grad_norm": 1.1250555766471, + "learning_rate": 5.2876202531351285e-06, + "loss": 0.792, + "num_input_tokens_seen": 1302091840, + "step": 7208 + }, + { + "epoch": 0.7891841593913352, + "grad_norm": 1.1904864152968773, + "learning_rate": 5.2823329146048815e-06, + "loss": 0.8927, + "num_input_tokens_seen": 1302268128, + "step": 7209 + }, + { + "epoch": 0.7892936314622732, + "grad_norm": 1.2805117829721702, + "learning_rate": 5.27704790863697e-06, + "loss": 0.8556, + "num_input_tokens_seen": 1302402528, + "step": 7210 + }, + { + "epoch": 0.7894031035332111, + "grad_norm": 1.2177359935446024, + "learning_rate": 5.271765235856574e-06, + "loss": 0.9642, + "num_input_tokens_seen": 1302564480, + "step": 7211 + }, + { + "epoch": 0.789512575604149, + "grad_norm": 1.137471739449363, + "learning_rate": 5.266484896888649e-06, + "loss": 0.7, + "num_input_tokens_seen": 1302734720, + "step": 7212 + }, + { + "epoch": 0.7896220476750869, + "grad_norm": 0.9872212511495655, + "learning_rate": 5.261206892357825e-06, + "loss": 0.6713, + "num_input_tokens_seen": 1302937664, + "step": 7213 + }, + { + "epoch": 0.7897315197460248, + "grad_norm": 1.1557425854754728, + "learning_rate": 5.255931222888497e-06, + "loss": 0.7201, + "num_input_tokens_seen": 1303122016, + "step": 7214 + }, + { + "epoch": 0.7898409918169627, + "grad_norm": 1.2087277831883068, + "learning_rate": 5.25065788910476e-06, + "loss": 1.0483, + "num_input_tokens_seen": 1303321152, + "step": 7215 + }, + { + "epoch": 0.7899504638879006, + "grad_norm": 1.2299016705418608, + "learning_rate": 5.245386891630441e-06, + "loss": 0.9278, + "num_input_tokens_seen": 1303507520, + "step": 7216 + }, + { + "epoch": 0.7900599359588385, + "grad_norm": 1.4197444673763917, + "learning_rate": 5.240118231089089e-06, + "loss": 1.0411, + "num_input_tokens_seen": 1303677312, + "step": 7217 + }, + { + "epoch": 0.7901694080297764, + "grad_norm": 1.13981207528399, + "learning_rate": 5.234851908103969e-06, + "loss": 0.8597, + "num_input_tokens_seen": 1303861888, + "step": 7218 + }, + { + "epoch": 0.7902788801007143, + "grad_norm": 1.298357518345872, + "learning_rate": 5.229587923298099e-06, + "loss": 0.689, + "num_input_tokens_seen": 1304040192, + "step": 7219 + }, + { + "epoch": 0.7903883521716522, + "grad_norm": 1.2299074376176107, + "learning_rate": 5.224326277294167e-06, + "loss": 0.8445, + "num_input_tokens_seen": 1304228128, + "step": 7220 + }, + { + "epoch": 0.7904978242425901, + "grad_norm": 1.2772787001725303, + "learning_rate": 5.219066970714639e-06, + "loss": 0.9796, + "num_input_tokens_seen": 1304409568, + "step": 7221 + }, + { + "epoch": 0.790607296313528, + "grad_norm": 1.0862701784636761, + "learning_rate": 5.2138100041816736e-06, + "loss": 0.7517, + "num_input_tokens_seen": 1304597728, + "step": 7222 + }, + { + "epoch": 0.7907167683844659, + "grad_norm": 1.135060257341442, + "learning_rate": 5.208555378317159e-06, + "loss": 0.8287, + "num_input_tokens_seen": 1304768416, + "step": 7223 + }, + { + "epoch": 0.7908262404554038, + "grad_norm": 1.3490959990766898, + "learning_rate": 5.203303093742712e-06, + "loss": 0.7679, + "num_input_tokens_seen": 1304953888, + "step": 7224 + }, + { + "epoch": 0.7909357125263418, + "grad_norm": 1.2276888492698967, + "learning_rate": 5.1980531510796595e-06, + "loss": 0.879, + "num_input_tokens_seen": 1305138240, + "step": 7225 + }, + { + "epoch": 0.7910451845972796, + "grad_norm": 1.1766392311719625, + "learning_rate": 5.192805550949068e-06, + "loss": 0.9277, + "num_input_tokens_seen": 1305310496, + "step": 7226 + }, + { + "epoch": 0.7911546566682175, + "grad_norm": 1.3808971437633388, + "learning_rate": 5.187560293971705e-06, + "loss": 1.0396, + "num_input_tokens_seen": 1305491264, + "step": 7227 + }, + { + "epoch": 0.7912641287391554, + "grad_norm": 1.17792606849728, + "learning_rate": 5.182317380768092e-06, + "loss": 0.6558, + "num_input_tokens_seen": 1305661056, + "step": 7228 + }, + { + "epoch": 0.7913736008100933, + "grad_norm": 1.2130627103101048, + "learning_rate": 5.177076811958451e-06, + "loss": 0.7057, + "num_input_tokens_seen": 1305797920, + "step": 7229 + }, + { + "epoch": 0.7914830728810313, + "grad_norm": 1.2377632573942747, + "learning_rate": 5.171838588162725e-06, + "loss": 0.788, + "num_input_tokens_seen": 1305939264, + "step": 7230 + }, + { + "epoch": 0.7915925449519692, + "grad_norm": 1.1410092268427792, + "learning_rate": 5.16660271000059e-06, + "loss": 0.6852, + "num_input_tokens_seen": 1306124512, + "step": 7231 + }, + { + "epoch": 0.791702017022907, + "grad_norm": 1.109258457563414, + "learning_rate": 5.161369178091438e-06, + "loss": 0.804, + "num_input_tokens_seen": 1306303936, + "step": 7232 + }, + { + "epoch": 0.7918114890938449, + "grad_norm": 1.2613002685419563, + "learning_rate": 5.1561379930543885e-06, + "loss": 0.9677, + "num_input_tokens_seen": 1306499936, + "step": 7233 + }, + { + "epoch": 0.7919209611647828, + "grad_norm": 1.2233943965719307, + "learning_rate": 5.1509091555082794e-06, + "loss": 0.8378, + "num_input_tokens_seen": 1306706688, + "step": 7234 + }, + { + "epoch": 0.7920304332357208, + "grad_norm": 1.18454349295626, + "learning_rate": 5.145682666071663e-06, + "loss": 1.0767, + "num_input_tokens_seen": 1306913888, + "step": 7235 + }, + { + "epoch": 0.7921399053066587, + "grad_norm": 1.2475596448442245, + "learning_rate": 5.140458525362848e-06, + "loss": 0.7659, + "num_input_tokens_seen": 1307046944, + "step": 7236 + }, + { + "epoch": 0.7922493773775965, + "grad_norm": 1.2756800184482309, + "learning_rate": 5.135236733999813e-06, + "loss": 0.7439, + "num_input_tokens_seen": 1307246528, + "step": 7237 + }, + { + "epoch": 0.7923588494485344, + "grad_norm": 1.3470495212715357, + "learning_rate": 5.1300172926003e-06, + "loss": 0.9518, + "num_input_tokens_seen": 1307440064, + "step": 7238 + }, + { + "epoch": 0.7924683215194723, + "grad_norm": 1.3776881643645944, + "learning_rate": 5.1248002017817596e-06, + "loss": 0.9752, + "num_input_tokens_seen": 1307623744, + "step": 7239 + }, + { + "epoch": 0.7925777935904103, + "grad_norm": 1.163598909168119, + "learning_rate": 5.119585462161358e-06, + "loss": 0.7972, + "num_input_tokens_seen": 1307824448, + "step": 7240 + }, + { + "epoch": 0.7926872656613482, + "grad_norm": 1.178101844458856, + "learning_rate": 5.114373074355994e-06, + "loss": 1.0068, + "num_input_tokens_seen": 1308033888, + "step": 7241 + }, + { + "epoch": 0.7927967377322861, + "grad_norm": 1.0976551096628584, + "learning_rate": 5.10916303898227e-06, + "loss": 0.9101, + "num_input_tokens_seen": 1308208832, + "step": 7242 + }, + { + "epoch": 0.7929062098032239, + "grad_norm": 1.1875387988528303, + "learning_rate": 5.1039553566565505e-06, + "loss": 0.9809, + "num_input_tokens_seen": 1308407072, + "step": 7243 + }, + { + "epoch": 0.7930156818741618, + "grad_norm": 1.1190977504433213, + "learning_rate": 5.098750027994862e-06, + "loss": 0.6347, + "num_input_tokens_seen": 1308583808, + "step": 7244 + }, + { + "epoch": 0.7931251539450997, + "grad_norm": 1.1704225312725787, + "learning_rate": 5.0935470536130155e-06, + "loss": 0.6307, + "num_input_tokens_seen": 1308761440, + "step": 7245 + }, + { + "epoch": 0.7932346260160377, + "grad_norm": 1.351172109682714, + "learning_rate": 5.088346434126481e-06, + "loss": 0.8108, + "num_input_tokens_seen": 1308919808, + "step": 7246 + }, + { + "epoch": 0.7933440980869756, + "grad_norm": 1.1828914862883386, + "learning_rate": 5.083148170150509e-06, + "loss": 0.7119, + "num_input_tokens_seen": 1309096320, + "step": 7247 + }, + { + "epoch": 0.7934535701579135, + "grad_norm": 1.3096133686585758, + "learning_rate": 5.0779522623000345e-06, + "loss": 0.8216, + "num_input_tokens_seen": 1309264768, + "step": 7248 + }, + { + "epoch": 0.7935630422288513, + "grad_norm": 1.1897321602385025, + "learning_rate": 5.072758711189721e-06, + "loss": 1.054, + "num_input_tokens_seen": 1309460768, + "step": 7249 + }, + { + "epoch": 0.7936725142997892, + "grad_norm": 1.1854246978398322, + "learning_rate": 5.067567517433958e-06, + "loss": 0.7784, + "num_input_tokens_seen": 1309654752, + "step": 7250 + }, + { + "epoch": 0.7937819863707272, + "grad_norm": 1.228231570058395, + "learning_rate": 5.062378681646845e-06, + "loss": 0.8269, + "num_input_tokens_seen": 1309837536, + "step": 7251 + }, + { + "epoch": 0.7938914584416651, + "grad_norm": 1.1795162556632963, + "learning_rate": 5.057192204442235e-06, + "loss": 0.8039, + "num_input_tokens_seen": 1310024576, + "step": 7252 + }, + { + "epoch": 0.794000930512603, + "grad_norm": 1.2026896989732208, + "learning_rate": 5.052008086433649e-06, + "loss": 0.7508, + "num_input_tokens_seen": 1310167264, + "step": 7253 + }, + { + "epoch": 0.7941104025835408, + "grad_norm": 1.31607585801978, + "learning_rate": 5.046826328234386e-06, + "loss": 0.8507, + "num_input_tokens_seen": 1310328320, + "step": 7254 + }, + { + "epoch": 0.7942198746544787, + "grad_norm": 1.277792615504891, + "learning_rate": 5.041646930457411e-06, + "loss": 0.8398, + "num_input_tokens_seen": 1310496992, + "step": 7255 + }, + { + "epoch": 0.7943293467254167, + "grad_norm": 1.0564682797040397, + "learning_rate": 5.0364698937154565e-06, + "loss": 0.5934, + "num_input_tokens_seen": 1310669920, + "step": 7256 + }, + { + "epoch": 0.7944388187963546, + "grad_norm": 1.117988459362047, + "learning_rate": 5.031295218620952e-06, + "loss": 0.9749, + "num_input_tokens_seen": 1310882944, + "step": 7257 + }, + { + "epoch": 0.7945482908672925, + "grad_norm": 1.1177887799031425, + "learning_rate": 5.026122905786046e-06, + "loss": 0.7317, + "num_input_tokens_seen": 1311019360, + "step": 7258 + }, + { + "epoch": 0.7946577629382304, + "grad_norm": 1.1681559230901646, + "learning_rate": 5.020952955822619e-06, + "loss": 0.8124, + "num_input_tokens_seen": 1311194752, + "step": 7259 + }, + { + "epoch": 0.7947672350091682, + "grad_norm": 1.2787773649955076, + "learning_rate": 5.015785369342255e-06, + "loss": 0.9262, + "num_input_tokens_seen": 1311364992, + "step": 7260 + }, + { + "epoch": 0.7948767070801062, + "grad_norm": 1.1326955274501522, + "learning_rate": 5.010620146956293e-06, + "loss": 0.8334, + "num_input_tokens_seen": 1311524928, + "step": 7261 + }, + { + "epoch": 0.7949861791510441, + "grad_norm": 1.1547081435736595, + "learning_rate": 5.0054572892757416e-06, + "loss": 0.9194, + "num_input_tokens_seen": 1311721600, + "step": 7262 + }, + { + "epoch": 0.795095651221982, + "grad_norm": 1.1970265384686414, + "learning_rate": 5.000296796911377e-06, + "loss": 0.8079, + "num_input_tokens_seen": 1311903040, + "step": 7263 + }, + { + "epoch": 0.7952051232929199, + "grad_norm": 1.190346017827803, + "learning_rate": 4.995138670473667e-06, + "loss": 0.7586, + "num_input_tokens_seen": 1312098592, + "step": 7264 + }, + { + "epoch": 0.7953145953638578, + "grad_norm": 1.2240287706384634, + "learning_rate": 4.98998291057281e-06, + "loss": 0.947, + "num_input_tokens_seen": 1312314080, + "step": 7265 + }, + { + "epoch": 0.7954240674347957, + "grad_norm": 1.170978050293343, + "learning_rate": 4.984829517818723e-06, + "loss": 0.8285, + "num_input_tokens_seen": 1312511200, + "step": 7266 + }, + { + "epoch": 0.7955335395057336, + "grad_norm": 1.1886200892625494, + "learning_rate": 4.979678492821041e-06, + "loss": 0.9699, + "num_input_tokens_seen": 1312689056, + "step": 7267 + }, + { + "epoch": 0.7956430115766715, + "grad_norm": 1.151702961319878, + "learning_rate": 4.974529836189113e-06, + "loss": 0.9445, + "num_input_tokens_seen": 1312887968, + "step": 7268 + }, + { + "epoch": 0.7957524836476094, + "grad_norm": 1.1022789971942855, + "learning_rate": 4.969383548532031e-06, + "loss": 0.841, + "num_input_tokens_seen": 1313066048, + "step": 7269 + }, + { + "epoch": 0.7958619557185473, + "grad_norm": 1.2762889648377684, + "learning_rate": 4.9642396304585834e-06, + "loss": 0.692, + "num_input_tokens_seen": 1313217920, + "step": 7270 + }, + { + "epoch": 0.7959714277894852, + "grad_norm": 1.2305448478297922, + "learning_rate": 4.959098082577284e-06, + "loss": 0.8613, + "num_input_tokens_seen": 1313387264, + "step": 7271 + }, + { + "epoch": 0.7960808998604231, + "grad_norm": 1.2147188275740466, + "learning_rate": 4.953958905496372e-06, + "loss": 0.8678, + "num_input_tokens_seen": 1313579008, + "step": 7272 + }, + { + "epoch": 0.796190371931361, + "grad_norm": 1.0573783738736022, + "learning_rate": 4.948822099823797e-06, + "loss": 0.7518, + "num_input_tokens_seen": 1313761792, + "step": 7273 + }, + { + "epoch": 0.7962998440022989, + "grad_norm": 1.0888887001273149, + "learning_rate": 4.943687666167238e-06, + "loss": 0.7532, + "num_input_tokens_seen": 1313943904, + "step": 7274 + }, + { + "epoch": 0.7964093160732368, + "grad_norm": 1.225333114018661, + "learning_rate": 4.938555605134082e-06, + "loss": 0.9277, + "num_input_tokens_seen": 1314101824, + "step": 7275 + }, + { + "epoch": 0.7965187881441748, + "grad_norm": 1.4659303424362025, + "learning_rate": 4.9334259173314575e-06, + "loss": 0.8782, + "num_input_tokens_seen": 1314267584, + "step": 7276 + }, + { + "epoch": 0.7966282602151126, + "grad_norm": 1.1556015773976336, + "learning_rate": 4.928298603366174e-06, + "loss": 0.8843, + "num_input_tokens_seen": 1314460672, + "step": 7277 + }, + { + "epoch": 0.7967377322860505, + "grad_norm": 1.0641937219759692, + "learning_rate": 4.92317366384481e-06, + "loss": 0.6677, + "num_input_tokens_seen": 1314642112, + "step": 7278 + }, + { + "epoch": 0.7968472043569884, + "grad_norm": 1.2174573058175993, + "learning_rate": 4.918051099373605e-06, + "loss": 0.7358, + "num_input_tokens_seen": 1314856256, + "step": 7279 + }, + { + "epoch": 0.7969566764279263, + "grad_norm": 1.0962823706863898, + "learning_rate": 4.912930910558572e-06, + "loss": 0.6461, + "num_input_tokens_seen": 1315023136, + "step": 7280 + }, + { + "epoch": 0.7970661484988643, + "grad_norm": 1.1668203570541553, + "learning_rate": 4.907813098005415e-06, + "loss": 0.7788, + "num_input_tokens_seen": 1315220704, + "step": 7281 + }, + { + "epoch": 0.7971756205698022, + "grad_norm": 1.5085853611985982, + "learning_rate": 4.90269766231955e-06, + "loss": 0.9316, + "num_input_tokens_seen": 1315414240, + "step": 7282 + }, + { + "epoch": 0.79728509264074, + "grad_norm": 1.1651473142944493, + "learning_rate": 4.897584604106145e-06, + "loss": 0.7114, + "num_input_tokens_seen": 1315555136, + "step": 7283 + }, + { + "epoch": 0.7973945647116779, + "grad_norm": 1.0704022669216493, + "learning_rate": 4.89247392397004e-06, + "loss": 1.1238, + "num_input_tokens_seen": 1315754496, + "step": 7284 + }, + { + "epoch": 0.7975040367826158, + "grad_norm": 1.2657739998977457, + "learning_rate": 4.8873656225158405e-06, + "loss": 0.8349, + "num_input_tokens_seen": 1315921824, + "step": 7285 + }, + { + "epoch": 0.7976135088535538, + "grad_norm": 1.2464653106924326, + "learning_rate": 4.88225970034783e-06, + "loss": 0.8369, + "num_input_tokens_seen": 1316085344, + "step": 7286 + }, + { + "epoch": 0.7977229809244917, + "grad_norm": 1.4335841872392656, + "learning_rate": 4.8771561580700505e-06, + "loss": 1.3288, + "num_input_tokens_seen": 1316281120, + "step": 7287 + }, + { + "epoch": 0.7978324529954295, + "grad_norm": 1.1743891893700023, + "learning_rate": 4.872054996286216e-06, + "loss": 0.7825, + "num_input_tokens_seen": 1316450464, + "step": 7288 + }, + { + "epoch": 0.7979419250663674, + "grad_norm": 1.2607090928678193, + "learning_rate": 4.866956215599802e-06, + "loss": 0.6704, + "num_input_tokens_seen": 1316614432, + "step": 7289 + }, + { + "epoch": 0.7980513971373053, + "grad_norm": 1.3834248280811925, + "learning_rate": 4.861859816613981e-06, + "loss": 0.8262, + "num_input_tokens_seen": 1316775488, + "step": 7290 + }, + { + "epoch": 0.7981608692082433, + "grad_norm": 1.274710785095958, + "learning_rate": 4.856765799931648e-06, + "loss": 1.0612, + "num_input_tokens_seen": 1316976640, + "step": 7291 + }, + { + "epoch": 0.7982703412791812, + "grad_norm": 1.205697319428583, + "learning_rate": 4.851674166155412e-06, + "loss": 0.7451, + "num_input_tokens_seen": 1317163680, + "step": 7292 + }, + { + "epoch": 0.7983798133501191, + "grad_norm": 1.026784534335199, + "learning_rate": 4.846584915887597e-06, + "loss": 0.7143, + "num_input_tokens_seen": 1317356320, + "step": 7293 + }, + { + "epoch": 0.7984892854210569, + "grad_norm": 1.277234553998017, + "learning_rate": 4.8414980497302755e-06, + "loss": 0.8668, + "num_input_tokens_seen": 1317524320, + "step": 7294 + }, + { + "epoch": 0.7985987574919948, + "grad_norm": 1.1892904035638623, + "learning_rate": 4.836413568285183e-06, + "loss": 0.799, + "num_input_tokens_seen": 1317682464, + "step": 7295 + }, + { + "epoch": 0.7987082295629327, + "grad_norm": 1.2734643664188066, + "learning_rate": 4.831331472153828e-06, + "loss": 0.8909, + "num_input_tokens_seen": 1317851136, + "step": 7296 + }, + { + "epoch": 0.7988177016338707, + "grad_norm": 1.2733464588783658, + "learning_rate": 4.8262517619374e-06, + "loss": 0.724, + "num_input_tokens_seen": 1318050720, + "step": 7297 + }, + { + "epoch": 0.7989271737048086, + "grad_norm": 1.2637909209647147, + "learning_rate": 4.821174438236825e-06, + "loss": 0.9177, + "num_input_tokens_seen": 1318231936, + "step": 7298 + }, + { + "epoch": 0.7990366457757465, + "grad_norm": 1.2320661076638175, + "learning_rate": 4.816099501652741e-06, + "loss": 0.8303, + "num_input_tokens_seen": 1318403968, + "step": 7299 + }, + { + "epoch": 0.7991461178466843, + "grad_norm": 1.3570963736869852, + "learning_rate": 4.8110269527854965e-06, + "loss": 0.8109, + "num_input_tokens_seen": 1318550016, + "step": 7300 + }, + { + "epoch": 0.7992555899176222, + "grad_norm": 1.3473428997686092, + "learning_rate": 4.805956792235172e-06, + "loss": 0.7822, + "num_input_tokens_seen": 1318721600, + "step": 7301 + }, + { + "epoch": 0.7993650619885602, + "grad_norm": 1.2866878948206433, + "learning_rate": 4.800889020601548e-06, + "loss": 1.0638, + "num_input_tokens_seen": 1318866304, + "step": 7302 + }, + { + "epoch": 0.7994745340594981, + "grad_norm": 1.3234478442295616, + "learning_rate": 4.795823638484142e-06, + "loss": 0.9173, + "num_input_tokens_seen": 1319056480, + "step": 7303 + }, + { + "epoch": 0.799584006130436, + "grad_norm": 1.189827946077665, + "learning_rate": 4.790760646482178e-06, + "loss": 0.8702, + "num_input_tokens_seen": 1319217312, + "step": 7304 + }, + { + "epoch": 0.7996934782013738, + "grad_norm": 1.1272937914273065, + "learning_rate": 4.785700045194596e-06, + "loss": 0.7576, + "num_input_tokens_seen": 1319394048, + "step": 7305 + }, + { + "epoch": 0.7998029502723117, + "grad_norm": 1.1147497831225905, + "learning_rate": 4.7806418352200565e-06, + "loss": 0.9075, + "num_input_tokens_seen": 1319601024, + "step": 7306 + }, + { + "epoch": 0.7999124223432497, + "grad_norm": 1.1080264101530337, + "learning_rate": 4.775586017156936e-06, + "loss": 0.8667, + "num_input_tokens_seen": 1319786944, + "step": 7307 + }, + { + "epoch": 0.8000218944141876, + "grad_norm": 1.201322022024568, + "learning_rate": 4.770532591603324e-06, + "loss": 0.8673, + "num_input_tokens_seen": 1319966592, + "step": 7308 + }, + { + "epoch": 0.8001313664851255, + "grad_norm": 1.2677962434369103, + "learning_rate": 4.765481559157034e-06, + "loss": 0.8826, + "num_input_tokens_seen": 1320149152, + "step": 7309 + }, + { + "epoch": 0.8002408385560634, + "grad_norm": 1.1283530493515839, + "learning_rate": 4.760432920415589e-06, + "loss": 0.7683, + "num_input_tokens_seen": 1320353440, + "step": 7310 + }, + { + "epoch": 0.8003503106270012, + "grad_norm": 1.178689041503209, + "learning_rate": 4.755386675976245e-06, + "loss": 0.7083, + "num_input_tokens_seen": 1320536224, + "step": 7311 + }, + { + "epoch": 0.8004597826979392, + "grad_norm": 1.1952117678015715, + "learning_rate": 4.750342826435955e-06, + "loss": 0.8471, + "num_input_tokens_seen": 1320739392, + "step": 7312 + }, + { + "epoch": 0.8005692547688771, + "grad_norm": 1.3379104946916405, + "learning_rate": 4.745301372391397e-06, + "loss": 1.229, + "num_input_tokens_seen": 1320927328, + "step": 7313 + }, + { + "epoch": 0.800678726839815, + "grad_norm": 1.2792887680146277, + "learning_rate": 4.740262314438968e-06, + "loss": 0.827, + "num_input_tokens_seen": 1321123104, + "step": 7314 + }, + { + "epoch": 0.8007881989107529, + "grad_norm": 1.3331864792695283, + "learning_rate": 4.7352256531747766e-06, + "loss": 0.8672, + "num_input_tokens_seen": 1321284832, + "step": 7315 + }, + { + "epoch": 0.8008976709816908, + "grad_norm": 1.072225212072136, + "learning_rate": 4.730191389194652e-06, + "loss": 0.6346, + "num_input_tokens_seen": 1321465824, + "step": 7316 + }, + { + "epoch": 0.8010071430526287, + "grad_norm": 1.0176523953972034, + "learning_rate": 4.725159523094127e-06, + "loss": 0.7188, + "num_input_tokens_seen": 1321632256, + "step": 7317 + }, + { + "epoch": 0.8011166151235666, + "grad_norm": 1.0315676691156332, + "learning_rate": 4.720130055468488e-06, + "loss": 0.7554, + "num_input_tokens_seen": 1321836096, + "step": 7318 + }, + { + "epoch": 0.8012260871945045, + "grad_norm": 1.0791246506650534, + "learning_rate": 4.7151029869126784e-06, + "loss": 0.7997, + "num_input_tokens_seen": 1322027392, + "step": 7319 + }, + { + "epoch": 0.8013355592654424, + "grad_norm": 1.1932773593121375, + "learning_rate": 4.710078318021424e-06, + "loss": 0.924, + "num_input_tokens_seen": 1322213536, + "step": 7320 + }, + { + "epoch": 0.8014450313363803, + "grad_norm": 1.3496136147726459, + "learning_rate": 4.705056049389101e-06, + "loss": 0.7621, + "num_input_tokens_seen": 1322377056, + "step": 7321 + }, + { + "epoch": 0.8015545034073182, + "grad_norm": 1.203070948984348, + "learning_rate": 4.700036181609857e-06, + "loss": 0.6781, + "num_input_tokens_seen": 1322556704, + "step": 7322 + }, + { + "epoch": 0.8016639754782561, + "grad_norm": 1.2202227081985708, + "learning_rate": 4.695018715277527e-06, + "loss": 0.6953, + "num_input_tokens_seen": 1322729856, + "step": 7323 + }, + { + "epoch": 0.801773447549194, + "grad_norm": 1.2451629989274537, + "learning_rate": 4.690003650985658e-06, + "loss": 1.0356, + "num_input_tokens_seen": 1322934368, + "step": 7324 + }, + { + "epoch": 0.8018829196201319, + "grad_norm": 1.1953807636789398, + "learning_rate": 4.684990989327548e-06, + "loss": 0.8542, + "num_input_tokens_seen": 1323103712, + "step": 7325 + }, + { + "epoch": 0.8019923916910698, + "grad_norm": 1.1208226211678827, + "learning_rate": 4.679980730896153e-06, + "loss": 0.8998, + "num_input_tokens_seen": 1323295232, + "step": 7326 + }, + { + "epoch": 0.8021018637620078, + "grad_norm": 1.3371762338383457, + "learning_rate": 4.674972876284203e-06, + "loss": 0.9954, + "num_input_tokens_seen": 1323467712, + "step": 7327 + }, + { + "epoch": 0.8022113358329456, + "grad_norm": 1.2359506714862905, + "learning_rate": 4.6699674260840955e-06, + "loss": 0.8045, + "num_input_tokens_seen": 1323649600, + "step": 7328 + }, + { + "epoch": 0.8023208079038835, + "grad_norm": 1.174535198806391, + "learning_rate": 4.664964380887985e-06, + "loss": 0.6878, + "num_input_tokens_seen": 1323840448, + "step": 7329 + }, + { + "epoch": 0.8024302799748214, + "grad_norm": 1.2560310780330977, + "learning_rate": 4.6599637412877125e-06, + "loss": 0.8504, + "num_input_tokens_seen": 1324028160, + "step": 7330 + }, + { + "epoch": 0.8025397520457593, + "grad_norm": 1.203613380292846, + "learning_rate": 4.654965507874845e-06, + "loss": 0.8893, + "num_input_tokens_seen": 1324204672, + "step": 7331 + }, + { + "epoch": 0.8026492241166973, + "grad_norm": 1.195845640874568, + "learning_rate": 4.649969681240668e-06, + "loss": 0.8834, + "num_input_tokens_seen": 1324373120, + "step": 7332 + }, + { + "epoch": 0.8027586961876352, + "grad_norm": 1.1607160149026923, + "learning_rate": 4.644976261976172e-06, + "loss": 1.0278, + "num_input_tokens_seen": 1324556352, + "step": 7333 + }, + { + "epoch": 0.802868168258573, + "grad_norm": 1.2515635724962, + "learning_rate": 4.639985250672074e-06, + "loss": 0.6972, + "num_input_tokens_seen": 1324750784, + "step": 7334 + }, + { + "epoch": 0.8029776403295109, + "grad_norm": 1.1850950583626834, + "learning_rate": 4.634996647918791e-06, + "loss": 0.8335, + "num_input_tokens_seen": 1324914976, + "step": 7335 + }, + { + "epoch": 0.8030871124004488, + "grad_norm": 1.1468530741162637, + "learning_rate": 4.63001045430648e-06, + "loss": 0.7898, + "num_input_tokens_seen": 1325077376, + "step": 7336 + }, + { + "epoch": 0.8031965844713868, + "grad_norm": 1.2732857923886594, + "learning_rate": 4.625026670424992e-06, + "loss": 0.9254, + "num_input_tokens_seen": 1325234176, + "step": 7337 + }, + { + "epoch": 0.8033060565423247, + "grad_norm": 1.2473685222441862, + "learning_rate": 4.620045296863898e-06, + "loss": 0.9004, + "num_input_tokens_seen": 1325400160, + "step": 7338 + }, + { + "epoch": 0.8034155286132625, + "grad_norm": 1.1603984130482965, + "learning_rate": 4.615066334212487e-06, + "loss": 0.8151, + "num_input_tokens_seen": 1325586976, + "step": 7339 + }, + { + "epoch": 0.8035250006842004, + "grad_norm": 1.1624931663394364, + "learning_rate": 4.61008978305976e-06, + "loss": 0.8776, + "num_input_tokens_seen": 1325754080, + "step": 7340 + }, + { + "epoch": 0.8036344727551383, + "grad_norm": 1.2337775414203553, + "learning_rate": 4.605115643994429e-06, + "loss": 1.0879, + "num_input_tokens_seen": 1325964416, + "step": 7341 + }, + { + "epoch": 0.8037439448260763, + "grad_norm": 1.1676941105398664, + "learning_rate": 4.6001439176049325e-06, + "loss": 0.8148, + "num_input_tokens_seen": 1326135104, + "step": 7342 + }, + { + "epoch": 0.8038534168970142, + "grad_norm": 1.2434707826587843, + "learning_rate": 4.595174604479405e-06, + "loss": 0.8462, + "num_input_tokens_seen": 1326312064, + "step": 7343 + }, + { + "epoch": 0.8039628889679521, + "grad_norm": 1.1142271607685628, + "learning_rate": 4.590207705205718e-06, + "loss": 0.8595, + "num_input_tokens_seen": 1326497984, + "step": 7344 + }, + { + "epoch": 0.8040723610388899, + "grad_norm": 1.1627934854496207, + "learning_rate": 4.585243220371446e-06, + "loss": 0.8105, + "num_input_tokens_seen": 1326673824, + "step": 7345 + }, + { + "epoch": 0.8041818331098278, + "grad_norm": 1.1280589900689495, + "learning_rate": 4.580281150563873e-06, + "loss": 0.9916, + "num_input_tokens_seen": 1326889088, + "step": 7346 + }, + { + "epoch": 0.8042913051807657, + "grad_norm": 1.093526817439467, + "learning_rate": 4.575321496370005e-06, + "loss": 0.815, + "num_input_tokens_seen": 1327073216, + "step": 7347 + }, + { + "epoch": 0.8044007772517037, + "grad_norm": 1.2166731818995806, + "learning_rate": 4.570364258376558e-06, + "loss": 0.6458, + "num_input_tokens_seen": 1327245248, + "step": 7348 + }, + { + "epoch": 0.8045102493226416, + "grad_norm": 1.1274090510731996, + "learning_rate": 4.565409437169965e-06, + "loss": 0.9682, + "num_input_tokens_seen": 1327428256, + "step": 7349 + }, + { + "epoch": 0.8046197213935795, + "grad_norm": 1.0768629786993276, + "learning_rate": 4.560457033336365e-06, + "loss": 0.8056, + "num_input_tokens_seen": 1327626720, + "step": 7350 + }, + { + "epoch": 0.8047291934645173, + "grad_norm": 0.9335784272408881, + "learning_rate": 4.555507047461638e-06, + "loss": 0.7252, + "num_input_tokens_seen": 1327783072, + "step": 7351 + }, + { + "epoch": 0.8048386655354552, + "grad_norm": 1.0347485009141317, + "learning_rate": 4.550559480131328e-06, + "loss": 0.8611, + "num_input_tokens_seen": 1327957120, + "step": 7352 + }, + { + "epoch": 0.8049481376063932, + "grad_norm": 1.2165580012946016, + "learning_rate": 4.5456143319307475e-06, + "loss": 0.953, + "num_input_tokens_seen": 1328127360, + "step": 7353 + }, + { + "epoch": 0.8050576096773311, + "grad_norm": 1.2749665798220304, + "learning_rate": 4.5406716034448905e-06, + "loss": 0.8742, + "num_input_tokens_seen": 1328302080, + "step": 7354 + }, + { + "epoch": 0.805167081748269, + "grad_norm": 1.2814804428162119, + "learning_rate": 4.535731295258469e-06, + "loss": 0.7185, + "num_input_tokens_seen": 1328458208, + "step": 7355 + }, + { + "epoch": 0.8052765538192068, + "grad_norm": 1.1541906524167311, + "learning_rate": 4.530793407955913e-06, + "loss": 0.8009, + "num_input_tokens_seen": 1328626208, + "step": 7356 + }, + { + "epoch": 0.8053860258901447, + "grad_norm": 1.2424951808076115, + "learning_rate": 4.525857942121364e-06, + "loss": 0.9275, + "num_input_tokens_seen": 1328810336, + "step": 7357 + }, + { + "epoch": 0.8054954979610827, + "grad_norm": 1.149414736634918, + "learning_rate": 4.520924898338691e-06, + "loss": 0.9113, + "num_input_tokens_seen": 1328994240, + "step": 7358 + }, + { + "epoch": 0.8056049700320206, + "grad_norm": 1.2355586786758583, + "learning_rate": 4.51599427719144e-06, + "loss": 0.9001, + "num_input_tokens_seen": 1329153280, + "step": 7359 + }, + { + "epoch": 0.8057144421029585, + "grad_norm": 1.2040675545969255, + "learning_rate": 4.511066079262921e-06, + "loss": 0.7351, + "num_input_tokens_seen": 1329318368, + "step": 7360 + }, + { + "epoch": 0.8058239141738964, + "grad_norm": 1.2925079077620774, + "learning_rate": 4.506140305136103e-06, + "loss": 1.0326, + "num_input_tokens_seen": 1329524448, + "step": 7361 + }, + { + "epoch": 0.8059333862448342, + "grad_norm": 1.0909378958280398, + "learning_rate": 4.501216955393722e-06, + "loss": 0.7761, + "num_input_tokens_seen": 1329713504, + "step": 7362 + }, + { + "epoch": 0.8060428583157722, + "grad_norm": 1.1045407555220093, + "learning_rate": 4.496296030618177e-06, + "loss": 0.7, + "num_input_tokens_seen": 1329896736, + "step": 7363 + }, + { + "epoch": 0.8061523303867101, + "grad_norm": 1.3074963423347734, + "learning_rate": 4.491377531391619e-06, + "loss": 0.7794, + "num_input_tokens_seen": 1330080864, + "step": 7364 + }, + { + "epoch": 0.806261802457648, + "grad_norm": 1.307313845422261, + "learning_rate": 4.486461458295896e-06, + "loss": 1.1626, + "num_input_tokens_seen": 1330271264, + "step": 7365 + }, + { + "epoch": 0.8063712745285859, + "grad_norm": 1.1121980450208797, + "learning_rate": 4.4815478119125595e-06, + "loss": 0.7979, + "num_input_tokens_seen": 1330438816, + "step": 7366 + }, + { + "epoch": 0.8064807465995238, + "grad_norm": 1.156230204644738, + "learning_rate": 4.4766365928229054e-06, + "loss": 0.7302, + "num_input_tokens_seen": 1330600544, + "step": 7367 + }, + { + "epoch": 0.8065902186704617, + "grad_norm": 1.2010782344883422, + "learning_rate": 4.471727801607895e-06, + "loss": 0.7553, + "num_input_tokens_seen": 1330782208, + "step": 7368 + }, + { + "epoch": 0.8066996907413996, + "grad_norm": 1.1941072957856727, + "learning_rate": 4.466821438848254e-06, + "loss": 0.6178, + "num_input_tokens_seen": 1330932512, + "step": 7369 + }, + { + "epoch": 0.8068091628123375, + "grad_norm": 1.088822026021829, + "learning_rate": 4.461917505124375e-06, + "loss": 0.5548, + "num_input_tokens_seen": 1331111936, + "step": 7370 + }, + { + "epoch": 0.8069186348832754, + "grad_norm": 1.1136249212551228, + "learning_rate": 4.457016001016395e-06, + "loss": 0.7824, + "num_input_tokens_seen": 1331275680, + "step": 7371 + }, + { + "epoch": 0.8070281069542133, + "grad_norm": 1.1218785851721473, + "learning_rate": 4.452116927104152e-06, + "loss": 0.7388, + "num_input_tokens_seen": 1331448832, + "step": 7372 + }, + { + "epoch": 0.8071375790251512, + "grad_norm": 1.1911062894839426, + "learning_rate": 4.447220283967196e-06, + "loss": 0.8116, + "num_input_tokens_seen": 1331641024, + "step": 7373 + }, + { + "epoch": 0.8072470510960891, + "grad_norm": 1.1189448799589243, + "learning_rate": 4.442326072184791e-06, + "loss": 0.8541, + "num_input_tokens_seen": 1331842176, + "step": 7374 + }, + { + "epoch": 0.807356523167027, + "grad_norm": 0.9672480599666651, + "learning_rate": 4.4374342923359125e-06, + "loss": 0.6279, + "num_input_tokens_seen": 1332058560, + "step": 7375 + }, + { + "epoch": 0.8074659952379649, + "grad_norm": 1.1565144726135879, + "learning_rate": 4.4325449449992455e-06, + "loss": 0.8353, + "num_input_tokens_seen": 1332270464, + "step": 7376 + }, + { + "epoch": 0.8075754673089028, + "grad_norm": 1.0910306640637413, + "learning_rate": 4.42765803075319e-06, + "loss": 0.7302, + "num_input_tokens_seen": 1332467360, + "step": 7377 + }, + { + "epoch": 0.8076849393798408, + "grad_norm": 1.1953143948808, + "learning_rate": 4.4227735501758654e-06, + "loss": 0.7016, + "num_input_tokens_seen": 1332676576, + "step": 7378 + }, + { + "epoch": 0.8077944114507786, + "grad_norm": 1.0523386478654386, + "learning_rate": 4.417891503845095e-06, + "loss": 0.7291, + "num_input_tokens_seen": 1332868768, + "step": 7379 + }, + { + "epoch": 0.8079038835217165, + "grad_norm": 1.1731924662144262, + "learning_rate": 4.413011892338412e-06, + "loss": 1.055, + "num_input_tokens_seen": 1333081120, + "step": 7380 + }, + { + "epoch": 0.8080133555926544, + "grad_norm": 1.3588690419063394, + "learning_rate": 4.408134716233067e-06, + "loss": 0.8997, + "num_input_tokens_seen": 1333270400, + "step": 7381 + }, + { + "epoch": 0.8081228276635923, + "grad_norm": 1.0482178231924881, + "learning_rate": 4.403259976106019e-06, + "loss": 0.7662, + "num_input_tokens_seen": 1333459008, + "step": 7382 + }, + { + "epoch": 0.8082322997345303, + "grad_norm": 1.0773490656665565, + "learning_rate": 4.398387672533944e-06, + "loss": 0.8401, + "num_input_tokens_seen": 1333674272, + "step": 7383 + }, + { + "epoch": 0.8083417718054682, + "grad_norm": 1.1404888646653768, + "learning_rate": 4.393517806093219e-06, + "loss": 0.8886, + "num_input_tokens_seen": 1333882144, + "step": 7384 + }, + { + "epoch": 0.808451243876406, + "grad_norm": 1.1708097575428487, + "learning_rate": 4.388650377359943e-06, + "loss": 0.9777, + "num_input_tokens_seen": 1334082624, + "step": 7385 + }, + { + "epoch": 0.8085607159473439, + "grad_norm": 1.2604357452630048, + "learning_rate": 4.383785386909931e-06, + "loss": 1.0062, + "num_input_tokens_seen": 1334251072, + "step": 7386 + }, + { + "epoch": 0.8086701880182818, + "grad_norm": 1.3047898760620935, + "learning_rate": 4.378922835318694e-06, + "loss": 0.9016, + "num_input_tokens_seen": 1334385024, + "step": 7387 + }, + { + "epoch": 0.8087796600892198, + "grad_norm": 1.1906607374700084, + "learning_rate": 4.374062723161468e-06, + "loss": 0.8015, + "num_input_tokens_seen": 1334564672, + "step": 7388 + }, + { + "epoch": 0.8088891321601577, + "grad_norm": 1.2988650209162669, + "learning_rate": 4.369205051013189e-06, + "loss": 0.9851, + "num_input_tokens_seen": 1334707360, + "step": 7389 + }, + { + "epoch": 0.8089986042310956, + "grad_norm": 1.1222745412385378, + "learning_rate": 4.364349819448507e-06, + "loss": 0.5972, + "num_input_tokens_seen": 1334874016, + "step": 7390 + }, + { + "epoch": 0.8091080763020334, + "grad_norm": 1.2402865181121672, + "learning_rate": 4.359497029041807e-06, + "loss": 1.1233, + "num_input_tokens_seen": 1335070688, + "step": 7391 + }, + { + "epoch": 0.8092175483729713, + "grad_norm": 1.0245059893066726, + "learning_rate": 4.354646680367136e-06, + "loss": 0.5571, + "num_input_tokens_seen": 1335242272, + "step": 7392 + }, + { + "epoch": 0.8093270204439093, + "grad_norm": 1.2433641247264613, + "learning_rate": 4.34979877399831e-06, + "loss": 0.784, + "num_input_tokens_seen": 1335412288, + "step": 7393 + }, + { + "epoch": 0.8094364925148472, + "grad_norm": 1.1517762419267816, + "learning_rate": 4.3449533105087984e-06, + "loss": 0.9471, + "num_input_tokens_seen": 1335611424, + "step": 7394 + }, + { + "epoch": 0.8095459645857851, + "grad_norm": 1.1999970436059773, + "learning_rate": 4.3401102904718296e-06, + "loss": 0.7161, + "num_input_tokens_seen": 1335776288, + "step": 7395 + }, + { + "epoch": 0.8096554366567229, + "grad_norm": 1.144642847238417, + "learning_rate": 4.335269714460322e-06, + "loss": 0.6695, + "num_input_tokens_seen": 1335943616, + "step": 7396 + }, + { + "epoch": 0.8097649087276608, + "grad_norm": 1.1035726616897077, + "learning_rate": 4.3304315830468985e-06, + "loss": 0.9346, + "num_input_tokens_seen": 1336130880, + "step": 7397 + }, + { + "epoch": 0.8098743807985987, + "grad_norm": 1.1082114444241555, + "learning_rate": 4.325595896803908e-06, + "loss": 0.7704, + "num_input_tokens_seen": 1336346144, + "step": 7398 + }, + { + "epoch": 0.8099838528695367, + "grad_norm": 1.2890324906988697, + "learning_rate": 4.320762656303392e-06, + "loss": 0.8845, + "num_input_tokens_seen": 1336521984, + "step": 7399 + }, + { + "epoch": 0.8100933249404746, + "grad_norm": 1.1746239527110762, + "learning_rate": 4.315931862117137e-06, + "loss": 0.8613, + "num_input_tokens_seen": 1336681920, + "step": 7400 + }, + { + "epoch": 0.8102027970114125, + "grad_norm": 1.3854642539587578, + "learning_rate": 4.311103514816589e-06, + "loss": 0.753, + "num_input_tokens_seen": 1336815872, + "step": 7401 + }, + { + "epoch": 0.8103122690823503, + "grad_norm": 1.1015329018803142, + "learning_rate": 4.306277614972956e-06, + "loss": 0.7864, + "num_input_tokens_seen": 1337000896, + "step": 7402 + }, + { + "epoch": 0.8104217411532882, + "grad_norm": 1.2392695481245941, + "learning_rate": 4.3014541631571095e-06, + "loss": 0.848, + "num_input_tokens_seen": 1337183680, + "step": 7403 + }, + { + "epoch": 0.8105312132242262, + "grad_norm": 1.1027420438255913, + "learning_rate": 4.29663315993967e-06, + "loss": 0.9624, + "num_input_tokens_seen": 1337376768, + "step": 7404 + }, + { + "epoch": 0.8106406852951641, + "grad_norm": 1.2937385134716866, + "learning_rate": 4.291814605890954e-06, + "loss": 0.9327, + "num_input_tokens_seen": 1337541632, + "step": 7405 + }, + { + "epoch": 0.810750157366102, + "grad_norm": 1.0958871941813197, + "learning_rate": 4.28699850158098e-06, + "loss": 0.6438, + "num_input_tokens_seen": 1337694624, + "step": 7406 + }, + { + "epoch": 0.8108596294370399, + "grad_norm": 1.1932116727685784, + "learning_rate": 4.2821848475794875e-06, + "loss": 1.0823, + "num_input_tokens_seen": 1337878080, + "step": 7407 + }, + { + "epoch": 0.8109691015079777, + "grad_norm": 1.146980866995143, + "learning_rate": 4.277373644455915e-06, + "loss": 0.8478, + "num_input_tokens_seen": 1338059744, + "step": 7408 + }, + { + "epoch": 0.8110785735789157, + "grad_norm": 1.1891802394888091, + "learning_rate": 4.272564892779438e-06, + "loss": 0.9285, + "num_input_tokens_seen": 1338242304, + "step": 7409 + }, + { + "epoch": 0.8111880456498536, + "grad_norm": 1.251418024646674, + "learning_rate": 4.267758593118898e-06, + "loss": 0.919, + "num_input_tokens_seen": 1338426656, + "step": 7410 + }, + { + "epoch": 0.8112975177207915, + "grad_norm": 1.0561735069069853, + "learning_rate": 4.262954746042888e-06, + "loss": 0.7394, + "num_input_tokens_seen": 1338594656, + "step": 7411 + }, + { + "epoch": 0.8114069897917294, + "grad_norm": 1.1365933693235442, + "learning_rate": 4.258153352119693e-06, + "loss": 0.821, + "num_input_tokens_seen": 1338753024, + "step": 7412 + }, + { + "epoch": 0.8115164618626672, + "grad_norm": 1.0400747660208227, + "learning_rate": 4.253354411917302e-06, + "loss": 0.7586, + "num_input_tokens_seen": 1338927744, + "step": 7413 + }, + { + "epoch": 0.8116259339336052, + "grad_norm": 1.1490979015051992, + "learning_rate": 4.2485579260034215e-06, + "loss": 0.6698, + "num_input_tokens_seen": 1339084992, + "step": 7414 + }, + { + "epoch": 0.8117354060045431, + "grad_norm": 1.27556110078121, + "learning_rate": 4.243763894945471e-06, + "loss": 0.8649, + "num_input_tokens_seen": 1339261280, + "step": 7415 + }, + { + "epoch": 0.811844878075481, + "grad_norm": 1.3438054893924298, + "learning_rate": 4.238972319310572e-06, + "loss": 0.893, + "num_input_tokens_seen": 1339400384, + "step": 7416 + }, + { + "epoch": 0.8119543501464189, + "grad_norm": 1.2202762925641437, + "learning_rate": 4.234183199665559e-06, + "loss": 1.0016, + "num_input_tokens_seen": 1339573088, + "step": 7417 + }, + { + "epoch": 0.8120638222173568, + "grad_norm": 1.0971311351669342, + "learning_rate": 4.229396536576968e-06, + "loss": 0.7836, + "num_input_tokens_seen": 1339782976, + "step": 7418 + }, + { + "epoch": 0.8121732942882947, + "grad_norm": 1.2551667244795988, + "learning_rate": 4.224612330611069e-06, + "loss": 0.6798, + "num_input_tokens_seen": 1339929248, + "step": 7419 + }, + { + "epoch": 0.8122827663592326, + "grad_norm": 1.1993083112741831, + "learning_rate": 4.219830582333814e-06, + "loss": 0.8248, + "num_input_tokens_seen": 1340105536, + "step": 7420 + }, + { + "epoch": 0.8123922384301705, + "grad_norm": 1.2423809068890384, + "learning_rate": 4.215051292310876e-06, + "loss": 0.8452, + "num_input_tokens_seen": 1340273536, + "step": 7421 + }, + { + "epoch": 0.8125017105011084, + "grad_norm": 1.1457173317653688, + "learning_rate": 4.210274461107638e-06, + "loss": 0.6397, + "num_input_tokens_seen": 1340458560, + "step": 7422 + }, + { + "epoch": 0.8126111825720463, + "grad_norm": 1.1431544228047459, + "learning_rate": 4.205500089289185e-06, + "loss": 0.9328, + "num_input_tokens_seen": 1340648512, + "step": 7423 + }, + { + "epoch": 0.8127206546429843, + "grad_norm": 1.0639828264205713, + "learning_rate": 4.200728177420321e-06, + "loss": 0.7542, + "num_input_tokens_seen": 1340833312, + "step": 7424 + }, + { + "epoch": 0.8128301267139221, + "grad_norm": 1.3497814725685302, + "learning_rate": 4.1959587260655465e-06, + "loss": 1.1945, + "num_input_tokens_seen": 1341012512, + "step": 7425 + }, + { + "epoch": 0.81293959878486, + "grad_norm": 1.063196402814476, + "learning_rate": 4.191191735789096e-06, + "loss": 0.7304, + "num_input_tokens_seen": 1341217696, + "step": 7426 + }, + { + "epoch": 0.8130490708557979, + "grad_norm": 1.1377562412433606, + "learning_rate": 4.186427207154869e-06, + "loss": 0.847, + "num_input_tokens_seen": 1341394208, + "step": 7427 + }, + { + "epoch": 0.8131585429267358, + "grad_norm": 1.2583137129628714, + "learning_rate": 4.181665140726523e-06, + "loss": 0.8174, + "num_input_tokens_seen": 1341558176, + "step": 7428 + }, + { + "epoch": 0.8132680149976738, + "grad_norm": 1.1074942644198937, + "learning_rate": 4.176905537067394e-06, + "loss": 0.8906, + "num_input_tokens_seen": 1341716992, + "step": 7429 + }, + { + "epoch": 0.8133774870686116, + "grad_norm": 1.1695300737373793, + "learning_rate": 4.1721483967405305e-06, + "loss": 0.8367, + "num_input_tokens_seen": 1341893280, + "step": 7430 + }, + { + "epoch": 0.8134869591395495, + "grad_norm": 1.1871793715642296, + "learning_rate": 4.167393720308699e-06, + "loss": 0.8482, + "num_input_tokens_seen": 1342072032, + "step": 7431 + }, + { + "epoch": 0.8135964312104874, + "grad_norm": 1.0969931340519987, + "learning_rate": 4.162641508334355e-06, + "loss": 0.7623, + "num_input_tokens_seen": 1342274080, + "step": 7432 + }, + { + "epoch": 0.8137059032814253, + "grad_norm": 1.1990558625475731, + "learning_rate": 4.157891761379701e-06, + "loss": 0.9178, + "num_input_tokens_seen": 1342449920, + "step": 7433 + }, + { + "epoch": 0.8138153753523633, + "grad_norm": 1.1242703084674366, + "learning_rate": 4.153144480006593e-06, + "loss": 0.9183, + "num_input_tokens_seen": 1342636736, + "step": 7434 + }, + { + "epoch": 0.8139248474233012, + "grad_norm": 1.3044426511248195, + "learning_rate": 4.148399664776656e-06, + "loss": 0.9112, + "num_input_tokens_seen": 1342808544, + "step": 7435 + }, + { + "epoch": 0.814034319494239, + "grad_norm": 1.1854599947572764, + "learning_rate": 4.143657316251165e-06, + "loss": 0.7983, + "num_input_tokens_seen": 1342996704, + "step": 7436 + }, + { + "epoch": 0.8141437915651769, + "grad_norm": 1.1471338463324003, + "learning_rate": 4.1389174349911495e-06, + "loss": 0.8805, + "num_input_tokens_seen": 1343168736, + "step": 7437 + }, + { + "epoch": 0.8142532636361148, + "grad_norm": 1.0993510564524787, + "learning_rate": 4.1341800215573185e-06, + "loss": 0.6404, + "num_input_tokens_seen": 1343357344, + "step": 7438 + }, + { + "epoch": 0.8143627357070528, + "grad_norm": 1.20561881277851, + "learning_rate": 4.129445076510105e-06, + "loss": 0.9419, + "num_input_tokens_seen": 1343525792, + "step": 7439 + }, + { + "epoch": 0.8144722077779907, + "grad_norm": 1.2458297306293633, + "learning_rate": 4.124712600409638e-06, + "loss": 0.7908, + "num_input_tokens_seen": 1343701184, + "step": 7440 + }, + { + "epoch": 0.8145816798489286, + "grad_norm": 1.143859770463256, + "learning_rate": 4.119982593815761e-06, + "loss": 0.9502, + "num_input_tokens_seen": 1343877248, + "step": 7441 + }, + { + "epoch": 0.8146911519198664, + "grad_norm": 1.0383646636591708, + "learning_rate": 4.11525505728804e-06, + "loss": 0.8549, + "num_input_tokens_seen": 1344084448, + "step": 7442 + }, + { + "epoch": 0.8148006239908043, + "grad_norm": 1.1706584974100211, + "learning_rate": 4.110529991385706e-06, + "loss": 0.9725, + "num_input_tokens_seen": 1344243040, + "step": 7443 + }, + { + "epoch": 0.8149100960617423, + "grad_norm": 1.0895119438829697, + "learning_rate": 4.105807396667755e-06, + "loss": 0.8335, + "num_input_tokens_seen": 1344424928, + "step": 7444 + }, + { + "epoch": 0.8150195681326802, + "grad_norm": 1.1968380038367676, + "learning_rate": 4.10108727369283e-06, + "loss": 1.1022, + "num_input_tokens_seen": 1344641088, + "step": 7445 + }, + { + "epoch": 0.8151290402036181, + "grad_norm": 1.2540540756600431, + "learning_rate": 4.0963696230193385e-06, + "loss": 0.9508, + "num_input_tokens_seen": 1344801920, + "step": 7446 + }, + { + "epoch": 0.8152385122745559, + "grad_norm": 1.2438634446897052, + "learning_rate": 4.091654445205356e-06, + "loss": 0.7839, + "num_input_tokens_seen": 1344972160, + "step": 7447 + }, + { + "epoch": 0.8153479843454938, + "grad_norm": 1.1621919490424408, + "learning_rate": 4.086941740808686e-06, + "loss": 0.7279, + "num_input_tokens_seen": 1345161216, + "step": 7448 + }, + { + "epoch": 0.8154574564164317, + "grad_norm": 1.1426591371687307, + "learning_rate": 4.082231510386828e-06, + "loss": 0.7182, + "num_input_tokens_seen": 1345324960, + "step": 7449 + }, + { + "epoch": 0.8155669284873697, + "grad_norm": 1.1652381643706193, + "learning_rate": 4.077523754496987e-06, + "loss": 0.9246, + "num_input_tokens_seen": 1345540896, + "step": 7450 + }, + { + "epoch": 0.8156764005583076, + "grad_norm": 1.3193086046709563, + "learning_rate": 4.0728184736961025e-06, + "loss": 0.7945, + "num_input_tokens_seen": 1345674400, + "step": 7451 + }, + { + "epoch": 0.8157858726292455, + "grad_norm": 1.298413203226722, + "learning_rate": 4.068115668540776e-06, + "loss": 0.9205, + "num_input_tokens_seen": 1345828960, + "step": 7452 + }, + { + "epoch": 0.8158953447001833, + "grad_norm": 1.084851057667262, + "learning_rate": 4.063415339587354e-06, + "loss": 0.8473, + "num_input_tokens_seen": 1346009728, + "step": 7453 + }, + { + "epoch": 0.8160048167711212, + "grad_norm": 1.1766758553863896, + "learning_rate": 4.058717487391875e-06, + "loss": 1.0731, + "num_input_tokens_seen": 1346209536, + "step": 7454 + }, + { + "epoch": 0.8161142888420592, + "grad_norm": 1.373309396578378, + "learning_rate": 4.0540221125100835e-06, + "loss": 0.9373, + "num_input_tokens_seen": 1346372832, + "step": 7455 + }, + { + "epoch": 0.8162237609129971, + "grad_norm": 1.0554817882215373, + "learning_rate": 4.049329215497433e-06, + "loss": 0.7965, + "num_input_tokens_seen": 1346573984, + "step": 7456 + }, + { + "epoch": 0.816333232983935, + "grad_norm": 1.1971650073538145, + "learning_rate": 4.0446387969090865e-06, + "loss": 0.8718, + "num_input_tokens_seen": 1346783648, + "step": 7457 + }, + { + "epoch": 0.8164427050548729, + "grad_norm": 1.1220027946430564, + "learning_rate": 4.039950857299907e-06, + "loss": 0.8114, + "num_input_tokens_seen": 1346974048, + "step": 7458 + }, + { + "epoch": 0.8165521771258107, + "grad_norm": 1.1759619589803112, + "learning_rate": 4.0352653972244805e-06, + "loss": 0.9372, + "num_input_tokens_seen": 1347157504, + "step": 7459 + }, + { + "epoch": 0.8166616491967487, + "grad_norm": 1.0631668018014666, + "learning_rate": 4.030582417237069e-06, + "loss": 0.8335, + "num_input_tokens_seen": 1347339168, + "step": 7460 + }, + { + "epoch": 0.8167711212676866, + "grad_norm": 1.1838062460727967, + "learning_rate": 4.025901917891678e-06, + "loss": 0.9939, + "num_input_tokens_seen": 1347473792, + "step": 7461 + }, + { + "epoch": 0.8168805933386245, + "grad_norm": 1.2341669607494827, + "learning_rate": 4.021223899741993e-06, + "loss": 0.9088, + "num_input_tokens_seen": 1347679424, + "step": 7462 + }, + { + "epoch": 0.8169900654095624, + "grad_norm": 1.1613951980925115, + "learning_rate": 4.016548363341416e-06, + "loss": 0.918, + "num_input_tokens_seen": 1347882368, + "step": 7463 + }, + { + "epoch": 0.8170995374805002, + "grad_norm": 1.2206236302240658, + "learning_rate": 4.011875309243054e-06, + "loss": 1.0119, + "num_input_tokens_seen": 1348074784, + "step": 7464 + }, + { + "epoch": 0.8172090095514382, + "grad_norm": 1.2581749624838803, + "learning_rate": 4.0072047379997175e-06, + "loss": 1.0402, + "num_input_tokens_seen": 1348251072, + "step": 7465 + }, + { + "epoch": 0.8173184816223761, + "grad_norm": 1.0911811088498513, + "learning_rate": 4.002536650163938e-06, + "loss": 0.674, + "num_input_tokens_seen": 1348422656, + "step": 7466 + }, + { + "epoch": 0.817427953693314, + "grad_norm": 1.1433175066602306, + "learning_rate": 3.9978710462879206e-06, + "loss": 0.8421, + "num_input_tokens_seen": 1348630304, + "step": 7467 + }, + { + "epoch": 0.8175374257642519, + "grad_norm": 1.160731779742193, + "learning_rate": 3.993207926923623e-06, + "loss": 0.9513, + "num_input_tokens_seen": 1348832128, + "step": 7468 + }, + { + "epoch": 0.8176468978351898, + "grad_norm": 1.0409705061018657, + "learning_rate": 3.988547292622655e-06, + "loss": 0.7267, + "num_input_tokens_seen": 1348996544, + "step": 7469 + }, + { + "epoch": 0.8177563699061277, + "grad_norm": 1.2540584959018035, + "learning_rate": 3.9838891439363816e-06, + "loss": 0.8311, + "num_input_tokens_seen": 1349166784, + "step": 7470 + }, + { + "epoch": 0.8178658419770656, + "grad_norm": 1.3087678708055968, + "learning_rate": 3.979233481415848e-06, + "loss": 0.9876, + "num_input_tokens_seen": 1349322240, + "step": 7471 + }, + { + "epoch": 0.8179753140480035, + "grad_norm": 1.4776667749611536, + "learning_rate": 3.974580305611808e-06, + "loss": 0.8015, + "num_input_tokens_seen": 1349502784, + "step": 7472 + }, + { + "epoch": 0.8180847861189414, + "grad_norm": 1.2487418518717017, + "learning_rate": 3.9699296170747245e-06, + "loss": 0.9212, + "num_input_tokens_seen": 1349659808, + "step": 7473 + }, + { + "epoch": 0.8181942581898793, + "grad_norm": 1.162498652549701, + "learning_rate": 3.965281416354757e-06, + "loss": 0.8054, + "num_input_tokens_seen": 1349826464, + "step": 7474 + }, + { + "epoch": 0.8183037302608173, + "grad_norm": 1.2687364643295687, + "learning_rate": 3.9606357040018e-06, + "loss": 0.9847, + "num_input_tokens_seen": 1350000736, + "step": 7475 + }, + { + "epoch": 0.8184132023317551, + "grad_norm": 1.2742080229606478, + "learning_rate": 3.955992480565407e-06, + "loss": 0.9934, + "num_input_tokens_seen": 1350188448, + "step": 7476 + }, + { + "epoch": 0.818522674402693, + "grad_norm": 1.0901545693230246, + "learning_rate": 3.951351746594883e-06, + "loss": 0.6949, + "num_input_tokens_seen": 1350362272, + "step": 7477 + }, + { + "epoch": 0.8186321464736309, + "grad_norm": 1.1472722064703915, + "learning_rate": 3.9467135026392015e-06, + "loss": 0.7585, + "num_input_tokens_seen": 1350558496, + "step": 7478 + }, + { + "epoch": 0.8187416185445688, + "grad_norm": 1.2094723605608741, + "learning_rate": 3.94207774924707e-06, + "loss": 1.0478, + "num_input_tokens_seen": 1350728736, + "step": 7479 + }, + { + "epoch": 0.8188510906155068, + "grad_norm": 1.2127146815950975, + "learning_rate": 3.937444486966885e-06, + "loss": 1.0816, + "num_input_tokens_seen": 1350930112, + "step": 7480 + }, + { + "epoch": 0.8189605626864446, + "grad_norm": 1.0379178618969513, + "learning_rate": 3.932813716346751e-06, + "loss": 0.5673, + "num_input_tokens_seen": 1351106848, + "step": 7481 + }, + { + "epoch": 0.8190700347573825, + "grad_norm": 1.1555161080188858, + "learning_rate": 3.928185437934481e-06, + "loss": 0.7269, + "num_input_tokens_seen": 1351303296, + "step": 7482 + }, + { + "epoch": 0.8191795068283204, + "grad_norm": 1.2486235192351498, + "learning_rate": 3.923559652277586e-06, + "loss": 0.9375, + "num_input_tokens_seen": 1351471296, + "step": 7483 + }, + { + "epoch": 0.8192889788992583, + "grad_norm": 1.3029907065884982, + "learning_rate": 3.918936359923306e-06, + "loss": 0.8935, + "num_input_tokens_seen": 1351682976, + "step": 7484 + }, + { + "epoch": 0.8193984509701963, + "grad_norm": 1.1359406000276056, + "learning_rate": 3.914315561418541e-06, + "loss": 0.7457, + "num_input_tokens_seen": 1351848736, + "step": 7485 + }, + { + "epoch": 0.8195079230411342, + "grad_norm": 1.2239697503172913, + "learning_rate": 3.909697257309941e-06, + "loss": 0.8354, + "num_input_tokens_seen": 1352026816, + "step": 7486 + }, + { + "epoch": 0.819617395112072, + "grad_norm": 1.0353003635495392, + "learning_rate": 3.905081448143841e-06, + "loss": 0.7392, + "num_input_tokens_seen": 1352186080, + "step": 7487 + }, + { + "epoch": 0.8197268671830099, + "grad_norm": 1.1695609578836101, + "learning_rate": 3.9004681344662755e-06, + "loss": 0.7189, + "num_input_tokens_seen": 1352362368, + "step": 7488 + }, + { + "epoch": 0.8198363392539478, + "grad_norm": 1.0511042442381984, + "learning_rate": 3.895857316822996e-06, + "loss": 0.8238, + "num_input_tokens_seen": 1352541568, + "step": 7489 + }, + { + "epoch": 0.8199458113248858, + "grad_norm": 1.2746954479376558, + "learning_rate": 3.891248995759453e-06, + "loss": 0.8071, + "num_input_tokens_seen": 1352752576, + "step": 7490 + }, + { + "epoch": 0.8200552833958237, + "grad_norm": 1.1962901586410795, + "learning_rate": 3.886643171820797e-06, + "loss": 0.8081, + "num_input_tokens_seen": 1352907808, + "step": 7491 + }, + { + "epoch": 0.8201647554667616, + "grad_norm": 1.154295790618542, + "learning_rate": 3.882039845551888e-06, + "loss": 0.8746, + "num_input_tokens_seen": 1353098208, + "step": 7492 + }, + { + "epoch": 0.8202742275376994, + "grad_norm": 1.3443750657596332, + "learning_rate": 3.877439017497303e-06, + "loss": 0.936, + "num_input_tokens_seen": 1353287936, + "step": 7493 + }, + { + "epoch": 0.8203836996086373, + "grad_norm": 1.402031016212137, + "learning_rate": 3.872840688201299e-06, + "loss": 0.8818, + "num_input_tokens_seen": 1353456384, + "step": 7494 + }, + { + "epoch": 0.8204931716795753, + "grad_norm": 1.1987601033760538, + "learning_rate": 3.868244858207854e-06, + "loss": 1.0529, + "num_input_tokens_seen": 1353632224, + "step": 7495 + }, + { + "epoch": 0.8206026437505132, + "grad_norm": 1.1743274711991698, + "learning_rate": 3.863651528060647e-06, + "loss": 0.793, + "num_input_tokens_seen": 1353804480, + "step": 7496 + }, + { + "epoch": 0.8207121158214511, + "grad_norm": 1.3670471991348636, + "learning_rate": 3.859060698303058e-06, + "loss": 1.0167, + "num_input_tokens_seen": 1353968000, + "step": 7497 + }, + { + "epoch": 0.8208215878923889, + "grad_norm": 1.2498497872696204, + "learning_rate": 3.8544723694781706e-06, + "loss": 0.5841, + "num_input_tokens_seen": 1354139136, + "step": 7498 + }, + { + "epoch": 0.8209310599633268, + "grad_norm": 1.260355067783969, + "learning_rate": 3.849886542128784e-06, + "loss": 0.9554, + "num_input_tokens_seen": 1354338944, + "step": 7499 + }, + { + "epoch": 0.8210405320342647, + "grad_norm": 1.3785069565623422, + "learning_rate": 3.845303216797377e-06, + "loss": 0.9102, + "num_input_tokens_seen": 1354505152, + "step": 7500 + }, + { + "epoch": 0.8211500041052027, + "grad_norm": 1.173677609357351, + "learning_rate": 3.8407223940261725e-06, + "loss": 0.885, + "num_input_tokens_seen": 1354704512, + "step": 7501 + }, + { + "epoch": 0.8212594761761406, + "grad_norm": 1.245037049351224, + "learning_rate": 3.8361440743570456e-06, + "loss": 0.9483, + "num_input_tokens_seen": 1354901408, + "step": 7502 + }, + { + "epoch": 0.8213689482470785, + "grad_norm": 1.1249995761446685, + "learning_rate": 3.8315682583316224e-06, + "loss": 0.8181, + "num_input_tokens_seen": 1355066720, + "step": 7503 + }, + { + "epoch": 0.8214784203180163, + "grad_norm": 1.10158793102155, + "learning_rate": 3.826994946491208e-06, + "loss": 0.9978, + "num_input_tokens_seen": 1355258688, + "step": 7504 + }, + { + "epoch": 0.8215878923889542, + "grad_norm": 1.025046677805898, + "learning_rate": 3.822424139376815e-06, + "loss": 0.6953, + "num_input_tokens_seen": 1355456704, + "step": 7505 + }, + { + "epoch": 0.8216973644598922, + "grad_norm": 1.1361065556425267, + "learning_rate": 3.817855837529164e-06, + "loss": 0.8468, + "num_input_tokens_seen": 1355641280, + "step": 7506 + }, + { + "epoch": 0.8218068365308301, + "grad_norm": 1.2509898081546513, + "learning_rate": 3.8132900414886653e-06, + "loss": 0.9035, + "num_input_tokens_seen": 1355810848, + "step": 7507 + }, + { + "epoch": 0.821916308601768, + "grad_norm": 1.1194947433109739, + "learning_rate": 3.8087267517954633e-06, + "loss": 0.8664, + "num_input_tokens_seen": 1355988032, + "step": 7508 + }, + { + "epoch": 0.8220257806727059, + "grad_norm": 1.0844075794437729, + "learning_rate": 3.8041659689893677e-06, + "loss": 0.8168, + "num_input_tokens_seen": 1356180672, + "step": 7509 + }, + { + "epoch": 0.8221352527436437, + "grad_norm": 1.2303496287980735, + "learning_rate": 3.799607693609927e-06, + "loss": 0.7194, + "num_input_tokens_seen": 1356344416, + "step": 7510 + }, + { + "epoch": 0.8222447248145817, + "grad_norm": 1.2282774289494762, + "learning_rate": 3.795051926196358e-06, + "loss": 0.7303, + "num_input_tokens_seen": 1356494048, + "step": 7511 + }, + { + "epoch": 0.8223541968855196, + "grad_norm": 1.1320372987405256, + "learning_rate": 3.7904986672876146e-06, + "loss": 1.0664, + "num_input_tokens_seen": 1356677504, + "step": 7512 + }, + { + "epoch": 0.8224636689564575, + "grad_norm": 1.1945470715330113, + "learning_rate": 3.7859479174223333e-06, + "loss": 0.8093, + "num_input_tokens_seen": 1356860960, + "step": 7513 + }, + { + "epoch": 0.8225731410273954, + "grad_norm": 1.097401816857671, + "learning_rate": 3.7813996771388583e-06, + "loss": 0.8858, + "num_input_tokens_seen": 1357061888, + "step": 7514 + }, + { + "epoch": 0.8226826130983332, + "grad_norm": 1.446058008874439, + "learning_rate": 3.7768539469752397e-06, + "loss": 1.0126, + "num_input_tokens_seen": 1357222720, + "step": 7515 + }, + { + "epoch": 0.8227920851692712, + "grad_norm": 1.2199262664745196, + "learning_rate": 3.7723107274692193e-06, + "loss": 0.8464, + "num_input_tokens_seen": 1357406848, + "step": 7516 + }, + { + "epoch": 0.8229015572402091, + "grad_norm": 1.2144790543579256, + "learning_rate": 3.767770019158273e-06, + "loss": 0.8982, + "num_input_tokens_seen": 1357585824, + "step": 7517 + }, + { + "epoch": 0.823011029311147, + "grad_norm": 1.1972024972524449, + "learning_rate": 3.76323182257953e-06, + "loss": 0.8991, + "num_input_tokens_seen": 1357799968, + "step": 7518 + }, + { + "epoch": 0.8231205013820849, + "grad_norm": 1.1588814150003455, + "learning_rate": 3.758696138269874e-06, + "loss": 0.839, + "num_input_tokens_seen": 1357962816, + "step": 7519 + }, + { + "epoch": 0.8232299734530228, + "grad_norm": 1.045309804049039, + "learning_rate": 3.7541629667658564e-06, + "loss": 0.6424, + "num_input_tokens_seen": 1358142688, + "step": 7520 + }, + { + "epoch": 0.8233394455239607, + "grad_norm": 1.2276047572945215, + "learning_rate": 3.7496323086037456e-06, + "loss": 0.8478, + "num_input_tokens_seen": 1358314720, + "step": 7521 + }, + { + "epoch": 0.8234489175948986, + "grad_norm": 1.1822351898730616, + "learning_rate": 3.7451041643195074e-06, + "loss": 0.8221, + "num_input_tokens_seen": 1358465024, + "step": 7522 + }, + { + "epoch": 0.8235583896658365, + "grad_norm": 1.1502822799049979, + "learning_rate": 3.7405785344488157e-06, + "loss": 0.9457, + "num_input_tokens_seen": 1358640640, + "step": 7523 + }, + { + "epoch": 0.8236678617367744, + "grad_norm": 1.1213989374082596, + "learning_rate": 3.7360554195270403e-06, + "loss": 0.75, + "num_input_tokens_seen": 1358835520, + "step": 7524 + }, + { + "epoch": 0.8237773338077123, + "grad_norm": 1.297478822524278, + "learning_rate": 3.731534820089255e-06, + "loss": 1.0283, + "num_input_tokens_seen": 1359009792, + "step": 7525 + }, + { + "epoch": 0.8238868058786503, + "grad_norm": 1.178231003340848, + "learning_rate": 3.727016736670247e-06, + "loss": 0.9679, + "num_input_tokens_seen": 1359193472, + "step": 7526 + }, + { + "epoch": 0.8239962779495881, + "grad_norm": 1.3158324035116191, + "learning_rate": 3.722501169804493e-06, + "loss": 1.0486, + "num_input_tokens_seen": 1359382080, + "step": 7527 + }, + { + "epoch": 0.824105750020526, + "grad_norm": 1.207392814333406, + "learning_rate": 3.7179881200261753e-06, + "loss": 0.8244, + "num_input_tokens_seen": 1359566208, + "step": 7528 + }, + { + "epoch": 0.8242152220914639, + "grad_norm": 1.2934516857666063, + "learning_rate": 3.7134775878691767e-06, + "loss": 0.8143, + "num_input_tokens_seen": 1359730848, + "step": 7529 + }, + { + "epoch": 0.8243246941624018, + "grad_norm": 1.0831747244610366, + "learning_rate": 3.708969573867088e-06, + "loss": 0.6925, + "num_input_tokens_seen": 1359911392, + "step": 7530 + }, + { + "epoch": 0.8244341662333398, + "grad_norm": 1.3644002786794314, + "learning_rate": 3.7044640785531966e-06, + "loss": 1.038, + "num_input_tokens_seen": 1360111648, + "step": 7531 + }, + { + "epoch": 0.8245436383042776, + "grad_norm": 1.209254270141078, + "learning_rate": 3.699961102460495e-06, + "loss": 0.6033, + "num_input_tokens_seen": 1360298688, + "step": 7532 + }, + { + "epoch": 0.8246531103752155, + "grad_norm": 1.24350217909367, + "learning_rate": 3.6954606461216706e-06, + "loss": 0.8414, + "num_input_tokens_seen": 1360473856, + "step": 7533 + }, + { + "epoch": 0.8247625824461534, + "grad_norm": 1.1892313131837362, + "learning_rate": 3.6909627100691293e-06, + "loss": 0.8728, + "num_input_tokens_seen": 1360681952, + "step": 7534 + }, + { + "epoch": 0.8248720545170913, + "grad_norm": 1.2382741741375367, + "learning_rate": 3.686467294834964e-06, + "loss": 0.9224, + "num_input_tokens_seen": 1360843232, + "step": 7535 + }, + { + "epoch": 0.8249815265880293, + "grad_norm": 1.11383530064265, + "learning_rate": 3.6819744009509715e-06, + "loss": 0.9103, + "num_input_tokens_seen": 1360996448, + "step": 7536 + }, + { + "epoch": 0.8250909986589672, + "grad_norm": 1.2221151501302638, + "learning_rate": 3.677484028948658e-06, + "loss": 0.816, + "num_input_tokens_seen": 1361169600, + "step": 7537 + }, + { + "epoch": 0.825200470729905, + "grad_norm": 1.2242672577713283, + "learning_rate": 3.67299617935922e-06, + "loss": 0.8759, + "num_input_tokens_seen": 1361341856, + "step": 7538 + }, + { + "epoch": 0.8253099428008429, + "grad_norm": 1.2778003588123208, + "learning_rate": 3.6685108527135635e-06, + "loss": 0.8793, + "num_input_tokens_seen": 1361507168, + "step": 7539 + }, + { + "epoch": 0.8254194148717808, + "grad_norm": 1.1705765206094438, + "learning_rate": 3.664028049542287e-06, + "loss": 0.8645, + "num_input_tokens_seen": 1361678528, + "step": 7540 + }, + { + "epoch": 0.8255288869427188, + "grad_norm": 1.15706744810324, + "learning_rate": 3.659547770375718e-06, + "loss": 0.9495, + "num_input_tokens_seen": 1361865792, + "step": 7541 + }, + { + "epoch": 0.8256383590136567, + "grad_norm": 1.2070652731634122, + "learning_rate": 3.655070015743839e-06, + "loss": 1.0012, + "num_input_tokens_seen": 1362046336, + "step": 7542 + }, + { + "epoch": 0.8257478310845946, + "grad_norm": 1.4438332075441662, + "learning_rate": 3.6505947861763867e-06, + "loss": 0.8556, + "num_input_tokens_seen": 1362197984, + "step": 7543 + }, + { + "epoch": 0.8258573031555324, + "grad_norm": 1.179950406923559, + "learning_rate": 3.6461220822027437e-06, + "loss": 0.8818, + "num_input_tokens_seen": 1362373600, + "step": 7544 + }, + { + "epoch": 0.8259667752264703, + "grad_norm": 1.0850411427515654, + "learning_rate": 3.641651904352045e-06, + "loss": 0.6813, + "num_input_tokens_seen": 1362539136, + "step": 7545 + }, + { + "epoch": 0.8260762472974083, + "grad_norm": 1.2525730829750672, + "learning_rate": 3.637184253153095e-06, + "loss": 1.0741, + "num_input_tokens_seen": 1362695264, + "step": 7546 + }, + { + "epoch": 0.8261857193683462, + "grad_norm": 1.09813541257608, + "learning_rate": 3.6327191291344015e-06, + "loss": 0.7019, + "num_input_tokens_seen": 1362886112, + "step": 7547 + }, + { + "epoch": 0.8262951914392841, + "grad_norm": 1.22211978343386, + "learning_rate": 3.6282565328242007e-06, + "loss": 0.9411, + "num_input_tokens_seen": 1363078752, + "step": 7548 + }, + { + "epoch": 0.8264046635102219, + "grad_norm": 1.1383176484896098, + "learning_rate": 3.623796464750384e-06, + "loss": 0.8995, + "num_input_tokens_seen": 1363248320, + "step": 7549 + }, + { + "epoch": 0.8265141355811598, + "grad_norm": 1.3055683292052607, + "learning_rate": 3.6193389254405934e-06, + "loss": 0.9078, + "num_input_tokens_seen": 1363435808, + "step": 7550 + }, + { + "epoch": 0.8266236076520977, + "grad_norm": 1.1847449014412719, + "learning_rate": 3.6148839154221236e-06, + "loss": 0.6517, + "num_input_tokens_seen": 1363602688, + "step": 7551 + }, + { + "epoch": 0.8267330797230357, + "grad_norm": 1.0955597754831998, + "learning_rate": 3.610431435222017e-06, + "loss": 0.7979, + "num_input_tokens_seen": 1363788384, + "step": 7552 + }, + { + "epoch": 0.8268425517939736, + "grad_norm": 1.1506671670276825, + "learning_rate": 3.605981485366969e-06, + "loss": 0.7443, + "num_input_tokens_seen": 1363944288, + "step": 7553 + }, + { + "epoch": 0.8269520238649115, + "grad_norm": 1.1903261385311827, + "learning_rate": 3.601534066383419e-06, + "loss": 0.9126, + "num_input_tokens_seen": 1364135360, + "step": 7554 + }, + { + "epoch": 0.8270614959358493, + "grad_norm": 1.187337814097345, + "learning_rate": 3.597089178797483e-06, + "loss": 0.761, + "num_input_tokens_seen": 1364321280, + "step": 7555 + }, + { + "epoch": 0.8271709680067872, + "grad_norm": 1.2002298770704303, + "learning_rate": 3.5926468231349817e-06, + "loss": 1.2141, + "num_input_tokens_seen": 1364514816, + "step": 7556 + }, + { + "epoch": 0.8272804400777252, + "grad_norm": 1.2425514024045414, + "learning_rate": 3.5882069999214366e-06, + "loss": 0.7947, + "num_input_tokens_seen": 1364708800, + "step": 7557 + }, + { + "epoch": 0.8273899121486631, + "grad_norm": 1.1230025572947244, + "learning_rate": 3.583769709682064e-06, + "loss": 0.8245, + "num_input_tokens_seen": 1364902784, + "step": 7558 + }, + { + "epoch": 0.827499384219601, + "grad_norm": 1.1899274810364422, + "learning_rate": 3.579334952941807e-06, + "loss": 0.8493, + "num_input_tokens_seen": 1365069888, + "step": 7559 + }, + { + "epoch": 0.8276088562905389, + "grad_norm": 1.1814761717302873, + "learning_rate": 3.574902730225263e-06, + "loss": 0.8438, + "num_input_tokens_seen": 1365244608, + "step": 7560 + }, + { + "epoch": 0.8277183283614767, + "grad_norm": 1.070288553457311, + "learning_rate": 3.570473042056777e-06, + "loss": 0.9277, + "num_input_tokens_seen": 1365436800, + "step": 7561 + }, + { + "epoch": 0.8278278004324147, + "grad_norm": 1.1234081450169957, + "learning_rate": 3.5660458889603594e-06, + "loss": 0.9307, + "num_input_tokens_seen": 1365613984, + "step": 7562 + }, + { + "epoch": 0.8279372725033526, + "grad_norm": 1.1394984286315448, + "learning_rate": 3.561621271459742e-06, + "loss": 0.8611, + "num_input_tokens_seen": 1365812448, + "step": 7563 + }, + { + "epoch": 0.8280467445742905, + "grad_norm": 1.1864424564057292, + "learning_rate": 3.557199190078342e-06, + "loss": 0.9351, + "num_input_tokens_seen": 1365991872, + "step": 7564 + }, + { + "epoch": 0.8281562166452284, + "grad_norm": 1.3386701331726978, + "learning_rate": 3.5527796453392882e-06, + "loss": 0.9575, + "num_input_tokens_seen": 1366165248, + "step": 7565 + }, + { + "epoch": 0.8282656887161662, + "grad_norm": 1.0889801103686436, + "learning_rate": 3.548362637765401e-06, + "loss": 0.9666, + "num_input_tokens_seen": 1366350496, + "step": 7566 + }, + { + "epoch": 0.8283751607871042, + "grad_norm": 1.214348891562154, + "learning_rate": 3.543948167879202e-06, + "loss": 0.8587, + "num_input_tokens_seen": 1366517376, + "step": 7567 + }, + { + "epoch": 0.8284846328580421, + "grad_norm": 1.1438252742156778, + "learning_rate": 3.5395362362029198e-06, + "loss": 0.7803, + "num_input_tokens_seen": 1366671712, + "step": 7568 + }, + { + "epoch": 0.82859410492898, + "grad_norm": 1.151525899028512, + "learning_rate": 3.5351268432584796e-06, + "loss": 0.6866, + "num_input_tokens_seen": 1366827616, + "step": 7569 + }, + { + "epoch": 0.8287035769999179, + "grad_norm": 1.2072039730598185, + "learning_rate": 3.5307199895674963e-06, + "loss": 0.9221, + "num_input_tokens_seen": 1366995616, + "step": 7570 + }, + { + "epoch": 0.8288130490708558, + "grad_norm": 1.2059530732749952, + "learning_rate": 3.5263156756512983e-06, + "loss": 0.9502, + "num_input_tokens_seen": 1367181536, + "step": 7571 + }, + { + "epoch": 0.8289225211417937, + "grad_norm": 1.2356864143665105, + "learning_rate": 3.521913902030902e-06, + "loss": 0.859, + "num_input_tokens_seen": 1367358496, + "step": 7572 + }, + { + "epoch": 0.8290319932127316, + "grad_norm": 1.2739168914683596, + "learning_rate": 3.5175146692270344e-06, + "loss": 0.8655, + "num_input_tokens_seen": 1367540832, + "step": 7573 + }, + { + "epoch": 0.8291414652836695, + "grad_norm": 1.1416433308761902, + "learning_rate": 3.5131179777601136e-06, + "loss": 0.808, + "num_input_tokens_seen": 1367724960, + "step": 7574 + }, + { + "epoch": 0.8292509373546074, + "grad_norm": 1.300266610829947, + "learning_rate": 3.508723828150254e-06, + "loss": 0.7754, + "num_input_tokens_seen": 1367863616, + "step": 7575 + }, + { + "epoch": 0.8293604094255453, + "grad_norm": 1.256203991251464, + "learning_rate": 3.504332220917289e-06, + "loss": 1.0484, + "num_input_tokens_seen": 1368075072, + "step": 7576 + }, + { + "epoch": 0.8294698814964833, + "grad_norm": 1.239946274824064, + "learning_rate": 3.499943156580726e-06, + "loss": 0.9036, + "num_input_tokens_seen": 1368244192, + "step": 7577 + }, + { + "epoch": 0.8295793535674211, + "grad_norm": 1.3706416936258736, + "learning_rate": 3.4955566356597887e-06, + "loss": 1.1906, + "num_input_tokens_seen": 1368401888, + "step": 7578 + }, + { + "epoch": 0.829688825638359, + "grad_norm": 1.2389938277736618, + "learning_rate": 3.491172658673392e-06, + "loss": 1.1065, + "num_input_tokens_seen": 1368578848, + "step": 7579 + }, + { + "epoch": 0.8297982977092969, + "grad_norm": 1.1583953749327245, + "learning_rate": 3.4867912261401458e-06, + "loss": 0.903, + "num_input_tokens_seen": 1368767680, + "step": 7580 + }, + { + "epoch": 0.8299077697802348, + "grad_norm": 1.1240409896103245, + "learning_rate": 3.4824123385783807e-06, + "loss": 0.7845, + "num_input_tokens_seen": 1368932992, + "step": 7581 + }, + { + "epoch": 0.8300172418511728, + "grad_norm": 1.1244304062750652, + "learning_rate": 3.4780359965060934e-06, + "loss": 0.7248, + "num_input_tokens_seen": 1369096064, + "step": 7582 + }, + { + "epoch": 0.8301267139221106, + "grad_norm": 1.1317849596209604, + "learning_rate": 3.4736622004410136e-06, + "loss": 0.8587, + "num_input_tokens_seen": 1369294752, + "step": 7583 + }, + { + "epoch": 0.8302361859930485, + "grad_norm": 1.2588399635600611, + "learning_rate": 3.469290950900533e-06, + "loss": 0.9185, + "num_input_tokens_seen": 1369470816, + "step": 7584 + }, + { + "epoch": 0.8303456580639864, + "grad_norm": 1.1608793015608292, + "learning_rate": 3.4649222484017836e-06, + "loss": 1.2163, + "num_input_tokens_seen": 1369676448, + "step": 7585 + }, + { + "epoch": 0.8304551301349243, + "grad_norm": 1.1534881598861597, + "learning_rate": 3.460556093461556e-06, + "loss": 0.8272, + "num_input_tokens_seen": 1369842656, + "step": 7586 + }, + { + "epoch": 0.8305646022058623, + "grad_norm": 1.1852097762611973, + "learning_rate": 3.4561924865963685e-06, + "loss": 0.8145, + "num_input_tokens_seen": 1370022304, + "step": 7587 + }, + { + "epoch": 0.8306740742768002, + "grad_norm": 1.1385397466356586, + "learning_rate": 3.4518314283224275e-06, + "loss": 1.0451, + "num_input_tokens_seen": 1370218752, + "step": 7588 + }, + { + "epoch": 0.830783546347738, + "grad_norm": 1.2272182582558844, + "learning_rate": 3.447472919155628e-06, + "loss": 0.6608, + "num_input_tokens_seen": 1370383840, + "step": 7589 + }, + { + "epoch": 0.8308930184186759, + "grad_norm": 1.2175704432010492, + "learning_rate": 3.443116959611592e-06, + "loss": 0.7695, + "num_input_tokens_seen": 1370551616, + "step": 7590 + }, + { + "epoch": 0.8310024904896138, + "grad_norm": 1.1058337683965482, + "learning_rate": 3.438763550205601e-06, + "loss": 0.7232, + "num_input_tokens_seen": 1370696992, + "step": 7591 + }, + { + "epoch": 0.8311119625605518, + "grad_norm": 1.064990378770665, + "learning_rate": 3.4344126914526735e-06, + "loss": 0.6909, + "num_input_tokens_seen": 1370866560, + "step": 7592 + }, + { + "epoch": 0.8312214346314897, + "grad_norm": 1.428587193061495, + "learning_rate": 3.430064383867487e-06, + "loss": 1.0863, + "num_input_tokens_seen": 1371055392, + "step": 7593 + }, + { + "epoch": 0.8313309067024276, + "grad_norm": 1.046842432227208, + "learning_rate": 3.4257186279644554e-06, + "loss": 0.8436, + "num_input_tokens_seen": 1371222720, + "step": 7594 + }, + { + "epoch": 0.8314403787733654, + "grad_norm": 1.1831421446930839, + "learning_rate": 3.4213754242576668e-06, + "loss": 0.8846, + "num_input_tokens_seen": 1371439328, + "step": 7595 + }, + { + "epoch": 0.8315498508443033, + "grad_norm": 1.1983826546385243, + "learning_rate": 3.417034773260913e-06, + "loss": 0.7815, + "num_input_tokens_seen": 1371591200, + "step": 7596 + }, + { + "epoch": 0.8316593229152413, + "grad_norm": 1.1995748680763878, + "learning_rate": 3.4126966754876867e-06, + "loss": 1.1292, + "num_input_tokens_seen": 1371776224, + "step": 7597 + }, + { + "epoch": 0.8317687949861792, + "grad_norm": 1.0134015908174017, + "learning_rate": 3.4083611314511763e-06, + "loss": 0.7694, + "num_input_tokens_seen": 1371977600, + "step": 7598 + }, + { + "epoch": 0.8318782670571171, + "grad_norm": 1.1250740132886825, + "learning_rate": 3.4040281416642672e-06, + "loss": 0.9288, + "num_input_tokens_seen": 1372148288, + "step": 7599 + }, + { + "epoch": 0.8319877391280549, + "grad_norm": 1.1019400632693346, + "learning_rate": 3.3996977066395376e-06, + "loss": 0.8855, + "num_input_tokens_seen": 1372305536, + "step": 7600 + }, + { + "epoch": 0.8320972111989928, + "grad_norm": 1.1207631118691566, + "learning_rate": 3.3953698268892857e-06, + "loss": 0.6935, + "num_input_tokens_seen": 1372483840, + "step": 7601 + }, + { + "epoch": 0.8322066832699307, + "grad_norm": 1.1567135474482089, + "learning_rate": 3.391044502925478e-06, + "loss": 0.8322, + "num_input_tokens_seen": 1372691712, + "step": 7602 + }, + { + "epoch": 0.8323161553408687, + "grad_norm": 1.2975882441771338, + "learning_rate": 3.3867217352597984e-06, + "loss": 0.9013, + "num_input_tokens_seen": 1372867104, + "step": 7603 + }, + { + "epoch": 0.8324256274118066, + "grad_norm": 1.241804675725415, + "learning_rate": 3.3824015244036222e-06, + "loss": 0.9737, + "num_input_tokens_seen": 1373060192, + "step": 7604 + }, + { + "epoch": 0.8325350994827445, + "grad_norm": 1.1630107044963918, + "learning_rate": 3.3780838708680153e-06, + "loss": 0.7662, + "num_input_tokens_seen": 1373264704, + "step": 7605 + }, + { + "epoch": 0.8326445715536823, + "grad_norm": 1.2872926526555384, + "learning_rate": 3.373768775163755e-06, + "loss": 0.831, + "num_input_tokens_seen": 1373380736, + "step": 7606 + }, + { + "epoch": 0.8327540436246202, + "grad_norm": 1.4090541009066762, + "learning_rate": 3.3694562378013076e-06, + "loss": 0.8734, + "num_input_tokens_seen": 1373528576, + "step": 7607 + }, + { + "epoch": 0.8328635156955582, + "grad_norm": 1.0294182432595917, + "learning_rate": 3.3651462592908275e-06, + "loss": 0.8528, + "num_input_tokens_seen": 1373733312, + "step": 7608 + }, + { + "epoch": 0.8329729877664961, + "grad_norm": 1.118145245389566, + "learning_rate": 3.3608388401421943e-06, + "loss": 0.7326, + "num_input_tokens_seen": 1373911168, + "step": 7609 + }, + { + "epoch": 0.833082459837434, + "grad_norm": 1.078615657446313, + "learning_rate": 3.356533980864959e-06, + "loss": 0.8214, + "num_input_tokens_seen": 1374110080, + "step": 7610 + }, + { + "epoch": 0.8331919319083719, + "grad_norm": 1.2361682469072992, + "learning_rate": 3.352231681968379e-06, + "loss": 0.9811, + "num_input_tokens_seen": 1374275616, + "step": 7611 + }, + { + "epoch": 0.8333014039793097, + "grad_norm": 1.1547892854709474, + "learning_rate": 3.347931943961405e-06, + "loss": 0.7587, + "num_input_tokens_seen": 1374459296, + "step": 7612 + }, + { + "epoch": 0.8334108760502477, + "grad_norm": 1.1140735684523138, + "learning_rate": 3.3436347673526936e-06, + "loss": 0.8577, + "num_input_tokens_seen": 1374632672, + "step": 7613 + }, + { + "epoch": 0.8335203481211856, + "grad_norm": 1.1456209795143018, + "learning_rate": 3.3393401526505856e-06, + "loss": 0.721, + "num_input_tokens_seen": 1374798432, + "step": 7614 + }, + { + "epoch": 0.8336298201921235, + "grad_norm": 1.159966655613695, + "learning_rate": 3.335048100363125e-06, + "loss": 1.0255, + "num_input_tokens_seen": 1374968224, + "step": 7615 + }, + { + "epoch": 0.8337392922630614, + "grad_norm": 1.2060639786458918, + "learning_rate": 3.330758610998072e-06, + "loss": 0.9368, + "num_input_tokens_seen": 1375150112, + "step": 7616 + }, + { + "epoch": 0.8338487643339992, + "grad_norm": 1.2357746832047667, + "learning_rate": 3.3264716850628375e-06, + "loss": 0.8971, + "num_input_tokens_seen": 1375333792, + "step": 7617 + }, + { + "epoch": 0.8339582364049372, + "grad_norm": 1.2590788159292015, + "learning_rate": 3.322187323064574e-06, + "loss": 1.0951, + "num_input_tokens_seen": 1375522624, + "step": 7618 + }, + { + "epoch": 0.8340677084758751, + "grad_norm": 1.0383035856877028, + "learning_rate": 3.3179055255101096e-06, + "loss": 0.8395, + "num_input_tokens_seen": 1375707872, + "step": 7619 + }, + { + "epoch": 0.834177180546813, + "grad_norm": 1.0255229174333005, + "learning_rate": 3.3136262929059746e-06, + "loss": 0.8208, + "num_input_tokens_seen": 1375885056, + "step": 7620 + }, + { + "epoch": 0.8342866526177509, + "grad_norm": 1.0084725517656863, + "learning_rate": 3.30934962575839e-06, + "loss": 0.8335, + "num_input_tokens_seen": 1376069184, + "step": 7621 + }, + { + "epoch": 0.8343961246886888, + "grad_norm": 1.1769095548559707, + "learning_rate": 3.3050755245732758e-06, + "loss": 0.8384, + "num_input_tokens_seen": 1376243232, + "step": 7622 + }, + { + "epoch": 0.8345055967596267, + "grad_norm": 1.131917297230173, + "learning_rate": 3.3008039898562603e-06, + "loss": 1.0023, + "num_input_tokens_seen": 1376403616, + "step": 7623 + }, + { + "epoch": 0.8346150688305646, + "grad_norm": 1.2162916870657332, + "learning_rate": 3.296535022112643e-06, + "loss": 1.0856, + "num_input_tokens_seen": 1376597824, + "step": 7624 + }, + { + "epoch": 0.8347245409015025, + "grad_norm": 1.1471995214187283, + "learning_rate": 3.2922686218474524e-06, + "loss": 0.8147, + "num_input_tokens_seen": 1376777920, + "step": 7625 + }, + { + "epoch": 0.8348340129724404, + "grad_norm": 1.1347445618977563, + "learning_rate": 3.288004789565377e-06, + "loss": 0.7421, + "num_input_tokens_seen": 1376900672, + "step": 7626 + }, + { + "epoch": 0.8349434850433783, + "grad_norm": 1.1867620283199924, + "learning_rate": 3.2837435257708383e-06, + "loss": 0.851, + "num_input_tokens_seen": 1377093088, + "step": 7627 + }, + { + "epoch": 0.8350529571143163, + "grad_norm": 1.114673640552939, + "learning_rate": 3.2794848309679134e-06, + "loss": 0.9608, + "num_input_tokens_seen": 1377285280, + "step": 7628 + }, + { + "epoch": 0.8351624291852541, + "grad_norm": 1.122620768772804, + "learning_rate": 3.2752287056604187e-06, + "loss": 0.7284, + "num_input_tokens_seen": 1377472544, + "step": 7629 + }, + { + "epoch": 0.835271901256192, + "grad_norm": 1.1477350175494945, + "learning_rate": 3.270975150351835e-06, + "loss": 0.8578, + "num_input_tokens_seen": 1377643008, + "step": 7630 + }, + { + "epoch": 0.8353813733271299, + "grad_norm": 1.2603387519567262, + "learning_rate": 3.2667241655453485e-06, + "loss": 0.9372, + "num_input_tokens_seen": 1377822656, + "step": 7631 + }, + { + "epoch": 0.8354908453980678, + "grad_norm": 1.2906893088365168, + "learning_rate": 3.262475751743857e-06, + "loss": 1.0047, + "num_input_tokens_seen": 1377994016, + "step": 7632 + }, + { + "epoch": 0.8356003174690058, + "grad_norm": 1.08424509023338, + "learning_rate": 3.2582299094499168e-06, + "loss": 0.6135, + "num_input_tokens_seen": 1378154400, + "step": 7633 + }, + { + "epoch": 0.8357097895399436, + "grad_norm": 1.1899273808544593, + "learning_rate": 3.253986639165826e-06, + "loss": 0.7317, + "num_input_tokens_seen": 1378318368, + "step": 7634 + }, + { + "epoch": 0.8358192616108815, + "grad_norm": 1.0526679589547683, + "learning_rate": 3.2497459413935337e-06, + "loss": 0.8143, + "num_input_tokens_seen": 1378525568, + "step": 7635 + }, + { + "epoch": 0.8359287336818194, + "grad_norm": 1.0825455748058266, + "learning_rate": 3.2455078166347242e-06, + "loss": 0.7541, + "num_input_tokens_seen": 1378729408, + "step": 7636 + }, + { + "epoch": 0.8360382057527573, + "grad_norm": 1.3421016163820731, + "learning_rate": 3.241272265390752e-06, + "loss": 0.7979, + "num_input_tokens_seen": 1378898528, + "step": 7637 + }, + { + "epoch": 0.8361476778236953, + "grad_norm": 1.0979422743028961, + "learning_rate": 3.2370392881626743e-06, + "loss": 0.8118, + "num_input_tokens_seen": 1379094976, + "step": 7638 + }, + { + "epoch": 0.8362571498946332, + "grad_norm": 1.212039126475949, + "learning_rate": 3.232808885451244e-06, + "loss": 0.7194, + "num_input_tokens_seen": 1379291424, + "step": 7639 + }, + { + "epoch": 0.836366621965571, + "grad_norm": 1.138523883919198, + "learning_rate": 3.228581057756913e-06, + "loss": 0.692, + "num_input_tokens_seen": 1379436800, + "step": 7640 + }, + { + "epoch": 0.8364760940365089, + "grad_norm": 1.123822708832833, + "learning_rate": 3.2243558055798234e-06, + "loss": 0.9853, + "num_input_tokens_seen": 1379622272, + "step": 7641 + }, + { + "epoch": 0.8365855661074468, + "grad_norm": 1.175990342720109, + "learning_rate": 3.2201331294198057e-06, + "loss": 0.606, + "num_input_tokens_seen": 1379767424, + "step": 7642 + }, + { + "epoch": 0.8366950381783848, + "grad_norm": 1.3468244998961476, + "learning_rate": 3.21591302977641e-06, + "loss": 1.0203, + "num_input_tokens_seen": 1379947296, + "step": 7643 + }, + { + "epoch": 0.8368045102493227, + "grad_norm": 1.2074163125143578, + "learning_rate": 3.2116955071488597e-06, + "loss": 0.9275, + "num_input_tokens_seen": 1380128960, + "step": 7644 + }, + { + "epoch": 0.8369139823202606, + "grad_norm": 1.1770988001974731, + "learning_rate": 3.2074805620360775e-06, + "loss": 1.1151, + "num_input_tokens_seen": 1380315776, + "step": 7645 + }, + { + "epoch": 0.8370234543911984, + "grad_norm": 1.0688790516710183, + "learning_rate": 3.2032681949366845e-06, + "loss": 1.0043, + "num_input_tokens_seen": 1380523872, + "step": 7646 + }, + { + "epoch": 0.8371329264621363, + "grad_norm": 1.2377783298567298, + "learning_rate": 3.1990584063489955e-06, + "loss": 1.0074, + "num_input_tokens_seen": 1380703520, + "step": 7647 + }, + { + "epoch": 0.8372423985330743, + "grad_norm": 1.2494199837643758, + "learning_rate": 3.194851196771015e-06, + "loss": 0.7592, + "num_input_tokens_seen": 1380868160, + "step": 7648 + }, + { + "epoch": 0.8373518706040122, + "grad_norm": 1.2382928023416169, + "learning_rate": 3.190646566700464e-06, + "loss": 0.9049, + "num_input_tokens_seen": 1381070208, + "step": 7649 + }, + { + "epoch": 0.8374613426749501, + "grad_norm": 1.2082013134799405, + "learning_rate": 3.1864445166347235e-06, + "loss": 1.0248, + "num_input_tokens_seen": 1381256576, + "step": 7650 + }, + { + "epoch": 0.8375708147458879, + "grad_norm": 1.2337730968288367, + "learning_rate": 3.1822450470709003e-06, + "loss": 0.9314, + "num_input_tokens_seen": 1381438016, + "step": 7651 + }, + { + "epoch": 0.8376802868168258, + "grad_norm": 1.247561985914542, + "learning_rate": 3.178048158505778e-06, + "loss": 0.8797, + "num_input_tokens_seen": 1381624832, + "step": 7652 + }, + { + "epoch": 0.8377897588877637, + "grad_norm": 1.1644023936679182, + "learning_rate": 3.1738538514358457e-06, + "loss": 0.8262, + "num_input_tokens_seen": 1381786560, + "step": 7653 + }, + { + "epoch": 0.8378992309587017, + "grad_norm": 1.2259153857972664, + "learning_rate": 3.1696621263572755e-06, + "loss": 0.8704, + "num_input_tokens_seen": 1381950528, + "step": 7654 + }, + { + "epoch": 0.8380087030296396, + "grad_norm": 1.186001886723322, + "learning_rate": 3.165472983765938e-06, + "loss": 0.7766, + "num_input_tokens_seen": 1382095232, + "step": 7655 + }, + { + "epoch": 0.8381181751005775, + "grad_norm": 1.0557145945605526, + "learning_rate": 3.161286424157417e-06, + "loss": 0.6433, + "num_input_tokens_seen": 1382256960, + "step": 7656 + }, + { + "epoch": 0.8382276471715153, + "grad_norm": 1.1704791084129615, + "learning_rate": 3.1571024480269524e-06, + "loss": 0.8679, + "num_input_tokens_seen": 1382443104, + "step": 7657 + }, + { + "epoch": 0.8383371192424532, + "grad_norm": 1.052082319857169, + "learning_rate": 3.152921055869523e-06, + "loss": 0.7664, + "num_input_tokens_seen": 1382631488, + "step": 7658 + }, + { + "epoch": 0.8384465913133912, + "grad_norm": 1.3005784251770482, + "learning_rate": 3.1487422481797565e-06, + "loss": 0.9303, + "num_input_tokens_seen": 1382831520, + "step": 7659 + }, + { + "epoch": 0.8385560633843291, + "grad_norm": 1.2437555265064235, + "learning_rate": 3.1445660254520173e-06, + "loss": 0.9343, + "num_input_tokens_seen": 1383037824, + "step": 7660 + }, + { + "epoch": 0.838665535455267, + "grad_norm": 1.1875976221213043, + "learning_rate": 3.1403923881803354e-06, + "loss": 0.7319, + "num_input_tokens_seen": 1383221728, + "step": 7661 + }, + { + "epoch": 0.8387750075262049, + "grad_norm": 1.2246398921978816, + "learning_rate": 3.1362213368584442e-06, + "loss": 1.0251, + "num_input_tokens_seen": 1383397120, + "step": 7662 + }, + { + "epoch": 0.8388844795971427, + "grad_norm": 1.311301592529163, + "learning_rate": 3.132052871979774e-06, + "loss": 0.9082, + "num_input_tokens_seen": 1383594464, + "step": 7663 + }, + { + "epoch": 0.8389939516680807, + "grad_norm": 1.2995532313500535, + "learning_rate": 3.1278869940374378e-06, + "loss": 1.176, + "num_input_tokens_seen": 1383768064, + "step": 7664 + }, + { + "epoch": 0.8391034237390186, + "grad_norm": 1.1292144460293891, + "learning_rate": 3.12372370352427e-06, + "loss": 0.7747, + "num_input_tokens_seen": 1383945024, + "step": 7665 + }, + { + "epoch": 0.8392128958099565, + "grad_norm": 1.0507275876094848, + "learning_rate": 3.119563000932757e-06, + "loss": 0.8138, + "num_input_tokens_seen": 1384124896, + "step": 7666 + }, + { + "epoch": 0.8393223678808944, + "grad_norm": 1.1886305697482404, + "learning_rate": 3.115404886755122e-06, + "loss": 0.813, + "num_input_tokens_seen": 1384302976, + "step": 7667 + }, + { + "epoch": 0.8394318399518322, + "grad_norm": 1.3704863504330598, + "learning_rate": 3.1112493614832426e-06, + "loss": 0.818, + "num_input_tokens_seen": 1384450592, + "step": 7668 + }, + { + "epoch": 0.8395413120227702, + "grad_norm": 1.157320351964761, + "learning_rate": 3.107096425608727e-06, + "loss": 0.9293, + "num_input_tokens_seen": 1384660928, + "step": 7669 + }, + { + "epoch": 0.8396507840937081, + "grad_norm": 1.1831201795670236, + "learning_rate": 3.1029460796228483e-06, + "loss": 0.7254, + "num_input_tokens_seen": 1384825792, + "step": 7670 + }, + { + "epoch": 0.839760256164646, + "grad_norm": 1.2775824554170288, + "learning_rate": 3.0987983240165914e-06, + "loss": 1.048, + "num_input_tokens_seen": 1385018880, + "step": 7671 + }, + { + "epoch": 0.8398697282355839, + "grad_norm": 1.1522532282043512, + "learning_rate": 3.0946531592806222e-06, + "loss": 0.721, + "num_input_tokens_seen": 1385186208, + "step": 7672 + }, + { + "epoch": 0.8399792003065218, + "grad_norm": 1.1375428977674509, + "learning_rate": 3.0905105859053068e-06, + "loss": 0.8112, + "num_input_tokens_seen": 1385369888, + "step": 7673 + }, + { + "epoch": 0.8400886723774597, + "grad_norm": 1.1433986228303354, + "learning_rate": 3.0863706043807115e-06, + "loss": 0.8217, + "num_input_tokens_seen": 1385575520, + "step": 7674 + }, + { + "epoch": 0.8401981444483976, + "grad_norm": 1.0908842418109137, + "learning_rate": 3.0822332151965754e-06, + "loss": 0.6689, + "num_input_tokens_seen": 1385768384, + "step": 7675 + }, + { + "epoch": 0.8403076165193355, + "grad_norm": 1.1738192895947117, + "learning_rate": 3.078098418842354e-06, + "loss": 0.8717, + "num_input_tokens_seen": 1385946464, + "step": 7676 + }, + { + "epoch": 0.8404170885902734, + "grad_norm": 1.046208867183993, + "learning_rate": 3.073966215807181e-06, + "loss": 0.7271, + "num_input_tokens_seen": 1386109760, + "step": 7677 + }, + { + "epoch": 0.8405265606612113, + "grad_norm": 1.0564881953434333, + "learning_rate": 3.06983660657989e-06, + "loss": 0.758, + "num_input_tokens_seen": 1386275296, + "step": 7678 + }, + { + "epoch": 0.8406360327321493, + "grad_norm": 1.1838393255607798, + "learning_rate": 3.0657095916490046e-06, + "loss": 0.8687, + "num_input_tokens_seen": 1386454496, + "step": 7679 + }, + { + "epoch": 0.8407455048030871, + "grad_norm": 1.2993737014242916, + "learning_rate": 3.0615851715027426e-06, + "loss": 0.821, + "num_input_tokens_seen": 1386634592, + "step": 7680 + }, + { + "epoch": 0.840854976874025, + "grad_norm": 1.195409334536316, + "learning_rate": 3.0574633466290166e-06, + "loss": 1.0898, + "num_input_tokens_seen": 1386850752, + "step": 7681 + }, + { + "epoch": 0.8409644489449629, + "grad_norm": 1.287851678168549, + "learning_rate": 3.0533441175154305e-06, + "loss": 1.0014, + "num_input_tokens_seen": 1387013824, + "step": 7682 + }, + { + "epoch": 0.8410739210159008, + "grad_norm": 1.1429186712885535, + "learning_rate": 3.049227484649275e-06, + "loss": 0.8438, + "num_input_tokens_seen": 1387199968, + "step": 7683 + }, + { + "epoch": 0.8411833930868388, + "grad_norm": 1.2408642708683537, + "learning_rate": 3.04511344851755e-06, + "loss": 0.872, + "num_input_tokens_seen": 1387379392, + "step": 7684 + }, + { + "epoch": 0.8412928651577766, + "grad_norm": 1.112069095783894, + "learning_rate": 3.041002009606933e-06, + "loss": 0.6675, + "num_input_tokens_seen": 1387537088, + "step": 7685 + }, + { + "epoch": 0.8414023372287145, + "grad_norm": 1.161249846262352, + "learning_rate": 3.036893168403801e-06, + "loss": 0.7315, + "num_input_tokens_seen": 1387696352, + "step": 7686 + }, + { + "epoch": 0.8415118092996524, + "grad_norm": 1.1904070056384837, + "learning_rate": 3.0327869253942183e-06, + "loss": 0.7395, + "num_input_tokens_seen": 1387824032, + "step": 7687 + }, + { + "epoch": 0.8416212813705903, + "grad_norm": 1.141403559603905, + "learning_rate": 3.0286832810639515e-06, + "loss": 0.6728, + "num_input_tokens_seen": 1387993824, + "step": 7688 + }, + { + "epoch": 0.8417307534415283, + "grad_norm": 1.2445164566453937, + "learning_rate": 3.024582235898449e-06, + "loss": 1.0413, + "num_input_tokens_seen": 1388176384, + "step": 7689 + }, + { + "epoch": 0.8418402255124662, + "grad_norm": 1.1029137514482015, + "learning_rate": 3.0204837903828525e-06, + "loss": 0.8173, + "num_input_tokens_seen": 1388377984, + "step": 7690 + }, + { + "epoch": 0.841949697583404, + "grad_norm": 1.1985343943670936, + "learning_rate": 3.0163879450020166e-06, + "loss": 0.908, + "num_input_tokens_seen": 1388514176, + "step": 7691 + }, + { + "epoch": 0.8420591696543419, + "grad_norm": 1.1923548689683585, + "learning_rate": 3.0122947002404504e-06, + "loss": 0.9092, + "num_input_tokens_seen": 1388690464, + "step": 7692 + }, + { + "epoch": 0.8421686417252798, + "grad_norm": 1.2494346293750886, + "learning_rate": 3.008204056582392e-06, + "loss": 1.1221, + "num_input_tokens_seen": 1388879968, + "step": 7693 + }, + { + "epoch": 0.8422781137962178, + "grad_norm": 1.3142628411559503, + "learning_rate": 3.004116014511754e-06, + "loss": 0.7336, + "num_input_tokens_seen": 1389056256, + "step": 7694 + }, + { + "epoch": 0.8423875858671557, + "grad_norm": 1.1714815623728845, + "learning_rate": 3.0000305745121443e-06, + "loss": 1.0401, + "num_input_tokens_seen": 1389235456, + "step": 7695 + }, + { + "epoch": 0.8424970579380936, + "grad_norm": 1.2021380748658514, + "learning_rate": 2.995947737066859e-06, + "loss": 0.9094, + "num_input_tokens_seen": 1389449376, + "step": 7696 + }, + { + "epoch": 0.8426065300090314, + "grad_norm": 1.2155042612871225, + "learning_rate": 2.9918675026588876e-06, + "loss": 0.8304, + "num_input_tokens_seen": 1389636192, + "step": 7697 + }, + { + "epoch": 0.8427160020799693, + "grad_norm": 1.2753620699479717, + "learning_rate": 2.987789871770927e-06, + "loss": 0.9536, + "num_input_tokens_seen": 1389837120, + "step": 7698 + }, + { + "epoch": 0.8428254741509073, + "grad_norm": 1.1340384328206243, + "learning_rate": 2.9837148448853353e-06, + "loss": 0.9262, + "num_input_tokens_seen": 1390019904, + "step": 7699 + }, + { + "epoch": 0.8429349462218452, + "grad_norm": 1.2911227783426285, + "learning_rate": 2.979642422484197e-06, + "loss": 0.9046, + "num_input_tokens_seen": 1390229568, + "step": 7700 + }, + { + "epoch": 0.8430444182927831, + "grad_norm": 1.0588501904674552, + "learning_rate": 2.9755726050492566e-06, + "loss": 0.6904, + "num_input_tokens_seen": 1390430272, + "step": 7701 + }, + { + "epoch": 0.8431538903637209, + "grad_norm": 1.2607162792046478, + "learning_rate": 2.9715053930619798e-06, + "loss": 1.0272, + "num_input_tokens_seen": 1390609248, + "step": 7702 + }, + { + "epoch": 0.8432633624346588, + "grad_norm": 1.2165321809217884, + "learning_rate": 2.9674407870035004e-06, + "loss": 1.0622, + "num_input_tokens_seen": 1390772992, + "step": 7703 + }, + { + "epoch": 0.8433728345055967, + "grad_norm": 1.1885808743767006, + "learning_rate": 2.963378787354659e-06, + "loss": 1.0109, + "num_input_tokens_seen": 1390972352, + "step": 7704 + }, + { + "epoch": 0.8434823065765347, + "grad_norm": 1.228299993807041, + "learning_rate": 2.95931939459598e-06, + "loss": 0.8237, + "num_input_tokens_seen": 1391175968, + "step": 7705 + }, + { + "epoch": 0.8435917786474726, + "grad_norm": 1.1009412055016818, + "learning_rate": 2.9552626092076765e-06, + "loss": 0.8984, + "num_input_tokens_seen": 1391375552, + "step": 7706 + }, + { + "epoch": 0.8437012507184105, + "grad_norm": 1.096172429453642, + "learning_rate": 2.951208431669675e-06, + "loss": 0.7692, + "num_input_tokens_seen": 1391563488, + "step": 7707 + }, + { + "epoch": 0.8438107227893483, + "grad_norm": 1.2243653560801908, + "learning_rate": 2.9471568624615533e-06, + "loss": 0.9528, + "num_input_tokens_seen": 1391759936, + "step": 7708 + }, + { + "epoch": 0.8439201948602862, + "grad_norm": 1.1998001826340068, + "learning_rate": 2.9431079020626253e-06, + "loss": 1.0567, + "num_input_tokens_seen": 1391940032, + "step": 7709 + }, + { + "epoch": 0.8440296669312242, + "grad_norm": 1.2327914160024245, + "learning_rate": 2.939061550951863e-06, + "loss": 0.8089, + "num_input_tokens_seen": 1392120128, + "step": 7710 + }, + { + "epoch": 0.8441391390021621, + "grad_norm": 1.1615460224435077, + "learning_rate": 2.9350178096079486e-06, + "loss": 0.9647, + "num_input_tokens_seen": 1392308960, + "step": 7711 + }, + { + "epoch": 0.8442486110731, + "grad_norm": 1.0716286245409585, + "learning_rate": 2.930976678509245e-06, + "loss": 0.8485, + "num_input_tokens_seen": 1392500480, + "step": 7712 + }, + { + "epoch": 0.8443580831440379, + "grad_norm": 1.0416220782591599, + "learning_rate": 2.926938158133813e-06, + "loss": 0.8408, + "num_input_tokens_seen": 1392715072, + "step": 7713 + }, + { + "epoch": 0.8444675552149757, + "grad_norm": 1.1628429501736224, + "learning_rate": 2.9229022489594e-06, + "loss": 0.7933, + "num_input_tokens_seen": 1392900768, + "step": 7714 + }, + { + "epoch": 0.8445770272859137, + "grad_norm": 1.1178266390961242, + "learning_rate": 2.9188689514634408e-06, + "loss": 0.601, + "num_input_tokens_seen": 1393043232, + "step": 7715 + }, + { + "epoch": 0.8446864993568516, + "grad_norm": 1.1054711763065523, + "learning_rate": 2.9148382661230766e-06, + "loss": 0.83, + "num_input_tokens_seen": 1393228480, + "step": 7716 + }, + { + "epoch": 0.8447959714277895, + "grad_norm": 1.1545673705344015, + "learning_rate": 2.9108101934151285e-06, + "loss": 0.7581, + "num_input_tokens_seen": 1393394016, + "step": 7717 + }, + { + "epoch": 0.8449054434987274, + "grad_norm": 1.1730346536777774, + "learning_rate": 2.9067847338161063e-06, + "loss": 0.7452, + "num_input_tokens_seen": 1393559328, + "step": 7718 + }, + { + "epoch": 0.8450149155696652, + "grad_norm": 1.2137404457761263, + "learning_rate": 2.9027618878022134e-06, + "loss": 0.8109, + "num_input_tokens_seen": 1393730240, + "step": 7719 + }, + { + "epoch": 0.8451243876406032, + "grad_norm": 1.191200113442789, + "learning_rate": 2.898741655849349e-06, + "loss": 0.7156, + "num_input_tokens_seen": 1393877408, + "step": 7720 + }, + { + "epoch": 0.8452338597115411, + "grad_norm": 1.1115276244759482, + "learning_rate": 2.8947240384330945e-06, + "loss": 0.8589, + "num_input_tokens_seen": 1394059520, + "step": 7721 + }, + { + "epoch": 0.845343331782479, + "grad_norm": 1.3359445939795889, + "learning_rate": 2.89070903602873e-06, + "loss": 0.9219, + "num_input_tokens_seen": 1394254176, + "step": 7722 + }, + { + "epoch": 0.8454528038534169, + "grad_norm": 1.141700655484619, + "learning_rate": 2.8866966491112144e-06, + "loss": 1.1178, + "num_input_tokens_seen": 1394473248, + "step": 7723 + }, + { + "epoch": 0.8455622759243548, + "grad_norm": 1.2154365883860718, + "learning_rate": 2.8826868781552217e-06, + "loss": 0.8521, + "num_input_tokens_seen": 1394664768, + "step": 7724 + }, + { + "epoch": 0.8456717479952927, + "grad_norm": 1.2217679925682225, + "learning_rate": 2.8786797236350806e-06, + "loss": 0.8521, + "num_input_tokens_seen": 1394859424, + "step": 7725 + }, + { + "epoch": 0.8457812200662306, + "grad_norm": 1.038501731591946, + "learning_rate": 2.8746751860248415e-06, + "loss": 0.6363, + "num_input_tokens_seen": 1395046464, + "step": 7726 + }, + { + "epoch": 0.8458906921371685, + "grad_norm": 1.235675271786272, + "learning_rate": 2.8706732657982347e-06, + "loss": 0.9084, + "num_input_tokens_seen": 1395197440, + "step": 7727 + }, + { + "epoch": 0.8460001642081064, + "grad_norm": 1.1708544038028297, + "learning_rate": 2.866673963428676e-06, + "loss": 0.8158, + "num_input_tokens_seen": 1395364096, + "step": 7728 + }, + { + "epoch": 0.8461096362790443, + "grad_norm": 1.037020404684819, + "learning_rate": 2.862677279389275e-06, + "loss": 1.0561, + "num_input_tokens_seen": 1395572192, + "step": 7729 + }, + { + "epoch": 0.8462191083499823, + "grad_norm": 1.168308476570461, + "learning_rate": 2.85868321415283e-06, + "loss": 0.8771, + "num_input_tokens_seen": 1395741088, + "step": 7730 + }, + { + "epoch": 0.8463285804209201, + "grad_norm": 1.1532169977308198, + "learning_rate": 2.8546917681918417e-06, + "loss": 0.8788, + "num_input_tokens_seen": 1395931488, + "step": 7731 + }, + { + "epoch": 0.846438052491858, + "grad_norm": 1.125817743433929, + "learning_rate": 2.8507029419784696e-06, + "loss": 0.7878, + "num_input_tokens_seen": 1396131968, + "step": 7732 + }, + { + "epoch": 0.8465475245627959, + "grad_norm": 1.3053716362034342, + "learning_rate": 2.8467167359846115e-06, + "loss": 1.047, + "num_input_tokens_seen": 1396309600, + "step": 7733 + }, + { + "epoch": 0.8466569966337338, + "grad_norm": 1.0998187501008634, + "learning_rate": 2.842733150681803e-06, + "loss": 1.0107, + "num_input_tokens_seen": 1396525312, + "step": 7734 + }, + { + "epoch": 0.8467664687046718, + "grad_norm": 1.368123415357457, + "learning_rate": 2.83875218654131e-06, + "loss": 0.7566, + "num_input_tokens_seen": 1396678752, + "step": 7735 + }, + { + "epoch": 0.8468759407756096, + "grad_norm": 1.1688373104673098, + "learning_rate": 2.8347738440340663e-06, + "loss": 0.9354, + "num_input_tokens_seen": 1396863328, + "step": 7736 + }, + { + "epoch": 0.8469854128465475, + "grad_norm": 1.3417739088822185, + "learning_rate": 2.830798123630707e-06, + "loss": 0.8837, + "num_input_tokens_seen": 1397048576, + "step": 7737 + }, + { + "epoch": 0.8470948849174854, + "grad_norm": 1.363125942532486, + "learning_rate": 2.8268250258015467e-06, + "loss": 0.9969, + "num_input_tokens_seen": 1397210304, + "step": 7738 + }, + { + "epoch": 0.8472043569884233, + "grad_norm": 1.1782477985202062, + "learning_rate": 2.822854551016593e-06, + "loss": 0.7809, + "num_input_tokens_seen": 1397405408, + "step": 7739 + }, + { + "epoch": 0.8473138290593613, + "grad_norm": 1.1527434092432327, + "learning_rate": 2.8188866997455626e-06, + "loss": 0.6858, + "num_input_tokens_seen": 1397577440, + "step": 7740 + }, + { + "epoch": 0.8474233011302992, + "grad_norm": 1.1306028717557888, + "learning_rate": 2.814921472457821e-06, + "loss": 0.7818, + "num_input_tokens_seen": 1397793824, + "step": 7741 + }, + { + "epoch": 0.847532773201237, + "grad_norm": 1.1797319113348443, + "learning_rate": 2.810958869622471e-06, + "loss": 0.9396, + "num_input_tokens_seen": 1397962496, + "step": 7742 + }, + { + "epoch": 0.8476422452721749, + "grad_norm": 1.1102066146733247, + "learning_rate": 2.8069988917082566e-06, + "loss": 0.8358, + "num_input_tokens_seen": 1398127136, + "step": 7743 + }, + { + "epoch": 0.8477517173431128, + "grad_norm": 1.0503058510331247, + "learning_rate": 2.8030415391836513e-06, + "loss": 0.7324, + "num_input_tokens_seen": 1398318656, + "step": 7744 + }, + { + "epoch": 0.8478611894140508, + "grad_norm": 1.0493637791409138, + "learning_rate": 2.799086812516799e-06, + "loss": 0.8542, + "num_input_tokens_seen": 1398517344, + "step": 7745 + }, + { + "epoch": 0.8479706614849887, + "grad_norm": 1.1896377695770508, + "learning_rate": 2.7951347121755373e-06, + "loss": 0.8449, + "num_input_tokens_seen": 1398642112, + "step": 7746 + }, + { + "epoch": 0.8480801335559266, + "grad_norm": 1.136371625091909, + "learning_rate": 2.791185238627389e-06, + "loss": 0.9099, + "num_input_tokens_seen": 1398841696, + "step": 7747 + }, + { + "epoch": 0.8481896056268644, + "grad_norm": 1.3205731828654577, + "learning_rate": 2.7872383923395667e-06, + "loss": 0.8627, + "num_input_tokens_seen": 1399002080, + "step": 7748 + }, + { + "epoch": 0.8482990776978023, + "grad_norm": 1.2010453320348324, + "learning_rate": 2.7832941737789912e-06, + "loss": 0.8029, + "num_input_tokens_seen": 1399171648, + "step": 7749 + }, + { + "epoch": 0.8484085497687402, + "grad_norm": 1.1605696022070704, + "learning_rate": 2.7793525834122315e-06, + "loss": 0.7536, + "num_input_tokens_seen": 1399367648, + "step": 7750 + }, + { + "epoch": 0.8485180218396782, + "grad_norm": 1.333957297635375, + "learning_rate": 2.775413621705586e-06, + "loss": 1.0358, + "num_input_tokens_seen": 1399522208, + "step": 7751 + }, + { + "epoch": 0.8486274939106161, + "grad_norm": 1.190571927467199, + "learning_rate": 2.771477289125024e-06, + "loss": 0.9609, + "num_input_tokens_seen": 1399731648, + "step": 7752 + }, + { + "epoch": 0.8487369659815539, + "grad_norm": 1.1561820035993824, + "learning_rate": 2.7675435861362064e-06, + "loss": 0.7147, + "num_input_tokens_seen": 1399932352, + "step": 7753 + }, + { + "epoch": 0.8488464380524918, + "grad_norm": 0.9995858705834809, + "learning_rate": 2.7636125132044806e-06, + "loss": 0.9408, + "num_input_tokens_seen": 1400138432, + "step": 7754 + }, + { + "epoch": 0.8489559101234297, + "grad_norm": 1.1450291094348855, + "learning_rate": 2.759684070794885e-06, + "loss": 1.0131, + "num_input_tokens_seen": 1400311360, + "step": 7755 + }, + { + "epoch": 0.8490653821943677, + "grad_norm": 1.1601329248825785, + "learning_rate": 2.755758259372149e-06, + "loss": 0.9525, + "num_input_tokens_seen": 1400486304, + "step": 7756 + }, + { + "epoch": 0.8491748542653056, + "grad_norm": 1.048799993894302, + "learning_rate": 2.7518350794006804e-06, + "loss": 0.7636, + "num_input_tokens_seen": 1400690816, + "step": 7757 + }, + { + "epoch": 0.8492843263362435, + "grad_norm": 1.0386254101159147, + "learning_rate": 2.7479145313445974e-06, + "loss": 0.9384, + "num_input_tokens_seen": 1400886592, + "step": 7758 + }, + { + "epoch": 0.8493937984071813, + "grad_norm": 1.1104113077078517, + "learning_rate": 2.743996615667685e-06, + "loss": 0.712, + "num_input_tokens_seen": 1401071168, + "step": 7759 + }, + { + "epoch": 0.8495032704781192, + "grad_norm": 1.0968499531290778, + "learning_rate": 2.7400813328334273e-06, + "loss": 0.8476, + "num_input_tokens_seen": 1401262912, + "step": 7760 + }, + { + "epoch": 0.8496127425490572, + "grad_norm": 1.3095030401077317, + "learning_rate": 2.736168683304996e-06, + "loss": 0.9655, + "num_input_tokens_seen": 1401448832, + "step": 7761 + }, + { + "epoch": 0.8497222146199951, + "grad_norm": 1.1187067971865055, + "learning_rate": 2.7322586675452454e-06, + "loss": 0.7751, + "num_input_tokens_seen": 1401637216, + "step": 7762 + }, + { + "epoch": 0.849831686690933, + "grad_norm": 1.3495628249535863, + "learning_rate": 2.728351286016725e-06, + "loss": 0.8916, + "num_input_tokens_seen": 1401788416, + "step": 7763 + }, + { + "epoch": 0.8499411587618709, + "grad_norm": 1.093780190187412, + "learning_rate": 2.7244465391816742e-06, + "loss": 0.7469, + "num_input_tokens_seen": 1401946560, + "step": 7764 + }, + { + "epoch": 0.8500506308328087, + "grad_norm": 1.051283932386556, + "learning_rate": 2.720544427502009e-06, + "loss": 0.6088, + "num_input_tokens_seen": 1402127328, + "step": 7765 + }, + { + "epoch": 0.8501601029037467, + "grad_norm": 1.032145949060671, + "learning_rate": 2.7166449514393565e-06, + "loss": 0.7795, + "num_input_tokens_seen": 1402313472, + "step": 7766 + }, + { + "epoch": 0.8502695749746846, + "grad_norm": 1.1243584711069632, + "learning_rate": 2.7127481114549965e-06, + "loss": 0.7013, + "num_input_tokens_seen": 1402485952, + "step": 7767 + }, + { + "epoch": 0.8503790470456225, + "grad_norm": 1.0858429792146025, + "learning_rate": 2.708853908009934e-06, + "loss": 0.9505, + "num_input_tokens_seen": 1402691136, + "step": 7768 + }, + { + "epoch": 0.8504885191165604, + "grad_norm": 1.2030011893636734, + "learning_rate": 2.7049623415648427e-06, + "loss": 1.0753, + "num_input_tokens_seen": 1402875936, + "step": 7769 + }, + { + "epoch": 0.8505979911874982, + "grad_norm": 1.2637704047301817, + "learning_rate": 2.7010734125800824e-06, + "loss": 0.9341, + "num_input_tokens_seen": 1403061632, + "step": 7770 + }, + { + "epoch": 0.8507074632584362, + "grad_norm": 1.3015893024373082, + "learning_rate": 2.6971871215157126e-06, + "loss": 0.9258, + "num_input_tokens_seen": 1403248224, + "step": 7771 + }, + { + "epoch": 0.8508169353293741, + "grad_norm": 1.2054108542126263, + "learning_rate": 2.6933034688314624e-06, + "loss": 0.9328, + "num_input_tokens_seen": 1403426080, + "step": 7772 + }, + { + "epoch": 0.850926407400312, + "grad_norm": 1.0687388525487433, + "learning_rate": 2.6894224549867815e-06, + "loss": 0.689, + "num_input_tokens_seen": 1403594528, + "step": 7773 + }, + { + "epoch": 0.8510358794712499, + "grad_norm": 1.0939085709201628, + "learning_rate": 2.6855440804407635e-06, + "loss": 0.8507, + "num_input_tokens_seen": 1403793664, + "step": 7774 + }, + { + "epoch": 0.8511453515421878, + "grad_norm": 1.1564846316255288, + "learning_rate": 2.68166834565223e-06, + "loss": 0.8813, + "num_input_tokens_seen": 1403989440, + "step": 7775 + }, + { + "epoch": 0.8512548236131257, + "grad_norm": 1.1950607034595808, + "learning_rate": 2.6777952510796565e-06, + "loss": 0.9282, + "num_input_tokens_seen": 1404186336, + "step": 7776 + }, + { + "epoch": 0.8513642956840636, + "grad_norm": 1.3478142687164383, + "learning_rate": 2.6739247971812375e-06, + "loss": 1.127, + "num_input_tokens_seen": 1404369792, + "step": 7777 + }, + { + "epoch": 0.8514737677550015, + "grad_norm": 1.1337597799667922, + "learning_rate": 2.6700569844148372e-06, + "loss": 1.0784, + "num_input_tokens_seen": 1404575648, + "step": 7778 + }, + { + "epoch": 0.8515832398259394, + "grad_norm": 1.173690762472288, + "learning_rate": 2.666191813238006e-06, + "loss": 0.8727, + "num_input_tokens_seen": 1404734912, + "step": 7779 + }, + { + "epoch": 0.8516927118968773, + "grad_norm": 1.3540435245961833, + "learning_rate": 2.662329284107987e-06, + "loss": 0.9576, + "num_input_tokens_seen": 1404890592, + "step": 7780 + }, + { + "epoch": 0.8518021839678153, + "grad_norm": 1.2944587351092482, + "learning_rate": 2.6584693974817084e-06, + "loss": 0.7759, + "num_input_tokens_seen": 1405040896, + "step": 7781 + }, + { + "epoch": 0.8519116560387531, + "grad_norm": 1.1818511523135964, + "learning_rate": 2.6546121538157998e-06, + "loss": 1.1778, + "num_input_tokens_seen": 1405236448, + "step": 7782 + }, + { + "epoch": 0.852021128109691, + "grad_norm": 1.0815518415444454, + "learning_rate": 2.650757553566546e-06, + "loss": 0.7133, + "num_input_tokens_seen": 1405427296, + "step": 7783 + }, + { + "epoch": 0.8521306001806289, + "grad_norm": 1.0886146971131485, + "learning_rate": 2.6469055971899525e-06, + "loss": 0.744, + "num_input_tokens_seen": 1405610976, + "step": 7784 + }, + { + "epoch": 0.8522400722515668, + "grad_norm": 1.5846870641720938, + "learning_rate": 2.6430562851416983e-06, + "loss": 1.1366, + "num_input_tokens_seen": 1405772704, + "step": 7785 + }, + { + "epoch": 0.8523495443225048, + "grad_norm": 1.1746601830807633, + "learning_rate": 2.6392096178771447e-06, + "loss": 0.9202, + "num_input_tokens_seen": 1405972064, + "step": 7786 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 1.1131907108366168, + "learning_rate": 2.635365595851344e-06, + "loss": 0.8187, + "num_input_tokens_seen": 1406174336, + "step": 7787 + }, + { + "epoch": 0.8525684884643805, + "grad_norm": 1.3257379959774374, + "learning_rate": 2.6315242195190436e-06, + "loss": 0.8218, + "num_input_tokens_seen": 1406321504, + "step": 7788 + }, + { + "epoch": 0.8526779605353184, + "grad_norm": 1.1690060911256481, + "learning_rate": 2.6276854893346636e-06, + "loss": 0.8718, + "num_input_tokens_seen": 1406480992, + "step": 7789 + }, + { + "epoch": 0.8527874326062563, + "grad_norm": 1.2748991645881464, + "learning_rate": 2.6238494057523183e-06, + "loss": 0.8997, + "num_input_tokens_seen": 1406623904, + "step": 7790 + }, + { + "epoch": 0.8528969046771943, + "grad_norm": 1.0352667407681513, + "learning_rate": 2.6200159692258195e-06, + "loss": 1.0995, + "num_input_tokens_seen": 1406842528, + "step": 7791 + }, + { + "epoch": 0.8530063767481322, + "grad_norm": 1.1825023693036787, + "learning_rate": 2.616185180208644e-06, + "loss": 1.093, + "num_input_tokens_seen": 1407031360, + "step": 7792 + }, + { + "epoch": 0.85311584881907, + "grad_norm": 1.3023520484847573, + "learning_rate": 2.612357039153973e-06, + "loss": 0.9311, + "num_input_tokens_seen": 1407216832, + "step": 7793 + }, + { + "epoch": 0.8532253208900079, + "grad_norm": 1.2167223667214961, + "learning_rate": 2.608531546514667e-06, + "loss": 0.8651, + "num_input_tokens_seen": 1407421792, + "step": 7794 + }, + { + "epoch": 0.8533347929609458, + "grad_norm": 1.2839511218289619, + "learning_rate": 2.6047087027432746e-06, + "loss": 0.9297, + "num_input_tokens_seen": 1407629888, + "step": 7795 + }, + { + "epoch": 0.8534442650318838, + "grad_norm": 1.3035791223302688, + "learning_rate": 2.600888508292029e-06, + "loss": 1.033, + "num_input_tokens_seen": 1407827904, + "step": 7796 + }, + { + "epoch": 0.8535537371028217, + "grad_norm": 1.1752587865368338, + "learning_rate": 2.597070963612852e-06, + "loss": 0.7996, + "num_input_tokens_seen": 1408001728, + "step": 7797 + }, + { + "epoch": 0.8536632091737596, + "grad_norm": 1.3542624610861367, + "learning_rate": 2.5932560691573487e-06, + "loss": 1.094, + "num_input_tokens_seen": 1408196832, + "step": 7798 + }, + { + "epoch": 0.8537726812446974, + "grad_norm": 1.0585862613427175, + "learning_rate": 2.5894438253768223e-06, + "loss": 0.6372, + "num_input_tokens_seen": 1408365952, + "step": 7799 + }, + { + "epoch": 0.8538821533156353, + "grad_norm": 1.1504666832746402, + "learning_rate": 2.5856342327222505e-06, + "loss": 0.8728, + "num_input_tokens_seen": 1408534400, + "step": 7800 + }, + { + "epoch": 0.8539916253865732, + "grad_norm": 1.0286105508024734, + "learning_rate": 2.581827291644301e-06, + "loss": 0.6661, + "num_input_tokens_seen": 1408706432, + "step": 7801 + }, + { + "epoch": 0.8541010974575112, + "grad_norm": 1.2467429165207695, + "learning_rate": 2.5780230025933245e-06, + "loss": 0.8886, + "num_input_tokens_seen": 1408875552, + "step": 7802 + }, + { + "epoch": 0.8542105695284491, + "grad_norm": 1.198345450356404, + "learning_rate": 2.5742213660193637e-06, + "loss": 0.8864, + "num_input_tokens_seen": 1409063488, + "step": 7803 + }, + { + "epoch": 0.8543200415993869, + "grad_norm": 1.0591795591944821, + "learning_rate": 2.5704223823721453e-06, + "loss": 0.9737, + "num_input_tokens_seen": 1409255232, + "step": 7804 + }, + { + "epoch": 0.8544295136703248, + "grad_norm": 1.3066437269325102, + "learning_rate": 2.5666260521010758e-06, + "loss": 0.899, + "num_input_tokens_seen": 1409398816, + "step": 7805 + }, + { + "epoch": 0.8545389857412627, + "grad_norm": 1.2660526447951128, + "learning_rate": 2.562832375655269e-06, + "loss": 1.0309, + "num_input_tokens_seen": 1409565472, + "step": 7806 + }, + { + "epoch": 0.8546484578122007, + "grad_norm": 1.089859500894245, + "learning_rate": 2.5590413534834906e-06, + "loss": 0.7464, + "num_input_tokens_seen": 1409750720, + "step": 7807 + }, + { + "epoch": 0.8547579298831386, + "grad_norm": 1.217990736551872, + "learning_rate": 2.555252986034229e-06, + "loss": 0.8388, + "num_input_tokens_seen": 1409935520, + "step": 7808 + }, + { + "epoch": 0.8548674019540765, + "grad_norm": 1.2941770871429235, + "learning_rate": 2.55146727375562e-06, + "loss": 1.0589, + "num_input_tokens_seen": 1410129504, + "step": 7809 + }, + { + "epoch": 0.8549768740250143, + "grad_norm": 1.0081830785000632, + "learning_rate": 2.547684217095528e-06, + "loss": 0.712, + "num_input_tokens_seen": 1410342080, + "step": 7810 + }, + { + "epoch": 0.8550863460959522, + "grad_norm": 1.1523658362389526, + "learning_rate": 2.5439038165014666e-06, + "loss": 1.0402, + "num_input_tokens_seen": 1410527328, + "step": 7811 + }, + { + "epoch": 0.8551958181668902, + "grad_norm": 0.9933770325561788, + "learning_rate": 2.5401260724206537e-06, + "loss": 0.5315, + "num_input_tokens_seen": 1410716832, + "step": 7812 + }, + { + "epoch": 0.8553052902378281, + "grad_norm": 1.2358405672008699, + "learning_rate": 2.5363509852999983e-06, + "loss": 1.1034, + "num_input_tokens_seen": 1410928736, + "step": 7813 + }, + { + "epoch": 0.855414762308766, + "grad_norm": 1.0805579666518306, + "learning_rate": 2.532578555586068e-06, + "loss": 0.959, + "num_input_tokens_seen": 1411123168, + "step": 7814 + }, + { + "epoch": 0.8555242343797039, + "grad_norm": 1.287128546749999, + "learning_rate": 2.5288087837251564e-06, + "loss": 0.9765, + "num_input_tokens_seen": 1411292512, + "step": 7815 + }, + { + "epoch": 0.8556337064506417, + "grad_norm": 1.2880302689028407, + "learning_rate": 2.5250416701631976e-06, + "loss": 0.9359, + "num_input_tokens_seen": 1411463424, + "step": 7816 + }, + { + "epoch": 0.8557431785215797, + "grad_norm": 1.248536492959785, + "learning_rate": 2.521277215345852e-06, + "loss": 0.7694, + "num_input_tokens_seen": 1411622688, + "step": 7817 + }, + { + "epoch": 0.8558526505925176, + "grad_norm": 1.102083502314554, + "learning_rate": 2.517515419718433e-06, + "loss": 0.9374, + "num_input_tokens_seen": 1411810624, + "step": 7818 + }, + { + "epoch": 0.8559621226634555, + "grad_norm": 1.1341791787859954, + "learning_rate": 2.5137562837259626e-06, + "loss": 0.6854, + "num_input_tokens_seen": 1411989152, + "step": 7819 + }, + { + "epoch": 0.8560715947343934, + "grad_norm": 1.1882296880185443, + "learning_rate": 2.5099998078131376e-06, + "loss": 0.9311, + "num_input_tokens_seen": 1412194336, + "step": 7820 + }, + { + "epoch": 0.8561810668053312, + "grad_norm": 1.2532710668047649, + "learning_rate": 2.5062459924243442e-06, + "loss": 0.867, + "num_input_tokens_seen": 1412343072, + "step": 7821 + }, + { + "epoch": 0.8562905388762692, + "grad_norm": 1.2863724817620739, + "learning_rate": 2.5024948380036468e-06, + "loss": 0.936, + "num_input_tokens_seen": 1412488448, + "step": 7822 + }, + { + "epoch": 0.8564000109472071, + "grad_norm": 1.203518295408937, + "learning_rate": 2.4987463449947986e-06, + "loss": 0.9399, + "num_input_tokens_seen": 1412679296, + "step": 7823 + }, + { + "epoch": 0.856509483018145, + "grad_norm": 1.3825019444595101, + "learning_rate": 2.495000513841253e-06, + "loss": 0.9686, + "num_input_tokens_seen": 1412844160, + "step": 7824 + }, + { + "epoch": 0.8566189550890829, + "grad_norm": 1.151490752470821, + "learning_rate": 2.491257344986114e-06, + "loss": 0.7123, + "num_input_tokens_seen": 1412986400, + "step": 7825 + }, + { + "epoch": 0.8567284271600208, + "grad_norm": 1.2481015570993417, + "learning_rate": 2.4875168388722057e-06, + "loss": 0.927, + "num_input_tokens_seen": 1413198752, + "step": 7826 + }, + { + "epoch": 0.8568378992309587, + "grad_norm": 1.0228195654522771, + "learning_rate": 2.4837789959420184e-06, + "loss": 0.7741, + "num_input_tokens_seen": 1413380192, + "step": 7827 + }, + { + "epoch": 0.8569473713018966, + "grad_norm": 1.191297532421212, + "learning_rate": 2.4800438166377337e-06, + "loss": 1.1526, + "num_input_tokens_seen": 1413557824, + "step": 7828 + }, + { + "epoch": 0.8570568433728345, + "grad_norm": 1.2731081296751547, + "learning_rate": 2.4763113014012155e-06, + "loss": 0.7669, + "num_input_tokens_seen": 1413743072, + "step": 7829 + }, + { + "epoch": 0.8571663154437724, + "grad_norm": 1.1208440522505048, + "learning_rate": 2.472581450674011e-06, + "loss": 0.9124, + "num_input_tokens_seen": 1413950496, + "step": 7830 + }, + { + "epoch": 0.8572757875147103, + "grad_norm": 1.1172634178989664, + "learning_rate": 2.468854264897355e-06, + "loss": 1.0738, + "num_input_tokens_seen": 1414148064, + "step": 7831 + }, + { + "epoch": 0.8573852595856483, + "grad_norm": 1.128198897036382, + "learning_rate": 2.4651297445121625e-06, + "loss": 0.8156, + "num_input_tokens_seen": 1414321440, + "step": 7832 + }, + { + "epoch": 0.8574947316565861, + "grad_norm": 1.1354826403474676, + "learning_rate": 2.461407889959047e-06, + "loss": 1.0631, + "num_input_tokens_seen": 1414532448, + "step": 7833 + }, + { + "epoch": 0.857604203727524, + "grad_norm": 1.2734236453186123, + "learning_rate": 2.4576887016782927e-06, + "loss": 0.9928, + "num_input_tokens_seen": 1414705152, + "step": 7834 + }, + { + "epoch": 0.8577136757984619, + "grad_norm": 1.0954043819183135, + "learning_rate": 2.4539721801098704e-06, + "loss": 0.7673, + "num_input_tokens_seen": 1414876288, + "step": 7835 + }, + { + "epoch": 0.8578231478693998, + "grad_norm": 1.2552783149848838, + "learning_rate": 2.4502583256934388e-06, + "loss": 1.0046, + "num_input_tokens_seen": 1415042944, + "step": 7836 + }, + { + "epoch": 0.8579326199403378, + "grad_norm": 1.167720296198978, + "learning_rate": 2.4465471388683383e-06, + "loss": 0.872, + "num_input_tokens_seen": 1415205792, + "step": 7837 + }, + { + "epoch": 0.8580420920112756, + "grad_norm": 1.0461894395407925, + "learning_rate": 2.4428386200735924e-06, + "loss": 0.6125, + "num_input_tokens_seen": 1415408288, + "step": 7838 + }, + { + "epoch": 0.8581515640822135, + "grad_norm": 1.1978188958558331, + "learning_rate": 2.439132769747926e-06, + "loss": 0.6882, + "num_input_tokens_seen": 1415593536, + "step": 7839 + }, + { + "epoch": 0.8582610361531514, + "grad_norm": 1.0400416414470666, + "learning_rate": 2.435429588329716e-06, + "loss": 0.9777, + "num_input_tokens_seen": 1415807680, + "step": 7840 + }, + { + "epoch": 0.8583705082240893, + "grad_norm": 1.1450040706412241, + "learning_rate": 2.431729076257053e-06, + "loss": 1.1333, + "num_input_tokens_seen": 1415999200, + "step": 7841 + }, + { + "epoch": 0.8584799802950273, + "grad_norm": 1.0723450011176996, + "learning_rate": 2.4280312339676953e-06, + "loss": 0.8651, + "num_input_tokens_seen": 1416158688, + "step": 7842 + }, + { + "epoch": 0.8585894523659652, + "grad_norm": 1.0662117805205096, + "learning_rate": 2.4243360618990934e-06, + "loss": 0.8586, + "num_input_tokens_seen": 1416366336, + "step": 7843 + }, + { + "epoch": 0.858698924436903, + "grad_norm": 1.3612219442063278, + "learning_rate": 2.4206435604883782e-06, + "loss": 0.9977, + "num_input_tokens_seen": 1416551584, + "step": 7844 + }, + { + "epoch": 0.8588083965078409, + "grad_norm": 1.1821080820107934, + "learning_rate": 2.416953730172361e-06, + "loss": 0.8665, + "num_input_tokens_seen": 1416710176, + "step": 7845 + }, + { + "epoch": 0.8589178685787788, + "grad_norm": 1.1445262505223572, + "learning_rate": 2.4132665713875542e-06, + "loss": 0.8248, + "num_input_tokens_seen": 1416887808, + "step": 7846 + }, + { + "epoch": 0.8590273406497168, + "grad_norm": 1.1545633953956045, + "learning_rate": 2.409582084570125e-06, + "loss": 1.0299, + "num_input_tokens_seen": 1417063424, + "step": 7847 + }, + { + "epoch": 0.8591368127206547, + "grad_norm": 1.0957713930346749, + "learning_rate": 2.4059002701559587e-06, + "loss": 0.7771, + "num_input_tokens_seen": 1417249344, + "step": 7848 + }, + { + "epoch": 0.8592462847915926, + "grad_norm": 1.1264511921108733, + "learning_rate": 2.4022211285805895e-06, + "loss": 1.0066, + "num_input_tokens_seen": 1417431904, + "step": 7849 + }, + { + "epoch": 0.8593557568625304, + "grad_norm": 1.1418750131958875, + "learning_rate": 2.39854466027927e-06, + "loss": 0.9284, + "num_input_tokens_seen": 1417621184, + "step": 7850 + }, + { + "epoch": 0.8594652289334683, + "grad_norm": 1.1064357773992, + "learning_rate": 2.394870865686899e-06, + "loss": 0.8133, + "num_input_tokens_seen": 1417787840, + "step": 7851 + }, + { + "epoch": 0.8595747010044062, + "grad_norm": 1.298114597828034, + "learning_rate": 2.3911997452380987e-06, + "loss": 0.8186, + "num_input_tokens_seen": 1417950464, + "step": 7852 + }, + { + "epoch": 0.8596841730753442, + "grad_norm": 1.0935403895478153, + "learning_rate": 2.387531299367146e-06, + "loss": 0.9724, + "num_input_tokens_seen": 1418118464, + "step": 7853 + }, + { + "epoch": 0.8597936451462821, + "grad_norm": 0.9940835452977472, + "learning_rate": 2.3838655285080085e-06, + "loss": 0.8241, + "num_input_tokens_seen": 1418322976, + "step": 7854 + }, + { + "epoch": 0.8599031172172199, + "grad_norm": 1.2309675870667496, + "learning_rate": 2.3802024330943556e-06, + "loss": 0.8793, + "num_input_tokens_seen": 1418464544, + "step": 7855 + }, + { + "epoch": 0.8600125892881578, + "grad_norm": 1.103745428142821, + "learning_rate": 2.376542013559502e-06, + "loss": 0.8008, + "num_input_tokens_seen": 1418648224, + "step": 7856 + }, + { + "epoch": 0.8601220613590957, + "grad_norm": 1.184563117072257, + "learning_rate": 2.3728842703364894e-06, + "loss": 0.7788, + "num_input_tokens_seen": 1418841984, + "step": 7857 + }, + { + "epoch": 0.8602315334300337, + "grad_norm": 1.2143554196666406, + "learning_rate": 2.3692292038580006e-06, + "loss": 0.8563, + "num_input_tokens_seen": 1419022080, + "step": 7858 + }, + { + "epoch": 0.8603410055009716, + "grad_norm": 1.0423282556836093, + "learning_rate": 2.3655768145564416e-06, + "loss": 0.692, + "num_input_tokens_seen": 1419219648, + "step": 7859 + }, + { + "epoch": 0.8604504775719095, + "grad_norm": 0.9527987250023262, + "learning_rate": 2.361927102863873e-06, + "loss": 0.655, + "num_input_tokens_seen": 1419388768, + "step": 7860 + }, + { + "epoch": 0.8605599496428473, + "grad_norm": 1.1932553809472413, + "learning_rate": 2.3582800692120542e-06, + "loss": 1.0623, + "num_input_tokens_seen": 1419586112, + "step": 7861 + }, + { + "epoch": 0.8606694217137852, + "grad_norm": 1.1410261520009832, + "learning_rate": 2.354635714032419e-06, + "loss": 0.8117, + "num_input_tokens_seen": 1419757472, + "step": 7862 + }, + { + "epoch": 0.8607788937847232, + "grad_norm": 1.2761207348467074, + "learning_rate": 2.3509940377560878e-06, + "loss": 1.0653, + "num_input_tokens_seen": 1419934208, + "step": 7863 + }, + { + "epoch": 0.8608883658556611, + "grad_norm": 1.3319540588744372, + "learning_rate": 2.3473550408138645e-06, + "loss": 1.0587, + "num_input_tokens_seen": 1420104896, + "step": 7864 + }, + { + "epoch": 0.860997837926599, + "grad_norm": 1.130358016595522, + "learning_rate": 2.343718723636232e-06, + "loss": 0.9916, + "num_input_tokens_seen": 1420281408, + "step": 7865 + }, + { + "epoch": 0.8611073099975369, + "grad_norm": 1.1975898241507752, + "learning_rate": 2.3400850866533654e-06, + "loss": 0.8977, + "num_input_tokens_seen": 1420445600, + "step": 7866 + }, + { + "epoch": 0.8612167820684747, + "grad_norm": 1.295943327688963, + "learning_rate": 2.3364541302951154e-06, + "loss": 0.902, + "num_input_tokens_seen": 1420635104, + "step": 7867 + }, + { + "epoch": 0.8613262541394127, + "grad_norm": 1.915003455734123, + "learning_rate": 2.3328258549910166e-06, + "loss": 0.8734, + "num_input_tokens_seen": 1420809824, + "step": 7868 + }, + { + "epoch": 0.8614357262103506, + "grad_norm": 1.193262074401238, + "learning_rate": 2.3292002611702863e-06, + "loss": 0.9852, + "num_input_tokens_seen": 1420987232, + "step": 7869 + }, + { + "epoch": 0.8615451982812885, + "grad_norm": 1.275993870469322, + "learning_rate": 2.325577349261826e-06, + "loss": 0.9151, + "num_input_tokens_seen": 1421175840, + "step": 7870 + }, + { + "epoch": 0.8616546703522264, + "grad_norm": 1.1264628859669539, + "learning_rate": 2.321957119694221e-06, + "loss": 0.9278, + "num_input_tokens_seen": 1421374752, + "step": 7871 + }, + { + "epoch": 0.8617641424231642, + "grad_norm": 1.1119848901867881, + "learning_rate": 2.3183395728957334e-06, + "loss": 1.0867, + "num_input_tokens_seen": 1421580384, + "step": 7872 + }, + { + "epoch": 0.8618736144941022, + "grad_norm": 1.1698584435896613, + "learning_rate": 2.3147247092943107e-06, + "loss": 0.8585, + "num_input_tokens_seen": 1421784448, + "step": 7873 + }, + { + "epoch": 0.8619830865650401, + "grad_norm": 1.2993237001715368, + "learning_rate": 2.311112529317591e-06, + "loss": 0.8153, + "num_input_tokens_seen": 1421932960, + "step": 7874 + }, + { + "epoch": 0.862092558635978, + "grad_norm": 1.1573631495738297, + "learning_rate": 2.307503033392888e-06, + "loss": 0.8775, + "num_input_tokens_seen": 1422136128, + "step": 7875 + }, + { + "epoch": 0.8622020307069159, + "grad_norm": 1.1371526250604556, + "learning_rate": 2.303896221947194e-06, + "loss": 0.8707, + "num_input_tokens_seen": 1422313536, + "step": 7876 + }, + { + "epoch": 0.8623115027778538, + "grad_norm": 1.1372007416501682, + "learning_rate": 2.3002920954071916e-06, + "loss": 0.9514, + "num_input_tokens_seen": 1422473472, + "step": 7877 + }, + { + "epoch": 0.8624209748487917, + "grad_norm": 1.0029166721290639, + "learning_rate": 2.296690654199238e-06, + "loss": 0.7847, + "num_input_tokens_seen": 1422662080, + "step": 7878 + }, + { + "epoch": 0.8625304469197296, + "grad_norm": 1.149950963509028, + "learning_rate": 2.293091898749378e-06, + "loss": 0.7288, + "num_input_tokens_seen": 1422838816, + "step": 7879 + }, + { + "epoch": 0.8626399189906675, + "grad_norm": 1.1532981411759409, + "learning_rate": 2.2894958294833317e-06, + "loss": 0.8464, + "num_input_tokens_seen": 1423000768, + "step": 7880 + }, + { + "epoch": 0.8627493910616054, + "grad_norm": 1.3390396418896193, + "learning_rate": 2.2859024468265265e-06, + "loss": 0.8793, + "num_input_tokens_seen": 1423182880, + "step": 7881 + }, + { + "epoch": 0.8628588631325433, + "grad_norm": 1.1295159617254922, + "learning_rate": 2.2823117512040304e-06, + "loss": 0.9295, + "num_input_tokens_seen": 1423353792, + "step": 7882 + }, + { + "epoch": 0.8629683352034813, + "grad_norm": 1.1190340480686902, + "learning_rate": 2.2787237430406285e-06, + "loss": 0.8576, + "num_input_tokens_seen": 1423506560, + "step": 7883 + }, + { + "epoch": 0.8630778072744191, + "grad_norm": 1.1318950752824528, + "learning_rate": 2.2751384227607727e-06, + "loss": 1.2177, + "num_input_tokens_seen": 1423701440, + "step": 7884 + }, + { + "epoch": 0.863187279345357, + "grad_norm": 1.3722929315757606, + "learning_rate": 2.2715557907885986e-06, + "loss": 0.971, + "num_input_tokens_seen": 1423872800, + "step": 7885 + }, + { + "epoch": 0.8632967514162949, + "grad_norm": 1.2873239526477664, + "learning_rate": 2.2679758475479235e-06, + "loss": 1.0315, + "num_input_tokens_seen": 1424060512, + "step": 7886 + }, + { + "epoch": 0.8634062234872328, + "grad_norm": 1.0350139016217574, + "learning_rate": 2.264398593462247e-06, + "loss": 0.9044, + "num_input_tokens_seen": 1424288768, + "step": 7887 + }, + { + "epoch": 0.8635156955581708, + "grad_norm": 1.2526920417315428, + "learning_rate": 2.260824028954764e-06, + "loss": 1.0692, + "num_input_tokens_seen": 1424446016, + "step": 7888 + }, + { + "epoch": 0.8636251676291086, + "grad_norm": 1.2105139699269252, + "learning_rate": 2.2572521544483166e-06, + "loss": 0.7548, + "num_input_tokens_seen": 1424625440, + "step": 7889 + }, + { + "epoch": 0.8637346397000465, + "grad_norm": 1.2445145887850868, + "learning_rate": 2.2536829703654727e-06, + "loss": 0.7942, + "num_input_tokens_seen": 1424815168, + "step": 7890 + }, + { + "epoch": 0.8638441117709844, + "grad_norm": 1.2119124886673907, + "learning_rate": 2.2501164771284418e-06, + "loss": 0.8447, + "num_input_tokens_seen": 1425019456, + "step": 7891 + }, + { + "epoch": 0.8639535838419223, + "grad_norm": 1.0671125036560036, + "learning_rate": 2.246552675159147e-06, + "loss": 0.6941, + "num_input_tokens_seen": 1425190144, + "step": 7892 + }, + { + "epoch": 0.8640630559128603, + "grad_norm": 1.2667389194477447, + "learning_rate": 2.2429915648791684e-06, + "loss": 0.9953, + "num_input_tokens_seen": 1425405856, + "step": 7893 + }, + { + "epoch": 0.8641725279837982, + "grad_norm": 1.165635447928314, + "learning_rate": 2.239433146709785e-06, + "loss": 0.8594, + "num_input_tokens_seen": 1425579904, + "step": 7894 + }, + { + "epoch": 0.864282000054736, + "grad_norm": 1.304273345451697, + "learning_rate": 2.2358774210719523e-06, + "loss": 0.9179, + "num_input_tokens_seen": 1425729312, + "step": 7895 + }, + { + "epoch": 0.8643914721256739, + "grad_norm": 1.1787333387570744, + "learning_rate": 2.2323243883862976e-06, + "loss": 0.8734, + "num_input_tokens_seen": 1425897984, + "step": 7896 + }, + { + "epoch": 0.8645009441966118, + "grad_norm": 1.0706788153186766, + "learning_rate": 2.2287740490731514e-06, + "loss": 0.9315, + "num_input_tokens_seen": 1426101600, + "step": 7897 + }, + { + "epoch": 0.8646104162675498, + "grad_norm": 1.1785845115893268, + "learning_rate": 2.2252264035524968e-06, + "loss": 0.7682, + "num_input_tokens_seen": 1426287520, + "step": 7898 + }, + { + "epoch": 0.8647198883384877, + "grad_norm": 1.1798230023909626, + "learning_rate": 2.2216814522440233e-06, + "loss": 0.8621, + "num_input_tokens_seen": 1426484416, + "step": 7899 + }, + { + "epoch": 0.8648293604094256, + "grad_norm": 1.1960808273006036, + "learning_rate": 2.21813919556709e-06, + "loss": 0.7531, + "num_input_tokens_seen": 1426677056, + "step": 7900 + }, + { + "epoch": 0.8649388324803634, + "grad_norm": 1.1171763492908064, + "learning_rate": 2.214599633940739e-06, + "loss": 0.8248, + "num_input_tokens_seen": 1426873280, + "step": 7901 + }, + { + "epoch": 0.8650483045513013, + "grad_norm": 1.2095310041562635, + "learning_rate": 2.211062767783692e-06, + "loss": 0.8414, + "num_input_tokens_seen": 1427047328, + "step": 7902 + }, + { + "epoch": 0.8651577766222392, + "grad_norm": 1.1648670462844286, + "learning_rate": 2.207528597514355e-06, + "loss": 0.7419, + "num_input_tokens_seen": 1427194272, + "step": 7903 + }, + { + "epoch": 0.8652672486931772, + "grad_norm": 1.0763653756037985, + "learning_rate": 2.2039971235508135e-06, + "loss": 0.7807, + "num_input_tokens_seen": 1427386016, + "step": 7904 + }, + { + "epoch": 0.8653767207641151, + "grad_norm": 1.0204523828958172, + "learning_rate": 2.200468346310833e-06, + "loss": 1.0509, + "num_input_tokens_seen": 1427602176, + "step": 7905 + }, + { + "epoch": 0.8654861928350529, + "grad_norm": 1.2366508556822164, + "learning_rate": 2.1969422662118572e-06, + "loss": 1.077, + "num_input_tokens_seen": 1427741952, + "step": 7906 + }, + { + "epoch": 0.8655956649059908, + "grad_norm": 1.1948981470730726, + "learning_rate": 2.193418883671025e-06, + "loss": 1.0904, + "num_input_tokens_seen": 1427925184, + "step": 7907 + }, + { + "epoch": 0.8657051369769287, + "grad_norm": 0.9960591927312086, + "learning_rate": 2.189898199105139e-06, + "loss": 0.6643, + "num_input_tokens_seen": 1428103040, + "step": 7908 + }, + { + "epoch": 0.8658146090478667, + "grad_norm": 1.2634439395473416, + "learning_rate": 2.1863802129306886e-06, + "loss": 0.7444, + "num_input_tokens_seen": 1428278208, + "step": 7909 + }, + { + "epoch": 0.8659240811188046, + "grad_norm": 1.2249058998711384, + "learning_rate": 2.182864925563849e-06, + "loss": 1.0175, + "num_input_tokens_seen": 1428468608, + "step": 7910 + }, + { + "epoch": 0.8660335531897425, + "grad_norm": 1.0630869646573786, + "learning_rate": 2.1793523374204706e-06, + "loss": 0.7739, + "num_input_tokens_seen": 1428634816, + "step": 7911 + }, + { + "epoch": 0.8661430252606803, + "grad_norm": 1.1072299259422036, + "learning_rate": 2.175842448916085e-06, + "loss": 0.5897, + "num_input_tokens_seen": 1428819616, + "step": 7912 + }, + { + "epoch": 0.8662524973316182, + "grad_norm": 1.227212575681055, + "learning_rate": 2.1723352604658994e-06, + "loss": 0.75, + "num_input_tokens_seen": 1428977984, + "step": 7913 + }, + { + "epoch": 0.8663619694025562, + "grad_norm": 1.036420920509867, + "learning_rate": 2.1688307724848227e-06, + "loss": 0.6415, + "num_input_tokens_seen": 1429164128, + "step": 7914 + }, + { + "epoch": 0.8664714414734941, + "grad_norm": 1.277267733770561, + "learning_rate": 2.1653289853874103e-06, + "loss": 0.9209, + "num_input_tokens_seen": 1429324064, + "step": 7915 + }, + { + "epoch": 0.866580913544432, + "grad_norm": 1.1827635418574862, + "learning_rate": 2.161829899587933e-06, + "loss": 0.8351, + "num_input_tokens_seen": 1429507520, + "step": 7916 + }, + { + "epoch": 0.8666903856153699, + "grad_norm": 1.2171008371785461, + "learning_rate": 2.158333515500316e-06, + "loss": 1.0325, + "num_input_tokens_seen": 1429681792, + "step": 7917 + }, + { + "epoch": 0.8667998576863077, + "grad_norm": 1.1361792683591982, + "learning_rate": 2.1548398335381802e-06, + "loss": 0.9532, + "num_input_tokens_seen": 1429875776, + "step": 7918 + }, + { + "epoch": 0.8669093297572457, + "grad_norm": 1.0646908555847083, + "learning_rate": 2.151348854114821e-06, + "loss": 0.7295, + "num_input_tokens_seen": 1430077824, + "step": 7919 + }, + { + "epoch": 0.8670188018281836, + "grad_norm": 1.2008484443721503, + "learning_rate": 2.147860577643207e-06, + "loss": 0.9101, + "num_input_tokens_seen": 1430275840, + "step": 7920 + }, + { + "epoch": 0.8671282738991215, + "grad_norm": 1.2755784835445778, + "learning_rate": 2.144375004536012e-06, + "loss": 1.249, + "num_input_tokens_seen": 1430482144, + "step": 7921 + }, + { + "epoch": 0.8672377459700594, + "grad_norm": 1.170535733739548, + "learning_rate": 2.1408921352055496e-06, + "loss": 0.8179, + "num_input_tokens_seen": 1430691136, + "step": 7922 + }, + { + "epoch": 0.8673472180409972, + "grad_norm": 1.1462653761636319, + "learning_rate": 2.1374119700638575e-06, + "loss": 0.8916, + "num_input_tokens_seen": 1430857344, + "step": 7923 + }, + { + "epoch": 0.8674566901119352, + "grad_norm": 1.04151083098451, + "learning_rate": 2.1339345095226144e-06, + "loss": 0.7561, + "num_input_tokens_seen": 1431059168, + "step": 7924 + }, + { + "epoch": 0.8675661621828731, + "grad_norm": 1.0728463578874872, + "learning_rate": 2.1304597539932137e-06, + "loss": 0.7718, + "num_input_tokens_seen": 1431208800, + "step": 7925 + }, + { + "epoch": 0.867675634253811, + "grad_norm": 1.0780837009646818, + "learning_rate": 2.1269877038867013e-06, + "loss": 0.8912, + "num_input_tokens_seen": 1431385088, + "step": 7926 + }, + { + "epoch": 0.8677851063247489, + "grad_norm": 0.9477757604202867, + "learning_rate": 2.1235183596138214e-06, + "loss": 0.941, + "num_input_tokens_seen": 1431588704, + "step": 7927 + }, + { + "epoch": 0.8678945783956868, + "grad_norm": 1.0611011047512835, + "learning_rate": 2.120051721584984e-06, + "loss": 0.7733, + "num_input_tokens_seen": 1431794336, + "step": 7928 + }, + { + "epoch": 0.8680040504666247, + "grad_norm": 1.2535960446550332, + "learning_rate": 2.1165877902102867e-06, + "loss": 1.0319, + "num_input_tokens_seen": 1431978240, + "step": 7929 + }, + { + "epoch": 0.8681135225375626, + "grad_norm": 1.2702417351600808, + "learning_rate": 2.113126565899515e-06, + "loss": 0.8809, + "num_input_tokens_seen": 1432164384, + "step": 7930 + }, + { + "epoch": 0.8682229946085005, + "grad_norm": 1.1563723215920139, + "learning_rate": 2.1096680490621107e-06, + "loss": 1.0229, + "num_input_tokens_seen": 1432323872, + "step": 7931 + }, + { + "epoch": 0.8683324666794384, + "grad_norm": 1.2383820886798917, + "learning_rate": 2.106212240107225e-06, + "loss": 0.8813, + "num_input_tokens_seen": 1432488512, + "step": 7932 + }, + { + "epoch": 0.8684419387503763, + "grad_norm": 1.0141837824868942, + "learning_rate": 2.102759139443658e-06, + "loss": 0.6869, + "num_input_tokens_seen": 1432624928, + "step": 7933 + }, + { + "epoch": 0.8685514108213143, + "grad_norm": 1.1704062349192497, + "learning_rate": 2.0993087474799166e-06, + "loss": 0.9822, + "num_input_tokens_seen": 1432839520, + "step": 7934 + }, + { + "epoch": 0.8686608828922521, + "grad_norm": 1.0869473489084056, + "learning_rate": 2.0958610646241717e-06, + "loss": 1.054, + "num_input_tokens_seen": 1433056576, + "step": 7935 + }, + { + "epoch": 0.86877035496319, + "grad_norm": 1.1470205167915948, + "learning_rate": 2.09241609128428e-06, + "loss": 0.7834, + "num_input_tokens_seen": 1433224128, + "step": 7936 + }, + { + "epoch": 0.8688798270341279, + "grad_norm": 1.1738279218879817, + "learning_rate": 2.0889738278677686e-06, + "loss": 0.887, + "num_input_tokens_seen": 1433401984, + "step": 7937 + }, + { + "epoch": 0.8689892991050658, + "grad_norm": 1.1341528493715518, + "learning_rate": 2.085534274781853e-06, + "loss": 0.7389, + "num_input_tokens_seen": 1433591264, + "step": 7938 + }, + { + "epoch": 0.8690987711760038, + "grad_norm": 1.1715697844719688, + "learning_rate": 2.0820974324334356e-06, + "loss": 0.8667, + "num_input_tokens_seen": 1433762176, + "step": 7939 + }, + { + "epoch": 0.8692082432469416, + "grad_norm": 1.310428847158777, + "learning_rate": 2.0786633012290723e-06, + "loss": 1.0546, + "num_input_tokens_seen": 1433930848, + "step": 7940 + }, + { + "epoch": 0.8693177153178795, + "grad_norm": 1.227295723211582, + "learning_rate": 2.0752318815750265e-06, + "loss": 0.8716, + "num_input_tokens_seen": 1434077120, + "step": 7941 + }, + { + "epoch": 0.8694271873888174, + "grad_norm": 1.0248067301458463, + "learning_rate": 2.0718031738772265e-06, + "loss": 0.7107, + "num_input_tokens_seen": 1434270432, + "step": 7942 + }, + { + "epoch": 0.8695366594597553, + "grad_norm": 1.088205288593136, + "learning_rate": 2.068377178541275e-06, + "loss": 0.6412, + "num_input_tokens_seen": 1434432832, + "step": 7943 + }, + { + "epoch": 0.8696461315306933, + "grad_norm": 1.147853261554649, + "learning_rate": 2.0649538959724686e-06, + "loss": 0.8234, + "num_input_tokens_seen": 1434610240, + "step": 7944 + }, + { + "epoch": 0.8697556036016312, + "grad_norm": 1.0977666398874213, + "learning_rate": 2.0615333265757737e-06, + "loss": 0.6637, + "num_input_tokens_seen": 1434799744, + "step": 7945 + }, + { + "epoch": 0.869865075672569, + "grad_norm": 1.1635810829437168, + "learning_rate": 2.058115470755831e-06, + "loss": 0.6997, + "num_input_tokens_seen": 1434969088, + "step": 7946 + }, + { + "epoch": 0.8699745477435069, + "grad_norm": 1.1543803176798708, + "learning_rate": 2.0547003289169724e-06, + "loss": 0.8498, + "num_input_tokens_seen": 1435127008, + "step": 7947 + }, + { + "epoch": 0.8700840198144448, + "grad_norm": 1.10288667562877, + "learning_rate": 2.0512879014631976e-06, + "loss": 0.8177, + "num_input_tokens_seen": 1435333536, + "step": 7948 + }, + { + "epoch": 0.8701934918853828, + "grad_norm": 1.1852799795463222, + "learning_rate": 2.047878188798197e-06, + "loss": 0.8377, + "num_input_tokens_seen": 1435527296, + "step": 7949 + }, + { + "epoch": 0.8703029639563207, + "grad_norm": 0.99063210710449, + "learning_rate": 2.0444711913253312e-06, + "loss": 0.7697, + "num_input_tokens_seen": 1435710304, + "step": 7950 + }, + { + "epoch": 0.8704124360272586, + "grad_norm": 1.2143777033078296, + "learning_rate": 2.041066909447639e-06, + "loss": 0.8658, + "num_input_tokens_seen": 1435906752, + "step": 7951 + }, + { + "epoch": 0.8705219080981964, + "grad_norm": 1.2270101714330532, + "learning_rate": 2.0376653435678405e-06, + "loss": 0.8749, + "num_input_tokens_seen": 1436108352, + "step": 7952 + }, + { + "epoch": 0.8706313801691343, + "grad_norm": 1.0669217383365888, + "learning_rate": 2.0342664940883353e-06, + "loss": 0.6855, + "num_input_tokens_seen": 1436282400, + "step": 7953 + }, + { + "epoch": 0.8707408522400722, + "grad_norm": 1.1790648073790069, + "learning_rate": 2.030870361411202e-06, + "loss": 1.1256, + "num_input_tokens_seen": 1436473696, + "step": 7954 + }, + { + "epoch": 0.8708503243110102, + "grad_norm": 1.3055444518534713, + "learning_rate": 2.027476945938189e-06, + "loss": 1.1362, + "num_input_tokens_seen": 1436646400, + "step": 7955 + }, + { + "epoch": 0.8709597963819481, + "grad_norm": 0.9880607570102203, + "learning_rate": 2.0240862480707475e-06, + "loss": 0.8049, + "num_input_tokens_seen": 1436832992, + "step": 7956 + }, + { + "epoch": 0.8710692684528859, + "grad_norm": 1.035859757810367, + "learning_rate": 2.0206982682099723e-06, + "loss": 1.0491, + "num_input_tokens_seen": 1437024960, + "step": 7957 + }, + { + "epoch": 0.8711787405238238, + "grad_norm": 1.0665957663238352, + "learning_rate": 2.017313006756666e-06, + "loss": 0.8016, + "num_input_tokens_seen": 1437203040, + "step": 7958 + }, + { + "epoch": 0.8712882125947617, + "grad_norm": 1.1780606097932205, + "learning_rate": 2.0139304641112966e-06, + "loss": 0.8894, + "num_input_tokens_seen": 1437396800, + "step": 7959 + }, + { + "epoch": 0.8713976846656997, + "grad_norm": 1.260277222835568, + "learning_rate": 2.010550640674011e-06, + "loss": 0.9047, + "num_input_tokens_seen": 1437573088, + "step": 7960 + }, + { + "epoch": 0.8715071567366376, + "grad_norm": 1.0131875941186086, + "learning_rate": 2.0071735368446364e-06, + "loss": 0.8187, + "num_input_tokens_seen": 1437775360, + "step": 7961 + }, + { + "epoch": 0.8716166288075755, + "grad_norm": 1.1522257598988999, + "learning_rate": 2.003799153022673e-06, + "loss": 0.8468, + "num_input_tokens_seen": 1437974272, + "step": 7962 + }, + { + "epoch": 0.8717261008785133, + "grad_norm": 1.1375222528817124, + "learning_rate": 2.0004274896073176e-06, + "loss": 0.7723, + "num_input_tokens_seen": 1438170720, + "step": 7963 + }, + { + "epoch": 0.8718355729494512, + "grad_norm": 1.0883573835460265, + "learning_rate": 1.9970585469974127e-06, + "loss": 0.97, + "num_input_tokens_seen": 1438372320, + "step": 7964 + }, + { + "epoch": 0.8719450450203892, + "grad_norm": 1.0204263316940678, + "learning_rate": 1.9936923255915175e-06, + "loss": 0.8117, + "num_input_tokens_seen": 1438589376, + "step": 7965 + }, + { + "epoch": 0.8720545170913271, + "grad_norm": 1.1330162654325313, + "learning_rate": 1.9903288257878292e-06, + "loss": 0.7077, + "num_input_tokens_seen": 1438776416, + "step": 7966 + }, + { + "epoch": 0.872163989162265, + "grad_norm": 1.1146244445539295, + "learning_rate": 1.986968047984261e-06, + "loss": 0.9362, + "num_input_tokens_seen": 1438984064, + "step": 7967 + }, + { + "epoch": 0.8722734612332029, + "grad_norm": 1.1739554693169623, + "learning_rate": 1.983609992578375e-06, + "loss": 0.9842, + "num_input_tokens_seen": 1439177376, + "step": 7968 + }, + { + "epoch": 0.8723829333041407, + "grad_norm": 1.166131708702199, + "learning_rate": 1.9802546599674313e-06, + "loss": 1.1032, + "num_input_tokens_seen": 1439351424, + "step": 7969 + }, + { + "epoch": 0.8724924053750787, + "grad_norm": 1.1597599675678787, + "learning_rate": 1.9769020505483544e-06, + "loss": 0.9115, + "num_input_tokens_seen": 1439527040, + "step": 7970 + }, + { + "epoch": 0.8726018774460166, + "grad_norm": 1.303656164726055, + "learning_rate": 1.973552164717746e-06, + "loss": 0.9054, + "num_input_tokens_seen": 1439734912, + "step": 7971 + }, + { + "epoch": 0.8727113495169545, + "grad_norm": 1.1008947526448938, + "learning_rate": 1.9702050028719056e-06, + "loss": 0.9975, + "num_input_tokens_seen": 1439918816, + "step": 7972 + }, + { + "epoch": 0.8728208215878924, + "grad_norm": 1.1970097080050561, + "learning_rate": 1.9668605654067805e-06, + "loss": 0.8377, + "num_input_tokens_seen": 1440094208, + "step": 7973 + }, + { + "epoch": 0.8729302936588302, + "grad_norm": 1.1772063480789323, + "learning_rate": 1.9635188527180244e-06, + "loss": 0.9173, + "num_input_tokens_seen": 1440272512, + "step": 7974 + }, + { + "epoch": 0.8730397657297682, + "grad_norm": 1.1611789601157378, + "learning_rate": 1.960179865200948e-06, + "loss": 0.7722, + "num_input_tokens_seen": 1440415872, + "step": 7975 + }, + { + "epoch": 0.8731492378007061, + "grad_norm": 1.129400758771471, + "learning_rate": 1.9568436032505493e-06, + "loss": 0.83, + "num_input_tokens_seen": 1440602688, + "step": 7976 + }, + { + "epoch": 0.873258709871644, + "grad_norm": 1.2654948226479166, + "learning_rate": 1.953510067261499e-06, + "loss": 0.897, + "num_input_tokens_seen": 1440794656, + "step": 7977 + }, + { + "epoch": 0.8733681819425819, + "grad_norm": 1.1922968803014928, + "learning_rate": 1.950179257628154e-06, + "loss": 1.0808, + "num_input_tokens_seen": 1440986400, + "step": 7978 + }, + { + "epoch": 0.8734776540135198, + "grad_norm": 1.3332134381668523, + "learning_rate": 1.946851174744538e-06, + "loss": 0.9389, + "num_input_tokens_seen": 1441177024, + "step": 7979 + }, + { + "epoch": 0.8735871260844577, + "grad_norm": 1.0544448573425775, + "learning_rate": 1.943525819004352e-06, + "loss": 0.8517, + "num_input_tokens_seen": 1441377504, + "step": 7980 + }, + { + "epoch": 0.8736965981553956, + "grad_norm": 1.0922893034677639, + "learning_rate": 1.9402031908009904e-06, + "loss": 0.9728, + "num_input_tokens_seen": 1441543040, + "step": 7981 + }, + { + "epoch": 0.8738060702263335, + "grad_norm": 1.2332705131632722, + "learning_rate": 1.936883290527508e-06, + "loss": 0.9247, + "num_input_tokens_seen": 1441724256, + "step": 7982 + }, + { + "epoch": 0.8739155422972714, + "grad_norm": 1.1144270505104512, + "learning_rate": 1.9335661185766436e-06, + "loss": 0.8652, + "num_input_tokens_seen": 1441917344, + "step": 7983 + }, + { + "epoch": 0.8740250143682093, + "grad_norm": 1.1309940343610096, + "learning_rate": 1.9302516753408136e-06, + "loss": 0.8265, + "num_input_tokens_seen": 1442082432, + "step": 7984 + }, + { + "epoch": 0.8741344864391473, + "grad_norm": 1.1770845205025326, + "learning_rate": 1.926939961212107e-06, + "loss": 0.8603, + "num_input_tokens_seen": 1442282240, + "step": 7985 + }, + { + "epoch": 0.8742439585100851, + "grad_norm": 1.2169893703388293, + "learning_rate": 1.923630976582294e-06, + "loss": 1.0881, + "num_input_tokens_seen": 1442443296, + "step": 7986 + }, + { + "epoch": 0.874353430581023, + "grad_norm": 1.1622806711332638, + "learning_rate": 1.9203247218428226e-06, + "loss": 1.0638, + "num_input_tokens_seen": 1442633920, + "step": 7987 + }, + { + "epoch": 0.8744629026519609, + "grad_norm": 1.2349090386688448, + "learning_rate": 1.9170211973848106e-06, + "loss": 1.1263, + "num_input_tokens_seen": 1442801696, + "step": 7988 + }, + { + "epoch": 0.8745723747228988, + "grad_norm": 1.034829545197875, + "learning_rate": 1.9137204035990704e-06, + "loss": 0.8317, + "num_input_tokens_seen": 1442992544, + "step": 7989 + }, + { + "epoch": 0.8746818467938368, + "grad_norm": 1.0948763905610122, + "learning_rate": 1.9104223408760698e-06, + "loss": 0.805, + "num_input_tokens_seen": 1443156288, + "step": 7990 + }, + { + "epoch": 0.8747913188647746, + "grad_norm": 1.3052907223080026, + "learning_rate": 1.907127009605969e-06, + "loss": 0.9913, + "num_input_tokens_seen": 1443312416, + "step": 7991 + }, + { + "epoch": 0.8749007909357125, + "grad_norm": 1.2560002795206962, + "learning_rate": 1.9038344101785954e-06, + "loss": 0.9264, + "num_input_tokens_seen": 1443474368, + "step": 7992 + }, + { + "epoch": 0.8750102630066504, + "grad_norm": 1.184384575958593, + "learning_rate": 1.9005445429834595e-06, + "loss": 0.8144, + "num_input_tokens_seen": 1443650656, + "step": 7993 + }, + { + "epoch": 0.8751197350775883, + "grad_norm": 1.0493638359416575, + "learning_rate": 1.8972574084097472e-06, + "loss": 1.0021, + "num_input_tokens_seen": 1443844640, + "step": 7994 + }, + { + "epoch": 0.8752292071485263, + "grad_norm": 1.1511206391229458, + "learning_rate": 1.8939730068463114e-06, + "loss": 0.955, + "num_input_tokens_seen": 1444039744, + "step": 7995 + }, + { + "epoch": 0.8753386792194642, + "grad_norm": 1.2266407352450133, + "learning_rate": 1.8906913386817077e-06, + "loss": 0.8502, + "num_input_tokens_seen": 1444210880, + "step": 7996 + }, + { + "epoch": 0.875448151290402, + "grad_norm": 1.0889683424146224, + "learning_rate": 1.8874124043041314e-06, + "loss": 0.8335, + "num_input_tokens_seen": 1444397248, + "step": 7997 + }, + { + "epoch": 0.8755576233613399, + "grad_norm": 0.9977786306480476, + "learning_rate": 1.8841362041014944e-06, + "loss": 0.6665, + "num_input_tokens_seen": 1444570624, + "step": 7998 + }, + { + "epoch": 0.8756670954322778, + "grad_norm": 1.1491373226484773, + "learning_rate": 1.8808627384613448e-06, + "loss": 1.1255, + "num_input_tokens_seen": 1444762592, + "step": 7999 + }, + { + "epoch": 0.8757765675032158, + "grad_norm": 1.1700819412294525, + "learning_rate": 1.8775920077709397e-06, + "loss": 1.0141, + "num_input_tokens_seen": 1444942688, + "step": 8000 + }, + { + "epoch": 0.8758860395741537, + "grad_norm": 1.08836318869385, + "learning_rate": 1.8743240124172002e-06, + "loss": 0.6989, + "num_input_tokens_seen": 1445108672, + "step": 8001 + }, + { + "epoch": 0.8759955116450916, + "grad_norm": 1.1750846142434654, + "learning_rate": 1.8710587527867196e-06, + "loss": 0.7678, + "num_input_tokens_seen": 1445253152, + "step": 8002 + }, + { + "epoch": 0.8761049837160294, + "grad_norm": 1.0846986359134796, + "learning_rate": 1.8677962292657724e-06, + "loss": 0.7638, + "num_input_tokens_seen": 1445420032, + "step": 8003 + }, + { + "epoch": 0.8762144557869673, + "grad_norm": 1.157264934177253, + "learning_rate": 1.8645364422403083e-06, + "loss": 0.7675, + "num_input_tokens_seen": 1445598560, + "step": 8004 + }, + { + "epoch": 0.8763239278579052, + "grad_norm": 1.3302316451015277, + "learning_rate": 1.8612793920959632e-06, + "loss": 0.9291, + "num_input_tokens_seen": 1445749088, + "step": 8005 + }, + { + "epoch": 0.8764333999288432, + "grad_norm": 1.0556453734939413, + "learning_rate": 1.8580250792180232e-06, + "loss": 0.7447, + "num_input_tokens_seen": 1445935904, + "step": 8006 + }, + { + "epoch": 0.8765428719997811, + "grad_norm": 1.1841564792156363, + "learning_rate": 1.8547735039914859e-06, + "loss": 1.0742, + "num_input_tokens_seen": 1446120928, + "step": 8007 + }, + { + "epoch": 0.8766523440707189, + "grad_norm": 1.2467862300775732, + "learning_rate": 1.8515246668009883e-06, + "loss": 0.9839, + "num_input_tokens_seen": 1446295424, + "step": 8008 + }, + { + "epoch": 0.8767618161416568, + "grad_norm": 1.1729508103075696, + "learning_rate": 1.8482785680308728e-06, + "loss": 0.8744, + "num_input_tokens_seen": 1446488960, + "step": 8009 + }, + { + "epoch": 0.8768712882125947, + "grad_norm": 1.1368726383664944, + "learning_rate": 1.845035208065146e-06, + "loss": 0.8698, + "num_input_tokens_seen": 1446676000, + "step": 8010 + }, + { + "epoch": 0.8769807602835327, + "grad_norm": 1.179958287164526, + "learning_rate": 1.8417945872874875e-06, + "loss": 0.8216, + "num_input_tokens_seen": 1446831232, + "step": 8011 + }, + { + "epoch": 0.8770902323544706, + "grad_norm": 1.039574747808217, + "learning_rate": 1.8385567060812598e-06, + "loss": 1.0143, + "num_input_tokens_seen": 1447016928, + "step": 8012 + }, + { + "epoch": 0.8771997044254085, + "grad_norm": 1.0674218452485438, + "learning_rate": 1.8353215648294925e-06, + "loss": 0.7935, + "num_input_tokens_seen": 1447212928, + "step": 8013 + }, + { + "epoch": 0.8773091764963463, + "grad_norm": 1.0561552219642647, + "learning_rate": 1.8320891639149101e-06, + "loss": 0.8023, + "num_input_tokens_seen": 1447387424, + "step": 8014 + }, + { + "epoch": 0.8774186485672842, + "grad_norm": 1.2228451908849138, + "learning_rate": 1.828859503719879e-06, + "loss": 0.8365, + "num_input_tokens_seen": 1447562592, + "step": 8015 + }, + { + "epoch": 0.8775281206382222, + "grad_norm": 1.0606581487291635, + "learning_rate": 1.82563258462648e-06, + "loss": 0.7126, + "num_input_tokens_seen": 1447718720, + "step": 8016 + }, + { + "epoch": 0.8776375927091601, + "grad_norm": 1.084488100425401, + "learning_rate": 1.8224084070164405e-06, + "loss": 0.7217, + "num_input_tokens_seen": 1447912704, + "step": 8017 + }, + { + "epoch": 0.877747064780098, + "grad_norm": 1.1510008665192604, + "learning_rate": 1.8191869712711807e-06, + "loss": 1.1072, + "num_input_tokens_seen": 1448108928, + "step": 8018 + }, + { + "epoch": 0.8778565368510359, + "grad_norm": 1.282124476788706, + "learning_rate": 1.81596827777179e-06, + "loss": 1.1257, + "num_input_tokens_seen": 1448298880, + "step": 8019 + }, + { + "epoch": 0.8779660089219737, + "grad_norm": 1.0958602710985912, + "learning_rate": 1.8127523268990282e-06, + "loss": 1.0175, + "num_input_tokens_seen": 1448476064, + "step": 8020 + }, + { + "epoch": 0.8780754809929117, + "grad_norm": 1.2285677318105495, + "learning_rate": 1.8095391190333404e-06, + "loss": 1.049, + "num_input_tokens_seen": 1448653248, + "step": 8021 + }, + { + "epoch": 0.8781849530638496, + "grad_norm": 1.1837723096862038, + "learning_rate": 1.8063286545548398e-06, + "loss": 0.6419, + "num_input_tokens_seen": 1448834688, + "step": 8022 + }, + { + "epoch": 0.8782944251347875, + "grad_norm": 1.1666068618069043, + "learning_rate": 1.8031209338433246e-06, + "loss": 1.2523, + "num_input_tokens_seen": 1449035392, + "step": 8023 + }, + { + "epoch": 0.8784038972057254, + "grad_norm": 1.1510585536264644, + "learning_rate": 1.799915957278256e-06, + "loss": 0.9236, + "num_input_tokens_seen": 1449248640, + "step": 8024 + }, + { + "epoch": 0.8785133692766632, + "grad_norm": 1.3345005271170223, + "learning_rate": 1.79671372523878e-06, + "loss": 1.3181, + "num_input_tokens_seen": 1449445760, + "step": 8025 + }, + { + "epoch": 0.8786228413476012, + "grad_norm": 1.2429702979365311, + "learning_rate": 1.7935142381037135e-06, + "loss": 0.7695, + "num_input_tokens_seen": 1449598304, + "step": 8026 + }, + { + "epoch": 0.8787323134185391, + "grad_norm": 1.0301201150355024, + "learning_rate": 1.7903174962515478e-06, + "loss": 0.6428, + "num_input_tokens_seen": 1449779296, + "step": 8027 + }, + { + "epoch": 0.878841785489477, + "grad_norm": 1.1388443412183622, + "learning_rate": 1.7871235000604503e-06, + "loss": 0.8285, + "num_input_tokens_seen": 1449953344, + "step": 8028 + }, + { + "epoch": 0.8789512575604149, + "grad_norm": 1.2664036651340205, + "learning_rate": 1.7839322499082738e-06, + "loss": 0.8732, + "num_input_tokens_seen": 1450129184, + "step": 8029 + }, + { + "epoch": 0.8790607296313528, + "grad_norm": 1.070845478201743, + "learning_rate": 1.7807437461725252e-06, + "loss": 0.9521, + "num_input_tokens_seen": 1450331008, + "step": 8030 + }, + { + "epoch": 0.8791702017022907, + "grad_norm": 1.2825548459199176, + "learning_rate": 1.7775579892304051e-06, + "loss": 1.0078, + "num_input_tokens_seen": 1450483776, + "step": 8031 + }, + { + "epoch": 0.8792796737732286, + "grad_norm": 1.2019359608473184, + "learning_rate": 1.7743749794587817e-06, + "loss": 0.8961, + "num_input_tokens_seen": 1450654688, + "step": 8032 + }, + { + "epoch": 0.8793891458441665, + "grad_norm": 1.1173147382505202, + "learning_rate": 1.7711947172342009e-06, + "loss": 0.7725, + "num_input_tokens_seen": 1450849568, + "step": 8033 + }, + { + "epoch": 0.8794986179151044, + "grad_norm": 0.9970377618779979, + "learning_rate": 1.7680172029328757e-06, + "loss": 0.7195, + "num_input_tokens_seen": 1451050720, + "step": 8034 + }, + { + "epoch": 0.8796080899860423, + "grad_norm": 1.1265221999918975, + "learning_rate": 1.7648424369307e-06, + "loss": 1.1587, + "num_input_tokens_seen": 1451231264, + "step": 8035 + }, + { + "epoch": 0.8797175620569803, + "grad_norm": 1.2542360531611485, + "learning_rate": 1.7616704196032564e-06, + "loss": 1.1482, + "num_input_tokens_seen": 1451401504, + "step": 8036 + }, + { + "epoch": 0.8798270341279181, + "grad_norm": 1.0193566066782882, + "learning_rate": 1.758501151325767e-06, + "loss": 0.7707, + "num_input_tokens_seen": 1451575104, + "step": 8037 + }, + { + "epoch": 0.879936506198856, + "grad_norm": 0.9785444365807299, + "learning_rate": 1.7553346324731712e-06, + "loss": 0.6864, + "num_input_tokens_seen": 1451754528, + "step": 8038 + }, + { + "epoch": 0.8800459782697939, + "grad_norm": 0.9873921914946849, + "learning_rate": 1.7521708634200413e-06, + "loss": 0.7789, + "num_input_tokens_seen": 1451941568, + "step": 8039 + }, + { + "epoch": 0.8801554503407318, + "grad_norm": 1.0770969741715037, + "learning_rate": 1.7490098445406667e-06, + "loss": 0.898, + "num_input_tokens_seen": 1452157504, + "step": 8040 + }, + { + "epoch": 0.8802649224116698, + "grad_norm": 1.1370240945133512, + "learning_rate": 1.7458515762089706e-06, + "loss": 0.7523, + "num_input_tokens_seen": 1452342528, + "step": 8041 + }, + { + "epoch": 0.8803743944826076, + "grad_norm": 1.170877413525155, + "learning_rate": 1.742696058798582e-06, + "loss": 0.9647, + "num_input_tokens_seen": 1452514784, + "step": 8042 + }, + { + "epoch": 0.8804838665535455, + "grad_norm": 1.310028564927927, + "learning_rate": 1.7395432926827909e-06, + "loss": 0.7674, + "num_input_tokens_seen": 1452692192, + "step": 8043 + }, + { + "epoch": 0.8805933386244834, + "grad_norm": 1.2201599377402874, + "learning_rate": 1.7363932782345603e-06, + "loss": 0.7177, + "num_input_tokens_seen": 1452884384, + "step": 8044 + }, + { + "epoch": 0.8807028106954213, + "grad_norm": 1.3100625065458247, + "learning_rate": 1.7332460158265313e-06, + "loss": 0.9239, + "num_input_tokens_seen": 1453081504, + "step": 8045 + }, + { + "epoch": 0.8808122827663593, + "grad_norm": 1.0254152131538572, + "learning_rate": 1.7301015058310194e-06, + "loss": 0.705, + "num_input_tokens_seen": 1453268320, + "step": 8046 + }, + { + "epoch": 0.8809217548372972, + "grad_norm": 1.0339276778497042, + "learning_rate": 1.726959748620019e-06, + "loss": 0.6399, + "num_input_tokens_seen": 1453453792, + "step": 8047 + }, + { + "epoch": 0.881031226908235, + "grad_norm": 1.1473676408215223, + "learning_rate": 1.7238207445651855e-06, + "loss": 0.8233, + "num_input_tokens_seen": 1453638144, + "step": 8048 + }, + { + "epoch": 0.8811406989791729, + "grad_norm": 1.143861385820952, + "learning_rate": 1.7206844940378636e-06, + "loss": 0.7879, + "num_input_tokens_seen": 1453804800, + "step": 8049 + }, + { + "epoch": 0.8812501710501108, + "grad_norm": 1.084327602175102, + "learning_rate": 1.7175509974090647e-06, + "loss": 0.9437, + "num_input_tokens_seen": 1453989600, + "step": 8050 + }, + { + "epoch": 0.8813596431210488, + "grad_norm": 1.2197747934913448, + "learning_rate": 1.714420255049473e-06, + "loss": 0.866, + "num_input_tokens_seen": 1454163872, + "step": 8051 + }, + { + "epoch": 0.8814691151919867, + "grad_norm": 1.1064092726195462, + "learning_rate": 1.7112922673294507e-06, + "loss": 0.9615, + "num_input_tokens_seen": 1454343296, + "step": 8052 + }, + { + "epoch": 0.8815785872629246, + "grad_norm": 1.0681881392529873, + "learning_rate": 1.708167034619032e-06, + "loss": 0.7319, + "num_input_tokens_seen": 1454501888, + "step": 8053 + }, + { + "epoch": 0.8816880593338624, + "grad_norm": 1.1710178037604457, + "learning_rate": 1.705044557287927e-06, + "loss": 0.9225, + "num_input_tokens_seen": 1454681536, + "step": 8054 + }, + { + "epoch": 0.8817975314048003, + "grad_norm": 1.139511296275289, + "learning_rate": 1.701924835705515e-06, + "loss": 0.8514, + "num_input_tokens_seen": 1454873952, + "step": 8055 + }, + { + "epoch": 0.8819070034757382, + "grad_norm": 1.202507702888038, + "learning_rate": 1.6988078702408622e-06, + "loss": 0.9843, + "num_input_tokens_seen": 1455029408, + "step": 8056 + }, + { + "epoch": 0.8820164755466762, + "grad_norm": 0.9977741204684214, + "learning_rate": 1.6956936612626928e-06, + "loss": 0.6851, + "num_input_tokens_seen": 1455220704, + "step": 8057 + }, + { + "epoch": 0.8821259476176141, + "grad_norm": 1.1790738056853078, + "learning_rate": 1.6925822091394121e-06, + "loss": 0.8278, + "num_input_tokens_seen": 1455431936, + "step": 8058 + }, + { + "epoch": 0.8822354196885519, + "grad_norm": 0.9925363182802512, + "learning_rate": 1.689473514239101e-06, + "loss": 0.8114, + "num_input_tokens_seen": 1455650784, + "step": 8059 + }, + { + "epoch": 0.8823448917594898, + "grad_norm": 1.201720817860744, + "learning_rate": 1.6863675769295096e-06, + "loss": 1.0355, + "num_input_tokens_seen": 1455812288, + "step": 8060 + }, + { + "epoch": 0.8824543638304277, + "grad_norm": 1.1161358558587393, + "learning_rate": 1.683264397578066e-06, + "loss": 0.6941, + "num_input_tokens_seen": 1455980512, + "step": 8061 + }, + { + "epoch": 0.8825638359013657, + "grad_norm": 1.0608780767010306, + "learning_rate": 1.6801639765518712e-06, + "loss": 0.7013, + "num_input_tokens_seen": 1456156800, + "step": 8062 + }, + { + "epoch": 0.8826733079723036, + "grad_norm": 1.1568711648524028, + "learning_rate": 1.6770663142176957e-06, + "loss": 0.9795, + "num_input_tokens_seen": 1456328608, + "step": 8063 + }, + { + "epoch": 0.8827827800432415, + "grad_norm": 1.2188892773983373, + "learning_rate": 1.6739714109419907e-06, + "loss": 0.8617, + "num_input_tokens_seen": 1456512736, + "step": 8064 + }, + { + "epoch": 0.8828922521141793, + "grad_norm": 1.1281504493442311, + "learning_rate": 1.6708792670908746e-06, + "loss": 0.8722, + "num_input_tokens_seen": 1456709408, + "step": 8065 + }, + { + "epoch": 0.8830017241851172, + "grad_norm": 1.1169324463691603, + "learning_rate": 1.6677898830301463e-06, + "loss": 0.8306, + "num_input_tokens_seen": 1456896672, + "step": 8066 + }, + { + "epoch": 0.8831111962560552, + "grad_norm": 1.1583645532823794, + "learning_rate": 1.664703259125272e-06, + "loss": 0.7984, + "num_input_tokens_seen": 1457034208, + "step": 8067 + }, + { + "epoch": 0.8832206683269931, + "grad_norm": 1.067102728809796, + "learning_rate": 1.66161939574139e-06, + "loss": 0.7835, + "num_input_tokens_seen": 1457187648, + "step": 8068 + }, + { + "epoch": 0.883330140397931, + "grad_norm": 1.150693584762331, + "learning_rate": 1.6585382932433197e-06, + "loss": 1.1522, + "num_input_tokens_seen": 1457349376, + "step": 8069 + }, + { + "epoch": 0.8834396124688689, + "grad_norm": 0.968599276969194, + "learning_rate": 1.6554599519955417e-06, + "loss": 1.0939, + "num_input_tokens_seen": 1457573376, + "step": 8070 + }, + { + "epoch": 0.8835490845398067, + "grad_norm": 1.2791306249322814, + "learning_rate": 1.652384372362234e-06, + "loss": 1.0179, + "num_input_tokens_seen": 1457766240, + "step": 8071 + }, + { + "epoch": 0.8836585566107447, + "grad_norm": 1.1772150061674587, + "learning_rate": 1.649311554707214e-06, + "loss": 0.7176, + "num_input_tokens_seen": 1457947232, + "step": 8072 + }, + { + "epoch": 0.8837680286816826, + "grad_norm": 1.0209628756066522, + "learning_rate": 1.6462414993940023e-06, + "loss": 0.8226, + "num_input_tokens_seen": 1458161600, + "step": 8073 + }, + { + "epoch": 0.8838775007526205, + "grad_norm": 1.201623698289508, + "learning_rate": 1.6431742067857775e-06, + "loss": 0.8731, + "num_input_tokens_seen": 1458339232, + "step": 8074 + }, + { + "epoch": 0.8839869728235584, + "grad_norm": 1.193628159142014, + "learning_rate": 1.6401096772453912e-06, + "loss": 0.6466, + "num_input_tokens_seen": 1458492224, + "step": 8075 + }, + { + "epoch": 0.8840964448944962, + "grad_norm": 1.2496417008917415, + "learning_rate": 1.6370479111353754e-06, + "loss": 0.8376, + "num_input_tokens_seen": 1458677472, + "step": 8076 + }, + { + "epoch": 0.8842059169654342, + "grad_norm": 1.1849772107472085, + "learning_rate": 1.633988908817924e-06, + "loss": 1.002, + "num_input_tokens_seen": 1458847264, + "step": 8077 + }, + { + "epoch": 0.8843153890363721, + "grad_norm": 1.0543091271880354, + "learning_rate": 1.6309326706549221e-06, + "loss": 0.8866, + "num_input_tokens_seen": 1459009664, + "step": 8078 + }, + { + "epoch": 0.88442486110731, + "grad_norm": 1.015359579763569, + "learning_rate": 1.627879197007906e-06, + "loss": 0.8243, + "num_input_tokens_seen": 1459196928, + "step": 8079 + }, + { + "epoch": 0.8845343331782479, + "grad_norm": 1.1254245698649685, + "learning_rate": 1.6248284882381087e-06, + "loss": 0.7807, + "num_input_tokens_seen": 1459371648, + "step": 8080 + }, + { + "epoch": 0.8846438052491858, + "grad_norm": 1.0895923611696656, + "learning_rate": 1.6217805447064083e-06, + "loss": 0.9257, + "num_input_tokens_seen": 1459573248, + "step": 8081 + }, + { + "epoch": 0.8847532773201237, + "grad_norm": 1.084134423120733, + "learning_rate": 1.6187353667733856e-06, + "loss": 0.8314, + "num_input_tokens_seen": 1459762080, + "step": 8082 + }, + { + "epoch": 0.8848627493910616, + "grad_norm": 1.1857602274689092, + "learning_rate": 1.6156929547992638e-06, + "loss": 1.12, + "num_input_tokens_seen": 1459927840, + "step": 8083 + }, + { + "epoch": 0.8849722214619995, + "grad_norm": 1.0923774826871921, + "learning_rate": 1.6126533091439661e-06, + "loss": 0.672, + "num_input_tokens_seen": 1460118912, + "step": 8084 + }, + { + "epoch": 0.8850816935329374, + "grad_norm": 1.4222258250286908, + "learning_rate": 1.6096164301670712e-06, + "loss": 0.7913, + "num_input_tokens_seen": 1460306400, + "step": 8085 + }, + { + "epoch": 0.8851911656038753, + "grad_norm": 1.0513681807523059, + "learning_rate": 1.6065823182278366e-06, + "loss": 0.722, + "num_input_tokens_seen": 1460497248, + "step": 8086 + }, + { + "epoch": 0.8853006376748133, + "grad_norm": 1.1549104715303113, + "learning_rate": 1.6035509736851973e-06, + "loss": 0.9942, + "num_input_tokens_seen": 1460651808, + "step": 8087 + }, + { + "epoch": 0.8854101097457511, + "grad_norm": 1.343205541270074, + "learning_rate": 1.6005223968977468e-06, + "loss": 0.9091, + "num_input_tokens_seen": 1460810400, + "step": 8088 + }, + { + "epoch": 0.885519581816689, + "grad_norm": 1.260488518592631, + "learning_rate": 1.597496588223768e-06, + "loss": 0.9068, + "num_input_tokens_seen": 1460989152, + "step": 8089 + }, + { + "epoch": 0.8856290538876269, + "grad_norm": 1.2248573356387886, + "learning_rate": 1.5944735480212026e-06, + "loss": 0.8857, + "num_input_tokens_seen": 1461173056, + "step": 8090 + }, + { + "epoch": 0.8857385259585648, + "grad_norm": 1.2308686106246651, + "learning_rate": 1.5914532766476753e-06, + "loss": 0.818, + "num_input_tokens_seen": 1461334112, + "step": 8091 + }, + { + "epoch": 0.8858479980295028, + "grad_norm": 1.310405240314844, + "learning_rate": 1.5884357744604756e-06, + "loss": 1.0169, + "num_input_tokens_seen": 1461464928, + "step": 8092 + }, + { + "epoch": 0.8859574701004407, + "grad_norm": 1.115203857004318, + "learning_rate": 1.5854210418165678e-06, + "loss": 1.1004, + "num_input_tokens_seen": 1461685120, + "step": 8093 + }, + { + "epoch": 0.8860669421713785, + "grad_norm": 1.1170864459663992, + "learning_rate": 1.5824090790725887e-06, + "loss": 0.6956, + "num_input_tokens_seen": 1461862976, + "step": 8094 + }, + { + "epoch": 0.8861764142423164, + "grad_norm": 0.9880486316353108, + "learning_rate": 1.579399886584848e-06, + "loss": 0.8613, + "num_input_tokens_seen": 1462030752, + "step": 8095 + }, + { + "epoch": 0.8862858863132543, + "grad_norm": 1.087784493164446, + "learning_rate": 1.5763934647093275e-06, + "loss": 0.8494, + "num_input_tokens_seen": 1462192256, + "step": 8096 + }, + { + "epoch": 0.8863953583841923, + "grad_norm": 1.4268200578108003, + "learning_rate": 1.5733898138016845e-06, + "loss": 1.0877, + "num_input_tokens_seen": 1462351744, + "step": 8097 + }, + { + "epoch": 0.8865048304551302, + "grad_norm": 1.0647046833090466, + "learning_rate": 1.5703889342172401e-06, + "loss": 0.8029, + "num_input_tokens_seen": 1462551776, + "step": 8098 + }, + { + "epoch": 0.886614302526068, + "grad_norm": 1.1013667155841669, + "learning_rate": 1.5673908263109966e-06, + "loss": 0.859, + "num_input_tokens_seen": 1462700288, + "step": 8099 + }, + { + "epoch": 0.8867237745970059, + "grad_norm": 1.217953201400237, + "learning_rate": 1.564395490437623e-06, + "loss": 0.963, + "num_input_tokens_seen": 1462899872, + "step": 8100 + }, + { + "epoch": 0.8868332466679438, + "grad_norm": 1.0502610744881298, + "learning_rate": 1.5614029269514634e-06, + "loss": 0.9545, + "num_input_tokens_seen": 1463092736, + "step": 8101 + }, + { + "epoch": 0.8869427187388818, + "grad_norm": 1.1916432129390342, + "learning_rate": 1.558413136206527e-06, + "loss": 0.9477, + "num_input_tokens_seen": 1463301728, + "step": 8102 + }, + { + "epoch": 0.8870521908098197, + "grad_norm": 1.211505977954356, + "learning_rate": 1.5554261185565022e-06, + "loss": 0.6839, + "num_input_tokens_seen": 1463452928, + "step": 8103 + }, + { + "epoch": 0.8871616628807576, + "grad_norm": 1.1289152257225898, + "learning_rate": 1.552441874354757e-06, + "loss": 0.8789, + "num_input_tokens_seen": 1463607488, + "step": 8104 + }, + { + "epoch": 0.8872711349516954, + "grad_norm": 1.1810399065438046, + "learning_rate": 1.5494604039543058e-06, + "loss": 0.6964, + "num_input_tokens_seen": 1463797888, + "step": 8105 + }, + { + "epoch": 0.8873806070226333, + "grad_norm": 1.1637977964786503, + "learning_rate": 1.5464817077078614e-06, + "loss": 0.8435, + "num_input_tokens_seen": 1463946624, + "step": 8106 + }, + { + "epoch": 0.8874900790935712, + "grad_norm": 1.2264699718782133, + "learning_rate": 1.543505785967797e-06, + "loss": 0.9518, + "num_input_tokens_seen": 1464126720, + "step": 8107 + }, + { + "epoch": 0.8875995511645092, + "grad_norm": 1.0085664634647986, + "learning_rate": 1.5405326390861562e-06, + "loss": 0.6915, + "num_input_tokens_seen": 1464311968, + "step": 8108 + }, + { + "epoch": 0.8877090232354471, + "grad_norm": 1.1545367048235102, + "learning_rate": 1.5375622674146577e-06, + "loss": 0.9181, + "num_input_tokens_seen": 1464499680, + "step": 8109 + }, + { + "epoch": 0.887818495306385, + "grad_norm": 1.1772352587574286, + "learning_rate": 1.5345946713046872e-06, + "loss": 0.9051, + "num_input_tokens_seen": 1464691424, + "step": 8110 + }, + { + "epoch": 0.8879279673773228, + "grad_norm": 1.3691434982639699, + "learning_rate": 1.5316298511073164e-06, + "loss": 0.7612, + "num_input_tokens_seen": 1464858304, + "step": 8111 + }, + { + "epoch": 0.8880374394482607, + "grad_norm": 1.196457906212925, + "learning_rate": 1.528667807173262e-06, + "loss": 1.09, + "num_input_tokens_seen": 1465057888, + "step": 8112 + }, + { + "epoch": 0.8881469115191987, + "grad_norm": 1.022606257301523, + "learning_rate": 1.5257085398529436e-06, + "loss": 0.6979, + "num_input_tokens_seen": 1465237536, + "step": 8113 + }, + { + "epoch": 0.8882563835901366, + "grad_norm": 1.100340848913434, + "learning_rate": 1.5227520494964232e-06, + "loss": 0.8239, + "num_input_tokens_seen": 1465443392, + "step": 8114 + }, + { + "epoch": 0.8883658556610745, + "grad_norm": 1.1962556297145859, + "learning_rate": 1.5197983364534597e-06, + "loss": 0.7103, + "num_input_tokens_seen": 1465636704, + "step": 8115 + }, + { + "epoch": 0.8884753277320123, + "grad_norm": 1.062989066267797, + "learning_rate": 1.5168474010734622e-06, + "loss": 0.6803, + "num_input_tokens_seen": 1465819488, + "step": 8116 + }, + { + "epoch": 0.8885847998029502, + "grad_norm": 1.1989500260362194, + "learning_rate": 1.5138992437055299e-06, + "loss": 0.8589, + "num_input_tokens_seen": 1466013472, + "step": 8117 + }, + { + "epoch": 0.8886942718738882, + "grad_norm": 1.1359480509733677, + "learning_rate": 1.5109538646984167e-06, + "loss": 0.8557, + "num_input_tokens_seen": 1466190656, + "step": 8118 + }, + { + "epoch": 0.8888037439448261, + "grad_norm": 1.1076548497808245, + "learning_rate": 1.5080112644005523e-06, + "loss": 1.0078, + "num_input_tokens_seen": 1466368736, + "step": 8119 + }, + { + "epoch": 0.888913216015764, + "grad_norm": 1.106145644788579, + "learning_rate": 1.5050714431600554e-06, + "loss": 0.888, + "num_input_tokens_seen": 1466570336, + "step": 8120 + }, + { + "epoch": 0.8890226880867019, + "grad_norm": 1.190638560620877, + "learning_rate": 1.502134401324687e-06, + "loss": 0.85, + "num_input_tokens_seen": 1466713696, + "step": 8121 + }, + { + "epoch": 0.8891321601576397, + "grad_norm": 1.1796165091267716, + "learning_rate": 1.499200139241902e-06, + "loss": 0.9244, + "num_input_tokens_seen": 1466895360, + "step": 8122 + }, + { + "epoch": 0.8892416322285777, + "grad_norm": 1.0973275666562383, + "learning_rate": 1.4962686572588091e-06, + "loss": 0.9121, + "num_input_tokens_seen": 1467075008, + "step": 8123 + }, + { + "epoch": 0.8893511042995156, + "grad_norm": 1.064999613336484, + "learning_rate": 1.493339955722206e-06, + "loss": 0.8043, + "num_input_tokens_seen": 1467251744, + "step": 8124 + }, + { + "epoch": 0.8894605763704535, + "grad_norm": 1.013216596318676, + "learning_rate": 1.4904140349785488e-06, + "loss": 1.006, + "num_input_tokens_seen": 1467444608, + "step": 8125 + }, + { + "epoch": 0.8895700484413914, + "grad_norm": 1.1488572475940717, + "learning_rate": 1.4874908953739691e-06, + "loss": 0.882, + "num_input_tokens_seen": 1467621344, + "step": 8126 + }, + { + "epoch": 0.8896795205123293, + "grad_norm": 1.1341019756376958, + "learning_rate": 1.4845705372542707e-06, + "loss": 0.6848, + "num_input_tokens_seen": 1467790240, + "step": 8127 + }, + { + "epoch": 0.8897889925832672, + "grad_norm": 1.1128468519274008, + "learning_rate": 1.481652960964916e-06, + "loss": 0.7481, + "num_input_tokens_seen": 1467948384, + "step": 8128 + }, + { + "epoch": 0.8898984646542051, + "grad_norm": 1.0113784040828226, + "learning_rate": 1.4787381668510653e-06, + "loss": 0.7216, + "num_input_tokens_seen": 1468117056, + "step": 8129 + }, + { + "epoch": 0.890007936725143, + "grad_norm": 1.1743622895791865, + "learning_rate": 1.4758261552575175e-06, + "loss": 0.7021, + "num_input_tokens_seen": 1468303424, + "step": 8130 + }, + { + "epoch": 0.8901174087960809, + "grad_norm": 1.2101229974008845, + "learning_rate": 1.4729169265287695e-06, + "loss": 0.712, + "num_input_tokens_seen": 1468495392, + "step": 8131 + }, + { + "epoch": 0.8902268808670188, + "grad_norm": 1.2072060961455016, + "learning_rate": 1.470010481008971e-06, + "loss": 0.8649, + "num_input_tokens_seen": 1468663392, + "step": 8132 + }, + { + "epoch": 0.8903363529379567, + "grad_norm": 1.0295970848108917, + "learning_rate": 1.4671068190419524e-06, + "loss": 0.7878, + "num_input_tokens_seen": 1468858720, + "step": 8133 + }, + { + "epoch": 0.8904458250088946, + "grad_norm": 1.0753425961545042, + "learning_rate": 1.4642059409712082e-06, + "loss": 0.8735, + "num_input_tokens_seen": 1469031424, + "step": 8134 + }, + { + "epoch": 0.8905552970798325, + "grad_norm": 1.066712723174782, + "learning_rate": 1.461307847139909e-06, + "loss": 0.7509, + "num_input_tokens_seen": 1469213312, + "step": 8135 + }, + { + "epoch": 0.8906647691507704, + "grad_norm": 1.1990656055989832, + "learning_rate": 1.4584125378908935e-06, + "loss": 1.0316, + "num_input_tokens_seen": 1469415808, + "step": 8136 + }, + { + "epoch": 0.8907742412217083, + "grad_norm": 1.3168439731645238, + "learning_rate": 1.455520013566672e-06, + "loss": 0.8483, + "num_input_tokens_seen": 1469572832, + "step": 8137 + }, + { + "epoch": 0.8908837132926463, + "grad_norm": 1.2938965295196863, + "learning_rate": 1.45263027450942e-06, + "loss": 1.114, + "num_input_tokens_seen": 1469761216, + "step": 8138 + }, + { + "epoch": 0.8909931853635841, + "grad_norm": 1.212956769187025, + "learning_rate": 1.4497433210609923e-06, + "loss": 0.8024, + "num_input_tokens_seen": 1469909056, + "step": 8139 + }, + { + "epoch": 0.891102657434522, + "grad_norm": 1.2974597118348352, + "learning_rate": 1.4468591535629127e-06, + "loss": 0.8456, + "num_input_tokens_seen": 1470105280, + "step": 8140 + }, + { + "epoch": 0.8912121295054599, + "grad_norm": 1.3492276967276067, + "learning_rate": 1.443977772356367e-06, + "loss": 1.0542, + "num_input_tokens_seen": 1470271488, + "step": 8141 + }, + { + "epoch": 0.8913216015763978, + "grad_norm": 1.1929443436819687, + "learning_rate": 1.4410991777822209e-06, + "loss": 0.7493, + "num_input_tokens_seen": 1470434784, + "step": 8142 + }, + { + "epoch": 0.8914310736473358, + "grad_norm": 1.0597974401943768, + "learning_rate": 1.4382233701810022e-06, + "loss": 0.8244, + "num_input_tokens_seen": 1470615776, + "step": 8143 + }, + { + "epoch": 0.8915405457182737, + "grad_norm": 1.0486140823041465, + "learning_rate": 1.4353503498929193e-06, + "loss": 1.0386, + "num_input_tokens_seen": 1470816928, + "step": 8144 + }, + { + "epoch": 0.8916500177892115, + "grad_norm": 1.0536862742566617, + "learning_rate": 1.4324801172578366e-06, + "loss": 0.878, + "num_input_tokens_seen": 1471008896, + "step": 8145 + }, + { + "epoch": 0.8917594898601494, + "grad_norm": 1.0323623668942945, + "learning_rate": 1.4296126726153102e-06, + "loss": 0.9795, + "num_input_tokens_seen": 1471199072, + "step": 8146 + }, + { + "epoch": 0.8918689619310873, + "grad_norm": 1.106906185475494, + "learning_rate": 1.4267480163045384e-06, + "loss": 0.7253, + "num_input_tokens_seen": 1471373120, + "step": 8147 + }, + { + "epoch": 0.8919784340020253, + "grad_norm": 1.1135824766373907, + "learning_rate": 1.4238861486644162e-06, + "loss": 1.068, + "num_input_tokens_seen": 1471572256, + "step": 8148 + }, + { + "epoch": 0.8920879060729632, + "grad_norm": 1.1606020087130575, + "learning_rate": 1.4210270700334927e-06, + "loss": 0.6819, + "num_input_tokens_seen": 1471724128, + "step": 8149 + }, + { + "epoch": 0.892197378143901, + "grad_norm": 1.1194901644507655, + "learning_rate": 1.4181707807499917e-06, + "loss": 0.907, + "num_input_tokens_seen": 1471910496, + "step": 8150 + }, + { + "epoch": 0.8923068502148389, + "grad_norm": 1.208621560721811, + "learning_rate": 1.4153172811518067e-06, + "loss": 1.1784, + "num_input_tokens_seen": 1472081632, + "step": 8151 + }, + { + "epoch": 0.8924163222857768, + "grad_norm": 1.166725338868331, + "learning_rate": 1.4124665715764957e-06, + "loss": 0.9356, + "num_input_tokens_seen": 1472245376, + "step": 8152 + }, + { + "epoch": 0.8925257943567148, + "grad_norm": 1.1422378946398841, + "learning_rate": 1.4096186523613052e-06, + "loss": 0.8015, + "num_input_tokens_seen": 1472442496, + "step": 8153 + }, + { + "epoch": 0.8926352664276527, + "grad_norm": 1.2218212164629834, + "learning_rate": 1.4067735238431245e-06, + "loss": 0.7774, + "num_input_tokens_seen": 1472630432, + "step": 8154 + }, + { + "epoch": 0.8927447384985906, + "grad_norm": 1.143086429568857, + "learning_rate": 1.4039311863585425e-06, + "loss": 0.9688, + "num_input_tokens_seen": 1472784544, + "step": 8155 + }, + { + "epoch": 0.8928542105695284, + "grad_norm": 1.0175451467372338, + "learning_rate": 1.4010916402437845e-06, + "loss": 0.7513, + "num_input_tokens_seen": 1472992640, + "step": 8156 + }, + { + "epoch": 0.8929636826404663, + "grad_norm": 1.090341597846942, + "learning_rate": 1.3982548858347738e-06, + "loss": 0.933, + "num_input_tokens_seen": 1473193568, + "step": 8157 + }, + { + "epoch": 0.8930731547114042, + "grad_norm": 1.2458562833953732, + "learning_rate": 1.3954209234670917e-06, + "loss": 0.9874, + "num_input_tokens_seen": 1473410624, + "step": 8158 + }, + { + "epoch": 0.8931826267823422, + "grad_norm": 0.9757510056095869, + "learning_rate": 1.3925897534759925e-06, + "loss": 0.5702, + "num_input_tokens_seen": 1473609760, + "step": 8159 + }, + { + "epoch": 0.8932920988532801, + "grad_norm": 1.0927569649744275, + "learning_rate": 1.389761376196394e-06, + "loss": 0.7562, + "num_input_tokens_seen": 1473795680, + "step": 8160 + }, + { + "epoch": 0.893401570924218, + "grad_norm": 1.0403869906587795, + "learning_rate": 1.3869357919628845e-06, + "loss": 0.7635, + "num_input_tokens_seen": 1473973536, + "step": 8161 + }, + { + "epoch": 0.8935110429951558, + "grad_norm": 1.1558170023348646, + "learning_rate": 1.3841130011097408e-06, + "loss": 1.0854, + "num_input_tokens_seen": 1474168640, + "step": 8162 + }, + { + "epoch": 0.8936205150660937, + "grad_norm": 1.1068558364837666, + "learning_rate": 1.3812930039708738e-06, + "loss": 0.8992, + "num_input_tokens_seen": 1474343808, + "step": 8163 + }, + { + "epoch": 0.8937299871370317, + "grad_norm": 1.1322117231843292, + "learning_rate": 1.378475800879897e-06, + "loss": 0.8204, + "num_input_tokens_seen": 1474531296, + "step": 8164 + }, + { + "epoch": 0.8938394592079696, + "grad_norm": 1.0560487228120694, + "learning_rate": 1.3756613921700774e-06, + "loss": 0.7473, + "num_input_tokens_seen": 1474721472, + "step": 8165 + }, + { + "epoch": 0.8939489312789075, + "grad_norm": 1.112510226770608, + "learning_rate": 1.372849778174351e-06, + "loss": 0.7822, + "num_input_tokens_seen": 1474903360, + "step": 8166 + }, + { + "epoch": 0.8940584033498453, + "grad_norm": 1.2053798501837771, + "learning_rate": 1.3700409592253299e-06, + "loss": 0.844, + "num_input_tokens_seen": 1475066656, + "step": 8167 + }, + { + "epoch": 0.8941678754207832, + "grad_norm": 1.1261647340969567, + "learning_rate": 1.3672349356552899e-06, + "loss": 0.7506, + "num_input_tokens_seen": 1475230848, + "step": 8168 + }, + { + "epoch": 0.8942773474917212, + "grad_norm": 1.030477176426467, + "learning_rate": 1.3644317077961794e-06, + "loss": 0.6415, + "num_input_tokens_seen": 1475406464, + "step": 8169 + }, + { + "epoch": 0.8943868195626591, + "grad_norm": 1.0950443783132553, + "learning_rate": 1.3616312759796079e-06, + "loss": 0.885, + "num_input_tokens_seen": 1475626656, + "step": 8170 + }, + { + "epoch": 0.894496291633597, + "grad_norm": 1.0962747045188739, + "learning_rate": 1.3588336405368745e-06, + "loss": 0.5948, + "num_input_tokens_seen": 1475775840, + "step": 8171 + }, + { + "epoch": 0.8946057637045349, + "grad_norm": 1.0725488618688621, + "learning_rate": 1.3560388017989256e-06, + "loss": 0.8246, + "num_input_tokens_seen": 1475989536, + "step": 8172 + }, + { + "epoch": 0.8947152357754727, + "grad_norm": 1.2764640835536571, + "learning_rate": 1.3532467600963883e-06, + "loss": 0.9701, + "num_input_tokens_seen": 1476180608, + "step": 8173 + }, + { + "epoch": 0.8948247078464107, + "grad_norm": 1.0999624115850615, + "learning_rate": 1.350457515759554e-06, + "loss": 0.9495, + "num_input_tokens_seen": 1476354432, + "step": 8174 + }, + { + "epoch": 0.8949341799173486, + "grad_norm": 1.1210724831933225, + "learning_rate": 1.3476710691183837e-06, + "loss": 0.7738, + "num_input_tokens_seen": 1476556032, + "step": 8175 + }, + { + "epoch": 0.8950436519882865, + "grad_norm": 1.0147335541814662, + "learning_rate": 1.3448874205025137e-06, + "loss": 0.7371, + "num_input_tokens_seen": 1476760544, + "step": 8176 + }, + { + "epoch": 0.8951531240592244, + "grad_norm": 1.0238080700624268, + "learning_rate": 1.342106570241239e-06, + "loss": 0.8386, + "num_input_tokens_seen": 1476972448, + "step": 8177 + }, + { + "epoch": 0.8952625961301623, + "grad_norm": 1.2798802567143146, + "learning_rate": 1.3393285186635268e-06, + "loss": 0.7986, + "num_input_tokens_seen": 1477148288, + "step": 8178 + }, + { + "epoch": 0.8953720682011002, + "grad_norm": 1.2818953586786273, + "learning_rate": 1.3365532660980256e-06, + "loss": 0.878, + "num_input_tokens_seen": 1477313152, + "step": 8179 + }, + { + "epoch": 0.8954815402720381, + "grad_norm": 1.1119388986960068, + "learning_rate": 1.333780812873031e-06, + "loss": 0.6281, + "num_input_tokens_seen": 1477464352, + "step": 8180 + }, + { + "epoch": 0.895591012342976, + "grad_norm": 1.0771620499692331, + "learning_rate": 1.3310111593165254e-06, + "loss": 0.892, + "num_input_tokens_seen": 1477651168, + "step": 8181 + }, + { + "epoch": 0.8957004844139139, + "grad_norm": 1.370658174975199, + "learning_rate": 1.3282443057561545e-06, + "loss": 0.9016, + "num_input_tokens_seen": 1477836416, + "step": 8182 + }, + { + "epoch": 0.8958099564848518, + "grad_norm": 1.1216161487722829, + "learning_rate": 1.3254802525192289e-06, + "loss": 0.9771, + "num_input_tokens_seen": 1478016960, + "step": 8183 + }, + { + "epoch": 0.8959194285557897, + "grad_norm": 1.1152932173242318, + "learning_rate": 1.3227189999327316e-06, + "loss": 0.9035, + "num_input_tokens_seen": 1478196608, + "step": 8184 + }, + { + "epoch": 0.8960289006267276, + "grad_norm": 1.0325991157637302, + "learning_rate": 1.3199605483233096e-06, + "loss": 0.843, + "num_input_tokens_seen": 1478363936, + "step": 8185 + }, + { + "epoch": 0.8961383726976655, + "grad_norm": 1.110810331853855, + "learning_rate": 1.3172048980172935e-06, + "loss": 0.8142, + "num_input_tokens_seen": 1478557024, + "step": 8186 + }, + { + "epoch": 0.8962478447686034, + "grad_norm": 1.1033641609963776, + "learning_rate": 1.314452049340656e-06, + "loss": 1.0974, + "num_input_tokens_seen": 1478722560, + "step": 8187 + }, + { + "epoch": 0.8963573168395413, + "grad_norm": 1.3064061797011637, + "learning_rate": 1.3117020026190696e-06, + "loss": 1.0359, + "num_input_tokens_seen": 1478919680, + "step": 8188 + }, + { + "epoch": 0.8964667889104793, + "grad_norm": 1.0119455444174519, + "learning_rate": 1.3089547581778438e-06, + "loss": 0.8444, + "num_input_tokens_seen": 1479116800, + "step": 8189 + }, + { + "epoch": 0.8965762609814171, + "grad_norm": 1.1709610490388178, + "learning_rate": 1.3062103163419853e-06, + "loss": 0.7548, + "num_input_tokens_seen": 1479304288, + "step": 8190 + }, + { + "epoch": 0.896685733052355, + "grad_norm": 1.1643832999981472, + "learning_rate": 1.303468677436151e-06, + "loss": 0.8489, + "num_input_tokens_seen": 1479482816, + "step": 8191 + }, + { + "epoch": 0.8967952051232929, + "grad_norm": 1.087949466997373, + "learning_rate": 1.3007298417846731e-06, + "loss": 0.775, + "num_input_tokens_seen": 1479678592, + "step": 8192 + }, + { + "epoch": 0.8969046771942308, + "grad_norm": 1.1274853908835731, + "learning_rate": 1.2979938097115507e-06, + "loss": 1.0827, + "num_input_tokens_seen": 1479872352, + "step": 8193 + }, + { + "epoch": 0.8970141492651688, + "grad_norm": 1.0605030928930408, + "learning_rate": 1.2952605815404445e-06, + "loss": 1.1926, + "num_input_tokens_seen": 1480076416, + "step": 8194 + }, + { + "epoch": 0.8971236213361067, + "grad_norm": 1.1031666431725728, + "learning_rate": 1.2925301575947013e-06, + "loss": 0.8696, + "num_input_tokens_seen": 1480251808, + "step": 8195 + }, + { + "epoch": 0.8972330934070445, + "grad_norm": 1.125601236959915, + "learning_rate": 1.2898025381973155e-06, + "loss": 0.6899, + "num_input_tokens_seen": 1480390240, + "step": 8196 + }, + { + "epoch": 0.8973425654779824, + "grad_norm": 1.0603271593991224, + "learning_rate": 1.287077723670968e-06, + "loss": 0.8551, + "num_input_tokens_seen": 1480592064, + "step": 8197 + }, + { + "epoch": 0.8974520375489203, + "grad_norm": 1.038777018171583, + "learning_rate": 1.2843557143379897e-06, + "loss": 0.7425, + "num_input_tokens_seen": 1480778432, + "step": 8198 + }, + { + "epoch": 0.8975615096198583, + "grad_norm": 1.228568411028125, + "learning_rate": 1.2816365105203953e-06, + "loss": 0.9109, + "num_input_tokens_seen": 1480961216, + "step": 8199 + }, + { + "epoch": 0.8976709816907962, + "grad_norm": 1.295531024607014, + "learning_rate": 1.278920112539861e-06, + "loss": 1.1402, + "num_input_tokens_seen": 1481145568, + "step": 8200 + }, + { + "epoch": 0.897780453761734, + "grad_norm": 1.0644289906416817, + "learning_rate": 1.2762065207177292e-06, + "loss": 0.7234, + "num_input_tokens_seen": 1481329696, + "step": 8201 + }, + { + "epoch": 0.8978899258326719, + "grad_norm": 1.25094220890757, + "learning_rate": 1.273495735375016e-06, + "loss": 0.9701, + "num_input_tokens_seen": 1481488736, + "step": 8202 + }, + { + "epoch": 0.8979993979036098, + "grad_norm": 1.078810763984929, + "learning_rate": 1.270787756832395e-06, + "loss": 0.8281, + "num_input_tokens_seen": 1481657408, + "step": 8203 + }, + { + "epoch": 0.8981088699745478, + "grad_norm": 1.2709127568091474, + "learning_rate": 1.2680825854102268e-06, + "loss": 0.7454, + "num_input_tokens_seen": 1481832800, + "step": 8204 + }, + { + "epoch": 0.8982183420454857, + "grad_norm": 1.1602778001811203, + "learning_rate": 1.2653802214285137e-06, + "loss": 0.8103, + "num_input_tokens_seen": 1481989152, + "step": 8205 + }, + { + "epoch": 0.8983278141164236, + "grad_norm": 1.2078664414770668, + "learning_rate": 1.2626806652069501e-06, + "loss": 1.1708, + "num_input_tokens_seen": 1482159392, + "step": 8206 + }, + { + "epoch": 0.8984372861873614, + "grad_norm": 1.0587735181806344, + "learning_rate": 1.259983917064886e-06, + "loss": 0.7148, + "num_input_tokens_seen": 1482316192, + "step": 8207 + }, + { + "epoch": 0.8985467582582993, + "grad_norm": 1.074637646928936, + "learning_rate": 1.2572899773213437e-06, + "loss": 0.9127, + "num_input_tokens_seen": 1482507264, + "step": 8208 + }, + { + "epoch": 0.8986562303292372, + "grad_norm": 1.0573879567664874, + "learning_rate": 1.2545988462950077e-06, + "loss": 0.9315, + "num_input_tokens_seen": 1482684448, + "step": 8209 + }, + { + "epoch": 0.8987657024001752, + "grad_norm": 1.0738223922537613, + "learning_rate": 1.251910524304234e-06, + "loss": 0.6121, + "num_input_tokens_seen": 1482849760, + "step": 8210 + }, + { + "epoch": 0.8988751744711131, + "grad_norm": 1.2441174850827141, + "learning_rate": 1.249225011667046e-06, + "loss": 1.0885, + "num_input_tokens_seen": 1483023136, + "step": 8211 + }, + { + "epoch": 0.898984646542051, + "grad_norm": 0.967356325368795, + "learning_rate": 1.246542308701132e-06, + "loss": 0.7088, + "num_input_tokens_seen": 1483213088, + "step": 8212 + }, + { + "epoch": 0.8990941186129888, + "grad_norm": 1.1006894811818522, + "learning_rate": 1.2438624157238593e-06, + "loss": 0.7588, + "num_input_tokens_seen": 1483377728, + "step": 8213 + }, + { + "epoch": 0.8992035906839267, + "grad_norm": 1.1868816572809155, + "learning_rate": 1.2411853330522472e-06, + "loss": 0.8855, + "num_input_tokens_seen": 1483546400, + "step": 8214 + }, + { + "epoch": 0.8993130627548647, + "grad_norm": 1.3188983182981562, + "learning_rate": 1.238511061002992e-06, + "loss": 0.9485, + "num_input_tokens_seen": 1483706112, + "step": 8215 + }, + { + "epoch": 0.8994225348258026, + "grad_norm": 1.1429157508144587, + "learning_rate": 1.235839599892455e-06, + "loss": 0.9716, + "num_input_tokens_seen": 1483898528, + "step": 8216 + }, + { + "epoch": 0.8995320068967405, + "grad_norm": 1.1453251607697317, + "learning_rate": 1.2331709500366606e-06, + "loss": 0.9124, + "num_input_tokens_seen": 1484089600, + "step": 8217 + }, + { + "epoch": 0.8996414789676783, + "grad_norm": 1.0580802882797826, + "learning_rate": 1.2305051117513067e-06, + "loss": 0.8838, + "num_input_tokens_seen": 1484239008, + "step": 8218 + }, + { + "epoch": 0.8997509510386162, + "grad_norm": 1.2153285491268164, + "learning_rate": 1.2278420853517658e-06, + "loss": 0.9587, + "num_input_tokens_seen": 1484414400, + "step": 8219 + }, + { + "epoch": 0.8998604231095542, + "grad_norm": 1.0176930426727964, + "learning_rate": 1.2251818711530556e-06, + "loss": 0.8158, + "num_input_tokens_seen": 1484593376, + "step": 8220 + }, + { + "epoch": 0.8999698951804921, + "grad_norm": 1.0404871302280323, + "learning_rate": 1.222524469469885e-06, + "loss": 0.831, + "num_input_tokens_seen": 1484776160, + "step": 8221 + }, + { + "epoch": 0.90007936725143, + "grad_norm": 1.2062133684673995, + "learning_rate": 1.2198698806166086e-06, + "loss": 0.8508, + "num_input_tokens_seen": 1484975744, + "step": 8222 + }, + { + "epoch": 0.9001888393223679, + "grad_norm": 1.0918788708518317, + "learning_rate": 1.2172181049072695e-06, + "loss": 0.9728, + "num_input_tokens_seen": 1485174208, + "step": 8223 + }, + { + "epoch": 0.9002983113933057, + "grad_norm": 1.1519808957648356, + "learning_rate": 1.214569142655564e-06, + "loss": 0.839, + "num_input_tokens_seen": 1485350272, + "step": 8224 + }, + { + "epoch": 0.9004077834642437, + "grad_norm": 1.1765326446757574, + "learning_rate": 1.211922994174855e-06, + "loss": 1.0387, + "num_input_tokens_seen": 1485539776, + "step": 8225 + }, + { + "epoch": 0.9005172555351816, + "grad_norm": 1.1819756151256195, + "learning_rate": 1.209279659778187e-06, + "loss": 0.7375, + "num_input_tokens_seen": 1485720096, + "step": 8226 + }, + { + "epoch": 0.9006267276061195, + "grad_norm": 1.1748936239747645, + "learning_rate": 1.2066391397782484e-06, + "loss": 0.8056, + "num_input_tokens_seen": 1485908480, + "step": 8227 + }, + { + "epoch": 0.9007361996770574, + "grad_norm": 1.1020621390849705, + "learning_rate": 1.204001434487423e-06, + "loss": 0.776, + "num_input_tokens_seen": 1486050272, + "step": 8228 + }, + { + "epoch": 0.9008456717479953, + "grad_norm": 1.0232973075535288, + "learning_rate": 1.2013665442177275e-06, + "loss": 0.8227, + "num_input_tokens_seen": 1486258592, + "step": 8229 + }, + { + "epoch": 0.9009551438189332, + "grad_norm": 1.1499136954555131, + "learning_rate": 1.1987344692808849e-06, + "loss": 0.7831, + "num_input_tokens_seen": 1486437792, + "step": 8230 + }, + { + "epoch": 0.9010646158898711, + "grad_norm": 1.1957300491566807, + "learning_rate": 1.1961052099882435e-06, + "loss": 0.748, + "num_input_tokens_seen": 1486608704, + "step": 8231 + }, + { + "epoch": 0.901174087960809, + "grad_norm": 1.0815568565699771, + "learning_rate": 1.1934787666508573e-06, + "loss": 0.8954, + "num_input_tokens_seen": 1486783648, + "step": 8232 + }, + { + "epoch": 0.9012835600317469, + "grad_norm": 1.0229370990481326, + "learning_rate": 1.190855139579422e-06, + "loss": 0.7173, + "num_input_tokens_seen": 1486965312, + "step": 8233 + }, + { + "epoch": 0.9013930321026848, + "grad_norm": 1.2127113885561969, + "learning_rate": 1.1882343290843063e-06, + "loss": 1.1024, + "num_input_tokens_seen": 1487155936, + "step": 8234 + }, + { + "epoch": 0.9015025041736227, + "grad_norm": 1.3405283521836149, + "learning_rate": 1.1856163354755505e-06, + "loss": 1.03, + "num_input_tokens_seen": 1487315872, + "step": 8235 + }, + { + "epoch": 0.9016119762445606, + "grad_norm": 1.1237094682591118, + "learning_rate": 1.1830011590628547e-06, + "loss": 1.0913, + "num_input_tokens_seen": 1487497984, + "step": 8236 + }, + { + "epoch": 0.9017214483154985, + "grad_norm": 0.9768684823373398, + "learning_rate": 1.1803888001555963e-06, + "loss": 0.7215, + "num_input_tokens_seen": 1487675840, + "step": 8237 + }, + { + "epoch": 0.9018309203864364, + "grad_norm": 1.331142005659181, + "learning_rate": 1.1777792590628028e-06, + "loss": 0.958, + "num_input_tokens_seen": 1487861088, + "step": 8238 + }, + { + "epoch": 0.9019403924573743, + "grad_norm": 1.1512031211793485, + "learning_rate": 1.175172536093183e-06, + "loss": 0.822, + "num_input_tokens_seen": 1488042528, + "step": 8239 + }, + { + "epoch": 0.9020498645283123, + "grad_norm": 1.2411452425326874, + "learning_rate": 1.1725686315551099e-06, + "loss": 0.9403, + "num_input_tokens_seen": 1488231808, + "step": 8240 + }, + { + "epoch": 0.9021593365992501, + "grad_norm": 1.0232987054957368, + "learning_rate": 1.1699675457566144e-06, + "loss": 1.0121, + "num_input_tokens_seen": 1488421984, + "step": 8241 + }, + { + "epoch": 0.902268808670188, + "grad_norm": 1.3386567755260281, + "learning_rate": 1.1673692790054063e-06, + "loss": 0.8738, + "num_input_tokens_seen": 1488556160, + "step": 8242 + }, + { + "epoch": 0.9023782807411259, + "grad_norm": 1.1592958387304306, + "learning_rate": 1.1647738316088508e-06, + "loss": 0.8242, + "num_input_tokens_seen": 1488752160, + "step": 8243 + }, + { + "epoch": 0.9024877528120638, + "grad_norm": 1.0144446923834878, + "learning_rate": 1.1621812038739855e-06, + "loss": 0.7347, + "num_input_tokens_seen": 1488958688, + "step": 8244 + }, + { + "epoch": 0.9025972248830018, + "grad_norm": 1.1530372730989737, + "learning_rate": 1.1595913961075094e-06, + "loss": 1.034, + "num_input_tokens_seen": 1489147968, + "step": 8245 + }, + { + "epoch": 0.9027066969539397, + "grad_norm": 1.0271052674794385, + "learning_rate": 1.1570044086158e-06, + "loss": 0.8037, + "num_input_tokens_seen": 1489328288, + "step": 8246 + }, + { + "epoch": 0.9028161690248775, + "grad_norm": 0.9707368349788366, + "learning_rate": 1.15442024170489e-06, + "loss": 0.516, + "num_input_tokens_seen": 1489507488, + "step": 8247 + }, + { + "epoch": 0.9029256410958154, + "grad_norm": 1.2018424294631445, + "learning_rate": 1.1518388956804793e-06, + "loss": 0.8438, + "num_input_tokens_seen": 1489695200, + "step": 8248 + }, + { + "epoch": 0.9030351131667533, + "grad_norm": 1.0897508260291047, + "learning_rate": 1.149260370847935e-06, + "loss": 0.9425, + "num_input_tokens_seen": 1489855360, + "step": 8249 + }, + { + "epoch": 0.9031445852376913, + "grad_norm": 1.2323871976444367, + "learning_rate": 1.1466846675122988e-06, + "loss": 0.7014, + "num_input_tokens_seen": 1489975872, + "step": 8250 + }, + { + "epoch": 0.9032540573086292, + "grad_norm": 1.3657320395274504, + "learning_rate": 1.1441117859782636e-06, + "loss": 0.942, + "num_input_tokens_seen": 1490164032, + "step": 8251 + }, + { + "epoch": 0.903363529379567, + "grad_norm": 0.9926039354976134, + "learning_rate": 1.1415417265501993e-06, + "loss": 0.7039, + "num_input_tokens_seen": 1490330688, + "step": 8252 + }, + { + "epoch": 0.9034730014505049, + "grad_norm": 1.1005387114146714, + "learning_rate": 1.138974489532138e-06, + "loss": 0.8314, + "num_input_tokens_seen": 1490529824, + "step": 8253 + }, + { + "epoch": 0.9035824735214428, + "grad_norm": 0.9848423711157928, + "learning_rate": 1.1364100752277812e-06, + "loss": 0.7096, + "num_input_tokens_seen": 1490724256, + "step": 8254 + }, + { + "epoch": 0.9036919455923808, + "grad_norm": 1.106846358770366, + "learning_rate": 1.1338484839404944e-06, + "loss": 1.0336, + "num_input_tokens_seen": 1490889792, + "step": 8255 + }, + { + "epoch": 0.9038014176633187, + "grad_norm": 1.081715451645606, + "learning_rate": 1.131289715973305e-06, + "loss": 0.6998, + "num_input_tokens_seen": 1491058016, + "step": 8256 + }, + { + "epoch": 0.9039108897342566, + "grad_norm": 1.2188887883903703, + "learning_rate": 1.1287337716289149e-06, + "loss": 0.9641, + "num_input_tokens_seen": 1491242816, + "step": 8257 + }, + { + "epoch": 0.9040203618051944, + "grad_norm": 1.2409639390383063, + "learning_rate": 1.1261806512096878e-06, + "loss": 0.9602, + "num_input_tokens_seen": 1491435904, + "step": 8258 + }, + { + "epoch": 0.9041298338761323, + "grad_norm": 1.1850946560006654, + "learning_rate": 1.1236303550176463e-06, + "loss": 1.0269, + "num_input_tokens_seen": 1491595616, + "step": 8259 + }, + { + "epoch": 0.9042393059470702, + "grad_norm": 1.0271031783368139, + "learning_rate": 1.1210828833544907e-06, + "loss": 0.933, + "num_input_tokens_seen": 1491778624, + "step": 8260 + }, + { + "epoch": 0.9043487780180082, + "grad_norm": 1.0624737455826805, + "learning_rate": 1.1185382365215853e-06, + "loss": 0.777, + "num_input_tokens_seen": 1491972160, + "step": 8261 + }, + { + "epoch": 0.9044582500889461, + "grad_norm": 1.1533993815838945, + "learning_rate": 1.1159964148199475e-06, + "loss": 1.0945, + "num_input_tokens_seen": 1492178912, + "step": 8262 + }, + { + "epoch": 0.904567722159884, + "grad_norm": 1.0937355585507262, + "learning_rate": 1.1134574185502816e-06, + "loss": 0.9078, + "num_input_tokens_seen": 1492392160, + "step": 8263 + }, + { + "epoch": 0.9046771942308218, + "grad_norm": 1.1360939117586821, + "learning_rate": 1.1109212480129334e-06, + "loss": 0.7378, + "num_input_tokens_seen": 1492574944, + "step": 8264 + }, + { + "epoch": 0.9047866663017597, + "grad_norm": 1.0099088172543553, + "learning_rate": 1.1083879035079349e-06, + "loss": 0.7589, + "num_input_tokens_seen": 1492760864, + "step": 8265 + }, + { + "epoch": 0.9048961383726977, + "grad_norm": 1.0885423115754231, + "learning_rate": 1.105857385334977e-06, + "loss": 0.9072, + "num_input_tokens_seen": 1492913632, + "step": 8266 + }, + { + "epoch": 0.9050056104436356, + "grad_norm": 1.0158997384184507, + "learning_rate": 1.1033296937934061e-06, + "loss": 0.9772, + "num_input_tokens_seen": 1493112096, + "step": 8267 + }, + { + "epoch": 0.9051150825145735, + "grad_norm": 1.167684003637182, + "learning_rate": 1.1008048291822588e-06, + "loss": 0.8122, + "num_input_tokens_seen": 1493247168, + "step": 8268 + }, + { + "epoch": 0.9052245545855113, + "grad_norm": 1.089390704971363, + "learning_rate": 1.0982827918002065e-06, + "loss": 0.8768, + "num_input_tokens_seen": 1493450560, + "step": 8269 + }, + { + "epoch": 0.9053340266564492, + "grad_norm": 1.1449779380938387, + "learning_rate": 1.0957635819456135e-06, + "loss": 1.0532, + "num_input_tokens_seen": 1493625280, + "step": 8270 + }, + { + "epoch": 0.9054434987273872, + "grad_norm": 1.0853727230516903, + "learning_rate": 1.0932471999164835e-06, + "loss": 0.8289, + "num_input_tokens_seen": 1493830688, + "step": 8271 + }, + { + "epoch": 0.9055529707983251, + "grad_norm": 1.1612475878276758, + "learning_rate": 1.0907336460105166e-06, + "loss": 1.2016, + "num_input_tokens_seen": 1494045728, + "step": 8272 + }, + { + "epoch": 0.905662442869263, + "grad_norm": 1.1825374510240008, + "learning_rate": 1.088222920525045e-06, + "loss": 0.9235, + "num_input_tokens_seen": 1494203200, + "step": 8273 + }, + { + "epoch": 0.9057719149402009, + "grad_norm": 1.1481585001535783, + "learning_rate": 1.0857150237570946e-06, + "loss": 1.0415, + "num_input_tokens_seen": 1494382848, + "step": 8274 + }, + { + "epoch": 0.9058813870111387, + "grad_norm": 1.0581745852159443, + "learning_rate": 1.0832099560033394e-06, + "loss": 0.9335, + "num_input_tokens_seen": 1494586240, + "step": 8275 + }, + { + "epoch": 0.9059908590820767, + "grad_norm": 1.1862857533444258, + "learning_rate": 1.0807077175601255e-06, + "loss": 0.8872, + "num_input_tokens_seen": 1494782464, + "step": 8276 + }, + { + "epoch": 0.9061003311530146, + "grad_norm": 1.0481398613364652, + "learning_rate": 1.0782083087234608e-06, + "loss": 0.7234, + "num_input_tokens_seen": 1494966816, + "step": 8277 + }, + { + "epoch": 0.9062098032239525, + "grad_norm": 1.2929783846677712, + "learning_rate": 1.0757117297890224e-06, + "loss": 0.7995, + "num_input_tokens_seen": 1495166624, + "step": 8278 + }, + { + "epoch": 0.9063192752948904, + "grad_norm": 1.2206979492077772, + "learning_rate": 1.073217981052152e-06, + "loss": 1.1398, + "num_input_tokens_seen": 1495342688, + "step": 8279 + }, + { + "epoch": 0.9064287473658283, + "grad_norm": 1.0603042240646194, + "learning_rate": 1.0707270628078552e-06, + "loss": 0.8177, + "num_input_tokens_seen": 1495539584, + "step": 8280 + }, + { + "epoch": 0.9065382194367662, + "grad_norm": 1.3377442662731605, + "learning_rate": 1.0682389753508021e-06, + "loss": 1.036, + "num_input_tokens_seen": 1495686080, + "step": 8281 + }, + { + "epoch": 0.9066476915077041, + "grad_norm": 1.1356243095332867, + "learning_rate": 1.065753718975329e-06, + "loss": 1.1312, + "num_input_tokens_seen": 1495872000, + "step": 8282 + }, + { + "epoch": 0.906757163578642, + "grad_norm": 1.2310982683285008, + "learning_rate": 1.0632712939754347e-06, + "loss": 0.8706, + "num_input_tokens_seen": 1496014912, + "step": 8283 + }, + { + "epoch": 0.9068666356495799, + "grad_norm": 1.1376493125424227, + "learning_rate": 1.0607917006447865e-06, + "loss": 0.9748, + "num_input_tokens_seen": 1496203520, + "step": 8284 + }, + { + "epoch": 0.9069761077205178, + "grad_norm": 1.3074893675275183, + "learning_rate": 1.0583149392767195e-06, + "loss": 1.209, + "num_input_tokens_seen": 1496410272, + "step": 8285 + }, + { + "epoch": 0.9070855797914557, + "grad_norm": 1.1075303766291242, + "learning_rate": 1.0558410101642213e-06, + "loss": 1.0296, + "num_input_tokens_seen": 1496605600, + "step": 8286 + }, + { + "epoch": 0.9071950518623936, + "grad_norm": 1.2126779170116502, + "learning_rate": 1.0533699135999608e-06, + "loss": 0.95, + "num_input_tokens_seen": 1496776736, + "step": 8287 + }, + { + "epoch": 0.9073045239333315, + "grad_norm": 1.1398379079291203, + "learning_rate": 1.0509016498762625e-06, + "loss": 0.8804, + "num_input_tokens_seen": 1496968480, + "step": 8288 + }, + { + "epoch": 0.9074139960042694, + "grad_norm": 1.0209879206691046, + "learning_rate": 1.0484362192851149e-06, + "loss": 0.6753, + "num_input_tokens_seen": 1497154848, + "step": 8289 + }, + { + "epoch": 0.9075234680752073, + "grad_norm": 1.1973647407962773, + "learning_rate": 1.0459736221181766e-06, + "loss": 0.8413, + "num_input_tokens_seen": 1497342336, + "step": 8290 + }, + { + "epoch": 0.9076329401461453, + "grad_norm": 1.2386115082003961, + "learning_rate": 1.0435138586667641e-06, + "loss": 0.847, + "num_input_tokens_seen": 1497511008, + "step": 8291 + }, + { + "epoch": 0.9077424122170831, + "grad_norm": 1.028273187410667, + "learning_rate": 1.0410569292218676e-06, + "loss": 0.8252, + "num_input_tokens_seen": 1497671616, + "step": 8292 + }, + { + "epoch": 0.907851884288021, + "grad_norm": 1.1716579490561598, + "learning_rate": 1.038602834074129e-06, + "loss": 0.8048, + "num_input_tokens_seen": 1497853280, + "step": 8293 + }, + { + "epoch": 0.9079613563589589, + "grad_norm": 1.2073329313970185, + "learning_rate": 1.0361515735138772e-06, + "loss": 0.9138, + "num_input_tokens_seen": 1498044800, + "step": 8294 + }, + { + "epoch": 0.9080708284298968, + "grad_norm": 1.145857683558811, + "learning_rate": 1.0337031478310749e-06, + "loss": 0.7592, + "num_input_tokens_seen": 1498220416, + "step": 8295 + }, + { + "epoch": 0.9081803005008348, + "grad_norm": 1.186675890009822, + "learning_rate": 1.0312575573153792e-06, + "loss": 0.7531, + "num_input_tokens_seen": 1498384608, + "step": 8296 + }, + { + "epoch": 0.9082897725717727, + "grad_norm": 1.1442127500692165, + "learning_rate": 1.0288148022560923e-06, + "loss": 0.8312, + "num_input_tokens_seen": 1498575680, + "step": 8297 + }, + { + "epoch": 0.9083992446427105, + "grad_norm": 0.9363785712220238, + "learning_rate": 1.026374882942191e-06, + "loss": 0.7952, + "num_input_tokens_seen": 1498764736, + "step": 8298 + }, + { + "epoch": 0.9085087167136484, + "grad_norm": 1.1397009986975488, + "learning_rate": 1.0239377996623112e-06, + "loss": 1.1108, + "num_input_tokens_seen": 1498947968, + "step": 8299 + }, + { + "epoch": 0.9086181887845863, + "grad_norm": 1.1579597307460567, + "learning_rate": 1.02150355270475e-06, + "loss": 0.9321, + "num_input_tokens_seen": 1499139040, + "step": 8300 + }, + { + "epoch": 0.9087276608555243, + "grad_norm": 1.082490513696137, + "learning_rate": 1.0190721423574884e-06, + "loss": 0.8652, + "num_input_tokens_seen": 1499316224, + "step": 8301 + }, + { + "epoch": 0.9088371329264622, + "grad_norm": 1.1785142634924135, + "learning_rate": 1.0166435689081404e-06, + "loss": 0.9271, + "num_input_tokens_seen": 1499499680, + "step": 8302 + }, + { + "epoch": 0.9089466049974, + "grad_norm": 1.2287962188255228, + "learning_rate": 1.0142178326440155e-06, + "loss": 0.8369, + "num_input_tokens_seen": 1499658272, + "step": 8303 + }, + { + "epoch": 0.9090560770683379, + "grad_norm": 1.1672404388310669, + "learning_rate": 1.0117949338520645e-06, + "loss": 0.9242, + "num_input_tokens_seen": 1499815520, + "step": 8304 + }, + { + "epoch": 0.9091655491392758, + "grad_norm": 1.087578939444823, + "learning_rate": 1.009374872818919e-06, + "loss": 0.6883, + "num_input_tokens_seen": 1499959104, + "step": 8305 + }, + { + "epoch": 0.9092750212102138, + "grad_norm": 1.2044528222985253, + "learning_rate": 1.0069576498308587e-06, + "loss": 0.9726, + "num_input_tokens_seen": 1500167648, + "step": 8306 + }, + { + "epoch": 0.9093844932811517, + "grad_norm": 1.3158773837085613, + "learning_rate": 1.0045432651738434e-06, + "loss": 1.0508, + "num_input_tokens_seen": 1500309216, + "step": 8307 + }, + { + "epoch": 0.9094939653520896, + "grad_norm": 1.0625791520200065, + "learning_rate": 1.0021317191334895e-06, + "loss": 0.693, + "num_input_tokens_seen": 1500491328, + "step": 8308 + }, + { + "epoch": 0.9096034374230274, + "grad_norm": 1.101391880538574, + "learning_rate": 9.99723011995074e-07, + "loss": 0.8125, + "num_input_tokens_seen": 1500638496, + "step": 8309 + }, + { + "epoch": 0.9097129094939653, + "grad_norm": 1.202216114708137, + "learning_rate": 9.973171440435524e-07, + "loss": 0.9299, + "num_input_tokens_seen": 1500813888, + "step": 8310 + }, + { + "epoch": 0.9098223815649032, + "grad_norm": 1.1088290953351392, + "learning_rate": 9.949141155635194e-07, + "loss": 0.9402, + "num_input_tokens_seen": 1500988608, + "step": 8311 + }, + { + "epoch": 0.9099318536358412, + "grad_norm": 1.0810839089955897, + "learning_rate": 9.925139268392614e-07, + "loss": 0.9988, + "num_input_tokens_seen": 1501179008, + "step": 8312 + }, + { + "epoch": 0.9100413257067791, + "grad_norm": 1.0366522577230914, + "learning_rate": 9.901165781547096e-07, + "loss": 0.9574, + "num_input_tokens_seen": 1501369184, + "step": 8313 + }, + { + "epoch": 0.910150797777717, + "grad_norm": 1.113323599026625, + "learning_rate": 9.877220697934674e-07, + "loss": 1.0071, + "num_input_tokens_seen": 1501565632, + "step": 8314 + }, + { + "epoch": 0.9102602698486548, + "grad_norm": 1.0802601666032787, + "learning_rate": 9.853304020388005e-07, + "loss": 1.0336, + "num_input_tokens_seen": 1501753792, + "step": 8315 + }, + { + "epoch": 0.9103697419195927, + "grad_norm": 0.9999907910399797, + "learning_rate": 9.829415751736404e-07, + "loss": 0.8269, + "num_input_tokens_seen": 1501953376, + "step": 8316 + }, + { + "epoch": 0.9104792139905307, + "grad_norm": 1.0767061599971883, + "learning_rate": 9.805555894805778e-07, + "loss": 1.1037, + "num_input_tokens_seen": 1502156768, + "step": 8317 + }, + { + "epoch": 0.9105886860614686, + "grad_norm": 1.0975236371390729, + "learning_rate": 9.781724452418733e-07, + "loss": 0.8832, + "num_input_tokens_seen": 1502339552, + "step": 8318 + }, + { + "epoch": 0.9106981581324065, + "grad_norm": 1.3447311722642483, + "learning_rate": 9.757921427394457e-07, + "loss": 0.8312, + "num_input_tokens_seen": 1502492320, + "step": 8319 + }, + { + "epoch": 0.9108076302033443, + "grad_norm": 1.0807158258679443, + "learning_rate": 9.73414682254878e-07, + "loss": 0.7666, + "num_input_tokens_seen": 1502681376, + "step": 8320 + }, + { + "epoch": 0.9109171022742822, + "grad_norm": 1.1012757381241631, + "learning_rate": 9.710400640694228e-07, + "loss": 0.8382, + "num_input_tokens_seen": 1502850944, + "step": 8321 + }, + { + "epoch": 0.9110265743452202, + "grad_norm": 0.9554209961580272, + "learning_rate": 9.686682884639948e-07, + "loss": 0.7403, + "num_input_tokens_seen": 1503009984, + "step": 8322 + }, + { + "epoch": 0.9111360464161581, + "grad_norm": 1.0119926052539534, + "learning_rate": 9.662993557191691e-07, + "loss": 0.7701, + "num_input_tokens_seen": 1503198368, + "step": 8323 + }, + { + "epoch": 0.911245518487096, + "grad_norm": 1.2269151022872447, + "learning_rate": 9.639332661151856e-07, + "loss": 0.942, + "num_input_tokens_seen": 1503374880, + "step": 8324 + }, + { + "epoch": 0.9113549905580339, + "grad_norm": 1.017129048486036, + "learning_rate": 9.615700199319455e-07, + "loss": 0.7965, + "num_input_tokens_seen": 1503576256, + "step": 8325 + }, + { + "epoch": 0.9114644626289717, + "grad_norm": 1.0749391671644717, + "learning_rate": 9.592096174490195e-07, + "loss": 0.8966, + "num_input_tokens_seen": 1503744256, + "step": 8326 + }, + { + "epoch": 0.9115739346999097, + "grad_norm": 1.1189009857586603, + "learning_rate": 9.56852058945637e-07, + "loss": 0.8747, + "num_input_tokens_seen": 1503922784, + "step": 8327 + }, + { + "epoch": 0.9116834067708476, + "grad_norm": 1.242827484523957, + "learning_rate": 9.544973447006888e-07, + "loss": 0.9247, + "num_input_tokens_seen": 1504076896, + "step": 8328 + }, + { + "epoch": 0.9117928788417855, + "grad_norm": 1.1129464164612786, + "learning_rate": 9.521454749927411e-07, + "loss": 1.1011, + "num_input_tokens_seen": 1504278048, + "step": 8329 + }, + { + "epoch": 0.9119023509127234, + "grad_norm": 1.063046931697608, + "learning_rate": 9.497964501000128e-07, + "loss": 0.8944, + "num_input_tokens_seen": 1504480320, + "step": 8330 + }, + { + "epoch": 0.9120118229836613, + "grad_norm": 1.3718389241334787, + "learning_rate": 9.474502703003901e-07, + "loss": 1.0168, + "num_input_tokens_seen": 1504642944, + "step": 8331 + }, + { + "epoch": 0.9121212950545992, + "grad_norm": 1.1361647366803977, + "learning_rate": 9.451069358714177e-07, + "loss": 1.0183, + "num_input_tokens_seen": 1504822368, + "step": 8332 + }, + { + "epoch": 0.9122307671255371, + "grad_norm": 1.1059087410125106, + "learning_rate": 9.427664470903097e-07, + "loss": 0.8239, + "num_input_tokens_seen": 1505012320, + "step": 8333 + }, + { + "epoch": 0.912340239196475, + "grad_norm": 1.152709902765688, + "learning_rate": 9.40428804233942e-07, + "loss": 1.2347, + "num_input_tokens_seen": 1505191072, + "step": 8334 + }, + { + "epoch": 0.9124497112674129, + "grad_norm": 1.0458370586090815, + "learning_rate": 9.380940075788491e-07, + "loss": 0.6422, + "num_input_tokens_seen": 1505385952, + "step": 8335 + }, + { + "epoch": 0.9125591833383508, + "grad_norm": 1.2553270792951363, + "learning_rate": 9.357620574012432e-07, + "loss": 1.0457, + "num_input_tokens_seen": 1505547680, + "step": 8336 + }, + { + "epoch": 0.9126686554092887, + "grad_norm": 1.0457977901868003, + "learning_rate": 9.33432953976976e-07, + "loss": 0.9652, + "num_input_tokens_seen": 1505712992, + "step": 8337 + }, + { + "epoch": 0.9127781274802266, + "grad_norm": 1.1272626481521537, + "learning_rate": 9.311066975815852e-07, + "loss": 0.9688, + "num_input_tokens_seen": 1505925344, + "step": 8338 + }, + { + "epoch": 0.9128875995511645, + "grad_norm": 1.0508503377104825, + "learning_rate": 9.28783288490262e-07, + "loss": 0.7433, + "num_input_tokens_seen": 1506120672, + "step": 8339 + }, + { + "epoch": 0.9129970716221024, + "grad_norm": 1.4019737074205503, + "learning_rate": 9.264627269778586e-07, + "loss": 1.1451, + "num_input_tokens_seen": 1506277920, + "step": 8340 + }, + { + "epoch": 0.9131065436930403, + "grad_norm": 1.1767462638826467, + "learning_rate": 9.241450133188944e-07, + "loss": 0.9105, + "num_input_tokens_seen": 1506451968, + "step": 8341 + }, + { + "epoch": 0.9132160157639783, + "grad_norm": 1.087986666208494, + "learning_rate": 9.218301477875474e-07, + "loss": 0.8518, + "num_input_tokens_seen": 1506626016, + "step": 8342 + }, + { + "epoch": 0.9133254878349161, + "grad_norm": 1.062641583153429, + "learning_rate": 9.195181306576678e-07, + "loss": 0.8852, + "num_input_tokens_seen": 1506797824, + "step": 8343 + }, + { + "epoch": 0.913434959905854, + "grad_norm": 1.1822549531364621, + "learning_rate": 9.172089622027563e-07, + "loss": 0.8781, + "num_input_tokens_seen": 1507004352, + "step": 8344 + }, + { + "epoch": 0.9135444319767919, + "grad_norm": 1.1017642546602175, + "learning_rate": 9.149026426959889e-07, + "loss": 0.9618, + "num_input_tokens_seen": 1507185568, + "step": 8345 + }, + { + "epoch": 0.9136539040477298, + "grad_norm": 1.1028662467262427, + "learning_rate": 9.125991724101918e-07, + "loss": 0.8454, + "num_input_tokens_seen": 1507356256, + "step": 8346 + }, + { + "epoch": 0.9137633761186678, + "grad_norm": 1.0927313829772267, + "learning_rate": 9.102985516178692e-07, + "loss": 0.9125, + "num_input_tokens_seen": 1507563456, + "step": 8347 + }, + { + "epoch": 0.9138728481896057, + "grad_norm": 1.123754500654838, + "learning_rate": 9.080007805911728e-07, + "loss": 1.083, + "num_input_tokens_seen": 1507779392, + "step": 8348 + }, + { + "epoch": 0.9139823202605435, + "grad_norm": 1.091428363551446, + "learning_rate": 9.057058596019325e-07, + "loss": 1.0887, + "num_input_tokens_seen": 1507956128, + "step": 8349 + }, + { + "epoch": 0.9140917923314814, + "grad_norm": 1.1487051723230608, + "learning_rate": 9.034137889216255e-07, + "loss": 0.9039, + "num_input_tokens_seen": 1508125472, + "step": 8350 + }, + { + "epoch": 0.9142012644024193, + "grad_norm": 1.1003581287703124, + "learning_rate": 9.011245688214015e-07, + "loss": 0.9551, + "num_input_tokens_seen": 1508293696, + "step": 8351 + }, + { + "epoch": 0.9143107364733573, + "grad_norm": 1.1185383132900408, + "learning_rate": 8.988381995720746e-07, + "loss": 0.9495, + "num_input_tokens_seen": 1508488800, + "step": 8352 + }, + { + "epoch": 0.9144202085442952, + "grad_norm": 1.086720301108004, + "learning_rate": 8.965546814441117e-07, + "loss": 0.8581, + "num_input_tokens_seen": 1508669120, + "step": 8353 + }, + { + "epoch": 0.914529680615233, + "grad_norm": 1.1398979378069942, + "learning_rate": 8.942740147076551e-07, + "loss": 0.8787, + "num_input_tokens_seen": 1508819200, + "step": 8354 + }, + { + "epoch": 0.9146391526861709, + "grad_norm": 1.1763814110476343, + "learning_rate": 8.919961996324999e-07, + "loss": 1.0021, + "num_input_tokens_seen": 1509002656, + "step": 8355 + }, + { + "epoch": 0.9147486247571088, + "grad_norm": 1.0843632766133888, + "learning_rate": 8.897212364881058e-07, + "loss": 0.7413, + "num_input_tokens_seen": 1509179616, + "step": 8356 + }, + { + "epoch": 0.9148580968280468, + "grad_norm": 1.1699014958053928, + "learning_rate": 8.87449125543599e-07, + "loss": 1.1203, + "num_input_tokens_seen": 1509385920, + "step": 8357 + }, + { + "epoch": 0.9149675688989847, + "grad_norm": 1.1102488125487868, + "learning_rate": 8.851798670677674e-07, + "loss": 0.9004, + "num_input_tokens_seen": 1509549216, + "step": 8358 + }, + { + "epoch": 0.9150770409699226, + "grad_norm": 1.093714195755347, + "learning_rate": 8.829134613290574e-07, + "loss": 0.8603, + "num_input_tokens_seen": 1509726624, + "step": 8359 + }, + { + "epoch": 0.9151865130408604, + "grad_norm": 1.2067857017254207, + "learning_rate": 8.806499085955794e-07, + "loss": 0.8909, + "num_input_tokens_seen": 1509904480, + "step": 8360 + }, + { + "epoch": 0.9152959851117983, + "grad_norm": 1.2577463926393615, + "learning_rate": 8.783892091351053e-07, + "loss": 0.8662, + "num_input_tokens_seen": 1510045152, + "step": 8361 + }, + { + "epoch": 0.9154054571827362, + "grad_norm": 1.0761862752127527, + "learning_rate": 8.761313632150797e-07, + "loss": 0.9722, + "num_input_tokens_seen": 1510227936, + "step": 8362 + }, + { + "epoch": 0.9155149292536742, + "grad_norm": 1.195162495693904, + "learning_rate": 8.738763711025971e-07, + "loss": 0.9622, + "num_input_tokens_seen": 1510399968, + "step": 8363 + }, + { + "epoch": 0.9156244013246121, + "grad_norm": 1.065424260895753, + "learning_rate": 8.716242330644164e-07, + "loss": 0.576, + "num_input_tokens_seen": 1510596192, + "step": 8364 + }, + { + "epoch": 0.91573387339555, + "grad_norm": 1.2143990539901426, + "learning_rate": 8.693749493669662e-07, + "loss": 0.9965, + "num_input_tokens_seen": 1510784800, + "step": 8365 + }, + { + "epoch": 0.9158433454664878, + "grad_norm": 1.1442969280885418, + "learning_rate": 8.671285202763252e-07, + "loss": 0.8559, + "num_input_tokens_seen": 1510939584, + "step": 8366 + }, + { + "epoch": 0.9159528175374257, + "grad_norm": 1.4199711556258627, + "learning_rate": 8.648849460582503e-07, + "loss": 0.8519, + "num_input_tokens_seen": 1511079808, + "step": 8367 + }, + { + "epoch": 0.9160622896083637, + "grad_norm": 1.2467281436721653, + "learning_rate": 8.626442269781432e-07, + "loss": 1.038, + "num_input_tokens_seen": 1511282976, + "step": 8368 + }, + { + "epoch": 0.9161717616793016, + "grad_norm": 1.0987276434766693, + "learning_rate": 8.604063633010862e-07, + "loss": 0.8502, + "num_input_tokens_seen": 1511478304, + "step": 8369 + }, + { + "epoch": 0.9162812337502395, + "grad_norm": 1.0747520648788276, + "learning_rate": 8.581713552918064e-07, + "loss": 0.8015, + "num_input_tokens_seen": 1511651456, + "step": 8370 + }, + { + "epoch": 0.9163907058211773, + "grad_norm": 1.2313835497621186, + "learning_rate": 8.559392032147034e-07, + "loss": 1.0392, + "num_input_tokens_seen": 1511812512, + "step": 8371 + }, + { + "epoch": 0.9165001778921152, + "grad_norm": 1.1347380485401142, + "learning_rate": 8.537099073338384e-07, + "loss": 0.8304, + "num_input_tokens_seen": 1511992384, + "step": 8372 + }, + { + "epoch": 0.9166096499630532, + "grad_norm": 1.204553276530628, + "learning_rate": 8.514834679129336e-07, + "loss": 1.1033, + "num_input_tokens_seen": 1512172480, + "step": 8373 + }, + { + "epoch": 0.9167191220339911, + "grad_norm": 1.2098886192698972, + "learning_rate": 8.492598852153672e-07, + "loss": 0.6975, + "num_input_tokens_seen": 1512342272, + "step": 8374 + }, + { + "epoch": 0.916828594104929, + "grad_norm": 1.165289929376735, + "learning_rate": 8.470391595041871e-07, + "loss": 1.029, + "num_input_tokens_seen": 1512541632, + "step": 8375 + }, + { + "epoch": 0.9169380661758669, + "grad_norm": 1.0994207244091108, + "learning_rate": 8.448212910421055e-07, + "loss": 0.9035, + "num_input_tokens_seen": 1512737856, + "step": 8376 + }, + { + "epoch": 0.9170475382468047, + "grad_norm": 1.301585822108259, + "learning_rate": 8.426062800914846e-07, + "loss": 0.9798, + "num_input_tokens_seen": 1512936096, + "step": 8377 + }, + { + "epoch": 0.9171570103177427, + "grad_norm": 1.133752156921152, + "learning_rate": 8.40394126914365e-07, + "loss": 0.8594, + "num_input_tokens_seen": 1513101632, + "step": 8378 + }, + { + "epoch": 0.9172664823886806, + "grad_norm": 1.166872307274219, + "learning_rate": 8.381848317724289e-07, + "loss": 0.9631, + "num_input_tokens_seen": 1513272544, + "step": 8379 + }, + { + "epoch": 0.9173759544596185, + "grad_norm": 1.1958590485912608, + "learning_rate": 8.359783949270394e-07, + "loss": 0.9114, + "num_input_tokens_seen": 1513452416, + "step": 8380 + }, + { + "epoch": 0.9174854265305564, + "grad_norm": 1.0371675923992254, + "learning_rate": 8.337748166392129e-07, + "loss": 0.5996, + "num_input_tokens_seen": 1513616160, + "step": 8381 + }, + { + "epoch": 0.9175948986014943, + "grad_norm": 1.065696340281619, + "learning_rate": 8.315740971696295e-07, + "loss": 0.9963, + "num_input_tokens_seen": 1513805216, + "step": 8382 + }, + { + "epoch": 0.9177043706724322, + "grad_norm": 1.2497253116153915, + "learning_rate": 8.293762367786257e-07, + "loss": 0.8812, + "num_input_tokens_seen": 1513984416, + "step": 8383 + }, + { + "epoch": 0.9178138427433701, + "grad_norm": 1.2360171247795935, + "learning_rate": 8.271812357262043e-07, + "loss": 0.717, + "num_input_tokens_seen": 1514143232, + "step": 8384 + }, + { + "epoch": 0.917923314814308, + "grad_norm": 1.1408595536374289, + "learning_rate": 8.249890942720384e-07, + "loss": 0.7321, + "num_input_tokens_seen": 1514313024, + "step": 8385 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 1.1288243564350342, + "learning_rate": 8.227998126754427e-07, + "loss": 0.8186, + "num_input_tokens_seen": 1514509248, + "step": 8386 + }, + { + "epoch": 0.9181422589561838, + "grad_norm": 1.2897332498224707, + "learning_rate": 8.206133911954156e-07, + "loss": 0.908, + "num_input_tokens_seen": 1514681952, + "step": 8387 + }, + { + "epoch": 0.9182517310271217, + "grad_norm": 1.4186030038648807, + "learning_rate": 8.184298300905946e-07, + "loss": 1.1958, + "num_input_tokens_seen": 1514897440, + "step": 8388 + }, + { + "epoch": 0.9183612030980596, + "grad_norm": 1.02453635824913, + "learning_rate": 8.162491296193009e-07, + "loss": 0.9813, + "num_input_tokens_seen": 1515083136, + "step": 8389 + }, + { + "epoch": 0.9184706751689975, + "grad_norm": 1.2189008668297288, + "learning_rate": 8.140712900395031e-07, + "loss": 0.8221, + "num_input_tokens_seen": 1515242848, + "step": 8390 + }, + { + "epoch": 0.9185801472399354, + "grad_norm": 1.2371595819979513, + "learning_rate": 8.118963116088369e-07, + "loss": 0.781, + "num_input_tokens_seen": 1515410624, + "step": 8391 + }, + { + "epoch": 0.9186896193108733, + "grad_norm": 1.3360549696646606, + "learning_rate": 8.097241945845962e-07, + "loss": 0.9616, + "num_input_tokens_seen": 1515586464, + "step": 8392 + }, + { + "epoch": 0.9187990913818113, + "grad_norm": 1.1861492807220901, + "learning_rate": 8.075549392237369e-07, + "loss": 0.9637, + "num_input_tokens_seen": 1515743488, + "step": 8393 + }, + { + "epoch": 0.9189085634527491, + "grad_norm": 1.1146777044731577, + "learning_rate": 8.053885457828869e-07, + "loss": 0.7242, + "num_input_tokens_seen": 1515936576, + "step": 8394 + }, + { + "epoch": 0.919018035523687, + "grad_norm": 1.0294774745757551, + "learning_rate": 8.032250145183134e-07, + "loss": 0.6952, + "num_input_tokens_seen": 1516131232, + "step": 8395 + }, + { + "epoch": 0.9191275075946249, + "grad_norm": 1.0287656046354805, + "learning_rate": 8.010643456859645e-07, + "loss": 0.7914, + "num_input_tokens_seen": 1516335296, + "step": 8396 + }, + { + "epoch": 0.9192369796655628, + "grad_norm": 1.1357758271868121, + "learning_rate": 7.989065395414468e-07, + "loss": 0.8502, + "num_input_tokens_seen": 1516523456, + "step": 8397 + }, + { + "epoch": 0.9193464517365008, + "grad_norm": 1.1020559193327, + "learning_rate": 7.967515963400202e-07, + "loss": 0.6692, + "num_input_tokens_seen": 1516700416, + "step": 8398 + }, + { + "epoch": 0.9194559238074387, + "grad_norm": 1.4233209153972963, + "learning_rate": 7.945995163366083e-07, + "loss": 1.1064, + "num_input_tokens_seen": 1516869088, + "step": 8399 + }, + { + "epoch": 0.9195653958783765, + "grad_norm": 1.0805473205287472, + "learning_rate": 7.924502997858018e-07, + "loss": 0.7076, + "num_input_tokens_seen": 1517019616, + "step": 8400 + }, + { + "epoch": 0.9196748679493144, + "grad_norm": 1.118146844589734, + "learning_rate": 7.903039469418506e-07, + "loss": 1.1123, + "num_input_tokens_seen": 1517222112, + "step": 8401 + }, + { + "epoch": 0.9197843400202523, + "grad_norm": 1.2416528954905006, + "learning_rate": 7.881604580586593e-07, + "loss": 0.7529, + "num_input_tokens_seen": 1517400640, + "step": 8402 + }, + { + "epoch": 0.9198938120911903, + "grad_norm": 1.0872830623986243, + "learning_rate": 7.860198333897978e-07, + "loss": 1.0415, + "num_input_tokens_seen": 1517576704, + "step": 8403 + }, + { + "epoch": 0.9200032841621282, + "grad_norm": 1.259566795567805, + "learning_rate": 7.838820731885021e-07, + "loss": 0.8858, + "num_input_tokens_seen": 1517748064, + "step": 8404 + }, + { + "epoch": 0.920112756233066, + "grad_norm": 1.0468331513651925, + "learning_rate": 7.817471777076673e-07, + "loss": 1.0054, + "num_input_tokens_seen": 1517969376, + "step": 8405 + }, + { + "epoch": 0.9202222283040039, + "grad_norm": 1.0849219314334548, + "learning_rate": 7.796151471998414e-07, + "loss": 0.9971, + "num_input_tokens_seen": 1518148352, + "step": 8406 + }, + { + "epoch": 0.9203317003749418, + "grad_norm": 1.111533308613753, + "learning_rate": 7.77485981917242e-07, + "loss": 1.0098, + "num_input_tokens_seen": 1518339424, + "step": 8407 + }, + { + "epoch": 0.9204411724458798, + "grad_norm": 1.1500430140536413, + "learning_rate": 7.753596821117426e-07, + "loss": 0.908, + "num_input_tokens_seen": 1518530272, + "step": 8408 + }, + { + "epoch": 0.9205506445168177, + "grad_norm": 1.0431664097396536, + "learning_rate": 7.732362480348892e-07, + "loss": 0.8615, + "num_input_tokens_seen": 1518710816, + "step": 8409 + }, + { + "epoch": 0.9206601165877556, + "grad_norm": 1.0997449882724388, + "learning_rate": 7.7111567993787e-07, + "loss": 0.7845, + "num_input_tokens_seen": 1518867616, + "step": 8410 + }, + { + "epoch": 0.9207695886586934, + "grad_norm": 1.262343733152816, + "learning_rate": 7.689979780715534e-07, + "loss": 0.999, + "num_input_tokens_seen": 1519025984, + "step": 8411 + }, + { + "epoch": 0.9208790607296313, + "grad_norm": 1.1638965872482423, + "learning_rate": 7.668831426864448e-07, + "loss": 0.8872, + "num_input_tokens_seen": 1519221984, + "step": 8412 + }, + { + "epoch": 0.9209885328005692, + "grad_norm": 1.1240332476256578, + "learning_rate": 7.647711740327412e-07, + "loss": 0.7102, + "num_input_tokens_seen": 1519394464, + "step": 8413 + }, + { + "epoch": 0.9210980048715072, + "grad_norm": 1.059995841791886, + "learning_rate": 7.626620723602762e-07, + "loss": 0.9296, + "num_input_tokens_seen": 1519583520, + "step": 8414 + }, + { + "epoch": 0.9212074769424451, + "grad_norm": 1.3315794953392144, + "learning_rate": 7.605558379185556e-07, + "loss": 1.0464, + "num_input_tokens_seen": 1519741888, + "step": 8415 + }, + { + "epoch": 0.921316949013383, + "grad_norm": 1.015392100700051, + "learning_rate": 7.584524709567386e-07, + "loss": 0.9576, + "num_input_tokens_seen": 1519928032, + "step": 8416 + }, + { + "epoch": 0.9214264210843208, + "grad_norm": 1.1414371890993107, + "learning_rate": 7.563519717236511e-07, + "loss": 0.9107, + "num_input_tokens_seen": 1520122912, + "step": 8417 + }, + { + "epoch": 0.9215358931552587, + "grad_norm": 1.0971271692354303, + "learning_rate": 7.54254340467786e-07, + "loss": 0.7846, + "num_input_tokens_seen": 1520287552, + "step": 8418 + }, + { + "epoch": 0.9216453652261967, + "grad_norm": 1.2708543661075946, + "learning_rate": 7.521595774372752e-07, + "loss": 1.1156, + "num_input_tokens_seen": 1520464960, + "step": 8419 + }, + { + "epoch": 0.9217548372971346, + "grad_norm": 1.051379462490605, + "learning_rate": 7.500676828799402e-07, + "loss": 0.7549, + "num_input_tokens_seen": 1520622208, + "step": 8420 + }, + { + "epoch": 0.9218643093680725, + "grad_norm": 1.6370570304364749, + "learning_rate": 7.479786570432329e-07, + "loss": 1.1153, + "num_input_tokens_seen": 1520815744, + "step": 8421 + }, + { + "epoch": 0.9219737814390103, + "grad_norm": 1.0660806235587574, + "learning_rate": 7.458925001742917e-07, + "loss": 0.8271, + "num_input_tokens_seen": 1520996288, + "step": 8422 + }, + { + "epoch": 0.9220832535099482, + "grad_norm": 0.9394786617115419, + "learning_rate": 7.438092125199025e-07, + "loss": 0.7357, + "num_input_tokens_seen": 1521183776, + "step": 8423 + }, + { + "epoch": 0.9221927255808862, + "grad_norm": 1.0531955480292399, + "learning_rate": 7.417287943265128e-07, + "loss": 1.0724, + "num_input_tokens_seen": 1521362528, + "step": 8424 + }, + { + "epoch": 0.9223021976518241, + "grad_norm": 1.012800956078842, + "learning_rate": 7.39651245840231e-07, + "loss": 0.6984, + "num_input_tokens_seen": 1521540832, + "step": 8425 + }, + { + "epoch": 0.922411669722762, + "grad_norm": 1.1932159187761453, + "learning_rate": 7.375765673068275e-07, + "loss": 0.9409, + "num_input_tokens_seen": 1521728544, + "step": 8426 + }, + { + "epoch": 0.9225211417936999, + "grad_norm": 1.0485216541882947, + "learning_rate": 7.355047589717418e-07, + "loss": 0.7307, + "num_input_tokens_seen": 1521912896, + "step": 8427 + }, + { + "epoch": 0.9226306138646377, + "grad_norm": 1.0584216664213726, + "learning_rate": 7.334358210800473e-07, + "loss": 1.0393, + "num_input_tokens_seen": 1522110240, + "step": 8428 + }, + { + "epoch": 0.9227400859355757, + "grad_norm": 1.0575367623109946, + "learning_rate": 7.313697538765124e-07, + "loss": 0.7603, + "num_input_tokens_seen": 1522277120, + "step": 8429 + }, + { + "epoch": 0.9228495580065136, + "grad_norm": 1.0720849501628127, + "learning_rate": 7.293065576055386e-07, + "loss": 0.7412, + "num_input_tokens_seen": 1522459456, + "step": 8430 + }, + { + "epoch": 0.9229590300774515, + "grad_norm": 1.1551956318236423, + "learning_rate": 7.272462325112056e-07, + "loss": 0.9742, + "num_input_tokens_seen": 1522648736, + "step": 8431 + }, + { + "epoch": 0.9230685021483894, + "grad_norm": 1.1191296002708773, + "learning_rate": 7.25188778837238e-07, + "loss": 0.8604, + "num_input_tokens_seen": 1522807328, + "step": 8432 + }, + { + "epoch": 0.9231779742193273, + "grad_norm": 1.0601998282958638, + "learning_rate": 7.231341968270328e-07, + "loss": 0.9193, + "num_input_tokens_seen": 1523008704, + "step": 8433 + }, + { + "epoch": 0.9232874462902652, + "grad_norm": 1.1381302292864437, + "learning_rate": 7.210824867236427e-07, + "loss": 0.863, + "num_input_tokens_seen": 1523223744, + "step": 8434 + }, + { + "epoch": 0.9233969183612031, + "grad_norm": 1.0568564823813598, + "learning_rate": 7.190336487697791e-07, + "loss": 1.314, + "num_input_tokens_seen": 1523444608, + "step": 8435 + }, + { + "epoch": 0.923506390432141, + "grad_norm": 1.0883924330242587, + "learning_rate": 7.169876832078204e-07, + "loss": 0.7338, + "num_input_tokens_seen": 1523643296, + "step": 8436 + }, + { + "epoch": 0.9236158625030789, + "grad_norm": 1.1905928539832555, + "learning_rate": 7.14944590279798e-07, + "loss": 0.623, + "num_input_tokens_seen": 1523815552, + "step": 8437 + }, + { + "epoch": 0.9237253345740168, + "grad_norm": 0.9327033996678763, + "learning_rate": 7.129043702274018e-07, + "loss": 0.7945, + "num_input_tokens_seen": 1524033056, + "step": 8438 + }, + { + "epoch": 0.9238348066449547, + "grad_norm": 1.2284234865990977, + "learning_rate": 7.108670232919946e-07, + "loss": 0.8372, + "num_input_tokens_seen": 1524196352, + "step": 8439 + }, + { + "epoch": 0.9239442787158926, + "grad_norm": 1.4216521371424684, + "learning_rate": 7.088325497145832e-07, + "loss": 0.873, + "num_input_tokens_seen": 1524360096, + "step": 8440 + }, + { + "epoch": 0.9240537507868305, + "grad_norm": 1.1303063392629464, + "learning_rate": 7.068009497358446e-07, + "loss": 0.9152, + "num_input_tokens_seen": 1524548256, + "step": 8441 + }, + { + "epoch": 0.9241632228577684, + "grad_norm": 1.0301548315542122, + "learning_rate": 7.047722235961119e-07, + "loss": 0.917, + "num_input_tokens_seen": 1524739328, + "step": 8442 + }, + { + "epoch": 0.9242726949287063, + "grad_norm": 1.2644690898256274, + "learning_rate": 7.027463715353789e-07, + "loss": 0.882, + "num_input_tokens_seen": 1524908224, + "step": 8443 + }, + { + "epoch": 0.9243821669996443, + "grad_norm": 1.021839670859624, + "learning_rate": 7.007233937933067e-07, + "loss": 0.672, + "num_input_tokens_seen": 1525072640, + "step": 8444 + }, + { + "epoch": 0.9244916390705821, + "grad_norm": 1.1256268662135929, + "learning_rate": 6.987032906091983e-07, + "loss": 0.8003, + "num_input_tokens_seen": 1525254976, + "step": 8445 + }, + { + "epoch": 0.92460111114152, + "grad_norm": 1.1537938708295474, + "learning_rate": 6.966860622220378e-07, + "loss": 1.0907, + "num_input_tokens_seen": 1525446272, + "step": 8446 + }, + { + "epoch": 0.9247105832124579, + "grad_norm": 1.0354518252466487, + "learning_rate": 6.946717088704563e-07, + "loss": 0.9342, + "num_input_tokens_seen": 1525654144, + "step": 8447 + }, + { + "epoch": 0.9248200552833958, + "grad_norm": 1.0727492390089641, + "learning_rate": 6.926602307927494e-07, + "loss": 0.7745, + "num_input_tokens_seen": 1525812960, + "step": 8448 + }, + { + "epoch": 0.9249295273543338, + "grad_norm": 1.0826023949347672, + "learning_rate": 6.906516282268682e-07, + "loss": 0.825, + "num_input_tokens_seen": 1525965056, + "step": 8449 + }, + { + "epoch": 0.9250389994252717, + "grad_norm": 1.1183971978232952, + "learning_rate": 6.88645901410423e-07, + "loss": 0.8161, + "num_input_tokens_seen": 1526169568, + "step": 8450 + }, + { + "epoch": 0.9251484714962095, + "grad_norm": 1.0503825172119914, + "learning_rate": 6.866430505807014e-07, + "loss": 0.7019, + "num_input_tokens_seen": 1526341152, + "step": 8451 + }, + { + "epoch": 0.9252579435671474, + "grad_norm": 1.0482952675228323, + "learning_rate": 6.846430759746198e-07, + "loss": 0.6988, + "num_input_tokens_seen": 1526513856, + "step": 8452 + }, + { + "epoch": 0.9253674156380853, + "grad_norm": 0.9695068294821688, + "learning_rate": 6.826459778287858e-07, + "loss": 0.5826, + "num_input_tokens_seen": 1526686784, + "step": 8453 + }, + { + "epoch": 0.9254768877090233, + "grad_norm": 1.181825229316986, + "learning_rate": 6.806517563794385e-07, + "loss": 1.0091, + "num_input_tokens_seen": 1526872256, + "step": 8454 + }, + { + "epoch": 0.9255863597799612, + "grad_norm": 1.0719912096204045, + "learning_rate": 6.786604118625029e-07, + "loss": 0.7468, + "num_input_tokens_seen": 1527026592, + "step": 8455 + }, + { + "epoch": 0.925695831850899, + "grad_norm": 1.1305794641235423, + "learning_rate": 6.766719445135434e-07, + "loss": 0.8087, + "num_input_tokens_seen": 1527206240, + "step": 8456 + }, + { + "epoch": 0.9258053039218369, + "grad_norm": 1.1465337635430601, + "learning_rate": 6.746863545677967e-07, + "loss": 0.9421, + "num_input_tokens_seen": 1527348032, + "step": 8457 + }, + { + "epoch": 0.9259147759927748, + "grad_norm": 1.0417571727852708, + "learning_rate": 6.727036422601529e-07, + "loss": 0.7819, + "num_input_tokens_seen": 1527527232, + "step": 8458 + }, + { + "epoch": 0.9260242480637128, + "grad_norm": 0.999379024819577, + "learning_rate": 6.707238078251576e-07, + "loss": 0.8923, + "num_input_tokens_seen": 1527710688, + "step": 8459 + }, + { + "epoch": 0.9261337201346507, + "grad_norm": 1.0636864937496733, + "learning_rate": 6.687468514970319e-07, + "loss": 0.9081, + "num_input_tokens_seen": 1527886080, + "step": 8460 + }, + { + "epoch": 0.9262431922055886, + "grad_norm": 1.0455459014835597, + "learning_rate": 6.667727735096357e-07, + "loss": 0.7566, + "num_input_tokens_seen": 1528058784, + "step": 8461 + }, + { + "epoch": 0.9263526642765264, + "grad_norm": 1.1880880706682673, + "learning_rate": 6.648015740965074e-07, + "loss": 0.8383, + "num_input_tokens_seen": 1528244928, + "step": 8462 + }, + { + "epoch": 0.9264621363474643, + "grad_norm": 1.1854213289907758, + "learning_rate": 6.628332534908272e-07, + "loss": 0.7559, + "num_input_tokens_seen": 1528400832, + "step": 8463 + }, + { + "epoch": 0.9265716084184022, + "grad_norm": 1.1329303022496813, + "learning_rate": 6.608678119254502e-07, + "loss": 1.1642, + "num_input_tokens_seen": 1528612960, + "step": 8464 + }, + { + "epoch": 0.9266810804893402, + "grad_norm": 1.1502346069028933, + "learning_rate": 6.589052496328824e-07, + "loss": 0.7349, + "num_input_tokens_seen": 1528769760, + "step": 8465 + }, + { + "epoch": 0.9267905525602781, + "grad_norm": 0.949244463536993, + "learning_rate": 6.569455668452934e-07, + "loss": 0.8504, + "num_input_tokens_seen": 1528971360, + "step": 8466 + }, + { + "epoch": 0.926900024631216, + "grad_norm": 1.136522833336753, + "learning_rate": 6.549887637945063e-07, + "loss": 0.8098, + "num_input_tokens_seen": 1529166240, + "step": 8467 + }, + { + "epoch": 0.9270094967021538, + "grad_norm": 1.1002415521874231, + "learning_rate": 6.530348407120052e-07, + "loss": 0.7526, + "num_input_tokens_seen": 1529336032, + "step": 8468 + }, + { + "epoch": 0.9271189687730917, + "grad_norm": 1.1321155377186636, + "learning_rate": 6.510837978289414e-07, + "loss": 0.8906, + "num_input_tokens_seen": 1529540544, + "step": 8469 + }, + { + "epoch": 0.9272284408440297, + "grad_norm": 1.1556708973445058, + "learning_rate": 6.491356353761191e-07, + "loss": 0.9147, + "num_input_tokens_seen": 1529755136, + "step": 8470 + }, + { + "epoch": 0.9273379129149676, + "grad_norm": 1.2718931791988675, + "learning_rate": 6.471903535839985e-07, + "loss": 0.8517, + "num_input_tokens_seen": 1529933664, + "step": 8471 + }, + { + "epoch": 0.9274473849859055, + "grad_norm": 1.0929007366578989, + "learning_rate": 6.452479526827065e-07, + "loss": 0.821, + "num_input_tokens_seen": 1530105472, + "step": 8472 + }, + { + "epoch": 0.9275568570568433, + "grad_norm": 1.00095893658262, + "learning_rate": 6.433084329020233e-07, + "loss": 0.9283, + "num_input_tokens_seen": 1530281312, + "step": 8473 + }, + { + "epoch": 0.9276663291277812, + "grad_norm": 1.2068609222428663, + "learning_rate": 6.413717944713876e-07, + "loss": 0.7355, + "num_input_tokens_seen": 1530406080, + "step": 8474 + }, + { + "epoch": 0.9277758011987192, + "grad_norm": 1.150462797585491, + "learning_rate": 6.39438037619905e-07, + "loss": 0.6925, + "num_input_tokens_seen": 1530575424, + "step": 8475 + }, + { + "epoch": 0.9278852732696571, + "grad_norm": 1.085116178735323, + "learning_rate": 6.375071625763285e-07, + "loss": 0.7625, + "num_input_tokens_seen": 1530733344, + "step": 8476 + }, + { + "epoch": 0.927994745340595, + "grad_norm": 1.1591580396139678, + "learning_rate": 6.355791695690866e-07, + "loss": 1.1852, + "num_input_tokens_seen": 1530910976, + "step": 8477 + }, + { + "epoch": 0.9281042174115329, + "grad_norm": 1.1469940144583084, + "learning_rate": 6.336540588262496e-07, + "loss": 0.8084, + "num_input_tokens_seen": 1531099136, + "step": 8478 + }, + { + "epoch": 0.9282136894824707, + "grad_norm": 1.1337201921441276, + "learning_rate": 6.317318305755604e-07, + "loss": 0.793, + "num_input_tokens_seen": 1531284384, + "step": 8479 + }, + { + "epoch": 0.9283231615534087, + "grad_norm": 1.1804571104454136, + "learning_rate": 6.298124850444093e-07, + "loss": 0.9664, + "num_input_tokens_seen": 1531469856, + "step": 8480 + }, + { + "epoch": 0.9284326336243466, + "grad_norm": 1.166369411655849, + "learning_rate": 6.278960224598507e-07, + "loss": 0.8822, + "num_input_tokens_seen": 1531652640, + "step": 8481 + }, + { + "epoch": 0.9285421056952845, + "grad_norm": 1.0547464424899717, + "learning_rate": 6.259824430486061e-07, + "loss": 0.8781, + "num_input_tokens_seen": 1531842368, + "step": 8482 + }, + { + "epoch": 0.9286515777662224, + "grad_norm": 1.2292699878177795, + "learning_rate": 6.240717470370361e-07, + "loss": 0.8605, + "num_input_tokens_seen": 1532052032, + "step": 8483 + }, + { + "epoch": 0.9287610498371603, + "grad_norm": 1.0867594620121834, + "learning_rate": 6.221639346511876e-07, + "loss": 0.8176, + "num_input_tokens_seen": 1532221376, + "step": 8484 + }, + { + "epoch": 0.9288705219080982, + "grad_norm": 1.0920074751112525, + "learning_rate": 6.202590061167385e-07, + "loss": 0.8043, + "num_input_tokens_seen": 1532422304, + "step": 8485 + }, + { + "epoch": 0.9289799939790361, + "grad_norm": 0.947209873721816, + "learning_rate": 6.183569616590446e-07, + "loss": 0.8283, + "num_input_tokens_seen": 1532637120, + "step": 8486 + }, + { + "epoch": 0.929089466049974, + "grad_norm": 1.0918883147189382, + "learning_rate": 6.164578015031092e-07, + "loss": 0.7343, + "num_input_tokens_seen": 1532801984, + "step": 8487 + }, + { + "epoch": 0.9291989381209119, + "grad_norm": 1.1857289106957496, + "learning_rate": 6.145615258736054e-07, + "loss": 1.006, + "num_input_tokens_seen": 1532994848, + "step": 8488 + }, + { + "epoch": 0.9293084101918498, + "grad_norm": 1.1567381266653811, + "learning_rate": 6.126681349948565e-07, + "loss": 1.0277, + "num_input_tokens_seen": 1533185024, + "step": 8489 + }, + { + "epoch": 0.9294178822627877, + "grad_norm": 1.255867680689754, + "learning_rate": 6.107776290908418e-07, + "loss": 1.0418, + "num_input_tokens_seen": 1533360864, + "step": 8490 + }, + { + "epoch": 0.9295273543337256, + "grad_norm": 1.1744639980016112, + "learning_rate": 6.088900083852184e-07, + "loss": 0.7971, + "num_input_tokens_seen": 1533542976, + "step": 8491 + }, + { + "epoch": 0.9296368264046635, + "grad_norm": 1.3475259026778508, + "learning_rate": 6.070052731012688e-07, + "loss": 1.0393, + "num_input_tokens_seen": 1533717248, + "step": 8492 + }, + { + "epoch": 0.9297462984756014, + "grad_norm": 1.1447738546571455, + "learning_rate": 6.051234234619729e-07, + "loss": 0.9276, + "num_input_tokens_seen": 1533922208, + "step": 8493 + }, + { + "epoch": 0.9298557705465393, + "grad_norm": 1.0942867596844321, + "learning_rate": 6.032444596899333e-07, + "loss": 0.6549, + "num_input_tokens_seen": 1534081696, + "step": 8494 + }, + { + "epoch": 0.9299652426174773, + "grad_norm": 1.1698720981917743, + "learning_rate": 6.013683820074418e-07, + "loss": 0.8835, + "num_input_tokens_seen": 1534246784, + "step": 8495 + }, + { + "epoch": 0.9300747146884151, + "grad_norm": 0.9425586595671412, + "learning_rate": 5.99495190636426e-07, + "loss": 0.7314, + "num_input_tokens_seen": 1534415904, + "step": 8496 + }, + { + "epoch": 0.930184186759353, + "grad_norm": 1.1184791620706789, + "learning_rate": 5.976248857984812e-07, + "loss": 0.7582, + "num_input_tokens_seen": 1534620192, + "step": 8497 + }, + { + "epoch": 0.9302936588302909, + "grad_norm": 1.0440231879425237, + "learning_rate": 5.957574677148664e-07, + "loss": 0.7716, + "num_input_tokens_seen": 1534799616, + "step": 8498 + }, + { + "epoch": 0.9304031309012288, + "grad_norm": 1.0536228599187178, + "learning_rate": 5.938929366064882e-07, + "loss": 0.7782, + "num_input_tokens_seen": 1534978816, + "step": 8499 + }, + { + "epoch": 0.9305126029721668, + "grad_norm": 1.1533910614975706, + "learning_rate": 5.920312926939203e-07, + "loss": 0.8257, + "num_input_tokens_seen": 1535190048, + "step": 8500 + }, + { + "epoch": 0.9306220750431047, + "grad_norm": 1.0426570126419101, + "learning_rate": 5.901725361973864e-07, + "loss": 0.7596, + "num_input_tokens_seen": 1535391648, + "step": 8501 + }, + { + "epoch": 0.9307315471140425, + "grad_norm": 1.12043195058419, + "learning_rate": 5.883166673367829e-07, + "loss": 1.1427, + "num_input_tokens_seen": 1535575104, + "step": 8502 + }, + { + "epoch": 0.9308410191849804, + "grad_norm": 1.008588152337294, + "learning_rate": 5.864636863316453e-07, + "loss": 0.8961, + "num_input_tokens_seen": 1535766400, + "step": 8503 + }, + { + "epoch": 0.9309504912559183, + "grad_norm": 1.1074600347709198, + "learning_rate": 5.84613593401187e-07, + "loss": 0.9153, + "num_input_tokens_seen": 1535944928, + "step": 8504 + }, + { + "epoch": 0.9310599633268563, + "grad_norm": 1.027720799169766, + "learning_rate": 5.827663887642665e-07, + "loss": 1.0147, + "num_input_tokens_seen": 1536123232, + "step": 8505 + }, + { + "epoch": 0.9311694353977942, + "grad_norm": 1.0771078204108264, + "learning_rate": 5.809220726394032e-07, + "loss": 0.9718, + "num_input_tokens_seen": 1536305120, + "step": 8506 + }, + { + "epoch": 0.931278907468732, + "grad_norm": 1.1070620105160636, + "learning_rate": 5.790806452447756e-07, + "loss": 0.7181, + "num_input_tokens_seen": 1536491040, + "step": 8507 + }, + { + "epoch": 0.9313883795396699, + "grad_norm": 1.0839617202728358, + "learning_rate": 5.772421067982259e-07, + "loss": 0.7907, + "num_input_tokens_seen": 1536681216, + "step": 8508 + }, + { + "epoch": 0.9314978516106078, + "grad_norm": 1.0602806136432297, + "learning_rate": 5.754064575172441e-07, + "loss": 0.9292, + "num_input_tokens_seen": 1536875648, + "step": 8509 + }, + { + "epoch": 0.9316073236815458, + "grad_norm": 1.1914962484418339, + "learning_rate": 5.735736976189871e-07, + "loss": 1.0131, + "num_input_tokens_seen": 1537057984, + "step": 8510 + }, + { + "epoch": 0.9317167957524837, + "grad_norm": 1.1330340464763398, + "learning_rate": 5.717438273202674e-07, + "loss": 0.7645, + "num_input_tokens_seen": 1537246816, + "step": 8511 + }, + { + "epoch": 0.9318262678234216, + "grad_norm": 1.1446301935021237, + "learning_rate": 5.699168468375538e-07, + "loss": 0.909, + "num_input_tokens_seen": 1537409440, + "step": 8512 + }, + { + "epoch": 0.9319357398943594, + "grad_norm": 1.094667268043923, + "learning_rate": 5.680927563869731e-07, + "loss": 0.8172, + "num_input_tokens_seen": 1537586400, + "step": 8513 + }, + { + "epoch": 0.9320452119652973, + "grad_norm": 1.1564085568148734, + "learning_rate": 5.662715561843141e-07, + "loss": 1.0101, + "num_input_tokens_seen": 1537789120, + "step": 8514 + }, + { + "epoch": 0.9321546840362352, + "grad_norm": 1.1129381688500986, + "learning_rate": 5.644532464450237e-07, + "loss": 0.8299, + "num_input_tokens_seen": 1537951744, + "step": 8515 + }, + { + "epoch": 0.9322641561071732, + "grad_norm": 1.203906016315732, + "learning_rate": 5.626378273841965e-07, + "loss": 0.9206, + "num_input_tokens_seen": 1538158048, + "step": 8516 + }, + { + "epoch": 0.9323736281781111, + "grad_norm": 0.9708541967472281, + "learning_rate": 5.608252992166024e-07, + "loss": 0.7052, + "num_input_tokens_seen": 1538376000, + "step": 8517 + }, + { + "epoch": 0.932483100249049, + "grad_norm": 1.1743678218506457, + "learning_rate": 5.590156621566506e-07, + "loss": 1.1166, + "num_input_tokens_seen": 1538572224, + "step": 8518 + }, + { + "epoch": 0.9325925723199868, + "grad_norm": 1.0110370003318567, + "learning_rate": 5.572089164184253e-07, + "loss": 0.6626, + "num_input_tokens_seen": 1538771136, + "step": 8519 + }, + { + "epoch": 0.9327020443909247, + "grad_norm": 1.1829143122566408, + "learning_rate": 5.554050622156609e-07, + "loss": 0.9293, + "num_input_tokens_seen": 1538905088, + "step": 8520 + }, + { + "epoch": 0.9328115164618627, + "grad_norm": 1.058319675677826, + "learning_rate": 5.536040997617453e-07, + "loss": 0.8047, + "num_input_tokens_seen": 1539074208, + "step": 8521 + }, + { + "epoch": 0.9329209885328006, + "grad_norm": 1.1064844217147554, + "learning_rate": 5.518060292697302e-07, + "loss": 0.875, + "num_input_tokens_seen": 1539268192, + "step": 8522 + }, + { + "epoch": 0.9330304606037385, + "grad_norm": 1.138100220495849, + "learning_rate": 5.50010850952326e-07, + "loss": 0.8767, + "num_input_tokens_seen": 1539418272, + "step": 8523 + }, + { + "epoch": 0.9331399326746763, + "grad_norm": 0.9687319415470499, + "learning_rate": 5.482185650218991e-07, + "loss": 0.8176, + "num_input_tokens_seen": 1539624128, + "step": 8524 + }, + { + "epoch": 0.9332494047456142, + "grad_norm": 1.1531556970495696, + "learning_rate": 5.464291716904684e-07, + "loss": 1.0462, + "num_input_tokens_seen": 1539795712, + "step": 8525 + }, + { + "epoch": 0.9333588768165522, + "grad_norm": 0.9929835512752363, + "learning_rate": 5.446426711697233e-07, + "loss": 0.8196, + "num_input_tokens_seen": 1539988800, + "step": 8526 + }, + { + "epoch": 0.9334683488874901, + "grad_norm": 1.2267960247130862, + "learning_rate": 5.428590636709973e-07, + "loss": 0.7238, + "num_input_tokens_seen": 1540177184, + "step": 8527 + }, + { + "epoch": 0.933577820958428, + "grad_norm": 1.1547760202961812, + "learning_rate": 5.41078349405294e-07, + "loss": 0.9734, + "num_input_tokens_seen": 1540381472, + "step": 8528 + }, + { + "epoch": 0.9336872930293659, + "grad_norm": 1.0734236115247082, + "learning_rate": 5.393005285832586e-07, + "loss": 0.7023, + "num_input_tokens_seen": 1540566048, + "step": 8529 + }, + { + "epoch": 0.9337967651003037, + "grad_norm": 0.9992874408223208, + "learning_rate": 5.375256014152119e-07, + "loss": 0.7538, + "num_input_tokens_seen": 1540751968, + "step": 8530 + }, + { + "epoch": 0.9339062371712417, + "grad_norm": 1.1348054914972916, + "learning_rate": 5.35753568111122e-07, + "loss": 0.9848, + "num_input_tokens_seen": 1540927360, + "step": 8531 + }, + { + "epoch": 0.9340157092421796, + "grad_norm": 1.1420636967099667, + "learning_rate": 5.339844288806156e-07, + "loss": 1.1922, + "num_input_tokens_seen": 1541126496, + "step": 8532 + }, + { + "epoch": 0.9341251813131175, + "grad_norm": 1.0820569748006723, + "learning_rate": 5.322181839329865e-07, + "loss": 0.8993, + "num_input_tokens_seen": 1541313984, + "step": 8533 + }, + { + "epoch": 0.9342346533840554, + "grad_norm": 1.2383609589635238, + "learning_rate": 5.304548334771648e-07, + "loss": 0.8487, + "num_input_tokens_seen": 1541473920, + "step": 8534 + }, + { + "epoch": 0.9343441254549933, + "grad_norm": 1.1765553407083964, + "learning_rate": 5.28694377721764e-07, + "loss": 0.9311, + "num_input_tokens_seen": 1541658720, + "step": 8535 + }, + { + "epoch": 0.9344535975259312, + "grad_norm": 1.1103527971594664, + "learning_rate": 5.269368168750316e-07, + "loss": 0.9159, + "num_input_tokens_seen": 1541818880, + "step": 8536 + }, + { + "epoch": 0.9345630695968691, + "grad_norm": 1.1364828697420666, + "learning_rate": 5.251821511448928e-07, + "loss": 0.876, + "num_input_tokens_seen": 1541990464, + "step": 8537 + }, + { + "epoch": 0.934672541667807, + "grad_norm": 1.0630740129824228, + "learning_rate": 5.234303807389151e-07, + "loss": 0.6961, + "num_input_tokens_seen": 1542149728, + "step": 8538 + }, + { + "epoch": 0.9347820137387449, + "grad_norm": 1.0524486939708069, + "learning_rate": 5.216815058643353e-07, + "loss": 0.8403, + "num_input_tokens_seen": 1542329152, + "step": 8539 + }, + { + "epoch": 0.9348914858096828, + "grad_norm": 1.041512776769392, + "learning_rate": 5.199355267280382e-07, + "loss": 0.6809, + "num_input_tokens_seen": 1542488416, + "step": 8540 + }, + { + "epoch": 0.9350009578806207, + "grad_norm": 1.1939623319316721, + "learning_rate": 5.181924435365693e-07, + "loss": 0.9135, + "num_input_tokens_seen": 1542682176, + "step": 8541 + }, + { + "epoch": 0.9351104299515586, + "grad_norm": 1.2494242295779856, + "learning_rate": 5.164522564961332e-07, + "loss": 0.9344, + "num_input_tokens_seen": 1542872352, + "step": 8542 + }, + { + "epoch": 0.9352199020224965, + "grad_norm": 1.1764733188260872, + "learning_rate": 5.147149658125877e-07, + "loss": 0.8241, + "num_input_tokens_seen": 1543026240, + "step": 8543 + }, + { + "epoch": 0.9353293740934344, + "grad_norm": 1.1199851053814265, + "learning_rate": 5.129805716914571e-07, + "loss": 0.9263, + "num_input_tokens_seen": 1543232544, + "step": 8544 + }, + { + "epoch": 0.9354388461643723, + "grad_norm": 1.1855937816509567, + "learning_rate": 5.112490743379133e-07, + "loss": 1.0197, + "num_input_tokens_seen": 1543410624, + "step": 8545 + }, + { + "epoch": 0.9355483182353103, + "grad_norm": 1.0537072606769435, + "learning_rate": 5.095204739567899e-07, + "loss": 0.863, + "num_input_tokens_seen": 1543624544, + "step": 8546 + }, + { + "epoch": 0.9356577903062481, + "grad_norm": 1.135374500063297, + "learning_rate": 5.07794770752576e-07, + "loss": 0.8846, + "num_input_tokens_seen": 1543827712, + "step": 8547 + }, + { + "epoch": 0.935767262377186, + "grad_norm": 1.102457102591942, + "learning_rate": 5.060719649294194e-07, + "loss": 1.1799, + "num_input_tokens_seen": 1543996160, + "step": 8548 + }, + { + "epoch": 0.9358767344481239, + "grad_norm": 1.0586699285933385, + "learning_rate": 5.043520566911264e-07, + "loss": 0.8553, + "num_input_tokens_seen": 1544201344, + "step": 8549 + }, + { + "epoch": 0.9359862065190618, + "grad_norm": 1.0547646953447847, + "learning_rate": 5.026350462411567e-07, + "loss": 0.9294, + "num_input_tokens_seen": 1544402720, + "step": 8550 + }, + { + "epoch": 0.9360956785899998, + "grad_norm": 1.1244943330044428, + "learning_rate": 5.009209337826254e-07, + "loss": 0.9125, + "num_input_tokens_seen": 1544595584, + "step": 8551 + }, + { + "epoch": 0.9362051506609377, + "grad_norm": 1.1104541955919716, + "learning_rate": 4.992097195183176e-07, + "loss": 0.7121, + "num_input_tokens_seen": 1544752832, + "step": 8552 + }, + { + "epoch": 0.9363146227318755, + "grad_norm": 1.081040848365151, + "learning_rate": 4.975014036506631e-07, + "loss": 0.7835, + "num_input_tokens_seen": 1544917696, + "step": 8553 + }, + { + "epoch": 0.9364240948028134, + "grad_norm": 1.2847091221642675, + "learning_rate": 4.957959863817502e-07, + "loss": 0.9602, + "num_input_tokens_seen": 1545098240, + "step": 8554 + }, + { + "epoch": 0.9365335668737513, + "grad_norm": 1.1156388332674023, + "learning_rate": 4.940934679133286e-07, + "loss": 0.9899, + "num_input_tokens_seen": 1545311712, + "step": 8555 + }, + { + "epoch": 0.9366430389446893, + "grad_norm": 1.2267444257001245, + "learning_rate": 4.923938484468038e-07, + "loss": 0.8513, + "num_input_tokens_seen": 1545507936, + "step": 8556 + }, + { + "epoch": 0.9367525110156272, + "grad_norm": 1.0461585596704452, + "learning_rate": 4.906971281832346e-07, + "loss": 0.8305, + "num_input_tokens_seen": 1545675712, + "step": 8557 + }, + { + "epoch": 0.936861983086565, + "grad_norm": 1.0694095673316657, + "learning_rate": 4.890033073233408e-07, + "loss": 0.723, + "num_input_tokens_seen": 1545842368, + "step": 8558 + }, + { + "epoch": 0.9369714551575029, + "grad_norm": 0.9780928256072136, + "learning_rate": 4.87312386067501e-07, + "loss": 0.882, + "num_input_tokens_seen": 1546033888, + "step": 8559 + }, + { + "epoch": 0.9370809272284408, + "grad_norm": 1.0765641011750782, + "learning_rate": 4.856243646157415e-07, + "loss": 1.0436, + "num_input_tokens_seen": 1546230784, + "step": 8560 + }, + { + "epoch": 0.9371903992993788, + "grad_norm": 1.1009773702383334, + "learning_rate": 4.83939243167758e-07, + "loss": 1.016, + "num_input_tokens_seen": 1546413120, + "step": 8561 + }, + { + "epoch": 0.9372998713703167, + "grad_norm": 1.0315505659030746, + "learning_rate": 4.822570219228967e-07, + "loss": 0.7022, + "num_input_tokens_seen": 1546574400, + "step": 8562 + }, + { + "epoch": 0.9374093434412546, + "grad_norm": 1.1767668790982413, + "learning_rate": 4.805777010801593e-07, + "loss": 0.9933, + "num_input_tokens_seen": 1546740832, + "step": 8563 + }, + { + "epoch": 0.9375188155121924, + "grad_norm": 1.1057275799335458, + "learning_rate": 4.789012808382065e-07, + "loss": 0.7225, + "num_input_tokens_seen": 1546906144, + "step": 8564 + }, + { + "epoch": 0.9376282875831303, + "grad_norm": 1.1541470141522219, + "learning_rate": 4.772277613953546e-07, + "loss": 0.9981, + "num_input_tokens_seen": 1547086912, + "step": 8565 + }, + { + "epoch": 0.9377377596540682, + "grad_norm": 1.0556553673540494, + "learning_rate": 4.7555714294958144e-07, + "loss": 0.8358, + "num_input_tokens_seen": 1547263424, + "step": 8566 + }, + { + "epoch": 0.9378472317250062, + "grad_norm": 1.1941357473686214, + "learning_rate": 4.738894256985121e-07, + "loss": 0.8621, + "num_input_tokens_seen": 1547436128, + "step": 8567 + }, + { + "epoch": 0.9379567037959441, + "grad_norm": 1.0173650068107956, + "learning_rate": 4.722246098394417e-07, + "loss": 0.7605, + "num_input_tokens_seen": 1547629888, + "step": 8568 + }, + { + "epoch": 0.938066175866882, + "grad_norm": 1.1454743545472128, + "learning_rate": 4.705626955693071e-07, + "loss": 0.8189, + "num_input_tokens_seen": 1547815584, + "step": 8569 + }, + { + "epoch": 0.9381756479378198, + "grad_norm": 1.1284332868591946, + "learning_rate": 4.689036830847177e-07, + "loss": 0.6934, + "num_input_tokens_seen": 1547976640, + "step": 8570 + }, + { + "epoch": 0.9382851200087577, + "grad_norm": 1.1278804883919553, + "learning_rate": 4.67247572581922e-07, + "loss": 0.8402, + "num_input_tokens_seen": 1548135680, + "step": 8571 + }, + { + "epoch": 0.9383945920796957, + "grad_norm": 1.146368745499972, + "learning_rate": 4.655943642568411e-07, + "loss": 0.7004, + "num_input_tokens_seen": 1548304352, + "step": 8572 + }, + { + "epoch": 0.9385040641506336, + "grad_norm": 1.1031193114396085, + "learning_rate": 4.639440583050464e-07, + "loss": 1.0581, + "num_input_tokens_seen": 1548506624, + "step": 8573 + }, + { + "epoch": 0.9386135362215715, + "grad_norm": 1.1804054046337789, + "learning_rate": 4.622966549217622e-07, + "loss": 1.0687, + "num_input_tokens_seen": 1548682688, + "step": 8574 + }, + { + "epoch": 0.9387230082925093, + "grad_norm": 1.0187060645055033, + "learning_rate": 4.606521543018799e-07, + "loss": 0.7917, + "num_input_tokens_seen": 1548874208, + "step": 8575 + }, + { + "epoch": 0.9388324803634472, + "grad_norm": 1.0519616966309129, + "learning_rate": 4.5901055663993274e-07, + "loss": 0.8699, + "num_input_tokens_seen": 1549049600, + "step": 8576 + }, + { + "epoch": 0.9389419524343852, + "grad_norm": 1.274889720560592, + "learning_rate": 4.573718621301265e-07, + "loss": 0.8874, + "num_input_tokens_seen": 1549249632, + "step": 8577 + }, + { + "epoch": 0.9390514245053231, + "grad_norm": 1.3564368756487082, + "learning_rate": 4.557360709663061e-07, + "loss": 0.9138, + "num_input_tokens_seen": 1549429280, + "step": 8578 + }, + { + "epoch": 0.939160896576261, + "grad_norm": 1.154032307365958, + "learning_rate": 4.5410318334199175e-07, + "loss": 1.1141, + "num_input_tokens_seen": 1549606912, + "step": 8579 + }, + { + "epoch": 0.9392703686471989, + "grad_norm": 0.9339972545799895, + "learning_rate": 4.524731994503456e-07, + "loss": 0.7208, + "num_input_tokens_seen": 1549812544, + "step": 8580 + }, + { + "epoch": 0.9393798407181367, + "grad_norm": 1.0363542641895824, + "learning_rate": 4.50846119484194e-07, + "loss": 0.669, + "num_input_tokens_seen": 1550011456, + "step": 8581 + }, + { + "epoch": 0.9394893127890747, + "grad_norm": 1.1811426547892192, + "learning_rate": 4.4922194363601343e-07, + "loss": 0.7336, + "num_input_tokens_seen": 1550208128, + "step": 8582 + }, + { + "epoch": 0.9395987848600126, + "grad_norm": 1.0307193893783617, + "learning_rate": 4.476006720979475e-07, + "loss": 0.7707, + "num_input_tokens_seen": 1550384864, + "step": 8583 + }, + { + "epoch": 0.9397082569309505, + "grad_norm": 1.0348734343353676, + "learning_rate": 4.459823050617845e-07, + "loss": 0.8883, + "num_input_tokens_seen": 1550559584, + "step": 8584 + }, + { + "epoch": 0.9398177290018884, + "grad_norm": 1.2247325099140849, + "learning_rate": 4.44366842718974e-07, + "loss": 0.9643, + "num_input_tokens_seen": 1550698912, + "step": 8585 + }, + { + "epoch": 0.9399272010728263, + "grad_norm": 1.0076029360037473, + "learning_rate": 4.4275428526062425e-07, + "loss": 0.6882, + "num_input_tokens_seen": 1550888192, + "step": 8586 + }, + { + "epoch": 0.9400366731437642, + "grad_norm": 1.0562013287588443, + "learning_rate": 4.411446328774993e-07, + "loss": 0.926, + "num_input_tokens_seen": 1551073888, + "step": 8587 + }, + { + "epoch": 0.9401461452147021, + "grad_norm": 1.1119904111819252, + "learning_rate": 4.3953788576001353e-07, + "loss": 0.7265, + "num_input_tokens_seen": 1551250624, + "step": 8588 + }, + { + "epoch": 0.94025561728564, + "grad_norm": 1.1243329189668088, + "learning_rate": 4.3793404409824546e-07, + "loss": 0.7611, + "num_input_tokens_seen": 1551449088, + "step": 8589 + }, + { + "epoch": 0.9403650893565779, + "grad_norm": 1.1752236396934723, + "learning_rate": 4.3633310808192385e-07, + "loss": 0.8881, + "num_input_tokens_seen": 1551615072, + "step": 8590 + }, + { + "epoch": 0.9404745614275158, + "grad_norm": 1.0030531171195494, + "learning_rate": 4.347350779004389e-07, + "loss": 0.757, + "num_input_tokens_seen": 1551807936, + "step": 8591 + }, + { + "epoch": 0.9405840334984537, + "grad_norm": 1.1099569916340375, + "learning_rate": 4.331399537428338e-07, + "loss": 0.6877, + "num_input_tokens_seen": 1551994080, + "step": 8592 + }, + { + "epoch": 0.9406935055693916, + "grad_norm": 1.136986664860748, + "learning_rate": 4.3154773579780483e-07, + "loss": 0.9182, + "num_input_tokens_seen": 1552151328, + "step": 8593 + }, + { + "epoch": 0.9408029776403295, + "grad_norm": 0.9944622366454297, + "learning_rate": 4.2995842425371524e-07, + "loss": 0.8199, + "num_input_tokens_seen": 1552328512, + "step": 8594 + }, + { + "epoch": 0.9409124497112674, + "grad_norm": 1.1591320204796278, + "learning_rate": 4.283720192985757e-07, + "loss": 0.8618, + "num_input_tokens_seen": 1552496064, + "step": 8595 + }, + { + "epoch": 0.9410219217822053, + "grad_norm": 1.075058376590212, + "learning_rate": 4.267885211200501e-07, + "loss": 1.074, + "num_input_tokens_seen": 1552695424, + "step": 8596 + }, + { + "epoch": 0.9411313938531433, + "grad_norm": 2.701939670230885, + "learning_rate": 4.25207929905469e-07, + "loss": 0.919, + "num_input_tokens_seen": 1552878880, + "step": 8597 + }, + { + "epoch": 0.9412408659240811, + "grad_norm": 1.1410154954491647, + "learning_rate": 4.236302458418051e-07, + "loss": 0.6893, + "num_input_tokens_seen": 1553065248, + "step": 8598 + }, + { + "epoch": 0.941350337995019, + "grad_norm": 1.1087027648704133, + "learning_rate": 4.2205546911570913e-07, + "loss": 0.8205, + "num_input_tokens_seen": 1553225408, + "step": 8599 + }, + { + "epoch": 0.9414598100659569, + "grad_norm": 1.0472903779806497, + "learning_rate": 4.2048359991345986e-07, + "loss": 0.8021, + "num_input_tokens_seen": 1553411552, + "step": 8600 + }, + { + "epoch": 0.9415692821368948, + "grad_norm": 1.1908275261789205, + "learning_rate": 4.1891463842101685e-07, + "loss": 0.8877, + "num_input_tokens_seen": 1553583584, + "step": 8601 + }, + { + "epoch": 0.9416787542078328, + "grad_norm": 1.0281192767786895, + "learning_rate": 4.173485848239761e-07, + "loss": 0.7356, + "num_input_tokens_seen": 1553795712, + "step": 8602 + }, + { + "epoch": 0.9417882262787707, + "grad_norm": 1.1847177336710764, + "learning_rate": 4.157854393076088e-07, + "loss": 0.8512, + "num_input_tokens_seen": 1553959456, + "step": 8603 + }, + { + "epoch": 0.9418976983497085, + "grad_norm": 0.9630088191970455, + "learning_rate": 4.1422520205682547e-07, + "loss": 0.5796, + "num_input_tokens_seen": 1554131488, + "step": 8604 + }, + { + "epoch": 0.9420071704206464, + "grad_norm": 1.0406813093650444, + "learning_rate": 4.126678732562006e-07, + "loss": 0.7157, + "num_input_tokens_seen": 1554267904, + "step": 8605 + }, + { + "epoch": 0.9421166424915843, + "grad_norm": 1.1036395248129574, + "learning_rate": 4.1111345308996185e-07, + "loss": 0.794, + "num_input_tokens_seen": 1554447328, + "step": 8606 + }, + { + "epoch": 0.9422261145625223, + "grad_norm": 1.121841978474401, + "learning_rate": 4.095619417419955e-07, + "loss": 0.7521, + "num_input_tokens_seen": 1554598528, + "step": 8607 + }, + { + "epoch": 0.9423355866334602, + "grad_norm": 1.2232250803848037, + "learning_rate": 4.080133393958463e-07, + "loss": 1.0468, + "num_input_tokens_seen": 1554799232, + "step": 8608 + }, + { + "epoch": 0.942445058704398, + "grad_norm": 1.1419798760152544, + "learning_rate": 4.0646764623470113e-07, + "loss": 1.1265, + "num_input_tokens_seen": 1554991872, + "step": 8609 + }, + { + "epoch": 0.9425545307753359, + "grad_norm": 1.290681550493475, + "learning_rate": 4.049248624414248e-07, + "loss": 0.9153, + "num_input_tokens_seen": 1555152032, + "step": 8610 + }, + { + "epoch": 0.9426640028462738, + "grad_norm": 1.1419129611816357, + "learning_rate": 4.0338498819851577e-07, + "loss": 1.1393, + "num_input_tokens_seen": 1555347360, + "step": 8611 + }, + { + "epoch": 0.9427734749172118, + "grad_norm": 1.102890026365019, + "learning_rate": 4.018480236881422e-07, + "loss": 0.9882, + "num_input_tokens_seen": 1555545152, + "step": 8612 + }, + { + "epoch": 0.9428829469881497, + "grad_norm": 1.0548270804266642, + "learning_rate": 4.003139690921254e-07, + "loss": 0.7905, + "num_input_tokens_seen": 1555699040, + "step": 8613 + }, + { + "epoch": 0.9429924190590876, + "grad_norm": 1.2223519138442442, + "learning_rate": 3.987828245919367e-07, + "loss": 1.1261, + "num_input_tokens_seen": 1555886528, + "step": 8614 + }, + { + "epoch": 0.9431018911300254, + "grad_norm": 1.1356032623578731, + "learning_rate": 3.972545903687119e-07, + "loss": 0.9953, + "num_input_tokens_seen": 1556065952, + "step": 8615 + }, + { + "epoch": 0.9432113632009633, + "grad_norm": 1.0520832829742923, + "learning_rate": 3.9572926660323695e-07, + "loss": 0.6505, + "num_input_tokens_seen": 1556208416, + "step": 8616 + }, + { + "epoch": 0.9433208352719012, + "grad_norm": 1.1573662910928924, + "learning_rate": 3.9420685347595634e-07, + "loss": 0.8938, + "num_input_tokens_seen": 1556410464, + "step": 8617 + }, + { + "epoch": 0.9434303073428392, + "grad_norm": 1.0806546043810814, + "learning_rate": 3.926873511669621e-07, + "loss": 0.8711, + "num_input_tokens_seen": 1556560992, + "step": 8618 + }, + { + "epoch": 0.9435397794137771, + "grad_norm": 1.1784353115695732, + "learning_rate": 3.91170759856016e-07, + "loss": 0.7365, + "num_input_tokens_seen": 1556734368, + "step": 8619 + }, + { + "epoch": 0.943649251484715, + "grad_norm": 1.175013446730919, + "learning_rate": 3.896570797225246e-07, + "loss": 0.8661, + "num_input_tokens_seen": 1556896320, + "step": 8620 + }, + { + "epoch": 0.9437587235556528, + "grad_norm": 1.1103775974197352, + "learning_rate": 3.88146310945553e-07, + "loss": 0.8052, + "num_input_tokens_seen": 1557033856, + "step": 8621 + }, + { + "epoch": 0.9438681956265907, + "grad_norm": 1.0638617596488693, + "learning_rate": 3.86638453703822e-07, + "loss": 0.8734, + "num_input_tokens_seen": 1557225376, + "step": 8622 + }, + { + "epoch": 0.9439776676975287, + "grad_norm": 1.1592022090996121, + "learning_rate": 3.8513350817571124e-07, + "loss": 1.0547, + "num_input_tokens_seen": 1557420928, + "step": 8623 + }, + { + "epoch": 0.9440871397684666, + "grad_norm": 1.1875858777766017, + "learning_rate": 3.836314745392505e-07, + "loss": 0.8674, + "num_input_tokens_seen": 1557622528, + "step": 8624 + }, + { + "epoch": 0.9441966118394045, + "grad_norm": 0.9690981208678344, + "learning_rate": 3.8213235297212823e-07, + "loss": 0.9329, + "num_input_tokens_seen": 1557815168, + "step": 8625 + }, + { + "epoch": 0.9443060839103423, + "grad_norm": 1.0216732980991066, + "learning_rate": 3.806361436516831e-07, + "loss": 0.7863, + "num_input_tokens_seen": 1558003104, + "step": 8626 + }, + { + "epoch": 0.9444155559812802, + "grad_norm": 1.244267479673933, + "learning_rate": 3.7914284675492075e-07, + "loss": 0.768, + "num_input_tokens_seen": 1558161696, + "step": 8627 + }, + { + "epoch": 0.9445250280522182, + "grad_norm": 1.015712968977967, + "learning_rate": 3.7765246245849426e-07, + "loss": 0.8753, + "num_input_tokens_seen": 1558373600, + "step": 8628 + }, + { + "epoch": 0.9446345001231561, + "grad_norm": 0.9491483246223065, + "learning_rate": 3.761649909387099e-07, + "loss": 0.8266, + "num_input_tokens_seen": 1558547872, + "step": 8629 + }, + { + "epoch": 0.944743972194094, + "grad_norm": 1.1233634170914755, + "learning_rate": 3.746804323715353e-07, + "loss": 0.837, + "num_input_tokens_seen": 1558713856, + "step": 8630 + }, + { + "epoch": 0.9448534442650319, + "grad_norm": 1.182662849584252, + "learning_rate": 3.731987869325881e-07, + "loss": 1.302, + "num_input_tokens_seen": 1558902240, + "step": 8631 + }, + { + "epoch": 0.9449629163359697, + "grad_norm": 1.183898030565056, + "learning_rate": 3.7172005479714777e-07, + "loss": 0.6936, + "num_input_tokens_seen": 1559067552, + "step": 8632 + }, + { + "epoch": 0.9450723884069077, + "grad_norm": 1.022185571820905, + "learning_rate": 3.7024423614014094e-07, + "loss": 0.9548, + "num_input_tokens_seen": 1559262880, + "step": 8633 + }, + { + "epoch": 0.9451818604778456, + "grad_norm": 1.2606676760756392, + "learning_rate": 3.6877133113616123e-07, + "loss": 1.3285, + "num_input_tokens_seen": 1559447456, + "step": 8634 + }, + { + "epoch": 0.9452913325487835, + "grad_norm": 1.3355559311654834, + "learning_rate": 3.673013399594444e-07, + "loss": 0.8459, + "num_input_tokens_seen": 1559633376, + "step": 8635 + }, + { + "epoch": 0.9454008046197214, + "grad_norm": 1.352154464300495, + "learning_rate": 3.658342627838873e-07, + "loss": 0.929, + "num_input_tokens_seen": 1559782336, + "step": 8636 + }, + { + "epoch": 0.9455102766906593, + "grad_norm": 1.178255690148315, + "learning_rate": 3.643700997830457e-07, + "loss": 0.9585, + "num_input_tokens_seen": 1559946080, + "step": 8637 + }, + { + "epoch": 0.9456197487615972, + "grad_norm": 1.2499835966941304, + "learning_rate": 3.6290885113012816e-07, + "loss": 1.0131, + "num_input_tokens_seen": 1560093920, + "step": 8638 + }, + { + "epoch": 0.9457292208325351, + "grad_norm": 1.1457041696517036, + "learning_rate": 3.614505169979909e-07, + "loss": 0.8744, + "num_input_tokens_seen": 1560278272, + "step": 8639 + }, + { + "epoch": 0.945838692903473, + "grad_norm": 1.1624925510619042, + "learning_rate": 3.5999509755915985e-07, + "loss": 0.8874, + "num_input_tokens_seen": 1560461728, + "step": 8640 + }, + { + "epoch": 0.9459481649744109, + "grad_norm": 1.0503589107539129, + "learning_rate": 3.585425929858055e-07, + "loss": 0.7381, + "num_input_tokens_seen": 1560644960, + "step": 8641 + }, + { + "epoch": 0.9460576370453488, + "grad_norm": 1.0461861350963715, + "learning_rate": 3.570930034497516e-07, + "loss": 0.732, + "num_input_tokens_seen": 1560846784, + "step": 8642 + }, + { + "epoch": 0.9461671091162867, + "grad_norm": 1.20239156190471, + "learning_rate": 3.556463291224915e-07, + "loss": 0.7702, + "num_input_tokens_seen": 1561011872, + "step": 8643 + }, + { + "epoch": 0.9462765811872246, + "grad_norm": 1.0963509832171374, + "learning_rate": 3.54202570175155e-07, + "loss": 1.0645, + "num_input_tokens_seen": 1561211904, + "step": 8644 + }, + { + "epoch": 0.9463860532581625, + "grad_norm": 1.0587187408192344, + "learning_rate": 3.527617267785416e-07, + "loss": 0.6722, + "num_input_tokens_seen": 1561375200, + "step": 8645 + }, + { + "epoch": 0.9464955253291004, + "grad_norm": 1.294886984146507, + "learning_rate": 3.513237991030982e-07, + "loss": 0.9221, + "num_input_tokens_seen": 1561535136, + "step": 8646 + }, + { + "epoch": 0.9466049974000383, + "grad_norm": 1.235080710410839, + "learning_rate": 3.498887873189277e-07, + "loss": 1.0339, + "num_input_tokens_seen": 1561729792, + "step": 8647 + }, + { + "epoch": 0.9467144694709763, + "grad_norm": 1.1376390958985123, + "learning_rate": 3.484566915957943e-07, + "loss": 0.7969, + "num_input_tokens_seen": 1561928256, + "step": 8648 + }, + { + "epoch": 0.9468239415419141, + "grad_norm": 1.2156963239207084, + "learning_rate": 3.470275121031041e-07, + "loss": 0.9274, + "num_input_tokens_seen": 1562086848, + "step": 8649 + }, + { + "epoch": 0.946933413612852, + "grad_norm": 1.0695363592857554, + "learning_rate": 3.4560124900993305e-07, + "loss": 0.988, + "num_input_tokens_seen": 1562287552, + "step": 8650 + }, + { + "epoch": 0.9470428856837899, + "grad_norm": 1.0719818128743994, + "learning_rate": 3.441779024850017e-07, + "loss": 0.9888, + "num_input_tokens_seen": 1562464960, + "step": 8651 + }, + { + "epoch": 0.9471523577547278, + "grad_norm": 1.1891786355688518, + "learning_rate": 3.4275747269669203e-07, + "loss": 0.7119, + "num_input_tokens_seen": 1562619520, + "step": 8652 + }, + { + "epoch": 0.9472618298256658, + "grad_norm": 1.042754990749813, + "learning_rate": 3.4133995981303624e-07, + "loss": 1.0552, + "num_input_tokens_seen": 1562840832, + "step": 8653 + }, + { + "epoch": 0.9473713018966037, + "grad_norm": 1.3663991780695128, + "learning_rate": 3.3992536400172246e-07, + "loss": 1.2645, + "num_input_tokens_seen": 1563020256, + "step": 8654 + }, + { + "epoch": 0.9474807739675415, + "grad_norm": 1.0570518271733442, + "learning_rate": 3.3851368543009745e-07, + "loss": 0.7241, + "num_input_tokens_seen": 1563197888, + "step": 8655 + }, + { + "epoch": 0.9475902460384794, + "grad_norm": 1.200713080889189, + "learning_rate": 3.3710492426515804e-07, + "loss": 0.8746, + "num_input_tokens_seen": 1563376192, + "step": 8656 + }, + { + "epoch": 0.9476997181094173, + "grad_norm": 1.0237602132769863, + "learning_rate": 3.3569908067355993e-07, + "loss": 0.7682, + "num_input_tokens_seen": 1563557184, + "step": 8657 + }, + { + "epoch": 0.9478091901803553, + "grad_norm": 1.2249882794812093, + "learning_rate": 3.3429615482160893e-07, + "loss": 0.7507, + "num_input_tokens_seen": 1563735488, + "step": 8658 + }, + { + "epoch": 0.9479186622512932, + "grad_norm": 1.2093791656028117, + "learning_rate": 3.328961468752695e-07, + "loss": 0.7345, + "num_input_tokens_seen": 1563909536, + "step": 8659 + }, + { + "epoch": 0.948028134322231, + "grad_norm": 1.147202950552547, + "learning_rate": 3.3149905700016193e-07, + "loss": 0.8257, + "num_input_tokens_seen": 1564088288, + "step": 8660 + }, + { + "epoch": 0.9481376063931689, + "grad_norm": 1.1140769925452205, + "learning_rate": 3.301048853615568e-07, + "loss": 1.1699, + "num_input_tokens_seen": 1564279808, + "step": 8661 + }, + { + "epoch": 0.9482470784641068, + "grad_norm": 0.9447633393459609, + "learning_rate": 3.2871363212438613e-07, + "loss": 0.6865, + "num_input_tokens_seen": 1564473120, + "step": 8662 + }, + { + "epoch": 0.9483565505350448, + "grad_norm": 1.123516588252425, + "learning_rate": 3.2732529745322647e-07, + "loss": 1.1145, + "num_input_tokens_seen": 1564659488, + "step": 8663 + }, + { + "epoch": 0.9484660226059827, + "grad_norm": 1.1915370181500649, + "learning_rate": 3.2593988151231603e-07, + "loss": 1.0402, + "num_input_tokens_seen": 1564862208, + "step": 8664 + }, + { + "epoch": 0.9485754946769206, + "grad_norm": 1.1064405718951842, + "learning_rate": 3.245573844655514e-07, + "loss": 0.8883, + "num_input_tokens_seen": 1565052384, + "step": 8665 + }, + { + "epoch": 0.9486849667478584, + "grad_norm": 1.1014627722649586, + "learning_rate": 3.231778064764768e-07, + "loss": 0.752, + "num_input_tokens_seen": 1565214336, + "step": 8666 + }, + { + "epoch": 0.9487944388187963, + "grad_norm": 1.2041064136218682, + "learning_rate": 3.2180114770829495e-07, + "loss": 0.8407, + "num_input_tokens_seen": 1565400928, + "step": 8667 + }, + { + "epoch": 0.9489039108897342, + "grad_norm": 1.1379500599145633, + "learning_rate": 3.204274083238562e-07, + "loss": 0.8355, + "num_input_tokens_seen": 1565542496, + "step": 8668 + }, + { + "epoch": 0.9490133829606722, + "grad_norm": 1.1066364820128778, + "learning_rate": 3.1905658848567774e-07, + "loss": 0.932, + "num_input_tokens_seen": 1565724384, + "step": 8669 + }, + { + "epoch": 0.9491228550316101, + "grad_norm": 1.0109207364185606, + "learning_rate": 3.1768868835592434e-07, + "loss": 0.983, + "num_input_tokens_seen": 1565927776, + "step": 8670 + }, + { + "epoch": 0.949232327102548, + "grad_norm": 1.0356577108462715, + "learning_rate": 3.1632370809641376e-07, + "loss": 0.7766, + "num_input_tokens_seen": 1566093984, + "step": 8671 + }, + { + "epoch": 0.9493417991734858, + "grad_norm": 1.1626970103690568, + "learning_rate": 3.149616478686196e-07, + "loss": 0.8213, + "num_input_tokens_seen": 1566259744, + "step": 8672 + }, + { + "epoch": 0.9494512712444237, + "grad_norm": 1.193789240806284, + "learning_rate": 3.1360250783367406e-07, + "loss": 0.9684, + "num_input_tokens_seen": 1566412512, + "step": 8673 + }, + { + "epoch": 0.9495607433153617, + "grad_norm": 1.210709827308789, + "learning_rate": 3.122462881523625e-07, + "loss": 0.7546, + "num_input_tokens_seen": 1566602240, + "step": 8674 + }, + { + "epoch": 0.9496702153862996, + "grad_norm": 1.1984845128592512, + "learning_rate": 3.1089298898511476e-07, + "loss": 0.9811, + "num_input_tokens_seen": 1566831616, + "step": 8675 + }, + { + "epoch": 0.9497796874572375, + "grad_norm": 1.2424409235950244, + "learning_rate": 3.095426104920335e-07, + "loss": 0.9962, + "num_input_tokens_seen": 1566999168, + "step": 8676 + }, + { + "epoch": 0.9498891595281753, + "grad_norm": 1.1748998639880635, + "learning_rate": 3.081951528328575e-07, + "loss": 0.6982, + "num_input_tokens_seen": 1567184192, + "step": 8677 + }, + { + "epoch": 0.9499986315991132, + "grad_norm": 1.1280268639652744, + "learning_rate": 3.0685061616699263e-07, + "loss": 0.8442, + "num_input_tokens_seen": 1567353088, + "step": 8678 + }, + { + "epoch": 0.9501081036700512, + "grad_norm": 1.227537654369096, + "learning_rate": 3.0550900065349774e-07, + "loss": 0.8111, + "num_input_tokens_seen": 1567517056, + "step": 8679 + }, + { + "epoch": 0.9502175757409891, + "grad_norm": 1.2350992902618882, + "learning_rate": 3.0417030645107924e-07, + "loss": 0.8162, + "num_input_tokens_seen": 1567705664, + "step": 8680 + }, + { + "epoch": 0.950327047811927, + "grad_norm": 1.0620166576752972, + "learning_rate": 3.028345337181021e-07, + "loss": 0.8566, + "num_input_tokens_seen": 1567878816, + "step": 8681 + }, + { + "epoch": 0.9504365198828649, + "grad_norm": 1.1423361497578457, + "learning_rate": 3.015016826125844e-07, + "loss": 0.9804, + "num_input_tokens_seen": 1568039872, + "step": 8682 + }, + { + "epoch": 0.9505459919538027, + "grad_norm": 1.1287278823121896, + "learning_rate": 3.001717532922055e-07, + "loss": 0.7235, + "num_input_tokens_seen": 1568210560, + "step": 8683 + }, + { + "epoch": 0.9506554640247407, + "grad_norm": 1.154458436454847, + "learning_rate": 2.988447459142868e-07, + "loss": 1.2196, + "num_input_tokens_seen": 1568414400, + "step": 8684 + }, + { + "epoch": 0.9507649360956786, + "grad_norm": 1.0584657598731824, + "learning_rate": 2.975206606358194e-07, + "loss": 0.9516, + "num_input_tokens_seen": 1568601888, + "step": 8685 + }, + { + "epoch": 0.9508744081666165, + "grad_norm": 1.0263941912814498, + "learning_rate": 2.961994976134308e-07, + "loss": 0.6294, + "num_input_tokens_seen": 1568796768, + "step": 8686 + }, + { + "epoch": 0.9509838802375544, + "grad_norm": 1.1080031711071645, + "learning_rate": 2.948812570034154e-07, + "loss": 0.8852, + "num_input_tokens_seen": 1568982688, + "step": 8687 + }, + { + "epoch": 0.9510933523084923, + "grad_norm": 1.2259891409089678, + "learning_rate": 2.9356593896172066e-07, + "loss": 0.8671, + "num_input_tokens_seen": 1569148896, + "step": 8688 + }, + { + "epoch": 0.9512028243794302, + "grad_norm": 1.1675503086466232, + "learning_rate": 2.9225354364394444e-07, + "loss": 0.6928, + "num_input_tokens_seen": 1569332352, + "step": 8689 + }, + { + "epoch": 0.9513122964503681, + "grad_norm": 1.0656777153512293, + "learning_rate": 2.9094407120534295e-07, + "loss": 0.7127, + "num_input_tokens_seen": 1569519168, + "step": 8690 + }, + { + "epoch": 0.951421768521306, + "grad_norm": 1.46678834143136, + "learning_rate": 2.896375218008174e-07, + "loss": 1.0312, + "num_input_tokens_seen": 1569713600, + "step": 8691 + }, + { + "epoch": 0.9515312405922439, + "grad_norm": 1.0839805809160796, + "learning_rate": 2.883338955849385e-07, + "loss": 0.9164, + "num_input_tokens_seen": 1569909152, + "step": 8692 + }, + { + "epoch": 0.9516407126631818, + "grad_norm": 0.9268148262004509, + "learning_rate": 2.870331927119163e-07, + "loss": 0.7892, + "num_input_tokens_seen": 1570100896, + "step": 8693 + }, + { + "epoch": 0.9517501847341197, + "grad_norm": 1.1443868812931786, + "learning_rate": 2.857354133356277e-07, + "loss": 0.9148, + "num_input_tokens_seen": 1570284352, + "step": 8694 + }, + { + "epoch": 0.9518596568050576, + "grad_norm": 1.0298202892114692, + "learning_rate": 2.8444055760959154e-07, + "loss": 0.8886, + "num_input_tokens_seen": 1570481920, + "step": 8695 + }, + { + "epoch": 0.9519691288759955, + "grad_norm": 1.2607090455891519, + "learning_rate": 2.8314862568699087e-07, + "loss": 1.0138, + "num_input_tokens_seen": 1570658208, + "step": 8696 + }, + { + "epoch": 0.9520786009469334, + "grad_norm": 1.1424856295193135, + "learning_rate": 2.8185961772065616e-07, + "loss": 1.0857, + "num_input_tokens_seen": 1570848832, + "step": 8697 + }, + { + "epoch": 0.9521880730178713, + "grad_norm": 1.0624339700264664, + "learning_rate": 2.8057353386307663e-07, + "loss": 0.8204, + "num_input_tokens_seen": 1571013696, + "step": 8698 + }, + { + "epoch": 0.9522975450888093, + "grad_norm": 1.1527744328911191, + "learning_rate": 2.792903742663916e-07, + "loss": 0.8364, + "num_input_tokens_seen": 1571216640, + "step": 8699 + }, + { + "epoch": 0.9524070171597471, + "grad_norm": 1.200459835243316, + "learning_rate": 2.7801013908239636e-07, + "loss": 1.0535, + "num_input_tokens_seen": 1571405696, + "step": 8700 + }, + { + "epoch": 0.952516489230685, + "grad_norm": 1.1101194758166633, + "learning_rate": 2.76732828462542e-07, + "loss": 0.8625, + "num_input_tokens_seen": 1571594304, + "step": 8701 + }, + { + "epoch": 0.9526259613016229, + "grad_norm": 1.201260496762527, + "learning_rate": 2.7545844255793263e-07, + "loss": 0.7619, + "num_input_tokens_seen": 1571790976, + "step": 8702 + }, + { + "epoch": 0.9527354333725608, + "grad_norm": 1.0932623184699952, + "learning_rate": 2.741869815193226e-07, + "loss": 0.7626, + "num_input_tokens_seen": 1571943744, + "step": 8703 + }, + { + "epoch": 0.9528449054434988, + "grad_norm": 1.0926559425985263, + "learning_rate": 2.729184454971251e-07, + "loss": 0.7584, + "num_input_tokens_seen": 1572130112, + "step": 8704 + }, + { + "epoch": 0.9529543775144367, + "grad_norm": 1.1701224892231439, + "learning_rate": 2.71652834641406e-07, + "loss": 0.7514, + "num_input_tokens_seen": 1572308864, + "step": 8705 + }, + { + "epoch": 0.9530638495853745, + "grad_norm": 1.0421990115146198, + "learning_rate": 2.7039014910188455e-07, + "loss": 0.6886, + "num_input_tokens_seen": 1572494336, + "step": 8706 + }, + { + "epoch": 0.9531733216563124, + "grad_norm": 1.3112905924780547, + "learning_rate": 2.691303890279301e-07, + "loss": 0.999, + "num_input_tokens_seen": 1572638592, + "step": 8707 + }, + { + "epoch": 0.9532827937272503, + "grad_norm": 1.0103490092379661, + "learning_rate": 2.678735545685762e-07, + "loss": 1.2156, + "num_input_tokens_seen": 1572849376, + "step": 8708 + }, + { + "epoch": 0.9533922657981883, + "grad_norm": 1.219671878856383, + "learning_rate": 2.666196458725012e-07, + "loss": 0.9521, + "num_input_tokens_seen": 1573016256, + "step": 8709 + }, + { + "epoch": 0.9535017378691262, + "grad_norm": 1.015568775674799, + "learning_rate": 2.653686630880392e-07, + "loss": 0.6871, + "num_input_tokens_seen": 1573197024, + "step": 8710 + }, + { + "epoch": 0.953611209940064, + "grad_norm": 1.1123731155045187, + "learning_rate": 2.641206063631774e-07, + "loss": 0.7378, + "num_input_tokens_seen": 1573388544, + "step": 8711 + }, + { + "epoch": 0.9537206820110019, + "grad_norm": 1.1122629963309947, + "learning_rate": 2.628754758455643e-07, + "loss": 0.8983, + "num_input_tokens_seen": 1573538848, + "step": 8712 + }, + { + "epoch": 0.9538301540819398, + "grad_norm": 1.2051783292833245, + "learning_rate": 2.616332716824932e-07, + "loss": 0.9111, + "num_input_tokens_seen": 1573701920, + "step": 8713 + }, + { + "epoch": 0.9539396261528778, + "grad_norm": 1.0639050113683655, + "learning_rate": 2.6039399402091324e-07, + "loss": 0.9241, + "num_input_tokens_seen": 1573896128, + "step": 8714 + }, + { + "epoch": 0.9540490982238157, + "grad_norm": 1.1585079004399472, + "learning_rate": 2.591576430074266e-07, + "loss": 0.8482, + "num_input_tokens_seen": 1574088768, + "step": 8715 + }, + { + "epoch": 0.9541585702947536, + "grad_norm": 1.1055494622734072, + "learning_rate": 2.5792421878829965e-07, + "loss": 0.7391, + "num_input_tokens_seen": 1574259008, + "step": 8716 + }, + { + "epoch": 0.9542680423656914, + "grad_norm": 1.1458865008879262, + "learning_rate": 2.5669372150943505e-07, + "loss": 0.7435, + "num_input_tokens_seen": 1574444256, + "step": 8717 + }, + { + "epoch": 0.9543775144366293, + "grad_norm": 1.1569646740188269, + "learning_rate": 2.554661513164053e-07, + "loss": 1.035, + "num_input_tokens_seen": 1574628160, + "step": 8718 + }, + { + "epoch": 0.9544869865075672, + "grad_norm": 0.9774678115715826, + "learning_rate": 2.5424150835442193e-07, + "loss": 0.7947, + "num_input_tokens_seen": 1574789216, + "step": 8719 + }, + { + "epoch": 0.9545964585785052, + "grad_norm": 1.1597010172100102, + "learning_rate": 2.530197927683664e-07, + "loss": 0.8687, + "num_input_tokens_seen": 1574937728, + "step": 8720 + }, + { + "epoch": 0.9547059306494431, + "grad_norm": 1.135899986266821, + "learning_rate": 2.5180100470275916e-07, + "loss": 1.0996, + "num_input_tokens_seen": 1575109312, + "step": 8721 + }, + { + "epoch": 0.954815402720381, + "grad_norm": 1.1623166196015462, + "learning_rate": 2.5058514430178205e-07, + "loss": 1.0317, + "num_input_tokens_seen": 1575283584, + "step": 8722 + }, + { + "epoch": 0.9549248747913188, + "grad_norm": 1.0038484786810855, + "learning_rate": 2.4937221170927007e-07, + "loss": 0.7477, + "num_input_tokens_seen": 1575429184, + "step": 8723 + }, + { + "epoch": 0.9550343468622567, + "grad_norm": 1.133146986414037, + "learning_rate": 2.481622070687112e-07, + "loss": 1.013, + "num_input_tokens_seen": 1575613760, + "step": 8724 + }, + { + "epoch": 0.9551438189331947, + "grad_norm": 1.22498253790418, + "learning_rate": 2.469551305232465e-07, + "loss": 1.0867, + "num_input_tokens_seen": 1575788256, + "step": 8725 + }, + { + "epoch": 0.9552532910041326, + "grad_norm": 1.1108686036798967, + "learning_rate": 2.457509822156673e-07, + "loss": 0.874, + "num_input_tokens_seen": 1575955360, + "step": 8726 + }, + { + "epoch": 0.9553627630750705, + "grad_norm": 1.195701785057237, + "learning_rate": 2.445497622884263e-07, + "loss": 0.8812, + "num_input_tokens_seen": 1576140608, + "step": 8727 + }, + { + "epoch": 0.9554722351460083, + "grad_norm": 1.15294442765967, + "learning_rate": 2.4335147088362377e-07, + "loss": 0.8874, + "num_input_tokens_seen": 1576344672, + "step": 8728 + }, + { + "epoch": 0.9555817072169462, + "grad_norm": 1.1531536295161613, + "learning_rate": 2.421561081430157e-07, + "loss": 0.8185, + "num_input_tokens_seen": 1576519392, + "step": 8729 + }, + { + "epoch": 0.9556911792878842, + "grad_norm": 1.3631903937813954, + "learning_rate": 2.409636742080112e-07, + "loss": 1.0725, + "num_input_tokens_seen": 1576664544, + "step": 8730 + }, + { + "epoch": 0.9558006513588221, + "grad_norm": 1.0364734833854503, + "learning_rate": 2.3977416921967256e-07, + "loss": 0.9885, + "num_input_tokens_seen": 1576849120, + "step": 8731 + }, + { + "epoch": 0.95591012342976, + "grad_norm": 1.1335904837100546, + "learning_rate": 2.385875933187176e-07, + "loss": 0.9752, + "num_input_tokens_seen": 1577000096, + "step": 8732 + }, + { + "epoch": 0.9560195955006979, + "grad_norm": 1.2512790811457546, + "learning_rate": 2.374039466455119e-07, + "loss": 0.8625, + "num_input_tokens_seen": 1577177504, + "step": 8733 + }, + { + "epoch": 0.9561290675716357, + "grad_norm": 1.239097494663036, + "learning_rate": 2.3622322934008235e-07, + "loss": 0.8287, + "num_input_tokens_seen": 1577353344, + "step": 8734 + }, + { + "epoch": 0.9562385396425737, + "grad_norm": 1.2244370139960614, + "learning_rate": 2.350454415421033e-07, + "loss": 1.0082, + "num_input_tokens_seen": 1577537472, + "step": 8735 + }, + { + "epoch": 0.9563480117135116, + "grad_norm": 1.2162660570643138, + "learning_rate": 2.3387058339090773e-07, + "loss": 0.9232, + "num_input_tokens_seen": 1577727200, + "step": 8736 + }, + { + "epoch": 0.9564574837844495, + "grad_norm": 1.0808500046180831, + "learning_rate": 2.3269865502547894e-07, + "loss": 1.0253, + "num_input_tokens_seen": 1577936640, + "step": 8737 + }, + { + "epoch": 0.9565669558553874, + "grad_norm": 1.1385042515383392, + "learning_rate": 2.3152965658445046e-07, + "loss": 0.89, + "num_input_tokens_seen": 1578147872, + "step": 8738 + }, + { + "epoch": 0.9566764279263253, + "grad_norm": 1.1162707962713239, + "learning_rate": 2.3036358820611448e-07, + "loss": 0.8703, + "num_input_tokens_seen": 1578315872, + "step": 8739 + }, + { + "epoch": 0.9567858999972632, + "grad_norm": 1.131083833493162, + "learning_rate": 2.2920045002841338e-07, + "loss": 1.0554, + "num_input_tokens_seen": 1578467968, + "step": 8740 + }, + { + "epoch": 0.9568953720682011, + "grad_norm": 0.9859748434258306, + "learning_rate": 2.280402421889455e-07, + "loss": 0.8738, + "num_input_tokens_seen": 1578645152, + "step": 8741 + }, + { + "epoch": 0.957004844139139, + "grad_norm": 1.0719118073040987, + "learning_rate": 2.2688296482496208e-07, + "loss": 0.8372, + "num_input_tokens_seen": 1578822560, + "step": 8742 + }, + { + "epoch": 0.9571143162100769, + "grad_norm": 1.0548852240696085, + "learning_rate": 2.2572861807336477e-07, + "loss": 0.7412, + "num_input_tokens_seen": 1578988096, + "step": 8743 + }, + { + "epoch": 0.9572237882810148, + "grad_norm": 1.119879832973839, + "learning_rate": 2.2457720207071098e-07, + "loss": 1.0477, + "num_input_tokens_seen": 1579175360, + "step": 8744 + }, + { + "epoch": 0.9573332603519527, + "grad_norm": 1.223946131652182, + "learning_rate": 2.23428716953214e-07, + "loss": 0.9589, + "num_input_tokens_seen": 1579333280, + "step": 8745 + }, + { + "epoch": 0.9574427324228906, + "grad_norm": 1.1143727088686444, + "learning_rate": 2.2228316285673456e-07, + "loss": 0.9437, + "num_input_tokens_seen": 1579485824, + "step": 8746 + }, + { + "epoch": 0.9575522044938285, + "grad_norm": 1.289589005654493, + "learning_rate": 2.211405399167893e-07, + "loss": 1.0804, + "num_input_tokens_seen": 1579650688, + "step": 8747 + }, + { + "epoch": 0.9576616765647664, + "grad_norm": 1.2053189276636862, + "learning_rate": 2.2000084826854784e-07, + "loss": 1.0293, + "num_input_tokens_seen": 1579818240, + "step": 8748 + }, + { + "epoch": 0.9577711486357043, + "grad_norm": 1.2098307812801283, + "learning_rate": 2.1886408804683568e-07, + "loss": 0.9951, + "num_input_tokens_seen": 1580026336, + "step": 8749 + }, + { + "epoch": 0.9578806207066423, + "grad_norm": 1.2441186828115622, + "learning_rate": 2.1773025938612856e-07, + "loss": 0.8404, + "num_input_tokens_seen": 1580196352, + "step": 8750 + }, + { + "epoch": 0.9579900927775801, + "grad_norm": 1.2400078033386315, + "learning_rate": 2.1659936242055811e-07, + "loss": 0.7377, + "num_input_tokens_seen": 1580331424, + "step": 8751 + }, + { + "epoch": 0.958099564848518, + "grad_norm": 1.4599468932225834, + "learning_rate": 2.1547139728390064e-07, + "loss": 0.8792, + "num_input_tokens_seen": 1580488896, + "step": 8752 + }, + { + "epoch": 0.9582090369194559, + "grad_norm": 1.085110246357428, + "learning_rate": 2.143463641095994e-07, + "loss": 1.1186, + "num_input_tokens_seen": 1580682208, + "step": 8753 + }, + { + "epoch": 0.9583185089903938, + "grad_norm": 1.2795369396206664, + "learning_rate": 2.1322426303074238e-07, + "loss": 0.9549, + "num_input_tokens_seen": 1580854464, + "step": 8754 + }, + { + "epoch": 0.9584279810613318, + "grad_norm": 1.1375992763081058, + "learning_rate": 2.1210509418006785e-07, + "loss": 0.8644, + "num_input_tokens_seen": 1581057856, + "step": 8755 + }, + { + "epoch": 0.9585374531322697, + "grad_norm": 1.1122669618821863, + "learning_rate": 2.1098885768997824e-07, + "loss": 1.1488, + "num_input_tokens_seen": 1581234816, + "step": 8756 + }, + { + "epoch": 0.9586469252032075, + "grad_norm": 1.1481986283983, + "learning_rate": 2.098755536925151e-07, + "loss": 0.8268, + "num_input_tokens_seen": 1581386912, + "step": 8757 + }, + { + "epoch": 0.9587563972741454, + "grad_norm": 1.1544353576313144, + "learning_rate": 2.0876518231938426e-07, + "loss": 0.9307, + "num_input_tokens_seen": 1581565664, + "step": 8758 + }, + { + "epoch": 0.9588658693450833, + "grad_norm": 1.0598451881899462, + "learning_rate": 2.0765774370193892e-07, + "loss": 0.6979, + "num_input_tokens_seen": 1581752256, + "step": 8759 + }, + { + "epoch": 0.9589753414160213, + "grad_norm": 1.1484257703947658, + "learning_rate": 2.0655323797119098e-07, + "loss": 0.85, + "num_input_tokens_seen": 1581951616, + "step": 8760 + }, + { + "epoch": 0.9590848134869592, + "grad_norm": 1.083536856435934, + "learning_rate": 2.0545166525779147e-07, + "loss": 1.1693, + "num_input_tokens_seen": 1582157024, + "step": 8761 + }, + { + "epoch": 0.959194285557897, + "grad_norm": 1.0424942480244204, + "learning_rate": 2.0435302569206672e-07, + "loss": 0.7726, + "num_input_tokens_seen": 1582367136, + "step": 8762 + }, + { + "epoch": 0.9593037576288349, + "grad_norm": 1.1792197910828404, + "learning_rate": 2.0325731940397386e-07, + "loss": 0.9017, + "num_input_tokens_seen": 1582514528, + "step": 8763 + }, + { + "epoch": 0.9594132296997728, + "grad_norm": 1.0591699362363456, + "learning_rate": 2.0216454652313976e-07, + "loss": 0.977, + "num_input_tokens_seen": 1582714560, + "step": 8764 + }, + { + "epoch": 0.9595227017707108, + "grad_norm": 1.0785576325511372, + "learning_rate": 2.0107470717883326e-07, + "loss": 1.1677, + "num_input_tokens_seen": 1582898464, + "step": 8765 + }, + { + "epoch": 0.9596321738416487, + "grad_norm": 0.9713854166821567, + "learning_rate": 1.9998780149997898e-07, + "loss": 0.8775, + "num_input_tokens_seen": 1583079456, + "step": 8766 + }, + { + "epoch": 0.9597416459125866, + "grad_norm": 1.1062235058559107, + "learning_rate": 1.9890382961516295e-07, + "loss": 1.007, + "num_input_tokens_seen": 1583267168, + "step": 8767 + }, + { + "epoch": 0.9598511179835244, + "grad_norm": 1.2767530936343163, + "learning_rate": 1.9782279165260765e-07, + "loss": 0.8749, + "num_input_tokens_seen": 1583458912, + "step": 8768 + }, + { + "epoch": 0.9599605900544623, + "grad_norm": 1.1025239252448469, + "learning_rate": 1.9674468774020516e-07, + "loss": 1.0305, + "num_input_tokens_seen": 1583650208, + "step": 8769 + }, + { + "epoch": 0.9600700621254002, + "grad_norm": 1.1352430380024123, + "learning_rate": 1.956695180054896e-07, + "loss": 0.9036, + "num_input_tokens_seen": 1583807232, + "step": 8770 + }, + { + "epoch": 0.9601795341963382, + "grad_norm": 1.0059274002331355, + "learning_rate": 1.9459728257565367e-07, + "loss": 0.9012, + "num_input_tokens_seen": 1583978368, + "step": 8771 + }, + { + "epoch": 0.9602890062672761, + "grad_norm": 1.0155963013335612, + "learning_rate": 1.935279815775376e-07, + "loss": 0.8983, + "num_input_tokens_seen": 1584161824, + "step": 8772 + }, + { + "epoch": 0.960398478338214, + "grad_norm": 1.1288606311068212, + "learning_rate": 1.9246161513764015e-07, + "loss": 1.0054, + "num_input_tokens_seen": 1584365216, + "step": 8773 + }, + { + "epoch": 0.9605079504091518, + "grad_norm": 1.2378586971731367, + "learning_rate": 1.9139818338211047e-07, + "loss": 0.9221, + "num_input_tokens_seen": 1584538816, + "step": 8774 + }, + { + "epoch": 0.9606174224800897, + "grad_norm": 1.0309426976882605, + "learning_rate": 1.903376864367451e-07, + "loss": 0.8978, + "num_input_tokens_seen": 1584693600, + "step": 8775 + }, + { + "epoch": 0.9607268945510277, + "grad_norm": 1.237113281475099, + "learning_rate": 1.892801244270076e-07, + "loss": 0.9562, + "num_input_tokens_seen": 1584897664, + "step": 8776 + }, + { + "epoch": 0.9608363666219656, + "grad_norm": 1.2055694716002443, + "learning_rate": 1.8822549747800066e-07, + "loss": 0.802, + "num_input_tokens_seen": 1585075072, + "step": 8777 + }, + { + "epoch": 0.9609458386929035, + "grad_norm": 1.0095988565705984, + "learning_rate": 1.8717380571448562e-07, + "loss": 0.7554, + "num_input_tokens_seen": 1585218656, + "step": 8778 + }, + { + "epoch": 0.9610553107638413, + "grad_norm": 0.991303087282525, + "learning_rate": 1.8612504926087405e-07, + "loss": 0.8497, + "num_input_tokens_seen": 1585403904, + "step": 8779 + }, + { + "epoch": 0.9611647828347792, + "grad_norm": 1.0720557613709216, + "learning_rate": 1.8507922824123614e-07, + "loss": 0.9242, + "num_input_tokens_seen": 1585567200, + "step": 8780 + }, + { + "epoch": 0.9612742549057172, + "grad_norm": 1.1427392047246163, + "learning_rate": 1.8403634277928407e-07, + "loss": 0.8591, + "num_input_tokens_seen": 1585759168, + "step": 8781 + }, + { + "epoch": 0.9613837269766551, + "grad_norm": 1.300611559364423, + "learning_rate": 1.829963929983941e-07, + "loss": 1.0846, + "num_input_tokens_seen": 1585928064, + "step": 8782 + }, + { + "epoch": 0.961493199047593, + "grad_norm": 1.2572032803638613, + "learning_rate": 1.8195937902158732e-07, + "loss": 0.9992, + "num_input_tokens_seen": 1586079264, + "step": 8783 + }, + { + "epoch": 0.9616026711185309, + "grad_norm": 1.1976018187812953, + "learning_rate": 1.8092530097154337e-07, + "loss": 0.921, + "num_input_tokens_seen": 1586261376, + "step": 8784 + }, + { + "epoch": 0.9617121431894687, + "grad_norm": 1.1469055649844806, + "learning_rate": 1.7989415897058938e-07, + "loss": 0.9677, + "num_input_tokens_seen": 1586453120, + "step": 8785 + }, + { + "epoch": 0.9618216152604067, + "grad_norm": 1.2066955099469776, + "learning_rate": 1.7886595314070832e-07, + "loss": 0.8934, + "num_input_tokens_seen": 1586608576, + "step": 8786 + }, + { + "epoch": 0.9619310873313446, + "grad_norm": 1.1613764655621612, + "learning_rate": 1.7784068360353623e-07, + "loss": 0.9203, + "num_input_tokens_seen": 1586790912, + "step": 8787 + }, + { + "epoch": 0.9620405594022825, + "grad_norm": 1.1093445894613243, + "learning_rate": 1.7681835048035944e-07, + "loss": 0.7854, + "num_input_tokens_seen": 1586973248, + "step": 8788 + }, + { + "epoch": 0.9621500314732204, + "grad_norm": 1.2453434996533712, + "learning_rate": 1.7579895389211732e-07, + "loss": 0.9243, + "num_input_tokens_seen": 1587143936, + "step": 8789 + }, + { + "epoch": 0.9622595035441583, + "grad_norm": 1.1016291063882104, + "learning_rate": 1.7478249395940227e-07, + "loss": 1.003, + "num_input_tokens_seen": 1587316640, + "step": 8790 + }, + { + "epoch": 0.9623689756150962, + "grad_norm": 1.1518205909898143, + "learning_rate": 1.7376897080246257e-07, + "loss": 1.0206, + "num_input_tokens_seen": 1587473440, + "step": 8791 + }, + { + "epoch": 0.9624784476860341, + "grad_norm": 1.0912568150468103, + "learning_rate": 1.727583845411912e-07, + "loss": 1.0641, + "num_input_tokens_seen": 1587673472, + "step": 8792 + }, + { + "epoch": 0.962587919756972, + "grad_norm": 1.0409122723488764, + "learning_rate": 1.717507352951453e-07, + "loss": 0.8737, + "num_input_tokens_seen": 1587836768, + "step": 8793 + }, + { + "epoch": 0.9626973918279099, + "grad_norm": 1.0683654003914869, + "learning_rate": 1.707460231835184e-07, + "loss": 0.905, + "num_input_tokens_seen": 1588030752, + "step": 8794 + }, + { + "epoch": 0.9628068638988478, + "grad_norm": 1.228621340328692, + "learning_rate": 1.6974424832517654e-07, + "loss": 0.9741, + "num_input_tokens_seen": 1588196064, + "step": 8795 + }, + { + "epoch": 0.9629163359697858, + "grad_norm": 1.0590836070758216, + "learning_rate": 1.687454108386194e-07, + "loss": 1.1864, + "num_input_tokens_seen": 1588396096, + "step": 8796 + }, + { + "epoch": 0.9630258080407236, + "grad_norm": 1.2006365319670733, + "learning_rate": 1.6774951084201073e-07, + "loss": 0.9868, + "num_input_tokens_seen": 1588575296, + "step": 8797 + }, + { + "epoch": 0.9631352801116615, + "grad_norm": 1.0421128207649581, + "learning_rate": 1.6675654845316746e-07, + "loss": 0.7224, + "num_input_tokens_seen": 1588732768, + "step": 8798 + }, + { + "epoch": 0.9632447521825994, + "grad_norm": 1.0421641242795805, + "learning_rate": 1.657665237895484e-07, + "loss": 0.8894, + "num_input_tokens_seen": 1588898752, + "step": 8799 + }, + { + "epoch": 0.9633542242535373, + "grad_norm": 1.3123241488408492, + "learning_rate": 1.6477943696827647e-07, + "loss": 0.7983, + "num_input_tokens_seen": 1589081312, + "step": 8800 + }, + { + "epoch": 0.9634636963244753, + "grad_norm": 0.9704536409051004, + "learning_rate": 1.6379528810611666e-07, + "loss": 0.5396, + "num_input_tokens_seen": 1589277760, + "step": 8801 + }, + { + "epoch": 0.9635731683954131, + "grad_norm": 1.2334423646330797, + "learning_rate": 1.6281407731949805e-07, + "loss": 0.7957, + "num_input_tokens_seen": 1589457184, + "step": 8802 + }, + { + "epoch": 0.963682640466351, + "grad_norm": 1.1084251635854123, + "learning_rate": 1.6183580472449444e-07, + "loss": 0.8748, + "num_input_tokens_seen": 1589677824, + "step": 8803 + }, + { + "epoch": 0.9637921125372889, + "grad_norm": 1.0701899771625776, + "learning_rate": 1.6086047043682994e-07, + "loss": 0.6863, + "num_input_tokens_seen": 1589867328, + "step": 8804 + }, + { + "epoch": 0.9639015846082268, + "grad_norm": 1.215632044911503, + "learning_rate": 1.5988807457189003e-07, + "loss": 0.9494, + "num_input_tokens_seen": 1590070720, + "step": 8805 + }, + { + "epoch": 0.9640110566791648, + "grad_norm": 1.0797880862868077, + "learning_rate": 1.5891861724470214e-07, + "loss": 0.9283, + "num_input_tokens_seen": 1590242528, + "step": 8806 + }, + { + "epoch": 0.9641205287501027, + "grad_norm": 1.129266067765538, + "learning_rate": 1.5795209856995507e-07, + "loss": 0.978, + "num_input_tokens_seen": 1590418592, + "step": 8807 + }, + { + "epoch": 0.9642300008210405, + "grad_norm": 1.1297497089822035, + "learning_rate": 1.5698851866198516e-07, + "loss": 0.9938, + "num_input_tokens_seen": 1590605184, + "step": 8808 + }, + { + "epoch": 0.9643394728919784, + "grad_norm": 1.2031512071802, + "learning_rate": 1.5602787763478177e-07, + "loss": 1.0023, + "num_input_tokens_seen": 1590773632, + "step": 8809 + }, + { + "epoch": 0.9644489449629163, + "grad_norm": 1.0661494466362895, + "learning_rate": 1.5507017560198457e-07, + "loss": 0.7938, + "num_input_tokens_seen": 1590953280, + "step": 8810 + }, + { + "epoch": 0.9645584170338543, + "grad_norm": 1.055109914324063, + "learning_rate": 1.5411541267689178e-07, + "loss": 0.8998, + "num_input_tokens_seen": 1591138304, + "step": 8811 + }, + { + "epoch": 0.9646678891047922, + "grad_norm": 1.0267075574047397, + "learning_rate": 1.531635889724492e-07, + "loss": 0.9486, + "num_input_tokens_seen": 1591343488, + "step": 8812 + }, + { + "epoch": 0.9647773611757301, + "grad_norm": 1.1289354472361048, + "learning_rate": 1.5221470460125565e-07, + "loss": 1.0323, + "num_input_tokens_seen": 1591530304, + "step": 8813 + }, + { + "epoch": 0.9648868332466679, + "grad_norm": 1.1729030931318647, + "learning_rate": 1.512687596755602e-07, + "loss": 1.1449, + "num_input_tokens_seen": 1591717568, + "step": 8814 + }, + { + "epoch": 0.9649963053176058, + "grad_norm": 1.078877449065571, + "learning_rate": 1.5032575430726782e-07, + "loss": 1.0165, + "num_input_tokens_seen": 1591910880, + "step": 8815 + }, + { + "epoch": 0.9651057773885438, + "grad_norm": 1.3567769885762515, + "learning_rate": 1.4938568860793367e-07, + "loss": 0.9565, + "num_input_tokens_seen": 1592096352, + "step": 8816 + }, + { + "epoch": 0.9652152494594817, + "grad_norm": 1.1491439099971266, + "learning_rate": 1.4844856268876607e-07, + "loss": 0.8753, + "num_input_tokens_seen": 1592276224, + "step": 8817 + }, + { + "epoch": 0.9653247215304196, + "grad_norm": 1.1452277346022375, + "learning_rate": 1.475143766606263e-07, + "loss": 0.9768, + "num_input_tokens_seen": 1592430784, + "step": 8818 + }, + { + "epoch": 0.9654341936013574, + "grad_norm": 1.1093719912206117, + "learning_rate": 1.4658313063402595e-07, + "loss": 0.9454, + "num_input_tokens_seen": 1592606848, + "step": 8819 + }, + { + "epoch": 0.9655436656722953, + "grad_norm": 1.2166402602705007, + "learning_rate": 1.4565482471912971e-07, + "loss": 0.9764, + "num_input_tokens_seen": 1592765216, + "step": 8820 + }, + { + "epoch": 0.9656531377432332, + "grad_norm": 1.0628385565417395, + "learning_rate": 1.447294590257553e-07, + "loss": 0.684, + "num_input_tokens_seen": 1592955840, + "step": 8821 + }, + { + "epoch": 0.9657626098141712, + "grad_norm": 1.0949127556172364, + "learning_rate": 1.438070336633679e-07, + "loss": 0.7183, + "num_input_tokens_seen": 1593144896, + "step": 8822 + }, + { + "epoch": 0.9658720818851091, + "grad_norm": 1.2620022576277137, + "learning_rate": 1.4288754874109134e-07, + "loss": 0.9583, + "num_input_tokens_seen": 1593338432, + "step": 8823 + }, + { + "epoch": 0.965981553956047, + "grad_norm": 1.1321625522157088, + "learning_rate": 1.419710043677025e-07, + "loss": 1.0413, + "num_input_tokens_seen": 1593528384, + "step": 8824 + }, + { + "epoch": 0.9660910260269848, + "grad_norm": 1.209814178211806, + "learning_rate": 1.410574006516202e-07, + "loss": 1.2107, + "num_input_tokens_seen": 1593729312, + "step": 8825 + }, + { + "epoch": 0.9662004980979227, + "grad_norm": 1.158574474171796, + "learning_rate": 1.4014673770092746e-07, + "loss": 0.8511, + "num_input_tokens_seen": 1593912992, + "step": 8826 + }, + { + "epoch": 0.9663099701688607, + "grad_norm": 1.1458353678367355, + "learning_rate": 1.3923901562334917e-07, + "loss": 0.7478, + "num_input_tokens_seen": 1594051872, + "step": 8827 + }, + { + "epoch": 0.9664194422397986, + "grad_norm": 1.0563910395792924, + "learning_rate": 1.383342345262717e-07, + "loss": 0.7839, + "num_input_tokens_seen": 1594242272, + "step": 8828 + }, + { + "epoch": 0.9665289143107365, + "grad_norm": 1.1323796991111172, + "learning_rate": 1.3743239451672608e-07, + "loss": 0.9916, + "num_input_tokens_seen": 1594425952, + "step": 8829 + }, + { + "epoch": 0.9666383863816744, + "grad_norm": 1.1504131114167448, + "learning_rate": 1.3653349570139918e-07, + "loss": 0.7943, + "num_input_tokens_seen": 1594588576, + "step": 8830 + }, + { + "epoch": 0.9667478584526122, + "grad_norm": 1.0970408930950304, + "learning_rate": 1.3563753818663093e-07, + "loss": 0.6632, + "num_input_tokens_seen": 1594774720, + "step": 8831 + }, + { + "epoch": 0.9668573305235502, + "grad_norm": 1.1080488956177208, + "learning_rate": 1.3474452207840605e-07, + "loss": 0.7547, + "num_input_tokens_seen": 1594959520, + "step": 8832 + }, + { + "epoch": 0.9669668025944881, + "grad_norm": 1.2269603788543384, + "learning_rate": 1.3385444748237053e-07, + "loss": 0.7651, + "num_input_tokens_seen": 1595127296, + "step": 8833 + }, + { + "epoch": 0.967076274665426, + "grad_norm": 1.120325602903049, + "learning_rate": 1.3296731450381795e-07, + "loss": 0.8201, + "num_input_tokens_seen": 1595276032, + "step": 8834 + }, + { + "epoch": 0.9671857467363639, + "grad_norm": 1.302661213114388, + "learning_rate": 1.3208312324769766e-07, + "loss": 0.9111, + "num_input_tokens_seen": 1595450080, + "step": 8835 + }, + { + "epoch": 0.9672952188073017, + "grad_norm": 1.187550192324329, + "learning_rate": 1.3120187381859826e-07, + "loss": 0.8124, + "num_input_tokens_seen": 1595620096, + "step": 8836 + }, + { + "epoch": 0.9674046908782397, + "grad_norm": 1.123308393463777, + "learning_rate": 1.303235663207808e-07, + "loss": 0.9544, + "num_input_tokens_seen": 1595820352, + "step": 8837 + }, + { + "epoch": 0.9675141629491776, + "grad_norm": 1.0869724638433236, + "learning_rate": 1.2944820085814268e-07, + "loss": 0.6601, + "num_input_tokens_seen": 1595978720, + "step": 8838 + }, + { + "epoch": 0.9676236350201155, + "grad_norm": 1.0574213834718451, + "learning_rate": 1.2857577753423444e-07, + "loss": 0.8659, + "num_input_tokens_seen": 1596122752, + "step": 8839 + }, + { + "epoch": 0.9677331070910534, + "grad_norm": 1.1831986173768698, + "learning_rate": 1.2770629645226796e-07, + "loss": 0.8747, + "num_input_tokens_seen": 1596258496, + "step": 8840 + }, + { + "epoch": 0.9678425791619913, + "grad_norm": 1.2512885604694004, + "learning_rate": 1.2683975771509982e-07, + "loss": 1.0417, + "num_input_tokens_seen": 1596404320, + "step": 8841 + }, + { + "epoch": 0.9679520512329292, + "grad_norm": 1.0520666832440453, + "learning_rate": 1.2597616142523973e-07, + "loss": 0.929, + "num_input_tokens_seen": 1596595168, + "step": 8842 + }, + { + "epoch": 0.9680615233038671, + "grad_norm": 1.1454150852614766, + "learning_rate": 1.251155076848448e-07, + "loss": 0.9454, + "num_input_tokens_seen": 1596775488, + "step": 8843 + }, + { + "epoch": 0.968170995374805, + "grad_norm": 1.07433348996592, + "learning_rate": 1.2425779659573368e-07, + "loss": 0.8901, + "num_input_tokens_seen": 1596957152, + "step": 8844 + }, + { + "epoch": 0.9682804674457429, + "grad_norm": 1.163181201739234, + "learning_rate": 1.2340302825937232e-07, + "loss": 0.6821, + "num_input_tokens_seen": 1597095360, + "step": 8845 + }, + { + "epoch": 0.9683899395166808, + "grad_norm": 1.1745051051938704, + "learning_rate": 1.2255120277687714e-07, + "loss": 0.9196, + "num_input_tokens_seen": 1597272320, + "step": 8846 + }, + { + "epoch": 0.9684994115876188, + "grad_norm": 1.3447476609070166, + "learning_rate": 1.2170232024901473e-07, + "loss": 0.964, + "num_input_tokens_seen": 1597433600, + "step": 8847 + }, + { + "epoch": 0.9686088836585566, + "grad_norm": 1.1011618027458163, + "learning_rate": 1.208563807762103e-07, + "loss": 1.1229, + "num_input_tokens_seen": 1597638784, + "step": 8848 + }, + { + "epoch": 0.9687183557294945, + "grad_norm": 1.0391868000685471, + "learning_rate": 1.2001338445853382e-07, + "loss": 0.8259, + "num_input_tokens_seen": 1597847328, + "step": 8849 + }, + { + "epoch": 0.9688278278004324, + "grad_norm": 1.1893951451995486, + "learning_rate": 1.1917333139571385e-07, + "loss": 1.1402, + "num_input_tokens_seen": 1598021152, + "step": 8850 + }, + { + "epoch": 0.9689372998713703, + "grad_norm": 1.1672095954162016, + "learning_rate": 1.1833622168712366e-07, + "loss": 0.7415, + "num_input_tokens_seen": 1598197888, + "step": 8851 + }, + { + "epoch": 0.9690467719423083, + "grad_norm": 1.2316215322498536, + "learning_rate": 1.1750205543179239e-07, + "loss": 1.1279, + "num_input_tokens_seen": 1598365888, + "step": 8852 + }, + { + "epoch": 0.9691562440132461, + "grad_norm": 1.174283718360024, + "learning_rate": 1.1667083272840218e-07, + "loss": 0.6788, + "num_input_tokens_seen": 1598546880, + "step": 8853 + }, + { + "epoch": 0.969265716084184, + "grad_norm": 1.0298116073720287, + "learning_rate": 1.1584255367528274e-07, + "loss": 0.7425, + "num_input_tokens_seen": 1598715328, + "step": 8854 + }, + { + "epoch": 0.9693751881551219, + "grad_norm": 1.2548330809242123, + "learning_rate": 1.1501721837041679e-07, + "loss": 0.8326, + "num_input_tokens_seen": 1598913568, + "step": 8855 + }, + { + "epoch": 0.9694846602260598, + "grad_norm": 1.242725279895808, + "learning_rate": 1.141948269114429e-07, + "loss": 0.915, + "num_input_tokens_seen": 1599093216, + "step": 8856 + }, + { + "epoch": 0.9695941322969978, + "grad_norm": 1.2014234822069725, + "learning_rate": 1.133753793956499e-07, + "loss": 0.6336, + "num_input_tokens_seen": 1599248896, + "step": 8857 + }, + { + "epoch": 0.9697036043679357, + "grad_norm": 1.160811011507582, + "learning_rate": 1.1255887591997138e-07, + "loss": 0.7579, + "num_input_tokens_seen": 1599431456, + "step": 8858 + }, + { + "epoch": 0.9698130764388735, + "grad_norm": 1.1235446523358747, + "learning_rate": 1.1174531658100229e-07, + "loss": 1.0034, + "num_input_tokens_seen": 1599616032, + "step": 8859 + }, + { + "epoch": 0.9699225485098114, + "grad_norm": 1.0973370179484891, + "learning_rate": 1.1093470147498231e-07, + "loss": 0.9299, + "num_input_tokens_seen": 1599818528, + "step": 8860 + }, + { + "epoch": 0.9700320205807493, + "grad_norm": 1.2779600657251844, + "learning_rate": 1.1012703069780972e-07, + "loss": 1.0742, + "num_input_tokens_seen": 1599987872, + "step": 8861 + }, + { + "epoch": 0.9701414926516873, + "grad_norm": 1.1827162205805657, + "learning_rate": 1.0932230434502755e-07, + "loss": 0.9106, + "num_input_tokens_seen": 1600174912, + "step": 8862 + }, + { + "epoch": 0.9702509647226252, + "grad_norm": 1.1950632471242162, + "learning_rate": 1.0852052251183187e-07, + "loss": 0.9124, + "num_input_tokens_seen": 1600316928, + "step": 8863 + }, + { + "epoch": 0.9703604367935631, + "grad_norm": 1.1094480678512126, + "learning_rate": 1.0772168529307736e-07, + "loss": 1.0668, + "num_input_tokens_seen": 1600525248, + "step": 8864 + }, + { + "epoch": 0.9704699088645009, + "grad_norm": 1.170466173826333, + "learning_rate": 1.0692579278325788e-07, + "loss": 0.8373, + "num_input_tokens_seen": 1600703552, + "step": 8865 + }, + { + "epoch": 0.9705793809354388, + "grad_norm": 1.129608305586223, + "learning_rate": 1.061328450765342e-07, + "loss": 0.9818, + "num_input_tokens_seen": 1600892384, + "step": 8866 + }, + { + "epoch": 0.9706888530063768, + "grad_norm": 1.1697678504392244, + "learning_rate": 1.0534284226670077e-07, + "loss": 0.9934, + "num_input_tokens_seen": 1601065088, + "step": 8867 + }, + { + "epoch": 0.9707983250773147, + "grad_norm": 1.159034109253856, + "learning_rate": 1.045557844472217e-07, + "loss": 0.9454, + "num_input_tokens_seen": 1601226592, + "step": 8868 + }, + { + "epoch": 0.9709077971482526, + "grad_norm": 1.0726260498418716, + "learning_rate": 1.0377167171120028e-07, + "loss": 1.0661, + "num_input_tokens_seen": 1601423488, + "step": 8869 + }, + { + "epoch": 0.9710172692191904, + "grad_norm": 1.2633419402105264, + "learning_rate": 1.0299050415139844e-07, + "loss": 1.0512, + "num_input_tokens_seen": 1601592608, + "step": 8870 + }, + { + "epoch": 0.9711267412901283, + "grad_norm": 1.0965507614972885, + "learning_rate": 1.022122818602228e-07, + "loss": 0.9743, + "num_input_tokens_seen": 1601750080, + "step": 8871 + }, + { + "epoch": 0.9712362133610662, + "grad_norm": 1.1501191989655783, + "learning_rate": 1.0143700492973862e-07, + "loss": 1.4801, + "num_input_tokens_seen": 1601954368, + "step": 8872 + }, + { + "epoch": 0.9713456854320042, + "grad_norm": 1.215833794086274, + "learning_rate": 1.0066467345165864e-07, + "loss": 0.9686, + "num_input_tokens_seen": 1602149472, + "step": 8873 + }, + { + "epoch": 0.9714551575029421, + "grad_norm": 1.2759542577999188, + "learning_rate": 9.989528751734867e-08, + "loss": 0.9571, + "num_input_tokens_seen": 1602308736, + "step": 8874 + }, + { + "epoch": 0.97156462957388, + "grad_norm": 0.9950076596046719, + "learning_rate": 9.912884721782478e-08, + "loss": 0.9825, + "num_input_tokens_seen": 1602486368, + "step": 8875 + }, + { + "epoch": 0.9716741016448178, + "grad_norm": 1.1947815657135752, + "learning_rate": 9.836535264375613e-08, + "loss": 0.6396, + "num_input_tokens_seen": 1602684832, + "step": 8876 + }, + { + "epoch": 0.9717835737157557, + "grad_norm": 1.0972774843123025, + "learning_rate": 9.760480388546211e-08, + "loss": 1.0107, + "num_input_tokens_seen": 1602867392, + "step": 8877 + }, + { + "epoch": 0.9718930457866937, + "grad_norm": 1.1685155398764187, + "learning_rate": 9.684720103291522e-08, + "loss": 0.9433, + "num_input_tokens_seen": 1603068768, + "step": 8878 + }, + { + "epoch": 0.9720025178576316, + "grad_norm": 1.180965514465262, + "learning_rate": 9.609254417573543e-08, + "loss": 0.8469, + "num_input_tokens_seen": 1603235648, + "step": 8879 + }, + { + "epoch": 0.9721119899285695, + "grad_norm": 1.1630571875874445, + "learning_rate": 9.534083340320132e-08, + "loss": 0.7047, + "num_input_tokens_seen": 1603398944, + "step": 8880 + }, + { + "epoch": 0.9722214619995074, + "grad_norm": 1.2154187868448745, + "learning_rate": 9.459206880423621e-08, + "loss": 0.8611, + "num_input_tokens_seen": 1603575008, + "step": 8881 + }, + { + "epoch": 0.9723309340704452, + "grad_norm": 0.97931122558207, + "learning_rate": 9.384625046741924e-08, + "loss": 0.9154, + "num_input_tokens_seen": 1603734944, + "step": 8882 + }, + { + "epoch": 0.9724404061413832, + "grad_norm": 1.0601419199099438, + "learning_rate": 9.310337848097705e-08, + "loss": 1.0286, + "num_input_tokens_seen": 1603910560, + "step": 8883 + }, + { + "epoch": 0.9725498782123211, + "grad_norm": 1.1148650564979308, + "learning_rate": 9.236345293279492e-08, + "loss": 0.6788, + "num_input_tokens_seen": 1604103200, + "step": 8884 + }, + { + "epoch": 0.972659350283259, + "grad_norm": 1.060541704421673, + "learning_rate": 9.162647391039724e-08, + "loss": 0.8142, + "num_input_tokens_seen": 1604284640, + "step": 8885 + }, + { + "epoch": 0.9727688223541969, + "grad_norm": 1.1709321361588911, + "learning_rate": 9.089244150097265e-08, + "loss": 0.8404, + "num_input_tokens_seen": 1604438976, + "step": 8886 + }, + { + "epoch": 0.9728782944251347, + "grad_norm": 1.1751471041858719, + "learning_rate": 9.016135579135165e-08, + "loss": 0.9308, + "num_input_tokens_seen": 1604619520, + "step": 8887 + }, + { + "epoch": 0.9729877664960727, + "grad_norm": 1.1477577637194016, + "learning_rate": 8.943321686802619e-08, + "loss": 0.8302, + "num_input_tokens_seen": 1604815072, + "step": 8888 + }, + { + "epoch": 0.9730972385670106, + "grad_norm": 1.0174131058276432, + "learning_rate": 8.870802481712736e-08, + "loss": 0.7656, + "num_input_tokens_seen": 1604983520, + "step": 8889 + }, + { + "epoch": 0.9732067106379485, + "grad_norm": 1.0873003853188081, + "learning_rate": 8.798577972445043e-08, + "loss": 0.853, + "num_input_tokens_seen": 1605153984, + "step": 8890 + }, + { + "epoch": 0.9733161827088864, + "grad_norm": 1.246212561991054, + "learning_rate": 8.726648167542706e-08, + "loss": 0.9289, + "num_input_tokens_seen": 1605291968, + "step": 8891 + }, + { + "epoch": 0.9734256547798243, + "grad_norm": 1.1786737191203593, + "learning_rate": 8.65501307551586e-08, + "loss": 0.8955, + "num_input_tokens_seen": 1605467808, + "step": 8892 + }, + { + "epoch": 0.9735351268507622, + "grad_norm": 1.080758733996031, + "learning_rate": 8.583672704838008e-08, + "loss": 1.1198, + "num_input_tokens_seen": 1605672768, + "step": 8893 + }, + { + "epoch": 0.9736445989217001, + "grad_norm": 0.9626472273844774, + "learning_rate": 8.512627063949064e-08, + "loss": 0.832, + "num_input_tokens_seen": 1605858016, + "step": 8894 + }, + { + "epoch": 0.973754070992638, + "grad_norm": 1.1334377799337465, + "learning_rate": 8.441876161253414e-08, + "loss": 0.9512, + "num_input_tokens_seen": 1606034752, + "step": 8895 + }, + { + "epoch": 0.9738635430635759, + "grad_norm": 1.064615891637444, + "learning_rate": 8.371420005120756e-08, + "loss": 0.7818, + "num_input_tokens_seen": 1606222912, + "step": 8896 + }, + { + "epoch": 0.9739730151345138, + "grad_norm": 0.9928135136018885, + "learning_rate": 8.301258603885808e-08, + "loss": 0.6377, + "num_input_tokens_seen": 1606407040, + "step": 8897 + }, + { + "epoch": 0.9740824872054518, + "grad_norm": 1.1242173969613232, + "learning_rate": 8.231391965848601e-08, + "loss": 0.9861, + "num_input_tokens_seen": 1606579296, + "step": 8898 + }, + { + "epoch": 0.9741919592763896, + "grad_norm": 1.2863800344178673, + "learning_rate": 8.161820099274464e-08, + "loss": 0.9617, + "num_input_tokens_seen": 1606779776, + "step": 8899 + }, + { + "epoch": 0.9743014313473275, + "grad_norm": 1.2490425257532813, + "learning_rate": 8.092543012393483e-08, + "loss": 0.9095, + "num_input_tokens_seen": 1606950464, + "step": 8900 + }, + { + "epoch": 0.9744109034182654, + "grad_norm": 1.3786131463561155, + "learning_rate": 8.023560713400769e-08, + "loss": 0.9943, + "num_input_tokens_seen": 1607079040, + "step": 8901 + }, + { + "epoch": 0.9745203754892033, + "grad_norm": 1.0972817213019272, + "learning_rate": 7.954873210457015e-08, + "loss": 0.9535, + "num_input_tokens_seen": 1607285792, + "step": 8902 + }, + { + "epoch": 0.9746298475601413, + "grad_norm": 1.1345878107209293, + "learning_rate": 7.886480511687666e-08, + "loss": 0.8621, + "num_input_tokens_seen": 1607454912, + "step": 8903 + }, + { + "epoch": 0.9747393196310791, + "grad_norm": 1.088825803235421, + "learning_rate": 7.81838262518375e-08, + "loss": 1.0413, + "num_input_tokens_seen": 1607654496, + "step": 8904 + }, + { + "epoch": 0.974848791702017, + "grad_norm": 1.1197944049653867, + "learning_rate": 7.75057955900077e-08, + "loss": 1.0182, + "num_input_tokens_seen": 1607828768, + "step": 8905 + }, + { + "epoch": 0.9749582637729549, + "grad_norm": 1.015281208245332, + "learning_rate": 7.683071321160085e-08, + "loss": 0.7526, + "num_input_tokens_seen": 1608007744, + "step": 8906 + }, + { + "epoch": 0.9750677358438928, + "grad_norm": 1.1436200990545806, + "learning_rate": 7.615857919647252e-08, + "loss": 1.0368, + "num_input_tokens_seen": 1608198144, + "step": 8907 + }, + { + "epoch": 0.9751772079148308, + "grad_norm": 1.1459779416935587, + "learning_rate": 7.548939362414243e-08, + "loss": 0.9094, + "num_input_tokens_seen": 1608370848, + "step": 8908 + }, + { + "epoch": 0.9752866799857687, + "grad_norm": 1.1236681471940975, + "learning_rate": 7.482315657376394e-08, + "loss": 1.0805, + "num_input_tokens_seen": 1608565056, + "step": 8909 + }, + { + "epoch": 0.9753961520567065, + "grad_norm": 1.1395255760512955, + "learning_rate": 7.41598681241601e-08, + "loss": 1.0569, + "num_input_tokens_seen": 1608769568, + "step": 8910 + }, + { + "epoch": 0.9755056241276444, + "grad_norm": 1.0041213343242463, + "learning_rate": 7.349952835379592e-08, + "loss": 1.0158, + "num_input_tokens_seen": 1608961088, + "step": 8911 + }, + { + "epoch": 0.9756150961985823, + "grad_norm": 1.0822762993366222, + "learning_rate": 7.284213734078394e-08, + "loss": 0.9854, + "num_input_tokens_seen": 1609147008, + "step": 8912 + }, + { + "epoch": 0.9757245682695203, + "grad_norm": 1.0758130813355056, + "learning_rate": 7.218769516289247e-08, + "loss": 0.9069, + "num_input_tokens_seen": 1609344576, + "step": 8913 + }, + { + "epoch": 0.9758340403404582, + "grad_norm": 1.0938907532724234, + "learning_rate": 7.153620189754573e-08, + "loss": 0.8174, + "num_input_tokens_seen": 1609543040, + "step": 8914 + }, + { + "epoch": 0.9759435124113961, + "grad_norm": 1.1722624074808499, + "learning_rate": 7.088765762180982e-08, + "loss": 0.9677, + "num_input_tokens_seen": 1609726048, + "step": 8915 + }, + { + "epoch": 0.9760529844823339, + "grad_norm": 1.123756197953662, + "learning_rate": 7.024206241240671e-08, + "loss": 0.8634, + "num_input_tokens_seen": 1609911520, + "step": 8916 + }, + { + "epoch": 0.9761624565532718, + "grad_norm": 1.2052874267438844, + "learning_rate": 6.959941634571143e-08, + "loss": 1.1983, + "num_input_tokens_seen": 1610100128, + "step": 8917 + }, + { + "epoch": 0.9762719286242098, + "grad_norm": 1.2047043874623182, + "learning_rate": 6.895971949774649e-08, + "loss": 0.7806, + "num_input_tokens_seen": 1610260288, + "step": 8918 + }, + { + "epoch": 0.9763814006951477, + "grad_norm": 1.109026518340415, + "learning_rate": 6.832297194418746e-08, + "loss": 0.8743, + "num_input_tokens_seen": 1610456960, + "step": 8919 + }, + { + "epoch": 0.9764908727660856, + "grad_norm": 1.0993241639823823, + "learning_rate": 6.768917376035744e-08, + "loss": 0.9147, + "num_input_tokens_seen": 1610629888, + "step": 8920 + }, + { + "epoch": 0.9766003448370234, + "grad_norm": 1.1912892768845411, + "learning_rate": 6.70583250212381e-08, + "loss": 0.8439, + "num_input_tokens_seen": 1610801248, + "step": 8921 + }, + { + "epoch": 0.9767098169079613, + "grad_norm": 1.1213042696234077, + "learning_rate": 6.643042580145309e-08, + "loss": 1.0172, + "num_input_tokens_seen": 1611002624, + "step": 8922 + }, + { + "epoch": 0.9768192889788992, + "grad_norm": 1.3031348852427587, + "learning_rate": 6.580547617528465e-08, + "loss": 1.2454, + "num_input_tokens_seen": 1611194592, + "step": 8923 + }, + { + "epoch": 0.9769287610498372, + "grad_norm": 1.1297849515508664, + "learning_rate": 6.518347621666255e-08, + "loss": 0.8735, + "num_input_tokens_seen": 1611380736, + "step": 8924 + }, + { + "epoch": 0.9770382331207751, + "grad_norm": 1.0580401222178573, + "learning_rate": 6.456442599916679e-08, + "loss": 0.8455, + "num_input_tokens_seen": 1611552768, + "step": 8925 + }, + { + "epoch": 0.977147705191713, + "grad_norm": 1.121242765830698, + "learning_rate": 6.394832559603048e-08, + "loss": 0.7613, + "num_input_tokens_seen": 1611695008, + "step": 8926 + }, + { + "epoch": 0.9772571772626508, + "grad_norm": 1.2314442960536793, + "learning_rate": 6.333517508013975e-08, + "loss": 1.1046, + "num_input_tokens_seen": 1611891008, + "step": 8927 + }, + { + "epoch": 0.9773666493335887, + "grad_norm": 1.179572447162082, + "learning_rate": 6.272497452402548e-08, + "loss": 0.8751, + "num_input_tokens_seen": 1612056320, + "step": 8928 + }, + { + "epoch": 0.9774761214045267, + "grad_norm": 1.2976400577607452, + "learning_rate": 6.211772399987715e-08, + "loss": 0.88, + "num_input_tokens_seen": 1612241120, + "step": 8929 + }, + { + "epoch": 0.9775855934754646, + "grad_norm": 1.306391169024472, + "learning_rate": 6.151342357952617e-08, + "loss": 1.1554, + "num_input_tokens_seen": 1612416288, + "step": 8930 + }, + { + "epoch": 0.9776950655464025, + "grad_norm": 1.1897944820270328, + "learning_rate": 6.091207333446259e-08, + "loss": 1.2414, + "num_input_tokens_seen": 1612573760, + "step": 8931 + }, + { + "epoch": 0.9778045376173404, + "grad_norm": 1.1206594017862928, + "learning_rate": 6.031367333582949e-08, + "loss": 0.9695, + "num_input_tokens_seen": 1612730112, + "step": 8932 + }, + { + "epoch": 0.9779140096882782, + "grad_norm": 1.0855687565089267, + "learning_rate": 5.971822365440639e-08, + "loss": 0.6102, + "num_input_tokens_seen": 1612899456, + "step": 8933 + }, + { + "epoch": 0.9780234817592162, + "grad_norm": 1.0532935644340151, + "learning_rate": 5.912572436064523e-08, + "loss": 1.0883, + "num_input_tokens_seen": 1613075968, + "step": 8934 + }, + { + "epoch": 0.9781329538301541, + "grad_norm": 1.1092339009562588, + "learning_rate": 5.853617552462887e-08, + "loss": 0.6987, + "num_input_tokens_seen": 1613227392, + "step": 8935 + }, + { + "epoch": 0.978242425901092, + "grad_norm": 1.0135924321026832, + "learning_rate": 5.794957721610428e-08, + "loss": 0.7384, + "num_input_tokens_seen": 1613399200, + "step": 8936 + }, + { + "epoch": 0.9783518979720299, + "grad_norm": 1.2386459150574425, + "learning_rate": 5.7365929504460404e-08, + "loss": 1.0244, + "num_input_tokens_seen": 1613564960, + "step": 8937 + }, + { + "epoch": 0.9784613700429677, + "grad_norm": 1.1805637970181688, + "learning_rate": 5.678523245874756e-08, + "loss": 0.9379, + "num_input_tokens_seen": 1613755136, + "step": 8938 + }, + { + "epoch": 0.9785708421139057, + "grad_norm": 1.114432559406915, + "learning_rate": 5.620748614765803e-08, + "loss": 0.9636, + "num_input_tokens_seen": 1613927616, + "step": 8939 + }, + { + "epoch": 0.9786803141848436, + "grad_norm": 1.3355390166335694, + "learning_rate": 5.563269063953991e-08, + "loss": 0.8928, + "num_input_tokens_seen": 1614135712, + "step": 8940 + }, + { + "epoch": 0.9787897862557815, + "grad_norm": 0.9660557844589118, + "learning_rate": 5.506084600238881e-08, + "loss": 0.6238, + "num_input_tokens_seen": 1614321408, + "step": 8941 + }, + { + "epoch": 0.9788992583267194, + "grad_norm": 1.2701255463699503, + "learning_rate": 5.4491952303850624e-08, + "loss": 1.0358, + "num_input_tokens_seen": 1614481344, + "step": 8942 + }, + { + "epoch": 0.9790087303976573, + "grad_norm": 1.1330636107894718, + "learning_rate": 5.392600961122707e-08, + "loss": 1.0387, + "num_input_tokens_seen": 1614679808, + "step": 8943 + }, + { + "epoch": 0.9791182024685952, + "grad_norm": 0.9989341479659726, + "learning_rate": 5.3363017991470145e-08, + "loss": 0.7523, + "num_input_tokens_seen": 1614847360, + "step": 8944 + }, + { + "epoch": 0.9792276745395331, + "grad_norm": 1.066538987614492, + "learning_rate": 5.280297751117658e-08, + "loss": 1.0129, + "num_input_tokens_seen": 1615044032, + "step": 8945 + }, + { + "epoch": 0.979337146610471, + "grad_norm": 1.134989678592217, + "learning_rate": 5.224588823659893e-08, + "loss": 0.9199, + "num_input_tokens_seen": 1615236448, + "step": 8946 + }, + { + "epoch": 0.9794466186814089, + "grad_norm": 1.2177588516255098, + "learning_rate": 5.169175023364003e-08, + "loss": 0.9877, + "num_input_tokens_seen": 1615457536, + "step": 8947 + }, + { + "epoch": 0.9795560907523468, + "grad_norm": 1.1515102152037944, + "learning_rate": 5.114056356785857e-08, + "loss": 0.8683, + "num_input_tokens_seen": 1615602912, + "step": 8948 + }, + { + "epoch": 0.9796655628232848, + "grad_norm": 1.1488249248523623, + "learning_rate": 5.05923283044496e-08, + "loss": 0.9934, + "num_input_tokens_seen": 1615769568, + "step": 8949 + }, + { + "epoch": 0.9797750348942226, + "grad_norm": 0.9954411481899545, + "learning_rate": 5.004704450827513e-08, + "loss": 0.743, + "num_input_tokens_seen": 1615955936, + "step": 8950 + }, + { + "epoch": 0.9798845069651605, + "grad_norm": 1.1599816084916517, + "learning_rate": 4.9504712243839126e-08, + "loss": 0.9042, + "num_input_tokens_seen": 1616162240, + "step": 8951 + }, + { + "epoch": 0.9799939790360984, + "grad_norm": 1.1848841013345044, + "learning_rate": 4.896533157529859e-08, + "loss": 0.9349, + "num_input_tokens_seen": 1616347712, + "step": 8952 + }, + { + "epoch": 0.9801034511070363, + "grad_norm": 1.1480218044164177, + "learning_rate": 4.842890256646082e-08, + "loss": 0.8569, + "num_input_tokens_seen": 1616497120, + "step": 8953 + }, + { + "epoch": 0.9802129231779743, + "grad_norm": 1.1961962356364009, + "learning_rate": 4.789542528078339e-08, + "loss": 1.0539, + "num_input_tokens_seen": 1616643168, + "step": 8954 + }, + { + "epoch": 0.9803223952489121, + "grad_norm": 1.1895360056497586, + "learning_rate": 4.73648997813797e-08, + "loss": 0.8197, + "num_input_tokens_seen": 1616805120, + "step": 8955 + }, + { + "epoch": 0.98043186731985, + "grad_norm": 1.199097667632715, + "learning_rate": 4.6837326131002336e-08, + "loss": 1.0513, + "num_input_tokens_seen": 1617007616, + "step": 8956 + }, + { + "epoch": 0.9805413393907879, + "grad_norm": 1.1039595259697015, + "learning_rate": 4.63127043920708e-08, + "loss": 1.0188, + "num_input_tokens_seen": 1617166208, + "step": 8957 + }, + { + "epoch": 0.9806508114617258, + "grad_norm": 1.163117197739282, + "learning_rate": 4.579103462664103e-08, + "loss": 1.1022, + "num_input_tokens_seen": 1617345856, + "step": 8958 + }, + { + "epoch": 0.9807602835326638, + "grad_norm": 1.1458037401481547, + "learning_rate": 4.52723168964303e-08, + "loss": 1.048, + "num_input_tokens_seen": 1617523712, + "step": 8959 + }, + { + "epoch": 0.9808697556036017, + "grad_norm": 1.0226063155885154, + "learning_rate": 4.4756551262795096e-08, + "loss": 0.6062, + "num_input_tokens_seen": 1617704480, + "step": 8960 + }, + { + "epoch": 0.9809792276745395, + "grad_norm": 1.1826767595377272, + "learning_rate": 4.424373778675606e-08, + "loss": 0.794, + "num_input_tokens_seen": 1617854112, + "step": 8961 + }, + { + "epoch": 0.9810886997454774, + "grad_norm": 1.1338735412615186, + "learning_rate": 4.373387652897576e-08, + "loss": 0.8626, + "num_input_tokens_seen": 1618029952, + "step": 8962 + }, + { + "epoch": 0.9811981718164153, + "grad_norm": 1.0537312446454634, + "learning_rate": 4.3226967549769845e-08, + "loss": 0.8156, + "num_input_tokens_seen": 1618221920, + "step": 8963 + }, + { + "epoch": 0.9813076438873533, + "grad_norm": 1.0578452979994342, + "learning_rate": 4.2723010909104244e-08, + "loss": 1.0326, + "num_input_tokens_seen": 1618427328, + "step": 8964 + }, + { + "epoch": 0.9814171159582912, + "grad_norm": 0.8912036421437721, + "learning_rate": 4.222200666659515e-08, + "loss": 0.8037, + "num_input_tokens_seen": 1618609888, + "step": 8965 + }, + { + "epoch": 0.9815265880292291, + "grad_norm": 1.258247584564419, + "learning_rate": 4.1723954881511816e-08, + "loss": 1.0293, + "num_input_tokens_seen": 1618783936, + "step": 8966 + }, + { + "epoch": 0.9816360601001669, + "grad_norm": 1.2795895307252727, + "learning_rate": 4.122885561277101e-08, + "loss": 0.9095, + "num_input_tokens_seen": 1618977696, + "step": 8967 + }, + { + "epoch": 0.9817455321711048, + "grad_norm": 0.9832771779583288, + "learning_rate": 4.073670891894532e-08, + "loss": 0.886, + "num_input_tokens_seen": 1619176832, + "step": 8968 + }, + { + "epoch": 0.9818550042420428, + "grad_norm": 1.0342457347668508, + "learning_rate": 4.0247514858252065e-08, + "loss": 0.9492, + "num_input_tokens_seen": 1619348416, + "step": 8969 + }, + { + "epoch": 0.9819644763129807, + "grad_norm": 1.028006278375089, + "learning_rate": 3.97612734885644e-08, + "loss": 0.7255, + "num_input_tokens_seen": 1619550464, + "step": 8970 + }, + { + "epoch": 0.9820739483839186, + "grad_norm": 1.2876421414771075, + "learning_rate": 3.9277984867400196e-08, + "loss": 0.8936, + "num_input_tokens_seen": 1619718016, + "step": 8971 + }, + { + "epoch": 0.9821834204548564, + "grad_norm": 1.2042760915877626, + "learning_rate": 3.879764905193595e-08, + "loss": 0.9367, + "num_input_tokens_seen": 1619916480, + "step": 8972 + }, + { + "epoch": 0.9822928925257943, + "grad_norm": 1.1246594867360282, + "learning_rate": 3.832026609899009e-08, + "loss": 0.8873, + "num_input_tokens_seen": 1620096128, + "step": 8973 + }, + { + "epoch": 0.9824023645967322, + "grad_norm": 1.1017686366925854, + "learning_rate": 3.7845836065039664e-08, + "loss": 0.8523, + "num_input_tokens_seen": 1620272640, + "step": 8974 + }, + { + "epoch": 0.9825118366676702, + "grad_norm": 1.2519406989041826, + "learning_rate": 3.737435900620645e-08, + "loss": 1.1305, + "num_input_tokens_seen": 1620433024, + "step": 8975 + }, + { + "epoch": 0.9826213087386081, + "grad_norm": 1.1602817557423004, + "learning_rate": 3.690583497826528e-08, + "loss": 0.789, + "num_input_tokens_seen": 1620584000, + "step": 8976 + }, + { + "epoch": 0.982730780809546, + "grad_norm": 1.174457755672512, + "learning_rate": 3.644026403664402e-08, + "loss": 1.2681, + "num_input_tokens_seen": 1620780448, + "step": 8977 + }, + { + "epoch": 0.9828402528804838, + "grad_norm": 1.0756893564728751, + "learning_rate": 3.5977646236415306e-08, + "loss": 0.8091, + "num_input_tokens_seen": 1620955392, + "step": 8978 + }, + { + "epoch": 0.9829497249514217, + "grad_norm": 1.0583191124769142, + "learning_rate": 3.551798163231035e-08, + "loss": 0.792, + "num_input_tokens_seen": 1621118016, + "step": 8979 + }, + { + "epoch": 0.9830591970223597, + "grad_norm": 1.0583686166924255, + "learning_rate": 3.506127027870232e-08, + "loss": 0.8992, + "num_input_tokens_seen": 1621302144, + "step": 8980 + }, + { + "epoch": 0.9831686690932976, + "grad_norm": 1.1694936334731074, + "learning_rate": 3.4607512229622993e-08, + "loss": 0.9606, + "num_input_tokens_seen": 1621499264, + "step": 8981 + }, + { + "epoch": 0.9832781411642355, + "grad_norm": 1.0514490777753347, + "learning_rate": 3.415670753874889e-08, + "loss": 0.9206, + "num_input_tokens_seen": 1621696384, + "step": 8982 + }, + { + "epoch": 0.9833876132351734, + "grad_norm": 1.2004812845044672, + "learning_rate": 3.370885625940956e-08, + "loss": 1.0507, + "num_input_tokens_seen": 1621890144, + "step": 8983 + }, + { + "epoch": 0.9834970853061112, + "grad_norm": 1.0165063408514532, + "learning_rate": 3.3263958444582076e-08, + "loss": 0.7291, + "num_input_tokens_seen": 1622071808, + "step": 8984 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 1.180730548056305, + "learning_rate": 3.2822014146902114e-08, + "loss": 1.017, + "num_input_tokens_seen": 1622241376, + "step": 8985 + }, + { + "epoch": 0.9837160294479871, + "grad_norm": 1.061269047075317, + "learning_rate": 3.2383023418650074e-08, + "loss": 1.0597, + "num_input_tokens_seen": 1622440288, + "step": 8986 + }, + { + "epoch": 0.983825501518925, + "grad_norm": 1.0590878280262925, + "learning_rate": 3.1946986311756634e-08, + "loss": 1.0105, + "num_input_tokens_seen": 1622629792, + "step": 8987 + }, + { + "epoch": 0.9839349735898629, + "grad_norm": 1.1621051179182793, + "learning_rate": 3.151390287780276e-08, + "loss": 1.0486, + "num_input_tokens_seen": 1622814592, + "step": 8988 + }, + { + "epoch": 0.9840444456608007, + "grad_norm": 1.0239728735780405, + "learning_rate": 3.108377316801969e-08, + "loss": 0.9022, + "num_input_tokens_seen": 1623006560, + "step": 8989 + }, + { + "epoch": 0.9841539177317387, + "grad_norm": 1.225764361156383, + "learning_rate": 3.065659723329728e-08, + "loss": 1.0511, + "num_input_tokens_seen": 1623185088, + "step": 8990 + }, + { + "epoch": 0.9842633898026766, + "grad_norm": 1.0912128995376245, + "learning_rate": 3.023237512416455e-08, + "loss": 0.7805, + "num_input_tokens_seen": 1623339872, + "step": 8991 + }, + { + "epoch": 0.9843728618736145, + "grad_norm": 1.1827106769631053, + "learning_rate": 2.981110689080913e-08, + "loss": 1.0692, + "num_input_tokens_seen": 1623551552, + "step": 8992 + }, + { + "epoch": 0.9844823339445524, + "grad_norm": 1.210821133836766, + "learning_rate": 2.9392792583066154e-08, + "loss": 1.0629, + "num_input_tokens_seen": 1623748896, + "step": 8993 + }, + { + "epoch": 0.9845918060154903, + "grad_norm": 1.155566400060345, + "learning_rate": 2.8977432250418267e-08, + "loss": 1.0581, + "num_input_tokens_seen": 1623929664, + "step": 8994 + }, + { + "epoch": 0.9847012780864282, + "grad_norm": 1.1856930185452683, + "learning_rate": 2.8565025942001166e-08, + "loss": 0.9712, + "num_input_tokens_seen": 1624095648, + "step": 8995 + }, + { + "epoch": 0.9848107501573661, + "grad_norm": 1.0830858510333417, + "learning_rate": 2.8155573706609152e-08, + "loss": 0.9379, + "num_input_tokens_seen": 1624291200, + "step": 8996 + }, + { + "epoch": 0.984920222228304, + "grad_norm": 0.9664338326122153, + "learning_rate": 2.7749075592670148e-08, + "loss": 0.6978, + "num_input_tokens_seen": 1624448224, + "step": 8997 + }, + { + "epoch": 0.9850296942992419, + "grad_norm": 1.032838465007866, + "learning_rate": 2.734553164827902e-08, + "loss": 0.9129, + "num_input_tokens_seen": 1624633024, + "step": 8998 + }, + { + "epoch": 0.9851391663701798, + "grad_norm": 1.1296642886741135, + "learning_rate": 2.6944941921172585e-08, + "loss": 0.8035, + "num_input_tokens_seen": 1624819840, + "step": 8999 + }, + { + "epoch": 0.9852486384411178, + "grad_norm": 1.1917238906423375, + "learning_rate": 2.654730645873793e-08, + "loss": 1.0822, + "num_input_tokens_seen": 1624968800, + "step": 9000 + }, + { + "epoch": 0.9853581105120556, + "grad_norm": 0.9603760792781908, + "learning_rate": 2.6152625308015212e-08, + "loss": 0.8484, + "num_input_tokens_seen": 1625164352, + "step": 9001 + }, + { + "epoch": 0.9854675825829935, + "grad_norm": 1.2578581807625917, + "learning_rate": 2.576089851569763e-08, + "loss": 0.8313, + "num_input_tokens_seen": 1625326304, + "step": 9002 + }, + { + "epoch": 0.9855770546539314, + "grad_norm": 1.1673146333102566, + "learning_rate": 2.5372126128120345e-08, + "loss": 1.1665, + "num_input_tokens_seen": 1625534176, + "step": 9003 + }, + { + "epoch": 0.9856865267248693, + "grad_norm": 1.0920626567604024, + "learning_rate": 2.4986308191277118e-08, + "loss": 0.9605, + "num_input_tokens_seen": 1625736672, + "step": 9004 + }, + { + "epoch": 0.9857959987958073, + "grad_norm": 1.1859568055281575, + "learning_rate": 2.4603444750811998e-08, + "loss": 0.7496, + "num_input_tokens_seen": 1625883840, + "step": 9005 + }, + { + "epoch": 0.9859054708667451, + "grad_norm": 1.042145021593787, + "learning_rate": 2.4223535852010983e-08, + "loss": 0.7944, + "num_input_tokens_seen": 1626046240, + "step": 9006 + }, + { + "epoch": 0.986014942937683, + "grad_norm": 1.107089468771851, + "learning_rate": 2.384658153982422e-08, + "loss": 0.8618, + "num_input_tokens_seen": 1626240224, + "step": 9007 + }, + { + "epoch": 0.9861244150086209, + "grad_norm": 1.192836665727467, + "learning_rate": 2.347258185883827e-08, + "loss": 1.0287, + "num_input_tokens_seen": 1626426144, + "step": 9008 + }, + { + "epoch": 0.9862338870795588, + "grad_norm": 1.1041138774320671, + "learning_rate": 2.31015368532983e-08, + "loss": 0.7897, + "num_input_tokens_seen": 1626586976, + "step": 9009 + }, + { + "epoch": 0.9863433591504968, + "grad_norm": 1.0893205049752388, + "learning_rate": 2.2733446567099747e-08, + "loss": 0.9115, + "num_input_tokens_seen": 1626755872, + "step": 9010 + }, + { + "epoch": 0.9864528312214347, + "grad_norm": 1.0244887100238087, + "learning_rate": 2.236831104378556e-08, + "loss": 0.6701, + "num_input_tokens_seen": 1626938432, + "step": 9011 + }, + { + "epoch": 0.9865623032923725, + "grad_norm": 1.2379885065750023, + "learning_rate": 2.2006130326551745e-08, + "loss": 0.9218, + "num_input_tokens_seen": 1627142272, + "step": 9012 + }, + { + "epoch": 0.9866717753633104, + "grad_norm": 1.0995492596904328, + "learning_rate": 2.16469044582418e-08, + "loss": 0.6784, + "num_input_tokens_seen": 1627328416, + "step": 9013 + }, + { + "epoch": 0.9867812474342483, + "grad_norm": 1.107156604031069, + "learning_rate": 2.129063348135507e-08, + "loss": 0.8195, + "num_input_tokens_seen": 1627513664, + "step": 9014 + }, + { + "epoch": 0.9868907195051863, + "grad_norm": 1.0544324213208296, + "learning_rate": 2.0937317438032844e-08, + "loss": 1.1298, + "num_input_tokens_seen": 1627709216, + "step": 9015 + }, + { + "epoch": 0.9870001915761242, + "grad_norm": 1.0374980995436578, + "learning_rate": 2.0586956370075018e-08, + "loss": 0.7539, + "num_input_tokens_seen": 1627878112, + "step": 9016 + }, + { + "epoch": 0.9871096636470621, + "grad_norm": 1.097658367765834, + "learning_rate": 2.0239550318926215e-08, + "loss": 1.1311, + "num_input_tokens_seen": 1628060896, + "step": 9017 + }, + { + "epoch": 0.9872191357179999, + "grad_norm": 1.1463748288173712, + "learning_rate": 1.9895099325686894e-08, + "loss": 1.0293, + "num_input_tokens_seen": 1628242784, + "step": 9018 + }, + { + "epoch": 0.9873286077889378, + "grad_norm": 1.0757178371691423, + "learning_rate": 1.955360343110224e-08, + "loss": 0.8484, + "num_input_tokens_seen": 1628446624, + "step": 9019 + }, + { + "epoch": 0.9874380798598758, + "grad_norm": 1.2063091303113889, + "learning_rate": 1.921506267557327e-08, + "loss": 0.8816, + "num_input_tokens_seen": 1628632768, + "step": 9020 + }, + { + "epoch": 0.9875475519308137, + "grad_norm": 1.1661033405684977, + "learning_rate": 1.8879477099145726e-08, + "loss": 1.1487, + "num_input_tokens_seen": 1628825856, + "step": 9021 + }, + { + "epoch": 0.9876570240017516, + "grad_norm": 1.049395530277197, + "learning_rate": 1.8546846741521184e-08, + "loss": 0.8833, + "num_input_tokens_seen": 1628998112, + "step": 9022 + }, + { + "epoch": 0.9877664960726894, + "grad_norm": 1.1242257208803546, + "learning_rate": 1.8217171642048726e-08, + "loss": 0.9563, + "num_input_tokens_seen": 1629189632, + "step": 9023 + }, + { + "epoch": 0.9878759681436273, + "grad_norm": 1.3069997616091265, + "learning_rate": 1.7890451839727707e-08, + "loss": 1.1399, + "num_input_tokens_seen": 1629393696, + "step": 9024 + }, + { + "epoch": 0.9879854402145652, + "grad_norm": 1.1702244133388082, + "learning_rate": 1.756668737320777e-08, + "loss": 0.8137, + "num_input_tokens_seen": 1629550496, + "step": 9025 + }, + { + "epoch": 0.9880949122855032, + "grad_norm": 1.0393953865699053, + "learning_rate": 1.7245878280791606e-08, + "loss": 0.86, + "num_input_tokens_seen": 1629728128, + "step": 9026 + }, + { + "epoch": 0.9882043843564411, + "grad_norm": 1.0331978175814769, + "learning_rate": 1.692802460042664e-08, + "loss": 0.6698, + "num_input_tokens_seen": 1629894784, + "step": 9027 + }, + { + "epoch": 0.988313856427379, + "grad_norm": 1.046976283853226, + "learning_rate": 1.66131263697189e-08, + "loss": 0.7303, + "num_input_tokens_seen": 1630068384, + "step": 9028 + }, + { + "epoch": 0.9884233284983168, + "grad_norm": 1.0500224610605668, + "learning_rate": 1.630118362591915e-08, + "loss": 0.8252, + "num_input_tokens_seen": 1630201440, + "step": 9029 + }, + { + "epoch": 0.9885328005692547, + "grad_norm": 1.0812769715577926, + "learning_rate": 1.5992196405925642e-08, + "loss": 0.7654, + "num_input_tokens_seen": 1630383776, + "step": 9030 + }, + { + "epoch": 0.9886422726401927, + "grad_norm": 1.0489859293565849, + "learning_rate": 1.568616474629525e-08, + "loss": 0.6067, + "num_input_tokens_seen": 1630543040, + "step": 9031 + }, + { + "epoch": 0.9887517447111306, + "grad_norm": 1.0485410954649246, + "learning_rate": 1.5383088683229574e-08, + "loss": 0.7099, + "num_input_tokens_seen": 1630716416, + "step": 9032 + }, + { + "epoch": 0.9888612167820685, + "grad_norm": 1.1583714483471552, + "learning_rate": 1.5082968252583263e-08, + "loss": 0.8025, + "num_input_tokens_seen": 1630883520, + "step": 9033 + }, + { + "epoch": 0.9889706888530064, + "grad_norm": 1.0597550894652603, + "learning_rate": 1.4785803489858474e-08, + "loss": 0.8531, + "num_input_tokens_seen": 1631082880, + "step": 9034 + }, + { + "epoch": 0.9890801609239442, + "grad_norm": 1.2639654605619053, + "learning_rate": 1.4491594430207645e-08, + "loss": 0.973, + "num_input_tokens_seen": 1631262976, + "step": 9035 + }, + { + "epoch": 0.9891896329948822, + "grad_norm": 1.0900464706839004, + "learning_rate": 1.4200341108439042e-08, + "loss": 0.9446, + "num_input_tokens_seen": 1631457632, + "step": 9036 + }, + { + "epoch": 0.9892991050658201, + "grad_norm": 1.1797893555986951, + "learning_rate": 1.3912043559005661e-08, + "loss": 0.9806, + "num_input_tokens_seen": 1631667296, + "step": 9037 + }, + { + "epoch": 0.989408577136758, + "grad_norm": 1.1190518914907384, + "learning_rate": 1.3626701816010778e-08, + "loss": 0.871, + "num_input_tokens_seen": 1631859040, + "step": 9038 + }, + { + "epoch": 0.9895180492076959, + "grad_norm": 1.1688181872544903, + "learning_rate": 1.3344315913210725e-08, + "loss": 0.7274, + "num_input_tokens_seen": 1632046304, + "step": 9039 + }, + { + "epoch": 0.9896275212786337, + "grad_norm": 1.112413302268717, + "learning_rate": 1.3064885884012112e-08, + "loss": 0.8064, + "num_input_tokens_seen": 1632251040, + "step": 9040 + }, + { + "epoch": 0.9897369933495717, + "grad_norm": 1.0829283923377453, + "learning_rate": 1.278841176147183e-08, + "loss": 0.9352, + "num_input_tokens_seen": 1632424864, + "step": 9041 + }, + { + "epoch": 0.9898464654205096, + "grad_norm": 1.2836828162601288, + "learning_rate": 1.2514893578294274e-08, + "loss": 0.9728, + "num_input_tokens_seen": 1632613248, + "step": 9042 + }, + { + "epoch": 0.9899559374914475, + "grad_norm": 1.0495957845383694, + "learning_rate": 1.2244331366836892e-08, + "loss": 0.773, + "num_input_tokens_seen": 1632790656, + "step": 9043 + }, + { + "epoch": 0.9900654095623854, + "grad_norm": 1.1046752238367128, + "learning_rate": 1.1976725159107415e-08, + "loss": 0.7169, + "num_input_tokens_seen": 1632921248, + "step": 9044 + }, + { + "epoch": 0.9901748816333233, + "grad_norm": 1.1900864592166263, + "learning_rate": 1.1712074986761079e-08, + "loss": 0.9976, + "num_input_tokens_seen": 1633111648, + "step": 9045 + }, + { + "epoch": 0.9902843537042612, + "grad_norm": 1.1837316250205756, + "learning_rate": 1.1450380881106171e-08, + "loss": 0.863, + "num_input_tokens_seen": 1633312800, + "step": 9046 + }, + { + "epoch": 0.9903938257751991, + "grad_norm": 1.1222227572344354, + "learning_rate": 1.1191642873104036e-08, + "loss": 0.6596, + "num_input_tokens_seen": 1633488864, + "step": 9047 + }, + { + "epoch": 0.990503297846137, + "grad_norm": 1.2175018081024969, + "learning_rate": 1.0935860993357971e-08, + "loss": 0.798, + "num_input_tokens_seen": 1633666720, + "step": 9048 + }, + { + "epoch": 0.9906127699170749, + "grad_norm": 1.2251650874166145, + "learning_rate": 1.0683035272127107e-08, + "loss": 0.9445, + "num_input_tokens_seen": 1633856224, + "step": 9049 + }, + { + "epoch": 0.9907222419880128, + "grad_norm": 1.026917750002146, + "learning_rate": 1.0433165739323625e-08, + "loss": 0.7118, + "num_input_tokens_seen": 1634033408, + "step": 9050 + }, + { + "epoch": 0.9908317140589508, + "grad_norm": 1.123587516350417, + "learning_rate": 1.0186252424504439e-08, + "loss": 0.6755, + "num_input_tokens_seen": 1634234784, + "step": 9051 + }, + { + "epoch": 0.9909411861298886, + "grad_norm": 1.1359007208955447, + "learning_rate": 9.942295356879517e-09, + "loss": 1.0072, + "num_input_tokens_seen": 1634433472, + "step": 9052 + }, + { + "epoch": 0.9910506582008265, + "grad_norm": 1.0660909668719627, + "learning_rate": 9.701294565309105e-09, + "loss": 1.1832, + "num_input_tokens_seen": 1634616256, + "step": 9053 + }, + { + "epoch": 0.9911601302717644, + "grad_norm": 1.2904690553604985, + "learning_rate": 9.463250078300955e-09, + "loss": 0.7886, + "num_input_tokens_seen": 1634806208, + "step": 9054 + }, + { + "epoch": 0.9912696023427023, + "grad_norm": 1.2301981794997183, + "learning_rate": 9.228161924015877e-09, + "loss": 0.9664, + "num_input_tokens_seen": 1634977120, + "step": 9055 + }, + { + "epoch": 0.9913790744136403, + "grad_norm": 1.0998162029342977, + "learning_rate": 8.99603013026773e-09, + "loss": 0.9962, + "num_input_tokens_seen": 1635175136, + "step": 9056 + }, + { + "epoch": 0.9914885464845781, + "grad_norm": 1.0568604866353266, + "learning_rate": 8.766854724509555e-09, + "loss": 0.9536, + "num_input_tokens_seen": 1635353664, + "step": 9057 + }, + { + "epoch": 0.991598018555516, + "grad_norm": 1.0919223774684914, + "learning_rate": 8.540635733861325e-09, + "loss": 0.7651, + "num_input_tokens_seen": 1635531968, + "step": 9058 + }, + { + "epoch": 0.9917074906264539, + "grad_norm": 1.1286453420757967, + "learning_rate": 8.317373185079413e-09, + "loss": 0.9625, + "num_input_tokens_seen": 1635738048, + "step": 9059 + }, + { + "epoch": 0.9918169626973918, + "grad_norm": 1.1745987836171872, + "learning_rate": 8.097067104576029e-09, + "loss": 1.0214, + "num_input_tokens_seen": 1635904928, + "step": 9060 + }, + { + "epoch": 0.9919264347683298, + "grad_norm": 1.1876608087175904, + "learning_rate": 7.879717518413654e-09, + "loss": 0.8122, + "num_input_tokens_seen": 1636081440, + "step": 9061 + }, + { + "epoch": 0.9920359068392677, + "grad_norm": 1.122744736870827, + "learning_rate": 7.66532445230228e-09, + "loss": 0.9507, + "num_input_tokens_seen": 1636274976, + "step": 9062 + }, + { + "epoch": 0.9921453789102055, + "grad_norm": 1.0914958613968595, + "learning_rate": 7.453887931607728e-09, + "loss": 0.9778, + "num_input_tokens_seen": 1636476800, + "step": 9063 + }, + { + "epoch": 0.9922548509811434, + "grad_norm": 1.0073730930263458, + "learning_rate": 7.2454079813405465e-09, + "loss": 0.9354, + "num_input_tokens_seen": 1636672576, + "step": 9064 + }, + { + "epoch": 0.9923643230520813, + "grad_norm": 1.242094936160963, + "learning_rate": 7.039884626164339e-09, + "loss": 0.9738, + "num_input_tokens_seen": 1636825344, + "step": 9065 + }, + { + "epoch": 0.9924737951230193, + "grad_norm": 1.079626116408539, + "learning_rate": 6.83731789038744e-09, + "loss": 1.1323, + "num_input_tokens_seen": 1637027840, + "step": 9066 + }, + { + "epoch": 0.9925832671939572, + "grad_norm": 1.141262555591253, + "learning_rate": 6.637707797979564e-09, + "loss": 0.8698, + "num_input_tokens_seen": 1637210848, + "step": 9067 + }, + { + "epoch": 0.9926927392648951, + "grad_norm": 1.1826750963997272, + "learning_rate": 6.44105437255238e-09, + "loss": 0.8906, + "num_input_tokens_seen": 1637398112, + "step": 9068 + }, + { + "epoch": 0.9928022113358329, + "grad_norm": 1.2686622344982152, + "learning_rate": 6.247357637367834e-09, + "loss": 0.8269, + "num_input_tokens_seen": 1637601280, + "step": 9069 + }, + { + "epoch": 0.9929116834067708, + "grad_norm": 1.0586297849032849, + "learning_rate": 6.056617615340931e-09, + "loss": 0.8737, + "num_input_tokens_seen": 1637776896, + "step": 9070 + }, + { + "epoch": 0.9930211554777088, + "grad_norm": 1.2573847546865098, + "learning_rate": 5.868834329036954e-09, + "loss": 0.8278, + "num_input_tokens_seen": 1637931232, + "step": 9071 + }, + { + "epoch": 0.9931306275486467, + "grad_norm": 1.2977802380437988, + "learning_rate": 5.684007800668689e-09, + "loss": 0.9059, + "num_input_tokens_seen": 1638106400, + "step": 9072 + }, + { + "epoch": 0.9932400996195846, + "grad_norm": 1.173935210917825, + "learning_rate": 5.50213805210198e-09, + "loss": 1.0028, + "num_input_tokens_seen": 1638288512, + "step": 9073 + }, + { + "epoch": 0.9933495716905224, + "grad_norm": 1.1063636419687082, + "learning_rate": 5.3232251048473956e-09, + "loss": 0.7726, + "num_input_tokens_seen": 1638480032, + "step": 9074 + }, + { + "epoch": 0.9934590437614603, + "grad_norm": 0.9626945930551019, + "learning_rate": 5.147268980076891e-09, + "loss": 0.8973, + "num_input_tokens_seen": 1638698880, + "step": 9075 + }, + { + "epoch": 0.9935685158323982, + "grad_norm": 1.1528212253140053, + "learning_rate": 4.974269698601597e-09, + "loss": 0.841, + "num_input_tokens_seen": 1638887712, + "step": 9076 + }, + { + "epoch": 0.9936779879033362, + "grad_norm": 1.275434227471194, + "learning_rate": 4.804227280888473e-09, + "loss": 1.1056, + "num_input_tokens_seen": 1639076096, + "step": 9077 + }, + { + "epoch": 0.9937874599742741, + "grad_norm": 1.004309072461239, + "learning_rate": 4.637141747051987e-09, + "loss": 0.7488, + "num_input_tokens_seen": 1639248128, + "step": 9078 + }, + { + "epoch": 0.993896932045212, + "grad_norm": 1.101647448138852, + "learning_rate": 4.473013116859659e-09, + "loss": 0.796, + "num_input_tokens_seen": 1639449504, + "step": 9079 + }, + { + "epoch": 0.9940064041161498, + "grad_norm": 1.2643344563024033, + "learning_rate": 4.311841409723738e-09, + "loss": 0.9079, + "num_input_tokens_seen": 1639632960, + "step": 9080 + }, + { + "epoch": 0.9941158761870877, + "grad_norm": 1.1411886520860446, + "learning_rate": 4.153626644715081e-09, + "loss": 0.8292, + "num_input_tokens_seen": 1639814848, + "step": 9081 + }, + { + "epoch": 0.9942253482580257, + "grad_norm": 1.1035185417204263, + "learning_rate": 3.998368840549271e-09, + "loss": 1.1794, + "num_input_tokens_seen": 1640018240, + "step": 9082 + }, + { + "epoch": 0.9943348203289636, + "grad_norm": 1.146233032333937, + "learning_rate": 3.8460680155921746e-09, + "loss": 0.7232, + "num_input_tokens_seen": 1640209088, + "step": 9083 + }, + { + "epoch": 0.9944442923999015, + "grad_norm": 1.1280025574235806, + "learning_rate": 3.6967241878599347e-09, + "loss": 1.1266, + "num_input_tokens_seen": 1640398144, + "step": 9084 + }, + { + "epoch": 0.9945537644708394, + "grad_norm": 1.2502053568954563, + "learning_rate": 3.550337375018975e-09, + "loss": 1.0705, + "num_input_tokens_seen": 1640599968, + "step": 9085 + }, + { + "epoch": 0.9946632365417772, + "grad_norm": 1.0722808003003168, + "learning_rate": 3.406907594388775e-09, + "loss": 0.822, + "num_input_tokens_seen": 1640798656, + "step": 9086 + }, + { + "epoch": 0.9947727086127152, + "grad_norm": 1.109627278615991, + "learning_rate": 3.2664348629363183e-09, + "loss": 0.8315, + "num_input_tokens_seen": 1640993760, + "step": 9087 + }, + { + "epoch": 0.9948821806836531, + "grad_norm": 1.076279927279576, + "learning_rate": 3.1289191972816435e-09, + "loss": 0.7504, + "num_input_tokens_seen": 1641179456, + "step": 9088 + }, + { + "epoch": 0.994991652754591, + "grad_norm": 1.084873474056728, + "learning_rate": 2.994360613686742e-09, + "loss": 1.1492, + "num_input_tokens_seen": 1641367168, + "step": 9089 + }, + { + "epoch": 0.9951011248255289, + "grad_norm": 1.0061069220948307, + "learning_rate": 2.862759128072212e-09, + "loss": 0.7467, + "num_input_tokens_seen": 1641579520, + "step": 9090 + }, + { + "epoch": 0.9952105968964667, + "grad_norm": 1.0809088990308806, + "learning_rate": 2.734114756008932e-09, + "loss": 1.2505, + "num_input_tokens_seen": 1641769024, + "step": 9091 + }, + { + "epoch": 0.9953200689674047, + "grad_norm": 1.1115907918457382, + "learning_rate": 2.6084275127125078e-09, + "loss": 1.1067, + "num_input_tokens_seen": 1641945312, + "step": 9092 + }, + { + "epoch": 0.9954295410383426, + "grad_norm": 1.3079723192138242, + "learning_rate": 2.485697413051602e-09, + "loss": 0.8964, + "num_input_tokens_seen": 1642090016, + "step": 9093 + }, + { + "epoch": 0.9955390131092805, + "grad_norm": 1.0486596112407007, + "learning_rate": 2.365924471547931e-09, + "loss": 0.8095, + "num_input_tokens_seen": 1642273248, + "step": 9094 + }, + { + "epoch": 0.9956484851802184, + "grad_norm": 1.0390263601342524, + "learning_rate": 2.2491087023651657e-09, + "loss": 0.7469, + "num_input_tokens_seen": 1642460512, + "step": 9095 + }, + { + "epoch": 0.9957579572511563, + "grad_norm": 1.0597224675638346, + "learning_rate": 2.1352501193255824e-09, + "loss": 1.0862, + "num_input_tokens_seen": 1642660096, + "step": 9096 + }, + { + "epoch": 0.9958674293220942, + "grad_norm": 1.1144072075605937, + "learning_rate": 2.0243487358989623e-09, + "loss": 0.8892, + "num_input_tokens_seen": 1642850272, + "step": 9097 + }, + { + "epoch": 0.9959769013930321, + "grad_norm": 1.1112169904064941, + "learning_rate": 1.9164045652053655e-09, + "loss": 0.8041, + "num_input_tokens_seen": 1643012000, + "step": 9098 + }, + { + "epoch": 0.99608637346397, + "grad_norm": 1.2759732701661852, + "learning_rate": 1.8114176200123567e-09, + "loss": 1.183, + "num_input_tokens_seen": 1643208224, + "step": 9099 + }, + { + "epoch": 0.9961958455349079, + "grad_norm": 1.114889061373885, + "learning_rate": 1.709387912737781e-09, + "loss": 0.8781, + "num_input_tokens_seen": 1643364800, + "step": 9100 + }, + { + "epoch": 0.9963053176058458, + "grad_norm": 1.2247596660667852, + "learning_rate": 1.6103154554553135e-09, + "loss": 0.9975, + "num_input_tokens_seen": 1643532352, + "step": 9101 + }, + { + "epoch": 0.9964147896767838, + "grad_norm": 1.064373720339657, + "learning_rate": 1.5142002598833581e-09, + "loss": 1.0536, + "num_input_tokens_seen": 1643704832, + "step": 9102 + }, + { + "epoch": 0.9965242617477216, + "grad_norm": 1.2032410701893521, + "learning_rate": 1.4210423373933746e-09, + "loss": 0.8358, + "num_input_tokens_seen": 1643816160, + "step": 9103 + }, + { + "epoch": 0.9966337338186595, + "grad_norm": 1.3002393813914435, + "learning_rate": 1.330841699004326e-09, + "loss": 1.0104, + "num_input_tokens_seen": 1643993568, + "step": 9104 + }, + { + "epoch": 0.9967432058895974, + "grad_norm": 1.1209253058343207, + "learning_rate": 1.2435983553882314e-09, + "loss": 0.856, + "num_input_tokens_seen": 1644159328, + "step": 9105 + }, + { + "epoch": 0.9968526779605353, + "grad_norm": 1.110767779685141, + "learning_rate": 1.1593123168646137e-09, + "loss": 0.8713, + "num_input_tokens_seen": 1644339200, + "step": 9106 + }, + { + "epoch": 0.9969621500314733, + "grad_norm": 1.007227469091209, + "learning_rate": 1.0779835934032755e-09, + "loss": 0.9679, + "num_input_tokens_seen": 1644517952, + "step": 9107 + }, + { + "epoch": 0.9970716221024111, + "grad_norm": 1.235404394173523, + "learning_rate": 9.996121946270753e-10, + "loss": 0.9056, + "num_input_tokens_seen": 1644713504, + "step": 9108 + }, + { + "epoch": 0.997181094173349, + "grad_norm": 1.2521571143442787, + "learning_rate": 9.241981298091506e-10, + "loss": 1.0041, + "num_input_tokens_seen": 1644880832, + "step": 9109 + }, + { + "epoch": 0.9972905662442869, + "grad_norm": 1.2700881910115858, + "learning_rate": 8.517414078645925e-10, + "loss": 0.7762, + "num_input_tokens_seen": 1645044576, + "step": 9110 + }, + { + "epoch": 0.9974000383152248, + "grad_norm": 1.2472499637355252, + "learning_rate": 7.822420373726491e-10, + "loss": 0.9477, + "num_input_tokens_seen": 1645214592, + "step": 9111 + }, + { + "epoch": 0.9975095103861628, + "grad_norm": 1.2347735473803452, + "learning_rate": 7.157000265489711e-10, + "loss": 1.1317, + "num_input_tokens_seen": 1645404096, + "step": 9112 + }, + { + "epoch": 0.9976189824571007, + "grad_norm": 1.0770493268978474, + "learning_rate": 6.521153832678151e-10, + "loss": 0.7154, + "num_input_tokens_seen": 1645558880, + "step": 9113 + }, + { + "epoch": 0.9977284545280385, + "grad_norm": 1.1807734562605396, + "learning_rate": 5.914881150509422e-10, + "loss": 0.8393, + "num_input_tokens_seen": 1645694848, + "step": 9114 + }, + { + "epoch": 0.9978379265989764, + "grad_norm": 1.0546046471724693, + "learning_rate": 5.33818229070393e-10, + "loss": 0.9212, + "num_input_tokens_seen": 1645879648, + "step": 9115 + }, + { + "epoch": 0.9979473986699143, + "grad_norm": 1.3080337920563039, + "learning_rate": 4.791057321484882e-10, + "loss": 0.9202, + "num_input_tokens_seen": 1646073856, + "step": 9116 + }, + { + "epoch": 0.9980568707408523, + "grad_norm": 1.225030756583732, + "learning_rate": 4.273506307550523e-10, + "loss": 0.8831, + "num_input_tokens_seen": 1646248576, + "step": 9117 + }, + { + "epoch": 0.9981663428117902, + "grad_norm": 1.079834067171953, + "learning_rate": 3.785529310185165e-10, + "loss": 0.9692, + "num_input_tokens_seen": 1646420160, + "step": 9118 + }, + { + "epoch": 0.9982758148827281, + "grad_norm": 1.1065163652681058, + "learning_rate": 3.327126387064894e-10, + "loss": 0.8842, + "num_input_tokens_seen": 1646585696, + "step": 9119 + }, + { + "epoch": 0.9983852869536659, + "grad_norm": 1.146186386972348, + "learning_rate": 2.898297592424104e-10, + "loss": 0.828, + "num_input_tokens_seen": 1646765792, + "step": 9120 + }, + { + "epoch": 0.9984947590246038, + "grad_norm": 1.0614178139112662, + "learning_rate": 2.499042976999988e-10, + "loss": 0.7106, + "num_input_tokens_seen": 1646930656, + "step": 9121 + }, + { + "epoch": 0.9986042310955418, + "grad_norm": 1.2714912660473545, + "learning_rate": 2.1293625880325352e-10, + "loss": 0.8705, + "num_input_tokens_seen": 1647114560, + "step": 9122 + }, + { + "epoch": 0.9987137031664797, + "grad_norm": 1.1913594721381673, + "learning_rate": 1.7892564692367775e-10, + "loss": 0.8217, + "num_input_tokens_seen": 1647317504, + "step": 9123 + }, + { + "epoch": 0.9988231752374176, + "grad_norm": 1.2194616734111414, + "learning_rate": 1.478724660886055e-10, + "loss": 0.8254, + "num_input_tokens_seen": 1647472512, + "step": 9124 + }, + { + "epoch": 0.9989326473083554, + "grad_norm": 1.146847928842468, + "learning_rate": 1.1977671996732388e-10, + "loss": 0.9125, + "num_input_tokens_seen": 1647639840, + "step": 9125 + }, + { + "epoch": 0.9990421193792933, + "grad_norm": 1.0425556522032668, + "learning_rate": 9.463841188217527e-11, + "loss": 0.8639, + "num_input_tokens_seen": 1647810976, + "step": 9126 + }, + { + "epoch": 0.9991515914502312, + "grad_norm": 1.0787739113890433, + "learning_rate": 7.245754481133294e-11, + "loss": 1.132, + "num_input_tokens_seen": 1648004064, + "step": 9127 + }, + { + "epoch": 0.9992610635211692, + "grad_norm": 0.9974236380716206, + "learning_rate": 5.3234121377698785e-11, + "loss": 0.7971, + "num_input_tokens_seen": 1648192000, + "step": 9128 + }, + { + "epoch": 0.9993705355921071, + "grad_norm": 0.9546933637523226, + "learning_rate": 3.696814385445446e-11, + "loss": 0.6711, + "num_input_tokens_seen": 1648376800, + "step": 9129 + }, + { + "epoch": 0.999480007663045, + "grad_norm": 1.0958920892162078, + "learning_rate": 2.365961416506135e-11, + "loss": 1.0707, + "num_input_tokens_seen": 1648559808, + "step": 9130 + }, + { + "epoch": 0.9995894797339828, + "grad_norm": 1.231024722485542, + "learning_rate": 1.3308533886036145e-11, + "loss": 0.8996, + "num_input_tokens_seen": 1648714144, + "step": 9131 + }, + { + "epoch": 0.9996989518049207, + "grad_norm": 1.0017347785739568, + "learning_rate": 5.914904241399732e-12, + "loss": 0.7928, + "num_input_tokens_seen": 1648911264, + "step": 9132 + }, + { + "epoch": 0.9998084238758587, + "grad_norm": 1.2572936889497326, + "learning_rate": 1.4787261026771859e-12, + "loss": 0.8593, + "num_input_tokens_seen": 1649101216, + "step": 9133 + }, + { + "epoch": 0.9999178959467966, + "grad_norm": 0.9670324481679939, + "learning_rate": 0.0, + "loss": 0.7529, + "num_input_tokens_seen": 1649269888, + "step": 9134 + }, + { + "epoch": 0.9999178959467966, + "num_input_tokens_seen": 1649269888, + "step": 9134, + "total_flos": 3298213988794368.0, + "train_loss": 0.7717254485568724, + "train_runtime": 600800.1758, + "train_samples_per_second": 3.406, + "train_steps_per_second": 0.015 + } + ], + "logging_steps": 1.0, + "max_steps": 9134, + "num_input_tokens_seen": 1649269888, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3298213988794368.0, + "train_batch_size": 28, + "trial_name": null, + "trial_params": null +}