| { |
| "best_global_step": 10602, |
| "best_metric": 0.6984838843345642, |
| "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_openbookqa_1754652174/checkpoint-10602", |
| "epoch": 10.0, |
| "eval_steps": 558, |
| "global_step": 11160, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004480286738351254, |
| "grad_norm": 2.500624895095825, |
| "learning_rate": 1.7921146953405018e-07, |
| "loss": 11.4359, |
| "num_input_tokens_seen": 1792, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.008960573476702509, |
| "grad_norm": 2.8934195041656494, |
| "learning_rate": 4.032258064516129e-07, |
| "loss": 11.1971, |
| "num_input_tokens_seen": 3776, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.013440860215053764, |
| "grad_norm": 3.4254393577575684, |
| "learning_rate": 6.272401433691756e-07, |
| "loss": 10.9754, |
| "num_input_tokens_seen": 5632, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.017921146953405017, |
| "grad_norm": 2.503443479537964, |
| "learning_rate": 8.512544802867385e-07, |
| "loss": 11.1833, |
| "num_input_tokens_seen": 7392, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.022401433691756272, |
| "grad_norm": 2.3445799350738525, |
| "learning_rate": 1.0752688172043011e-06, |
| "loss": 11.1593, |
| "num_input_tokens_seen": 9312, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.026881720430107527, |
| "grad_norm": 2.792506694793701, |
| "learning_rate": 1.2992831541218638e-06, |
| "loss": 11.0546, |
| "num_input_tokens_seen": 11104, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.03136200716845878, |
| "grad_norm": 2.593705892562866, |
| "learning_rate": 1.5232974910394266e-06, |
| "loss": 11.0171, |
| "num_input_tokens_seen": 12928, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.035842293906810034, |
| "grad_norm": 2.4072728157043457, |
| "learning_rate": 1.7473118279569893e-06, |
| "loss": 11.0708, |
| "num_input_tokens_seen": 14816, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.04032258064516129, |
| "grad_norm": 2.383638858795166, |
| "learning_rate": 1.971326164874552e-06, |
| "loss": 11.3111, |
| "num_input_tokens_seen": 16832, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.044802867383512544, |
| "grad_norm": 2.162566661834717, |
| "learning_rate": 2.1953405017921145e-06, |
| "loss": 11.2697, |
| "num_input_tokens_seen": 18560, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0492831541218638, |
| "grad_norm": 2.1702215671539307, |
| "learning_rate": 2.4193548387096776e-06, |
| "loss": 10.932, |
| "num_input_tokens_seen": 20416, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.053763440860215055, |
| "grad_norm": 2.853740930557251, |
| "learning_rate": 2.6433691756272402e-06, |
| "loss": 10.8429, |
| "num_input_tokens_seen": 22144, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.05824372759856631, |
| "grad_norm": 2.471186876296997, |
| "learning_rate": 2.867383512544803e-06, |
| "loss": 11.1565, |
| "num_input_tokens_seen": 24064, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.06272401433691756, |
| "grad_norm": 2.382014274597168, |
| "learning_rate": 3.091397849462366e-06, |
| "loss": 11.0918, |
| "num_input_tokens_seen": 25920, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.06720430107526881, |
| "grad_norm": 2.6635334491729736, |
| "learning_rate": 3.3154121863799286e-06, |
| "loss": 10.9299, |
| "num_input_tokens_seen": 27744, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.07168458781362007, |
| "grad_norm": 2.3445963859558105, |
| "learning_rate": 3.5394265232974912e-06, |
| "loss": 11.2613, |
| "num_input_tokens_seen": 29664, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.07616487455197132, |
| "grad_norm": 2.5724005699157715, |
| "learning_rate": 3.763440860215054e-06, |
| "loss": 10.9325, |
| "num_input_tokens_seen": 31552, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 2.4035773277282715, |
| "learning_rate": 3.987455197132617e-06, |
| "loss": 10.9375, |
| "num_input_tokens_seen": 33536, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.08512544802867383, |
| "grad_norm": 2.6278600692749023, |
| "learning_rate": 4.21146953405018e-06, |
| "loss": 11.0113, |
| "num_input_tokens_seen": 35360, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.08960573476702509, |
| "grad_norm": 2.3494575023651123, |
| "learning_rate": 4.435483870967742e-06, |
| "loss": 11.3767, |
| "num_input_tokens_seen": 37216, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.09408602150537634, |
| "grad_norm": 2.230680227279663, |
| "learning_rate": 4.659498207885305e-06, |
| "loss": 10.8387, |
| "num_input_tokens_seen": 39104, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.0985663082437276, |
| "grad_norm": 2.627718448638916, |
| "learning_rate": 4.883512544802868e-06, |
| "loss": 10.819, |
| "num_input_tokens_seen": 40960, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.10304659498207885, |
| "grad_norm": 2.3713722229003906, |
| "learning_rate": 5.1075268817204305e-06, |
| "loss": 10.9145, |
| "num_input_tokens_seen": 42880, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.10752688172043011, |
| "grad_norm": 2.1317410469055176, |
| "learning_rate": 5.331541218637993e-06, |
| "loss": 10.9142, |
| "num_input_tokens_seen": 44768, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.11200716845878136, |
| "grad_norm": 3.0103211402893066, |
| "learning_rate": 5.555555555555556e-06, |
| "loss": 10.8537, |
| "num_input_tokens_seen": 46528, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.11648745519713262, |
| "grad_norm": 2.6022181510925293, |
| "learning_rate": 5.779569892473118e-06, |
| "loss": 10.5195, |
| "num_input_tokens_seen": 48544, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.12096774193548387, |
| "grad_norm": 2.920191764831543, |
| "learning_rate": 6.003584229390681e-06, |
| "loss": 10.644, |
| "num_input_tokens_seen": 50432, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.12544802867383512, |
| "grad_norm": 2.679093837738037, |
| "learning_rate": 6.227598566308244e-06, |
| "loss": 10.5572, |
| "num_input_tokens_seen": 52416, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.12992831541218638, |
| "grad_norm": 2.975313901901245, |
| "learning_rate": 6.451612903225806e-06, |
| "loss": 10.418, |
| "num_input_tokens_seen": 54304, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.13440860215053763, |
| "grad_norm": 2.0430831909179688, |
| "learning_rate": 6.67562724014337e-06, |
| "loss": 10.6238, |
| "num_input_tokens_seen": 56256, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.1388888888888889, |
| "grad_norm": 2.5662593841552734, |
| "learning_rate": 6.8996415770609325e-06, |
| "loss": 10.529, |
| "num_input_tokens_seen": 58144, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.14336917562724014, |
| "grad_norm": 2.524044990539551, |
| "learning_rate": 7.1236559139784956e-06, |
| "loss": 10.4028, |
| "num_input_tokens_seen": 60032, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.1478494623655914, |
| "grad_norm": 2.0038323402404785, |
| "learning_rate": 7.347670250896058e-06, |
| "loss": 10.5609, |
| "num_input_tokens_seen": 61824, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.15232974910394265, |
| "grad_norm": 2.9879722595214844, |
| "learning_rate": 7.571684587813621e-06, |
| "loss": 10.072, |
| "num_input_tokens_seen": 63712, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.15681003584229392, |
| "grad_norm": 2.5288474559783936, |
| "learning_rate": 7.795698924731183e-06, |
| "loss": 10.3687, |
| "num_input_tokens_seen": 65600, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 2.761035919189453, |
| "learning_rate": 8.019713261648744e-06, |
| "loss": 10.2835, |
| "num_input_tokens_seen": 67392, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.16577060931899643, |
| "grad_norm": 2.7723684310913086, |
| "learning_rate": 8.24372759856631e-06, |
| "loss": 10.0884, |
| "num_input_tokens_seen": 69120, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.17025089605734767, |
| "grad_norm": 2.9377973079681396, |
| "learning_rate": 8.46774193548387e-06, |
| "loss": 10.1739, |
| "num_input_tokens_seen": 71008, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.17473118279569894, |
| "grad_norm": 2.2513413429260254, |
| "learning_rate": 8.691756272401434e-06, |
| "loss": 9.8862, |
| "num_input_tokens_seen": 72896, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.17921146953405018, |
| "grad_norm": 2.589574098587036, |
| "learning_rate": 8.915770609318997e-06, |
| "loss": 10.0585, |
| "num_input_tokens_seen": 74880, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.18369175627240145, |
| "grad_norm": 2.361954689025879, |
| "learning_rate": 9.13978494623656e-06, |
| "loss": 10.0316, |
| "num_input_tokens_seen": 76768, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.1881720430107527, |
| "grad_norm": 1.8949062824249268, |
| "learning_rate": 9.363799283154121e-06, |
| "loss": 10.4095, |
| "num_input_tokens_seen": 78656, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.19265232974910393, |
| "grad_norm": 2.3154962062835693, |
| "learning_rate": 9.587813620071686e-06, |
| "loss": 9.7952, |
| "num_input_tokens_seen": 80512, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.1971326164874552, |
| "grad_norm": 2.7080109119415283, |
| "learning_rate": 9.811827956989247e-06, |
| "loss": 9.9051, |
| "num_input_tokens_seen": 82240, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.20161290322580644, |
| "grad_norm": 1.9762256145477295, |
| "learning_rate": 1.003584229390681e-05, |
| "loss": 9.4651, |
| "num_input_tokens_seen": 84288, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.2060931899641577, |
| "grad_norm": 2.146616220474243, |
| "learning_rate": 1.0259856630824373e-05, |
| "loss": 9.4182, |
| "num_input_tokens_seen": 86240, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.21057347670250895, |
| "grad_norm": 2.200469970703125, |
| "learning_rate": 1.0483870967741936e-05, |
| "loss": 9.2961, |
| "num_input_tokens_seen": 87968, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 2.0625269412994385, |
| "learning_rate": 1.0707885304659498e-05, |
| "loss": 9.3518, |
| "num_input_tokens_seen": 89920, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.21953405017921146, |
| "grad_norm": 2.391050100326538, |
| "learning_rate": 1.0931899641577063e-05, |
| "loss": 9.1173, |
| "num_input_tokens_seen": 91776, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.22401433691756273, |
| "grad_norm": 2.3285434246063232, |
| "learning_rate": 1.1155913978494624e-05, |
| "loss": 9.3096, |
| "num_input_tokens_seen": 93728, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.22849462365591397, |
| "grad_norm": 2.0512301921844482, |
| "learning_rate": 1.1379928315412187e-05, |
| "loss": 9.4857, |
| "num_input_tokens_seen": 95552, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.23297491039426524, |
| "grad_norm": 2.1123080253601074, |
| "learning_rate": 1.160394265232975e-05, |
| "loss": 9.0314, |
| "num_input_tokens_seen": 97440, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.23745519713261648, |
| "grad_norm": 2.1909310817718506, |
| "learning_rate": 1.1827956989247313e-05, |
| "loss": 8.9132, |
| "num_input_tokens_seen": 99296, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.24193548387096775, |
| "grad_norm": 1.8368756771087646, |
| "learning_rate": 1.2051971326164874e-05, |
| "loss": 9.3423, |
| "num_input_tokens_seen": 101184, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.246415770609319, |
| "grad_norm": 2.124940872192383, |
| "learning_rate": 1.227598566308244e-05, |
| "loss": 9.1916, |
| "num_input_tokens_seen": 103136, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.25089605734767023, |
| "grad_norm": 1.936184287071228, |
| "learning_rate": 1.25e-05, |
| "loss": 9.3744, |
| "num_input_tokens_seen": 105024, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.2553763440860215, |
| "grad_norm": 2.1163675785064697, |
| "learning_rate": 1.2724014336917564e-05, |
| "loss": 9.2282, |
| "num_input_tokens_seen": 106880, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.25985663082437277, |
| "grad_norm": 2.478421449661255, |
| "learning_rate": 1.2948028673835125e-05, |
| "loss": 8.8832, |
| "num_input_tokens_seen": 108640, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.26433691756272404, |
| "grad_norm": 2.1263668537139893, |
| "learning_rate": 1.3172043010752688e-05, |
| "loss": 8.4789, |
| "num_input_tokens_seen": 110528, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.26881720430107525, |
| "grad_norm": 1.8503817319869995, |
| "learning_rate": 1.3396057347670251e-05, |
| "loss": 8.5889, |
| "num_input_tokens_seen": 112480, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2732974910394265, |
| "grad_norm": 2.2405242919921875, |
| "learning_rate": 1.3620071684587816e-05, |
| "loss": 8.7637, |
| "num_input_tokens_seen": 114368, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.2777777777777778, |
| "grad_norm": 1.857710838317871, |
| "learning_rate": 1.3844086021505376e-05, |
| "loss": 8.3565, |
| "num_input_tokens_seen": 116352, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.28225806451612906, |
| "grad_norm": 2.657458782196045, |
| "learning_rate": 1.4068100358422939e-05, |
| "loss": 8.6305, |
| "num_input_tokens_seen": 118272, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.2867383512544803, |
| "grad_norm": 2.1825602054595947, |
| "learning_rate": 1.4292114695340503e-05, |
| "loss": 8.4286, |
| "num_input_tokens_seen": 120192, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.29121863799283154, |
| "grad_norm": 1.961053729057312, |
| "learning_rate": 1.4516129032258066e-05, |
| "loss": 8.1782, |
| "num_input_tokens_seen": 121952, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.2956989247311828, |
| "grad_norm": 1.9111275672912598, |
| "learning_rate": 1.4740143369175626e-05, |
| "loss": 7.8948, |
| "num_input_tokens_seen": 123712, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.300179211469534, |
| "grad_norm": 1.8738315105438232, |
| "learning_rate": 1.4964157706093191e-05, |
| "loss": 8.2841, |
| "num_input_tokens_seen": 125568, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3046594982078853, |
| "grad_norm": 2.4100537300109863, |
| "learning_rate": 1.5188172043010754e-05, |
| "loss": 8.3544, |
| "num_input_tokens_seen": 127584, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.30913978494623656, |
| "grad_norm": 2.0588905811309814, |
| "learning_rate": 1.5412186379928317e-05, |
| "loss": 7.7921, |
| "num_input_tokens_seen": 129440, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.31362007168458783, |
| "grad_norm": 2.321556568145752, |
| "learning_rate": 1.563620071684588e-05, |
| "loss": 8.171, |
| "num_input_tokens_seen": 131360, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.31810035842293904, |
| "grad_norm": 2.302530527114868, |
| "learning_rate": 1.586021505376344e-05, |
| "loss": 7.9828, |
| "num_input_tokens_seen": 133312, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 1.879820704460144, |
| "learning_rate": 1.6084229390681005e-05, |
| "loss": 8.0835, |
| "num_input_tokens_seen": 135200, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.3270609318996416, |
| "grad_norm": 1.871687650680542, |
| "learning_rate": 1.630824372759857e-05, |
| "loss": 7.8038, |
| "num_input_tokens_seen": 136896, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.33154121863799285, |
| "grad_norm": 1.9226675033569336, |
| "learning_rate": 1.653225806451613e-05, |
| "loss": 8.063, |
| "num_input_tokens_seen": 139008, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.33602150537634407, |
| "grad_norm": 2.210846185684204, |
| "learning_rate": 1.6756272401433692e-05, |
| "loss": 8.1303, |
| "num_input_tokens_seen": 140832, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.34050179211469533, |
| "grad_norm": 2.274174213409424, |
| "learning_rate": 1.6980286738351257e-05, |
| "loss": 7.7176, |
| "num_input_tokens_seen": 142688, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.3449820788530466, |
| "grad_norm": 2.0752205848693848, |
| "learning_rate": 1.7204301075268818e-05, |
| "loss": 7.9585, |
| "num_input_tokens_seen": 144640, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.34946236559139787, |
| "grad_norm": 1.955730676651001, |
| "learning_rate": 1.742831541218638e-05, |
| "loss": 8.0203, |
| "num_input_tokens_seen": 146496, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.3539426523297491, |
| "grad_norm": 2.04298734664917, |
| "learning_rate": 1.7652329749103944e-05, |
| "loss": 7.8494, |
| "num_input_tokens_seen": 148288, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.35842293906810035, |
| "grad_norm": 2.653252124786377, |
| "learning_rate": 1.7876344086021506e-05, |
| "loss": 7.5965, |
| "num_input_tokens_seen": 150208, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.3629032258064516, |
| "grad_norm": 2.1470870971679688, |
| "learning_rate": 1.8100358422939067e-05, |
| "loss": 7.1931, |
| "num_input_tokens_seen": 152128, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.3673835125448029, |
| "grad_norm": 2.616868019104004, |
| "learning_rate": 1.8324372759856632e-05, |
| "loss": 7.3095, |
| "num_input_tokens_seen": 153728, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.3718637992831541, |
| "grad_norm": 1.9402117729187012, |
| "learning_rate": 1.8548387096774193e-05, |
| "loss": 7.0567, |
| "num_input_tokens_seen": 155648, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.3763440860215054, |
| "grad_norm": 2.4077136516571045, |
| "learning_rate": 1.8772401433691758e-05, |
| "loss": 7.0942, |
| "num_input_tokens_seen": 157568, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.38082437275985664, |
| "grad_norm": 2.162140369415283, |
| "learning_rate": 1.899641577060932e-05, |
| "loss": 7.0183, |
| "num_input_tokens_seen": 159392, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.38530465949820786, |
| "grad_norm": 2.530998706817627, |
| "learning_rate": 1.922043010752688e-05, |
| "loss": 6.6967, |
| "num_input_tokens_seen": 161312, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.3897849462365591, |
| "grad_norm": 2.8186707496643066, |
| "learning_rate": 1.9444444444444445e-05, |
| "loss": 6.5046, |
| "num_input_tokens_seen": 163168, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.3942652329749104, |
| "grad_norm": 2.482224702835083, |
| "learning_rate": 1.966845878136201e-05, |
| "loss": 6.8594, |
| "num_input_tokens_seen": 165120, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.39874551971326166, |
| "grad_norm": 2.4772849082946777, |
| "learning_rate": 1.989247311827957e-05, |
| "loss": 6.8452, |
| "num_input_tokens_seen": 166976, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.4032258064516129, |
| "grad_norm": 2.0559003353118896, |
| "learning_rate": 2.0116487455197133e-05, |
| "loss": 6.7853, |
| "num_input_tokens_seen": 168768, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.40770609318996415, |
| "grad_norm": 1.855859637260437, |
| "learning_rate": 2.0340501792114698e-05, |
| "loss": 6.8535, |
| "num_input_tokens_seen": 170592, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.4121863799283154, |
| "grad_norm": 2.203948974609375, |
| "learning_rate": 2.056451612903226e-05, |
| "loss": 6.5722, |
| "num_input_tokens_seen": 172736, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.9957222938537598, |
| "learning_rate": 2.078853046594982e-05, |
| "loss": 6.6833, |
| "num_input_tokens_seen": 174592, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.4211469534050179, |
| "grad_norm": 1.826316475868225, |
| "learning_rate": 2.1012544802867385e-05, |
| "loss": 6.4222, |
| "num_input_tokens_seen": 176704, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.42562724014336917, |
| "grad_norm": 1.9822869300842285, |
| "learning_rate": 2.1236559139784946e-05, |
| "loss": 6.1397, |
| "num_input_tokens_seen": 178816, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 2.5274133682250977, |
| "learning_rate": 2.146057347670251e-05, |
| "loss": 6.353, |
| "num_input_tokens_seen": 180608, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.4345878136200717, |
| "grad_norm": 2.535459518432617, |
| "learning_rate": 2.1684587813620073e-05, |
| "loss": 5.812, |
| "num_input_tokens_seen": 182368, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.4390681003584229, |
| "grad_norm": 2.3390567302703857, |
| "learning_rate": 2.1908602150537634e-05, |
| "loss": 5.6701, |
| "num_input_tokens_seen": 184128, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.4435483870967742, |
| "grad_norm": 1.7963190078735352, |
| "learning_rate": 2.21326164874552e-05, |
| "loss": 6.1912, |
| "num_input_tokens_seen": 186144, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.44802867383512546, |
| "grad_norm": 2.1545753479003906, |
| "learning_rate": 2.235663082437276e-05, |
| "loss": 5.945, |
| "num_input_tokens_seen": 188160, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4525089605734767, |
| "grad_norm": 2.177809953689575, |
| "learning_rate": 2.258064516129032e-05, |
| "loss": 5.495, |
| "num_input_tokens_seen": 190016, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.45698924731182794, |
| "grad_norm": 1.9128177165985107, |
| "learning_rate": 2.2804659498207886e-05, |
| "loss": 5.3772, |
| "num_input_tokens_seen": 191840, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.4614695340501792, |
| "grad_norm": 1.9529114961624146, |
| "learning_rate": 2.302867383512545e-05, |
| "loss": 5.6659, |
| "num_input_tokens_seen": 193920, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.4659498207885305, |
| "grad_norm": 2.681643486022949, |
| "learning_rate": 2.325268817204301e-05, |
| "loss": 5.3074, |
| "num_input_tokens_seen": 195744, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.47043010752688175, |
| "grad_norm": 2.481017827987671, |
| "learning_rate": 2.3476702508960574e-05, |
| "loss": 4.8561, |
| "num_input_tokens_seen": 197632, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.47491039426523296, |
| "grad_norm": 2.2446951866149902, |
| "learning_rate": 2.370071684587814e-05, |
| "loss": 5.0107, |
| "num_input_tokens_seen": 199424, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.47939068100358423, |
| "grad_norm": 1.7352396249771118, |
| "learning_rate": 2.39247311827957e-05, |
| "loss": 4.8499, |
| "num_input_tokens_seen": 201280, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 2.33585262298584, |
| "learning_rate": 2.414874551971326e-05, |
| "loss": 4.6192, |
| "num_input_tokens_seen": 203072, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.4883512544802867, |
| "grad_norm": 2.1425092220306396, |
| "learning_rate": 2.4372759856630826e-05, |
| "loss": 5.0357, |
| "num_input_tokens_seen": 204992, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.492831541218638, |
| "grad_norm": 2.2969155311584473, |
| "learning_rate": 2.4596774193548387e-05, |
| "loss": 4.7181, |
| "num_input_tokens_seen": 206912, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.49731182795698925, |
| "grad_norm": 2.9185574054718018, |
| "learning_rate": 2.4820788530465952e-05, |
| "loss": 4.7201, |
| "num_input_tokens_seen": 208864, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 4.355785369873047, |
| "eval_runtime": 5.6351, |
| "eval_samples_per_second": 88.019, |
| "eval_steps_per_second": 22.005, |
| "num_input_tokens_seen": 210048, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.5017921146953405, |
| "grad_norm": 2.5545475482940674, |
| "learning_rate": 2.5044802867383517e-05, |
| "loss": 4.6788, |
| "num_input_tokens_seen": 210816, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.5062724014336918, |
| "grad_norm": 2.3069698810577393, |
| "learning_rate": 2.5268817204301075e-05, |
| "loss": 4.1564, |
| "num_input_tokens_seen": 212640, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.510752688172043, |
| "grad_norm": 2.7041890621185303, |
| "learning_rate": 2.5492831541218636e-05, |
| "loss": 4.1372, |
| "num_input_tokens_seen": 214560, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.5152329749103942, |
| "grad_norm": 2.012913227081299, |
| "learning_rate": 2.5716845878136204e-05, |
| "loss": 3.9504, |
| "num_input_tokens_seen": 216384, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.5197132616487455, |
| "grad_norm": 2.3750064373016357, |
| "learning_rate": 2.5940860215053762e-05, |
| "loss": 3.7693, |
| "num_input_tokens_seen": 218304, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.5241935483870968, |
| "grad_norm": 2.0457749366760254, |
| "learning_rate": 2.616487455197133e-05, |
| "loss": 3.8367, |
| "num_input_tokens_seen": 220128, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.5286738351254481, |
| "grad_norm": 2.2593724727630615, |
| "learning_rate": 2.6388888888888892e-05, |
| "loss": 3.689, |
| "num_input_tokens_seen": 222016, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.5331541218637993, |
| "grad_norm": 2.111712694168091, |
| "learning_rate": 2.661290322580645e-05, |
| "loss": 3.5935, |
| "num_input_tokens_seen": 224000, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.5376344086021505, |
| "grad_norm": 2.8802244663238525, |
| "learning_rate": 2.6836917562724018e-05, |
| "loss": 3.5805, |
| "num_input_tokens_seen": 225920, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.5421146953405018, |
| "grad_norm": 1.9827439785003662, |
| "learning_rate": 2.706093189964158e-05, |
| "loss": 3.2292, |
| "num_input_tokens_seen": 227712, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.546594982078853, |
| "grad_norm": 2.3735995292663574, |
| "learning_rate": 2.728494623655914e-05, |
| "loss": 3.8282, |
| "num_input_tokens_seen": 229824, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.5510752688172043, |
| "grad_norm": 2.136101007461548, |
| "learning_rate": 2.7508960573476705e-05, |
| "loss": 3.5909, |
| "num_input_tokens_seen": 231840, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 2.7474825382232666, |
| "learning_rate": 2.7732974910394267e-05, |
| "loss": 3.5369, |
| "num_input_tokens_seen": 233856, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.5600358422939068, |
| "grad_norm": 1.956363320350647, |
| "learning_rate": 2.7956989247311828e-05, |
| "loss": 2.6224, |
| "num_input_tokens_seen": 235680, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.5645161290322581, |
| "grad_norm": 2.3791565895080566, |
| "learning_rate": 2.8181003584229393e-05, |
| "loss": 2.5847, |
| "num_input_tokens_seen": 237536, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.5689964157706093, |
| "grad_norm": 2.644176483154297, |
| "learning_rate": 2.8405017921146954e-05, |
| "loss": 2.8541, |
| "num_input_tokens_seen": 239488, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.5734767025089605, |
| "grad_norm": 2.3562235832214355, |
| "learning_rate": 2.862903225806452e-05, |
| "loss": 2.5187, |
| "num_input_tokens_seen": 241376, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.5779569892473119, |
| "grad_norm": 2.059034824371338, |
| "learning_rate": 2.885304659498208e-05, |
| "loss": 2.3649, |
| "num_input_tokens_seen": 243200, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.5824372759856631, |
| "grad_norm": 2.070953130722046, |
| "learning_rate": 2.9077060931899642e-05, |
| "loss": 2.2836, |
| "num_input_tokens_seen": 245056, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.5869175627240143, |
| "grad_norm": 1.588262915611267, |
| "learning_rate": 2.9301075268817207e-05, |
| "loss": 2.0521, |
| "num_input_tokens_seen": 246880, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.5913978494623656, |
| "grad_norm": 2.269887924194336, |
| "learning_rate": 2.9525089605734768e-05, |
| "loss": 2.0766, |
| "num_input_tokens_seen": 248832, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.5958781362007168, |
| "grad_norm": 2.170715570449829, |
| "learning_rate": 2.974910394265233e-05, |
| "loss": 2.0889, |
| "num_input_tokens_seen": 250880, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.600358422939068, |
| "grad_norm": 2.2083332538604736, |
| "learning_rate": 2.9973118279569894e-05, |
| "loss": 2.249, |
| "num_input_tokens_seen": 252864, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.6048387096774194, |
| "grad_norm": 2.376215934753418, |
| "learning_rate": 3.0197132616487455e-05, |
| "loss": 1.6939, |
| "num_input_tokens_seen": 254656, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.6093189964157706, |
| "grad_norm": 1.581056833267212, |
| "learning_rate": 3.0421146953405024e-05, |
| "loss": 1.9665, |
| "num_input_tokens_seen": 256640, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.6137992831541219, |
| "grad_norm": 2.688706398010254, |
| "learning_rate": 3.0645161290322585e-05, |
| "loss": 1.8917, |
| "num_input_tokens_seen": 258560, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.6182795698924731, |
| "grad_norm": 2.051927328109741, |
| "learning_rate": 3.0869175627240146e-05, |
| "loss": 1.9391, |
| "num_input_tokens_seen": 260544, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.6227598566308243, |
| "grad_norm": 2.0645041465759277, |
| "learning_rate": 3.109318996415771e-05, |
| "loss": 1.6377, |
| "num_input_tokens_seen": 262368, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.6272401433691757, |
| "grad_norm": 1.9354743957519531, |
| "learning_rate": 3.131720430107527e-05, |
| "loss": 1.6164, |
| "num_input_tokens_seen": 264288, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.6317204301075269, |
| "grad_norm": 2.0850887298583984, |
| "learning_rate": 3.154121863799283e-05, |
| "loss": 1.3333, |
| "num_input_tokens_seen": 266112, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.6362007168458781, |
| "grad_norm": 2.084470748901367, |
| "learning_rate": 3.17652329749104e-05, |
| "loss": 1.6501, |
| "num_input_tokens_seen": 268064, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.6406810035842294, |
| "grad_norm": 2.0416452884674072, |
| "learning_rate": 3.198924731182796e-05, |
| "loss": 1.1101, |
| "num_input_tokens_seen": 269792, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 1.9667887687683105, |
| "learning_rate": 3.221326164874552e-05, |
| "loss": 1.2862, |
| "num_input_tokens_seen": 271616, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.649641577060932, |
| "grad_norm": 1.1966661214828491, |
| "learning_rate": 3.243727598566308e-05, |
| "loss": 1.4731, |
| "num_input_tokens_seen": 273568, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.6541218637992832, |
| "grad_norm": 2.009204387664795, |
| "learning_rate": 3.2661290322580644e-05, |
| "loss": 1.3512, |
| "num_input_tokens_seen": 275552, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.6586021505376344, |
| "grad_norm": 1.6872437000274658, |
| "learning_rate": 3.288530465949821e-05, |
| "loss": 1.2472, |
| "num_input_tokens_seen": 277472, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.6630824372759857, |
| "grad_norm": 1.8811956644058228, |
| "learning_rate": 3.3109318996415774e-05, |
| "loss": 1.1825, |
| "num_input_tokens_seen": 279360, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.6675627240143369, |
| "grad_norm": 1.574930191040039, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 1.2683, |
| "num_input_tokens_seen": 281440, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.6720430107526881, |
| "grad_norm": 1.8590502738952637, |
| "learning_rate": 3.3557347670250896e-05, |
| "loss": 1.0297, |
| "num_input_tokens_seen": 283328, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.6765232974910395, |
| "grad_norm": 2.326279878616333, |
| "learning_rate": 3.378136200716846e-05, |
| "loss": 1.0714, |
| "num_input_tokens_seen": 285184, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.6810035842293907, |
| "grad_norm": 1.9387420415878296, |
| "learning_rate": 3.400537634408602e-05, |
| "loss": 0.9402, |
| "num_input_tokens_seen": 286944, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.6854838709677419, |
| "grad_norm": 1.8371498584747314, |
| "learning_rate": 3.422939068100359e-05, |
| "loss": 1.3783, |
| "num_input_tokens_seen": 289120, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.6899641577060932, |
| "grad_norm": 1.6254054307937622, |
| "learning_rate": 3.445340501792115e-05, |
| "loss": 1.1224, |
| "num_input_tokens_seen": 291008, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "grad_norm": 1.7399088144302368, |
| "learning_rate": 3.467741935483872e-05, |
| "loss": 1.228, |
| "num_input_tokens_seen": 292960, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.6989247311827957, |
| "grad_norm": 1.1927127838134766, |
| "learning_rate": 3.490143369175627e-05, |
| "loss": 0.9204, |
| "num_input_tokens_seen": 294752, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.703405017921147, |
| "grad_norm": 2.3804333209991455, |
| "learning_rate": 3.512544802867383e-05, |
| "loss": 1.131, |
| "num_input_tokens_seen": 296672, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.7078853046594982, |
| "grad_norm": 1.4596134424209595, |
| "learning_rate": 3.53494623655914e-05, |
| "loss": 0.9233, |
| "num_input_tokens_seen": 298528, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.7123655913978495, |
| "grad_norm": 2.27669095993042, |
| "learning_rate": 3.557347670250896e-05, |
| "loss": 1.386, |
| "num_input_tokens_seen": 300672, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.7168458781362007, |
| "grad_norm": 2.4323675632476807, |
| "learning_rate": 3.5797491039426524e-05, |
| "loss": 0.9721, |
| "num_input_tokens_seen": 302528, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.7213261648745519, |
| "grad_norm": 1.8626689910888672, |
| "learning_rate": 3.602150537634409e-05, |
| "loss": 1.1334, |
| "num_input_tokens_seen": 304512, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.7258064516129032, |
| "grad_norm": 1.959444522857666, |
| "learning_rate": 3.624551971326165e-05, |
| "loss": 0.9517, |
| "num_input_tokens_seen": 306240, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.7302867383512545, |
| "grad_norm": 1.3139394521713257, |
| "learning_rate": 3.6469534050179214e-05, |
| "loss": 0.9271, |
| "num_input_tokens_seen": 308160, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.7347670250896058, |
| "grad_norm": 1.1890360116958618, |
| "learning_rate": 3.6693548387096776e-05, |
| "loss": 0.8931, |
| "num_input_tokens_seen": 309952, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.739247311827957, |
| "grad_norm": 1.6261545419692993, |
| "learning_rate": 3.691756272401434e-05, |
| "loss": 0.8954, |
| "num_input_tokens_seen": 311936, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.7437275985663082, |
| "grad_norm": 2.207946538925171, |
| "learning_rate": 3.7141577060931905e-05, |
| "loss": 1.0085, |
| "num_input_tokens_seen": 313760, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.7482078853046595, |
| "grad_norm": 1.2214912176132202, |
| "learning_rate": 3.736559139784947e-05, |
| "loss": 0.8793, |
| "num_input_tokens_seen": 315456, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.7526881720430108, |
| "grad_norm": 1.8236546516418457, |
| "learning_rate": 3.758960573476703e-05, |
| "loss": 0.8436, |
| "num_input_tokens_seen": 317312, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.757168458781362, |
| "grad_norm": 1.6997966766357422, |
| "learning_rate": 3.781362007168459e-05, |
| "loss": 0.9173, |
| "num_input_tokens_seen": 319264, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.7616487455197133, |
| "grad_norm": 1.9939361810684204, |
| "learning_rate": 3.803763440860215e-05, |
| "loss": 1.0545, |
| "num_input_tokens_seen": 321248, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.7661290322580645, |
| "grad_norm": 1.4386049509048462, |
| "learning_rate": 3.826164874551971e-05, |
| "loss": 0.8228, |
| "num_input_tokens_seen": 323072, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.7706093189964157, |
| "grad_norm": 1.3483433723449707, |
| "learning_rate": 3.848566308243728e-05, |
| "loss": 0.784, |
| "num_input_tokens_seen": 324896, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.775089605734767, |
| "grad_norm": 1.7732473611831665, |
| "learning_rate": 3.870967741935484e-05, |
| "loss": 0.8639, |
| "num_input_tokens_seen": 326592, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.7795698924731183, |
| "grad_norm": 2.9457969665527344, |
| "learning_rate": 3.89336917562724e-05, |
| "loss": 1.0969, |
| "num_input_tokens_seen": 328480, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.7840501792114696, |
| "grad_norm": 1.3309738636016846, |
| "learning_rate": 3.9157706093189964e-05, |
| "loss": 0.814, |
| "num_input_tokens_seen": 330240, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.7885304659498208, |
| "grad_norm": 1.2904449701309204, |
| "learning_rate": 3.9381720430107526e-05, |
| "loss": 0.8384, |
| "num_input_tokens_seen": 331936, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.793010752688172, |
| "grad_norm": 1.127160906791687, |
| "learning_rate": 3.9605734767025094e-05, |
| "loss": 0.8072, |
| "num_input_tokens_seen": 333664, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.7974910394265233, |
| "grad_norm": 1.7538950443267822, |
| "learning_rate": 3.9829749103942655e-05, |
| "loss": 0.7792, |
| "num_input_tokens_seen": 335488, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.8019713261648745, |
| "grad_norm": 1.3736745119094849, |
| "learning_rate": 4.005376344086022e-05, |
| "loss": 0.8026, |
| "num_input_tokens_seen": 337280, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 1.918084740638733, |
| "learning_rate": 4.027777777777778e-05, |
| "loss": 1.027, |
| "num_input_tokens_seen": 339360, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.8109318996415771, |
| "grad_norm": 1.6451882123947144, |
| "learning_rate": 4.050179211469534e-05, |
| "loss": 0.7405, |
| "num_input_tokens_seen": 341184, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.8154121863799283, |
| "grad_norm": 2.081998586654663, |
| "learning_rate": 4.072580645161291e-05, |
| "loss": 0.8656, |
| "num_input_tokens_seen": 343072, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.8198924731182796, |
| "grad_norm": 3.3316433429718018, |
| "learning_rate": 4.094982078853047e-05, |
| "loss": 0.9319, |
| "num_input_tokens_seen": 344928, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.8243727598566308, |
| "grad_norm": 1.1804288625717163, |
| "learning_rate": 4.117383512544803e-05, |
| "loss": 0.7355, |
| "num_input_tokens_seen": 346752, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.828853046594982, |
| "grad_norm": 1.9487136602401733, |
| "learning_rate": 4.13978494623656e-05, |
| "loss": 0.8087, |
| "num_input_tokens_seen": 348608, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 2.2469987869262695, |
| "learning_rate": 4.162186379928315e-05, |
| "loss": 1.0164, |
| "num_input_tokens_seen": 350496, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.8378136200716846, |
| "grad_norm": 1.9513884782791138, |
| "learning_rate": 4.1845878136200714e-05, |
| "loss": 0.8033, |
| "num_input_tokens_seen": 352416, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.8422939068100358, |
| "grad_norm": 2.2020530700683594, |
| "learning_rate": 4.206989247311828e-05, |
| "loss": 0.7324, |
| "num_input_tokens_seen": 354272, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.8467741935483871, |
| "grad_norm": 1.5562139749526978, |
| "learning_rate": 4.2293906810035844e-05, |
| "loss": 0.8158, |
| "num_input_tokens_seen": 356064, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.8512544802867383, |
| "grad_norm": 2.2040066719055176, |
| "learning_rate": 4.2517921146953405e-05, |
| "loss": 0.8879, |
| "num_input_tokens_seen": 358016, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.8557347670250897, |
| "grad_norm": 1.9474464654922485, |
| "learning_rate": 4.2741935483870973e-05, |
| "loss": 0.8055, |
| "num_input_tokens_seen": 359808, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 1.0472666025161743, |
| "learning_rate": 4.296594982078853e-05, |
| "loss": 0.7877, |
| "num_input_tokens_seen": 361664, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.8646953405017921, |
| "grad_norm": 1.2779587507247925, |
| "learning_rate": 4.3189964157706096e-05, |
| "loss": 0.738, |
| "num_input_tokens_seen": 363488, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.8691756272401434, |
| "grad_norm": 1.3081976175308228, |
| "learning_rate": 4.341397849462366e-05, |
| "loss": 0.9189, |
| "num_input_tokens_seen": 365632, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.8736559139784946, |
| "grad_norm": 1.8249237537384033, |
| "learning_rate": 4.363799283154122e-05, |
| "loss": 0.8646, |
| "num_input_tokens_seen": 367616, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.8781362007168458, |
| "grad_norm": 1.5187082290649414, |
| "learning_rate": 4.386200716845879e-05, |
| "loss": 0.7628, |
| "num_input_tokens_seen": 369408, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.8826164874551972, |
| "grad_norm": 2.554736614227295, |
| "learning_rate": 4.408602150537635e-05, |
| "loss": 0.8075, |
| "num_input_tokens_seen": 371232, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.8870967741935484, |
| "grad_norm": 1.244150161743164, |
| "learning_rate": 4.431003584229391e-05, |
| "loss": 0.8056, |
| "num_input_tokens_seen": 373088, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.8915770609318996, |
| "grad_norm": 1.334649920463562, |
| "learning_rate": 4.453405017921147e-05, |
| "loss": 0.7364, |
| "num_input_tokens_seen": 374944, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.8960573476702509, |
| "grad_norm": 1.1086010932922363, |
| "learning_rate": 4.475806451612903e-05, |
| "loss": 0.7582, |
| "num_input_tokens_seen": 376800, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.9005376344086021, |
| "grad_norm": 1.9798866510391235, |
| "learning_rate": 4.49820788530466e-05, |
| "loss": 0.8295, |
| "num_input_tokens_seen": 378720, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.9050179211469535, |
| "grad_norm": 1.3580180406570435, |
| "learning_rate": 4.520609318996416e-05, |
| "loss": 0.9695, |
| "num_input_tokens_seen": 380768, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.9094982078853047, |
| "grad_norm": 1.9761812686920166, |
| "learning_rate": 4.543010752688172e-05, |
| "loss": 0.8012, |
| "num_input_tokens_seen": 382752, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.9139784946236559, |
| "grad_norm": 1.539128065109253, |
| "learning_rate": 4.5654121863799285e-05, |
| "loss": 0.7739, |
| "num_input_tokens_seen": 384576, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.9184587813620072, |
| "grad_norm": 1.047807216644287, |
| "learning_rate": 4.5878136200716846e-05, |
| "loss": 0.7571, |
| "num_input_tokens_seen": 386368, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.9229390681003584, |
| "grad_norm": 0.7635663151741028, |
| "learning_rate": 4.610215053763441e-05, |
| "loss": 0.725, |
| "num_input_tokens_seen": 388192, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.9274193548387096, |
| "grad_norm": 0.9058603644371033, |
| "learning_rate": 4.6326164874551976e-05, |
| "loss": 0.6824, |
| "num_input_tokens_seen": 390016, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.931899641577061, |
| "grad_norm": 2.0875561237335205, |
| "learning_rate": 4.655017921146954e-05, |
| "loss": 0.7599, |
| "num_input_tokens_seen": 391904, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.9363799283154122, |
| "grad_norm": 1.1811761856079102, |
| "learning_rate": 4.67741935483871e-05, |
| "loss": 0.7755, |
| "num_input_tokens_seen": 393888, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.9408602150537635, |
| "grad_norm": 1.8433384895324707, |
| "learning_rate": 4.699820788530466e-05, |
| "loss": 1.0802, |
| "num_input_tokens_seen": 396000, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.9453405017921147, |
| "grad_norm": 1.3989745378494263, |
| "learning_rate": 4.722222222222222e-05, |
| "loss": 0.7628, |
| "num_input_tokens_seen": 397824, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.9498207885304659, |
| "grad_norm": 1.1494001150131226, |
| "learning_rate": 4.744623655913979e-05, |
| "loss": 0.7441, |
| "num_input_tokens_seen": 399648, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.9543010752688172, |
| "grad_norm": 1.8642150163650513, |
| "learning_rate": 4.767025089605735e-05, |
| "loss": 0.9036, |
| "num_input_tokens_seen": 401728, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.9587813620071685, |
| "grad_norm": 0.9621382355690002, |
| "learning_rate": 4.789426523297491e-05, |
| "loss": 0.7951, |
| "num_input_tokens_seen": 403616, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.9632616487455197, |
| "grad_norm": 1.2531604766845703, |
| "learning_rate": 4.811827956989248e-05, |
| "loss": 0.8256, |
| "num_input_tokens_seen": 405504, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 1.0363707542419434, |
| "learning_rate": 4.8342293906810035e-05, |
| "loss": 0.7455, |
| "num_input_tokens_seen": 407392, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.9722222222222222, |
| "grad_norm": 1.4194499254226685, |
| "learning_rate": 4.8566308243727596e-05, |
| "loss": 0.7447, |
| "num_input_tokens_seen": 409216, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.9767025089605734, |
| "grad_norm": 1.6883561611175537, |
| "learning_rate": 4.8790322580645164e-05, |
| "loss": 0.8617, |
| "num_input_tokens_seen": 411104, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.9811827956989247, |
| "grad_norm": 1.1037418842315674, |
| "learning_rate": 4.9014336917562726e-05, |
| "loss": 0.697, |
| "num_input_tokens_seen": 412992, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.985663082437276, |
| "grad_norm": 1.2818868160247803, |
| "learning_rate": 4.9238351254480294e-05, |
| "loss": 0.7121, |
| "num_input_tokens_seen": 414656, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.9901433691756273, |
| "grad_norm": 1.3587709665298462, |
| "learning_rate": 4.9462365591397855e-05, |
| "loss": 0.8608, |
| "num_input_tokens_seen": 416736, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.9946236559139785, |
| "grad_norm": 1.685178518295288, |
| "learning_rate": 4.968637992831541e-05, |
| "loss": 0.7884, |
| "num_input_tokens_seen": 418496, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.9991039426523297, |
| "grad_norm": 1.2909343242645264, |
| "learning_rate": 4.991039426523298e-05, |
| "loss": 0.8136, |
| "num_input_tokens_seen": 420448, |
| "step": 1115 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.8080337047576904, |
| "eval_runtime": 5.6144, |
| "eval_samples_per_second": 88.345, |
| "eval_steps_per_second": 22.086, |
| "num_input_tokens_seen": 420520, |
| "step": 1116 |
| }, |
| { |
| "epoch": 1.003584229390681, |
| "grad_norm": 1.6225780248641968, |
| "learning_rate": 4.9999988993763824e-05, |
| "loss": 0.819, |
| "num_input_tokens_seen": 422088, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.0080645161290323, |
| "grad_norm": 0.8704792857170105, |
| "learning_rate": 4.9999921733466727e-05, |
| "loss": 0.814, |
| "num_input_tokens_seen": 423912, |
| "step": 1125 |
| }, |
| { |
| "epoch": 1.0125448028673836, |
| "grad_norm": 1.1995881795883179, |
| "learning_rate": 4.9999793327612486e-05, |
| "loss": 0.7385, |
| "num_input_tokens_seen": 425768, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.0170250896057347, |
| "grad_norm": 1.436139464378357, |
| "learning_rate": 4.999960377651517e-05, |
| "loss": 0.7454, |
| "num_input_tokens_seen": 427528, |
| "step": 1135 |
| }, |
| { |
| "epoch": 1.021505376344086, |
| "grad_norm": 0.8228470087051392, |
| "learning_rate": 4.9999353080638376e-05, |
| "loss": 0.8053, |
| "num_input_tokens_seen": 429416, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.0259856630824373, |
| "grad_norm": 1.5793920755386353, |
| "learning_rate": 4.9999041240595276e-05, |
| "loss": 0.8063, |
| "num_input_tokens_seen": 431080, |
| "step": 1145 |
| }, |
| { |
| "epoch": 1.0304659498207884, |
| "grad_norm": 2.1269445419311523, |
| "learning_rate": 4.9998668257148576e-05, |
| "loss": 0.8313, |
| "num_input_tokens_seen": 432936, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.0349462365591398, |
| "grad_norm": 1.4320948123931885, |
| "learning_rate": 4.999823413121053e-05, |
| "loss": 0.8094, |
| "num_input_tokens_seen": 434984, |
| "step": 1155 |
| }, |
| { |
| "epoch": 1.039426523297491, |
| "grad_norm": 1.0124826431274414, |
| "learning_rate": 4.999773886384293e-05, |
| "loss": 0.6991, |
| "num_input_tokens_seen": 436744, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.0439068100358422, |
| "grad_norm": 1.4147486686706543, |
| "learning_rate": 4.9997182456257116e-05, |
| "loss": 0.7146, |
| "num_input_tokens_seen": 438568, |
| "step": 1165 |
| }, |
| { |
| "epoch": 1.0483870967741935, |
| "grad_norm": 1.1802654266357422, |
| "learning_rate": 4.999656490981397e-05, |
| "loss": 0.7963, |
| "num_input_tokens_seen": 440424, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.0528673835125448, |
| "grad_norm": 1.2537444829940796, |
| "learning_rate": 4.9995886226023913e-05, |
| "loss": 0.829, |
| "num_input_tokens_seen": 442504, |
| "step": 1175 |
| }, |
| { |
| "epoch": 1.0573476702508962, |
| "grad_norm": 1.1889104843139648, |
| "learning_rate": 4.999514640654688e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 444328, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.0618279569892473, |
| "grad_norm": 0.9470105767250061, |
| "learning_rate": 4.999434545319234e-05, |
| "loss": 0.8046, |
| "num_input_tokens_seen": 446312, |
| "step": 1185 |
| }, |
| { |
| "epoch": 1.0663082437275986, |
| "grad_norm": 1.1713849306106567, |
| "learning_rate": 4.999348336791929e-05, |
| "loss": 0.741, |
| "num_input_tokens_seen": 448232, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.07078853046595, |
| "grad_norm": 1.3088457584381104, |
| "learning_rate": 4.9992560152836264e-05, |
| "loss": 0.7323, |
| "num_input_tokens_seen": 450056, |
| "step": 1195 |
| }, |
| { |
| "epoch": 1.075268817204301, |
| "grad_norm": 1.7432241439819336, |
| "learning_rate": 4.999157581020126e-05, |
| "loss": 0.7173, |
| "num_input_tokens_seen": 451976, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.0797491039426523, |
| "grad_norm": 1.3616633415222168, |
| "learning_rate": 4.9990530342421835e-05, |
| "loss": 0.7661, |
| "num_input_tokens_seen": 454056, |
| "step": 1205 |
| }, |
| { |
| "epoch": 1.0842293906810037, |
| "grad_norm": 1.893376111984253, |
| "learning_rate": 4.998942375205502e-05, |
| "loss": 0.8518, |
| "num_input_tokens_seen": 455880, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.0887096774193548, |
| "grad_norm": 1.1770555973052979, |
| "learning_rate": 4.9988256041807334e-05, |
| "loss": 0.8027, |
| "num_input_tokens_seen": 457736, |
| "step": 1215 |
| }, |
| { |
| "epoch": 1.093189964157706, |
| "grad_norm": 0.9845167398452759, |
| "learning_rate": 4.998702721453481e-05, |
| "loss": 0.8092, |
| "num_input_tokens_seen": 459752, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.0976702508960574, |
| "grad_norm": 1.02119779586792, |
| "learning_rate": 4.998573727324295e-05, |
| "loss": 0.7272, |
| "num_input_tokens_seen": 461608, |
| "step": 1225 |
| }, |
| { |
| "epoch": 1.1021505376344085, |
| "grad_norm": 1.4152262210845947, |
| "learning_rate": 4.998438622108673e-05, |
| "loss": 0.7721, |
| "num_input_tokens_seen": 463464, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.1066308243727598, |
| "grad_norm": 1.7630904912948608, |
| "learning_rate": 4.9982974061370594e-05, |
| "loss": 0.8142, |
| "num_input_tokens_seen": 465416, |
| "step": 1235 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 1.0361242294311523, |
| "learning_rate": 4.9981500797548445e-05, |
| "loss": 0.75, |
| "num_input_tokens_seen": 467208, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.1155913978494623, |
| "grad_norm": 1.8716683387756348, |
| "learning_rate": 4.9979966433223627e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 469096, |
| "step": 1245 |
| }, |
| { |
| "epoch": 1.1200716845878136, |
| "grad_norm": 1.1575100421905518, |
| "learning_rate": 4.997837097214895e-05, |
| "loss": 0.7676, |
| "num_input_tokens_seen": 470984, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.124551971326165, |
| "grad_norm": 0.7588862180709839, |
| "learning_rate": 4.997671441822662e-05, |
| "loss": 0.7607, |
| "num_input_tokens_seen": 472904, |
| "step": 1255 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 1.8226057291030884, |
| "learning_rate": 4.997499677550831e-05, |
| "loss": 0.7432, |
| "num_input_tokens_seen": 474792, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.1335125448028673, |
| "grad_norm": 1.1694135665893555, |
| "learning_rate": 4.997321804819506e-05, |
| "loss": 0.7216, |
| "num_input_tokens_seen": 476680, |
| "step": 1265 |
| }, |
| { |
| "epoch": 1.1379928315412187, |
| "grad_norm": 1.0002014636993408, |
| "learning_rate": 4.9971378240637345e-05, |
| "loss": 0.7637, |
| "num_input_tokens_seen": 478344, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.14247311827957, |
| "grad_norm": 1.5931822061538696, |
| "learning_rate": 4.9969477357335025e-05, |
| "loss": 0.8194, |
| "num_input_tokens_seen": 480328, |
| "step": 1275 |
| }, |
| { |
| "epoch": 1.146953405017921, |
| "grad_norm": 1.2760032415390015, |
| "learning_rate": 4.9967515402937334e-05, |
| "loss": 0.7717, |
| "num_input_tokens_seen": 482344, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.1514336917562724, |
| "grad_norm": 1.3313841819763184, |
| "learning_rate": 4.996549238224288e-05, |
| "loss": 0.7957, |
| "num_input_tokens_seen": 484296, |
| "step": 1285 |
| }, |
| { |
| "epoch": 1.1559139784946237, |
| "grad_norm": 1.0116024017333984, |
| "learning_rate": 4.996340830019962e-05, |
| "loss": 0.7225, |
| "num_input_tokens_seen": 486216, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.1603942652329748, |
| "grad_norm": 0.9009803533554077, |
| "learning_rate": 4.996126316190488e-05, |
| "loss": 0.7333, |
| "num_input_tokens_seen": 488040, |
| "step": 1295 |
| }, |
| { |
| "epoch": 1.1648745519713262, |
| "grad_norm": 1.6891072988510132, |
| "learning_rate": 4.995905697260528e-05, |
| "loss": 0.7796, |
| "num_input_tokens_seen": 489832, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.1693548387096775, |
| "grad_norm": 0.8419767022132874, |
| "learning_rate": 4.995678973769681e-05, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 491688, |
| "step": 1305 |
| }, |
| { |
| "epoch": 1.1738351254480286, |
| "grad_norm": 1.250991940498352, |
| "learning_rate": 4.995446146272472e-05, |
| "loss": 0.7791, |
| "num_input_tokens_seen": 493736, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.17831541218638, |
| "grad_norm": 1.2484474182128906, |
| "learning_rate": 4.9952072153383575e-05, |
| "loss": 0.79, |
| "num_input_tokens_seen": 495656, |
| "step": 1315 |
| }, |
| { |
| "epoch": 1.1827956989247312, |
| "grad_norm": 0.9272834062576294, |
| "learning_rate": 4.994962181551725e-05, |
| "loss": 0.7649, |
| "num_input_tokens_seen": 497640, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.1872759856630823, |
| "grad_norm": 0.682406485080719, |
| "learning_rate": 4.994711045511881e-05, |
| "loss": 0.7139, |
| "num_input_tokens_seen": 499464, |
| "step": 1325 |
| }, |
| { |
| "epoch": 1.1917562724014337, |
| "grad_norm": 1.1172208786010742, |
| "learning_rate": 4.9944538078330646e-05, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 501352, |
| "step": 1330 |
| }, |
| { |
| "epoch": 1.196236559139785, |
| "grad_norm": 1.0329699516296387, |
| "learning_rate": 4.994190469144434e-05, |
| "loss": 0.651, |
| "num_input_tokens_seen": 503240, |
| "step": 1335 |
| }, |
| { |
| "epoch": 1.2007168458781363, |
| "grad_norm": 1.6401331424713135, |
| "learning_rate": 4.993921030090072e-05, |
| "loss": 0.9447, |
| "num_input_tokens_seen": 505320, |
| "step": 1340 |
| }, |
| { |
| "epoch": 1.2051971326164874, |
| "grad_norm": 1.254267930984497, |
| "learning_rate": 4.99364549132898e-05, |
| "loss": 0.754, |
| "num_input_tokens_seen": 507336, |
| "step": 1345 |
| }, |
| { |
| "epoch": 1.2096774193548387, |
| "grad_norm": 1.2467966079711914, |
| "learning_rate": 4.993363853535079e-05, |
| "loss": 0.7379, |
| "num_input_tokens_seen": 509128, |
| "step": 1350 |
| }, |
| { |
| "epoch": 1.2141577060931898, |
| "grad_norm": 1.1022220849990845, |
| "learning_rate": 4.9930761173972076e-05, |
| "loss": 0.7379, |
| "num_input_tokens_seen": 510952, |
| "step": 1355 |
| }, |
| { |
| "epoch": 1.2186379928315412, |
| "grad_norm": 1.1572595834732056, |
| "learning_rate": 4.992782283619118e-05, |
| "loss": 0.7332, |
| "num_input_tokens_seen": 512808, |
| "step": 1360 |
| }, |
| { |
| "epoch": 1.2231182795698925, |
| "grad_norm": 0.8642338514328003, |
| "learning_rate": 4.99248235291948e-05, |
| "loss": 0.6583, |
| "num_input_tokens_seen": 514504, |
| "step": 1365 |
| }, |
| { |
| "epoch": 1.2275985663082438, |
| "grad_norm": 0.9974188208580017, |
| "learning_rate": 4.992176326031872e-05, |
| "loss": 0.8578, |
| "num_input_tokens_seen": 516360, |
| "step": 1370 |
| }, |
| { |
| "epoch": 1.232078853046595, |
| "grad_norm": 1.2274900674819946, |
| "learning_rate": 4.991864203704783e-05, |
| "loss": 0.762, |
| "num_input_tokens_seen": 518152, |
| "step": 1375 |
| }, |
| { |
| "epoch": 1.2365591397849462, |
| "grad_norm": 1.3409820795059204, |
| "learning_rate": 4.991545986701611e-05, |
| "loss": 0.7545, |
| "num_input_tokens_seen": 520072, |
| "step": 1380 |
| }, |
| { |
| "epoch": 1.2410394265232976, |
| "grad_norm": 1.1630853414535522, |
| "learning_rate": 4.991221675800662e-05, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 521928, |
| "step": 1385 |
| }, |
| { |
| "epoch": 1.2455197132616487, |
| "grad_norm": 1.7423793077468872, |
| "learning_rate": 4.990891271795145e-05, |
| "loss": 0.8476, |
| "num_input_tokens_seen": 523880, |
| "step": 1390 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.8607897758483887, |
| "learning_rate": 4.99055477549317e-05, |
| "loss": 0.6993, |
| "num_input_tokens_seen": 525832, |
| "step": 1395 |
| }, |
| { |
| "epoch": 1.2544802867383513, |
| "grad_norm": 0.9243642687797546, |
| "learning_rate": 4.990212187717753e-05, |
| "loss": 0.575, |
| "num_input_tokens_seen": 527560, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.2589605734767024, |
| "grad_norm": 0.6754842400550842, |
| "learning_rate": 4.9898635093068036e-05, |
| "loss": 0.7522, |
| "num_input_tokens_seen": 529480, |
| "step": 1405 |
| }, |
| { |
| "epoch": 1.2634408602150538, |
| "grad_norm": 1.0726746320724487, |
| "learning_rate": 4.98950874111313e-05, |
| "loss": 0.9473, |
| "num_input_tokens_seen": 531592, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.267921146953405, |
| "grad_norm": 0.930479109287262, |
| "learning_rate": 4.989147884004435e-05, |
| "loss": 0.7041, |
| "num_input_tokens_seen": 533480, |
| "step": 1415 |
| }, |
| { |
| "epoch": 1.2724014336917562, |
| "grad_norm": 0.8492618203163147, |
| "learning_rate": 4.988780938863314e-05, |
| "loss": 0.6708, |
| "num_input_tokens_seen": 535464, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.2768817204301075, |
| "grad_norm": 1.962171196937561, |
| "learning_rate": 4.9884079065872514e-05, |
| "loss": 0.7281, |
| "num_input_tokens_seen": 537512, |
| "step": 1425 |
| }, |
| { |
| "epoch": 1.2813620071684588, |
| "grad_norm": 1.0054094791412354, |
| "learning_rate": 4.988028788088622e-05, |
| "loss": 0.8596, |
| "num_input_tokens_seen": 539560, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.2858422939068102, |
| "grad_norm": 1.520097017288208, |
| "learning_rate": 4.9876435842946845e-05, |
| "loss": 0.7854, |
| "num_input_tokens_seen": 541448, |
| "step": 1435 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 1.189052939414978, |
| "learning_rate": 4.987252296147582e-05, |
| "loss": 0.6744, |
| "num_input_tokens_seen": 543336, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.2948028673835126, |
| "grad_norm": 1.1547536849975586, |
| "learning_rate": 4.986854924604339e-05, |
| "loss": 0.7349, |
| "num_input_tokens_seen": 545320, |
| "step": 1445 |
| }, |
| { |
| "epoch": 1.2992831541218637, |
| "grad_norm": 1.1133335828781128, |
| "learning_rate": 4.986451470636858e-05, |
| "loss": 0.7514, |
| "num_input_tokens_seen": 547240, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.303763440860215, |
| "grad_norm": 1.0307190418243408, |
| "learning_rate": 4.98604193523192e-05, |
| "loss": 0.7785, |
| "num_input_tokens_seen": 549000, |
| "step": 1455 |
| }, |
| { |
| "epoch": 1.3082437275985663, |
| "grad_norm": 1.0905498266220093, |
| "learning_rate": 4.985626319391178e-05, |
| "loss": 0.7204, |
| "num_input_tokens_seen": 550920, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.3127240143369177, |
| "grad_norm": 0.8858598470687866, |
| "learning_rate": 4.985204624131157e-05, |
| "loss": 0.7684, |
| "num_input_tokens_seen": 552744, |
| "step": 1465 |
| }, |
| { |
| "epoch": 1.3172043010752688, |
| "grad_norm": 1.4811769723892212, |
| "learning_rate": 4.984776850483254e-05, |
| "loss": 0.7303, |
| "num_input_tokens_seen": 554632, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.32168458781362, |
| "grad_norm": 1.123785376548767, |
| "learning_rate": 4.9843429994937284e-05, |
| "loss": 0.6639, |
| "num_input_tokens_seen": 556392, |
| "step": 1475 |
| }, |
| { |
| "epoch": 1.3261648745519714, |
| "grad_norm": 1.258150339126587, |
| "learning_rate": 4.983903072223708e-05, |
| "loss": 0.7914, |
| "num_input_tokens_seen": 558248, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.3306451612903225, |
| "grad_norm": 0.9963940978050232, |
| "learning_rate": 4.983457069749178e-05, |
| "loss": 0.7242, |
| "num_input_tokens_seen": 560200, |
| "step": 1485 |
| }, |
| { |
| "epoch": 1.3351254480286738, |
| "grad_norm": 1.2597713470458984, |
| "learning_rate": 4.983004993160986e-05, |
| "loss": 0.7417, |
| "num_input_tokens_seen": 562024, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.3396057347670252, |
| "grad_norm": 0.9808022975921631, |
| "learning_rate": 4.982546843564834e-05, |
| "loss": 0.7227, |
| "num_input_tokens_seen": 563848, |
| "step": 1495 |
| }, |
| { |
| "epoch": 1.3440860215053765, |
| "grad_norm": 1.9162483215332031, |
| "learning_rate": 4.982082622081279e-05, |
| "loss": 0.7643, |
| "num_input_tokens_seen": 565832, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.3485663082437276, |
| "grad_norm": 1.5381019115447998, |
| "learning_rate": 4.981612329845726e-05, |
| "loss": 0.7185, |
| "num_input_tokens_seen": 567688, |
| "step": 1505 |
| }, |
| { |
| "epoch": 1.353046594982079, |
| "grad_norm": 0.8574863076210022, |
| "learning_rate": 4.98113596800843e-05, |
| "loss": 0.6938, |
| "num_input_tokens_seen": 569544, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.35752688172043, |
| "grad_norm": 1.4968950748443604, |
| "learning_rate": 4.980653537734493e-05, |
| "loss": 0.7329, |
| "num_input_tokens_seen": 571432, |
| "step": 1515 |
| }, |
| { |
| "epoch": 1.3620071684587813, |
| "grad_norm": 1.1606332063674927, |
| "learning_rate": 4.9801650402038555e-05, |
| "loss": 0.7442, |
| "num_input_tokens_seen": 573256, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.3664874551971327, |
| "grad_norm": 1.2675437927246094, |
| "learning_rate": 4.979670476611301e-05, |
| "loss": 0.763, |
| "num_input_tokens_seen": 575208, |
| "step": 1525 |
| }, |
| { |
| "epoch": 1.370967741935484, |
| "grad_norm": 1.4575673341751099, |
| "learning_rate": 4.979169848166446e-05, |
| "loss": 0.8159, |
| "num_input_tokens_seen": 576936, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.375448028673835, |
| "grad_norm": 0.9934259653091431, |
| "learning_rate": 4.978663156093744e-05, |
| "loss": 0.6303, |
| "num_input_tokens_seen": 579016, |
| "step": 1535 |
| }, |
| { |
| "epoch": 1.3799283154121864, |
| "grad_norm": 0.9751392006874084, |
| "learning_rate": 4.978150401632477e-05, |
| "loss": 0.7635, |
| "num_input_tokens_seen": 580872, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.3844086021505375, |
| "grad_norm": 1.1100707054138184, |
| "learning_rate": 4.9776315860367564e-05, |
| "loss": 0.7501, |
| "num_input_tokens_seen": 582696, |
| "step": 1545 |
| }, |
| { |
| "epoch": 1.3888888888888888, |
| "grad_norm": 0.8194773197174072, |
| "learning_rate": 4.9771067105755145e-05, |
| "loss": 0.7135, |
| "num_input_tokens_seen": 584456, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.3933691756272402, |
| "grad_norm": 0.9508175253868103, |
| "learning_rate": 4.976575776532509e-05, |
| "loss": 0.7308, |
| "num_input_tokens_seen": 586408, |
| "step": 1555 |
| }, |
| { |
| "epoch": 1.3978494623655915, |
| "grad_norm": 1.265271782875061, |
| "learning_rate": 4.976038785206315e-05, |
| "loss": 0.7841, |
| "num_input_tokens_seen": 588296, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.4023297491039426, |
| "grad_norm": 1.7464754581451416, |
| "learning_rate": 4.9754957379103205e-05, |
| "loss": 0.7915, |
| "num_input_tokens_seen": 590280, |
| "step": 1565 |
| }, |
| { |
| "epoch": 1.406810035842294, |
| "grad_norm": 1.1853801012039185, |
| "learning_rate": 4.974946635972728e-05, |
| "loss": 0.7529, |
| "num_input_tokens_seen": 592104, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.4112903225806452, |
| "grad_norm": 0.9984325766563416, |
| "learning_rate": 4.974391480736546e-05, |
| "loss": 0.7774, |
| "num_input_tokens_seen": 593960, |
| "step": 1575 |
| }, |
| { |
| "epoch": 1.4157706093189963, |
| "grad_norm": 0.6899164915084839, |
| "learning_rate": 4.973830273559591e-05, |
| "loss": 0.7263, |
| "num_input_tokens_seen": 595720, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.4202508960573477, |
| "grad_norm": 1.1132155656814575, |
| "learning_rate": 4.97326301581448e-05, |
| "loss": 0.7447, |
| "num_input_tokens_seen": 597704, |
| "step": 1585 |
| }, |
| { |
| "epoch": 1.424731182795699, |
| "grad_norm": 1.076192855834961, |
| "learning_rate": 4.9726897088886294e-05, |
| "loss": 0.7111, |
| "num_input_tokens_seen": 599560, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.4292114695340503, |
| "grad_norm": 1.03714120388031, |
| "learning_rate": 4.972110354184249e-05, |
| "loss": 0.7496, |
| "num_input_tokens_seen": 601384, |
| "step": 1595 |
| }, |
| { |
| "epoch": 1.4336917562724014, |
| "grad_norm": 1.5453832149505615, |
| "learning_rate": 4.971524953118344e-05, |
| "loss": 0.7348, |
| "num_input_tokens_seen": 603176, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.4381720430107527, |
| "grad_norm": 0.8860936164855957, |
| "learning_rate": 4.9709335071227046e-05, |
| "loss": 0.7677, |
| "num_input_tokens_seen": 605064, |
| "step": 1605 |
| }, |
| { |
| "epoch": 1.4426523297491038, |
| "grad_norm": 1.1750874519348145, |
| "learning_rate": 4.970336017643907e-05, |
| "loss": 0.7877, |
| "num_input_tokens_seen": 606920, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.4471326164874552, |
| "grad_norm": 0.8579301834106445, |
| "learning_rate": 4.969732486143309e-05, |
| "loss": 0.7482, |
| "num_input_tokens_seen": 608712, |
| "step": 1615 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 1.032989501953125, |
| "learning_rate": 4.969122914097046e-05, |
| "loss": 0.7113, |
| "num_input_tokens_seen": 610600, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.4560931899641578, |
| "grad_norm": 0.986748218536377, |
| "learning_rate": 4.968507302996029e-05, |
| "loss": 0.7629, |
| "num_input_tokens_seen": 612488, |
| "step": 1625 |
| }, |
| { |
| "epoch": 1.460573476702509, |
| "grad_norm": 0.9742453098297119, |
| "learning_rate": 4.967885654345936e-05, |
| "loss": 0.7882, |
| "num_input_tokens_seen": 614344, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.4650537634408602, |
| "grad_norm": 1.1552369594573975, |
| "learning_rate": 4.9672579696672136e-05, |
| "loss": 0.7541, |
| "num_input_tokens_seen": 616168, |
| "step": 1635 |
| }, |
| { |
| "epoch": 1.4695340501792113, |
| "grad_norm": 0.7690085172653198, |
| "learning_rate": 4.966624250495075e-05, |
| "loss": 0.6982, |
| "num_input_tokens_seen": 618024, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.4740143369175627, |
| "grad_norm": 1.2485721111297607, |
| "learning_rate": 4.9659844983794855e-05, |
| "loss": 0.7561, |
| "num_input_tokens_seen": 619848, |
| "step": 1645 |
| }, |
| { |
| "epoch": 1.478494623655914, |
| "grad_norm": 0.8757264614105225, |
| "learning_rate": 4.965338714885173e-05, |
| "loss": 0.6676, |
| "num_input_tokens_seen": 621576, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.4829749103942653, |
| "grad_norm": 1.2202672958374023, |
| "learning_rate": 4.964686901591612e-05, |
| "loss": 0.7544, |
| "num_input_tokens_seen": 623592, |
| "step": 1655 |
| }, |
| { |
| "epoch": 1.4874551971326164, |
| "grad_norm": 1.16517174243927, |
| "learning_rate": 4.964029060093029e-05, |
| "loss": 0.6739, |
| "num_input_tokens_seen": 625384, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.4919354838709677, |
| "grad_norm": 0.7608532309532166, |
| "learning_rate": 4.96336519199839e-05, |
| "loss": 0.7081, |
| "num_input_tokens_seen": 627400, |
| "step": 1665 |
| }, |
| { |
| "epoch": 1.496415770609319, |
| "grad_norm": 0.8713858723640442, |
| "learning_rate": 4.9626952989314065e-05, |
| "loss": 0.7929, |
| "num_input_tokens_seen": 629192, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 0.7311503291130066, |
| "eval_runtime": 5.6351, |
| "eval_samples_per_second": 88.019, |
| "eval_steps_per_second": 22.005, |
| "num_input_tokens_seen": 630888, |
| "step": 1674 |
| }, |
| { |
| "epoch": 1.5008960573476702, |
| "grad_norm": 1.2036266326904297, |
| "learning_rate": 4.962019382530521e-05, |
| "loss": 0.7689, |
| "num_input_tokens_seen": 631336, |
| "step": 1675 |
| }, |
| { |
| "epoch": 1.5053763440860215, |
| "grad_norm": 1.4354534149169922, |
| "learning_rate": 4.9613374444489095e-05, |
| "loss": 0.7565, |
| "num_input_tokens_seen": 633160, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.5098566308243728, |
| "grad_norm": 0.8775255680084229, |
| "learning_rate": 4.960649486354478e-05, |
| "loss": 0.815, |
| "num_input_tokens_seen": 635176, |
| "step": 1685 |
| }, |
| { |
| "epoch": 1.5143369175627241, |
| "grad_norm": 0.6111800670623779, |
| "learning_rate": 4.959955509929854e-05, |
| "loss": 0.6656, |
| "num_input_tokens_seen": 636936, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.5188172043010753, |
| "grad_norm": 1.351608157157898, |
| "learning_rate": 4.9592555168723875e-05, |
| "loss": 0.8207, |
| "num_input_tokens_seen": 638984, |
| "step": 1695 |
| }, |
| { |
| "epoch": 1.5232974910394266, |
| "grad_norm": 1.0694748163223267, |
| "learning_rate": 4.95854950889414e-05, |
| "loss": 0.7403, |
| "num_input_tokens_seen": 640904, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.5277777777777777, |
| "grad_norm": 0.8151997923851013, |
| "learning_rate": 4.957837487721889e-05, |
| "loss": 0.7493, |
| "num_input_tokens_seen": 642792, |
| "step": 1705 |
| }, |
| { |
| "epoch": 1.532258064516129, |
| "grad_norm": 0.8805826306343079, |
| "learning_rate": 4.957119455097117e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 644680, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.5367383512544803, |
| "grad_norm": 0.7584099769592285, |
| "learning_rate": 4.956395412776008e-05, |
| "loss": 0.7322, |
| "num_input_tokens_seen": 646504, |
| "step": 1715 |
| }, |
| { |
| "epoch": 1.5412186379928317, |
| "grad_norm": 0.8927046060562134, |
| "learning_rate": 4.955665362529448e-05, |
| "loss": 0.7146, |
| "num_input_tokens_seen": 648520, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.5456989247311828, |
| "grad_norm": 1.0460692644119263, |
| "learning_rate": 4.954929306143016e-05, |
| "loss": 0.7403, |
| "num_input_tokens_seen": 650312, |
| "step": 1725 |
| }, |
| { |
| "epoch": 1.550179211469534, |
| "grad_norm": 0.9621413350105286, |
| "learning_rate": 4.9541872454169794e-05, |
| "loss": 0.685, |
| "num_input_tokens_seen": 652200, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.5546594982078852, |
| "grad_norm": 1.074214220046997, |
| "learning_rate": 4.953439182166293e-05, |
| "loss": 0.806, |
| "num_input_tokens_seen": 654280, |
| "step": 1735 |
| }, |
| { |
| "epoch": 1.5591397849462365, |
| "grad_norm": 0.9384589195251465, |
| "learning_rate": 4.952685118220593e-05, |
| "loss": 0.7478, |
| "num_input_tokens_seen": 656168, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.5636200716845878, |
| "grad_norm": 0.9732498526573181, |
| "learning_rate": 4.951925055424191e-05, |
| "loss": 0.7457, |
| "num_input_tokens_seen": 657992, |
| "step": 1745 |
| }, |
| { |
| "epoch": 1.5681003584229392, |
| "grad_norm": 0.9830466508865356, |
| "learning_rate": 4.951158995636071e-05, |
| "loss": 0.7573, |
| "num_input_tokens_seen": 659720, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.5725806451612905, |
| "grad_norm": 0.8285174369812012, |
| "learning_rate": 4.9503869407298856e-05, |
| "loss": 0.7618, |
| "num_input_tokens_seen": 661768, |
| "step": 1755 |
| }, |
| { |
| "epoch": 1.5770609318996416, |
| "grad_norm": 0.9129826426506042, |
| "learning_rate": 4.94960889259395e-05, |
| "loss": 0.7081, |
| "num_input_tokens_seen": 663592, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.5815412186379927, |
| "grad_norm": 0.8629391193389893, |
| "learning_rate": 4.948824853131236e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 665384, |
| "step": 1765 |
| }, |
| { |
| "epoch": 1.586021505376344, |
| "grad_norm": 0.7808408737182617, |
| "learning_rate": 4.948034824259373e-05, |
| "loss": 0.6774, |
| "num_input_tokens_seen": 667400, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.5905017921146953, |
| "grad_norm": 1.3148083686828613, |
| "learning_rate": 4.947238807910637e-05, |
| "loss": 0.7496, |
| "num_input_tokens_seen": 669192, |
| "step": 1775 |
| }, |
| { |
| "epoch": 1.5949820788530467, |
| "grad_norm": 0.9059514403343201, |
| "learning_rate": 4.9464368060319465e-05, |
| "loss": 0.7509, |
| "num_input_tokens_seen": 671112, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.599462365591398, |
| "grad_norm": 0.9348905682563782, |
| "learning_rate": 4.9456288205848634e-05, |
| "loss": 0.714, |
| "num_input_tokens_seen": 672968, |
| "step": 1785 |
| }, |
| { |
| "epoch": 1.603942652329749, |
| "grad_norm": 0.8638232350349426, |
| "learning_rate": 4.944814853545583e-05, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 674792, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.6084229390681004, |
| "grad_norm": 1.297763466835022, |
| "learning_rate": 4.9439949069049294e-05, |
| "loss": 0.7983, |
| "num_input_tokens_seen": 676776, |
| "step": 1795 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 1.1472688913345337, |
| "learning_rate": 4.943168982668352e-05, |
| "loss": 0.7423, |
| "num_input_tokens_seen": 678760, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.6173835125448028, |
| "grad_norm": 1.002719759941101, |
| "learning_rate": 4.9423370828559236e-05, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 680584, |
| "step": 1805 |
| }, |
| { |
| "epoch": 1.6218637992831542, |
| "grad_norm": 0.9048038125038147, |
| "learning_rate": 4.941499209502327e-05, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 682504, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.6263440860215055, |
| "grad_norm": 0.7667252421379089, |
| "learning_rate": 4.9406553646568594e-05, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 684424, |
| "step": 1815 |
| }, |
| { |
| "epoch": 1.6308243727598566, |
| "grad_norm": 0.767292320728302, |
| "learning_rate": 4.939805550383421e-05, |
| "loss": 0.722, |
| "num_input_tokens_seen": 686248, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.635304659498208, |
| "grad_norm": 0.9602553844451904, |
| "learning_rate": 4.9389497687605135e-05, |
| "loss": 0.7781, |
| "num_input_tokens_seen": 688104, |
| "step": 1825 |
| }, |
| { |
| "epoch": 1.639784946236559, |
| "grad_norm": 0.936279296875, |
| "learning_rate": 4.938088021881233e-05, |
| "loss": 0.7545, |
| "num_input_tokens_seen": 689896, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.6442652329749103, |
| "grad_norm": 1.1077156066894531, |
| "learning_rate": 4.9372203118532655e-05, |
| "loss": 0.6995, |
| "num_input_tokens_seen": 691912, |
| "step": 1835 |
| }, |
| { |
| "epoch": 1.6487455197132617, |
| "grad_norm": 0.8677238821983337, |
| "learning_rate": 4.936346640798883e-05, |
| "loss": 0.7406, |
| "num_input_tokens_seen": 693640, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.653225806451613, |
| "grad_norm": 0.5871409773826599, |
| "learning_rate": 4.935467010854936e-05, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 695528, |
| "step": 1845 |
| }, |
| { |
| "epoch": 1.6577060931899643, |
| "grad_norm": 0.7833765745162964, |
| "learning_rate": 4.9345814241728495e-05, |
| "loss": 0.6938, |
| "num_input_tokens_seen": 697256, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.6621863799283154, |
| "grad_norm": 0.8606950044631958, |
| "learning_rate": 4.933689882918618e-05, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 699112, |
| "step": 1855 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 1.0058404207229614, |
| "learning_rate": 4.9327923892728e-05, |
| "loss": 0.7113, |
| "num_input_tokens_seen": 700904, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.6711469534050178, |
| "grad_norm": 0.9566648602485657, |
| "learning_rate": 4.9318889454305115e-05, |
| "loss": 0.6834, |
| "num_input_tokens_seen": 702632, |
| "step": 1865 |
| }, |
| { |
| "epoch": 1.6756272401433692, |
| "grad_norm": 0.9307464957237244, |
| "learning_rate": 4.930979553601423e-05, |
| "loss": 0.7581, |
| "num_input_tokens_seen": 704680, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.6801075268817205, |
| "grad_norm": 1.2653905153274536, |
| "learning_rate": 4.930064216009754e-05, |
| "loss": 0.7228, |
| "num_input_tokens_seen": 706792, |
| "step": 1875 |
| }, |
| { |
| "epoch": 1.6845878136200718, |
| "grad_norm": 0.9056828618049622, |
| "learning_rate": 4.929142934894262e-05, |
| "loss": 0.7146, |
| "num_input_tokens_seen": 708552, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.689068100358423, |
| "grad_norm": 0.7426489591598511, |
| "learning_rate": 4.928215712508245e-05, |
| "loss": 0.7212, |
| "num_input_tokens_seen": 710568, |
| "step": 1885 |
| }, |
| { |
| "epoch": 1.6935483870967742, |
| "grad_norm": 0.914968729019165, |
| "learning_rate": 4.9272825511195316e-05, |
| "loss": 0.664, |
| "num_input_tokens_seen": 712488, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.6980286738351253, |
| "grad_norm": 1.0682841539382935, |
| "learning_rate": 4.9263434530104755e-05, |
| "loss": 0.7293, |
| "num_input_tokens_seen": 714440, |
| "step": 1895 |
| }, |
| { |
| "epoch": 1.7025089605734767, |
| "grad_norm": 0.7572513222694397, |
| "learning_rate": 4.92539842047795e-05, |
| "loss": 0.713, |
| "num_input_tokens_seen": 716328, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.706989247311828, |
| "grad_norm": 1.7348966598510742, |
| "learning_rate": 4.924447455833346e-05, |
| "loss": 0.8395, |
| "num_input_tokens_seen": 718216, |
| "step": 1905 |
| }, |
| { |
| "epoch": 1.7114695340501793, |
| "grad_norm": 0.8259839415550232, |
| "learning_rate": 4.9234905614025594e-05, |
| "loss": 0.7374, |
| "num_input_tokens_seen": 719976, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.7159498207885304, |
| "grad_norm": 0.9698778986930847, |
| "learning_rate": 4.922527739525993e-05, |
| "loss": 0.6721, |
| "num_input_tokens_seen": 721928, |
| "step": 1915 |
| }, |
| { |
| "epoch": 1.7204301075268817, |
| "grad_norm": 0.6747263669967651, |
| "learning_rate": 4.9215589925585434e-05, |
| "loss": 0.7525, |
| "num_input_tokens_seen": 723752, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.7249103942652328, |
| "grad_norm": 0.4461309313774109, |
| "learning_rate": 4.9205843228696036e-05, |
| "loss": 0.7114, |
| "num_input_tokens_seen": 725480, |
| "step": 1925 |
| }, |
| { |
| "epoch": 1.7293906810035842, |
| "grad_norm": 0.6168416142463684, |
| "learning_rate": 4.9196037328430475e-05, |
| "loss": 0.6724, |
| "num_input_tokens_seen": 727400, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.7338709677419355, |
| "grad_norm": 0.8838000297546387, |
| "learning_rate": 4.918617224877232e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 729576, |
| "step": 1935 |
| }, |
| { |
| "epoch": 1.7383512544802868, |
| "grad_norm": 0.8864124417304993, |
| "learning_rate": 4.917624801384988e-05, |
| "loss": 0.7149, |
| "num_input_tokens_seen": 731528, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.7428315412186381, |
| "grad_norm": 0.8120636343955994, |
| "learning_rate": 4.916626464793616e-05, |
| "loss": 0.7634, |
| "num_input_tokens_seen": 733448, |
| "step": 1945 |
| }, |
| { |
| "epoch": 1.7473118279569892, |
| "grad_norm": 0.9845364689826965, |
| "learning_rate": 4.915622217544875e-05, |
| "loss": 0.7203, |
| "num_input_tokens_seen": 735304, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.7517921146953404, |
| "grad_norm": 0.8506798148155212, |
| "learning_rate": 4.9146120620949854e-05, |
| "loss": 0.7283, |
| "num_input_tokens_seen": 737032, |
| "step": 1955 |
| }, |
| { |
| "epoch": 1.7562724014336917, |
| "grad_norm": 0.7311829328536987, |
| "learning_rate": 4.9135960009146135e-05, |
| "loss": 0.7221, |
| "num_input_tokens_seen": 738856, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.760752688172043, |
| "grad_norm": 0.9475505948066711, |
| "learning_rate": 4.912574036488874e-05, |
| "loss": 0.7073, |
| "num_input_tokens_seen": 740712, |
| "step": 1965 |
| }, |
| { |
| "epoch": 1.7652329749103943, |
| "grad_norm": 1.0571333169937134, |
| "learning_rate": 4.9115461713173174e-05, |
| "loss": 0.7287, |
| "num_input_tokens_seen": 742568, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.7697132616487457, |
| "grad_norm": 1.2273980379104614, |
| "learning_rate": 4.910512407913926e-05, |
| "loss": 0.7846, |
| "num_input_tokens_seen": 744584, |
| "step": 1975 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 0.6895895600318909, |
| "learning_rate": 4.9094727488071114e-05, |
| "loss": 0.7451, |
| "num_input_tokens_seen": 746376, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.778673835125448, |
| "grad_norm": 0.7869203686714172, |
| "learning_rate": 4.9084271965397014e-05, |
| "loss": 0.8673, |
| "num_input_tokens_seen": 748488, |
| "step": 1985 |
| }, |
| { |
| "epoch": 1.7831541218637992, |
| "grad_norm": 1.0943074226379395, |
| "learning_rate": 4.907375753668939e-05, |
| "loss": 0.7509, |
| "num_input_tokens_seen": 750376, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.7876344086021505, |
| "grad_norm": 0.6461062431335449, |
| "learning_rate": 4.906318422766476e-05, |
| "loss": 0.6848, |
| "num_input_tokens_seen": 752104, |
| "step": 1995 |
| }, |
| { |
| "epoch": 1.7921146953405018, |
| "grad_norm": 0.6915934085845947, |
| "learning_rate": 4.9052552064183624e-05, |
| "loss": 0.7169, |
| "num_input_tokens_seen": 753960, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.7965949820788532, |
| "grad_norm": 0.842902660369873, |
| "learning_rate": 4.904186107225046e-05, |
| "loss": 0.7407, |
| "num_input_tokens_seen": 755944, |
| "step": 2005 |
| }, |
| { |
| "epoch": 1.8010752688172043, |
| "grad_norm": 1.1747803688049316, |
| "learning_rate": 4.903111127801361e-05, |
| "loss": 0.703, |
| "num_input_tokens_seen": 757832, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.8055555555555556, |
| "grad_norm": 0.8623597621917725, |
| "learning_rate": 4.902030270776524e-05, |
| "loss": 0.7459, |
| "num_input_tokens_seen": 759816, |
| "step": 2015 |
| }, |
| { |
| "epoch": 1.8100358422939067, |
| "grad_norm": 0.697680652141571, |
| "learning_rate": 4.9009435387941274e-05, |
| "loss": 0.696, |
| "num_input_tokens_seen": 761640, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.814516129032258, |
| "grad_norm": 0.7466960549354553, |
| "learning_rate": 4.899850934512134e-05, |
| "loss": 0.756, |
| "num_input_tokens_seen": 763400, |
| "step": 2025 |
| }, |
| { |
| "epoch": 1.8189964157706093, |
| "grad_norm": 0.8614512085914612, |
| "learning_rate": 4.898752460602866e-05, |
| "loss": 0.7388, |
| "num_input_tokens_seen": 765288, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.8234767025089607, |
| "grad_norm": 0.7374971508979797, |
| "learning_rate": 4.897648119753006e-05, |
| "loss": 0.7338, |
| "num_input_tokens_seen": 767144, |
| "step": 2035 |
| }, |
| { |
| "epoch": 1.827956989247312, |
| "grad_norm": 1.1002193689346313, |
| "learning_rate": 4.8965379146635816e-05, |
| "loss": 0.7455, |
| "num_input_tokens_seen": 769064, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.832437275985663, |
| "grad_norm": 0.6678388118743896, |
| "learning_rate": 4.895421848049968e-05, |
| "loss": 0.7326, |
| "num_input_tokens_seen": 770856, |
| "step": 2045 |
| }, |
| { |
| "epoch": 1.8369175627240142, |
| "grad_norm": 0.7533695697784424, |
| "learning_rate": 4.894299922641873e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 772776, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.8413978494623655, |
| "grad_norm": 0.649446964263916, |
| "learning_rate": 4.893172141183335e-05, |
| "loss": 0.7192, |
| "num_input_tokens_seen": 774728, |
| "step": 2055 |
| }, |
| { |
| "epoch": 1.8458781362007168, |
| "grad_norm": 1.1061700582504272, |
| "learning_rate": 4.892038506432717e-05, |
| "loss": 0.6923, |
| "num_input_tokens_seen": 776712, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.8503584229390682, |
| "grad_norm": 0.44784870743751526, |
| "learning_rate": 4.890899021162696e-05, |
| "loss": 0.7245, |
| "num_input_tokens_seen": 778472, |
| "step": 2065 |
| }, |
| { |
| "epoch": 1.8548387096774195, |
| "grad_norm": 0.8359610438346863, |
| "learning_rate": 4.8897536881602594e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 780360, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.8593189964157706, |
| "grad_norm": 0.5439364910125732, |
| "learning_rate": 4.888602510226697e-05, |
| "loss": 0.7232, |
| "num_input_tokens_seen": 782280, |
| "step": 2075 |
| }, |
| { |
| "epoch": 1.863799283154122, |
| "grad_norm": 0.8178055882453918, |
| "learning_rate": 4.8874454901775936e-05, |
| "loss": 0.7403, |
| "num_input_tokens_seen": 784136, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.868279569892473, |
| "grad_norm": 0.6543580889701843, |
| "learning_rate": 4.8862826308428244e-05, |
| "loss": 0.6871, |
| "num_input_tokens_seen": 785960, |
| "step": 2085 |
| }, |
| { |
| "epoch": 1.8727598566308243, |
| "grad_norm": 0.7824105024337769, |
| "learning_rate": 4.885113935066545e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 787848, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.8772401433691757, |
| "grad_norm": 1.0091453790664673, |
| "learning_rate": 4.883939405707186e-05, |
| "loss": 0.68, |
| "num_input_tokens_seen": 789800, |
| "step": 2095 |
| }, |
| { |
| "epoch": 1.881720430107527, |
| "grad_norm": 0.7868355512619019, |
| "learning_rate": 4.882759045637449e-05, |
| "loss": 0.71, |
| "num_input_tokens_seen": 791592, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.886200716845878, |
| "grad_norm": 1.3903027772903442, |
| "learning_rate": 4.88157285774429e-05, |
| "loss": 0.7023, |
| "num_input_tokens_seen": 793544, |
| "step": 2105 |
| }, |
| { |
| "epoch": 1.8906810035842294, |
| "grad_norm": 0.9901719689369202, |
| "learning_rate": 4.8803808449289264e-05, |
| "loss": 0.7471, |
| "num_input_tokens_seen": 795368, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.8951612903225805, |
| "grad_norm": 0.870309054851532, |
| "learning_rate": 4.879183010106817e-05, |
| "loss": 0.7189, |
| "num_input_tokens_seen": 797128, |
| "step": 2115 |
| }, |
| { |
| "epoch": 1.8996415770609318, |
| "grad_norm": 0.7967744469642639, |
| "learning_rate": 4.877979356207663e-05, |
| "loss": 0.7321, |
| "num_input_tokens_seen": 798888, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.9041218637992832, |
| "grad_norm": 1.005323886871338, |
| "learning_rate": 4.876769886175396e-05, |
| "loss": 0.6991, |
| "num_input_tokens_seen": 800616, |
| "step": 2125 |
| }, |
| { |
| "epoch": 1.9086021505376345, |
| "grad_norm": 0.9401888251304626, |
| "learning_rate": 4.8755546029681746e-05, |
| "loss": 0.7486, |
| "num_input_tokens_seen": 802472, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.9130824372759858, |
| "grad_norm": 0.8822266459465027, |
| "learning_rate": 4.874333509558375e-05, |
| "loss": 0.6868, |
| "num_input_tokens_seen": 804328, |
| "step": 2135 |
| }, |
| { |
| "epoch": 1.917562724014337, |
| "grad_norm": 1.493293285369873, |
| "learning_rate": 4.873106608932585e-05, |
| "loss": 0.6651, |
| "num_input_tokens_seen": 806152, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.922043010752688, |
| "grad_norm": 1.1211999654769897, |
| "learning_rate": 4.871873904091593e-05, |
| "loss": 0.7402, |
| "num_input_tokens_seen": 808168, |
| "step": 2145 |
| }, |
| { |
| "epoch": 1.9265232974910393, |
| "grad_norm": 0.7824253439903259, |
| "learning_rate": 4.870635398050387e-05, |
| "loss": 0.7018, |
| "num_input_tokens_seen": 810056, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.9310035842293907, |
| "grad_norm": 0.8798695802688599, |
| "learning_rate": 4.8693910938381404e-05, |
| "loss": 0.6747, |
| "num_input_tokens_seen": 812008, |
| "step": 2155 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 0.33410248160362244, |
| "learning_rate": 4.868140994498211e-05, |
| "loss": 0.7464, |
| "num_input_tokens_seen": 813736, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.9399641577060933, |
| "grad_norm": 0.9553130865097046, |
| "learning_rate": 4.86688510308813e-05, |
| "loss": 0.7589, |
| "num_input_tokens_seen": 815752, |
| "step": 2165 |
| }, |
| { |
| "epoch": 1.9444444444444444, |
| "grad_norm": 0.8246096968650818, |
| "learning_rate": 4.865623422679593e-05, |
| "loss": 0.7059, |
| "num_input_tokens_seen": 817544, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.9489247311827957, |
| "grad_norm": 0.8371939063072205, |
| "learning_rate": 4.864355956358454e-05, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 819432, |
| "step": 2175 |
| }, |
| { |
| "epoch": 1.9534050179211468, |
| "grad_norm": 1.350062608718872, |
| "learning_rate": 4.8630827072247244e-05, |
| "loss": 0.7161, |
| "num_input_tokens_seen": 821224, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.9578853046594982, |
| "grad_norm": 0.7525457143783569, |
| "learning_rate": 4.8618036783925516e-05, |
| "loss": 0.7676, |
| "num_input_tokens_seen": 822984, |
| "step": 2185 |
| }, |
| { |
| "epoch": 1.9623655913978495, |
| "grad_norm": 1.0451446771621704, |
| "learning_rate": 4.860518872990223e-05, |
| "loss": 0.751, |
| "num_input_tokens_seen": 824968, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.9668458781362008, |
| "grad_norm": 0.6209150552749634, |
| "learning_rate": 4.859228294160155e-05, |
| "loss": 0.7335, |
| "num_input_tokens_seen": 826984, |
| "step": 2195 |
| }, |
| { |
| "epoch": 1.971326164874552, |
| "grad_norm": 0.9934207201004028, |
| "learning_rate": 4.857931945058884e-05, |
| "loss": 0.7283, |
| "num_input_tokens_seen": 828872, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.9758064516129032, |
| "grad_norm": 0.7372912168502808, |
| "learning_rate": 4.856629828857059e-05, |
| "loss": 0.7357, |
| "num_input_tokens_seen": 830760, |
| "step": 2205 |
| }, |
| { |
| "epoch": 1.9802867383512543, |
| "grad_norm": 0.6201682090759277, |
| "learning_rate": 4.855321948739435e-05, |
| "loss": 0.6704, |
| "num_input_tokens_seen": 832712, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.9847670250896057, |
| "grad_norm": 0.7720775604248047, |
| "learning_rate": 4.8540083079048645e-05, |
| "loss": 0.7008, |
| "num_input_tokens_seen": 834696, |
| "step": 2215 |
| }, |
| { |
| "epoch": 1.989247311827957, |
| "grad_norm": 1.0468122959136963, |
| "learning_rate": 4.85268890956629e-05, |
| "loss": 0.7516, |
| "num_input_tokens_seen": 836648, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.9937275985663083, |
| "grad_norm": 1.0690462589263916, |
| "learning_rate": 4.851363756950733e-05, |
| "loss": 0.7224, |
| "num_input_tokens_seen": 838760, |
| "step": 2225 |
| }, |
| { |
| "epoch": 1.9982078853046596, |
| "grad_norm": 0.727118968963623, |
| "learning_rate": 4.8500328532992945e-05, |
| "loss": 0.7296, |
| "num_input_tokens_seen": 840584, |
| "step": 2230 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.7162764072418213, |
| "eval_runtime": 5.6096, |
| "eval_samples_per_second": 88.419, |
| "eval_steps_per_second": 22.105, |
| "num_input_tokens_seen": 841024, |
| "step": 2232 |
| }, |
| { |
| "epoch": 2.002688172043011, |
| "grad_norm": 1.104372262954712, |
| "learning_rate": 4.848696201867138e-05, |
| "loss": 0.7025, |
| "num_input_tokens_seen": 842272, |
| "step": 2235 |
| }, |
| { |
| "epoch": 2.007168458781362, |
| "grad_norm": 0.6588714122772217, |
| "learning_rate": 4.847353805923484e-05, |
| "loss": 0.7179, |
| "num_input_tokens_seen": 844000, |
| "step": 2240 |
| }, |
| { |
| "epoch": 2.011648745519713, |
| "grad_norm": 0.5846701860427856, |
| "learning_rate": 4.846005668751605e-05, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 845760, |
| "step": 2245 |
| }, |
| { |
| "epoch": 2.0161290322580645, |
| "grad_norm": 0.8954629302024841, |
| "learning_rate": 4.844651793648817e-05, |
| "loss": 0.7324, |
| "num_input_tokens_seen": 847776, |
| "step": 2250 |
| }, |
| { |
| "epoch": 2.020609318996416, |
| "grad_norm": 1.2610913515090942, |
| "learning_rate": 4.843292183926466e-05, |
| "loss": 0.7149, |
| "num_input_tokens_seen": 849728, |
| "step": 2255 |
| }, |
| { |
| "epoch": 2.025089605734767, |
| "grad_norm": 0.9264258742332458, |
| "learning_rate": 4.841926842909928e-05, |
| "loss": 0.7172, |
| "num_input_tokens_seen": 851584, |
| "step": 2260 |
| }, |
| { |
| "epoch": 2.0295698924731185, |
| "grad_norm": 0.6855342388153076, |
| "learning_rate": 4.840555773938594e-05, |
| "loss": 0.7209, |
| "num_input_tokens_seen": 853408, |
| "step": 2265 |
| }, |
| { |
| "epoch": 2.0340501792114694, |
| "grad_norm": 0.8659264445304871, |
| "learning_rate": 4.839178980365866e-05, |
| "loss": 0.6889, |
| "num_input_tokens_seen": 855360, |
| "step": 2270 |
| }, |
| { |
| "epoch": 2.0385304659498207, |
| "grad_norm": 1.0739582777023315, |
| "learning_rate": 4.8377964655591465e-05, |
| "loss": 0.7711, |
| "num_input_tokens_seen": 857504, |
| "step": 2275 |
| }, |
| { |
| "epoch": 2.043010752688172, |
| "grad_norm": 1.0123777389526367, |
| "learning_rate": 4.8364082328998314e-05, |
| "loss": 0.7176, |
| "num_input_tokens_seen": 859360, |
| "step": 2280 |
| }, |
| { |
| "epoch": 2.0474910394265233, |
| "grad_norm": 0.9168586134910583, |
| "learning_rate": 4.835014285783303e-05, |
| "loss": 0.7695, |
| "num_input_tokens_seen": 861312, |
| "step": 2285 |
| }, |
| { |
| "epoch": 2.0519713261648747, |
| "grad_norm": 0.8716160655021667, |
| "learning_rate": 4.833614627618918e-05, |
| "loss": 0.7196, |
| "num_input_tokens_seen": 863168, |
| "step": 2290 |
| }, |
| { |
| "epoch": 2.056451612903226, |
| "grad_norm": 0.6699190139770508, |
| "learning_rate": 4.832209261830002e-05, |
| "loss": 0.7157, |
| "num_input_tokens_seen": 865184, |
| "step": 2295 |
| }, |
| { |
| "epoch": 2.060931899641577, |
| "grad_norm": 0.715002715587616, |
| "learning_rate": 4.8307981918538405e-05, |
| "loss": 0.6378, |
| "num_input_tokens_seen": 867168, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.065412186379928, |
| "grad_norm": 0.9280583262443542, |
| "learning_rate": 4.829381421141671e-05, |
| "loss": 0.6811, |
| "num_input_tokens_seen": 869056, |
| "step": 2305 |
| }, |
| { |
| "epoch": 2.0698924731182795, |
| "grad_norm": 0.8694478273391724, |
| "learning_rate": 4.827958953158675e-05, |
| "loss": 0.734, |
| "num_input_tokens_seen": 870816, |
| "step": 2310 |
| }, |
| { |
| "epoch": 2.074372759856631, |
| "grad_norm": 0.6347289681434631, |
| "learning_rate": 4.8265307913839655e-05, |
| "loss": 0.716, |
| "num_input_tokens_seen": 872736, |
| "step": 2315 |
| }, |
| { |
| "epoch": 2.078853046594982, |
| "grad_norm": 0.4250599443912506, |
| "learning_rate": 4.825096939310584e-05, |
| "loss": 0.7348, |
| "num_input_tokens_seen": 874656, |
| "step": 2320 |
| }, |
| { |
| "epoch": 2.0833333333333335, |
| "grad_norm": 0.9700059294700623, |
| "learning_rate": 4.823657400445489e-05, |
| "loss": 0.8272, |
| "num_input_tokens_seen": 876576, |
| "step": 2325 |
| }, |
| { |
| "epoch": 2.0878136200716844, |
| "grad_norm": 0.7163282036781311, |
| "learning_rate": 4.822212178309548e-05, |
| "loss": 0.6818, |
| "num_input_tokens_seen": 878528, |
| "step": 2330 |
| }, |
| { |
| "epoch": 2.0922939068100357, |
| "grad_norm": 1.27232825756073, |
| "learning_rate": 4.820761276437527e-05, |
| "loss": 0.7249, |
| "num_input_tokens_seen": 880288, |
| "step": 2335 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 0.8018732666969299, |
| "learning_rate": 4.819304698378089e-05, |
| "loss": 0.7493, |
| "num_input_tokens_seen": 882304, |
| "step": 2340 |
| }, |
| { |
| "epoch": 2.1012544802867383, |
| "grad_norm": 0.8478394150733948, |
| "learning_rate": 4.817842447693771e-05, |
| "loss": 0.6688, |
| "num_input_tokens_seen": 884256, |
| "step": 2345 |
| }, |
| { |
| "epoch": 2.1057347670250897, |
| "grad_norm": 1.056063175201416, |
| "learning_rate": 4.816374527960994e-05, |
| "loss": 0.6616, |
| "num_input_tokens_seen": 886240, |
| "step": 2350 |
| }, |
| { |
| "epoch": 2.110215053763441, |
| "grad_norm": 0.9384861588478088, |
| "learning_rate": 4.8149009427700377e-05, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 888064, |
| "step": 2355 |
| }, |
| { |
| "epoch": 2.1146953405017923, |
| "grad_norm": 0.5180307626724243, |
| "learning_rate": 4.813421695725041e-05, |
| "loss": 0.7293, |
| "num_input_tokens_seen": 889728, |
| "step": 2360 |
| }, |
| { |
| "epoch": 2.119175627240143, |
| "grad_norm": 0.7397670149803162, |
| "learning_rate": 4.81193679044399e-05, |
| "loss": 0.7085, |
| "num_input_tokens_seen": 891520, |
| "step": 2365 |
| }, |
| { |
| "epoch": 2.1236559139784945, |
| "grad_norm": 0.6285805702209473, |
| "learning_rate": 4.810446230558714e-05, |
| "loss": 0.7045, |
| "num_input_tokens_seen": 893344, |
| "step": 2370 |
| }, |
| { |
| "epoch": 2.128136200716846, |
| "grad_norm": 0.7491189241409302, |
| "learning_rate": 4.8089500197148654e-05, |
| "loss": 0.683, |
| "num_input_tokens_seen": 895328, |
| "step": 2375 |
| }, |
| { |
| "epoch": 2.132616487455197, |
| "grad_norm": 0.5812940001487732, |
| "learning_rate": 4.807448161571922e-05, |
| "loss": 0.729, |
| "num_input_tokens_seen": 897248, |
| "step": 2380 |
| }, |
| { |
| "epoch": 2.1370967741935485, |
| "grad_norm": 0.6962029337882996, |
| "learning_rate": 4.805940659803174e-05, |
| "loss": 0.7535, |
| "num_input_tokens_seen": 899200, |
| "step": 2385 |
| }, |
| { |
| "epoch": 2.1415770609319, |
| "grad_norm": 0.5583885312080383, |
| "learning_rate": 4.804427518095715e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 901120, |
| "step": 2390 |
| }, |
| { |
| "epoch": 2.1460573476702507, |
| "grad_norm": 0.740242600440979, |
| "learning_rate": 4.802908740150431e-05, |
| "loss": 0.681, |
| "num_input_tokens_seen": 903040, |
| "step": 2395 |
| }, |
| { |
| "epoch": 2.150537634408602, |
| "grad_norm": 0.7208986878395081, |
| "learning_rate": 4.801384329681996e-05, |
| "loss": 0.712, |
| "num_input_tokens_seen": 904960, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.1550179211469533, |
| "grad_norm": 0.8246489763259888, |
| "learning_rate": 4.799854290418858e-05, |
| "loss": 0.7185, |
| "num_input_tokens_seen": 906816, |
| "step": 2405 |
| }, |
| { |
| "epoch": 2.1594982078853047, |
| "grad_norm": 0.5951177477836609, |
| "learning_rate": 4.798318626103233e-05, |
| "loss": 0.6935, |
| "num_input_tokens_seen": 908544, |
| "step": 2410 |
| }, |
| { |
| "epoch": 2.163978494623656, |
| "grad_norm": 1.0811342000961304, |
| "learning_rate": 4.7967773404910946e-05, |
| "loss": 0.7317, |
| "num_input_tokens_seen": 910336, |
| "step": 2415 |
| }, |
| { |
| "epoch": 2.1684587813620073, |
| "grad_norm": 0.9689534902572632, |
| "learning_rate": 4.7952304373521644e-05, |
| "loss": 0.7447, |
| "num_input_tokens_seen": 912288, |
| "step": 2420 |
| }, |
| { |
| "epoch": 2.1729390681003586, |
| "grad_norm": 0.4089421331882477, |
| "learning_rate": 4.793677920469906e-05, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 914144, |
| "step": 2425 |
| }, |
| { |
| "epoch": 2.1774193548387095, |
| "grad_norm": 0.618166983127594, |
| "learning_rate": 4.7921197936415106e-05, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 915904, |
| "step": 2430 |
| }, |
| { |
| "epoch": 2.181899641577061, |
| "grad_norm": 0.492492139339447, |
| "learning_rate": 4.7905560606778924e-05, |
| "loss": 0.7071, |
| "num_input_tokens_seen": 917824, |
| "step": 2435 |
| }, |
| { |
| "epoch": 2.186379928315412, |
| "grad_norm": 0.6676193475723267, |
| "learning_rate": 4.7889867254036755e-05, |
| "loss": 0.6964, |
| "num_input_tokens_seen": 919744, |
| "step": 2440 |
| }, |
| { |
| "epoch": 2.1908602150537635, |
| "grad_norm": 0.7295692563056946, |
| "learning_rate": 4.787411791657188e-05, |
| "loss": 0.7325, |
| "num_input_tokens_seen": 921728, |
| "step": 2445 |
| }, |
| { |
| "epoch": 2.195340501792115, |
| "grad_norm": 0.6862680315971375, |
| "learning_rate": 4.785831263290449e-05, |
| "loss": 0.7055, |
| "num_input_tokens_seen": 923648, |
| "step": 2450 |
| }, |
| { |
| "epoch": 2.199820788530466, |
| "grad_norm": 0.7587245106697083, |
| "learning_rate": 4.784245144169162e-05, |
| "loss": 0.673, |
| "num_input_tokens_seen": 925536, |
| "step": 2455 |
| }, |
| { |
| "epoch": 2.204301075268817, |
| "grad_norm": 0.6514589786529541, |
| "learning_rate": 4.782653438172705e-05, |
| "loss": 0.7201, |
| "num_input_tokens_seen": 927392, |
| "step": 2460 |
| }, |
| { |
| "epoch": 2.2087813620071683, |
| "grad_norm": 0.9270156025886536, |
| "learning_rate": 4.781056149194121e-05, |
| "loss": 0.6743, |
| "num_input_tokens_seen": 929536, |
| "step": 2465 |
| }, |
| { |
| "epoch": 2.2132616487455197, |
| "grad_norm": 0.795536458492279, |
| "learning_rate": 4.779453281140107e-05, |
| "loss": 0.719, |
| "num_input_tokens_seen": 931520, |
| "step": 2470 |
| }, |
| { |
| "epoch": 2.217741935483871, |
| "grad_norm": 0.9114360809326172, |
| "learning_rate": 4.777844837931005e-05, |
| "loss": 0.7193, |
| "num_input_tokens_seen": 933504, |
| "step": 2475 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.509145200252533, |
| "learning_rate": 4.776230823500793e-05, |
| "loss": 0.662, |
| "num_input_tokens_seen": 935360, |
| "step": 2480 |
| }, |
| { |
| "epoch": 2.2267025089605736, |
| "grad_norm": 1.2724803686141968, |
| "learning_rate": 4.7746112417970766e-05, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 937440, |
| "step": 2485 |
| }, |
| { |
| "epoch": 2.2311827956989245, |
| "grad_norm": 1.0436537265777588, |
| "learning_rate": 4.772986096781078e-05, |
| "loss": 0.7792, |
| "num_input_tokens_seen": 939424, |
| "step": 2490 |
| }, |
| { |
| "epoch": 2.235663082437276, |
| "grad_norm": 0.8516407608985901, |
| "learning_rate": 4.771355392427624e-05, |
| "loss": 0.7294, |
| "num_input_tokens_seen": 941312, |
| "step": 2495 |
| }, |
| { |
| "epoch": 2.240143369175627, |
| "grad_norm": 0.859889805316925, |
| "learning_rate": 4.769719132725141e-05, |
| "loss": 0.7053, |
| "num_input_tokens_seen": 943264, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.2446236559139785, |
| "grad_norm": 0.8790197968482971, |
| "learning_rate": 4.768077321675643e-05, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 945248, |
| "step": 2505 |
| }, |
| { |
| "epoch": 2.24910394265233, |
| "grad_norm": 0.7509116530418396, |
| "learning_rate": 4.766429963294719e-05, |
| "loss": 0.7072, |
| "num_input_tokens_seen": 947168, |
| "step": 2510 |
| }, |
| { |
| "epoch": 2.253584229390681, |
| "grad_norm": 0.7757908701896667, |
| "learning_rate": 4.7647770616115265e-05, |
| "loss": 0.7193, |
| "num_input_tokens_seen": 948960, |
| "step": 2515 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 0.6172046065330505, |
| "learning_rate": 4.763118620668785e-05, |
| "loss": 0.7676, |
| "num_input_tokens_seen": 950688, |
| "step": 2520 |
| }, |
| { |
| "epoch": 2.2625448028673834, |
| "grad_norm": 0.7593435049057007, |
| "learning_rate": 4.761454644522757e-05, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 952608, |
| "step": 2525 |
| }, |
| { |
| "epoch": 2.2670250896057347, |
| "grad_norm": 1.0903900861740112, |
| "learning_rate": 4.759785137243245e-05, |
| "loss": 0.7673, |
| "num_input_tokens_seen": 954368, |
| "step": 2530 |
| }, |
| { |
| "epoch": 2.271505376344086, |
| "grad_norm": 0.9267560243606567, |
| "learning_rate": 4.758110102913581e-05, |
| "loss": 0.7019, |
| "num_input_tokens_seen": 956160, |
| "step": 2535 |
| }, |
| { |
| "epoch": 2.2759856630824373, |
| "grad_norm": 1.1232478618621826, |
| "learning_rate": 4.7564295456306136e-05, |
| "loss": 0.7269, |
| "num_input_tokens_seen": 957984, |
| "step": 2540 |
| }, |
| { |
| "epoch": 2.2804659498207887, |
| "grad_norm": 1.0440434217453003, |
| "learning_rate": 4.7547434695047e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 959872, |
| "step": 2545 |
| }, |
| { |
| "epoch": 2.28494623655914, |
| "grad_norm": 0.6118893027305603, |
| "learning_rate": 4.7530518786596954e-05, |
| "loss": 0.696, |
| "num_input_tokens_seen": 961664, |
| "step": 2550 |
| }, |
| { |
| "epoch": 2.289426523297491, |
| "grad_norm": 0.8722130656242371, |
| "learning_rate": 4.7513547772329446e-05, |
| "loss": 0.727, |
| "num_input_tokens_seen": 963712, |
| "step": 2555 |
| }, |
| { |
| "epoch": 2.293906810035842, |
| "grad_norm": 0.5364380478858948, |
| "learning_rate": 4.749652169375268e-05, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 965696, |
| "step": 2560 |
| }, |
| { |
| "epoch": 2.2983870967741935, |
| "grad_norm": 0.7415496706962585, |
| "learning_rate": 4.747944059250955e-05, |
| "loss": 0.6811, |
| "num_input_tokens_seen": 967488, |
| "step": 2565 |
| }, |
| { |
| "epoch": 2.302867383512545, |
| "grad_norm": 0.612433910369873, |
| "learning_rate": 4.746230451037752e-05, |
| "loss": 0.6918, |
| "num_input_tokens_seen": 969344, |
| "step": 2570 |
| }, |
| { |
| "epoch": 2.307347670250896, |
| "grad_norm": 0.6766025424003601, |
| "learning_rate": 4.7445113489268544e-05, |
| "loss": 0.6934, |
| "num_input_tokens_seen": 971168, |
| "step": 2575 |
| }, |
| { |
| "epoch": 2.3118279569892475, |
| "grad_norm": 1.016257882118225, |
| "learning_rate": 4.7427867571228926e-05, |
| "loss": 0.7136, |
| "num_input_tokens_seen": 973088, |
| "step": 2580 |
| }, |
| { |
| "epoch": 2.3163082437275984, |
| "grad_norm": 0.9015939235687256, |
| "learning_rate": 4.741056679843926e-05, |
| "loss": 0.6865, |
| "num_input_tokens_seen": 975008, |
| "step": 2585 |
| }, |
| { |
| "epoch": 2.3207885304659497, |
| "grad_norm": 0.5278044939041138, |
| "learning_rate": 4.739321121321428e-05, |
| "loss": 0.6557, |
| "num_input_tokens_seen": 976864, |
| "step": 2590 |
| }, |
| { |
| "epoch": 2.325268817204301, |
| "grad_norm": 1.0323004722595215, |
| "learning_rate": 4.737580085800282e-05, |
| "loss": 0.7846, |
| "num_input_tokens_seen": 978752, |
| "step": 2595 |
| }, |
| { |
| "epoch": 2.3297491039426523, |
| "grad_norm": 0.5666264295578003, |
| "learning_rate": 4.735833577538762e-05, |
| "loss": 0.7187, |
| "num_input_tokens_seen": 980576, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.3342293906810037, |
| "grad_norm": 0.5789869427680969, |
| "learning_rate": 4.734081600808531e-05, |
| "loss": 0.7199, |
| "num_input_tokens_seen": 982336, |
| "step": 2605 |
| }, |
| { |
| "epoch": 2.338709677419355, |
| "grad_norm": 0.9192356467247009, |
| "learning_rate": 4.732324159894627e-05, |
| "loss": 0.7349, |
| "num_input_tokens_seen": 984064, |
| "step": 2610 |
| }, |
| { |
| "epoch": 2.3431899641577063, |
| "grad_norm": 0.880268394947052, |
| "learning_rate": 4.730561259095451e-05, |
| "loss": 0.7277, |
| "num_input_tokens_seen": 985888, |
| "step": 2615 |
| }, |
| { |
| "epoch": 2.347670250896057, |
| "grad_norm": 0.5334110260009766, |
| "learning_rate": 4.728792902722759e-05, |
| "loss": 0.6812, |
| "num_input_tokens_seen": 987712, |
| "step": 2620 |
| }, |
| { |
| "epoch": 2.3521505376344085, |
| "grad_norm": 0.7519697546958923, |
| "learning_rate": 4.7270190951016493e-05, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 989568, |
| "step": 2625 |
| }, |
| { |
| "epoch": 2.35663082437276, |
| "grad_norm": 0.8907373547554016, |
| "learning_rate": 4.7252398405705535e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 991584, |
| "step": 2630 |
| }, |
| { |
| "epoch": 2.361111111111111, |
| "grad_norm": 0.7968729138374329, |
| "learning_rate": 4.723455143481227e-05, |
| "loss": 0.7279, |
| "num_input_tokens_seen": 993472, |
| "step": 2635 |
| }, |
| { |
| "epoch": 2.3655913978494625, |
| "grad_norm": 0.799196183681488, |
| "learning_rate": 4.721665008198734e-05, |
| "loss": 0.691, |
| "num_input_tokens_seen": 995296, |
| "step": 2640 |
| }, |
| { |
| "epoch": 2.370071684587814, |
| "grad_norm": 0.653005063533783, |
| "learning_rate": 4.719869439101442e-05, |
| "loss": 0.7456, |
| "num_input_tokens_seen": 997120, |
| "step": 2645 |
| }, |
| { |
| "epoch": 2.3745519713261647, |
| "grad_norm": 0.7391387224197388, |
| "learning_rate": 4.718068440581007e-05, |
| "loss": 0.7312, |
| "num_input_tokens_seen": 999104, |
| "step": 2650 |
| }, |
| { |
| "epoch": 2.379032258064516, |
| "grad_norm": 0.9343587756156921, |
| "learning_rate": 4.7162620170423655e-05, |
| "loss": 0.7184, |
| "num_input_tokens_seen": 1000864, |
| "step": 2655 |
| }, |
| { |
| "epoch": 2.3835125448028673, |
| "grad_norm": 0.5467570424079895, |
| "learning_rate": 4.714450172903722e-05, |
| "loss": 0.7674, |
| "num_input_tokens_seen": 1002976, |
| "step": 2660 |
| }, |
| { |
| "epoch": 2.3879928315412187, |
| "grad_norm": 0.6304900050163269, |
| "learning_rate": 4.712632912596538e-05, |
| "loss": 0.7117, |
| "num_input_tokens_seen": 1004672, |
| "step": 2665 |
| }, |
| { |
| "epoch": 2.39247311827957, |
| "grad_norm": 0.8206930756568909, |
| "learning_rate": 4.710810240565526e-05, |
| "loss": 0.6894, |
| "num_input_tokens_seen": 1006560, |
| "step": 2670 |
| }, |
| { |
| "epoch": 2.3969534050179213, |
| "grad_norm": 0.7391961812973022, |
| "learning_rate": 4.7089821612686295e-05, |
| "loss": 0.7055, |
| "num_input_tokens_seen": 1008384, |
| "step": 2675 |
| }, |
| { |
| "epoch": 2.4014336917562726, |
| "grad_norm": 0.6682542562484741, |
| "learning_rate": 4.707148679177021e-05, |
| "loss": 0.6728, |
| "num_input_tokens_seen": 1010208, |
| "step": 2680 |
| }, |
| { |
| "epoch": 2.4059139784946235, |
| "grad_norm": 0.6063985824584961, |
| "learning_rate": 4.705309798775084e-05, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 1012128, |
| "step": 2685 |
| }, |
| { |
| "epoch": 2.410394265232975, |
| "grad_norm": 0.8216093182563782, |
| "learning_rate": 4.703465524560409e-05, |
| "loss": 0.722, |
| "num_input_tokens_seen": 1014080, |
| "step": 2690 |
| }, |
| { |
| "epoch": 2.414874551971326, |
| "grad_norm": 0.9102132320404053, |
| "learning_rate": 4.7016158610437764e-05, |
| "loss": 0.7014, |
| "num_input_tokens_seen": 1015968, |
| "step": 2695 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "grad_norm": 0.6346727609634399, |
| "learning_rate": 4.69976081274915e-05, |
| "loss": 0.6802, |
| "num_input_tokens_seen": 1017696, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.423835125448029, |
| "grad_norm": 0.7000402808189392, |
| "learning_rate": 4.6979003842136596e-05, |
| "loss": 0.7651, |
| "num_input_tokens_seen": 1019552, |
| "step": 2705 |
| }, |
| { |
| "epoch": 2.4283154121863797, |
| "grad_norm": 0.549882173538208, |
| "learning_rate": 4.6960345799875995e-05, |
| "loss": 0.8167, |
| "num_input_tokens_seen": 1021344, |
| "step": 2710 |
| }, |
| { |
| "epoch": 2.432795698924731, |
| "grad_norm": 0.6052292585372925, |
| "learning_rate": 4.694163404634408e-05, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 1023136, |
| "step": 2715 |
| }, |
| { |
| "epoch": 2.4372759856630823, |
| "grad_norm": 0.6004948616027832, |
| "learning_rate": 4.692286862730663e-05, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 1024960, |
| "step": 2720 |
| }, |
| { |
| "epoch": 2.4417562724014337, |
| "grad_norm": 0.6895211338996887, |
| "learning_rate": 4.690404958866066e-05, |
| "loss": 0.7451, |
| "num_input_tokens_seen": 1026720, |
| "step": 2725 |
| }, |
| { |
| "epoch": 2.446236559139785, |
| "grad_norm": 0.6713605523109436, |
| "learning_rate": 4.6885176976434344e-05, |
| "loss": 0.7132, |
| "num_input_tokens_seen": 1028544, |
| "step": 2730 |
| }, |
| { |
| "epoch": 2.4507168458781363, |
| "grad_norm": 0.7282506823539734, |
| "learning_rate": 4.6866250836786876e-05, |
| "loss": 0.689, |
| "num_input_tokens_seen": 1030368, |
| "step": 2735 |
| }, |
| { |
| "epoch": 2.4551971326164876, |
| "grad_norm": 0.8884916305541992, |
| "learning_rate": 4.684727121600838e-05, |
| "loss": 0.6809, |
| "num_input_tokens_seen": 1032224, |
| "step": 2740 |
| }, |
| { |
| "epoch": 2.4596774193548385, |
| "grad_norm": 0.7573866844177246, |
| "learning_rate": 4.6828238160519775e-05, |
| "loss": 0.6593, |
| "num_input_tokens_seen": 1034112, |
| "step": 2745 |
| }, |
| { |
| "epoch": 2.46415770609319, |
| "grad_norm": 0.7106537222862244, |
| "learning_rate": 4.680915171687269e-05, |
| "loss": 0.6905, |
| "num_input_tokens_seen": 1036000, |
| "step": 2750 |
| }, |
| { |
| "epoch": 2.468637992831541, |
| "grad_norm": 0.5046974420547485, |
| "learning_rate": 4.6790011931749314e-05, |
| "loss": 0.6927, |
| "num_input_tokens_seen": 1037888, |
| "step": 2755 |
| }, |
| { |
| "epoch": 2.4731182795698925, |
| "grad_norm": 0.8967843055725098, |
| "learning_rate": 4.6770818851962305e-05, |
| "loss": 0.7133, |
| "num_input_tokens_seen": 1039776, |
| "step": 2760 |
| }, |
| { |
| "epoch": 2.477598566308244, |
| "grad_norm": 0.784045398235321, |
| "learning_rate": 4.675157252445467e-05, |
| "loss": 0.6573, |
| "num_input_tokens_seen": 1041600, |
| "step": 2765 |
| }, |
| { |
| "epoch": 2.482078853046595, |
| "grad_norm": 0.8381885290145874, |
| "learning_rate": 4.673227299629966e-05, |
| "loss": 0.7263, |
| "num_input_tokens_seen": 1043456, |
| "step": 2770 |
| }, |
| { |
| "epoch": 2.486559139784946, |
| "grad_norm": 0.8245952725410461, |
| "learning_rate": 4.6712920314700624e-05, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 1045248, |
| "step": 2775 |
| }, |
| { |
| "epoch": 2.4910394265232974, |
| "grad_norm": 0.6422320008277893, |
| "learning_rate": 4.6693514526990955e-05, |
| "loss": 0.7433, |
| "num_input_tokens_seen": 1047168, |
| "step": 2780 |
| }, |
| { |
| "epoch": 2.4955197132616487, |
| "grad_norm": 0.6076728701591492, |
| "learning_rate": 4.6674055680633885e-05, |
| "loss": 0.6742, |
| "num_input_tokens_seen": 1049056, |
| "step": 2785 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.5336353182792664, |
| "learning_rate": 4.665454382322246e-05, |
| "loss": 0.8036, |
| "num_input_tokens_seen": 1051168, |
| "step": 2790 |
| }, |
| { |
| "epoch": 2.5, |
| "eval_loss": 0.724970281124115, |
| "eval_runtime": 5.6399, |
| "eval_samples_per_second": 87.945, |
| "eval_steps_per_second": 21.986, |
| "num_input_tokens_seen": 1051168, |
| "step": 2790 |
| }, |
| { |
| "epoch": 2.5044802867383513, |
| "grad_norm": 1.0776666402816772, |
| "learning_rate": 4.663497900247936e-05, |
| "loss": 0.7213, |
| "num_input_tokens_seen": 1053120, |
| "step": 2795 |
| }, |
| { |
| "epoch": 2.5089605734767026, |
| "grad_norm": 0.7473271489143372, |
| "learning_rate": 4.6615361266256805e-05, |
| "loss": 0.7244, |
| "num_input_tokens_seen": 1055008, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.513440860215054, |
| "grad_norm": 0.6820984482765198, |
| "learning_rate": 4.6595690662536436e-05, |
| "loss": 0.6848, |
| "num_input_tokens_seen": 1056832, |
| "step": 2805 |
| }, |
| { |
| "epoch": 2.517921146953405, |
| "grad_norm": 0.7136480808258057, |
| "learning_rate": 4.657596723942923e-05, |
| "loss": 0.7369, |
| "num_input_tokens_seen": 1058656, |
| "step": 2810 |
| }, |
| { |
| "epoch": 2.522401433691756, |
| "grad_norm": 0.6265502572059631, |
| "learning_rate": 4.65561910451753e-05, |
| "loss": 0.7132, |
| "num_input_tokens_seen": 1060416, |
| "step": 2815 |
| }, |
| { |
| "epoch": 2.5268817204301075, |
| "grad_norm": 0.6327486634254456, |
| "learning_rate": 4.653636212814386e-05, |
| "loss": 0.7276, |
| "num_input_tokens_seen": 1062176, |
| "step": 2820 |
| }, |
| { |
| "epoch": 2.531362007168459, |
| "grad_norm": 0.6677072644233704, |
| "learning_rate": 4.651648053683308e-05, |
| "loss": 0.75, |
| "num_input_tokens_seen": 1064032, |
| "step": 2825 |
| }, |
| { |
| "epoch": 2.53584229390681, |
| "grad_norm": 0.6885439157485962, |
| "learning_rate": 4.649654631986994e-05, |
| "loss": 0.6952, |
| "num_input_tokens_seen": 1065920, |
| "step": 2830 |
| }, |
| { |
| "epoch": 2.540322580645161, |
| "grad_norm": 0.4123775362968445, |
| "learning_rate": 4.6476559526010146e-05, |
| "loss": 0.6645, |
| "num_input_tokens_seen": 1067840, |
| "step": 2835 |
| }, |
| { |
| "epoch": 2.5448028673835124, |
| "grad_norm": 0.5762872099876404, |
| "learning_rate": 4.6456520204137996e-05, |
| "loss": 0.7147, |
| "num_input_tokens_seen": 1069824, |
| "step": 2840 |
| }, |
| { |
| "epoch": 2.5492831541218637, |
| "grad_norm": 0.6282514333724976, |
| "learning_rate": 4.643642840326627e-05, |
| "loss": 0.7152, |
| "num_input_tokens_seen": 1071744, |
| "step": 2845 |
| }, |
| { |
| "epoch": 2.553763440860215, |
| "grad_norm": 0.6361386775970459, |
| "learning_rate": 4.64162841725361e-05, |
| "loss": 0.7435, |
| "num_input_tokens_seen": 1073536, |
| "step": 2850 |
| }, |
| { |
| "epoch": 2.5582437275985663, |
| "grad_norm": 0.7141626477241516, |
| "learning_rate": 4.639608756121684e-05, |
| "loss": 0.6694, |
| "num_input_tokens_seen": 1075424, |
| "step": 2855 |
| }, |
| { |
| "epoch": 2.5627240143369177, |
| "grad_norm": 0.8432585597038269, |
| "learning_rate": 4.637583861870596e-05, |
| "loss": 0.6899, |
| "num_input_tokens_seen": 1077472, |
| "step": 2860 |
| }, |
| { |
| "epoch": 2.567204301075269, |
| "grad_norm": 0.6747820377349854, |
| "learning_rate": 4.635553739452895e-05, |
| "loss": 0.694, |
| "num_input_tokens_seen": 1079296, |
| "step": 2865 |
| }, |
| { |
| "epoch": 2.5716845878136203, |
| "grad_norm": 0.9020037651062012, |
| "learning_rate": 4.6335183938339125e-05, |
| "loss": 0.6956, |
| "num_input_tokens_seen": 1081152, |
| "step": 2870 |
| }, |
| { |
| "epoch": 2.576164874551971, |
| "grad_norm": 0.7527491450309753, |
| "learning_rate": 4.631477829991761e-05, |
| "loss": 0.7206, |
| "num_input_tokens_seen": 1083168, |
| "step": 2875 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 0.6851952075958252, |
| "learning_rate": 4.629432052917309e-05, |
| "loss": 0.7044, |
| "num_input_tokens_seen": 1084992, |
| "step": 2880 |
| }, |
| { |
| "epoch": 2.585125448028674, |
| "grad_norm": 0.6493790745735168, |
| "learning_rate": 4.627381067614182e-05, |
| "loss": 0.7422, |
| "num_input_tokens_seen": 1086784, |
| "step": 2885 |
| }, |
| { |
| "epoch": 2.589605734767025, |
| "grad_norm": 0.5780056715011597, |
| "learning_rate": 4.625324879098741e-05, |
| "loss": 0.6984, |
| "num_input_tokens_seen": 1088608, |
| "step": 2890 |
| }, |
| { |
| "epoch": 2.5940860215053765, |
| "grad_norm": 0.7665016651153564, |
| "learning_rate": 4.6232634924000725e-05, |
| "loss": 0.7409, |
| "num_input_tokens_seen": 1090592, |
| "step": 2895 |
| }, |
| { |
| "epoch": 2.5985663082437274, |
| "grad_norm": 0.7732678651809692, |
| "learning_rate": 4.621196912559978e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 1092448, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.6030465949820787, |
| "grad_norm": 0.5451002717018127, |
| "learning_rate": 4.619125144632961e-05, |
| "loss": 0.7078, |
| "num_input_tokens_seen": 1094368, |
| "step": 2905 |
| }, |
| { |
| "epoch": 2.60752688172043, |
| "grad_norm": 0.7645034193992615, |
| "learning_rate": 4.617048193686213e-05, |
| "loss": 0.7346, |
| "num_input_tokens_seen": 1096288, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.6120071684587813, |
| "grad_norm": 0.7834355235099792, |
| "learning_rate": 4.614966064799603e-05, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 1098240, |
| "step": 2915 |
| }, |
| { |
| "epoch": 2.6164874551971327, |
| "grad_norm": 0.6444031000137329, |
| "learning_rate": 4.612878763065664e-05, |
| "loss": 0.746, |
| "num_input_tokens_seen": 1100096, |
| "step": 2920 |
| }, |
| { |
| "epoch": 2.620967741935484, |
| "grad_norm": 0.6403073668479919, |
| "learning_rate": 4.610786293589581e-05, |
| "loss": 0.7428, |
| "num_input_tokens_seen": 1101984, |
| "step": 2925 |
| }, |
| { |
| "epoch": 2.6254480286738353, |
| "grad_norm": 0.7044591307640076, |
| "learning_rate": 4.608688661489179e-05, |
| "loss": 0.6958, |
| "num_input_tokens_seen": 1104000, |
| "step": 2930 |
| }, |
| { |
| "epoch": 2.6299283154121866, |
| "grad_norm": 0.7843032479286194, |
| "learning_rate": 4.60658587189491e-05, |
| "loss": 0.7658, |
| "num_input_tokens_seen": 1106080, |
| "step": 2935 |
| }, |
| { |
| "epoch": 2.6344086021505375, |
| "grad_norm": 0.4566028416156769, |
| "learning_rate": 4.604477929949837e-05, |
| "loss": 0.691, |
| "num_input_tokens_seen": 1108096, |
| "step": 2940 |
| }, |
| { |
| "epoch": 2.638888888888889, |
| "grad_norm": 0.7375583052635193, |
| "learning_rate": 4.60236484080963e-05, |
| "loss": 0.7131, |
| "num_input_tokens_seen": 1109952, |
| "step": 2945 |
| }, |
| { |
| "epoch": 2.64336917562724, |
| "grad_norm": 0.7825741767883301, |
| "learning_rate": 4.600246609642546e-05, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 1111840, |
| "step": 2950 |
| }, |
| { |
| "epoch": 2.6478494623655915, |
| "grad_norm": 0.8638558983802795, |
| "learning_rate": 4.598123241629416e-05, |
| "loss": 0.7269, |
| "num_input_tokens_seen": 1113600, |
| "step": 2955 |
| }, |
| { |
| "epoch": 2.652329749103943, |
| "grad_norm": 0.5645026564598083, |
| "learning_rate": 4.5959947419636394e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 1115424, |
| "step": 2960 |
| }, |
| { |
| "epoch": 2.6568100358422937, |
| "grad_norm": 0.791883111000061, |
| "learning_rate": 4.593861115851163e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 1117376, |
| "step": 2965 |
| }, |
| { |
| "epoch": 2.661290322580645, |
| "grad_norm": 0.5525398850440979, |
| "learning_rate": 4.5917223685104735e-05, |
| "loss": 0.6782, |
| "num_input_tokens_seen": 1119232, |
| "step": 2970 |
| }, |
| { |
| "epoch": 2.6657706093189963, |
| "grad_norm": 0.6524225473403931, |
| "learning_rate": 4.5895785051725836e-05, |
| "loss": 0.6991, |
| "num_input_tokens_seen": 1121184, |
| "step": 2975 |
| }, |
| { |
| "epoch": 2.6702508960573477, |
| "grad_norm": 0.723503589630127, |
| "learning_rate": 4.587429531081019e-05, |
| "loss": 0.6962, |
| "num_input_tokens_seen": 1123424, |
| "step": 2980 |
| }, |
| { |
| "epoch": 2.674731182795699, |
| "grad_norm": 0.5620132088661194, |
| "learning_rate": 4.5852754514918034e-05, |
| "loss": 0.737, |
| "num_input_tokens_seen": 1125152, |
| "step": 2985 |
| }, |
| { |
| "epoch": 2.6792114695340503, |
| "grad_norm": 0.5039889216423035, |
| "learning_rate": 4.58311627167345e-05, |
| "loss": 0.6667, |
| "num_input_tokens_seen": 1127136, |
| "step": 2990 |
| }, |
| { |
| "epoch": 2.6836917562724016, |
| "grad_norm": 0.963306188583374, |
| "learning_rate": 4.580951996906946e-05, |
| "loss": 0.7496, |
| "num_input_tokens_seen": 1128992, |
| "step": 2995 |
| }, |
| { |
| "epoch": 2.688172043010753, |
| "grad_norm": 0.9899599552154541, |
| "learning_rate": 4.578782632485738e-05, |
| "loss": 0.7067, |
| "num_input_tokens_seen": 1130976, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.692652329749104, |
| "grad_norm": 0.47623762488365173, |
| "learning_rate": 4.576608183715724e-05, |
| "loss": 0.6723, |
| "num_input_tokens_seen": 1132832, |
| "step": 3005 |
| }, |
| { |
| "epoch": 2.697132616487455, |
| "grad_norm": 0.6253779530525208, |
| "learning_rate": 4.574428655915235e-05, |
| "loss": 0.7055, |
| "num_input_tokens_seen": 1134720, |
| "step": 3010 |
| }, |
| { |
| "epoch": 2.7016129032258065, |
| "grad_norm": 0.9512951970100403, |
| "learning_rate": 4.572244054415026e-05, |
| "loss": 0.7123, |
| "num_input_tokens_seen": 1136576, |
| "step": 3015 |
| }, |
| { |
| "epoch": 2.706093189964158, |
| "grad_norm": 0.702843427658081, |
| "learning_rate": 4.570054384558259e-05, |
| "loss": 0.6806, |
| "num_input_tokens_seen": 1138560, |
| "step": 3020 |
| }, |
| { |
| "epoch": 2.7105734767025087, |
| "grad_norm": 0.6525705456733704, |
| "learning_rate": 4.5678596517004966e-05, |
| "loss": 0.6627, |
| "num_input_tokens_seen": 1140480, |
| "step": 3025 |
| }, |
| { |
| "epoch": 2.71505376344086, |
| "grad_norm": 0.4482388496398926, |
| "learning_rate": 4.56565986120968e-05, |
| "loss": 0.7051, |
| "num_input_tokens_seen": 1142432, |
| "step": 3030 |
| }, |
| { |
| "epoch": 2.7195340501792113, |
| "grad_norm": 0.8427887558937073, |
| "learning_rate": 4.563455018466125e-05, |
| "loss": 0.7249, |
| "num_input_tokens_seen": 1144256, |
| "step": 3035 |
| }, |
| { |
| "epoch": 2.7240143369175627, |
| "grad_norm": 0.4986298084259033, |
| "learning_rate": 4.5612451288624996e-05, |
| "loss": 0.6847, |
| "num_input_tokens_seen": 1146240, |
| "step": 3040 |
| }, |
| { |
| "epoch": 2.728494623655914, |
| "grad_norm": 0.8530718684196472, |
| "learning_rate": 4.559030197803819e-05, |
| "loss": 0.6835, |
| "num_input_tokens_seen": 1148192, |
| "step": 3045 |
| }, |
| { |
| "epoch": 2.7329749103942653, |
| "grad_norm": 0.47099852561950684, |
| "learning_rate": 4.5568102307074286e-05, |
| "loss": 0.7198, |
| "num_input_tokens_seen": 1149984, |
| "step": 3050 |
| }, |
| { |
| "epoch": 2.7374551971326166, |
| "grad_norm": 0.5832663774490356, |
| "learning_rate": 4.554585233002989e-05, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 1151872, |
| "step": 3055 |
| }, |
| { |
| "epoch": 2.741935483870968, |
| "grad_norm": 0.6409208178520203, |
| "learning_rate": 4.552355210132467e-05, |
| "loss": 0.7222, |
| "num_input_tokens_seen": 1153696, |
| "step": 3060 |
| }, |
| { |
| "epoch": 2.746415770609319, |
| "grad_norm": 0.6105261445045471, |
| "learning_rate": 4.550120167550119e-05, |
| "loss": 0.7136, |
| "num_input_tokens_seen": 1155584, |
| "step": 3065 |
| }, |
| { |
| "epoch": 2.75089605734767, |
| "grad_norm": 0.7974156141281128, |
| "learning_rate": 4.54788011072248e-05, |
| "loss": 0.6708, |
| "num_input_tokens_seen": 1157376, |
| "step": 3070 |
| }, |
| { |
| "epoch": 2.7553763440860215, |
| "grad_norm": 0.8578731417655945, |
| "learning_rate": 4.545635045128347e-05, |
| "loss": 0.713, |
| "num_input_tokens_seen": 1159104, |
| "step": 3075 |
| }, |
| { |
| "epoch": 2.759856630824373, |
| "grad_norm": 0.9953461289405823, |
| "learning_rate": 4.5433849762587685e-05, |
| "loss": 0.7076, |
| "num_input_tokens_seen": 1161024, |
| "step": 3080 |
| }, |
| { |
| "epoch": 2.764336917562724, |
| "grad_norm": 0.7679957747459412, |
| "learning_rate": 4.541129909617031e-05, |
| "loss": 0.7383, |
| "num_input_tokens_seen": 1162848, |
| "step": 3085 |
| }, |
| { |
| "epoch": 2.768817204301075, |
| "grad_norm": 0.6889281272888184, |
| "learning_rate": 4.5388698507186445e-05, |
| "loss": 0.6939, |
| "num_input_tokens_seen": 1164608, |
| "step": 3090 |
| }, |
| { |
| "epoch": 2.7732974910394264, |
| "grad_norm": 0.894231915473938, |
| "learning_rate": 4.536604805091327e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 1166368, |
| "step": 3095 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 0.6850932240486145, |
| "learning_rate": 4.534334778274997e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 1168064, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.782258064516129, |
| "grad_norm": 0.7918106913566589, |
| "learning_rate": 4.532059775821752e-05, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 1170016, |
| "step": 3105 |
| }, |
| { |
| "epoch": 2.7867383512544803, |
| "grad_norm": 0.6311814785003662, |
| "learning_rate": 4.529779803295863e-05, |
| "loss": 0.7439, |
| "num_input_tokens_seen": 1171712, |
| "step": 3110 |
| }, |
| { |
| "epoch": 2.7912186379928317, |
| "grad_norm": 0.8450077772140503, |
| "learning_rate": 4.527494866273753e-05, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 1173536, |
| "step": 3115 |
| }, |
| { |
| "epoch": 2.795698924731183, |
| "grad_norm": 0.6848275065422058, |
| "learning_rate": 4.525204970343991e-05, |
| "loss": 0.7427, |
| "num_input_tokens_seen": 1175456, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.8001792114695343, |
| "grad_norm": 0.6643990874290466, |
| "learning_rate": 4.5229101211072736e-05, |
| "loss": 0.7146, |
| "num_input_tokens_seen": 1177536, |
| "step": 3125 |
| }, |
| { |
| "epoch": 2.804659498207885, |
| "grad_norm": 1.0588222742080688, |
| "learning_rate": 4.52061032417641e-05, |
| "loss": 0.7177, |
| "num_input_tokens_seen": 1179328, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.8091397849462365, |
| "grad_norm": 0.5510256886482239, |
| "learning_rate": 4.518305585176313e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 1181152, |
| "step": 3135 |
| }, |
| { |
| "epoch": 2.813620071684588, |
| "grad_norm": 0.8167834877967834, |
| "learning_rate": 4.5159959097439833e-05, |
| "loss": 0.7174, |
| "num_input_tokens_seen": 1183104, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.818100358422939, |
| "grad_norm": 0.6767428517341614, |
| "learning_rate": 4.513681303528493e-05, |
| "loss": 0.6817, |
| "num_input_tokens_seen": 1184960, |
| "step": 3145 |
| }, |
| { |
| "epoch": 2.8225806451612905, |
| "grad_norm": 0.5507485866546631, |
| "learning_rate": 4.511361772190975e-05, |
| "loss": 0.7098, |
| "num_input_tokens_seen": 1186784, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.8270609318996414, |
| "grad_norm": 0.5543942451477051, |
| "learning_rate": 4.50903732140461e-05, |
| "loss": 0.7044, |
| "num_input_tokens_seen": 1188960, |
| "step": 3155 |
| }, |
| { |
| "epoch": 2.8315412186379927, |
| "grad_norm": 0.5798549056053162, |
| "learning_rate": 4.506707956854608e-05, |
| "loss": 0.6974, |
| "num_input_tokens_seen": 1190784, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.836021505376344, |
| "grad_norm": 0.9505072236061096, |
| "learning_rate": 4.5043736842382e-05, |
| "loss": 0.6779, |
| "num_input_tokens_seen": 1192896, |
| "step": 3165 |
| }, |
| { |
| "epoch": 2.8405017921146953, |
| "grad_norm": 0.7640702128410339, |
| "learning_rate": 4.5020345092646176e-05, |
| "loss": 0.7244, |
| "num_input_tokens_seen": 1195008, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.8449820788530467, |
| "grad_norm": 0.6997788548469543, |
| "learning_rate": 4.4996904376550876e-05, |
| "loss": 0.7405, |
| "num_input_tokens_seen": 1196800, |
| "step": 3175 |
| }, |
| { |
| "epoch": 2.849462365591398, |
| "grad_norm": 0.5832765698432922, |
| "learning_rate": 4.497341475142808e-05, |
| "loss": 0.7037, |
| "num_input_tokens_seen": 1198688, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.8539426523297493, |
| "grad_norm": 0.6882241368293762, |
| "learning_rate": 4.494987627472943e-05, |
| "loss": 0.6921, |
| "num_input_tokens_seen": 1200704, |
| "step": 3185 |
| }, |
| { |
| "epoch": 2.8584229390681006, |
| "grad_norm": 0.7053371667861938, |
| "learning_rate": 4.492628900402604e-05, |
| "loss": 0.726, |
| "num_input_tokens_seen": 1202560, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.8629032258064515, |
| "grad_norm": 0.6048314571380615, |
| "learning_rate": 4.4902652997008365e-05, |
| "loss": 0.6922, |
| "num_input_tokens_seen": 1204448, |
| "step": 3195 |
| }, |
| { |
| "epoch": 2.867383512544803, |
| "grad_norm": 0.7072091698646545, |
| "learning_rate": 4.487896831148605e-05, |
| "loss": 0.7175, |
| "num_input_tokens_seen": 1206400, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.871863799283154, |
| "grad_norm": 0.7911497950553894, |
| "learning_rate": 4.48552350053878e-05, |
| "loss": 0.7205, |
| "num_input_tokens_seen": 1208160, |
| "step": 3205 |
| }, |
| { |
| "epoch": 2.8763440860215055, |
| "grad_norm": 0.3948129713535309, |
| "learning_rate": 4.483145313676127e-05, |
| "loss": 0.7346, |
| "num_input_tokens_seen": 1209920, |
| "step": 3210 |
| }, |
| { |
| "epoch": 2.8808243727598564, |
| "grad_norm": 0.6734813451766968, |
| "learning_rate": 4.480762276377284e-05, |
| "loss": 0.7002, |
| "num_input_tokens_seen": 1211872, |
| "step": 3215 |
| }, |
| { |
| "epoch": 2.8853046594982077, |
| "grad_norm": 0.48924365639686584, |
| "learning_rate": 4.4783743944707576e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 1213856, |
| "step": 3220 |
| }, |
| { |
| "epoch": 2.889784946236559, |
| "grad_norm": 0.7001926898956299, |
| "learning_rate": 4.475981673796899e-05, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 1215680, |
| "step": 3225 |
| }, |
| { |
| "epoch": 2.8942652329749103, |
| "grad_norm": 1.0040634870529175, |
| "learning_rate": 4.473584120207896e-05, |
| "loss": 0.6916, |
| "num_input_tokens_seen": 1217600, |
| "step": 3230 |
| }, |
| { |
| "epoch": 2.8987455197132617, |
| "grad_norm": 0.7664744257926941, |
| "learning_rate": 4.471181739567758e-05, |
| "loss": 0.7127, |
| "num_input_tokens_seen": 1219488, |
| "step": 3235 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 0.5811721682548523, |
| "learning_rate": 4.468774537752299e-05, |
| "loss": 0.6816, |
| "num_input_tokens_seen": 1221216, |
| "step": 3240 |
| }, |
| { |
| "epoch": 2.9077060931899643, |
| "grad_norm": 0.5584621429443359, |
| "learning_rate": 4.466362520649125e-05, |
| "loss": 0.7139, |
| "num_input_tokens_seen": 1222944, |
| "step": 3245 |
| }, |
| { |
| "epoch": 2.9121863799283156, |
| "grad_norm": 0.7363925576210022, |
| "learning_rate": 4.463945694157621e-05, |
| "loss": 0.7161, |
| "num_input_tokens_seen": 1224832, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.9166666666666665, |
| "grad_norm": 0.6875536441802979, |
| "learning_rate": 4.461524064188931e-05, |
| "loss": 0.7205, |
| "num_input_tokens_seen": 1226560, |
| "step": 3255 |
| }, |
| { |
| "epoch": 2.921146953405018, |
| "grad_norm": 0.6339446902275085, |
| "learning_rate": 4.459097636665953e-05, |
| "loss": 0.7823, |
| "num_input_tokens_seen": 1228480, |
| "step": 3260 |
| }, |
| { |
| "epoch": 2.925627240143369, |
| "grad_norm": 0.8035193085670471, |
| "learning_rate": 4.456666417523314e-05, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 1230208, |
| "step": 3265 |
| }, |
| { |
| "epoch": 2.9301075268817205, |
| "grad_norm": 0.6746560335159302, |
| "learning_rate": 4.4542304127073644e-05, |
| "loss": 0.7073, |
| "num_input_tokens_seen": 1232160, |
| "step": 3270 |
| }, |
| { |
| "epoch": 2.934587813620072, |
| "grad_norm": 0.603693425655365, |
| "learning_rate": 4.451789628176155e-05, |
| "loss": 0.7055, |
| "num_input_tokens_seen": 1234112, |
| "step": 3275 |
| }, |
| { |
| "epoch": 2.9390681003584227, |
| "grad_norm": 0.6682401299476624, |
| "learning_rate": 4.449344069899433e-05, |
| "loss": 0.7182, |
| "num_input_tokens_seen": 1236064, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.943548387096774, |
| "grad_norm": 0.5182085633277893, |
| "learning_rate": 4.446893743858615e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 1237856, |
| "step": 3285 |
| }, |
| { |
| "epoch": 2.9480286738351253, |
| "grad_norm": 0.5174866914749146, |
| "learning_rate": 4.4444386560467836e-05, |
| "loss": 0.6907, |
| "num_input_tokens_seen": 1239968, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.9525089605734767, |
| "grad_norm": 0.6247062087059021, |
| "learning_rate": 4.441978812468666e-05, |
| "loss": 0.7432, |
| "num_input_tokens_seen": 1241760, |
| "step": 3295 |
| }, |
| { |
| "epoch": 2.956989247311828, |
| "grad_norm": 0.7549983859062195, |
| "learning_rate": 4.439514219140621e-05, |
| "loss": 0.7235, |
| "num_input_tokens_seen": 1243840, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.9614695340501793, |
| "grad_norm": 0.7973654270172119, |
| "learning_rate": 4.4370448820906246e-05, |
| "loss": 0.7258, |
| "num_input_tokens_seen": 1245664, |
| "step": 3305 |
| }, |
| { |
| "epoch": 2.9659498207885306, |
| "grad_norm": 0.4042164385318756, |
| "learning_rate": 4.434570807358255e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 1247488, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.970430107526882, |
| "grad_norm": 0.8879252672195435, |
| "learning_rate": 4.4320920009946795e-05, |
| "loss": 0.7188, |
| "num_input_tokens_seen": 1249280, |
| "step": 3315 |
| }, |
| { |
| "epoch": 2.974910394265233, |
| "grad_norm": 0.6141677498817444, |
| "learning_rate": 4.4296084690626356e-05, |
| "loss": 0.6683, |
| "num_input_tokens_seen": 1251136, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.979390681003584, |
| "grad_norm": 0.5475640892982483, |
| "learning_rate": 4.427120217636421e-05, |
| "loss": 0.6596, |
| "num_input_tokens_seen": 1253024, |
| "step": 3325 |
| }, |
| { |
| "epoch": 2.9838709677419355, |
| "grad_norm": 0.6818305850028992, |
| "learning_rate": 4.424627252801874e-05, |
| "loss": 0.6488, |
| "num_input_tokens_seen": 1254848, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.988351254480287, |
| "grad_norm": 0.688703715801239, |
| "learning_rate": 4.422129580656365e-05, |
| "loss": 0.6964, |
| "num_input_tokens_seen": 1256704, |
| "step": 3335 |
| }, |
| { |
| "epoch": 2.992831541218638, |
| "grad_norm": 0.6664419770240784, |
| "learning_rate": 4.419627207308773e-05, |
| "loss": 0.6693, |
| "num_input_tokens_seen": 1258624, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.997311827956989, |
| "grad_norm": 0.6489902138710022, |
| "learning_rate": 4.4171201388794795e-05, |
| "loss": 0.7289, |
| "num_input_tokens_seen": 1260480, |
| "step": 3345 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.7142015099525452, |
| "eval_runtime": 5.6222, |
| "eval_samples_per_second": 88.222, |
| "eval_steps_per_second": 22.056, |
| "num_input_tokens_seen": 1261304, |
| "step": 3348 |
| }, |
| { |
| "epoch": 3.0017921146953404, |
| "grad_norm": 0.6640289425849915, |
| "learning_rate": 4.414608381500347e-05, |
| "loss": 0.6641, |
| "num_input_tokens_seen": 1262008, |
| "step": 3350 |
| }, |
| { |
| "epoch": 3.0062724014336917, |
| "grad_norm": 0.6687781810760498, |
| "learning_rate": 4.4120919413147054e-05, |
| "loss": 0.7156, |
| "num_input_tokens_seen": 1263800, |
| "step": 3355 |
| }, |
| { |
| "epoch": 3.010752688172043, |
| "grad_norm": 0.6788121461868286, |
| "learning_rate": 4.409570824477341e-05, |
| "loss": 0.7282, |
| "num_input_tokens_seen": 1265592, |
| "step": 3360 |
| }, |
| { |
| "epoch": 3.0152329749103943, |
| "grad_norm": 0.6610774993896484, |
| "learning_rate": 4.407045037154478e-05, |
| "loss": 0.6751, |
| "num_input_tokens_seen": 1267512, |
| "step": 3365 |
| }, |
| { |
| "epoch": 3.0197132616487457, |
| "grad_norm": 0.6137348413467407, |
| "learning_rate": 4.40451458552376e-05, |
| "loss": 0.7057, |
| "num_input_tokens_seen": 1269400, |
| "step": 3370 |
| }, |
| { |
| "epoch": 3.024193548387097, |
| "grad_norm": 0.6485475897789001, |
| "learning_rate": 4.4019794757742426e-05, |
| "loss": 0.6814, |
| "num_input_tokens_seen": 1271192, |
| "step": 3375 |
| }, |
| { |
| "epoch": 3.028673835125448, |
| "grad_norm": 1.0241650342941284, |
| "learning_rate": 4.3994397141063734e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 1273080, |
| "step": 3380 |
| }, |
| { |
| "epoch": 3.033154121863799, |
| "grad_norm": 0.6707364320755005, |
| "learning_rate": 4.3968953067319777e-05, |
| "loss": 0.6491, |
| "num_input_tokens_seen": 1275000, |
| "step": 3385 |
| }, |
| { |
| "epoch": 3.0376344086021505, |
| "grad_norm": 0.7968290448188782, |
| "learning_rate": 4.394346259874242e-05, |
| "loss": 0.7002, |
| "num_input_tokens_seen": 1276856, |
| "step": 3390 |
| }, |
| { |
| "epoch": 3.042114695340502, |
| "grad_norm": 0.5572385191917419, |
| "learning_rate": 4.3917925797677025e-05, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 1278648, |
| "step": 3395 |
| }, |
| { |
| "epoch": 3.046594982078853, |
| "grad_norm": 0.6464095711708069, |
| "learning_rate": 4.389234272658227e-05, |
| "loss": 0.6567, |
| "num_input_tokens_seen": 1280504, |
| "step": 3400 |
| }, |
| { |
| "epoch": 3.0510752688172045, |
| "grad_norm": 0.698445200920105, |
| "learning_rate": 4.386671344802998e-05, |
| "loss": 0.729, |
| "num_input_tokens_seen": 1282488, |
| "step": 3405 |
| }, |
| { |
| "epoch": 3.0555555555555554, |
| "grad_norm": 0.7717545628547668, |
| "learning_rate": 4.384103802470502e-05, |
| "loss": 0.7305, |
| "num_input_tokens_seen": 1284312, |
| "step": 3410 |
| }, |
| { |
| "epoch": 3.0600358422939067, |
| "grad_norm": 0.8240836262702942, |
| "learning_rate": 4.381531651940511e-05, |
| "loss": 0.6899, |
| "num_input_tokens_seen": 1286200, |
| "step": 3415 |
| }, |
| { |
| "epoch": 3.064516129032258, |
| "grad_norm": 0.6516638398170471, |
| "learning_rate": 4.378954899504068e-05, |
| "loss": 0.6686, |
| "num_input_tokens_seen": 1288088, |
| "step": 3420 |
| }, |
| { |
| "epoch": 3.0689964157706093, |
| "grad_norm": 0.5957766771316528, |
| "learning_rate": 4.3763735514634706e-05, |
| "loss": 0.7081, |
| "num_input_tokens_seen": 1290232, |
| "step": 3425 |
| }, |
| { |
| "epoch": 3.0734767025089607, |
| "grad_norm": 0.5282344222068787, |
| "learning_rate": 4.3737876141322576e-05, |
| "loss": 0.7534, |
| "num_input_tokens_seen": 1292184, |
| "step": 3430 |
| }, |
| { |
| "epoch": 3.077956989247312, |
| "grad_norm": 0.6102969646453857, |
| "learning_rate": 4.371197093835192e-05, |
| "loss": 0.6726, |
| "num_input_tokens_seen": 1294168, |
| "step": 3435 |
| }, |
| { |
| "epoch": 3.0824372759856633, |
| "grad_norm": 0.8353556394577026, |
| "learning_rate": 4.368601996908246e-05, |
| "loss": 0.6673, |
| "num_input_tokens_seen": 1296088, |
| "step": 3440 |
| }, |
| { |
| "epoch": 3.086917562724014, |
| "grad_norm": 0.5756303071975708, |
| "learning_rate": 4.366002329698585e-05, |
| "loss": 0.6629, |
| "num_input_tokens_seen": 1297816, |
| "step": 3445 |
| }, |
| { |
| "epoch": 3.0913978494623655, |
| "grad_norm": 0.8167845606803894, |
| "learning_rate": 4.3633980985645526e-05, |
| "loss": 0.7104, |
| "num_input_tokens_seen": 1299704, |
| "step": 3450 |
| }, |
| { |
| "epoch": 3.095878136200717, |
| "grad_norm": 0.9127864837646484, |
| "learning_rate": 4.360789309875656e-05, |
| "loss": 0.7214, |
| "num_input_tokens_seen": 1301656, |
| "step": 3455 |
| }, |
| { |
| "epoch": 3.100358422939068, |
| "grad_norm": 0.5587136149406433, |
| "learning_rate": 4.358175970012549e-05, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 1303608, |
| "step": 3460 |
| }, |
| { |
| "epoch": 3.1048387096774195, |
| "grad_norm": 0.5719638466835022, |
| "learning_rate": 4.3555580853670154e-05, |
| "loss": 0.6916, |
| "num_input_tokens_seen": 1305432, |
| "step": 3465 |
| }, |
| { |
| "epoch": 3.109318996415771, |
| "grad_norm": 0.4955412447452545, |
| "learning_rate": 4.352935662341956e-05, |
| "loss": 0.7134, |
| "num_input_tokens_seen": 1307288, |
| "step": 3470 |
| }, |
| { |
| "epoch": 3.1137992831541217, |
| "grad_norm": 0.837181806564331, |
| "learning_rate": 4.350308707351372e-05, |
| "loss": 0.6982, |
| "num_input_tokens_seen": 1309272, |
| "step": 3475 |
| }, |
| { |
| "epoch": 3.118279569892473, |
| "grad_norm": 0.4352196753025055, |
| "learning_rate": 4.347677226820349e-05, |
| "loss": 0.6825, |
| "num_input_tokens_seen": 1311128, |
| "step": 3480 |
| }, |
| { |
| "epoch": 3.1227598566308243, |
| "grad_norm": 0.4888545870780945, |
| "learning_rate": 4.3450412271850406e-05, |
| "loss": 0.7204, |
| "num_input_tokens_seen": 1312856, |
| "step": 3485 |
| }, |
| { |
| "epoch": 3.1272401433691757, |
| "grad_norm": 0.6269586682319641, |
| "learning_rate": 4.342400714892653e-05, |
| "loss": 0.6848, |
| "num_input_tokens_seen": 1315000, |
| "step": 3490 |
| }, |
| { |
| "epoch": 3.131720430107527, |
| "grad_norm": 0.6699550747871399, |
| "learning_rate": 4.339755696401431e-05, |
| "loss": 0.6998, |
| "num_input_tokens_seen": 1316792, |
| "step": 3495 |
| }, |
| { |
| "epoch": 3.1362007168458783, |
| "grad_norm": 0.5314154028892517, |
| "learning_rate": 4.337106178180639e-05, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 1318616, |
| "step": 3500 |
| }, |
| { |
| "epoch": 3.140681003584229, |
| "grad_norm": 0.7012357711791992, |
| "learning_rate": 4.3344521667105486e-05, |
| "loss": 0.7421, |
| "num_input_tokens_seen": 1320504, |
| "step": 3505 |
| }, |
| { |
| "epoch": 3.1451612903225805, |
| "grad_norm": 0.6760140657424927, |
| "learning_rate": 4.331793668482421e-05, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 1322488, |
| "step": 3510 |
| }, |
| { |
| "epoch": 3.149641577060932, |
| "grad_norm": 0.5820931196212769, |
| "learning_rate": 4.329130689998491e-05, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 1324440, |
| "step": 3515 |
| }, |
| { |
| "epoch": 3.154121863799283, |
| "grad_norm": 0.5365394353866577, |
| "learning_rate": 4.3264632377719496e-05, |
| "loss": 0.737, |
| "num_input_tokens_seen": 1326488, |
| "step": 3520 |
| }, |
| { |
| "epoch": 3.1586021505376345, |
| "grad_norm": 0.5414961576461792, |
| "learning_rate": 4.323791318326932e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 1328536, |
| "step": 3525 |
| }, |
| { |
| "epoch": 3.163082437275986, |
| "grad_norm": 0.5694983005523682, |
| "learning_rate": 4.3211149381984996e-05, |
| "loss": 0.6969, |
| "num_input_tokens_seen": 1330328, |
| "step": 3530 |
| }, |
| { |
| "epoch": 3.1675627240143367, |
| "grad_norm": 0.5108627080917358, |
| "learning_rate": 4.318434103932622e-05, |
| "loss": 0.7004, |
| "num_input_tokens_seen": 1332280, |
| "step": 3535 |
| }, |
| { |
| "epoch": 3.172043010752688, |
| "grad_norm": 0.4905160665512085, |
| "learning_rate": 4.315748822086164e-05, |
| "loss": 0.6608, |
| "num_input_tokens_seen": 1334360, |
| "step": 3540 |
| }, |
| { |
| "epoch": 3.1765232974910393, |
| "grad_norm": 0.7809849977493286, |
| "learning_rate": 4.3130590992268695e-05, |
| "loss": 0.7581, |
| "num_input_tokens_seen": 1336472, |
| "step": 3545 |
| }, |
| { |
| "epoch": 3.1810035842293907, |
| "grad_norm": 0.48875778913497925, |
| "learning_rate": 4.3103649419333424e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 1338296, |
| "step": 3550 |
| }, |
| { |
| "epoch": 3.185483870967742, |
| "grad_norm": 0.6410282254219055, |
| "learning_rate": 4.307666356795033e-05, |
| "loss": 0.7318, |
| "num_input_tokens_seen": 1340216, |
| "step": 3555 |
| }, |
| { |
| "epoch": 3.1899641577060933, |
| "grad_norm": 0.9984332323074341, |
| "learning_rate": 4.3049633504122215e-05, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 1341912, |
| "step": 3560 |
| }, |
| { |
| "epoch": 3.1944444444444446, |
| "grad_norm": 0.5605589747428894, |
| "learning_rate": 4.302255929396003e-05, |
| "loss": 0.714, |
| "num_input_tokens_seen": 1343672, |
| "step": 3565 |
| }, |
| { |
| "epoch": 3.1989247311827955, |
| "grad_norm": 0.7751408815383911, |
| "learning_rate": 4.299544100368268e-05, |
| "loss": 0.6955, |
| "num_input_tokens_seen": 1345528, |
| "step": 3570 |
| }, |
| { |
| "epoch": 3.203405017921147, |
| "grad_norm": 0.663055956363678, |
| "learning_rate": 4.2968278699616885e-05, |
| "loss": 0.6538, |
| "num_input_tokens_seen": 1347640, |
| "step": 3575 |
| }, |
| { |
| "epoch": 3.207885304659498, |
| "grad_norm": 0.41530367732048035, |
| "learning_rate": 4.294107244819704e-05, |
| "loss": 0.6929, |
| "num_input_tokens_seen": 1349432, |
| "step": 3580 |
| }, |
| { |
| "epoch": 3.2123655913978495, |
| "grad_norm": 0.6339878439903259, |
| "learning_rate": 4.291382231596499e-05, |
| "loss": 0.7599, |
| "num_input_tokens_seen": 1351160, |
| "step": 3585 |
| }, |
| { |
| "epoch": 3.216845878136201, |
| "grad_norm": 0.8388494849205017, |
| "learning_rate": 4.2886528369569935e-05, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 1353016, |
| "step": 3590 |
| }, |
| { |
| "epoch": 3.221326164874552, |
| "grad_norm": 0.5189142227172852, |
| "learning_rate": 4.285919067576822e-05, |
| "loss": 0.7088, |
| "num_input_tokens_seen": 1354904, |
| "step": 3595 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 0.6881794929504395, |
| "learning_rate": 4.283180930142322e-05, |
| "loss": 0.6506, |
| "num_input_tokens_seen": 1356792, |
| "step": 3600 |
| }, |
| { |
| "epoch": 3.2302867383512543, |
| "grad_norm": 0.550651490688324, |
| "learning_rate": 4.280438431350508e-05, |
| "loss": 0.7122, |
| "num_input_tokens_seen": 1358680, |
| "step": 3605 |
| }, |
| { |
| "epoch": 3.2347670250896057, |
| "grad_norm": 0.5976690053939819, |
| "learning_rate": 4.2776915779090674e-05, |
| "loss": 0.6498, |
| "num_input_tokens_seen": 1360536, |
| "step": 3610 |
| }, |
| { |
| "epoch": 3.239247311827957, |
| "grad_norm": 0.8103978633880615, |
| "learning_rate": 4.274940376536338e-05, |
| "loss": 0.716, |
| "num_input_tokens_seen": 1362424, |
| "step": 3615 |
| }, |
| { |
| "epoch": 3.2437275985663083, |
| "grad_norm": 0.35558953881263733, |
| "learning_rate": 4.272184833961289e-05, |
| "loss": 0.6674, |
| "num_input_tokens_seen": 1364408, |
| "step": 3620 |
| }, |
| { |
| "epoch": 3.2482078853046596, |
| "grad_norm": 0.592622697353363, |
| "learning_rate": 4.269424956923509e-05, |
| "loss": 0.7252, |
| "num_input_tokens_seen": 1366200, |
| "step": 3625 |
| }, |
| { |
| "epoch": 3.252688172043011, |
| "grad_norm": 0.6038530468940735, |
| "learning_rate": 4.2666607521731883e-05, |
| "loss": 0.6721, |
| "num_input_tokens_seen": 1368024, |
| "step": 3630 |
| }, |
| { |
| "epoch": 3.257168458781362, |
| "grad_norm": 0.5127895474433899, |
| "learning_rate": 4.2638922264711026e-05, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 1369784, |
| "step": 3635 |
| }, |
| { |
| "epoch": 3.261648745519713, |
| "grad_norm": 0.6586855053901672, |
| "learning_rate": 4.2611193865885926e-05, |
| "loss": 0.7232, |
| "num_input_tokens_seen": 1371608, |
| "step": 3640 |
| }, |
| { |
| "epoch": 3.2661290322580645, |
| "grad_norm": 0.671576738357544, |
| "learning_rate": 4.258342239307554e-05, |
| "loss": 0.7184, |
| "num_input_tokens_seen": 1373400, |
| "step": 3645 |
| }, |
| { |
| "epoch": 3.270609318996416, |
| "grad_norm": 0.5418992042541504, |
| "learning_rate": 4.255560791420417e-05, |
| "loss": 0.6709, |
| "num_input_tokens_seen": 1375256, |
| "step": 3650 |
| }, |
| { |
| "epoch": 3.275089605734767, |
| "grad_norm": 0.5688634514808655, |
| "learning_rate": 4.2527750497301323e-05, |
| "loss": 0.6669, |
| "num_input_tokens_seen": 1377336, |
| "step": 3655 |
| }, |
| { |
| "epoch": 3.279569892473118, |
| "grad_norm": 0.6521787047386169, |
| "learning_rate": 4.249985021050147e-05, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 1379064, |
| "step": 3660 |
| }, |
| { |
| "epoch": 3.2840501792114694, |
| "grad_norm": 0.6241593360900879, |
| "learning_rate": 4.247190712204398e-05, |
| "loss": 0.6542, |
| "num_input_tokens_seen": 1380920, |
| "step": 3665 |
| }, |
| { |
| "epoch": 3.2885304659498207, |
| "grad_norm": 0.5417748093605042, |
| "learning_rate": 4.2443921300272895e-05, |
| "loss": 0.7291, |
| "num_input_tokens_seen": 1382872, |
| "step": 3670 |
| }, |
| { |
| "epoch": 3.293010752688172, |
| "grad_norm": 0.6573375463485718, |
| "learning_rate": 4.241589281363678e-05, |
| "loss": 0.7441, |
| "num_input_tokens_seen": 1384888, |
| "step": 3675 |
| }, |
| { |
| "epoch": 3.2974910394265233, |
| "grad_norm": 0.8385305404663086, |
| "learning_rate": 4.2387821730688545e-05, |
| "loss": 0.6885, |
| "num_input_tokens_seen": 1386776, |
| "step": 3680 |
| }, |
| { |
| "epoch": 3.3019713261648747, |
| "grad_norm": 0.8915187120437622, |
| "learning_rate": 4.2359708120085286e-05, |
| "loss": 0.7465, |
| "num_input_tokens_seen": 1388600, |
| "step": 3685 |
| }, |
| { |
| "epoch": 3.306451612903226, |
| "grad_norm": 0.48122549057006836, |
| "learning_rate": 4.233155205058811e-05, |
| "loss": 0.6854, |
| "num_input_tokens_seen": 1390488, |
| "step": 3690 |
| }, |
| { |
| "epoch": 3.3109318996415773, |
| "grad_norm": 0.6241974830627441, |
| "learning_rate": 4.230335359106198e-05, |
| "loss": 0.712, |
| "num_input_tokens_seen": 1392344, |
| "step": 3695 |
| }, |
| { |
| "epoch": 3.315412186379928, |
| "grad_norm": 0.8159958124160767, |
| "learning_rate": 4.227511281047552e-05, |
| "loss": 0.7353, |
| "num_input_tokens_seen": 1394296, |
| "step": 3700 |
| }, |
| { |
| "epoch": 3.3198924731182795, |
| "grad_norm": 0.8499752283096313, |
| "learning_rate": 4.22468297779009e-05, |
| "loss": 0.7385, |
| "num_input_tokens_seen": 1396216, |
| "step": 3705 |
| }, |
| { |
| "epoch": 3.324372759856631, |
| "grad_norm": 0.5092227458953857, |
| "learning_rate": 4.2218504562513584e-05, |
| "loss": 0.6918, |
| "num_input_tokens_seen": 1398136, |
| "step": 3710 |
| }, |
| { |
| "epoch": 3.328853046594982, |
| "grad_norm": 0.7707375884056091, |
| "learning_rate": 4.219013723359224e-05, |
| "loss": 0.7062, |
| "num_input_tokens_seen": 1400088, |
| "step": 3715 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.5665374398231506, |
| "learning_rate": 4.216172786051854e-05, |
| "loss": 0.7163, |
| "num_input_tokens_seen": 1402040, |
| "step": 3720 |
| }, |
| { |
| "epoch": 3.3378136200716844, |
| "grad_norm": 0.5855923295021057, |
| "learning_rate": 4.213327651277697e-05, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 1403960, |
| "step": 3725 |
| }, |
| { |
| "epoch": 3.3422939068100357, |
| "grad_norm": 0.5904120206832886, |
| "learning_rate": 4.2104783259954687e-05, |
| "loss": 0.6721, |
| "num_input_tokens_seen": 1405848, |
| "step": 3730 |
| }, |
| { |
| "epoch": 3.346774193548387, |
| "grad_norm": 0.5156244039535522, |
| "learning_rate": 4.207624817174135e-05, |
| "loss": 0.6761, |
| "num_input_tokens_seen": 1407736, |
| "step": 3735 |
| }, |
| { |
| "epoch": 3.3512544802867383, |
| "grad_norm": 0.5167236328125, |
| "learning_rate": 4.204767131792892e-05, |
| "loss": 0.7573, |
| "num_input_tokens_seen": 1409624, |
| "step": 3740 |
| }, |
| { |
| "epoch": 3.3557347670250897, |
| "grad_norm": 0.46987155079841614, |
| "learning_rate": 4.201905276841153e-05, |
| "loss": 0.7128, |
| "num_input_tokens_seen": 1411480, |
| "step": 3745 |
| }, |
| { |
| "epoch": 3.360215053763441, |
| "grad_norm": 0.5643429160118103, |
| "learning_rate": 4.199039259318529e-05, |
| "loss": 0.7207, |
| "num_input_tokens_seen": 1413400, |
| "step": 3750 |
| }, |
| { |
| "epoch": 3.3646953405017923, |
| "grad_norm": 0.7398551106452942, |
| "learning_rate": 4.196169086234811e-05, |
| "loss": 0.7291, |
| "num_input_tokens_seen": 1415224, |
| "step": 3755 |
| }, |
| { |
| "epoch": 3.369175627240143, |
| "grad_norm": 0.6347612142562866, |
| "learning_rate": 4.193294764609954e-05, |
| "loss": 0.7287, |
| "num_input_tokens_seen": 1416952, |
| "step": 3760 |
| }, |
| { |
| "epoch": 3.3736559139784945, |
| "grad_norm": 0.434487909078598, |
| "learning_rate": 4.190416301474059e-05, |
| "loss": 0.7096, |
| "num_input_tokens_seen": 1418840, |
| "step": 3765 |
| }, |
| { |
| "epoch": 3.378136200716846, |
| "grad_norm": 0.6980918645858765, |
| "learning_rate": 4.18753370386736e-05, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 1420536, |
| "step": 3770 |
| }, |
| { |
| "epoch": 3.382616487455197, |
| "grad_norm": 0.6351816058158875, |
| "learning_rate": 4.184646978840198e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 1422456, |
| "step": 3775 |
| }, |
| { |
| "epoch": 3.3870967741935485, |
| "grad_norm": 0.5904970169067383, |
| "learning_rate": 4.181756133453013e-05, |
| "loss": 0.7314, |
| "num_input_tokens_seen": 1424312, |
| "step": 3780 |
| }, |
| { |
| "epoch": 3.3915770609319, |
| "grad_norm": 0.663182258605957, |
| "learning_rate": 4.17886117477632e-05, |
| "loss": 0.6684, |
| "num_input_tokens_seen": 1426360, |
| "step": 3785 |
| }, |
| { |
| "epoch": 3.3960573476702507, |
| "grad_norm": 0.7417337894439697, |
| "learning_rate": 4.175962109890696e-05, |
| "loss": 0.685, |
| "num_input_tokens_seen": 1428152, |
| "step": 3790 |
| }, |
| { |
| "epoch": 3.400537634408602, |
| "grad_norm": 0.6670061945915222, |
| "learning_rate": 4.173058945886762e-05, |
| "loss": 0.6656, |
| "num_input_tokens_seen": 1430360, |
| "step": 3795 |
| }, |
| { |
| "epoch": 3.4050179211469533, |
| "grad_norm": 0.41760730743408203, |
| "learning_rate": 4.1701516898651614e-05, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 1432184, |
| "step": 3800 |
| }, |
| { |
| "epoch": 3.4094982078853047, |
| "grad_norm": 0.6389894485473633, |
| "learning_rate": 4.1672403489365505e-05, |
| "loss": 0.7196, |
| "num_input_tokens_seen": 1434008, |
| "step": 3805 |
| }, |
| { |
| "epoch": 3.413978494623656, |
| "grad_norm": 0.622952401638031, |
| "learning_rate": 4.164324930221571e-05, |
| "loss": 0.7128, |
| "num_input_tokens_seen": 1435928, |
| "step": 3810 |
| }, |
| { |
| "epoch": 3.4184587813620073, |
| "grad_norm": 0.8242074251174927, |
| "learning_rate": 4.161405440850844e-05, |
| "loss": 0.7395, |
| "num_input_tokens_seen": 1437784, |
| "step": 3815 |
| }, |
| { |
| "epoch": 3.4229390681003586, |
| "grad_norm": 0.541373610496521, |
| "learning_rate": 4.1584818879649426e-05, |
| "loss": 0.6827, |
| "num_input_tokens_seen": 1439640, |
| "step": 3820 |
| }, |
| { |
| "epoch": 3.4274193548387095, |
| "grad_norm": 0.5152111053466797, |
| "learning_rate": 4.1555542787143795e-05, |
| "loss": 0.7267, |
| "num_input_tokens_seen": 1441496, |
| "step": 3825 |
| }, |
| { |
| "epoch": 3.431899641577061, |
| "grad_norm": 0.6609454154968262, |
| "learning_rate": 4.1526226202595915e-05, |
| "loss": 0.7206, |
| "num_input_tokens_seen": 1443512, |
| "step": 3830 |
| }, |
| { |
| "epoch": 3.436379928315412, |
| "grad_norm": 0.6851304769515991, |
| "learning_rate": 4.1496869197709146e-05, |
| "loss": 0.708, |
| "num_input_tokens_seen": 1445432, |
| "step": 3835 |
| }, |
| { |
| "epoch": 3.4408602150537635, |
| "grad_norm": 0.6026902794837952, |
| "learning_rate": 4.1467471844285724e-05, |
| "loss": 0.7032, |
| "num_input_tokens_seen": 1447384, |
| "step": 3840 |
| }, |
| { |
| "epoch": 3.445340501792115, |
| "grad_norm": 0.6418375968933105, |
| "learning_rate": 4.14380342142266e-05, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 1449208, |
| "step": 3845 |
| }, |
| { |
| "epoch": 3.449820788530466, |
| "grad_norm": 0.5678339600563049, |
| "learning_rate": 4.1408556379531186e-05, |
| "loss": 0.7173, |
| "num_input_tokens_seen": 1451064, |
| "step": 3850 |
| }, |
| { |
| "epoch": 3.454301075268817, |
| "grad_norm": 0.5131914019584656, |
| "learning_rate": 4.137903841229727e-05, |
| "loss": 0.7137, |
| "num_input_tokens_seen": 1452856, |
| "step": 3855 |
| }, |
| { |
| "epoch": 3.4587813620071683, |
| "grad_norm": 0.39637306332588196, |
| "learning_rate": 4.1349480384720765e-05, |
| "loss": 0.6941, |
| "num_input_tokens_seen": 1454712, |
| "step": 3860 |
| }, |
| { |
| "epoch": 3.4632616487455197, |
| "grad_norm": 0.8247675895690918, |
| "learning_rate": 4.13198823690956e-05, |
| "loss": 0.6861, |
| "num_input_tokens_seen": 1456600, |
| "step": 3865 |
| }, |
| { |
| "epoch": 3.467741935483871, |
| "grad_norm": 0.4988982379436493, |
| "learning_rate": 4.1290244437813475e-05, |
| "loss": 0.7023, |
| "num_input_tokens_seen": 1458552, |
| "step": 3870 |
| }, |
| { |
| "epoch": 3.4722222222222223, |
| "grad_norm": 0.7563645839691162, |
| "learning_rate": 4.126056666336373e-05, |
| "loss": 0.7156, |
| "num_input_tokens_seen": 1460408, |
| "step": 3875 |
| }, |
| { |
| "epoch": 3.4767025089605736, |
| "grad_norm": 0.5761010646820068, |
| "learning_rate": 4.123084911833315e-05, |
| "loss": 0.7287, |
| "num_input_tokens_seen": 1462392, |
| "step": 3880 |
| }, |
| { |
| "epoch": 3.481182795698925, |
| "grad_norm": 0.6941600441932678, |
| "learning_rate": 4.120109187540581e-05, |
| "loss": 0.7154, |
| "num_input_tokens_seen": 1464184, |
| "step": 3885 |
| }, |
| { |
| "epoch": 3.485663082437276, |
| "grad_norm": 0.6207517981529236, |
| "learning_rate": 4.117129500736286e-05, |
| "loss": 0.711, |
| "num_input_tokens_seen": 1466040, |
| "step": 3890 |
| }, |
| { |
| "epoch": 3.490143369175627, |
| "grad_norm": 0.4753372073173523, |
| "learning_rate": 4.114145858708236e-05, |
| "loss": 0.7232, |
| "num_input_tokens_seen": 1467960, |
| "step": 3895 |
| }, |
| { |
| "epoch": 3.4946236559139785, |
| "grad_norm": 0.6294930577278137, |
| "learning_rate": 4.111158268753914e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 1469944, |
| "step": 3900 |
| }, |
| { |
| "epoch": 3.49910394265233, |
| "grad_norm": 0.8148292303085327, |
| "learning_rate": 4.108166738180455e-05, |
| "loss": 0.698, |
| "num_input_tokens_seen": 1471736, |
| "step": 3905 |
| }, |
| { |
| "epoch": 3.5, |
| "eval_loss": 0.7066304683685303, |
| "eval_runtime": 5.6339, |
| "eval_samples_per_second": 88.038, |
| "eval_steps_per_second": 22.009, |
| "num_input_tokens_seen": 1472152, |
| "step": 3906 |
| }, |
| { |
| "epoch": 3.503584229390681, |
| "grad_norm": 0.5032709836959839, |
| "learning_rate": 4.105171274304637e-05, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 1473624, |
| "step": 3910 |
| }, |
| { |
| "epoch": 3.508064516129032, |
| "grad_norm": 0.5328845381736755, |
| "learning_rate": 4.102171884452852e-05, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 1475480, |
| "step": 3915 |
| }, |
| { |
| "epoch": 3.5125448028673834, |
| "grad_norm": 0.664138674736023, |
| "learning_rate": 4.099168575961099e-05, |
| "loss": 0.7192, |
| "num_input_tokens_seen": 1477336, |
| "step": 3920 |
| }, |
| { |
| "epoch": 3.5170250896057347, |
| "grad_norm": 0.3400190472602844, |
| "learning_rate": 4.096161356174959e-05, |
| "loss": 0.6938, |
| "num_input_tokens_seen": 1479256, |
| "step": 3925 |
| }, |
| { |
| "epoch": 3.521505376344086, |
| "grad_norm": 0.73305344581604, |
| "learning_rate": 4.093150232449581e-05, |
| "loss": 0.6948, |
| "num_input_tokens_seen": 1481080, |
| "step": 3930 |
| }, |
| { |
| "epoch": 3.5259856630824373, |
| "grad_norm": 0.5535285472869873, |
| "learning_rate": 4.0901352121496613e-05, |
| "loss": 0.6842, |
| "num_input_tokens_seen": 1483128, |
| "step": 3935 |
| }, |
| { |
| "epoch": 3.5304659498207887, |
| "grad_norm": 0.507737398147583, |
| "learning_rate": 4.087116302649428e-05, |
| "loss": 0.6932, |
| "num_input_tokens_seen": 1484984, |
| "step": 3940 |
| }, |
| { |
| "epoch": 3.53494623655914, |
| "grad_norm": 0.4982096254825592, |
| "learning_rate": 4.0840935113326184e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 1486744, |
| "step": 3945 |
| }, |
| { |
| "epoch": 3.5394265232974913, |
| "grad_norm": 0.5703076124191284, |
| "learning_rate": 4.081066845592467e-05, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 1488632, |
| "step": 3950 |
| }, |
| { |
| "epoch": 3.543906810035842, |
| "grad_norm": 0.6137137413024902, |
| "learning_rate": 4.0780363128316844e-05, |
| "loss": 0.7193, |
| "num_input_tokens_seen": 1490584, |
| "step": 3955 |
| }, |
| { |
| "epoch": 3.5483870967741935, |
| "grad_norm": 0.5267996191978455, |
| "learning_rate": 4.0750019204624356e-05, |
| "loss": 0.6742, |
| "num_input_tokens_seen": 1492472, |
| "step": 3960 |
| }, |
| { |
| "epoch": 3.552867383512545, |
| "grad_norm": 0.7842523455619812, |
| "learning_rate": 4.071963675906331e-05, |
| "loss": 0.7559, |
| "num_input_tokens_seen": 1494488, |
| "step": 3965 |
| }, |
| { |
| "epoch": 3.557347670250896, |
| "grad_norm": 0.6850671768188477, |
| "learning_rate": 4.0689215865944e-05, |
| "loss": 0.6919, |
| "num_input_tokens_seen": 1496504, |
| "step": 3970 |
| }, |
| { |
| "epoch": 3.561827956989247, |
| "grad_norm": 0.990515947341919, |
| "learning_rate": 4.0658756599670735e-05, |
| "loss": 0.7051, |
| "num_input_tokens_seen": 1498392, |
| "step": 3975 |
| }, |
| { |
| "epoch": 3.5663082437275984, |
| "grad_norm": 0.663224458694458, |
| "learning_rate": 4.062825903474172e-05, |
| "loss": 0.6949, |
| "num_input_tokens_seen": 1500376, |
| "step": 3980 |
| }, |
| { |
| "epoch": 3.5707885304659497, |
| "grad_norm": 0.8304319381713867, |
| "learning_rate": 4.059772324574881e-05, |
| "loss": 0.7059, |
| "num_input_tokens_seen": 1502200, |
| "step": 3985 |
| }, |
| { |
| "epoch": 3.575268817204301, |
| "grad_norm": 0.6099317669868469, |
| "learning_rate": 4.056714930737735e-05, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 1503928, |
| "step": 3990 |
| }, |
| { |
| "epoch": 3.5797491039426523, |
| "grad_norm": 0.5851802229881287, |
| "learning_rate": 4.053653729440599e-05, |
| "loss": 0.7104, |
| "num_input_tokens_seen": 1505816, |
| "step": 3995 |
| }, |
| { |
| "epoch": 3.5842293906810037, |
| "grad_norm": 0.6147633790969849, |
| "learning_rate": 4.05058872817065e-05, |
| "loss": 0.6802, |
| "num_input_tokens_seen": 1507608, |
| "step": 4000 |
| }, |
| { |
| "epoch": 3.588709677419355, |
| "grad_norm": 0.6019237637519836, |
| "learning_rate": 4.047519934424362e-05, |
| "loss": 0.6765, |
| "num_input_tokens_seen": 1509560, |
| "step": 4005 |
| }, |
| { |
| "epoch": 3.5931899641577063, |
| "grad_norm": 0.6119495034217834, |
| "learning_rate": 4.044447355707483e-05, |
| "loss": 0.7309, |
| "num_input_tokens_seen": 1511672, |
| "step": 4010 |
| }, |
| { |
| "epoch": 3.597670250896057, |
| "grad_norm": 0.875652015209198, |
| "learning_rate": 4.0413709995350145e-05, |
| "loss": 0.7415, |
| "num_input_tokens_seen": 1513560, |
| "step": 4015 |
| }, |
| { |
| "epoch": 3.6021505376344085, |
| "grad_norm": 1.135251522064209, |
| "learning_rate": 4.038290873431203e-05, |
| "loss": 0.7534, |
| "num_input_tokens_seen": 1515544, |
| "step": 4020 |
| }, |
| { |
| "epoch": 3.60663082437276, |
| "grad_norm": 0.5745335221290588, |
| "learning_rate": 4.035206984929513e-05, |
| "loss": 0.741, |
| "num_input_tokens_seen": 1517496, |
| "step": 4025 |
| }, |
| { |
| "epoch": 3.611111111111111, |
| "grad_norm": 0.878966212272644, |
| "learning_rate": 4.032119341572612e-05, |
| "loss": 0.704, |
| "num_input_tokens_seen": 1519448, |
| "step": 4030 |
| }, |
| { |
| "epoch": 3.6155913978494625, |
| "grad_norm": 0.5013412833213806, |
| "learning_rate": 4.0290279509123483e-05, |
| "loss": 0.7011, |
| "num_input_tokens_seen": 1521272, |
| "step": 4035 |
| }, |
| { |
| "epoch": 3.6200716845878134, |
| "grad_norm": 0.5498907566070557, |
| "learning_rate": 4.02593282050974e-05, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 1523128, |
| "step": 4040 |
| }, |
| { |
| "epoch": 3.6245519713261647, |
| "grad_norm": 0.5444872379302979, |
| "learning_rate": 4.022833957934949e-05, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 1525144, |
| "step": 4045 |
| }, |
| { |
| "epoch": 3.629032258064516, |
| "grad_norm": 0.5091925859451294, |
| "learning_rate": 4.019731370767267e-05, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 1526968, |
| "step": 4050 |
| }, |
| { |
| "epoch": 3.6335125448028673, |
| "grad_norm": 0.3739386200904846, |
| "learning_rate": 4.016625066595092e-05, |
| "loss": 0.688, |
| "num_input_tokens_seen": 1528760, |
| "step": 4055 |
| }, |
| { |
| "epoch": 3.6379928315412187, |
| "grad_norm": 0.6722102761268616, |
| "learning_rate": 4.013515053015918e-05, |
| "loss": 0.69, |
| "num_input_tokens_seen": 1530584, |
| "step": 4060 |
| }, |
| { |
| "epoch": 3.64247311827957, |
| "grad_norm": 0.7019551992416382, |
| "learning_rate": 4.010401337636309e-05, |
| "loss": 0.7203, |
| "num_input_tokens_seen": 1532312, |
| "step": 4065 |
| }, |
| { |
| "epoch": 3.6469534050179213, |
| "grad_norm": 0.539631724357605, |
| "learning_rate": 4.007283928071882e-05, |
| "loss": 0.6904, |
| "num_input_tokens_seen": 1534008, |
| "step": 4070 |
| }, |
| { |
| "epoch": 3.6514336917562726, |
| "grad_norm": 0.7080748081207275, |
| "learning_rate": 4.0041628319472926e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 1535896, |
| "step": 4075 |
| }, |
| { |
| "epoch": 3.6559139784946235, |
| "grad_norm": 0.5636100172996521, |
| "learning_rate": 4.001038056896211e-05, |
| "loss": 0.7193, |
| "num_input_tokens_seen": 1537752, |
| "step": 4080 |
| }, |
| { |
| "epoch": 3.660394265232975, |
| "grad_norm": 0.5630587935447693, |
| "learning_rate": 3.9979096105613035e-05, |
| "loss": 0.6734, |
| "num_input_tokens_seen": 1539640, |
| "step": 4085 |
| }, |
| { |
| "epoch": 3.664874551971326, |
| "grad_norm": 0.7142034769058228, |
| "learning_rate": 3.99477750059422e-05, |
| "loss": 0.6964, |
| "num_input_tokens_seen": 1541528, |
| "step": 4090 |
| }, |
| { |
| "epoch": 3.6693548387096775, |
| "grad_norm": 0.5816643238067627, |
| "learning_rate": 3.991641734655568e-05, |
| "loss": 0.6701, |
| "num_input_tokens_seen": 1543448, |
| "step": 4095 |
| }, |
| { |
| "epoch": 3.673835125448029, |
| "grad_norm": 0.7215139269828796, |
| "learning_rate": 3.988502320414897e-05, |
| "loss": 0.7094, |
| "num_input_tokens_seen": 1545240, |
| "step": 4100 |
| }, |
| { |
| "epoch": 3.6783154121863797, |
| "grad_norm": 0.6312930583953857, |
| "learning_rate": 3.985359265550682e-05, |
| "loss": 0.7388, |
| "num_input_tokens_seen": 1547096, |
| "step": 4105 |
| }, |
| { |
| "epoch": 3.682795698924731, |
| "grad_norm": 0.49201858043670654, |
| "learning_rate": 3.9822125777502995e-05, |
| "loss": 0.7318, |
| "num_input_tokens_seen": 1549016, |
| "step": 4110 |
| }, |
| { |
| "epoch": 3.6872759856630823, |
| "grad_norm": 0.6607922315597534, |
| "learning_rate": 3.979062264710012e-05, |
| "loss": 0.708, |
| "num_input_tokens_seen": 1550968, |
| "step": 4115 |
| }, |
| { |
| "epoch": 3.6917562724014337, |
| "grad_norm": 0.6152588129043579, |
| "learning_rate": 3.975908334134952e-05, |
| "loss": 0.7058, |
| "num_input_tokens_seen": 1552760, |
| "step": 4120 |
| }, |
| { |
| "epoch": 3.696236559139785, |
| "grad_norm": 0.5260980129241943, |
| "learning_rate": 3.9727507937390954e-05, |
| "loss": 0.7266, |
| "num_input_tokens_seen": 1554680, |
| "step": 4125 |
| }, |
| { |
| "epoch": 3.7007168458781363, |
| "grad_norm": 0.5318123698234558, |
| "learning_rate": 3.969589651245249e-05, |
| "loss": 0.715, |
| "num_input_tokens_seen": 1556536, |
| "step": 4130 |
| }, |
| { |
| "epoch": 3.7051971326164876, |
| "grad_norm": 0.6568738222122192, |
| "learning_rate": 3.9664249143850304e-05, |
| "loss": 0.6898, |
| "num_input_tokens_seen": 1558424, |
| "step": 4135 |
| }, |
| { |
| "epoch": 3.709677419354839, |
| "grad_norm": 0.9819836020469666, |
| "learning_rate": 3.9632565908988476e-05, |
| "loss": 0.7165, |
| "num_input_tokens_seen": 1560344, |
| "step": 4140 |
| }, |
| { |
| "epoch": 3.71415770609319, |
| "grad_norm": 0.6370415687561035, |
| "learning_rate": 3.960084688535881e-05, |
| "loss": 0.6916, |
| "num_input_tokens_seen": 1562264, |
| "step": 4145 |
| }, |
| { |
| "epoch": 3.718637992831541, |
| "grad_norm": 0.8133038878440857, |
| "learning_rate": 3.956909215054066e-05, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 1564120, |
| "step": 4150 |
| }, |
| { |
| "epoch": 3.7231182795698925, |
| "grad_norm": 0.46259617805480957, |
| "learning_rate": 3.953730178220067e-05, |
| "loss": 0.6822, |
| "num_input_tokens_seen": 1566072, |
| "step": 4155 |
| }, |
| { |
| "epoch": 3.727598566308244, |
| "grad_norm": 0.6947198510169983, |
| "learning_rate": 3.9505475858092705e-05, |
| "loss": 0.7145, |
| "num_input_tokens_seen": 1567992, |
| "step": 4160 |
| }, |
| { |
| "epoch": 3.732078853046595, |
| "grad_norm": 0.7821716070175171, |
| "learning_rate": 3.947361445605755e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 1569816, |
| "step": 4165 |
| }, |
| { |
| "epoch": 3.736559139784946, |
| "grad_norm": 0.6598497629165649, |
| "learning_rate": 3.944171765402279e-05, |
| "loss": 0.6687, |
| "num_input_tokens_seen": 1571672, |
| "step": 4170 |
| }, |
| { |
| "epoch": 3.7410394265232974, |
| "grad_norm": 0.6169228553771973, |
| "learning_rate": 3.9409785530002565e-05, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 1573496, |
| "step": 4175 |
| }, |
| { |
| "epoch": 3.7455197132616487, |
| "grad_norm": 0.41680485010147095, |
| "learning_rate": 3.937781816209742e-05, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 1575416, |
| "step": 4180 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.5349394679069519, |
| "learning_rate": 3.934581562849411e-05, |
| "loss": 0.6989, |
| "num_input_tokens_seen": 1577272, |
| "step": 4185 |
| }, |
| { |
| "epoch": 3.7544802867383513, |
| "grad_norm": 0.49851375818252563, |
| "learning_rate": 3.931377800746538e-05, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 1579064, |
| "step": 4190 |
| }, |
| { |
| "epoch": 3.7589605734767026, |
| "grad_norm": 0.7621869444847107, |
| "learning_rate": 3.928170537736981e-05, |
| "loss": 0.7245, |
| "num_input_tokens_seen": 1580984, |
| "step": 4195 |
| }, |
| { |
| "epoch": 3.763440860215054, |
| "grad_norm": 0.5499773025512695, |
| "learning_rate": 3.924959781665159e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 1583096, |
| "step": 4200 |
| }, |
| { |
| "epoch": 3.767921146953405, |
| "grad_norm": 0.7189744710922241, |
| "learning_rate": 3.921745540384038e-05, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 1584824, |
| "step": 4205 |
| }, |
| { |
| "epoch": 3.772401433691756, |
| "grad_norm": 0.5173475742340088, |
| "learning_rate": 3.918527821755101e-05, |
| "loss": 0.679, |
| "num_input_tokens_seen": 1586872, |
| "step": 4210 |
| }, |
| { |
| "epoch": 3.7768817204301075, |
| "grad_norm": 0.620393693447113, |
| "learning_rate": 3.915306633648345e-05, |
| "loss": 0.7302, |
| "num_input_tokens_seen": 1588696, |
| "step": 4215 |
| }, |
| { |
| "epoch": 3.781362007168459, |
| "grad_norm": 0.6762012243270874, |
| "learning_rate": 3.9120819839422456e-05, |
| "loss": 0.7044, |
| "num_input_tokens_seen": 1590712, |
| "step": 4220 |
| }, |
| { |
| "epoch": 3.78584229390681, |
| "grad_norm": 0.6276195049285889, |
| "learning_rate": 3.908853880523748e-05, |
| "loss": 0.6695, |
| "num_input_tokens_seen": 1592472, |
| "step": 4225 |
| }, |
| { |
| "epoch": 3.790322580645161, |
| "grad_norm": 0.6157204508781433, |
| "learning_rate": 3.905622331288246e-05, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 1594168, |
| "step": 4230 |
| }, |
| { |
| "epoch": 3.7948028673835124, |
| "grad_norm": 0.5470080375671387, |
| "learning_rate": 3.9023873441395574e-05, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 1595992, |
| "step": 4235 |
| }, |
| { |
| "epoch": 3.7992831541218637, |
| "grad_norm": 0.5555064082145691, |
| "learning_rate": 3.899148926989912e-05, |
| "loss": 0.7048, |
| "num_input_tokens_seen": 1597720, |
| "step": 4240 |
| }, |
| { |
| "epoch": 3.803763440860215, |
| "grad_norm": 0.780737578868866, |
| "learning_rate": 3.895907087759926e-05, |
| "loss": 0.6902, |
| "num_input_tokens_seen": 1599736, |
| "step": 4245 |
| }, |
| { |
| "epoch": 3.8082437275985663, |
| "grad_norm": 0.507416844367981, |
| "learning_rate": 3.8926618343785876e-05, |
| "loss": 0.6769, |
| "num_input_tokens_seen": 1601496, |
| "step": 4250 |
| }, |
| { |
| "epoch": 3.8127240143369177, |
| "grad_norm": 0.5080125331878662, |
| "learning_rate": 3.8894131747832354e-05, |
| "loss": 0.7197, |
| "num_input_tokens_seen": 1603352, |
| "step": 4255 |
| }, |
| { |
| "epoch": 3.817204301075269, |
| "grad_norm": 0.6002953052520752, |
| "learning_rate": 3.886161116919537e-05, |
| "loss": 0.677, |
| "num_input_tokens_seen": 1605208, |
| "step": 4260 |
| }, |
| { |
| "epoch": 3.8216845878136203, |
| "grad_norm": 0.514525294303894, |
| "learning_rate": 3.8829056687414735e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 1607128, |
| "step": 4265 |
| }, |
| { |
| "epoch": 3.826164874551971, |
| "grad_norm": 0.7737773060798645, |
| "learning_rate": 3.8796468382113184e-05, |
| "loss": 0.7222, |
| "num_input_tokens_seen": 1609176, |
| "step": 4270 |
| }, |
| { |
| "epoch": 3.8306451612903225, |
| "grad_norm": 0.6488903760910034, |
| "learning_rate": 3.876384633299616e-05, |
| "loss": 0.6904, |
| "num_input_tokens_seen": 1611096, |
| "step": 4275 |
| }, |
| { |
| "epoch": 3.835125448028674, |
| "grad_norm": 0.6453109383583069, |
| "learning_rate": 3.873119061985164e-05, |
| "loss": 0.6992, |
| "num_input_tokens_seen": 1613048, |
| "step": 4280 |
| }, |
| { |
| "epoch": 3.839605734767025, |
| "grad_norm": 0.7293371558189392, |
| "learning_rate": 3.869850132254996e-05, |
| "loss": 0.7087, |
| "num_input_tokens_seen": 1615128, |
| "step": 4285 |
| }, |
| { |
| "epoch": 3.8440860215053765, |
| "grad_norm": 0.7813587188720703, |
| "learning_rate": 3.866577852104358e-05, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 1616952, |
| "step": 4290 |
| }, |
| { |
| "epoch": 3.8485663082437274, |
| "grad_norm": 0.6214764714241028, |
| "learning_rate": 3.86330222953669e-05, |
| "loss": 0.666, |
| "num_input_tokens_seen": 1618840, |
| "step": 4295 |
| }, |
| { |
| "epoch": 3.8530465949820787, |
| "grad_norm": 0.41636016964912415, |
| "learning_rate": 3.860023272563609e-05, |
| "loss": 0.7116, |
| "num_input_tokens_seen": 1620760, |
| "step": 4300 |
| }, |
| { |
| "epoch": 3.85752688172043, |
| "grad_norm": 0.6706348657608032, |
| "learning_rate": 3.856740989204884e-05, |
| "loss": 0.721, |
| "num_input_tokens_seen": 1622648, |
| "step": 4305 |
| }, |
| { |
| "epoch": 3.8620071684587813, |
| "grad_norm": 0.5346550941467285, |
| "learning_rate": 3.8534553874884244e-05, |
| "loss": 0.6626, |
| "num_input_tokens_seen": 1624632, |
| "step": 4310 |
| }, |
| { |
| "epoch": 3.8664874551971327, |
| "grad_norm": 0.46775901317596436, |
| "learning_rate": 3.850166475450252e-05, |
| "loss": 0.6639, |
| "num_input_tokens_seen": 1626520, |
| "step": 4315 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "grad_norm": 0.49902352690696716, |
| "learning_rate": 3.846874261134485e-05, |
| "loss": 0.7102, |
| "num_input_tokens_seen": 1628536, |
| "step": 4320 |
| }, |
| { |
| "epoch": 3.8754480286738353, |
| "grad_norm": 0.7217360138893127, |
| "learning_rate": 3.843578752593323e-05, |
| "loss": 0.7223, |
| "num_input_tokens_seen": 1630488, |
| "step": 4325 |
| }, |
| { |
| "epoch": 3.8799283154121866, |
| "grad_norm": 0.6418315172195435, |
| "learning_rate": 3.840279957887017e-05, |
| "loss": 0.7037, |
| "num_input_tokens_seen": 1632344, |
| "step": 4330 |
| }, |
| { |
| "epoch": 3.8844086021505375, |
| "grad_norm": 0.7038565278053284, |
| "learning_rate": 3.836977885083858e-05, |
| "loss": 0.7187, |
| "num_input_tokens_seen": 1634296, |
| "step": 4335 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 0.4795782268047333, |
| "learning_rate": 3.833672542260156e-05, |
| "loss": 0.6854, |
| "num_input_tokens_seen": 1636312, |
| "step": 4340 |
| }, |
| { |
| "epoch": 3.89336917562724, |
| "grad_norm": 0.5679958462715149, |
| "learning_rate": 3.830363937500216e-05, |
| "loss": 0.7072, |
| "num_input_tokens_seen": 1638296, |
| "step": 4345 |
| }, |
| { |
| "epoch": 3.8978494623655915, |
| "grad_norm": 0.5424354076385498, |
| "learning_rate": 3.827052078896323e-05, |
| "loss": 0.6753, |
| "num_input_tokens_seen": 1640248, |
| "step": 4350 |
| }, |
| { |
| "epoch": 3.902329749103943, |
| "grad_norm": 0.7104454636573792, |
| "learning_rate": 3.8237369745487205e-05, |
| "loss": 0.7172, |
| "num_input_tokens_seen": 1642040, |
| "step": 4355 |
| }, |
| { |
| "epoch": 3.9068100358422937, |
| "grad_norm": 0.5328916311264038, |
| "learning_rate": 3.820418632565589e-05, |
| "loss": 0.6794, |
| "num_input_tokens_seen": 1643736, |
| "step": 4360 |
| }, |
| { |
| "epoch": 3.911290322580645, |
| "grad_norm": 0.6578375697135925, |
| "learning_rate": 3.817097061063028e-05, |
| "loss": 0.662, |
| "num_input_tokens_seen": 1645784, |
| "step": 4365 |
| }, |
| { |
| "epoch": 3.9157706093189963, |
| "grad_norm": 0.7673133015632629, |
| "learning_rate": 3.81377226816504e-05, |
| "loss": 0.7265, |
| "num_input_tokens_seen": 1647480, |
| "step": 4370 |
| }, |
| { |
| "epoch": 3.9202508960573477, |
| "grad_norm": 0.5228795409202576, |
| "learning_rate": 3.8104442620035e-05, |
| "loss": 0.7221, |
| "num_input_tokens_seen": 1649336, |
| "step": 4375 |
| }, |
| { |
| "epoch": 3.924731182795699, |
| "grad_norm": 0.7645145654678345, |
| "learning_rate": 3.8071130507181466e-05, |
| "loss": 0.7152, |
| "num_input_tokens_seen": 1651192, |
| "step": 4380 |
| }, |
| { |
| "epoch": 3.9292114695340503, |
| "grad_norm": 0.4934726357460022, |
| "learning_rate": 3.803778642456553e-05, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 1653080, |
| "step": 4385 |
| }, |
| { |
| "epoch": 3.9336917562724016, |
| "grad_norm": 0.3990285396575928, |
| "learning_rate": 3.800441045374119e-05, |
| "loss": 0.7042, |
| "num_input_tokens_seen": 1655000, |
| "step": 4390 |
| }, |
| { |
| "epoch": 3.938172043010753, |
| "grad_norm": 0.563335657119751, |
| "learning_rate": 3.797100267634038e-05, |
| "loss": 0.6995, |
| "num_input_tokens_seen": 1656824, |
| "step": 4395 |
| }, |
| { |
| "epoch": 3.942652329749104, |
| "grad_norm": 0.6097021698951721, |
| "learning_rate": 3.7937563174072826e-05, |
| "loss": 0.673, |
| "num_input_tokens_seen": 1658712, |
| "step": 4400 |
| }, |
| { |
| "epoch": 3.947132616487455, |
| "grad_norm": 0.4125150740146637, |
| "learning_rate": 3.790409202872588e-05, |
| "loss": 0.6597, |
| "num_input_tokens_seen": 1660568, |
| "step": 4405 |
| }, |
| { |
| "epoch": 3.9516129032258065, |
| "grad_norm": 0.558273434638977, |
| "learning_rate": 3.787058932216427e-05, |
| "loss": 0.6884, |
| "num_input_tokens_seen": 1662392, |
| "step": 4410 |
| }, |
| { |
| "epoch": 3.956093189964158, |
| "grad_norm": 0.6978607773780823, |
| "learning_rate": 3.783705513632992e-05, |
| "loss": 0.7311, |
| "num_input_tokens_seen": 1664088, |
| "step": 4415 |
| }, |
| { |
| "epoch": 3.9605734767025087, |
| "grad_norm": 0.600098729133606, |
| "learning_rate": 3.780348955324173e-05, |
| "loss": 0.6842, |
| "num_input_tokens_seen": 1665912, |
| "step": 4420 |
| }, |
| { |
| "epoch": 3.96505376344086, |
| "grad_norm": 0.7134581804275513, |
| "learning_rate": 3.7769892654995444e-05, |
| "loss": 0.7465, |
| "num_input_tokens_seen": 1667832, |
| "step": 4425 |
| }, |
| { |
| "epoch": 3.9695340501792113, |
| "grad_norm": 0.5726223587989807, |
| "learning_rate": 3.773626452376332e-05, |
| "loss": 0.7285, |
| "num_input_tokens_seen": 1669816, |
| "step": 4430 |
| }, |
| { |
| "epoch": 3.9740143369175627, |
| "grad_norm": 0.6757382750511169, |
| "learning_rate": 3.7702605241794073e-05, |
| "loss": 0.7042, |
| "num_input_tokens_seen": 1671608, |
| "step": 4435 |
| }, |
| { |
| "epoch": 3.978494623655914, |
| "grad_norm": 0.6381019949913025, |
| "learning_rate": 3.7668914891412574e-05, |
| "loss": 0.6666, |
| "num_input_tokens_seen": 1673400, |
| "step": 4440 |
| }, |
| { |
| "epoch": 3.9829749103942653, |
| "grad_norm": 0.553852915763855, |
| "learning_rate": 3.7635193555019697e-05, |
| "loss": 0.7053, |
| "num_input_tokens_seen": 1675192, |
| "step": 4445 |
| }, |
| { |
| "epoch": 3.9874551971326166, |
| "grad_norm": 0.6171552538871765, |
| "learning_rate": 3.760144131509209e-05, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 1677048, |
| "step": 4450 |
| }, |
| { |
| "epoch": 3.991935483870968, |
| "grad_norm": 0.6333886981010437, |
| "learning_rate": 3.756765825418199e-05, |
| "loss": 0.6822, |
| "num_input_tokens_seen": 1679128, |
| "step": 4455 |
| }, |
| { |
| "epoch": 3.996415770609319, |
| "grad_norm": 0.5539998412132263, |
| "learning_rate": 3.7533844454917025e-05, |
| "loss": 0.7177, |
| "num_input_tokens_seen": 1680856, |
| "step": 4460 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.704715371131897, |
| "eval_runtime": 5.621, |
| "eval_samples_per_second": 88.241, |
| "eval_steps_per_second": 22.06, |
| "num_input_tokens_seen": 1682016, |
| "step": 4464 |
| }, |
| { |
| "epoch": 4.000896057347671, |
| "grad_norm": 0.3961438834667206, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.7105, |
| "num_input_tokens_seen": 1682336, |
| "step": 4465 |
| }, |
| { |
| "epoch": 4.005376344086022, |
| "grad_norm": 0.6253134608268738, |
| "learning_rate": 3.746612497220869e-05, |
| "loss": 0.724, |
| "num_input_tokens_seen": 1684096, |
| "step": 4470 |
| }, |
| { |
| "epoch": 4.009856630824372, |
| "grad_norm": 0.6570338010787964, |
| "learning_rate": 3.743221945439566e-05, |
| "loss": 0.6865, |
| "num_input_tokens_seen": 1686112, |
| "step": 4475 |
| }, |
| { |
| "epoch": 4.014336917562724, |
| "grad_norm": 0.6149647235870361, |
| "learning_rate": 3.739828352948803e-05, |
| "loss": 0.7226, |
| "num_input_tokens_seen": 1687872, |
| "step": 4480 |
| }, |
| { |
| "epoch": 4.018817204301075, |
| "grad_norm": 0.5701264142990112, |
| "learning_rate": 3.736431728048731e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 1689760, |
| "step": 4485 |
| }, |
| { |
| "epoch": 4.023297491039426, |
| "grad_norm": 0.7899668216705322, |
| "learning_rate": 3.733032079046916e-05, |
| "loss": 0.739, |
| "num_input_tokens_seen": 1691584, |
| "step": 4490 |
| }, |
| { |
| "epoch": 4.027777777777778, |
| "grad_norm": 0.3633069097995758, |
| "learning_rate": 3.7296294142583225e-05, |
| "loss": 0.6703, |
| "num_input_tokens_seen": 1693376, |
| "step": 4495 |
| }, |
| { |
| "epoch": 4.032258064516129, |
| "grad_norm": 0.43072450160980225, |
| "learning_rate": 3.726223742005289e-05, |
| "loss": 0.7192, |
| "num_input_tokens_seen": 1695232, |
| "step": 4500 |
| }, |
| { |
| "epoch": 4.03673835125448, |
| "grad_norm": 0.3624493479728699, |
| "learning_rate": 3.7228150706175116e-05, |
| "loss": 0.6964, |
| "num_input_tokens_seen": 1697088, |
| "step": 4505 |
| }, |
| { |
| "epoch": 4.041218637992832, |
| "grad_norm": 0.46872445940971375, |
| "learning_rate": 3.7194034084320195e-05, |
| "loss": 0.6599, |
| "num_input_tokens_seen": 1699008, |
| "step": 4510 |
| }, |
| { |
| "epoch": 4.045698924731183, |
| "grad_norm": 0.6037322878837585, |
| "learning_rate": 3.715988763793158e-05, |
| "loss": 0.7385, |
| "num_input_tokens_seen": 1701216, |
| "step": 4515 |
| }, |
| { |
| "epoch": 4.050179211469534, |
| "grad_norm": 0.46617811918258667, |
| "learning_rate": 3.7125711450525704e-05, |
| "loss": 0.6784, |
| "num_input_tokens_seen": 1703040, |
| "step": 4520 |
| }, |
| { |
| "epoch": 4.054659498207886, |
| "grad_norm": 0.460159033536911, |
| "learning_rate": 3.7091505605691674e-05, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 1704800, |
| "step": 4525 |
| }, |
| { |
| "epoch": 4.059139784946237, |
| "grad_norm": 0.553875207901001, |
| "learning_rate": 3.705727018709118e-05, |
| "loss": 0.7053, |
| "num_input_tokens_seen": 1706592, |
| "step": 4530 |
| }, |
| { |
| "epoch": 4.063620071684587, |
| "grad_norm": 0.7636135816574097, |
| "learning_rate": 3.702300527845825e-05, |
| "loss": 0.7173, |
| "num_input_tokens_seen": 1708544, |
| "step": 4535 |
| }, |
| { |
| "epoch": 4.068100358422939, |
| "grad_norm": 0.7820340394973755, |
| "learning_rate": 3.6988710963598993e-05, |
| "loss": 0.8042, |
| "num_input_tokens_seen": 1710720, |
| "step": 4540 |
| }, |
| { |
| "epoch": 4.07258064516129, |
| "grad_norm": 0.4480503499507904, |
| "learning_rate": 3.695438732639149e-05, |
| "loss": 0.7068, |
| "num_input_tokens_seen": 1712480, |
| "step": 4545 |
| }, |
| { |
| "epoch": 4.077060931899641, |
| "grad_norm": 0.7662056088447571, |
| "learning_rate": 3.6920034450785526e-05, |
| "loss": 0.7124, |
| "num_input_tokens_seen": 1714368, |
| "step": 4550 |
| }, |
| { |
| "epoch": 4.081541218637993, |
| "grad_norm": 0.7628973722457886, |
| "learning_rate": 3.688565242080238e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 1716256, |
| "step": 4555 |
| }, |
| { |
| "epoch": 4.086021505376344, |
| "grad_norm": 0.5160735845565796, |
| "learning_rate": 3.6851241320534665e-05, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 1718208, |
| "step": 4560 |
| }, |
| { |
| "epoch": 4.090501792114695, |
| "grad_norm": 0.4380353093147278, |
| "learning_rate": 3.681680123414606e-05, |
| "loss": 0.7175, |
| "num_input_tokens_seen": 1719936, |
| "step": 4565 |
| }, |
| { |
| "epoch": 4.094982078853047, |
| "grad_norm": 0.5784623622894287, |
| "learning_rate": 3.678233224587118e-05, |
| "loss": 0.6923, |
| "num_input_tokens_seen": 1721856, |
| "step": 4570 |
| }, |
| { |
| "epoch": 4.099462365591398, |
| "grad_norm": 0.5189318656921387, |
| "learning_rate": 3.6747834440015294e-05, |
| "loss": 0.7124, |
| "num_input_tokens_seen": 1723808, |
| "step": 4575 |
| }, |
| { |
| "epoch": 4.103942652329749, |
| "grad_norm": 0.6294048428535461, |
| "learning_rate": 3.671330790095417e-05, |
| "loss": 0.6556, |
| "num_input_tokens_seen": 1725696, |
| "step": 4580 |
| }, |
| { |
| "epoch": 4.108422939068101, |
| "grad_norm": 0.5305403470993042, |
| "learning_rate": 3.667875271313386e-05, |
| "loss": 0.719, |
| "num_input_tokens_seen": 1727584, |
| "step": 4585 |
| }, |
| { |
| "epoch": 4.112903225806452, |
| "grad_norm": 0.7114974856376648, |
| "learning_rate": 3.664416896107047e-05, |
| "loss": 0.7206, |
| "num_input_tokens_seen": 1729568, |
| "step": 4590 |
| }, |
| { |
| "epoch": 4.117383512544803, |
| "grad_norm": 0.4398999512195587, |
| "learning_rate": 3.660955672934998e-05, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 1731328, |
| "step": 4595 |
| }, |
| { |
| "epoch": 4.121863799283154, |
| "grad_norm": 0.550876259803772, |
| "learning_rate": 3.657491610262802e-05, |
| "loss": 0.6637, |
| "num_input_tokens_seen": 1733344, |
| "step": 4600 |
| }, |
| { |
| "epoch": 4.126344086021505, |
| "grad_norm": 0.7345083951950073, |
| "learning_rate": 3.654024716562968e-05, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 1735232, |
| "step": 4605 |
| }, |
| { |
| "epoch": 4.130824372759856, |
| "grad_norm": 0.5149017572402954, |
| "learning_rate": 3.650555000314927e-05, |
| "loss": 0.6852, |
| "num_input_tokens_seen": 1737248, |
| "step": 4610 |
| }, |
| { |
| "epoch": 4.135304659498208, |
| "grad_norm": 0.7252482175827026, |
| "learning_rate": 3.6470824700050155e-05, |
| "loss": 0.7039, |
| "num_input_tokens_seen": 1739264, |
| "step": 4615 |
| }, |
| { |
| "epoch": 4.139784946236559, |
| "grad_norm": 0.5596062541007996, |
| "learning_rate": 3.643607134126452e-05, |
| "loss": 0.7211, |
| "num_input_tokens_seen": 1741184, |
| "step": 4620 |
| }, |
| { |
| "epoch": 4.14426523297491, |
| "grad_norm": 0.5212526321411133, |
| "learning_rate": 3.6401290011793185e-05, |
| "loss": 0.6673, |
| "num_input_tokens_seen": 1742976, |
| "step": 4625 |
| }, |
| { |
| "epoch": 4.148745519713262, |
| "grad_norm": 0.5234253406524658, |
| "learning_rate": 3.636648079670534e-05, |
| "loss": 0.6706, |
| "num_input_tokens_seen": 1744832, |
| "step": 4630 |
| }, |
| { |
| "epoch": 4.153225806451613, |
| "grad_norm": 0.6067260503768921, |
| "learning_rate": 3.6331643781138426e-05, |
| "loss": 0.7152, |
| "num_input_tokens_seen": 1746592, |
| "step": 4635 |
| }, |
| { |
| "epoch": 4.157706093189964, |
| "grad_norm": 0.8035642504692078, |
| "learning_rate": 3.629677905029785e-05, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 1748384, |
| "step": 4640 |
| }, |
| { |
| "epoch": 4.162186379928316, |
| "grad_norm": 0.5164393782615662, |
| "learning_rate": 3.626188668945683e-05, |
| "loss": 0.7261, |
| "num_input_tokens_seen": 1750272, |
| "step": 4645 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 0.3963463306427002, |
| "learning_rate": 3.622696678395613e-05, |
| "loss": 0.6577, |
| "num_input_tokens_seen": 1752128, |
| "step": 4650 |
| }, |
| { |
| "epoch": 4.171146953405018, |
| "grad_norm": 0.7539615631103516, |
| "learning_rate": 3.619201941920389e-05, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 1754112, |
| "step": 4655 |
| }, |
| { |
| "epoch": 4.175627240143369, |
| "grad_norm": 0.8456557393074036, |
| "learning_rate": 3.615704468067545e-05, |
| "loss": 0.7315, |
| "num_input_tokens_seen": 1755936, |
| "step": 4660 |
| }, |
| { |
| "epoch": 4.18010752688172, |
| "grad_norm": 0.5629037618637085, |
| "learning_rate": 3.612204265391306e-05, |
| "loss": 0.71, |
| "num_input_tokens_seen": 1757792, |
| "step": 4665 |
| }, |
| { |
| "epoch": 4.184587813620071, |
| "grad_norm": 0.3057938516139984, |
| "learning_rate": 3.608701342452573e-05, |
| "loss": 0.6738, |
| "num_input_tokens_seen": 1759680, |
| "step": 4670 |
| }, |
| { |
| "epoch": 4.189068100358423, |
| "grad_norm": 0.7656651735305786, |
| "learning_rate": 3.605195707818898e-05, |
| "loss": 0.7177, |
| "num_input_tokens_seen": 1761568, |
| "step": 4675 |
| }, |
| { |
| "epoch": 4.193548387096774, |
| "grad_norm": 0.837243914604187, |
| "learning_rate": 3.6016873700644685e-05, |
| "loss": 0.7361, |
| "num_input_tokens_seen": 1763488, |
| "step": 4680 |
| }, |
| { |
| "epoch": 4.198028673835125, |
| "grad_norm": 0.49771350622177124, |
| "learning_rate": 3.598176337770082e-05, |
| "loss": 0.7004, |
| "num_input_tokens_seen": 1765440, |
| "step": 4685 |
| }, |
| { |
| "epoch": 4.202508960573477, |
| "grad_norm": 0.6064881086349487, |
| "learning_rate": 3.594662619523127e-05, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 1767296, |
| "step": 4690 |
| }, |
| { |
| "epoch": 4.206989247311828, |
| "grad_norm": 0.45831379294395447, |
| "learning_rate": 3.5911462239175595e-05, |
| "loss": 0.6758, |
| "num_input_tokens_seen": 1769248, |
| "step": 4695 |
| }, |
| { |
| "epoch": 4.211469534050179, |
| "grad_norm": 0.3446964919567108, |
| "learning_rate": 3.587627159553886e-05, |
| "loss": 0.6909, |
| "num_input_tokens_seen": 1770976, |
| "step": 4700 |
| }, |
| { |
| "epoch": 4.215949820788531, |
| "grad_norm": 0.4302808940410614, |
| "learning_rate": 3.5841054350391386e-05, |
| "loss": 0.6985, |
| "num_input_tokens_seen": 1772960, |
| "step": 4705 |
| }, |
| { |
| "epoch": 4.220430107526882, |
| "grad_norm": 0.737852156162262, |
| "learning_rate": 3.580581058986858e-05, |
| "loss": 0.7044, |
| "num_input_tokens_seen": 1774752, |
| "step": 4710 |
| }, |
| { |
| "epoch": 4.224910394265233, |
| "grad_norm": 0.5260282754898071, |
| "learning_rate": 3.5770540400170675e-05, |
| "loss": 0.6993, |
| "num_input_tokens_seen": 1776800, |
| "step": 4715 |
| }, |
| { |
| "epoch": 4.229390681003585, |
| "grad_norm": 0.5473984479904175, |
| "learning_rate": 3.573524386756256e-05, |
| "loss": 0.7054, |
| "num_input_tokens_seen": 1778752, |
| "step": 4720 |
| }, |
| { |
| "epoch": 4.233870967741935, |
| "grad_norm": 0.45589348673820496, |
| "learning_rate": 3.569992107837356e-05, |
| "loss": 0.6958, |
| "num_input_tokens_seen": 1780736, |
| "step": 4725 |
| }, |
| { |
| "epoch": 4.238351254480286, |
| "grad_norm": 0.5350434184074402, |
| "learning_rate": 3.56645721189972e-05, |
| "loss": 0.6803, |
| "num_input_tokens_seen": 1782688, |
| "step": 4730 |
| }, |
| { |
| "epoch": 4.242831541218638, |
| "grad_norm": 0.761256992816925, |
| "learning_rate": 3.562919707589102e-05, |
| "loss": 0.7072, |
| "num_input_tokens_seen": 1784416, |
| "step": 4735 |
| }, |
| { |
| "epoch": 4.247311827956989, |
| "grad_norm": 0.6957365870475769, |
| "learning_rate": 3.5593796035576373e-05, |
| "loss": 0.668, |
| "num_input_tokens_seen": 1786432, |
| "step": 4740 |
| }, |
| { |
| "epoch": 4.25179211469534, |
| "grad_norm": 0.5861120223999023, |
| "learning_rate": 3.555836908463817e-05, |
| "loss": 0.6732, |
| "num_input_tokens_seen": 1788352, |
| "step": 4745 |
| }, |
| { |
| "epoch": 4.256272401433692, |
| "grad_norm": 0.5068464279174805, |
| "learning_rate": 3.552291630972472e-05, |
| "loss": 0.6916, |
| "num_input_tokens_seen": 1790208, |
| "step": 4750 |
| }, |
| { |
| "epoch": 4.260752688172043, |
| "grad_norm": 0.38968732953071594, |
| "learning_rate": 3.5487437797547476e-05, |
| "loss": 0.7119, |
| "num_input_tokens_seen": 1792032, |
| "step": 4755 |
| }, |
| { |
| "epoch": 4.265232974910394, |
| "grad_norm": 0.5201514363288879, |
| "learning_rate": 3.545193363488085e-05, |
| "loss": 0.7201, |
| "num_input_tokens_seen": 1793792, |
| "step": 4760 |
| }, |
| { |
| "epoch": 4.269713261648746, |
| "grad_norm": 0.6711928248405457, |
| "learning_rate": 3.5416403908561966e-05, |
| "loss": 0.7008, |
| "num_input_tokens_seen": 1795712, |
| "step": 4765 |
| }, |
| { |
| "epoch": 4.274193548387097, |
| "grad_norm": 0.5274402499198914, |
| "learning_rate": 3.538084870549052e-05, |
| "loss": 0.7237, |
| "num_input_tokens_seen": 1797536, |
| "step": 4770 |
| }, |
| { |
| "epoch": 4.278673835125448, |
| "grad_norm": 0.5370923280715942, |
| "learning_rate": 3.534526811262848e-05, |
| "loss": 0.7051, |
| "num_input_tokens_seen": 1799392, |
| "step": 4775 |
| }, |
| { |
| "epoch": 4.2831541218638, |
| "grad_norm": 0.5397687554359436, |
| "learning_rate": 3.530966221699992e-05, |
| "loss": 0.7353, |
| "num_input_tokens_seen": 1801184, |
| "step": 4780 |
| }, |
| { |
| "epoch": 4.287634408602151, |
| "grad_norm": 0.5232270956039429, |
| "learning_rate": 3.5274031105690826e-05, |
| "loss": 0.7115, |
| "num_input_tokens_seen": 1803104, |
| "step": 4785 |
| }, |
| { |
| "epoch": 4.292114695340501, |
| "grad_norm": 0.6073275208473206, |
| "learning_rate": 3.523837486584881e-05, |
| "loss": 0.6764, |
| "num_input_tokens_seen": 1805024, |
| "step": 4790 |
| }, |
| { |
| "epoch": 4.296594982078853, |
| "grad_norm": 0.5557631254196167, |
| "learning_rate": 3.5202693584682986e-05, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 1806848, |
| "step": 4795 |
| }, |
| { |
| "epoch": 4.301075268817204, |
| "grad_norm": 0.45816871523857117, |
| "learning_rate": 3.51669873494637e-05, |
| "loss": 0.7111, |
| "num_input_tokens_seen": 1808832, |
| "step": 4800 |
| }, |
| { |
| "epoch": 4.305555555555555, |
| "grad_norm": 0.3697778880596161, |
| "learning_rate": 3.513125624752232e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 1810656, |
| "step": 4805 |
| }, |
| { |
| "epoch": 4.310035842293907, |
| "grad_norm": 0.5849930644035339, |
| "learning_rate": 3.509550036625106e-05, |
| "loss": 0.7011, |
| "num_input_tokens_seen": 1812512, |
| "step": 4810 |
| }, |
| { |
| "epoch": 4.314516129032258, |
| "grad_norm": 0.597547709941864, |
| "learning_rate": 3.5059719793102716e-05, |
| "loss": 0.7366, |
| "num_input_tokens_seen": 1814336, |
| "step": 4815 |
| }, |
| { |
| "epoch": 4.318996415770609, |
| "grad_norm": 0.6377572417259216, |
| "learning_rate": 3.502391461559049e-05, |
| "loss": 0.7249, |
| "num_input_tokens_seen": 1816288, |
| "step": 4820 |
| }, |
| { |
| "epoch": 4.323476702508961, |
| "grad_norm": 0.43655264377593994, |
| "learning_rate": 3.498808492128776e-05, |
| "loss": 0.7186, |
| "num_input_tokens_seen": 1818144, |
| "step": 4825 |
| }, |
| { |
| "epoch": 4.327956989247312, |
| "grad_norm": 0.5869845747947693, |
| "learning_rate": 3.495223079782785e-05, |
| "loss": 0.6415, |
| "num_input_tokens_seen": 1820224, |
| "step": 4830 |
| }, |
| { |
| "epoch": 4.332437275985663, |
| "grad_norm": 0.3732021152973175, |
| "learning_rate": 3.491635233290387e-05, |
| "loss": 0.6636, |
| "num_input_tokens_seen": 1822048, |
| "step": 4835 |
| }, |
| { |
| "epoch": 4.336917562724015, |
| "grad_norm": 0.5236073136329651, |
| "learning_rate": 3.488044961426843e-05, |
| "loss": 0.6605, |
| "num_input_tokens_seen": 1824000, |
| "step": 4840 |
| }, |
| { |
| "epoch": 4.341397849462366, |
| "grad_norm": 0.4687046408653259, |
| "learning_rate": 3.484452272973347e-05, |
| "loss": 0.6923, |
| "num_input_tokens_seen": 1825856, |
| "step": 4845 |
| }, |
| { |
| "epoch": 4.345878136200717, |
| "grad_norm": 0.4935377240180969, |
| "learning_rate": 3.480857176717005e-05, |
| "loss": 0.7204, |
| "num_input_tokens_seen": 1827776, |
| "step": 4850 |
| }, |
| { |
| "epoch": 4.350358422939068, |
| "grad_norm": 0.5924071073532104, |
| "learning_rate": 3.4772596814508104e-05, |
| "loss": 0.7215, |
| "num_input_tokens_seen": 1829600, |
| "step": 4855 |
| }, |
| { |
| "epoch": 4.354838709677419, |
| "grad_norm": 0.42385607957839966, |
| "learning_rate": 3.473659795973626e-05, |
| "loss": 0.6406, |
| "num_input_tokens_seen": 1831360, |
| "step": 4860 |
| }, |
| { |
| "epoch": 4.35931899641577, |
| "grad_norm": 0.41054651141166687, |
| "learning_rate": 3.470057529090159e-05, |
| "loss": 0.7, |
| "num_input_tokens_seen": 1833152, |
| "step": 4865 |
| }, |
| { |
| "epoch": 4.363799283154122, |
| "grad_norm": 0.7326763868331909, |
| "learning_rate": 3.46645288961094e-05, |
| "loss": 0.7314, |
| "num_input_tokens_seen": 1834976, |
| "step": 4870 |
| }, |
| { |
| "epoch": 4.368279569892473, |
| "grad_norm": 0.32546576857566833, |
| "learning_rate": 3.462845886352306e-05, |
| "loss": 0.6558, |
| "num_input_tokens_seen": 1837024, |
| "step": 4875 |
| }, |
| { |
| "epoch": 4.372759856630824, |
| "grad_norm": 0.563502848148346, |
| "learning_rate": 3.4592365281363734e-05, |
| "loss": 0.7179, |
| "num_input_tokens_seen": 1838848, |
| "step": 4880 |
| }, |
| { |
| "epoch": 4.377240143369176, |
| "grad_norm": 0.4499485492706299, |
| "learning_rate": 3.455624823791018e-05, |
| "loss": 0.685, |
| "num_input_tokens_seen": 1840640, |
| "step": 4885 |
| }, |
| { |
| "epoch": 4.381720430107527, |
| "grad_norm": 0.6014490723609924, |
| "learning_rate": 3.4520107821498544e-05, |
| "loss": 0.7341, |
| "num_input_tokens_seen": 1842752, |
| "step": 4890 |
| }, |
| { |
| "epoch": 4.386200716845878, |
| "grad_norm": 0.4780343174934387, |
| "learning_rate": 3.448394412052215e-05, |
| "loss": 0.7098, |
| "num_input_tokens_seen": 1844480, |
| "step": 4895 |
| }, |
| { |
| "epoch": 4.39068100358423, |
| "grad_norm": 0.39559486508369446, |
| "learning_rate": 3.444775722343124e-05, |
| "loss": 0.6782, |
| "num_input_tokens_seen": 1846240, |
| "step": 4900 |
| }, |
| { |
| "epoch": 4.395161290322581, |
| "grad_norm": 0.6034467220306396, |
| "learning_rate": 3.441154721873284e-05, |
| "loss": 0.7097, |
| "num_input_tokens_seen": 1848064, |
| "step": 4905 |
| }, |
| { |
| "epoch": 4.399641577060932, |
| "grad_norm": 0.5184746980667114, |
| "learning_rate": 3.437531419499043e-05, |
| "loss": 0.6952, |
| "num_input_tokens_seen": 1850048, |
| "step": 4910 |
| }, |
| { |
| "epoch": 4.404121863799283, |
| "grad_norm": 0.6733912229537964, |
| "learning_rate": 3.4339058240823843e-05, |
| "loss": 0.6948, |
| "num_input_tokens_seen": 1852032, |
| "step": 4915 |
| }, |
| { |
| "epoch": 4.408602150537634, |
| "grad_norm": 0.6464542746543884, |
| "learning_rate": 3.430277944490898e-05, |
| "loss": 0.7098, |
| "num_input_tokens_seen": 1853888, |
| "step": 4920 |
| }, |
| { |
| "epoch": 4.413082437275985, |
| "grad_norm": 0.39893773198127747, |
| "learning_rate": 3.42664778959776e-05, |
| "loss": 0.695, |
| "num_input_tokens_seen": 1855776, |
| "step": 4925 |
| }, |
| { |
| "epoch": 4.417562724014337, |
| "grad_norm": 0.6849373579025269, |
| "learning_rate": 3.423015368281711e-05, |
| "loss": 0.7076, |
| "num_input_tokens_seen": 1857600, |
| "step": 4930 |
| }, |
| { |
| "epoch": 4.422043010752688, |
| "grad_norm": 0.5464024543762207, |
| "learning_rate": 3.419380689427038e-05, |
| "loss": 0.7138, |
| "num_input_tokens_seen": 1859520, |
| "step": 4935 |
| }, |
| { |
| "epoch": 4.426523297491039, |
| "grad_norm": 0.532284677028656, |
| "learning_rate": 3.415743761923546e-05, |
| "loss": 0.6927, |
| "num_input_tokens_seen": 1861440, |
| "step": 4940 |
| }, |
| { |
| "epoch": 4.431003584229391, |
| "grad_norm": 0.45980408787727356, |
| "learning_rate": 3.412104594666541e-05, |
| "loss": 0.7577, |
| "num_input_tokens_seen": 1863200, |
| "step": 4945 |
| }, |
| { |
| "epoch": 4.435483870967742, |
| "grad_norm": 0.506994903087616, |
| "learning_rate": 3.408463196556807e-05, |
| "loss": 0.6934, |
| "num_input_tokens_seen": 1865120, |
| "step": 4950 |
| }, |
| { |
| "epoch": 4.439964157706093, |
| "grad_norm": 0.5589765310287476, |
| "learning_rate": 3.404819576500586e-05, |
| "loss": 0.7261, |
| "num_input_tokens_seen": 1867136, |
| "step": 4955 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.448844313621521, |
| "learning_rate": 3.401173743409552e-05, |
| "loss": 0.6661, |
| "num_input_tokens_seen": 1868864, |
| "step": 4960 |
| }, |
| { |
| "epoch": 4.448924731182796, |
| "grad_norm": 0.3504087030887604, |
| "learning_rate": 3.397525706200793e-05, |
| "loss": 0.7071, |
| "num_input_tokens_seen": 1870720, |
| "step": 4965 |
| }, |
| { |
| "epoch": 4.453405017921147, |
| "grad_norm": 0.44309166073799133, |
| "learning_rate": 3.393875473796787e-05, |
| "loss": 0.6904, |
| "num_input_tokens_seen": 1872640, |
| "step": 4970 |
| }, |
| { |
| "epoch": 4.457885304659499, |
| "grad_norm": 0.6156589984893799, |
| "learning_rate": 3.390223055125383e-05, |
| "loss": 0.7403, |
| "num_input_tokens_seen": 1874528, |
| "step": 4975 |
| }, |
| { |
| "epoch": 4.462365591397849, |
| "grad_norm": 0.5609185695648193, |
| "learning_rate": 3.3865684591197745e-05, |
| "loss": 0.7005, |
| "num_input_tokens_seen": 1876416, |
| "step": 4980 |
| }, |
| { |
| "epoch": 4.4668458781362, |
| "grad_norm": 0.5331929922103882, |
| "learning_rate": 3.3829116947184823e-05, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 1878176, |
| "step": 4985 |
| }, |
| { |
| "epoch": 4.471326164874552, |
| "grad_norm": 0.49103447794914246, |
| "learning_rate": 3.379252770865331e-05, |
| "loss": 0.7062, |
| "num_input_tokens_seen": 1879968, |
| "step": 4990 |
| }, |
| { |
| "epoch": 4.475806451612903, |
| "grad_norm": 0.4910992681980133, |
| "learning_rate": 3.375591696509425e-05, |
| "loss": 0.7213, |
| "num_input_tokens_seen": 1881760, |
| "step": 4995 |
| }, |
| { |
| "epoch": 4.480286738351254, |
| "grad_norm": 0.8638359904289246, |
| "learning_rate": 3.371928480605131e-05, |
| "loss": 0.7325, |
| "num_input_tokens_seen": 1883648, |
| "step": 5000 |
| }, |
| { |
| "epoch": 4.484767025089606, |
| "grad_norm": 0.590151846408844, |
| "learning_rate": 3.3682631321120504e-05, |
| "loss": 0.669, |
| "num_input_tokens_seen": 1885696, |
| "step": 5005 |
| }, |
| { |
| "epoch": 4.489247311827957, |
| "grad_norm": 0.5797485709190369, |
| "learning_rate": 3.3645956599950044e-05, |
| "loss": 0.6884, |
| "num_input_tokens_seen": 1887488, |
| "step": 5010 |
| }, |
| { |
| "epoch": 4.493727598566308, |
| "grad_norm": 0.5620754361152649, |
| "learning_rate": 3.360926073224004e-05, |
| "loss": 0.6982, |
| "num_input_tokens_seen": 1889472, |
| "step": 5015 |
| }, |
| { |
| "epoch": 4.49820788530466, |
| "grad_norm": 0.5586421489715576, |
| "learning_rate": 3.3572543807742364e-05, |
| "loss": 0.6778, |
| "num_input_tokens_seen": 1891360, |
| "step": 5020 |
| }, |
| { |
| "epoch": 4.5, |
| "eval_loss": 0.701896607875824, |
| "eval_runtime": 5.6352, |
| "eval_samples_per_second": 88.018, |
| "eval_steps_per_second": 22.005, |
| "num_input_tokens_seen": 1892160, |
| "step": 5022 |
| }, |
| { |
| "epoch": 4.502688172043011, |
| "grad_norm": 0.6140494346618652, |
| "learning_rate": 3.3535805916260346e-05, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 1893312, |
| "step": 5025 |
| }, |
| { |
| "epoch": 4.507168458781362, |
| "grad_norm": 0.5332615375518799, |
| "learning_rate": 3.3499047147648645e-05, |
| "loss": 0.6872, |
| "num_input_tokens_seen": 1895072, |
| "step": 5030 |
| }, |
| { |
| "epoch": 4.511648745519714, |
| "grad_norm": 0.43873006105422974, |
| "learning_rate": 3.346226759181294e-05, |
| "loss": 0.699, |
| "num_input_tokens_seen": 1896928, |
| "step": 5035 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "grad_norm": 0.5747973918914795, |
| "learning_rate": 3.342546733870977e-05, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 1898816, |
| "step": 5040 |
| }, |
| { |
| "epoch": 4.520609318996415, |
| "grad_norm": 0.5600061416625977, |
| "learning_rate": 3.338864647834631e-05, |
| "loss": 0.6712, |
| "num_input_tokens_seen": 1900672, |
| "step": 5045 |
| }, |
| { |
| "epoch": 4.525089605734767, |
| "grad_norm": 0.4608553946018219, |
| "learning_rate": 3.335180510078012e-05, |
| "loss": 0.6479, |
| "num_input_tokens_seen": 1902528, |
| "step": 5050 |
| }, |
| { |
| "epoch": 4.529569892473118, |
| "grad_norm": 0.5437533855438232, |
| "learning_rate": 3.331494329611894e-05, |
| "loss": 0.7082, |
| "num_input_tokens_seen": 1904672, |
| "step": 5055 |
| }, |
| { |
| "epoch": 4.534050179211469, |
| "grad_norm": 0.8298529386520386, |
| "learning_rate": 3.327806115452046e-05, |
| "loss": 0.7172, |
| "num_input_tokens_seen": 1906720, |
| "step": 5060 |
| }, |
| { |
| "epoch": 4.538530465949821, |
| "grad_norm": 0.6532770991325378, |
| "learning_rate": 3.324115876619215e-05, |
| "loss": 0.7106, |
| "num_input_tokens_seen": 1908544, |
| "step": 5065 |
| }, |
| { |
| "epoch": 4.543010752688172, |
| "grad_norm": 0.6913199424743652, |
| "learning_rate": 3.3204236221390975e-05, |
| "loss": 0.7495, |
| "num_input_tokens_seen": 1910496, |
| "step": 5070 |
| }, |
| { |
| "epoch": 4.547491039426523, |
| "grad_norm": 0.5206092000007629, |
| "learning_rate": 3.316729361042319e-05, |
| "loss": 0.679, |
| "num_input_tokens_seen": 1912288, |
| "step": 5075 |
| }, |
| { |
| "epoch": 4.551971326164875, |
| "grad_norm": 0.46319350600242615, |
| "learning_rate": 3.3130331023644134e-05, |
| "loss": 0.7173, |
| "num_input_tokens_seen": 1914208, |
| "step": 5080 |
| }, |
| { |
| "epoch": 4.556451612903226, |
| "grad_norm": 0.7394029498100281, |
| "learning_rate": 3.309334855145803e-05, |
| "loss": 0.6766, |
| "num_input_tokens_seen": 1915968, |
| "step": 5085 |
| }, |
| { |
| "epoch": 4.560931899641577, |
| "grad_norm": 0.7033692002296448, |
| "learning_rate": 3.30563462843177e-05, |
| "loss": 0.7051, |
| "num_input_tokens_seen": 1917824, |
| "step": 5090 |
| }, |
| { |
| "epoch": 4.565412186379929, |
| "grad_norm": 0.7795708179473877, |
| "learning_rate": 3.301932431272439e-05, |
| "loss": 0.6573, |
| "num_input_tokens_seen": 1919808, |
| "step": 5095 |
| }, |
| { |
| "epoch": 4.56989247311828, |
| "grad_norm": 0.45599478483200073, |
| "learning_rate": 3.2982282727227565e-05, |
| "loss": 0.7036, |
| "num_input_tokens_seen": 1921728, |
| "step": 5100 |
| }, |
| { |
| "epoch": 4.574372759856631, |
| "grad_norm": 0.8516075611114502, |
| "learning_rate": 3.294522161842463e-05, |
| "loss": 0.7271, |
| "num_input_tokens_seen": 1923584, |
| "step": 5105 |
| }, |
| { |
| "epoch": 4.578853046594982, |
| "grad_norm": 0.6737523078918457, |
| "learning_rate": 3.2908141076960766e-05, |
| "loss": 0.727, |
| "num_input_tokens_seen": 1925504, |
| "step": 5110 |
| }, |
| { |
| "epoch": 4.583333333333333, |
| "grad_norm": 0.5484982132911682, |
| "learning_rate": 3.287104119352867e-05, |
| "loss": 0.6958, |
| "num_input_tokens_seen": 1927360, |
| "step": 5115 |
| }, |
| { |
| "epoch": 4.587813620071684, |
| "grad_norm": 0.6374887824058533, |
| "learning_rate": 3.283392205886833e-05, |
| "loss": 0.7413, |
| "num_input_tokens_seen": 1929376, |
| "step": 5120 |
| }, |
| { |
| "epoch": 4.592293906810036, |
| "grad_norm": 0.6388539671897888, |
| "learning_rate": 3.279678376376686e-05, |
| "loss": 0.6703, |
| "num_input_tokens_seen": 1931232, |
| "step": 5125 |
| }, |
| { |
| "epoch": 4.596774193548387, |
| "grad_norm": 0.5705585479736328, |
| "learning_rate": 3.2759626399058196e-05, |
| "loss": 0.6992, |
| "num_input_tokens_seen": 1933056, |
| "step": 5130 |
| }, |
| { |
| "epoch": 4.601254480286738, |
| "grad_norm": 0.45625385642051697, |
| "learning_rate": 3.2722450055622946e-05, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 1935040, |
| "step": 5135 |
| }, |
| { |
| "epoch": 4.60573476702509, |
| "grad_norm": 0.4561392068862915, |
| "learning_rate": 3.268525482438813e-05, |
| "loss": 0.6669, |
| "num_input_tokens_seen": 1937056, |
| "step": 5140 |
| }, |
| { |
| "epoch": 4.610215053763441, |
| "grad_norm": 0.5542040467262268, |
| "learning_rate": 3.264804079632693e-05, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 1938944, |
| "step": 5145 |
| }, |
| { |
| "epoch": 4.614695340501792, |
| "grad_norm": 0.4966046214103699, |
| "learning_rate": 3.2610808062458554e-05, |
| "loss": 0.7079, |
| "num_input_tokens_seen": 1940768, |
| "step": 5150 |
| }, |
| { |
| "epoch": 4.619175627240144, |
| "grad_norm": 0.5149814486503601, |
| "learning_rate": 3.257355671384794e-05, |
| "loss": 0.6761, |
| "num_input_tokens_seen": 1942560, |
| "step": 5155 |
| }, |
| { |
| "epoch": 4.623655913978495, |
| "grad_norm": 0.44616055488586426, |
| "learning_rate": 3.253628684160554e-05, |
| "loss": 0.7097, |
| "num_input_tokens_seen": 1944480, |
| "step": 5160 |
| }, |
| { |
| "epoch": 4.628136200716845, |
| "grad_norm": 0.7818517088890076, |
| "learning_rate": 3.2498998536887114e-05, |
| "loss": 0.7135, |
| "num_input_tokens_seen": 1946336, |
| "step": 5165 |
| }, |
| { |
| "epoch": 4.632616487455197, |
| "grad_norm": 0.46605023741722107, |
| "learning_rate": 3.246169189089354e-05, |
| "loss": 0.6895, |
| "num_input_tokens_seen": 1948064, |
| "step": 5170 |
| }, |
| { |
| "epoch": 4.637096774193548, |
| "grad_norm": 0.6193602681159973, |
| "learning_rate": 3.2424366994870515e-05, |
| "loss": 0.6853, |
| "num_input_tokens_seen": 1949952, |
| "step": 5175 |
| }, |
| { |
| "epoch": 4.641577060931899, |
| "grad_norm": 0.5214295387268066, |
| "learning_rate": 3.238702394010839e-05, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 1951680, |
| "step": 5180 |
| }, |
| { |
| "epoch": 4.646057347670251, |
| "grad_norm": 0.4216688573360443, |
| "learning_rate": 3.234966281794193e-05, |
| "loss": 0.6882, |
| "num_input_tokens_seen": 1953472, |
| "step": 5185 |
| }, |
| { |
| "epoch": 4.650537634408602, |
| "grad_norm": 0.46460291743278503, |
| "learning_rate": 3.231228371975007e-05, |
| "loss": 0.7436, |
| "num_input_tokens_seen": 1955328, |
| "step": 5190 |
| }, |
| { |
| "epoch": 4.655017921146953, |
| "grad_norm": 0.6976000666618347, |
| "learning_rate": 3.2274886736955744e-05, |
| "loss": 0.6798, |
| "num_input_tokens_seen": 1957184, |
| "step": 5195 |
| }, |
| { |
| "epoch": 4.659498207885305, |
| "grad_norm": 0.5898621678352356, |
| "learning_rate": 3.223747196102561e-05, |
| "loss": 0.7066, |
| "num_input_tokens_seen": 1959040, |
| "step": 5200 |
| }, |
| { |
| "epoch": 4.663978494623656, |
| "grad_norm": 0.5544499754905701, |
| "learning_rate": 3.220003948346984e-05, |
| "loss": 0.6431, |
| "num_input_tokens_seen": 1961088, |
| "step": 5205 |
| }, |
| { |
| "epoch": 4.668458781362007, |
| "grad_norm": 0.8915784955024719, |
| "learning_rate": 3.216258939584192e-05, |
| "loss": 0.698, |
| "num_input_tokens_seen": 1962752, |
| "step": 5210 |
| }, |
| { |
| "epoch": 4.672939068100359, |
| "grad_norm": 0.7086608409881592, |
| "learning_rate": 3.2125121789738384e-05, |
| "loss": 0.7153, |
| "num_input_tokens_seen": 1964704, |
| "step": 5215 |
| }, |
| { |
| "epoch": 4.67741935483871, |
| "grad_norm": 0.4128365218639374, |
| "learning_rate": 3.2087636756798635e-05, |
| "loss": 0.7051, |
| "num_input_tokens_seen": 1966688, |
| "step": 5220 |
| }, |
| { |
| "epoch": 4.681899641577061, |
| "grad_norm": 0.577278196811676, |
| "learning_rate": 3.205013438870468e-05, |
| "loss": 0.7005, |
| "num_input_tokens_seen": 1968480, |
| "step": 5225 |
| }, |
| { |
| "epoch": 4.686379928315413, |
| "grad_norm": 0.3509978652000427, |
| "learning_rate": 3.201261477718093e-05, |
| "loss": 0.7304, |
| "num_input_tokens_seen": 1970304, |
| "step": 5230 |
| }, |
| { |
| "epoch": 4.690860215053764, |
| "grad_norm": 0.6244447827339172, |
| "learning_rate": 3.197507801399399e-05, |
| "loss": 0.709, |
| "num_input_tokens_seen": 1972224, |
| "step": 5235 |
| }, |
| { |
| "epoch": 4.695340501792114, |
| "grad_norm": 0.6282808780670166, |
| "learning_rate": 3.193752419095239e-05, |
| "loss": 0.7164, |
| "num_input_tokens_seen": 1974016, |
| "step": 5240 |
| }, |
| { |
| "epoch": 4.699820788530466, |
| "grad_norm": 0.6692128777503967, |
| "learning_rate": 3.18999533999064e-05, |
| "loss": 0.6799, |
| "num_input_tokens_seen": 1975840, |
| "step": 5245 |
| }, |
| { |
| "epoch": 4.704301075268817, |
| "grad_norm": 0.7175477147102356, |
| "learning_rate": 3.186236573274779e-05, |
| "loss": 0.6818, |
| "num_input_tokens_seen": 1977728, |
| "step": 5250 |
| }, |
| { |
| "epoch": 4.708781362007168, |
| "grad_norm": 0.5985310673713684, |
| "learning_rate": 3.1824761281409574e-05, |
| "loss": 0.6939, |
| "num_input_tokens_seen": 1979776, |
| "step": 5255 |
| }, |
| { |
| "epoch": 4.71326164874552, |
| "grad_norm": 0.5040144920349121, |
| "learning_rate": 3.178714013786587e-05, |
| "loss": 0.6917, |
| "num_input_tokens_seen": 1981728, |
| "step": 5260 |
| }, |
| { |
| "epoch": 4.717741935483871, |
| "grad_norm": 0.5648061037063599, |
| "learning_rate": 3.174950239413161e-05, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 1983776, |
| "step": 5265 |
| }, |
| { |
| "epoch": 4.722222222222222, |
| "grad_norm": 0.5307470560073853, |
| "learning_rate": 3.171184814226228e-05, |
| "loss": 0.7299, |
| "num_input_tokens_seen": 1985632, |
| "step": 5270 |
| }, |
| { |
| "epoch": 4.726702508960574, |
| "grad_norm": 0.5408966541290283, |
| "learning_rate": 3.167417747435379e-05, |
| "loss": 0.7017, |
| "num_input_tokens_seen": 1987456, |
| "step": 5275 |
| }, |
| { |
| "epoch": 4.731182795698925, |
| "grad_norm": 0.603818416595459, |
| "learning_rate": 3.16364904825422e-05, |
| "loss": 0.7246, |
| "num_input_tokens_seen": 1989344, |
| "step": 5280 |
| }, |
| { |
| "epoch": 4.735663082437276, |
| "grad_norm": 0.8000281453132629, |
| "learning_rate": 3.1598787259003476e-05, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 1991232, |
| "step": 5285 |
| }, |
| { |
| "epoch": 4.740143369175628, |
| "grad_norm": 0.74366295337677, |
| "learning_rate": 3.1561067895953276e-05, |
| "loss": 0.6743, |
| "num_input_tokens_seen": 1993216, |
| "step": 5290 |
| }, |
| { |
| "epoch": 4.744623655913978, |
| "grad_norm": 0.6478007435798645, |
| "learning_rate": 3.152333248564677e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 1995040, |
| "step": 5295 |
| }, |
| { |
| "epoch": 4.749103942652329, |
| "grad_norm": 0.6803041696548462, |
| "learning_rate": 3.148558112037835e-05, |
| "loss": 0.6901, |
| "num_input_tokens_seen": 1996928, |
| "step": 5300 |
| }, |
| { |
| "epoch": 4.753584229390681, |
| "grad_norm": 0.4506027102470398, |
| "learning_rate": 3.1447813892481425e-05, |
| "loss": 0.7007, |
| "num_input_tokens_seen": 1998976, |
| "step": 5305 |
| }, |
| { |
| "epoch": 4.758064516129032, |
| "grad_norm": 0.4170258343219757, |
| "learning_rate": 3.141003089432822e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 2000864, |
| "step": 5310 |
| }, |
| { |
| "epoch": 4.762544802867383, |
| "grad_norm": 0.5882114171981812, |
| "learning_rate": 3.137223221832951e-05, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 2002688, |
| "step": 5315 |
| }, |
| { |
| "epoch": 4.767025089605735, |
| "grad_norm": 0.4571791887283325, |
| "learning_rate": 3.133441795693445e-05, |
| "loss": 0.7135, |
| "num_input_tokens_seen": 2004864, |
| "step": 5320 |
| }, |
| { |
| "epoch": 4.771505376344086, |
| "grad_norm": 0.6060591340065002, |
| "learning_rate": 3.129658820263028e-05, |
| "loss": 0.7044, |
| "num_input_tokens_seen": 2006880, |
| "step": 5325 |
| }, |
| { |
| "epoch": 4.775985663082437, |
| "grad_norm": 0.5432692170143127, |
| "learning_rate": 3.125874304794214e-05, |
| "loss": 0.6767, |
| "num_input_tokens_seen": 2008704, |
| "step": 5330 |
| }, |
| { |
| "epoch": 4.780465949820789, |
| "grad_norm": 0.7855601906776428, |
| "learning_rate": 3.122088258543287e-05, |
| "loss": 0.7241, |
| "num_input_tokens_seen": 2010592, |
| "step": 5335 |
| }, |
| { |
| "epoch": 4.78494623655914, |
| "grad_norm": 0.5748709440231323, |
| "learning_rate": 3.1183006907702684e-05, |
| "loss": 0.7007, |
| "num_input_tokens_seen": 2012448, |
| "step": 5340 |
| }, |
| { |
| "epoch": 4.789426523297491, |
| "grad_norm": 0.6286282539367676, |
| "learning_rate": 3.114511610738907e-05, |
| "loss": 0.708, |
| "num_input_tokens_seen": 2014208, |
| "step": 5345 |
| }, |
| { |
| "epoch": 4.793906810035843, |
| "grad_norm": 0.5121644735336304, |
| "learning_rate": 3.110721027716649e-05, |
| "loss": 0.7078, |
| "num_input_tokens_seen": 2016032, |
| "step": 5350 |
| }, |
| { |
| "epoch": 4.798387096774194, |
| "grad_norm": 0.5875132083892822, |
| "learning_rate": 3.106928950974614e-05, |
| "loss": 0.6871, |
| "num_input_tokens_seen": 2017920, |
| "step": 5355 |
| }, |
| { |
| "epoch": 4.802867383512545, |
| "grad_norm": 0.5885509252548218, |
| "learning_rate": 3.103135389787578e-05, |
| "loss": 0.7209, |
| "num_input_tokens_seen": 2019936, |
| "step": 5360 |
| }, |
| { |
| "epoch": 4.807347670250896, |
| "grad_norm": 0.6623149514198303, |
| "learning_rate": 3.099340353433946e-05, |
| "loss": 0.7188, |
| "num_input_tokens_seen": 2021824, |
| "step": 5365 |
| }, |
| { |
| "epoch": 4.811827956989247, |
| "grad_norm": 0.5710822939872742, |
| "learning_rate": 3.095543851195732e-05, |
| "loss": 0.6984, |
| "num_input_tokens_seen": 2023904, |
| "step": 5370 |
| }, |
| { |
| "epoch": 4.816308243727598, |
| "grad_norm": 0.42584097385406494, |
| "learning_rate": 3.091745892358535e-05, |
| "loss": 0.7011, |
| "num_input_tokens_seen": 2025728, |
| "step": 5375 |
| }, |
| { |
| "epoch": 4.82078853046595, |
| "grad_norm": 0.5550757050514221, |
| "learning_rate": 3.087946486211515e-05, |
| "loss": 0.705, |
| "num_input_tokens_seen": 2027520, |
| "step": 5380 |
| }, |
| { |
| "epoch": 4.825268817204301, |
| "grad_norm": 0.5160370469093323, |
| "learning_rate": 3.084145642047374e-05, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 2029568, |
| "step": 5385 |
| }, |
| { |
| "epoch": 4.829749103942652, |
| "grad_norm": 0.6525334119796753, |
| "learning_rate": 3.080343369162332e-05, |
| "loss": 0.6815, |
| "num_input_tokens_seen": 2031552, |
| "step": 5390 |
| }, |
| { |
| "epoch": 4.834229390681004, |
| "grad_norm": 0.4453662633895874, |
| "learning_rate": 3.076539676856101e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 2033472, |
| "step": 5395 |
| }, |
| { |
| "epoch": 4.838709677419355, |
| "grad_norm": 0.4875478744506836, |
| "learning_rate": 3.0727345744318645e-05, |
| "loss": 0.678, |
| "num_input_tokens_seen": 2035424, |
| "step": 5400 |
| }, |
| { |
| "epoch": 4.843189964157706, |
| "grad_norm": 0.409298837184906, |
| "learning_rate": 3.068928071196256e-05, |
| "loss": 0.7295, |
| "num_input_tokens_seen": 2037248, |
| "step": 5405 |
| }, |
| { |
| "epoch": 4.847670250896058, |
| "grad_norm": 0.458920955657959, |
| "learning_rate": 3.065120176459338e-05, |
| "loss": 0.6985, |
| "num_input_tokens_seen": 2039040, |
| "step": 5410 |
| }, |
| { |
| "epoch": 4.852150537634409, |
| "grad_norm": 0.4822891354560852, |
| "learning_rate": 3.0613108995345694e-05, |
| "loss": 0.6806, |
| "num_input_tokens_seen": 2041152, |
| "step": 5415 |
| }, |
| { |
| "epoch": 4.856630824372759, |
| "grad_norm": 0.7133973836898804, |
| "learning_rate": 3.057500249738796e-05, |
| "loss": 0.7237, |
| "num_input_tokens_seen": 2043072, |
| "step": 5420 |
| }, |
| { |
| "epoch": 4.861111111111111, |
| "grad_norm": 0.5981613993644714, |
| "learning_rate": 3.053688236392219e-05, |
| "loss": 0.6778, |
| "num_input_tokens_seen": 2045088, |
| "step": 5425 |
| }, |
| { |
| "epoch": 4.865591397849462, |
| "grad_norm": 0.6646333932876587, |
| "learning_rate": 3.0498748688183744e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 2046912, |
| "step": 5430 |
| }, |
| { |
| "epoch": 4.870071684587813, |
| "grad_norm": 0.5133581161499023, |
| "learning_rate": 3.046060156344111e-05, |
| "loss": 0.6655, |
| "num_input_tokens_seen": 2048768, |
| "step": 5435 |
| }, |
| { |
| "epoch": 4.874551971326165, |
| "grad_norm": 0.555539608001709, |
| "learning_rate": 3.0422441082995667e-05, |
| "loss": 0.6783, |
| "num_input_tokens_seen": 2050624, |
| "step": 5440 |
| }, |
| { |
| "epoch": 4.879032258064516, |
| "grad_norm": 0.5734941363334656, |
| "learning_rate": 3.0384267340181462e-05, |
| "loss": 0.6662, |
| "num_input_tokens_seen": 2052608, |
| "step": 5445 |
| }, |
| { |
| "epoch": 4.883512544802867, |
| "grad_norm": 0.3394169509410858, |
| "learning_rate": 3.0346080428364974e-05, |
| "loss": 0.6761, |
| "num_input_tokens_seen": 2054368, |
| "step": 5450 |
| }, |
| { |
| "epoch": 4.887992831541219, |
| "grad_norm": 0.5142562389373779, |
| "learning_rate": 3.0307880440944902e-05, |
| "loss": 0.7261, |
| "num_input_tokens_seen": 2056448, |
| "step": 5455 |
| }, |
| { |
| "epoch": 4.89247311827957, |
| "grad_norm": 0.5017949938774109, |
| "learning_rate": 3.026966747135192e-05, |
| "loss": 0.6861, |
| "num_input_tokens_seen": 2058368, |
| "step": 5460 |
| }, |
| { |
| "epoch": 4.896953405017921, |
| "grad_norm": 0.5257229804992676, |
| "learning_rate": 3.023144161304844e-05, |
| "loss": 0.67, |
| "num_input_tokens_seen": 2060256, |
| "step": 5465 |
| }, |
| { |
| "epoch": 4.901433691756273, |
| "grad_norm": 0.4861137866973877, |
| "learning_rate": 3.0193202959528426e-05, |
| "loss": 0.6471, |
| "num_input_tokens_seen": 2062240, |
| "step": 5470 |
| }, |
| { |
| "epoch": 4.905913978494624, |
| "grad_norm": 0.5103967785835266, |
| "learning_rate": 3.0154951604317118e-05, |
| "loss": 0.7269, |
| "num_input_tokens_seen": 2063968, |
| "step": 5475 |
| }, |
| { |
| "epoch": 4.910394265232975, |
| "grad_norm": 0.634955883026123, |
| "learning_rate": 3.0116687640970814e-05, |
| "loss": 0.7047, |
| "num_input_tokens_seen": 2065920, |
| "step": 5480 |
| }, |
| { |
| "epoch": 4.914874551971327, |
| "grad_norm": 0.7920023202896118, |
| "learning_rate": 3.0078411163076682e-05, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 2067808, |
| "step": 5485 |
| }, |
| { |
| "epoch": 4.919354838709677, |
| "grad_norm": 0.49357402324676514, |
| "learning_rate": 3.0040122264252457e-05, |
| "loss": 0.636, |
| "num_input_tokens_seen": 2069888, |
| "step": 5490 |
| }, |
| { |
| "epoch": 4.923835125448028, |
| "grad_norm": 0.581206738948822, |
| "learning_rate": 3.0001821038146287e-05, |
| "loss": 0.7512, |
| "num_input_tokens_seen": 2071712, |
| "step": 5495 |
| }, |
| { |
| "epoch": 4.92831541218638, |
| "grad_norm": 0.5302388668060303, |
| "learning_rate": 2.9963507578436456e-05, |
| "loss": 0.7503, |
| "num_input_tokens_seen": 2073536, |
| "step": 5500 |
| }, |
| { |
| "epoch": 4.932795698924731, |
| "grad_norm": 0.655440628528595, |
| "learning_rate": 2.9925181978831163e-05, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 2075392, |
| "step": 5505 |
| }, |
| { |
| "epoch": 4.937275985663082, |
| "grad_norm": 0.4810947775840759, |
| "learning_rate": 2.9886844333068314e-05, |
| "loss": 0.6949, |
| "num_input_tokens_seen": 2077280, |
| "step": 5510 |
| }, |
| { |
| "epoch": 4.941756272401434, |
| "grad_norm": 0.4489869773387909, |
| "learning_rate": 2.9848494734915276e-05, |
| "loss": 0.737, |
| "num_input_tokens_seen": 2079360, |
| "step": 5515 |
| }, |
| { |
| "epoch": 4.946236559139785, |
| "grad_norm": 0.6105610728263855, |
| "learning_rate": 2.9810133278168643e-05, |
| "loss": 0.7341, |
| "num_input_tokens_seen": 2081216, |
| "step": 5520 |
| }, |
| { |
| "epoch": 4.950716845878136, |
| "grad_norm": 0.4917219281196594, |
| "learning_rate": 2.9771760056654e-05, |
| "loss": 0.7096, |
| "num_input_tokens_seen": 2082944, |
| "step": 5525 |
| }, |
| { |
| "epoch": 4.955197132616488, |
| "grad_norm": 0.5548819899559021, |
| "learning_rate": 2.973337516422574e-05, |
| "loss": 0.7047, |
| "num_input_tokens_seen": 2084768, |
| "step": 5530 |
| }, |
| { |
| "epoch": 4.959677419354839, |
| "grad_norm": 0.5685299634933472, |
| "learning_rate": 2.9694978694766767e-05, |
| "loss": 0.6785, |
| "num_input_tokens_seen": 2086752, |
| "step": 5535 |
| }, |
| { |
| "epoch": 4.96415770609319, |
| "grad_norm": 0.36095836758613586, |
| "learning_rate": 2.9656570742188332e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 2088448, |
| "step": 5540 |
| }, |
| { |
| "epoch": 4.968637992831541, |
| "grad_norm": 0.5040276646614075, |
| "learning_rate": 2.961815140042974e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 2090432, |
| "step": 5545 |
| }, |
| { |
| "epoch": 4.973118279569892, |
| "grad_norm": 0.4736102521419525, |
| "learning_rate": 2.957972076345817e-05, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 2092384, |
| "step": 5550 |
| }, |
| { |
| "epoch": 4.977598566308243, |
| "grad_norm": 0.47025778889656067, |
| "learning_rate": 2.9541278925268428e-05, |
| "loss": 0.6863, |
| "num_input_tokens_seen": 2094080, |
| "step": 5555 |
| }, |
| { |
| "epoch": 4.982078853046595, |
| "grad_norm": 0.43467360734939575, |
| "learning_rate": 2.950282597988272e-05, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 2095776, |
| "step": 5560 |
| }, |
| { |
| "epoch": 4.986559139784946, |
| "grad_norm": 0.8185675740242004, |
| "learning_rate": 2.9464362021350395e-05, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 2097664, |
| "step": 5565 |
| }, |
| { |
| "epoch": 4.991039426523297, |
| "grad_norm": 0.5016605854034424, |
| "learning_rate": 2.9425887143747773e-05, |
| "loss": 0.6658, |
| "num_input_tokens_seen": 2099456, |
| "step": 5570 |
| }, |
| { |
| "epoch": 4.995519713261649, |
| "grad_norm": 0.610438346862793, |
| "learning_rate": 2.938740144117784e-05, |
| "loss": 0.6839, |
| "num_input_tokens_seen": 2101312, |
| "step": 5575 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 1.209800362586975, |
| "learning_rate": 2.93489050077701e-05, |
| "loss": 0.7394, |
| "num_input_tokens_seen": 2102920, |
| "step": 5580 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.7092333436012268, |
| "eval_runtime": 5.6121, |
| "eval_samples_per_second": 88.38, |
| "eval_steps_per_second": 22.095, |
| "num_input_tokens_seen": 2102920, |
| "step": 5580 |
| }, |
| { |
| "epoch": 5.004480286738351, |
| "grad_norm": 0.7478780746459961, |
| "learning_rate": 2.9310397937680277e-05, |
| "loss": 0.6765, |
| "num_input_tokens_seen": 2104808, |
| "step": 5585 |
| }, |
| { |
| "epoch": 5.008960573476703, |
| "grad_norm": 0.39114946126937866, |
| "learning_rate": 2.9271880325090105e-05, |
| "loss": 0.68, |
| "num_input_tokens_seen": 2106568, |
| "step": 5590 |
| }, |
| { |
| "epoch": 5.013440860215054, |
| "grad_norm": 0.949701189994812, |
| "learning_rate": 2.9233352264207133e-05, |
| "loss": 0.7081, |
| "num_input_tokens_seen": 2108456, |
| "step": 5595 |
| }, |
| { |
| "epoch": 5.017921146953405, |
| "grad_norm": 0.525955319404602, |
| "learning_rate": 2.919481384926443e-05, |
| "loss": 0.6584, |
| "num_input_tokens_seen": 2110184, |
| "step": 5600 |
| }, |
| { |
| "epoch": 5.022401433691757, |
| "grad_norm": 0.5059929490089417, |
| "learning_rate": 2.9156265174520414e-05, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 2112104, |
| "step": 5605 |
| }, |
| { |
| "epoch": 5.026881720430108, |
| "grad_norm": 0.5731722712516785, |
| "learning_rate": 2.911770633425858e-05, |
| "loss": 0.7132, |
| "num_input_tokens_seen": 2114056, |
| "step": 5610 |
| }, |
| { |
| "epoch": 5.031362007168458, |
| "grad_norm": 0.6896179914474487, |
| "learning_rate": 2.90791374227873e-05, |
| "loss": 0.7445, |
| "num_input_tokens_seen": 2115880, |
| "step": 5615 |
| }, |
| { |
| "epoch": 5.03584229390681, |
| "grad_norm": 0.6563051342964172, |
| "learning_rate": 2.9040558534439564e-05, |
| "loss": 0.6798, |
| "num_input_tokens_seen": 2117640, |
| "step": 5620 |
| }, |
| { |
| "epoch": 5.040322580645161, |
| "grad_norm": 0.4548991322517395, |
| "learning_rate": 2.9001969763572802e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 2119496, |
| "step": 5625 |
| }, |
| { |
| "epoch": 5.044802867383512, |
| "grad_norm": 0.5727083683013916, |
| "learning_rate": 2.8963371204568542e-05, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 2121384, |
| "step": 5630 |
| }, |
| { |
| "epoch": 5.049283154121864, |
| "grad_norm": 0.6287766098976135, |
| "learning_rate": 2.892476295183232e-05, |
| "loss": 0.6761, |
| "num_input_tokens_seen": 2123336, |
| "step": 5635 |
| }, |
| { |
| "epoch": 5.053763440860215, |
| "grad_norm": 0.452092707157135, |
| "learning_rate": 2.888614509979336e-05, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 2125064, |
| "step": 5640 |
| }, |
| { |
| "epoch": 5.058243727598566, |
| "grad_norm": 0.5629188418388367, |
| "learning_rate": 2.8847517742904352e-05, |
| "loss": 0.6902, |
| "num_input_tokens_seen": 2126920, |
| "step": 5645 |
| }, |
| { |
| "epoch": 5.062724014336918, |
| "grad_norm": 0.5243597626686096, |
| "learning_rate": 2.880888097564124e-05, |
| "loss": 0.7022, |
| "num_input_tokens_seen": 2128744, |
| "step": 5650 |
| }, |
| { |
| "epoch": 5.067204301075269, |
| "grad_norm": 0.39402908086776733, |
| "learning_rate": 2.877023489250299e-05, |
| "loss": 0.6651, |
| "num_input_tokens_seen": 2130664, |
| "step": 5655 |
| }, |
| { |
| "epoch": 5.07168458781362, |
| "grad_norm": 0.4814399778842926, |
| "learning_rate": 2.8731579588011343e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 2132520, |
| "step": 5660 |
| }, |
| { |
| "epoch": 5.076164874551972, |
| "grad_norm": 0.469891756772995, |
| "learning_rate": 2.8692915156710615e-05, |
| "loss": 0.7327, |
| "num_input_tokens_seen": 2134536, |
| "step": 5665 |
| }, |
| { |
| "epoch": 5.080645161290323, |
| "grad_norm": 0.49232617020606995, |
| "learning_rate": 2.8654241693167423e-05, |
| "loss": 0.6932, |
| "num_input_tokens_seen": 2136616, |
| "step": 5670 |
| }, |
| { |
| "epoch": 5.085125448028673, |
| "grad_norm": 0.7345511317253113, |
| "learning_rate": 2.8615559291970474e-05, |
| "loss": 0.6729, |
| "num_input_tokens_seen": 2138408, |
| "step": 5675 |
| }, |
| { |
| "epoch": 5.089605734767025, |
| "grad_norm": 0.7703377604484558, |
| "learning_rate": 2.8576868047730354e-05, |
| "loss": 0.7267, |
| "num_input_tokens_seen": 2140264, |
| "step": 5680 |
| }, |
| { |
| "epoch": 5.094086021505376, |
| "grad_norm": 0.5797245502471924, |
| "learning_rate": 2.8538168055079262e-05, |
| "loss": 0.6878, |
| "num_input_tokens_seen": 2142152, |
| "step": 5685 |
| }, |
| { |
| "epoch": 5.098566308243727, |
| "grad_norm": 0.8190505504608154, |
| "learning_rate": 2.8499459408670796e-05, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 2144040, |
| "step": 5690 |
| }, |
| { |
| "epoch": 5.103046594982079, |
| "grad_norm": 0.5040394067764282, |
| "learning_rate": 2.846074220317973e-05, |
| "loss": 0.6745, |
| "num_input_tokens_seen": 2145896, |
| "step": 5695 |
| }, |
| { |
| "epoch": 5.10752688172043, |
| "grad_norm": 0.5397759675979614, |
| "learning_rate": 2.8422016533301753e-05, |
| "loss": 0.7206, |
| "num_input_tokens_seen": 2147720, |
| "step": 5700 |
| }, |
| { |
| "epoch": 5.112007168458781, |
| "grad_norm": 0.6920524835586548, |
| "learning_rate": 2.8383282493753283e-05, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 2149704, |
| "step": 5705 |
| }, |
| { |
| "epoch": 5.116487455197133, |
| "grad_norm": 0.4486968219280243, |
| "learning_rate": 2.8344540179271178e-05, |
| "loss": 0.7165, |
| "num_input_tokens_seen": 2151592, |
| "step": 5710 |
| }, |
| { |
| "epoch": 5.120967741935484, |
| "grad_norm": 0.563451886177063, |
| "learning_rate": 2.830578968461256e-05, |
| "loss": 0.6802, |
| "num_input_tokens_seen": 2153320, |
| "step": 5715 |
| }, |
| { |
| "epoch": 5.125448028673835, |
| "grad_norm": 0.4947597086429596, |
| "learning_rate": 2.8267031104554552e-05, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 2155144, |
| "step": 5720 |
| }, |
| { |
| "epoch": 5.129928315412187, |
| "grad_norm": 0.4879480302333832, |
| "learning_rate": 2.822826453389404e-05, |
| "loss": 0.6772, |
| "num_input_tokens_seen": 2156904, |
| "step": 5725 |
| }, |
| { |
| "epoch": 5.134408602150538, |
| "grad_norm": 0.675524115562439, |
| "learning_rate": 2.8189490067447473e-05, |
| "loss": 0.7385, |
| "num_input_tokens_seen": 2158792, |
| "step": 5730 |
| }, |
| { |
| "epoch": 5.138888888888889, |
| "grad_norm": 0.5710222125053406, |
| "learning_rate": 2.815070780005059e-05, |
| "loss": 0.6677, |
| "num_input_tokens_seen": 2160776, |
| "step": 5735 |
| }, |
| { |
| "epoch": 5.14336917562724, |
| "grad_norm": 0.3597255349159241, |
| "learning_rate": 2.811191782655823e-05, |
| "loss": 0.6658, |
| "num_input_tokens_seen": 2162568, |
| "step": 5740 |
| }, |
| { |
| "epoch": 5.147849462365591, |
| "grad_norm": 0.5168389678001404, |
| "learning_rate": 2.8073120241844077e-05, |
| "loss": 0.6692, |
| "num_input_tokens_seen": 2164488, |
| "step": 5745 |
| }, |
| { |
| "epoch": 5.152329749103942, |
| "grad_norm": 0.41304054856300354, |
| "learning_rate": 2.8034315140800414e-05, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 2166184, |
| "step": 5750 |
| }, |
| { |
| "epoch": 5.156810035842294, |
| "grad_norm": 0.5995345711708069, |
| "learning_rate": 2.7995502618337933e-05, |
| "loss": 0.7489, |
| "num_input_tokens_seen": 2168040, |
| "step": 5755 |
| }, |
| { |
| "epoch": 5.161290322580645, |
| "grad_norm": 0.5592546463012695, |
| "learning_rate": 2.795668276938545e-05, |
| "loss": 0.6616, |
| "num_input_tokens_seen": 2170024, |
| "step": 5760 |
| }, |
| { |
| "epoch": 5.165770609318996, |
| "grad_norm": 0.37307098507881165, |
| "learning_rate": 2.7917855688889717e-05, |
| "loss": 0.7253, |
| "num_input_tokens_seen": 2171848, |
| "step": 5765 |
| }, |
| { |
| "epoch": 5.170250896057348, |
| "grad_norm": 0.4320157468318939, |
| "learning_rate": 2.787902147181517e-05, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 2173608, |
| "step": 5770 |
| }, |
| { |
| "epoch": 5.174731182795699, |
| "grad_norm": 0.5110576152801514, |
| "learning_rate": 2.7840180213143712e-05, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 2175336, |
| "step": 5775 |
| }, |
| { |
| "epoch": 5.17921146953405, |
| "grad_norm": 0.6609287261962891, |
| "learning_rate": 2.7801332007874437e-05, |
| "loss": 0.7321, |
| "num_input_tokens_seen": 2177192, |
| "step": 5780 |
| }, |
| { |
| "epoch": 5.183691756272402, |
| "grad_norm": 0.6040504574775696, |
| "learning_rate": 2.776247695102345e-05, |
| "loss": 0.691, |
| "num_input_tokens_seen": 2178952, |
| "step": 5785 |
| }, |
| { |
| "epoch": 5.188172043010753, |
| "grad_norm": 0.5164482593536377, |
| "learning_rate": 2.7723615137623637e-05, |
| "loss": 0.7194, |
| "num_input_tokens_seen": 2180968, |
| "step": 5790 |
| }, |
| { |
| "epoch": 5.192652329749104, |
| "grad_norm": 0.43215903639793396, |
| "learning_rate": 2.7684746662724363e-05, |
| "loss": 0.6952, |
| "num_input_tokens_seen": 2182792, |
| "step": 5795 |
| }, |
| { |
| "epoch": 5.197132616487456, |
| "grad_norm": 0.44016534090042114, |
| "learning_rate": 2.7645871621391305e-05, |
| "loss": 0.7246, |
| "num_input_tokens_seen": 2184648, |
| "step": 5800 |
| }, |
| { |
| "epoch": 5.201612903225806, |
| "grad_norm": 0.4214681386947632, |
| "learning_rate": 2.760699010870622e-05, |
| "loss": 0.6804, |
| "num_input_tokens_seen": 2186440, |
| "step": 5805 |
| }, |
| { |
| "epoch": 5.206093189964157, |
| "grad_norm": 0.48975706100463867, |
| "learning_rate": 2.7568102219766666e-05, |
| "loss": 0.7153, |
| "num_input_tokens_seen": 2188424, |
| "step": 5810 |
| }, |
| { |
| "epoch": 5.210573476702509, |
| "grad_norm": 0.6235063076019287, |
| "learning_rate": 2.7529208049685807e-05, |
| "loss": 0.7322, |
| "num_input_tokens_seen": 2190152, |
| "step": 5815 |
| }, |
| { |
| "epoch": 5.21505376344086, |
| "grad_norm": 0.4480155110359192, |
| "learning_rate": 2.7490307693592172e-05, |
| "loss": 0.7, |
| "num_input_tokens_seen": 2192072, |
| "step": 5820 |
| }, |
| { |
| "epoch": 5.219534050179211, |
| "grad_norm": 0.5600855946540833, |
| "learning_rate": 2.7451401246629403e-05, |
| "loss": 0.6852, |
| "num_input_tokens_seen": 2194056, |
| "step": 5825 |
| }, |
| { |
| "epoch": 5.224014336917563, |
| "grad_norm": 0.7299264073371887, |
| "learning_rate": 2.741248880395607e-05, |
| "loss": 0.6946, |
| "num_input_tokens_seen": 2195816, |
| "step": 5830 |
| }, |
| { |
| "epoch": 5.228494623655914, |
| "grad_norm": 0.46794813871383667, |
| "learning_rate": 2.7373570460745384e-05, |
| "loss": 0.6861, |
| "num_input_tokens_seen": 2197736, |
| "step": 5835 |
| }, |
| { |
| "epoch": 5.232974910394265, |
| "grad_norm": 0.5217588543891907, |
| "learning_rate": 2.7334646312184997e-05, |
| "loss": 0.6669, |
| "num_input_tokens_seen": 2199688, |
| "step": 5840 |
| }, |
| { |
| "epoch": 5.237455197132617, |
| "grad_norm": 0.44602036476135254, |
| "learning_rate": 2.7295716453476755e-05, |
| "loss": 0.6665, |
| "num_input_tokens_seen": 2201576, |
| "step": 5845 |
| }, |
| { |
| "epoch": 5.241935483870968, |
| "grad_norm": 0.5522176027297974, |
| "learning_rate": 2.7256780979836466e-05, |
| "loss": 0.6441, |
| "num_input_tokens_seen": 2203624, |
| "step": 5850 |
| }, |
| { |
| "epoch": 5.246415770609319, |
| "grad_norm": 0.43028005957603455, |
| "learning_rate": 2.721783998649369e-05, |
| "loss": 0.6738, |
| "num_input_tokens_seen": 2205448, |
| "step": 5855 |
| }, |
| { |
| "epoch": 5.250896057347671, |
| "grad_norm": 0.6127921938896179, |
| "learning_rate": 2.717889356869146e-05, |
| "loss": 0.7429, |
| "num_input_tokens_seen": 2207272, |
| "step": 5860 |
| }, |
| { |
| "epoch": 5.255376344086022, |
| "grad_norm": 0.5151240229606628, |
| "learning_rate": 2.71399418216861e-05, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 2209160, |
| "step": 5865 |
| }, |
| { |
| "epoch": 5.259856630824372, |
| "grad_norm": 0.8400991559028625, |
| "learning_rate": 2.7100984840746956e-05, |
| "loss": 0.7006, |
| "num_input_tokens_seen": 2211080, |
| "step": 5870 |
| }, |
| { |
| "epoch": 5.264336917562724, |
| "grad_norm": 0.7540143728256226, |
| "learning_rate": 2.7062022721156177e-05, |
| "loss": 0.7426, |
| "num_input_tokens_seen": 2213032, |
| "step": 5875 |
| }, |
| { |
| "epoch": 5.268817204301075, |
| "grad_norm": 0.6295384764671326, |
| "learning_rate": 2.7023055558208487e-05, |
| "loss": 0.7187, |
| "num_input_tokens_seen": 2214824, |
| "step": 5880 |
| }, |
| { |
| "epoch": 5.273297491039426, |
| "grad_norm": 0.467625230550766, |
| "learning_rate": 2.6984083447210945e-05, |
| "loss": 0.6995, |
| "num_input_tokens_seen": 2216648, |
| "step": 5885 |
| }, |
| { |
| "epoch": 5.277777777777778, |
| "grad_norm": 0.5293396711349487, |
| "learning_rate": 2.6945106483482686e-05, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 2218440, |
| "step": 5890 |
| }, |
| { |
| "epoch": 5.282258064516129, |
| "grad_norm": 0.6371461749076843, |
| "learning_rate": 2.690612476235475e-05, |
| "loss": 0.7196, |
| "num_input_tokens_seen": 2220424, |
| "step": 5895 |
| }, |
| { |
| "epoch": 5.28673835125448, |
| "grad_norm": 0.6132698059082031, |
| "learning_rate": 2.6867138379169802e-05, |
| "loss": 0.6934, |
| "num_input_tokens_seen": 2222152, |
| "step": 5900 |
| }, |
| { |
| "epoch": 5.291218637992832, |
| "grad_norm": 0.5086933374404907, |
| "learning_rate": 2.6828147429281902e-05, |
| "loss": 0.7014, |
| "num_input_tokens_seen": 2223976, |
| "step": 5905 |
| }, |
| { |
| "epoch": 5.295698924731183, |
| "grad_norm": 0.47041961550712585, |
| "learning_rate": 2.6789152008056272e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 2225960, |
| "step": 5910 |
| }, |
| { |
| "epoch": 5.300179211469534, |
| "grad_norm": 0.7675802707672119, |
| "learning_rate": 2.6750152210869095e-05, |
| "loss": 0.6973, |
| "num_input_tokens_seen": 2227912, |
| "step": 5915 |
| }, |
| { |
| "epoch": 5.304659498207886, |
| "grad_norm": 0.5190770030021667, |
| "learning_rate": 2.6711148133107233e-05, |
| "loss": 0.6921, |
| "num_input_tokens_seen": 2229736, |
| "step": 5920 |
| }, |
| { |
| "epoch": 5.309139784946237, |
| "grad_norm": 0.509989321231842, |
| "learning_rate": 2.6672139870168034e-05, |
| "loss": 0.6864, |
| "num_input_tokens_seen": 2231720, |
| "step": 5925 |
| }, |
| { |
| "epoch": 5.313620071684587, |
| "grad_norm": 0.5584254264831543, |
| "learning_rate": 2.6633127517459066e-05, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 2233544, |
| "step": 5930 |
| }, |
| { |
| "epoch": 5.318100358422939, |
| "grad_norm": 0.4709128439426422, |
| "learning_rate": 2.6594111170397916e-05, |
| "loss": 0.6945, |
| "num_input_tokens_seen": 2235336, |
| "step": 5935 |
| }, |
| { |
| "epoch": 5.32258064516129, |
| "grad_norm": 0.498522013425827, |
| "learning_rate": 2.655509092441194e-05, |
| "loss": 0.6868, |
| "num_input_tokens_seen": 2237128, |
| "step": 5940 |
| }, |
| { |
| "epoch": 5.327060931899641, |
| "grad_norm": 0.7889726758003235, |
| "learning_rate": 2.6516066874938023e-05, |
| "loss": 0.7143, |
| "num_input_tokens_seen": 2239016, |
| "step": 5945 |
| }, |
| { |
| "epoch": 5.331541218637993, |
| "grad_norm": 0.5677915811538696, |
| "learning_rate": 2.6477039117422335e-05, |
| "loss": 0.6815, |
| "num_input_tokens_seen": 2240968, |
| "step": 5950 |
| }, |
| { |
| "epoch": 5.336021505376344, |
| "grad_norm": 0.6692419052124023, |
| "learning_rate": 2.6438007747320153e-05, |
| "loss": 0.6658, |
| "num_input_tokens_seen": 2242728, |
| "step": 5955 |
| }, |
| { |
| "epoch": 5.340501792114695, |
| "grad_norm": 0.5950575470924377, |
| "learning_rate": 2.639897286009556e-05, |
| "loss": 0.7028, |
| "num_input_tokens_seen": 2244584, |
| "step": 5960 |
| }, |
| { |
| "epoch": 5.344982078853047, |
| "grad_norm": 0.4070301949977875, |
| "learning_rate": 2.6359934551221267e-05, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 2246408, |
| "step": 5965 |
| }, |
| { |
| "epoch": 5.349462365591398, |
| "grad_norm": 0.6000044345855713, |
| "learning_rate": 2.6320892916178326e-05, |
| "loss": 0.7226, |
| "num_input_tokens_seen": 2248456, |
| "step": 5970 |
| }, |
| { |
| "epoch": 5.353942652329749, |
| "grad_norm": 0.5580965280532837, |
| "learning_rate": 2.628184805045593e-05, |
| "loss": 0.7243, |
| "num_input_tokens_seen": 2250216, |
| "step": 5975 |
| }, |
| { |
| "epoch": 5.358422939068101, |
| "grad_norm": 0.5775971412658691, |
| "learning_rate": 2.6242800049551192e-05, |
| "loss": 0.6869, |
| "num_input_tokens_seen": 2252040, |
| "step": 5980 |
| }, |
| { |
| "epoch": 5.362903225806452, |
| "grad_norm": 0.7574295401573181, |
| "learning_rate": 2.620374900896889e-05, |
| "loss": 0.6809, |
| "num_input_tokens_seen": 2253992, |
| "step": 5985 |
| }, |
| { |
| "epoch": 5.367383512544803, |
| "grad_norm": 0.4660971164703369, |
| "learning_rate": 2.6164695024221215e-05, |
| "loss": 0.7012, |
| "num_input_tokens_seen": 2255816, |
| "step": 5990 |
| }, |
| { |
| "epoch": 5.371863799283154, |
| "grad_norm": 0.8073466420173645, |
| "learning_rate": 2.612563819082757e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 2257672, |
| "step": 5995 |
| }, |
| { |
| "epoch": 5.376344086021505, |
| "grad_norm": 0.7728441953659058, |
| "learning_rate": 2.6086578604314337e-05, |
| "loss": 0.6909, |
| "num_input_tokens_seen": 2259688, |
| "step": 6000 |
| }, |
| { |
| "epoch": 5.380824372759856, |
| "grad_norm": 0.665420651435852, |
| "learning_rate": 2.6047516360214623e-05, |
| "loss": 0.6906, |
| "num_input_tokens_seen": 2261512, |
| "step": 6005 |
| }, |
| { |
| "epoch": 5.385304659498208, |
| "grad_norm": 0.772237241268158, |
| "learning_rate": 2.6008451554068025e-05, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 2263240, |
| "step": 6010 |
| }, |
| { |
| "epoch": 5.389784946236559, |
| "grad_norm": 0.6501445770263672, |
| "learning_rate": 2.5969384281420424e-05, |
| "loss": 0.6998, |
| "num_input_tokens_seen": 2265000, |
| "step": 6015 |
| }, |
| { |
| "epoch": 5.39426523297491, |
| "grad_norm": 0.49419528245925903, |
| "learning_rate": 2.593031463782371e-05, |
| "loss": 0.6816, |
| "num_input_tokens_seen": 2266792, |
| "step": 6020 |
| }, |
| { |
| "epoch": 5.398745519713262, |
| "grad_norm": 0.46163687109947205, |
| "learning_rate": 2.5891242718835614e-05, |
| "loss": 0.7022, |
| "num_input_tokens_seen": 2268648, |
| "step": 6025 |
| }, |
| { |
| "epoch": 5.403225806451613, |
| "grad_norm": 0.4298229515552521, |
| "learning_rate": 2.5852168620019385e-05, |
| "loss": 0.6706, |
| "num_input_tokens_seen": 2270472, |
| "step": 6030 |
| }, |
| { |
| "epoch": 5.407706093189964, |
| "grad_norm": 0.615990936756134, |
| "learning_rate": 2.5813092436943626e-05, |
| "loss": 0.7241, |
| "num_input_tokens_seen": 2272296, |
| "step": 6035 |
| }, |
| { |
| "epoch": 5.412186379928316, |
| "grad_norm": 0.5446776151657104, |
| "learning_rate": 2.577401426518204e-05, |
| "loss": 0.7074, |
| "num_input_tokens_seen": 2274248, |
| "step": 6040 |
| }, |
| { |
| "epoch": 5.416666666666667, |
| "grad_norm": 0.48909687995910645, |
| "learning_rate": 2.573493420031318e-05, |
| "loss": 0.6816, |
| "num_input_tokens_seen": 2276168, |
| "step": 6045 |
| }, |
| { |
| "epoch": 5.421146953405018, |
| "grad_norm": 0.45095184445381165, |
| "learning_rate": 2.569585233792027e-05, |
| "loss": 0.6593, |
| "num_input_tokens_seen": 2278056, |
| "step": 6050 |
| }, |
| { |
| "epoch": 5.425627240143369, |
| "grad_norm": 0.6048058867454529, |
| "learning_rate": 2.5656768773590854e-05, |
| "loss": 0.6883, |
| "num_input_tokens_seen": 2279944, |
| "step": 6055 |
| }, |
| { |
| "epoch": 5.43010752688172, |
| "grad_norm": 0.44659972190856934, |
| "learning_rate": 2.5617683602916714e-05, |
| "loss": 0.721, |
| "num_input_tokens_seen": 2281896, |
| "step": 6060 |
| }, |
| { |
| "epoch": 5.434587813620071, |
| "grad_norm": 0.3663732409477234, |
| "learning_rate": 2.5578596921493525e-05, |
| "loss": 0.7137, |
| "num_input_tokens_seen": 2283592, |
| "step": 6065 |
| }, |
| { |
| "epoch": 5.439068100358423, |
| "grad_norm": 0.6166502237319946, |
| "learning_rate": 2.553950882492066e-05, |
| "loss": 0.6721, |
| "num_input_tokens_seen": 2285640, |
| "step": 6070 |
| }, |
| { |
| "epoch": 5.443548387096774, |
| "grad_norm": 0.4628104865550995, |
| "learning_rate": 2.5500419408800953e-05, |
| "loss": 0.7151, |
| "num_input_tokens_seen": 2287464, |
| "step": 6075 |
| }, |
| { |
| "epoch": 5.448028673835125, |
| "grad_norm": 0.5418219566345215, |
| "learning_rate": 2.546132876874048e-05, |
| "loss": 0.6941, |
| "num_input_tokens_seen": 2289352, |
| "step": 6080 |
| }, |
| { |
| "epoch": 5.452508960573477, |
| "grad_norm": 0.5038431882858276, |
| "learning_rate": 2.5422237000348276e-05, |
| "loss": 0.7297, |
| "num_input_tokens_seen": 2291240, |
| "step": 6085 |
| }, |
| { |
| "epoch": 5.456989247311828, |
| "grad_norm": 0.7030521631240845, |
| "learning_rate": 2.5383144199236188e-05, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 2293352, |
| "step": 6090 |
| }, |
| { |
| "epoch": 5.461469534050179, |
| "grad_norm": 0.6526070237159729, |
| "learning_rate": 2.5344050461018542e-05, |
| "loss": 0.6786, |
| "num_input_tokens_seen": 2295464, |
| "step": 6095 |
| }, |
| { |
| "epoch": 5.465949820788531, |
| "grad_norm": 0.5026730298995972, |
| "learning_rate": 2.530495588131197e-05, |
| "loss": 0.6907, |
| "num_input_tokens_seen": 2297160, |
| "step": 6100 |
| }, |
| { |
| "epoch": 5.470430107526882, |
| "grad_norm": 0.3277010917663574, |
| "learning_rate": 2.526586055573518e-05, |
| "loss": 0.6977, |
| "num_input_tokens_seen": 2299048, |
| "step": 6105 |
| }, |
| { |
| "epoch": 5.474910394265233, |
| "grad_norm": 0.6846584677696228, |
| "learning_rate": 2.5226764579908678e-05, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 2300904, |
| "step": 6110 |
| }, |
| { |
| "epoch": 5.479390681003585, |
| "grad_norm": 0.4550936818122864, |
| "learning_rate": 2.5187668049454583e-05, |
| "loss": 0.6808, |
| "num_input_tokens_seen": 2302824, |
| "step": 6115 |
| }, |
| { |
| "epoch": 5.483870967741936, |
| "grad_norm": 0.600966215133667, |
| "learning_rate": 2.5148571059996346e-05, |
| "loss": 0.7128, |
| "num_input_tokens_seen": 2304648, |
| "step": 6120 |
| }, |
| { |
| "epoch": 5.488351254480286, |
| "grad_norm": 0.61279296875, |
| "learning_rate": 2.5109473707158565e-05, |
| "loss": 0.7259, |
| "num_input_tokens_seen": 2306760, |
| "step": 6125 |
| }, |
| { |
| "epoch": 5.492831541218638, |
| "grad_norm": 0.7445093989372253, |
| "learning_rate": 2.5070376086566704e-05, |
| "loss": 0.6921, |
| "num_input_tokens_seen": 2308648, |
| "step": 6130 |
| }, |
| { |
| "epoch": 5.497311827956989, |
| "grad_norm": 0.6505946516990662, |
| "learning_rate": 2.5031278293846922e-05, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 2310728, |
| "step": 6135 |
| }, |
| { |
| "epoch": 5.5, |
| "eval_loss": 0.7000005841255188, |
| "eval_runtime": 5.6446, |
| "eval_samples_per_second": 87.871, |
| "eval_steps_per_second": 21.968, |
| "num_input_tokens_seen": 2311976, |
| "step": 6138 |
| }, |
| { |
| "epoch": 5.50179211469534, |
| "grad_norm": 0.5703749656677246, |
| "learning_rate": 2.4992180424625737e-05, |
| "loss": 0.6789, |
| "num_input_tokens_seen": 2312904, |
| "step": 6140 |
| }, |
| { |
| "epoch": 5.506272401433692, |
| "grad_norm": 0.3148249387741089, |
| "learning_rate": 2.4953082574529906e-05, |
| "loss": 0.7067, |
| "num_input_tokens_seen": 2314856, |
| "step": 6145 |
| }, |
| { |
| "epoch": 5.510752688172043, |
| "grad_norm": 0.41538357734680176, |
| "learning_rate": 2.491398483918612e-05, |
| "loss": 0.6485, |
| "num_input_tokens_seen": 2316808, |
| "step": 6150 |
| }, |
| { |
| "epoch": 5.515232974910394, |
| "grad_norm": 0.6810615658760071, |
| "learning_rate": 2.48748873142208e-05, |
| "loss": 0.6799, |
| "num_input_tokens_seen": 2318664, |
| "step": 6155 |
| }, |
| { |
| "epoch": 5.519713261648746, |
| "grad_norm": 0.5225232243537903, |
| "learning_rate": 2.4835790095259825e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 2320552, |
| "step": 6160 |
| }, |
| { |
| "epoch": 5.524193548387097, |
| "grad_norm": 0.7076472043991089, |
| "learning_rate": 2.479669327792835e-05, |
| "loss": 0.6802, |
| "num_input_tokens_seen": 2322632, |
| "step": 6165 |
| }, |
| { |
| "epoch": 5.528673835125448, |
| "grad_norm": 0.5768188834190369, |
| "learning_rate": 2.475759695785054e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 2324360, |
| "step": 6170 |
| }, |
| { |
| "epoch": 5.5331541218638, |
| "grad_norm": 0.623727023601532, |
| "learning_rate": 2.4718501230649355e-05, |
| "loss": 0.6683, |
| "num_input_tokens_seen": 2326184, |
| "step": 6175 |
| }, |
| { |
| "epoch": 5.53763440860215, |
| "grad_norm": 0.4721769392490387, |
| "learning_rate": 2.4679406191946285e-05, |
| "loss": 0.7199, |
| "num_input_tokens_seen": 2328072, |
| "step": 6180 |
| }, |
| { |
| "epoch": 5.542114695340501, |
| "grad_norm": 0.47976237535476685, |
| "learning_rate": 2.464031193736116e-05, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 2329960, |
| "step": 6185 |
| }, |
| { |
| "epoch": 5.546594982078853, |
| "grad_norm": 0.5275446772575378, |
| "learning_rate": 2.4601218562511856e-05, |
| "loss": 0.7152, |
| "num_input_tokens_seen": 2331816, |
| "step": 6190 |
| }, |
| { |
| "epoch": 5.551075268817204, |
| "grad_norm": 0.5743386745452881, |
| "learning_rate": 2.4562126163014134e-05, |
| "loss": 0.7023, |
| "num_input_tokens_seen": 2333800, |
| "step": 6195 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 0.3725605607032776, |
| "learning_rate": 2.452303483448136e-05, |
| "loss": 0.6987, |
| "num_input_tokens_seen": 2335624, |
| "step": 6200 |
| }, |
| { |
| "epoch": 5.560035842293907, |
| "grad_norm": 0.7053723931312561, |
| "learning_rate": 2.4483944672524263e-05, |
| "loss": 0.671, |
| "num_input_tokens_seen": 2337544, |
| "step": 6205 |
| }, |
| { |
| "epoch": 5.564516129032258, |
| "grad_norm": 0.3978196382522583, |
| "learning_rate": 2.444485577275075e-05, |
| "loss": 0.7045, |
| "num_input_tokens_seen": 2339400, |
| "step": 6210 |
| }, |
| { |
| "epoch": 5.568996415770609, |
| "grad_norm": 0.46860271692276, |
| "learning_rate": 2.44057682307656e-05, |
| "loss": 0.6753, |
| "num_input_tokens_seen": 2341128, |
| "step": 6215 |
| }, |
| { |
| "epoch": 5.573476702508961, |
| "grad_norm": 0.6284614205360413, |
| "learning_rate": 2.436668214217031e-05, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 2342920, |
| "step": 6220 |
| }, |
| { |
| "epoch": 5.577956989247312, |
| "grad_norm": 0.541385293006897, |
| "learning_rate": 2.4327597602562792e-05, |
| "loss": 0.6794, |
| "num_input_tokens_seen": 2344968, |
| "step": 6225 |
| }, |
| { |
| "epoch": 5.582437275985663, |
| "grad_norm": 0.41632750630378723, |
| "learning_rate": 2.428851470753719e-05, |
| "loss": 0.6832, |
| "num_input_tokens_seen": 2346824, |
| "step": 6230 |
| }, |
| { |
| "epoch": 5.586917562724015, |
| "grad_norm": 0.36444324254989624, |
| "learning_rate": 2.4249433552683627e-05, |
| "loss": 0.6629, |
| "num_input_tokens_seen": 2348712, |
| "step": 6235 |
| }, |
| { |
| "epoch": 5.591397849462366, |
| "grad_norm": 0.4200541377067566, |
| "learning_rate": 2.4210354233587955e-05, |
| "loss": 0.7115, |
| "num_input_tokens_seen": 2350600, |
| "step": 6240 |
| }, |
| { |
| "epoch": 5.595878136200717, |
| "grad_norm": 0.5005070567131042, |
| "learning_rate": 2.417127684583154e-05, |
| "loss": 0.6852, |
| "num_input_tokens_seen": 2352584, |
| "step": 6245 |
| }, |
| { |
| "epoch": 5.600358422939068, |
| "grad_norm": 0.4710138440132141, |
| "learning_rate": 2.413220148499103e-05, |
| "loss": 0.7178, |
| "num_input_tokens_seen": 2354408, |
| "step": 6250 |
| }, |
| { |
| "epoch": 5.604838709677419, |
| "grad_norm": 0.6707773804664612, |
| "learning_rate": 2.409312824663811e-05, |
| "loss": 0.7461, |
| "num_input_tokens_seen": 2356264, |
| "step": 6255 |
| }, |
| { |
| "epoch": 5.60931899641577, |
| "grad_norm": 0.5251260995864868, |
| "learning_rate": 2.405405722633928e-05, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 2358152, |
| "step": 6260 |
| }, |
| { |
| "epoch": 5.613799283154122, |
| "grad_norm": 0.46387919783592224, |
| "learning_rate": 2.4014988519655618e-05, |
| "loss": 0.71, |
| "num_input_tokens_seen": 2359912, |
| "step": 6265 |
| }, |
| { |
| "epoch": 5.618279569892473, |
| "grad_norm": 0.7532406449317932, |
| "learning_rate": 2.3975922222142517e-05, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 2361864, |
| "step": 6270 |
| }, |
| { |
| "epoch": 5.622759856630824, |
| "grad_norm": 0.591415286064148, |
| "learning_rate": 2.3936858429349508e-05, |
| "loss": 0.6857, |
| "num_input_tokens_seen": 2363784, |
| "step": 6275 |
| }, |
| { |
| "epoch": 5.627240143369176, |
| "grad_norm": 0.45360586047172546, |
| "learning_rate": 2.389779723681999e-05, |
| "loss": 0.6659, |
| "num_input_tokens_seen": 2365608, |
| "step": 6280 |
| }, |
| { |
| "epoch": 5.631720430107527, |
| "grad_norm": 0.43823105096817017, |
| "learning_rate": 2.3858738740090995e-05, |
| "loss": 0.7003, |
| "num_input_tokens_seen": 2367496, |
| "step": 6285 |
| }, |
| { |
| "epoch": 5.636200716845878, |
| "grad_norm": 0.5166615843772888, |
| "learning_rate": 2.3819683034692953e-05, |
| "loss": 0.6941, |
| "num_input_tokens_seen": 2369416, |
| "step": 6290 |
| }, |
| { |
| "epoch": 5.64068100358423, |
| "grad_norm": 0.5909369587898254, |
| "learning_rate": 2.3780630216149506e-05, |
| "loss": 0.6664, |
| "num_input_tokens_seen": 2371336, |
| "step": 6295 |
| }, |
| { |
| "epoch": 5.645161290322581, |
| "grad_norm": 0.6770111322402954, |
| "learning_rate": 2.374158037997717e-05, |
| "loss": 0.702, |
| "num_input_tokens_seen": 2373416, |
| "step": 6300 |
| }, |
| { |
| "epoch": 5.649641577060932, |
| "grad_norm": 0.6585894227027893, |
| "learning_rate": 2.3702533621685228e-05, |
| "loss": 0.7382, |
| "num_input_tokens_seen": 2375304, |
| "step": 6305 |
| }, |
| { |
| "epoch": 5.654121863799283, |
| "grad_norm": 0.4873889088630676, |
| "learning_rate": 2.36634900367754e-05, |
| "loss": 0.6871, |
| "num_input_tokens_seen": 2377192, |
| "step": 6310 |
| }, |
| { |
| "epoch": 5.658602150537634, |
| "grad_norm": 0.7726801037788391, |
| "learning_rate": 2.3624449720741654e-05, |
| "loss": 0.7019, |
| "num_input_tokens_seen": 2379080, |
| "step": 6315 |
| }, |
| { |
| "epoch": 5.663082437275985, |
| "grad_norm": 0.8256939649581909, |
| "learning_rate": 2.3585412769069984e-05, |
| "loss": 0.6638, |
| "num_input_tokens_seen": 2381384, |
| "step": 6320 |
| }, |
| { |
| "epoch": 5.667562724014337, |
| "grad_norm": 0.4363030195236206, |
| "learning_rate": 2.3546379277238107e-05, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 2383304, |
| "step": 6325 |
| }, |
| { |
| "epoch": 5.672043010752688, |
| "grad_norm": 0.6827439665794373, |
| "learning_rate": 2.3507349340715322e-05, |
| "loss": 0.7208, |
| "num_input_tokens_seen": 2385128, |
| "step": 6330 |
| }, |
| { |
| "epoch": 5.676523297491039, |
| "grad_norm": 0.43318870663642883, |
| "learning_rate": 2.3468323054962213e-05, |
| "loss": 0.6828, |
| "num_input_tokens_seen": 2386952, |
| "step": 6335 |
| }, |
| { |
| "epoch": 5.681003584229391, |
| "grad_norm": 0.6294808983802795, |
| "learning_rate": 2.3429300515430437e-05, |
| "loss": 0.6895, |
| "num_input_tokens_seen": 2388872, |
| "step": 6340 |
| }, |
| { |
| "epoch": 5.685483870967742, |
| "grad_norm": 0.43699830770492554, |
| "learning_rate": 2.3390281817562496e-05, |
| "loss": 0.6953, |
| "num_input_tokens_seen": 2390888, |
| "step": 6345 |
| }, |
| { |
| "epoch": 5.689964157706093, |
| "grad_norm": 0.3608168065547943, |
| "learning_rate": 2.335126705679149e-05, |
| "loss": 0.6976, |
| "num_input_tokens_seen": 2392712, |
| "step": 6350 |
| }, |
| { |
| "epoch": 5.694444444444445, |
| "grad_norm": 0.5301570296287537, |
| "learning_rate": 2.331225632854087e-05, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 2394728, |
| "step": 6355 |
| }, |
| { |
| "epoch": 5.698924731182796, |
| "grad_norm": 0.5568897128105164, |
| "learning_rate": 2.327324972822426e-05, |
| "loss": 0.7094, |
| "num_input_tokens_seen": 2396648, |
| "step": 6360 |
| }, |
| { |
| "epoch": 5.703405017921147, |
| "grad_norm": 0.4521498680114746, |
| "learning_rate": 2.3234247351245177e-05, |
| "loss": 0.6585, |
| "num_input_tokens_seen": 2398632, |
| "step": 6365 |
| }, |
| { |
| "epoch": 5.707885304659499, |
| "grad_norm": 0.46761560440063477, |
| "learning_rate": 2.3195249292996786e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 2400616, |
| "step": 6370 |
| }, |
| { |
| "epoch": 5.71236559139785, |
| "grad_norm": 0.4455620348453522, |
| "learning_rate": 2.3156255648861723e-05, |
| "loss": 0.6895, |
| "num_input_tokens_seen": 2402472, |
| "step": 6375 |
| }, |
| { |
| "epoch": 5.7168458781362, |
| "grad_norm": 0.6404704451560974, |
| "learning_rate": 2.3117266514211788e-05, |
| "loss": 0.7024, |
| "num_input_tokens_seen": 2404392, |
| "step": 6380 |
| }, |
| { |
| "epoch": 5.721326164874552, |
| "grad_norm": 0.7674428224563599, |
| "learning_rate": 2.3078281984407787e-05, |
| "loss": 0.683, |
| "num_input_tokens_seen": 2406312, |
| "step": 6385 |
| }, |
| { |
| "epoch": 5.725806451612903, |
| "grad_norm": 0.48401227593421936, |
| "learning_rate": 2.3039302154799256e-05, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 2408168, |
| "step": 6390 |
| }, |
| { |
| "epoch": 5.730286738351254, |
| "grad_norm": 0.4482485055923462, |
| "learning_rate": 2.300032712072422e-05, |
| "loss": 0.6582, |
| "num_input_tokens_seen": 2409992, |
| "step": 6395 |
| }, |
| { |
| "epoch": 5.734767025089606, |
| "grad_norm": 0.7896308898925781, |
| "learning_rate": 2.2961356977508984e-05, |
| "loss": 0.6966, |
| "num_input_tokens_seen": 2411944, |
| "step": 6400 |
| }, |
| { |
| "epoch": 5.739247311827957, |
| "grad_norm": 0.6187811493873596, |
| "learning_rate": 2.2922391820467905e-05, |
| "loss": 0.7247, |
| "num_input_tokens_seen": 2413928, |
| "step": 6405 |
| }, |
| { |
| "epoch": 5.743727598566308, |
| "grad_norm": 0.4433819055557251, |
| "learning_rate": 2.2883431744903115e-05, |
| "loss": 0.7091, |
| "num_input_tokens_seen": 2415848, |
| "step": 6410 |
| }, |
| { |
| "epoch": 5.74820788530466, |
| "grad_norm": 0.40984046459198, |
| "learning_rate": 2.284447684610434e-05, |
| "loss": 0.7202, |
| "num_input_tokens_seen": 2417704, |
| "step": 6415 |
| }, |
| { |
| "epoch": 5.752688172043011, |
| "grad_norm": 0.45450559258461, |
| "learning_rate": 2.2805527219348632e-05, |
| "loss": 0.7115, |
| "num_input_tokens_seen": 2419656, |
| "step": 6420 |
| }, |
| { |
| "epoch": 5.757168458781362, |
| "grad_norm": 0.8165116906166077, |
| "learning_rate": 2.276658295990016e-05, |
| "loss": 0.7439, |
| "num_input_tokens_seen": 2421512, |
| "step": 6425 |
| }, |
| { |
| "epoch": 5.761648745519714, |
| "grad_norm": 0.616450846195221, |
| "learning_rate": 2.272764416300997e-05, |
| "loss": 0.6797, |
| "num_input_tokens_seen": 2423272, |
| "step": 6430 |
| }, |
| { |
| "epoch": 5.766129032258064, |
| "grad_norm": 0.5471182465553284, |
| "learning_rate": 2.2688710923915718e-05, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 2425288, |
| "step": 6435 |
| }, |
| { |
| "epoch": 5.770609318996415, |
| "grad_norm": 0.8255714178085327, |
| "learning_rate": 2.264978333784149e-05, |
| "loss": 0.6955, |
| "num_input_tokens_seen": 2427112, |
| "step": 6440 |
| }, |
| { |
| "epoch": 5.775089605734767, |
| "grad_norm": 0.5268316268920898, |
| "learning_rate": 2.261086149999755e-05, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 2428968, |
| "step": 6445 |
| }, |
| { |
| "epoch": 5.779569892473118, |
| "grad_norm": 0.7851718664169312, |
| "learning_rate": 2.257194550558009e-05, |
| "loss": 0.7278, |
| "num_input_tokens_seen": 2431048, |
| "step": 6450 |
| }, |
| { |
| "epoch": 5.784050179211469, |
| "grad_norm": 0.43519654870033264, |
| "learning_rate": 2.253303544977101e-05, |
| "loss": 0.6455, |
| "num_input_tokens_seen": 2432904, |
| "step": 6455 |
| }, |
| { |
| "epoch": 5.788530465949821, |
| "grad_norm": 0.4320980906486511, |
| "learning_rate": 2.249413142773771e-05, |
| "loss": 0.7036, |
| "num_input_tokens_seen": 2434856, |
| "step": 6460 |
| }, |
| { |
| "epoch": 5.793010752688172, |
| "grad_norm": 0.49086955189704895, |
| "learning_rate": 2.245523353463278e-05, |
| "loss": 0.7149, |
| "num_input_tokens_seen": 2436680, |
| "step": 6465 |
| }, |
| { |
| "epoch": 5.797491039426523, |
| "grad_norm": 0.9313246011734009, |
| "learning_rate": 2.2416341865593875e-05, |
| "loss": 0.7261, |
| "num_input_tokens_seen": 2438344, |
| "step": 6470 |
| }, |
| { |
| "epoch": 5.801971326164875, |
| "grad_norm": 0.36511847376823425, |
| "learning_rate": 2.2377456515743396e-05, |
| "loss": 0.72, |
| "num_input_tokens_seen": 2440168, |
| "step": 6475 |
| }, |
| { |
| "epoch": 5.806451612903226, |
| "grad_norm": 0.3627917766571045, |
| "learning_rate": 2.2338577580188296e-05, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 2442056, |
| "step": 6480 |
| }, |
| { |
| "epoch": 5.810931899641577, |
| "grad_norm": 0.4386759102344513, |
| "learning_rate": 2.2299705154019846e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 2443976, |
| "step": 6485 |
| }, |
| { |
| "epoch": 5.815412186379929, |
| "grad_norm": 0.5388379693031311, |
| "learning_rate": 2.2260839332313375e-05, |
| "loss": 0.7013, |
| "num_input_tokens_seen": 2445832, |
| "step": 6490 |
| }, |
| { |
| "epoch": 5.81989247311828, |
| "grad_norm": 0.5991097092628479, |
| "learning_rate": 2.222198021012809e-05, |
| "loss": 0.7096, |
| "num_input_tokens_seen": 2447720, |
| "step": 6495 |
| }, |
| { |
| "epoch": 5.824372759856631, |
| "grad_norm": 0.5087167620658875, |
| "learning_rate": 2.218312788250678e-05, |
| "loss": 0.6658, |
| "num_input_tokens_seen": 2449704, |
| "step": 6500 |
| }, |
| { |
| "epoch": 5.828853046594982, |
| "grad_norm": 0.5476199388504028, |
| "learning_rate": 2.2144282444475638e-05, |
| "loss": 0.6827, |
| "num_input_tokens_seen": 2451592, |
| "step": 6505 |
| }, |
| { |
| "epoch": 5.833333333333333, |
| "grad_norm": 0.5977054238319397, |
| "learning_rate": 2.2105443991044006e-05, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 2453640, |
| "step": 6510 |
| }, |
| { |
| "epoch": 5.837813620071684, |
| "grad_norm": 0.5165858864784241, |
| "learning_rate": 2.206661261720414e-05, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 2455496, |
| "step": 6515 |
| }, |
| { |
| "epoch": 5.842293906810036, |
| "grad_norm": 0.8256833553314209, |
| "learning_rate": 2.2027788417930962e-05, |
| "loss": 0.6984, |
| "num_input_tokens_seen": 2457320, |
| "step": 6520 |
| }, |
| { |
| "epoch": 5.846774193548387, |
| "grad_norm": 0.7374931573867798, |
| "learning_rate": 2.1988971488181862e-05, |
| "loss": 0.7022, |
| "num_input_tokens_seen": 2459400, |
| "step": 6525 |
| }, |
| { |
| "epoch": 5.851254480286738, |
| "grad_norm": 0.42575767636299133, |
| "learning_rate": 2.1950161922896452e-05, |
| "loss": 0.7119, |
| "num_input_tokens_seen": 2461352, |
| "step": 6530 |
| }, |
| { |
| "epoch": 5.85573476702509, |
| "grad_norm": 0.8485537767410278, |
| "learning_rate": 2.1911359816996342e-05, |
| "loss": 0.6922, |
| "num_input_tokens_seen": 2463112, |
| "step": 6535 |
| }, |
| { |
| "epoch": 5.860215053763441, |
| "grad_norm": 0.5161400437355042, |
| "learning_rate": 2.1872565265384867e-05, |
| "loss": 0.6798, |
| "num_input_tokens_seen": 2465096, |
| "step": 6540 |
| }, |
| { |
| "epoch": 5.864695340501792, |
| "grad_norm": 0.6514114141464233, |
| "learning_rate": 2.1833778362946914e-05, |
| "loss": 0.7127, |
| "num_input_tokens_seen": 2467016, |
| "step": 6545 |
| }, |
| { |
| "epoch": 5.869175627240144, |
| "grad_norm": 0.5723159909248352, |
| "learning_rate": 2.179499920454864e-05, |
| "loss": 0.6659, |
| "num_input_tokens_seen": 2468680, |
| "step": 6550 |
| }, |
| { |
| "epoch": 5.873655913978495, |
| "grad_norm": 0.6084119081497192, |
| "learning_rate": 2.1756227885037277e-05, |
| "loss": 0.6798, |
| "num_input_tokens_seen": 2470632, |
| "step": 6555 |
| }, |
| { |
| "epoch": 5.878136200716845, |
| "grad_norm": 0.4714006781578064, |
| "learning_rate": 2.1717464499240882e-05, |
| "loss": 0.6966, |
| "num_input_tokens_seen": 2472616, |
| "step": 6560 |
| }, |
| { |
| "epoch": 5.882616487455197, |
| "grad_norm": 0.41111356019973755, |
| "learning_rate": 2.16787091419681e-05, |
| "loss": 0.6927, |
| "num_input_tokens_seen": 2474536, |
| "step": 6565 |
| }, |
| { |
| "epoch": 5.887096774193548, |
| "grad_norm": 0.7911174297332764, |
| "learning_rate": 2.1639961908007962e-05, |
| "loss": 0.6812, |
| "num_input_tokens_seen": 2476616, |
| "step": 6570 |
| }, |
| { |
| "epoch": 5.891577060931899, |
| "grad_norm": 0.3497937023639679, |
| "learning_rate": 2.160122289212958e-05, |
| "loss": 0.7069, |
| "num_input_tokens_seen": 2478504, |
| "step": 6575 |
| }, |
| { |
| "epoch": 5.896057347670251, |
| "grad_norm": 0.45918330550193787, |
| "learning_rate": 2.1562492189082023e-05, |
| "loss": 0.7093, |
| "num_input_tokens_seen": 2480296, |
| "step": 6580 |
| }, |
| { |
| "epoch": 5.900537634408602, |
| "grad_norm": 0.5026075839996338, |
| "learning_rate": 2.1523769893593997e-05, |
| "loss": 0.6777, |
| "num_input_tokens_seen": 2482312, |
| "step": 6585 |
| }, |
| { |
| "epoch": 5.905017921146953, |
| "grad_norm": 0.41753828525543213, |
| "learning_rate": 2.1485056100373646e-05, |
| "loss": 0.6774, |
| "num_input_tokens_seen": 2483976, |
| "step": 6590 |
| }, |
| { |
| "epoch": 5.909498207885305, |
| "grad_norm": 0.6379095911979675, |
| "learning_rate": 2.1446350904108346e-05, |
| "loss": 0.7132, |
| "num_input_tokens_seen": 2485704, |
| "step": 6595 |
| }, |
| { |
| "epoch": 5.913978494623656, |
| "grad_norm": 0.7349073886871338, |
| "learning_rate": 2.14076543994644e-05, |
| "loss": 0.6919, |
| "num_input_tokens_seen": 2487688, |
| "step": 6600 |
| }, |
| { |
| "epoch": 5.918458781362007, |
| "grad_norm": 0.7176439762115479, |
| "learning_rate": 2.1368966681086892e-05, |
| "loss": 0.7006, |
| "num_input_tokens_seen": 2489512, |
| "step": 6605 |
| }, |
| { |
| "epoch": 5.922939068100359, |
| "grad_norm": 0.687764048576355, |
| "learning_rate": 2.1330287843599393e-05, |
| "loss": 0.7288, |
| "num_input_tokens_seen": 2491464, |
| "step": 6610 |
| }, |
| { |
| "epoch": 5.92741935483871, |
| "grad_norm": 0.5664499998092651, |
| "learning_rate": 2.1291617981603766e-05, |
| "loss": 0.7268, |
| "num_input_tokens_seen": 2493416, |
| "step": 6615 |
| }, |
| { |
| "epoch": 5.931899641577061, |
| "grad_norm": 0.5868696570396423, |
| "learning_rate": 2.1252957189679927e-05, |
| "loss": 0.7139, |
| "num_input_tokens_seen": 2495272, |
| "step": 6620 |
| }, |
| { |
| "epoch": 5.936379928315413, |
| "grad_norm": 0.6797670125961304, |
| "learning_rate": 2.1214305562385592e-05, |
| "loss": 0.7032, |
| "num_input_tokens_seen": 2497192, |
| "step": 6625 |
| }, |
| { |
| "epoch": 5.940860215053764, |
| "grad_norm": 0.5387040376663208, |
| "learning_rate": 2.1175663194256056e-05, |
| "loss": 0.7143, |
| "num_input_tokens_seen": 2499080, |
| "step": 6630 |
| }, |
| { |
| "epoch": 5.945340501792114, |
| "grad_norm": 0.3677046000957489, |
| "learning_rate": 2.113703017980399e-05, |
| "loss": 0.687, |
| "num_input_tokens_seen": 2500872, |
| "step": 6635 |
| }, |
| { |
| "epoch": 5.949820788530466, |
| "grad_norm": 0.47968631982803345, |
| "learning_rate": 2.1098406613519178e-05, |
| "loss": 0.7149, |
| "num_input_tokens_seen": 2502760, |
| "step": 6640 |
| }, |
| { |
| "epoch": 5.954301075268817, |
| "grad_norm": 0.5777232050895691, |
| "learning_rate": 2.10597925898683e-05, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 2504680, |
| "step": 6645 |
| }, |
| { |
| "epoch": 5.958781362007168, |
| "grad_norm": 0.44781193137168884, |
| "learning_rate": 2.102118820329469e-05, |
| "loss": 0.705, |
| "num_input_tokens_seen": 2506504, |
| "step": 6650 |
| }, |
| { |
| "epoch": 5.96326164874552, |
| "grad_norm": 0.41459017992019653, |
| "learning_rate": 2.09825935482181e-05, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 2508328, |
| "step": 6655 |
| }, |
| { |
| "epoch": 5.967741935483871, |
| "grad_norm": 0.607573926448822, |
| "learning_rate": 2.09440087190345e-05, |
| "loss": 0.7145, |
| "num_input_tokens_seen": 2510280, |
| "step": 6660 |
| }, |
| { |
| "epoch": 5.972222222222222, |
| "grad_norm": 0.3400748074054718, |
| "learning_rate": 2.0905433810115828e-05, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 2512264, |
| "step": 6665 |
| }, |
| { |
| "epoch": 5.976702508960574, |
| "grad_norm": 0.7823965549468994, |
| "learning_rate": 2.0866868915809733e-05, |
| "loss": 0.6573, |
| "num_input_tokens_seen": 2514216, |
| "step": 6670 |
| }, |
| { |
| "epoch": 5.981182795698925, |
| "grad_norm": 0.4284115433692932, |
| "learning_rate": 2.0828314130439408e-05, |
| "loss": 0.6815, |
| "num_input_tokens_seen": 2516104, |
| "step": 6675 |
| }, |
| { |
| "epoch": 5.985663082437276, |
| "grad_norm": 0.5594421625137329, |
| "learning_rate": 2.0789769548303303e-05, |
| "loss": 0.7105, |
| "num_input_tokens_seen": 2518120, |
| "step": 6680 |
| }, |
| { |
| "epoch": 5.990143369175628, |
| "grad_norm": 0.8436473608016968, |
| "learning_rate": 2.0751235263674893e-05, |
| "loss": 0.7038, |
| "num_input_tokens_seen": 2519880, |
| "step": 6685 |
| }, |
| { |
| "epoch": 5.994623655913978, |
| "grad_norm": 0.5207362771034241, |
| "learning_rate": 2.0712711370802495e-05, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 2521800, |
| "step": 6690 |
| }, |
| { |
| "epoch": 5.999103942652329, |
| "grad_norm": 0.46800780296325684, |
| "learning_rate": 2.0674197963908997e-05, |
| "loss": 0.7012, |
| "num_input_tokens_seen": 2523592, |
| "step": 6695 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.701448380947113, |
| "eval_runtime": 5.6297, |
| "eval_samples_per_second": 88.104, |
| "eval_steps_per_second": 22.026, |
| "num_input_tokens_seen": 2523672, |
| "step": 6696 |
| }, |
| { |
| "epoch": 6.003584229390681, |
| "grad_norm": 0.5440098643302917, |
| "learning_rate": 2.0635695137191646e-05, |
| "loss": 0.6747, |
| "num_input_tokens_seen": 2525048, |
| "step": 6700 |
| }, |
| { |
| "epoch": 6.008064516129032, |
| "grad_norm": 0.577639102935791, |
| "learning_rate": 2.0597202984821815e-05, |
| "loss": 0.6818, |
| "num_input_tokens_seen": 2526776, |
| "step": 6705 |
| }, |
| { |
| "epoch": 6.012544802867383, |
| "grad_norm": 0.5155590772628784, |
| "learning_rate": 2.0558721600944754e-05, |
| "loss": 0.6864, |
| "num_input_tokens_seen": 2528696, |
| "step": 6710 |
| }, |
| { |
| "epoch": 6.017025089605735, |
| "grad_norm": 0.5763823390007019, |
| "learning_rate": 2.0520251079679373e-05, |
| "loss": 0.7004, |
| "num_input_tokens_seen": 2530520, |
| "step": 6715 |
| }, |
| { |
| "epoch": 6.021505376344086, |
| "grad_norm": 0.6298096179962158, |
| "learning_rate": 2.048179151511804e-05, |
| "loss": 0.7013, |
| "num_input_tokens_seen": 2532344, |
| "step": 6720 |
| }, |
| { |
| "epoch": 6.025985663082437, |
| "grad_norm": 0.5589931607246399, |
| "learning_rate": 2.0443343001326303e-05, |
| "loss": 0.7173, |
| "num_input_tokens_seen": 2534264, |
| "step": 6725 |
| }, |
| { |
| "epoch": 6.030465949820789, |
| "grad_norm": 0.4019322097301483, |
| "learning_rate": 2.04049056323427e-05, |
| "loss": 0.6726, |
| "num_input_tokens_seen": 2535992, |
| "step": 6730 |
| }, |
| { |
| "epoch": 6.03494623655914, |
| "grad_norm": 0.7917940020561218, |
| "learning_rate": 2.0366479502178497e-05, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 2537944, |
| "step": 6735 |
| }, |
| { |
| "epoch": 6.039426523297491, |
| "grad_norm": 0.5238997340202332, |
| "learning_rate": 2.0328064704817458e-05, |
| "loss": 0.6622, |
| "num_input_tokens_seen": 2539864, |
| "step": 6740 |
| }, |
| { |
| "epoch": 6.043906810035843, |
| "grad_norm": 0.4306890368461609, |
| "learning_rate": 2.028966133421565e-05, |
| "loss": 0.6686, |
| "num_input_tokens_seen": 2541784, |
| "step": 6745 |
| }, |
| { |
| "epoch": 6.048387096774194, |
| "grad_norm": 0.3549671471118927, |
| "learning_rate": 2.0251269484301193e-05, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 2543640, |
| "step": 6750 |
| }, |
| { |
| "epoch": 6.052867383512544, |
| "grad_norm": 0.5074473023414612, |
| "learning_rate": 2.021288924897402e-05, |
| "loss": 0.6812, |
| "num_input_tokens_seen": 2545656, |
| "step": 6755 |
| }, |
| { |
| "epoch": 6.057347670250896, |
| "grad_norm": 0.597404420375824, |
| "learning_rate": 2.0174520722105673e-05, |
| "loss": 0.6669, |
| "num_input_tokens_seen": 2547448, |
| "step": 6760 |
| }, |
| { |
| "epoch": 6.061827956989247, |
| "grad_norm": 0.503865122795105, |
| "learning_rate": 2.0136163997539017e-05, |
| "loss": 0.6961, |
| "num_input_tokens_seen": 2549272, |
| "step": 6765 |
| }, |
| { |
| "epoch": 6.066308243727598, |
| "grad_norm": 0.44585174322128296, |
| "learning_rate": 2.0097819169088096e-05, |
| "loss": 0.6521, |
| "num_input_tokens_seen": 2551032, |
| "step": 6770 |
| }, |
| { |
| "epoch": 6.07078853046595, |
| "grad_norm": 0.5672586560249329, |
| "learning_rate": 2.0059486330537835e-05, |
| "loss": 0.6468, |
| "num_input_tokens_seen": 2552824, |
| "step": 6775 |
| }, |
| { |
| "epoch": 6.075268817204301, |
| "grad_norm": 0.575846791267395, |
| "learning_rate": 2.0021165575643837e-05, |
| "loss": 0.6824, |
| "num_input_tokens_seen": 2554520, |
| "step": 6780 |
| }, |
| { |
| "epoch": 6.079749103942652, |
| "grad_norm": 0.4153701364994049, |
| "learning_rate": 1.998285699813215e-05, |
| "loss": 0.7352, |
| "num_input_tokens_seen": 2556376, |
| "step": 6785 |
| }, |
| { |
| "epoch": 6.084229390681004, |
| "grad_norm": 0.8207941651344299, |
| "learning_rate": 1.9944560691699057e-05, |
| "loss": 0.744, |
| "num_input_tokens_seen": 2558200, |
| "step": 6790 |
| }, |
| { |
| "epoch": 6.088709677419355, |
| "grad_norm": 0.5624127388000488, |
| "learning_rate": 1.9906276750010792e-05, |
| "loss": 0.7281, |
| "num_input_tokens_seen": 2560312, |
| "step": 6795 |
| }, |
| { |
| "epoch": 6.093189964157706, |
| "grad_norm": 0.6707472801208496, |
| "learning_rate": 1.9868005266703364e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 2562328, |
| "step": 6800 |
| }, |
| { |
| "epoch": 6.097670250896058, |
| "grad_norm": 0.6541929244995117, |
| "learning_rate": 1.982974633538232e-05, |
| "loss": 0.6828, |
| "num_input_tokens_seen": 2564248, |
| "step": 6805 |
| }, |
| { |
| "epoch": 6.102150537634409, |
| "grad_norm": 0.6280264258384705, |
| "learning_rate": 1.9791500049622505e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 2566296, |
| "step": 6810 |
| }, |
| { |
| "epoch": 6.10663082437276, |
| "grad_norm": 0.5198401212692261, |
| "learning_rate": 1.975326650296782e-05, |
| "loss": 0.6392, |
| "num_input_tokens_seen": 2568376, |
| "step": 6815 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 0.6566698551177979, |
| "learning_rate": 1.9715045788931037e-05, |
| "loss": 0.7099, |
| "num_input_tokens_seen": 2570328, |
| "step": 6820 |
| }, |
| { |
| "epoch": 6.115591397849462, |
| "grad_norm": 0.6096717715263367, |
| "learning_rate": 1.967683800099349e-05, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 2572184, |
| "step": 6825 |
| }, |
| { |
| "epoch": 6.120071684587813, |
| "grad_norm": 0.5329359173774719, |
| "learning_rate": 1.9638643232604957e-05, |
| "loss": 0.693, |
| "num_input_tokens_seen": 2573944, |
| "step": 6830 |
| }, |
| { |
| "epoch": 6.124551971326165, |
| "grad_norm": 0.5989028215408325, |
| "learning_rate": 1.9600461577183344e-05, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 2575864, |
| "step": 6835 |
| }, |
| { |
| "epoch": 6.129032258064516, |
| "grad_norm": 0.5565618872642517, |
| "learning_rate": 1.9562293128114473e-05, |
| "loss": 0.6789, |
| "num_input_tokens_seen": 2577656, |
| "step": 6840 |
| }, |
| { |
| "epoch": 6.133512544802867, |
| "grad_norm": 0.4710071384906769, |
| "learning_rate": 1.95241379787519e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 2579416, |
| "step": 6845 |
| }, |
| { |
| "epoch": 6.137992831541219, |
| "grad_norm": 0.3683546781539917, |
| "learning_rate": 1.9485996222416607e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 2581208, |
| "step": 6850 |
| }, |
| { |
| "epoch": 6.14247311827957, |
| "grad_norm": 0.47386008501052856, |
| "learning_rate": 1.944786795239686e-05, |
| "loss": 0.6891, |
| "num_input_tokens_seen": 2583096, |
| "step": 6855 |
| }, |
| { |
| "epoch": 6.146953405017921, |
| "grad_norm": 0.6793469190597534, |
| "learning_rate": 1.9409753261947927e-05, |
| "loss": 0.6871, |
| "num_input_tokens_seen": 2585112, |
| "step": 6860 |
| }, |
| { |
| "epoch": 6.151433691756273, |
| "grad_norm": 0.46531930565834045, |
| "learning_rate": 1.9371652244291842e-05, |
| "loss": 0.6862, |
| "num_input_tokens_seen": 2586968, |
| "step": 6865 |
| }, |
| { |
| "epoch": 6.155913978494624, |
| "grad_norm": 0.47625985741615295, |
| "learning_rate": 1.9333564992617232e-05, |
| "loss": 0.691, |
| "num_input_tokens_seen": 2588760, |
| "step": 6870 |
| }, |
| { |
| "epoch": 6.160394265232975, |
| "grad_norm": 0.7655602693557739, |
| "learning_rate": 1.9295491600079035e-05, |
| "loss": 0.7103, |
| "num_input_tokens_seen": 2590680, |
| "step": 6875 |
| }, |
| { |
| "epoch": 6.164874551971327, |
| "grad_norm": 0.5129245519638062, |
| "learning_rate": 1.925743215979829e-05, |
| "loss": 0.6551, |
| "num_input_tokens_seen": 2592824, |
| "step": 6880 |
| }, |
| { |
| "epoch": 6.169354838709677, |
| "grad_norm": 0.5057689547538757, |
| "learning_rate": 1.9219386764861908e-05, |
| "loss": 0.6699, |
| "num_input_tokens_seen": 2594648, |
| "step": 6885 |
| }, |
| { |
| "epoch": 6.173835125448028, |
| "grad_norm": 0.47186315059661865, |
| "learning_rate": 1.9181355508322462e-05, |
| "loss": 0.7018, |
| "num_input_tokens_seen": 2596536, |
| "step": 6890 |
| }, |
| { |
| "epoch": 6.17831541218638, |
| "grad_norm": 0.6571786403656006, |
| "learning_rate": 1.914333848319795e-05, |
| "loss": 0.6503, |
| "num_input_tokens_seen": 2598424, |
| "step": 6895 |
| }, |
| { |
| "epoch": 6.182795698924731, |
| "grad_norm": 0.34855401515960693, |
| "learning_rate": 1.9105335782471534e-05, |
| "loss": 0.6945, |
| "num_input_tokens_seen": 2600216, |
| "step": 6900 |
| }, |
| { |
| "epoch": 6.187275985663082, |
| "grad_norm": 0.6422795057296753, |
| "learning_rate": 1.9067347499091364e-05, |
| "loss": 0.7219, |
| "num_input_tokens_seen": 2601944, |
| "step": 6905 |
| }, |
| { |
| "epoch": 6.191756272401434, |
| "grad_norm": 0.696559488773346, |
| "learning_rate": 1.9029373725970313e-05, |
| "loss": 0.7042, |
| "num_input_tokens_seen": 2603896, |
| "step": 6910 |
| }, |
| { |
| "epoch": 6.196236559139785, |
| "grad_norm": 0.6045291423797607, |
| "learning_rate": 1.8991414555985783e-05, |
| "loss": 0.6727, |
| "num_input_tokens_seen": 2605880, |
| "step": 6915 |
| }, |
| { |
| "epoch": 6.200716845878136, |
| "grad_norm": 0.5362111926078796, |
| "learning_rate": 1.895347008197945e-05, |
| "loss": 0.6899, |
| "num_input_tokens_seen": 2607672, |
| "step": 6920 |
| }, |
| { |
| "epoch": 6.205197132616488, |
| "grad_norm": 0.5272240042686462, |
| "learning_rate": 1.891554039675703e-05, |
| "loss": 0.6914, |
| "num_input_tokens_seen": 2609496, |
| "step": 6925 |
| }, |
| { |
| "epoch": 6.209677419354839, |
| "grad_norm": 0.6496487855911255, |
| "learning_rate": 1.8877625593088104e-05, |
| "loss": 0.6956, |
| "num_input_tokens_seen": 2611320, |
| "step": 6930 |
| }, |
| { |
| "epoch": 6.21415770609319, |
| "grad_norm": 0.5585481524467468, |
| "learning_rate": 1.8839725763705814e-05, |
| "loss": 0.664, |
| "num_input_tokens_seen": 2613304, |
| "step": 6935 |
| }, |
| { |
| "epoch": 6.218637992831542, |
| "grad_norm": 0.5600648522377014, |
| "learning_rate": 1.880184100130671e-05, |
| "loss": 0.6873, |
| "num_input_tokens_seen": 2615128, |
| "step": 6940 |
| }, |
| { |
| "epoch": 6.223118279569892, |
| "grad_norm": 0.49623891711235046, |
| "learning_rate": 1.876397139855047e-05, |
| "loss": 0.6787, |
| "num_input_tokens_seen": 2617016, |
| "step": 6945 |
| }, |
| { |
| "epoch": 6.227598566308243, |
| "grad_norm": 0.648080587387085, |
| "learning_rate": 1.8726117048059704e-05, |
| "loss": 0.6754, |
| "num_input_tokens_seen": 2618840, |
| "step": 6950 |
| }, |
| { |
| "epoch": 6.232078853046595, |
| "grad_norm": 0.611798107624054, |
| "learning_rate": 1.8688278042419734e-05, |
| "loss": 0.6912, |
| "num_input_tokens_seen": 2620664, |
| "step": 6955 |
| }, |
| { |
| "epoch": 6.236559139784946, |
| "grad_norm": 0.633701741695404, |
| "learning_rate": 1.8650454474178298e-05, |
| "loss": 0.7054, |
| "num_input_tokens_seen": 2622360, |
| "step": 6960 |
| }, |
| { |
| "epoch": 6.241039426523297, |
| "grad_norm": 0.698072612285614, |
| "learning_rate": 1.8612646435845443e-05, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 2624120, |
| "step": 6965 |
| }, |
| { |
| "epoch": 6.245519713261649, |
| "grad_norm": 0.6582928895950317, |
| "learning_rate": 1.857485401989318e-05, |
| "loss": 0.6985, |
| "num_input_tokens_seen": 2625976, |
| "step": 6970 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 0.6447840332984924, |
| "learning_rate": 1.853707731875534e-05, |
| "loss": 0.7099, |
| "num_input_tokens_seen": 2627896, |
| "step": 6975 |
| }, |
| { |
| "epoch": 6.254480286738351, |
| "grad_norm": 0.4497109651565552, |
| "learning_rate": 1.849931642482732e-05, |
| "loss": 0.6981, |
| "num_input_tokens_seen": 2629752, |
| "step": 6980 |
| }, |
| { |
| "epoch": 6.258960573476703, |
| "grad_norm": 0.7298440337181091, |
| "learning_rate": 1.8461571430465834e-05, |
| "loss": 0.7398, |
| "num_input_tokens_seen": 2631608, |
| "step": 6985 |
| }, |
| { |
| "epoch": 6.263440860215054, |
| "grad_norm": 0.46043136715888977, |
| "learning_rate": 1.8423842427988722e-05, |
| "loss": 0.6922, |
| "num_input_tokens_seen": 2633528, |
| "step": 6990 |
| }, |
| { |
| "epoch": 6.267921146953405, |
| "grad_norm": 0.5057744979858398, |
| "learning_rate": 1.83861295096747e-05, |
| "loss": 0.7101, |
| "num_input_tokens_seen": 2635544, |
| "step": 6995 |
| }, |
| { |
| "epoch": 6.272401433691757, |
| "grad_norm": 0.6320658922195435, |
| "learning_rate": 1.8348432767763162e-05, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 2637496, |
| "step": 7000 |
| }, |
| { |
| "epoch": 6.276881720430108, |
| "grad_norm": 0.34551823139190674, |
| "learning_rate": 1.8310752294453924e-05, |
| "loss": 0.6804, |
| "num_input_tokens_seen": 2639320, |
| "step": 7005 |
| }, |
| { |
| "epoch": 6.281362007168458, |
| "grad_norm": 0.574415922164917, |
| "learning_rate": 1.8273088181907034e-05, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 2641176, |
| "step": 7010 |
| }, |
| { |
| "epoch": 6.28584229390681, |
| "grad_norm": 0.5269017815589905, |
| "learning_rate": 1.823544052224247e-05, |
| "loss": 0.6902, |
| "num_input_tokens_seen": 2642936, |
| "step": 7015 |
| }, |
| { |
| "epoch": 6.290322580645161, |
| "grad_norm": 0.4298408031463623, |
| "learning_rate": 1.8197809407540028e-05, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 2644696, |
| "step": 7020 |
| }, |
| { |
| "epoch": 6.294802867383512, |
| "grad_norm": 0.6325519680976868, |
| "learning_rate": 1.816019492983902e-05, |
| "loss": 0.6596, |
| "num_input_tokens_seen": 2646520, |
| "step": 7025 |
| }, |
| { |
| "epoch": 6.299283154121864, |
| "grad_norm": 0.39390936493873596, |
| "learning_rate": 1.812259718113805e-05, |
| "loss": 0.6794, |
| "num_input_tokens_seen": 2648312, |
| "step": 7030 |
| }, |
| { |
| "epoch": 6.303763440860215, |
| "grad_norm": 0.557681143283844, |
| "learning_rate": 1.8085016253394817e-05, |
| "loss": 0.6849, |
| "num_input_tokens_seen": 2650200, |
| "step": 7035 |
| }, |
| { |
| "epoch": 6.308243727598566, |
| "grad_norm": 0.8681994676589966, |
| "learning_rate": 1.8047452238525896e-05, |
| "loss": 0.6985, |
| "num_input_tokens_seen": 2651992, |
| "step": 7040 |
| }, |
| { |
| "epoch": 6.312724014336918, |
| "grad_norm": 0.6332396864891052, |
| "learning_rate": 1.8009905228406458e-05, |
| "loss": 0.7095, |
| "num_input_tokens_seen": 2653848, |
| "step": 7045 |
| }, |
| { |
| "epoch": 6.317204301075269, |
| "grad_norm": 0.5211427807807922, |
| "learning_rate": 1.797237531487012e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 2655672, |
| "step": 7050 |
| }, |
| { |
| "epoch": 6.32168458781362, |
| "grad_norm": 0.7297235727310181, |
| "learning_rate": 1.7934862589708657e-05, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 2657432, |
| "step": 7055 |
| }, |
| { |
| "epoch": 6.326164874551972, |
| "grad_norm": 0.46372517943382263, |
| "learning_rate": 1.789736714467182e-05, |
| "loss": 0.6807, |
| "num_input_tokens_seen": 2659256, |
| "step": 7060 |
| }, |
| { |
| "epoch": 6.330645161290323, |
| "grad_norm": 0.5800030827522278, |
| "learning_rate": 1.7859889071467102e-05, |
| "loss": 0.6791, |
| "num_input_tokens_seen": 2661144, |
| "step": 7065 |
| }, |
| { |
| "epoch": 6.335125448028673, |
| "grad_norm": 0.5289621949195862, |
| "learning_rate": 1.7822428461759483e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 2662904, |
| "step": 7070 |
| }, |
| { |
| "epoch": 6.339605734767025, |
| "grad_norm": 0.6062299609184265, |
| "learning_rate": 1.778498540717124e-05, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 2664728, |
| "step": 7075 |
| }, |
| { |
| "epoch": 6.344086021505376, |
| "grad_norm": 0.4943602383136749, |
| "learning_rate": 1.7747559999281723e-05, |
| "loss": 0.7078, |
| "num_input_tokens_seen": 2666616, |
| "step": 7080 |
| }, |
| { |
| "epoch": 6.348566308243727, |
| "grad_norm": 0.4620401859283447, |
| "learning_rate": 1.771015232962712e-05, |
| "loss": 0.6676, |
| "num_input_tokens_seen": 2668376, |
| "step": 7085 |
| }, |
| { |
| "epoch": 6.353046594982079, |
| "grad_norm": 0.7000113129615784, |
| "learning_rate": 1.7672762489700227e-05, |
| "loss": 0.6619, |
| "num_input_tokens_seen": 2670168, |
| "step": 7090 |
| }, |
| { |
| "epoch": 6.35752688172043, |
| "grad_norm": 0.4342782497406006, |
| "learning_rate": 1.7635390570950246e-05, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 2672120, |
| "step": 7095 |
| }, |
| { |
| "epoch": 6.362007168458781, |
| "grad_norm": 0.7346124053001404, |
| "learning_rate": 1.7598036664782508e-05, |
| "loss": 0.6692, |
| "num_input_tokens_seen": 2674232, |
| "step": 7100 |
| }, |
| { |
| "epoch": 6.366487455197133, |
| "grad_norm": 0.8609234690666199, |
| "learning_rate": 1.7560700862558325e-05, |
| "loss": 0.7069, |
| "num_input_tokens_seen": 2676120, |
| "step": 7105 |
| }, |
| { |
| "epoch": 6.370967741935484, |
| "grad_norm": 0.580582857131958, |
| "learning_rate": 1.7523383255594735e-05, |
| "loss": 0.7049, |
| "num_input_tokens_seen": 2678072, |
| "step": 7110 |
| }, |
| { |
| "epoch": 6.375448028673835, |
| "grad_norm": 0.6592505574226379, |
| "learning_rate": 1.7486083935164244e-05, |
| "loss": 0.7161, |
| "num_input_tokens_seen": 2679960, |
| "step": 7115 |
| }, |
| { |
| "epoch": 6.379928315412187, |
| "grad_norm": 0.6857872605323792, |
| "learning_rate": 1.7448802992494657e-05, |
| "loss": 0.6875, |
| "num_input_tokens_seen": 2681816, |
| "step": 7120 |
| }, |
| { |
| "epoch": 6.384408602150538, |
| "grad_norm": 0.5294128060340881, |
| "learning_rate": 1.7411540518768805e-05, |
| "loss": 0.6824, |
| "num_input_tokens_seen": 2683768, |
| "step": 7125 |
| }, |
| { |
| "epoch": 6.388888888888889, |
| "grad_norm": 0.6431249380111694, |
| "learning_rate": 1.737429660512437e-05, |
| "loss": 0.7197, |
| "num_input_tokens_seen": 2685464, |
| "step": 7130 |
| }, |
| { |
| "epoch": 6.393369175627241, |
| "grad_norm": 0.6430848240852356, |
| "learning_rate": 1.733707134265363e-05, |
| "loss": 0.6713, |
| "num_input_tokens_seen": 2687384, |
| "step": 7135 |
| }, |
| { |
| "epoch": 6.397849462365591, |
| "grad_norm": 0.4698632061481476, |
| "learning_rate": 1.7299864822403257e-05, |
| "loss": 0.6624, |
| "num_input_tokens_seen": 2689176, |
| "step": 7140 |
| }, |
| { |
| "epoch": 6.402329749103942, |
| "grad_norm": 0.7135388851165771, |
| "learning_rate": 1.7262677135374053e-05, |
| "loss": 0.7079, |
| "num_input_tokens_seen": 2691000, |
| "step": 7145 |
| }, |
| { |
| "epoch": 6.406810035842294, |
| "grad_norm": 0.48572251200675964, |
| "learning_rate": 1.72255083725208e-05, |
| "loss": 0.7285, |
| "num_input_tokens_seen": 2692824, |
| "step": 7150 |
| }, |
| { |
| "epoch": 6.411290322580645, |
| "grad_norm": 0.4359692335128784, |
| "learning_rate": 1.7188358624751954e-05, |
| "loss": 0.7156, |
| "num_input_tokens_seen": 2694648, |
| "step": 7155 |
| }, |
| { |
| "epoch": 6.415770609318996, |
| "grad_norm": 0.3980657160282135, |
| "learning_rate": 1.7151227982929477e-05, |
| "loss": 0.659, |
| "num_input_tokens_seen": 2696760, |
| "step": 7160 |
| }, |
| { |
| "epoch": 6.420250896057348, |
| "grad_norm": 0.5434011220932007, |
| "learning_rate": 1.711411653786861e-05, |
| "loss": 0.731, |
| "num_input_tokens_seen": 2698680, |
| "step": 7165 |
| }, |
| { |
| "epoch": 6.424731182795699, |
| "grad_norm": 0.7666317820549011, |
| "learning_rate": 1.7077024380337646e-05, |
| "loss": 0.7375, |
| "num_input_tokens_seen": 2700568, |
| "step": 7170 |
| }, |
| { |
| "epoch": 6.42921146953405, |
| "grad_norm": 0.7653113007545471, |
| "learning_rate": 1.7039951601057692e-05, |
| "loss": 0.6526, |
| "num_input_tokens_seen": 2702360, |
| "step": 7175 |
| }, |
| { |
| "epoch": 6.433691756272402, |
| "grad_norm": 0.45610731840133667, |
| "learning_rate": 1.7002898290702454e-05, |
| "loss": 0.7237, |
| "num_input_tokens_seen": 2704376, |
| "step": 7180 |
| }, |
| { |
| "epoch": 6.438172043010753, |
| "grad_norm": 0.46774065494537354, |
| "learning_rate": 1.6965864539898026e-05, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 2706200, |
| "step": 7185 |
| }, |
| { |
| "epoch": 6.442652329749104, |
| "grad_norm": 0.5820562839508057, |
| "learning_rate": 1.6928850439222666e-05, |
| "loss": 0.6965, |
| "num_input_tokens_seen": 2708088, |
| "step": 7190 |
| }, |
| { |
| "epoch": 6.447132616487455, |
| "grad_norm": 0.9814329743385315, |
| "learning_rate": 1.689185607920658e-05, |
| "loss": 0.6905, |
| "num_input_tokens_seen": 2709912, |
| "step": 7195 |
| }, |
| { |
| "epoch": 6.451612903225806, |
| "grad_norm": 0.5103081464767456, |
| "learning_rate": 1.685488155033167e-05, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 2711672, |
| "step": 7200 |
| }, |
| { |
| "epoch": 6.456093189964157, |
| "grad_norm": 0.5958290696144104, |
| "learning_rate": 1.681792694303136e-05, |
| "loss": 0.6738, |
| "num_input_tokens_seen": 2713528, |
| "step": 7205 |
| }, |
| { |
| "epoch": 6.460573476702509, |
| "grad_norm": 0.5504493713378906, |
| "learning_rate": 1.6780992347690313e-05, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 2715416, |
| "step": 7210 |
| }, |
| { |
| "epoch": 6.46505376344086, |
| "grad_norm": 0.47812798619270325, |
| "learning_rate": 1.6744077854644282e-05, |
| "loss": 0.7178, |
| "num_input_tokens_seen": 2717464, |
| "step": 7215 |
| }, |
| { |
| "epoch": 6.469534050179211, |
| "grad_norm": 0.5012235641479492, |
| "learning_rate": 1.6707183554179846e-05, |
| "loss": 0.6902, |
| "num_input_tokens_seen": 2719352, |
| "step": 7220 |
| }, |
| { |
| "epoch": 6.474014336917563, |
| "grad_norm": 0.6582107543945312, |
| "learning_rate": 1.6670309536534172e-05, |
| "loss": 0.6875, |
| "num_input_tokens_seen": 2721368, |
| "step": 7225 |
| }, |
| { |
| "epoch": 6.478494623655914, |
| "grad_norm": 0.7172203063964844, |
| "learning_rate": 1.6633455891894858e-05, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 2723320, |
| "step": 7230 |
| }, |
| { |
| "epoch": 6.482974910394265, |
| "grad_norm": 0.6096293330192566, |
| "learning_rate": 1.659662271039963e-05, |
| "loss": 0.7123, |
| "num_input_tokens_seen": 2725240, |
| "step": 7235 |
| }, |
| { |
| "epoch": 6.487455197132617, |
| "grad_norm": 0.6388385891914368, |
| "learning_rate": 1.65598100821362e-05, |
| "loss": 0.6414, |
| "num_input_tokens_seen": 2727288, |
| "step": 7240 |
| }, |
| { |
| "epoch": 6.491935483870968, |
| "grad_norm": 0.7723689079284668, |
| "learning_rate": 1.652301809714199e-05, |
| "loss": 0.7895, |
| "num_input_tokens_seen": 2729080, |
| "step": 7245 |
| }, |
| { |
| "epoch": 6.496415770609319, |
| "grad_norm": 0.5337994694709778, |
| "learning_rate": 1.648624684540394e-05, |
| "loss": 0.6863, |
| "num_input_tokens_seen": 2730904, |
| "step": 7250 |
| }, |
| { |
| "epoch": 6.5, |
| "eval_loss": 0.7022318840026855, |
| "eval_runtime": 5.6438, |
| "eval_samples_per_second": 87.884, |
| "eval_steps_per_second": 21.971, |
| "num_input_tokens_seen": 2732440, |
| "step": 7254 |
| }, |
| { |
| "epoch": 6.500896057347671, |
| "grad_norm": 0.5980244278907776, |
| "learning_rate": 1.6449496416858284e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 2732792, |
| "step": 7255 |
| }, |
| { |
| "epoch": 6.505376344086022, |
| "grad_norm": 0.7844721078872681, |
| "learning_rate": 1.6412766901390314e-05, |
| "loss": 0.6785, |
| "num_input_tokens_seen": 2734616, |
| "step": 7260 |
| }, |
| { |
| "epoch": 6.509856630824372, |
| "grad_norm": 0.6142382025718689, |
| "learning_rate": 1.6376058388834183e-05, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 2736472, |
| "step": 7265 |
| }, |
| { |
| "epoch": 6.514336917562724, |
| "grad_norm": 0.3665355145931244, |
| "learning_rate": 1.633937096897266e-05, |
| "loss": 0.7276, |
| "num_input_tokens_seen": 2738360, |
| "step": 7270 |
| }, |
| { |
| "epoch": 6.518817204301075, |
| "grad_norm": 0.5914114713668823, |
| "learning_rate": 1.630270473153695e-05, |
| "loss": 0.7268, |
| "num_input_tokens_seen": 2740408, |
| "step": 7275 |
| }, |
| { |
| "epoch": 6.523297491039426, |
| "grad_norm": 0.5145869255065918, |
| "learning_rate": 1.6266059766206425e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 2742168, |
| "step": 7280 |
| }, |
| { |
| "epoch": 6.527777777777778, |
| "grad_norm": 0.6934862732887268, |
| "learning_rate": 1.6229436162608448e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 2743928, |
| "step": 7285 |
| }, |
| { |
| "epoch": 6.532258064516129, |
| "grad_norm": 0.6275290250778198, |
| "learning_rate": 1.619283401031811e-05, |
| "loss": 0.688, |
| "num_input_tokens_seen": 2745944, |
| "step": 7290 |
| }, |
| { |
| "epoch": 6.53673835125448, |
| "grad_norm": 0.49510547518730164, |
| "learning_rate": 1.6156253398858058e-05, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 2747960, |
| "step": 7295 |
| }, |
| { |
| "epoch": 6.541218637992832, |
| "grad_norm": 0.569757878780365, |
| "learning_rate": 1.6119694417698246e-05, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 2749848, |
| "step": 7300 |
| }, |
| { |
| "epoch": 6.545698924731183, |
| "grad_norm": 0.46662643551826477, |
| "learning_rate": 1.6083157156255733e-05, |
| "loss": 0.713, |
| "num_input_tokens_seen": 2751704, |
| "step": 7305 |
| }, |
| { |
| "epoch": 6.550179211469534, |
| "grad_norm": 0.46720123291015625, |
| "learning_rate": 1.6046641703894434e-05, |
| "loss": 0.7148, |
| "num_input_tokens_seen": 2753528, |
| "step": 7310 |
| }, |
| { |
| "epoch": 6.554659498207886, |
| "grad_norm": 0.37194913625717163, |
| "learning_rate": 1.6010148149924956e-05, |
| "loss": 0.7162, |
| "num_input_tokens_seen": 2755320, |
| "step": 7315 |
| }, |
| { |
| "epoch": 6.559139784946236, |
| "grad_norm": 0.4675387442111969, |
| "learning_rate": 1.5973676583604298e-05, |
| "loss": 0.683, |
| "num_input_tokens_seen": 2757368, |
| "step": 7320 |
| }, |
| { |
| "epoch": 6.563620071684587, |
| "grad_norm": 0.5463271141052246, |
| "learning_rate": 1.5937227094135733e-05, |
| "loss": 0.6915, |
| "num_input_tokens_seen": 2759224, |
| "step": 7325 |
| }, |
| { |
| "epoch": 6.568100358422939, |
| "grad_norm": 0.7810373902320862, |
| "learning_rate": 1.5900799770668495e-05, |
| "loss": 0.73, |
| "num_input_tokens_seen": 2761112, |
| "step": 7330 |
| }, |
| { |
| "epoch": 6.57258064516129, |
| "grad_norm": 0.5256034135818481, |
| "learning_rate": 1.5864394702297636e-05, |
| "loss": 0.6704, |
| "num_input_tokens_seen": 2763096, |
| "step": 7335 |
| }, |
| { |
| "epoch": 6.577060931899641, |
| "grad_norm": 0.46397244930267334, |
| "learning_rate": 1.5828011978063765e-05, |
| "loss": 0.706, |
| "num_input_tokens_seen": 2764888, |
| "step": 7340 |
| }, |
| { |
| "epoch": 6.581541218637993, |
| "grad_norm": 0.5735636949539185, |
| "learning_rate": 1.5791651686952823e-05, |
| "loss": 0.676, |
| "num_input_tokens_seen": 2766776, |
| "step": 7345 |
| }, |
| { |
| "epoch": 6.586021505376344, |
| "grad_norm": 0.6215652227401733, |
| "learning_rate": 1.575531391789591e-05, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 2768760, |
| "step": 7350 |
| }, |
| { |
| "epoch": 6.590501792114695, |
| "grad_norm": 0.39189133048057556, |
| "learning_rate": 1.5718998759769025e-05, |
| "loss": 0.7245, |
| "num_input_tokens_seen": 2770584, |
| "step": 7355 |
| }, |
| { |
| "epoch": 6.594982078853047, |
| "grad_norm": 0.39220893383026123, |
| "learning_rate": 1.5682706301392867e-05, |
| "loss": 0.69, |
| "num_input_tokens_seen": 2772408, |
| "step": 7360 |
| }, |
| { |
| "epoch": 6.599462365591398, |
| "grad_norm": 0.5382081866264343, |
| "learning_rate": 1.564643663153263e-05, |
| "loss": 0.7117, |
| "num_input_tokens_seen": 2774328, |
| "step": 7365 |
| }, |
| { |
| "epoch": 6.603942652329749, |
| "grad_norm": 0.4528568685054779, |
| "learning_rate": 1.561018983889775e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 2776120, |
| "step": 7370 |
| }, |
| { |
| "epoch": 6.608422939068101, |
| "grad_norm": 0.5559325814247131, |
| "learning_rate": 1.557396601214171e-05, |
| "loss": 0.7035, |
| "num_input_tokens_seen": 2778040, |
| "step": 7375 |
| }, |
| { |
| "epoch": 6.612903225806452, |
| "grad_norm": 0.5265352725982666, |
| "learning_rate": 1.5537765239861838e-05, |
| "loss": 0.7336, |
| "num_input_tokens_seen": 2779928, |
| "step": 7380 |
| }, |
| { |
| "epoch": 6.617383512544803, |
| "grad_norm": 0.46773481369018555, |
| "learning_rate": 1.550158761059907e-05, |
| "loss": 0.7086, |
| "num_input_tokens_seen": 2781656, |
| "step": 7385 |
| }, |
| { |
| "epoch": 6.621863799283155, |
| "grad_norm": 0.4599006772041321, |
| "learning_rate": 1.5465433212837726e-05, |
| "loss": 0.6835, |
| "num_input_tokens_seen": 2783544, |
| "step": 7390 |
| }, |
| { |
| "epoch": 6.626344086021505, |
| "grad_norm": 0.5777242183685303, |
| "learning_rate": 1.542930213500533e-05, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 2785304, |
| "step": 7395 |
| }, |
| { |
| "epoch": 6.630824372759856, |
| "grad_norm": 0.5781882405281067, |
| "learning_rate": 1.5393194465472337e-05, |
| "loss": 0.6471, |
| "num_input_tokens_seen": 2787256, |
| "step": 7400 |
| }, |
| { |
| "epoch": 6.635304659498208, |
| "grad_norm": 0.4682949483394623, |
| "learning_rate": 1.535711029255197e-05, |
| "loss": 0.7062, |
| "num_input_tokens_seen": 2789144, |
| "step": 7405 |
| }, |
| { |
| "epoch": 6.639784946236559, |
| "grad_norm": 0.3891659080982208, |
| "learning_rate": 1.532104970449999e-05, |
| "loss": 0.6508, |
| "num_input_tokens_seen": 2791000, |
| "step": 7410 |
| }, |
| { |
| "epoch": 6.64426523297491, |
| "grad_norm": 0.5683819055557251, |
| "learning_rate": 1.5285012789514446e-05, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 2793016, |
| "step": 7415 |
| }, |
| { |
| "epoch": 6.648745519713262, |
| "grad_norm": 0.4903818666934967, |
| "learning_rate": 1.5248999635735516e-05, |
| "loss": 0.7113, |
| "num_input_tokens_seen": 2795000, |
| "step": 7420 |
| }, |
| { |
| "epoch": 6.653225806451613, |
| "grad_norm": 0.6192786693572998, |
| "learning_rate": 1.5213010331245259e-05, |
| "loss": 0.7565, |
| "num_input_tokens_seen": 2796984, |
| "step": 7425 |
| }, |
| { |
| "epoch": 6.657706093189964, |
| "grad_norm": 0.6645828485488892, |
| "learning_rate": 1.5177044964067372e-05, |
| "loss": 0.6786, |
| "num_input_tokens_seen": 2798872, |
| "step": 7430 |
| }, |
| { |
| "epoch": 6.662186379928316, |
| "grad_norm": 0.6101380586624146, |
| "learning_rate": 1.5141103622167041e-05, |
| "loss": 0.6987, |
| "num_input_tokens_seen": 2800888, |
| "step": 7435 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.6002082228660583, |
| "learning_rate": 1.5105186393450665e-05, |
| "loss": 0.7221, |
| "num_input_tokens_seen": 2802776, |
| "step": 7440 |
| }, |
| { |
| "epoch": 6.671146953405018, |
| "grad_norm": 0.44711893796920776, |
| "learning_rate": 1.5069293365765685e-05, |
| "loss": 0.6928, |
| "num_input_tokens_seen": 2804504, |
| "step": 7445 |
| }, |
| { |
| "epoch": 6.675627240143369, |
| "grad_norm": 0.4706989824771881, |
| "learning_rate": 1.5033424626900353e-05, |
| "loss": 0.7042, |
| "num_input_tokens_seen": 2806424, |
| "step": 7450 |
| }, |
| { |
| "epoch": 6.68010752688172, |
| "grad_norm": 0.4829060435295105, |
| "learning_rate": 1.4997580264583488e-05, |
| "loss": 0.6754, |
| "num_input_tokens_seen": 2808312, |
| "step": 7455 |
| }, |
| { |
| "epoch": 6.684587813620071, |
| "grad_norm": 0.40096986293792725, |
| "learning_rate": 1.4961760366484307e-05, |
| "loss": 0.6713, |
| "num_input_tokens_seen": 2810232, |
| "step": 7460 |
| }, |
| { |
| "epoch": 6.689068100358423, |
| "grad_norm": 0.6513391137123108, |
| "learning_rate": 1.492596502021219e-05, |
| "loss": 0.6462, |
| "num_input_tokens_seen": 2812248, |
| "step": 7465 |
| }, |
| { |
| "epoch": 6.693548387096774, |
| "grad_norm": 0.5434605479240417, |
| "learning_rate": 1.4890194313316478e-05, |
| "loss": 0.6619, |
| "num_input_tokens_seen": 2814168, |
| "step": 7470 |
| }, |
| { |
| "epoch": 6.698028673835125, |
| "grad_norm": 0.5546020865440369, |
| "learning_rate": 1.4854448333286222e-05, |
| "loss": 0.6814, |
| "num_input_tokens_seen": 2816088, |
| "step": 7475 |
| }, |
| { |
| "epoch": 6.702508960573477, |
| "grad_norm": 0.5495643615722656, |
| "learning_rate": 1.4818727167550025e-05, |
| "loss": 0.7079, |
| "num_input_tokens_seen": 2817944, |
| "step": 7480 |
| }, |
| { |
| "epoch": 6.706989247311828, |
| "grad_norm": 0.5469850897789001, |
| "learning_rate": 1.478303090347577e-05, |
| "loss": 0.6782, |
| "num_input_tokens_seen": 2819928, |
| "step": 7485 |
| }, |
| { |
| "epoch": 6.711469534050179, |
| "grad_norm": 0.41618314385414124, |
| "learning_rate": 1.474735962837045e-05, |
| "loss": 0.7023, |
| "num_input_tokens_seen": 2821880, |
| "step": 7490 |
| }, |
| { |
| "epoch": 6.715949820788531, |
| "grad_norm": 0.4919305145740509, |
| "learning_rate": 1.4711713429479945e-05, |
| "loss": 0.6803, |
| "num_input_tokens_seen": 2823800, |
| "step": 7495 |
| }, |
| { |
| "epoch": 6.720430107526882, |
| "grad_norm": 0.7194111347198486, |
| "learning_rate": 1.4676092393988791e-05, |
| "loss": 0.674, |
| "num_input_tokens_seen": 2825656, |
| "step": 7500 |
| }, |
| { |
| "epoch": 6.724910394265233, |
| "grad_norm": 0.4722954332828522, |
| "learning_rate": 1.4640496609019993e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 2827512, |
| "step": 7505 |
| }, |
| { |
| "epoch": 6.729390681003585, |
| "grad_norm": 0.3946280777454376, |
| "learning_rate": 1.4604926161634768e-05, |
| "loss": 0.6972, |
| "num_input_tokens_seen": 2829336, |
| "step": 7510 |
| }, |
| { |
| "epoch": 6.733870967741936, |
| "grad_norm": 0.35536396503448486, |
| "learning_rate": 1.45693811388324e-05, |
| "loss": 0.696, |
| "num_input_tokens_seen": 2831224, |
| "step": 7515 |
| }, |
| { |
| "epoch": 6.738351254480286, |
| "grad_norm": 0.5042831301689148, |
| "learning_rate": 1.4533861627549953e-05, |
| "loss": 0.6795, |
| "num_input_tokens_seen": 2833176, |
| "step": 7520 |
| }, |
| { |
| "epoch": 6.742831541218638, |
| "grad_norm": 0.6712019443511963, |
| "learning_rate": 1.4498367714662128e-05, |
| "loss": 0.6755, |
| "num_input_tokens_seen": 2835096, |
| "step": 7525 |
| }, |
| { |
| "epoch": 6.747311827956989, |
| "grad_norm": 0.431209921836853, |
| "learning_rate": 1.4462899486980994e-05, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 2836952, |
| "step": 7530 |
| }, |
| { |
| "epoch": 6.75179211469534, |
| "grad_norm": 0.55506432056427, |
| "learning_rate": 1.4427457031255803e-05, |
| "loss": 0.682, |
| "num_input_tokens_seen": 2838936, |
| "step": 7535 |
| }, |
| { |
| "epoch": 6.756272401433692, |
| "grad_norm": 0.6920475363731384, |
| "learning_rate": 1.4392040434172773e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 2840888, |
| "step": 7540 |
| }, |
| { |
| "epoch": 6.760752688172043, |
| "grad_norm": 0.775012731552124, |
| "learning_rate": 1.4356649782354872e-05, |
| "loss": 0.6911, |
| "num_input_tokens_seen": 2842776, |
| "step": 7545 |
| }, |
| { |
| "epoch": 6.765232974910394, |
| "grad_norm": 0.41614946722984314, |
| "learning_rate": 1.432128516236163e-05, |
| "loss": 0.6598, |
| "num_input_tokens_seen": 2844568, |
| "step": 7550 |
| }, |
| { |
| "epoch": 6.769713261648746, |
| "grad_norm": 0.4207038879394531, |
| "learning_rate": 1.4285946660688888e-05, |
| "loss": 0.697, |
| "num_input_tokens_seen": 2846328, |
| "step": 7555 |
| }, |
| { |
| "epoch": 6.774193548387097, |
| "grad_norm": 0.5408599972724915, |
| "learning_rate": 1.4250634363768601e-05, |
| "loss": 0.7036, |
| "num_input_tokens_seen": 2848216, |
| "step": 7560 |
| }, |
| { |
| "epoch": 6.778673835125448, |
| "grad_norm": 0.617653489112854, |
| "learning_rate": 1.4215348357968669e-05, |
| "loss": 0.7178, |
| "num_input_tokens_seen": 2850104, |
| "step": 7565 |
| }, |
| { |
| "epoch": 6.7831541218638, |
| "grad_norm": 0.6825722455978394, |
| "learning_rate": 1.4180088729592633e-05, |
| "loss": 0.6723, |
| "num_input_tokens_seen": 2851960, |
| "step": 7570 |
| }, |
| { |
| "epoch": 6.78763440860215, |
| "grad_norm": 0.48724040389060974, |
| "learning_rate": 1.4144855564879553e-05, |
| "loss": 0.7048, |
| "num_input_tokens_seen": 2853944, |
| "step": 7575 |
| }, |
| { |
| "epoch": 6.792114695340501, |
| "grad_norm": 0.5684961080551147, |
| "learning_rate": 1.410964895000377e-05, |
| "loss": 0.7017, |
| "num_input_tokens_seen": 2855672, |
| "step": 7580 |
| }, |
| { |
| "epoch": 6.796594982078853, |
| "grad_norm": 0.586720883846283, |
| "learning_rate": 1.4074468971074673e-05, |
| "loss": 0.696, |
| "num_input_tokens_seen": 2857496, |
| "step": 7585 |
| }, |
| { |
| "epoch": 6.801075268817204, |
| "grad_norm": 0.44053155183792114, |
| "learning_rate": 1.4039315714136502e-05, |
| "loss": 0.7209, |
| "num_input_tokens_seen": 2859320, |
| "step": 7590 |
| }, |
| { |
| "epoch": 6.805555555555555, |
| "grad_norm": 0.537570595741272, |
| "learning_rate": 1.4004189265168149e-05, |
| "loss": 0.6767, |
| "num_input_tokens_seen": 2861272, |
| "step": 7595 |
| }, |
| { |
| "epoch": 6.810035842293907, |
| "grad_norm": 0.4736902117729187, |
| "learning_rate": 1.3969089710082927e-05, |
| "loss": 0.6851, |
| "num_input_tokens_seen": 2863256, |
| "step": 7600 |
| }, |
| { |
| "epoch": 6.814516129032258, |
| "grad_norm": 0.4806761145591736, |
| "learning_rate": 1.3934017134728397e-05, |
| "loss": 0.6803, |
| "num_input_tokens_seen": 2865048, |
| "step": 7605 |
| }, |
| { |
| "epoch": 6.818996415770609, |
| "grad_norm": 0.563230037689209, |
| "learning_rate": 1.3898971624886101e-05, |
| "loss": 0.7188, |
| "num_input_tokens_seen": 2866904, |
| "step": 7610 |
| }, |
| { |
| "epoch": 6.823476702508961, |
| "grad_norm": 0.6949727535247803, |
| "learning_rate": 1.386395326627139e-05, |
| "loss": 0.6784, |
| "num_input_tokens_seen": 2868920, |
| "step": 7615 |
| }, |
| { |
| "epoch": 6.827956989247312, |
| "grad_norm": 0.5937641263008118, |
| "learning_rate": 1.3828962144533242e-05, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 2870840, |
| "step": 7620 |
| }, |
| { |
| "epoch": 6.832437275985663, |
| "grad_norm": 0.5545377135276794, |
| "learning_rate": 1.379399834525395e-05, |
| "loss": 0.7275, |
| "num_input_tokens_seen": 2872792, |
| "step": 7625 |
| }, |
| { |
| "epoch": 6.836917562724015, |
| "grad_norm": 0.5834463238716125, |
| "learning_rate": 1.3759061953949054e-05, |
| "loss": 0.6668, |
| "num_input_tokens_seen": 2874552, |
| "step": 7630 |
| }, |
| { |
| "epoch": 6.841397849462366, |
| "grad_norm": 0.6372240781784058, |
| "learning_rate": 1.3724153056067013e-05, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 2876312, |
| "step": 7635 |
| }, |
| { |
| "epoch": 6.845878136200717, |
| "grad_norm": 0.4398591220378876, |
| "learning_rate": 1.3689271736989046e-05, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 2878200, |
| "step": 7640 |
| }, |
| { |
| "epoch": 6.850358422939068, |
| "grad_norm": 0.43143922090530396, |
| "learning_rate": 1.3654418082028956e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 2879992, |
| "step": 7645 |
| }, |
| { |
| "epoch": 6.854838709677419, |
| "grad_norm": 0.5556138753890991, |
| "learning_rate": 1.3619592176432816e-05, |
| "loss": 0.7245, |
| "num_input_tokens_seen": 2881816, |
| "step": 7650 |
| }, |
| { |
| "epoch": 6.85931899641577, |
| "grad_norm": 0.8755431771278381, |
| "learning_rate": 1.3584794105378904e-05, |
| "loss": 0.7197, |
| "num_input_tokens_seen": 2883832, |
| "step": 7655 |
| }, |
| { |
| "epoch": 6.863799283154122, |
| "grad_norm": 0.5189265608787537, |
| "learning_rate": 1.3550023953977367e-05, |
| "loss": 0.6686, |
| "num_input_tokens_seen": 2885848, |
| "step": 7660 |
| }, |
| { |
| "epoch": 6.868279569892473, |
| "grad_norm": 0.6444762945175171, |
| "learning_rate": 1.3515281807270075e-05, |
| "loss": 0.7059, |
| "num_input_tokens_seen": 2887864, |
| "step": 7665 |
| }, |
| { |
| "epoch": 6.872759856630824, |
| "grad_norm": 0.60382479429245, |
| "learning_rate": 1.3480567750230433e-05, |
| "loss": 0.7083, |
| "num_input_tokens_seen": 2889816, |
| "step": 7670 |
| }, |
| { |
| "epoch": 6.877240143369176, |
| "grad_norm": 0.5019137859344482, |
| "learning_rate": 1.344588186776311e-05, |
| "loss": 0.7427, |
| "num_input_tokens_seen": 2891608, |
| "step": 7675 |
| }, |
| { |
| "epoch": 6.881720430107527, |
| "grad_norm": 0.37922972440719604, |
| "learning_rate": 1.3411224244703873e-05, |
| "loss": 0.7213, |
| "num_input_tokens_seen": 2893528, |
| "step": 7680 |
| }, |
| { |
| "epoch": 6.886200716845878, |
| "grad_norm": 0.537884533405304, |
| "learning_rate": 1.3376594965819378e-05, |
| "loss": 0.6898, |
| "num_input_tokens_seen": 2895576, |
| "step": 7685 |
| }, |
| { |
| "epoch": 6.89068100358423, |
| "grad_norm": 0.4329852759838104, |
| "learning_rate": 1.3341994115806943e-05, |
| "loss": 0.6824, |
| "num_input_tokens_seen": 2897592, |
| "step": 7690 |
| }, |
| { |
| "epoch": 6.895161290322581, |
| "grad_norm": 0.36487624049186707, |
| "learning_rate": 1.3307421779294377e-05, |
| "loss": 0.6868, |
| "num_input_tokens_seen": 2899384, |
| "step": 7695 |
| }, |
| { |
| "epoch": 6.899641577060932, |
| "grad_norm": 0.5098947286605835, |
| "learning_rate": 1.3272878040839742e-05, |
| "loss": 0.6745, |
| "num_input_tokens_seen": 2901240, |
| "step": 7700 |
| }, |
| { |
| "epoch": 6.904121863799283, |
| "grad_norm": 0.4954400360584259, |
| "learning_rate": 1.3238362984931113e-05, |
| "loss": 0.6972, |
| "num_input_tokens_seen": 2903224, |
| "step": 7705 |
| }, |
| { |
| "epoch": 6.908602150537634, |
| "grad_norm": 0.3625122904777527, |
| "learning_rate": 1.3203876695986478e-05, |
| "loss": 0.691, |
| "num_input_tokens_seen": 2905112, |
| "step": 7710 |
| }, |
| { |
| "epoch": 6.913082437275985, |
| "grad_norm": 0.5607591271400452, |
| "learning_rate": 1.3169419258353433e-05, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 2907192, |
| "step": 7715 |
| }, |
| { |
| "epoch": 6.917562724014337, |
| "grad_norm": 0.4479368031024933, |
| "learning_rate": 1.313499075630899e-05, |
| "loss": 0.709, |
| "num_input_tokens_seen": 2909016, |
| "step": 7720 |
| }, |
| { |
| "epoch": 6.922043010752688, |
| "grad_norm": 0.6518425345420837, |
| "learning_rate": 1.3100591274059431e-05, |
| "loss": 0.6698, |
| "num_input_tokens_seen": 2910968, |
| "step": 7725 |
| }, |
| { |
| "epoch": 6.926523297491039, |
| "grad_norm": 0.48006075620651245, |
| "learning_rate": 1.3066220895740039e-05, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 2913080, |
| "step": 7730 |
| }, |
| { |
| "epoch": 6.931003584229391, |
| "grad_norm": 0.5145459771156311, |
| "learning_rate": 1.3031879705414907e-05, |
| "loss": 0.6862, |
| "num_input_tokens_seen": 2914968, |
| "step": 7735 |
| }, |
| { |
| "epoch": 6.935483870967742, |
| "grad_norm": 0.35982221364974976, |
| "learning_rate": 1.2997567787076747e-05, |
| "loss": 0.6826, |
| "num_input_tokens_seen": 2916824, |
| "step": 7740 |
| }, |
| { |
| "epoch": 6.939964157706093, |
| "grad_norm": 0.5295773148536682, |
| "learning_rate": 1.296328522464667e-05, |
| "loss": 0.7037, |
| "num_input_tokens_seen": 2919032, |
| "step": 7745 |
| }, |
| { |
| "epoch": 6.944444444444445, |
| "grad_norm": 0.4160950183868408, |
| "learning_rate": 1.2929032101974009e-05, |
| "loss": 0.6743, |
| "num_input_tokens_seen": 2920920, |
| "step": 7750 |
| }, |
| { |
| "epoch": 6.948924731182796, |
| "grad_norm": 0.502730667591095, |
| "learning_rate": 1.289480850283607e-05, |
| "loss": 0.6956, |
| "num_input_tokens_seen": 2923032, |
| "step": 7755 |
| }, |
| { |
| "epoch": 6.953405017921147, |
| "grad_norm": 0.5727908611297607, |
| "learning_rate": 1.2860614510937955e-05, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 2924856, |
| "step": 7760 |
| }, |
| { |
| "epoch": 6.957885304659499, |
| "grad_norm": 0.5175768136978149, |
| "learning_rate": 1.2826450209912355e-05, |
| "loss": 0.7139, |
| "num_input_tokens_seen": 2926680, |
| "step": 7765 |
| }, |
| { |
| "epoch": 6.96236559139785, |
| "grad_norm": 0.63689786195755, |
| "learning_rate": 1.2792315683319328e-05, |
| "loss": 0.6853, |
| "num_input_tokens_seen": 2928568, |
| "step": 7770 |
| }, |
| { |
| "epoch": 6.9668458781362, |
| "grad_norm": 0.6053296327590942, |
| "learning_rate": 1.2758211014646143e-05, |
| "loss": 0.7121, |
| "num_input_tokens_seen": 2930424, |
| "step": 7775 |
| }, |
| { |
| "epoch": 6.971326164874552, |
| "grad_norm": 0.4488166272640228, |
| "learning_rate": 1.2724136287307009e-05, |
| "loss": 0.6899, |
| "num_input_tokens_seen": 2932376, |
| "step": 7780 |
| }, |
| { |
| "epoch": 6.975806451612903, |
| "grad_norm": 0.5750177502632141, |
| "learning_rate": 1.2690091584642916e-05, |
| "loss": 0.7093, |
| "num_input_tokens_seen": 2934072, |
| "step": 7785 |
| }, |
| { |
| "epoch": 6.980286738351254, |
| "grad_norm": 0.4811491370201111, |
| "learning_rate": 1.2656076989921417e-05, |
| "loss": 0.6774, |
| "num_input_tokens_seen": 2935896, |
| "step": 7790 |
| }, |
| { |
| "epoch": 6.984767025089606, |
| "grad_norm": 0.6178358793258667, |
| "learning_rate": 1.2622092586336415e-05, |
| "loss": 0.6656, |
| "num_input_tokens_seen": 2937720, |
| "step": 7795 |
| }, |
| { |
| "epoch": 6.989247311827957, |
| "grad_norm": 0.814631998538971, |
| "learning_rate": 1.2588138457008e-05, |
| "loss": 0.694, |
| "num_input_tokens_seen": 2939480, |
| "step": 7800 |
| }, |
| { |
| "epoch": 6.993727598566308, |
| "grad_norm": 0.555487871170044, |
| "learning_rate": 1.2554214684982191e-05, |
| "loss": 0.6775, |
| "num_input_tokens_seen": 2941304, |
| "step": 7805 |
| }, |
| { |
| "epoch": 6.99820788530466, |
| "grad_norm": 0.4906958341598511, |
| "learning_rate": 1.2520321353230769e-05, |
| "loss": 0.6765, |
| "num_input_tokens_seen": 2943256, |
| "step": 7810 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.7003687620162964, |
| "eval_runtime": 5.6201, |
| "eval_samples_per_second": 88.255, |
| "eval_steps_per_second": 22.064, |
| "num_input_tokens_seen": 2943688, |
| "step": 7812 |
| }, |
| { |
| "epoch": 7.002688172043011, |
| "grad_norm": 0.6357385516166687, |
| "learning_rate": 1.248645854465105e-05, |
| "loss": 0.6548, |
| "num_input_tokens_seen": 2944680, |
| "step": 7815 |
| }, |
| { |
| "epoch": 7.007168458781362, |
| "grad_norm": 0.4822031557559967, |
| "learning_rate": 1.2452626342065702e-05, |
| "loss": 0.683, |
| "num_input_tokens_seen": 2946696, |
| "step": 7820 |
| }, |
| { |
| "epoch": 7.011648745519714, |
| "grad_norm": 0.5621390342712402, |
| "learning_rate": 1.2418824828222559e-05, |
| "loss": 0.6629, |
| "num_input_tokens_seen": 2948616, |
| "step": 7825 |
| }, |
| { |
| "epoch": 7.016129032258065, |
| "grad_norm": 0.5329768061637878, |
| "learning_rate": 1.2385054085794361e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 2950472, |
| "step": 7830 |
| }, |
| { |
| "epoch": 7.020609318996415, |
| "grad_norm": 0.49358680844306946, |
| "learning_rate": 1.2351314197378597e-05, |
| "loss": 0.7007, |
| "num_input_tokens_seen": 2952392, |
| "step": 7835 |
| }, |
| { |
| "epoch": 7.025089605734767, |
| "grad_norm": 0.5468727946281433, |
| "learning_rate": 1.2317605245497323e-05, |
| "loss": 0.6655, |
| "num_input_tokens_seen": 2954248, |
| "step": 7840 |
| }, |
| { |
| "epoch": 7.029569892473118, |
| "grad_norm": 0.5755560994148254, |
| "learning_rate": 1.2283927312596874e-05, |
| "loss": 0.6909, |
| "num_input_tokens_seen": 2956072, |
| "step": 7845 |
| }, |
| { |
| "epoch": 7.034050179211469, |
| "grad_norm": 0.48634764552116394, |
| "learning_rate": 1.2250280481047746e-05, |
| "loss": 0.6717, |
| "num_input_tokens_seen": 2958024, |
| "step": 7850 |
| }, |
| { |
| "epoch": 7.038530465949821, |
| "grad_norm": 0.7252983450889587, |
| "learning_rate": 1.2216664833144386e-05, |
| "loss": 0.7125, |
| "num_input_tokens_seen": 2959816, |
| "step": 7855 |
| }, |
| { |
| "epoch": 7.043010752688172, |
| "grad_norm": 0.7096565961837769, |
| "learning_rate": 1.2183080451104937e-05, |
| "loss": 0.7341, |
| "num_input_tokens_seen": 2961672, |
| "step": 7860 |
| }, |
| { |
| "epoch": 7.047491039426523, |
| "grad_norm": 0.5406831502914429, |
| "learning_rate": 1.2149527417071107e-05, |
| "loss": 0.6632, |
| "num_input_tokens_seen": 2963688, |
| "step": 7865 |
| }, |
| { |
| "epoch": 7.051971326164875, |
| "grad_norm": 0.5515265464782715, |
| "learning_rate": 1.2116005813107891e-05, |
| "loss": 0.6838, |
| "num_input_tokens_seen": 2965576, |
| "step": 7870 |
| }, |
| { |
| "epoch": 7.056451612903226, |
| "grad_norm": 0.42072048783302307, |
| "learning_rate": 1.2082515721203427e-05, |
| "loss": 0.6974, |
| "num_input_tokens_seen": 2967464, |
| "step": 7875 |
| }, |
| { |
| "epoch": 7.060931899641577, |
| "grad_norm": 0.46907660365104675, |
| "learning_rate": 1.2049057223268807e-05, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 2969416, |
| "step": 7880 |
| }, |
| { |
| "epoch": 7.065412186379929, |
| "grad_norm": 0.47050419449806213, |
| "learning_rate": 1.2015630401137812e-05, |
| "loss": 0.6843, |
| "num_input_tokens_seen": 2971304, |
| "step": 7885 |
| }, |
| { |
| "epoch": 7.06989247311828, |
| "grad_norm": 0.3717748820781708, |
| "learning_rate": 1.198223533656676e-05, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 2973096, |
| "step": 7890 |
| }, |
| { |
| "epoch": 7.07437275985663, |
| "grad_norm": 0.690002977848053, |
| "learning_rate": 1.1948872111234327e-05, |
| "loss": 0.669, |
| "num_input_tokens_seen": 2975080, |
| "step": 7895 |
| }, |
| { |
| "epoch": 7.078853046594982, |
| "grad_norm": 0.6414685845375061, |
| "learning_rate": 1.191554080674125e-05, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 2977064, |
| "step": 7900 |
| }, |
| { |
| "epoch": 7.083333333333333, |
| "grad_norm": 0.6106062531471252, |
| "learning_rate": 1.188224150461026e-05, |
| "loss": 0.6997, |
| "num_input_tokens_seen": 2979016, |
| "step": 7905 |
| }, |
| { |
| "epoch": 7.087813620071684, |
| "grad_norm": 0.7638527750968933, |
| "learning_rate": 1.1848974286285774e-05, |
| "loss": 0.6863, |
| "num_input_tokens_seen": 2980904, |
| "step": 7910 |
| }, |
| { |
| "epoch": 7.092293906810036, |
| "grad_norm": 0.5625856518745422, |
| "learning_rate": 1.181573923313375e-05, |
| "loss": 0.6968, |
| "num_input_tokens_seen": 2982792, |
| "step": 7915 |
| }, |
| { |
| "epoch": 7.096774193548387, |
| "grad_norm": 0.5783429145812988, |
| "learning_rate": 1.1782536426441498e-05, |
| "loss": 0.6761, |
| "num_input_tokens_seen": 2984552, |
| "step": 7920 |
| }, |
| { |
| "epoch": 7.101254480286738, |
| "grad_norm": 0.4028991162776947, |
| "learning_rate": 1.17493659474174e-05, |
| "loss": 0.6894, |
| "num_input_tokens_seen": 2986280, |
| "step": 7925 |
| }, |
| { |
| "epoch": 7.10573476702509, |
| "grad_norm": 0.5815252661705017, |
| "learning_rate": 1.1716227877190839e-05, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 2988200, |
| "step": 7930 |
| }, |
| { |
| "epoch": 7.110215053763441, |
| "grad_norm": 0.5259630084037781, |
| "learning_rate": 1.1683122296811883e-05, |
| "loss": 0.663, |
| "num_input_tokens_seen": 2989928, |
| "step": 7935 |
| }, |
| { |
| "epoch": 7.114695340501792, |
| "grad_norm": 0.6334452629089355, |
| "learning_rate": 1.1650049287251147e-05, |
| "loss": 0.6699, |
| "num_input_tokens_seen": 2991752, |
| "step": 7940 |
| }, |
| { |
| "epoch": 7.119175627240144, |
| "grad_norm": 0.5388948917388916, |
| "learning_rate": 1.1617008929399606e-05, |
| "loss": 0.6883, |
| "num_input_tokens_seen": 2993640, |
| "step": 7945 |
| }, |
| { |
| "epoch": 7.123655913978495, |
| "grad_norm": 0.712680459022522, |
| "learning_rate": 1.1584001304068349e-05, |
| "loss": 0.6778, |
| "num_input_tokens_seen": 2995528, |
| "step": 7950 |
| }, |
| { |
| "epoch": 7.128136200716846, |
| "grad_norm": 0.5485901236534119, |
| "learning_rate": 1.155102649198841e-05, |
| "loss": 0.6874, |
| "num_input_tokens_seen": 2997512, |
| "step": 7955 |
| }, |
| { |
| "epoch": 7.132616487455197, |
| "grad_norm": 0.5746596455574036, |
| "learning_rate": 1.1518084573810575e-05, |
| "loss": 0.7271, |
| "num_input_tokens_seen": 2999272, |
| "step": 7960 |
| }, |
| { |
| "epoch": 7.137096774193548, |
| "grad_norm": 0.5184845924377441, |
| "learning_rate": 1.1485175630105163e-05, |
| "loss": 0.6905, |
| "num_input_tokens_seen": 3001160, |
| "step": 7965 |
| }, |
| { |
| "epoch": 7.141577060931899, |
| "grad_norm": 0.49086540937423706, |
| "learning_rate": 1.1452299741361875e-05, |
| "loss": 0.6594, |
| "num_input_tokens_seen": 3003048, |
| "step": 7970 |
| }, |
| { |
| "epoch": 7.146057347670251, |
| "grad_norm": 0.548001229763031, |
| "learning_rate": 1.141945698798954e-05, |
| "loss": 0.7119, |
| "num_input_tokens_seen": 3005224, |
| "step": 7975 |
| }, |
| { |
| "epoch": 7.150537634408602, |
| "grad_norm": 0.47451603412628174, |
| "learning_rate": 1.1386647450315924e-05, |
| "loss": 0.6799, |
| "num_input_tokens_seen": 3007112, |
| "step": 7980 |
| }, |
| { |
| "epoch": 7.155017921146953, |
| "grad_norm": 0.6413478851318359, |
| "learning_rate": 1.1353871208587602e-05, |
| "loss": 0.7233, |
| "num_input_tokens_seen": 3009000, |
| "step": 7985 |
| }, |
| { |
| "epoch": 7.159498207885305, |
| "grad_norm": 0.6764240264892578, |
| "learning_rate": 1.132112834296967e-05, |
| "loss": 0.6607, |
| "num_input_tokens_seen": 3010920, |
| "step": 7990 |
| }, |
| { |
| "epoch": 7.163978494623656, |
| "grad_norm": 0.4934854209423065, |
| "learning_rate": 1.1288418933545624e-05, |
| "loss": 0.7034, |
| "num_input_tokens_seen": 3012936, |
| "step": 7995 |
| }, |
| { |
| "epoch": 7.168458781362007, |
| "grad_norm": 0.32125502824783325, |
| "learning_rate": 1.1255743060317115e-05, |
| "loss": 0.6992, |
| "num_input_tokens_seen": 3014600, |
| "step": 8000 |
| }, |
| { |
| "epoch": 7.172939068100359, |
| "grad_norm": 0.5356723070144653, |
| "learning_rate": 1.1223100803203767e-05, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 3016552, |
| "step": 8005 |
| }, |
| { |
| "epoch": 7.17741935483871, |
| "grad_norm": 0.3970068395137787, |
| "learning_rate": 1.1190492242042989e-05, |
| "loss": 0.7154, |
| "num_input_tokens_seen": 3018376, |
| "step": 8010 |
| }, |
| { |
| "epoch": 7.181899641577061, |
| "grad_norm": 0.5137478113174438, |
| "learning_rate": 1.1157917456589778e-05, |
| "loss": 0.7028, |
| "num_input_tokens_seen": 3020296, |
| "step": 8015 |
| }, |
| { |
| "epoch": 7.186379928315413, |
| "grad_norm": 0.6350739598274231, |
| "learning_rate": 1.1125376526516511e-05, |
| "loss": 0.6737, |
| "num_input_tokens_seen": 3022120, |
| "step": 8020 |
| }, |
| { |
| "epoch": 7.190860215053763, |
| "grad_norm": 0.5553760528564453, |
| "learning_rate": 1.109286953141279e-05, |
| "loss": 0.6878, |
| "num_input_tokens_seen": 3023816, |
| "step": 8025 |
| }, |
| { |
| "epoch": 7.195340501792114, |
| "grad_norm": 0.42521554231643677, |
| "learning_rate": 1.1060396550785182e-05, |
| "loss": 0.6979, |
| "num_input_tokens_seen": 3025672, |
| "step": 8030 |
| }, |
| { |
| "epoch": 7.199820788530466, |
| "grad_norm": 0.5245974659919739, |
| "learning_rate": 1.1027957664057079e-05, |
| "loss": 0.7418, |
| "num_input_tokens_seen": 3027496, |
| "step": 8035 |
| }, |
| { |
| "epoch": 7.204301075268817, |
| "grad_norm": 0.4561592638492584, |
| "learning_rate": 1.099555295056848e-05, |
| "loss": 0.6645, |
| "num_input_tokens_seen": 3029288, |
| "step": 8040 |
| }, |
| { |
| "epoch": 7.208781362007168, |
| "grad_norm": 0.46861767768859863, |
| "learning_rate": 1.0963182489575797e-05, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 3031080, |
| "step": 8045 |
| }, |
| { |
| "epoch": 7.21326164874552, |
| "grad_norm": 0.4887754023075104, |
| "learning_rate": 1.0930846360251684e-05, |
| "loss": 0.6598, |
| "num_input_tokens_seen": 3033128, |
| "step": 8050 |
| }, |
| { |
| "epoch": 7.217741935483871, |
| "grad_norm": 0.7280912399291992, |
| "learning_rate": 1.0898544641684816e-05, |
| "loss": 0.6929, |
| "num_input_tokens_seen": 3035144, |
| "step": 8055 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 0.5493155121803284, |
| "learning_rate": 1.0866277412879695e-05, |
| "loss": 0.7104, |
| "num_input_tokens_seen": 3037032, |
| "step": 8060 |
| }, |
| { |
| "epoch": 7.226702508960574, |
| "grad_norm": 0.3439621925354004, |
| "learning_rate": 1.0834044752756478e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 3038952, |
| "step": 8065 |
| }, |
| { |
| "epoch": 7.231182795698925, |
| "grad_norm": 0.5505177974700928, |
| "learning_rate": 1.0801846740150759e-05, |
| "loss": 0.6735, |
| "num_input_tokens_seen": 3041000, |
| "step": 8070 |
| }, |
| { |
| "epoch": 7.235663082437276, |
| "grad_norm": 0.44457921385765076, |
| "learning_rate": 1.0769683453813426e-05, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 3042824, |
| "step": 8075 |
| }, |
| { |
| "epoch": 7.240143369175628, |
| "grad_norm": 0.41731348633766174, |
| "learning_rate": 1.0737554972410391e-05, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 3044648, |
| "step": 8080 |
| }, |
| { |
| "epoch": 7.244623655913978, |
| "grad_norm": 0.6889303922653198, |
| "learning_rate": 1.0705461374522463e-05, |
| "loss": 0.6767, |
| "num_input_tokens_seen": 3046664, |
| "step": 8085 |
| }, |
| { |
| "epoch": 7.249103942652329, |
| "grad_norm": 0.4339003264904022, |
| "learning_rate": 1.0673402738645116e-05, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 3048456, |
| "step": 8090 |
| }, |
| { |
| "epoch": 7.253584229390681, |
| "grad_norm": 0.5571053624153137, |
| "learning_rate": 1.0641379143188321e-05, |
| "loss": 0.7222, |
| "num_input_tokens_seen": 3050408, |
| "step": 8095 |
| }, |
| { |
| "epoch": 7.258064516129032, |
| "grad_norm": 0.5285934805870056, |
| "learning_rate": 1.060939066647636e-05, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 3052264, |
| "step": 8100 |
| }, |
| { |
| "epoch": 7.262544802867383, |
| "grad_norm": 0.5844146013259888, |
| "learning_rate": 1.0577437386747601e-05, |
| "loss": 0.6693, |
| "num_input_tokens_seen": 3054184, |
| "step": 8105 |
| }, |
| { |
| "epoch": 7.267025089605735, |
| "grad_norm": 0.5706028342247009, |
| "learning_rate": 1.054551938215432e-05, |
| "loss": 0.7184, |
| "num_input_tokens_seen": 3056008, |
| "step": 8110 |
| }, |
| { |
| "epoch": 7.271505376344086, |
| "grad_norm": 0.6662572622299194, |
| "learning_rate": 1.0513636730762558e-05, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 3057992, |
| "step": 8115 |
| }, |
| { |
| "epoch": 7.275985663082437, |
| "grad_norm": 0.5411928296089172, |
| "learning_rate": 1.0481789510551821e-05, |
| "loss": 0.6712, |
| "num_input_tokens_seen": 3059720, |
| "step": 8120 |
| }, |
| { |
| "epoch": 7.280465949820789, |
| "grad_norm": 0.46258223056793213, |
| "learning_rate": 1.044997779941502e-05, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 3061576, |
| "step": 8125 |
| }, |
| { |
| "epoch": 7.28494623655914, |
| "grad_norm": 0.5718064308166504, |
| "learning_rate": 1.0418201675158182e-05, |
| "loss": 0.707, |
| "num_input_tokens_seen": 3063368, |
| "step": 8130 |
| }, |
| { |
| "epoch": 7.289426523297491, |
| "grad_norm": 0.5663242936134338, |
| "learning_rate": 1.0386461215500296e-05, |
| "loss": 0.6284, |
| "num_input_tokens_seen": 3065128, |
| "step": 8135 |
| }, |
| { |
| "epoch": 7.293906810035843, |
| "grad_norm": 0.647124171257019, |
| "learning_rate": 1.0354756498073156e-05, |
| "loss": 0.6919, |
| "num_input_tokens_seen": 3067144, |
| "step": 8140 |
| }, |
| { |
| "epoch": 7.298387096774194, |
| "grad_norm": 0.5559285283088684, |
| "learning_rate": 1.032308760042108e-05, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 3069064, |
| "step": 8145 |
| }, |
| { |
| "epoch": 7.302867383512545, |
| "grad_norm": 0.5535130500793457, |
| "learning_rate": 1.0291454600000805e-05, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 3071048, |
| "step": 8150 |
| }, |
| { |
| "epoch": 7.307347670250896, |
| "grad_norm": 0.48422160744667053, |
| "learning_rate": 1.0259857574181292e-05, |
| "loss": 0.6874, |
| "num_input_tokens_seen": 3073032, |
| "step": 8155 |
| }, |
| { |
| "epoch": 7.311827956989247, |
| "grad_norm": 0.4725182354450226, |
| "learning_rate": 1.0228296600243483e-05, |
| "loss": 0.666, |
| "num_input_tokens_seen": 3074824, |
| "step": 8160 |
| }, |
| { |
| "epoch": 7.316308243727598, |
| "grad_norm": 0.5510170459747314, |
| "learning_rate": 1.0196771755380145e-05, |
| "loss": 0.6881, |
| "num_input_tokens_seen": 3076712, |
| "step": 8165 |
| }, |
| { |
| "epoch": 7.32078853046595, |
| "grad_norm": 0.22946563363075256, |
| "learning_rate": 1.016528311669571e-05, |
| "loss": 0.7115, |
| "num_input_tokens_seen": 3078536, |
| "step": 8170 |
| }, |
| { |
| "epoch": 7.325268817204301, |
| "grad_norm": 0.559217631816864, |
| "learning_rate": 1.0133830761206e-05, |
| "loss": 0.6647, |
| "num_input_tokens_seen": 3080424, |
| "step": 8175 |
| }, |
| { |
| "epoch": 7.329749103942652, |
| "grad_norm": 0.6456106305122375, |
| "learning_rate": 1.0102414765838156e-05, |
| "loss": 0.6888, |
| "num_input_tokens_seen": 3082472, |
| "step": 8180 |
| }, |
| { |
| "epoch": 7.334229390681004, |
| "grad_norm": 0.635503351688385, |
| "learning_rate": 1.0071035207430352e-05, |
| "loss": 0.6336, |
| "num_input_tokens_seen": 3084328, |
| "step": 8185 |
| }, |
| { |
| "epoch": 7.338709677419355, |
| "grad_norm": 0.519087553024292, |
| "learning_rate": 1.0039692162731637e-05, |
| "loss": 0.6786, |
| "num_input_tokens_seen": 3086120, |
| "step": 8190 |
| }, |
| { |
| "epoch": 7.343189964157706, |
| "grad_norm": 0.32819828391075134, |
| "learning_rate": 1.0008385708401802e-05, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 3087976, |
| "step": 8195 |
| }, |
| { |
| "epoch": 7.347670250896058, |
| "grad_norm": 0.5701029896736145, |
| "learning_rate": 9.977115921011071e-06, |
| "loss": 0.6909, |
| "num_input_tokens_seen": 3089864, |
| "step": 8200 |
| }, |
| { |
| "epoch": 7.352150537634409, |
| "grad_norm": 0.5487130880355835, |
| "learning_rate": 9.945882877040053e-06, |
| "loss": 0.6849, |
| "num_input_tokens_seen": 3091688, |
| "step": 8205 |
| }, |
| { |
| "epoch": 7.356630824372759, |
| "grad_norm": 0.4447524845600128, |
| "learning_rate": 9.914686652879454e-06, |
| "loss": 0.7049, |
| "num_input_tokens_seen": 3093480, |
| "step": 8210 |
| }, |
| { |
| "epoch": 7.361111111111111, |
| "grad_norm": 0.7242264747619629, |
| "learning_rate": 9.883527324829925e-06, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 3095368, |
| "step": 8215 |
| }, |
| { |
| "epoch": 7.365591397849462, |
| "grad_norm": 0.44289588928222656, |
| "learning_rate": 9.8524049691019e-06, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 3097224, |
| "step": 8220 |
| }, |
| { |
| "epoch": 7.370071684587813, |
| "grad_norm": 0.49530482292175293, |
| "learning_rate": 9.821319661815359e-06, |
| "loss": 0.7155, |
| "num_input_tokens_seen": 3099016, |
| "step": 8225 |
| }, |
| { |
| "epoch": 7.374551971326165, |
| "grad_norm": 0.7250574827194214, |
| "learning_rate": 9.790271478999677e-06, |
| "loss": 0.689, |
| "num_input_tokens_seen": 3100904, |
| "step": 8230 |
| }, |
| { |
| "epoch": 7.379032258064516, |
| "grad_norm": 0.6142673492431641, |
| "learning_rate": 9.759260496593434e-06, |
| "loss": 0.6448, |
| "num_input_tokens_seen": 3102696, |
| "step": 8235 |
| }, |
| { |
| "epoch": 7.383512544802867, |
| "grad_norm": 0.4833540916442871, |
| "learning_rate": 9.728286790444206e-06, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 3104488, |
| "step": 8240 |
| }, |
| { |
| "epoch": 7.387992831541219, |
| "grad_norm": 0.5866033434867859, |
| "learning_rate": 9.697350436308427e-06, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 3106440, |
| "step": 8245 |
| }, |
| { |
| "epoch": 7.39247311827957, |
| "grad_norm": 0.5501033663749695, |
| "learning_rate": 9.666451509851158e-06, |
| "loss": 0.6574, |
| "num_input_tokens_seen": 3108264, |
| "step": 8250 |
| }, |
| { |
| "epoch": 7.396953405017921, |
| "grad_norm": 0.5121339559555054, |
| "learning_rate": 9.635590086645906e-06, |
| "loss": 0.7593, |
| "num_input_tokens_seen": 3110120, |
| "step": 8255 |
| }, |
| { |
| "epoch": 7.401433691756273, |
| "grad_norm": 0.5490121841430664, |
| "learning_rate": 9.604766242174474e-06, |
| "loss": 0.6737, |
| "num_input_tokens_seen": 3111912, |
| "step": 8260 |
| }, |
| { |
| "epoch": 7.405913978494624, |
| "grad_norm": 0.49912509322166443, |
| "learning_rate": 9.573980051826731e-06, |
| "loss": 0.6849, |
| "num_input_tokens_seen": 3113832, |
| "step": 8265 |
| }, |
| { |
| "epoch": 7.410394265232975, |
| "grad_norm": 0.4087314009666443, |
| "learning_rate": 9.54323159090048e-06, |
| "loss": 0.6895, |
| "num_input_tokens_seen": 3115624, |
| "step": 8270 |
| }, |
| { |
| "epoch": 7.414874551971327, |
| "grad_norm": 0.40049365162849426, |
| "learning_rate": 9.512520934601225e-06, |
| "loss": 0.6722, |
| "num_input_tokens_seen": 3117544, |
| "step": 8275 |
| }, |
| { |
| "epoch": 7.419354838709677, |
| "grad_norm": 0.490710586309433, |
| "learning_rate": 9.481848158041998e-06, |
| "loss": 0.6829, |
| "num_input_tokens_seen": 3119464, |
| "step": 8280 |
| }, |
| { |
| "epoch": 7.423835125448028, |
| "grad_norm": 0.6339983940124512, |
| "learning_rate": 9.4512133362432e-06, |
| "loss": 0.6974, |
| "num_input_tokens_seen": 3121224, |
| "step": 8285 |
| }, |
| { |
| "epoch": 7.42831541218638, |
| "grad_norm": 0.49414268136024475, |
| "learning_rate": 9.4206165441324e-06, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 3123080, |
| "step": 8290 |
| }, |
| { |
| "epoch": 7.432795698924731, |
| "grad_norm": 0.43187791109085083, |
| "learning_rate": 9.390057856544129e-06, |
| "loss": 0.7088, |
| "num_input_tokens_seen": 3125000, |
| "step": 8295 |
| }, |
| { |
| "epoch": 7.437275985663082, |
| "grad_norm": 0.5625995993614197, |
| "learning_rate": 9.359537348219768e-06, |
| "loss": 0.6566, |
| "num_input_tokens_seen": 3127080, |
| "step": 8300 |
| }, |
| { |
| "epoch": 7.441756272401434, |
| "grad_norm": 0.45806175470352173, |
| "learning_rate": 9.329055093807268e-06, |
| "loss": 0.6758, |
| "num_input_tokens_seen": 3129032, |
| "step": 8305 |
| }, |
| { |
| "epoch": 7.446236559139785, |
| "grad_norm": 0.5392464995384216, |
| "learning_rate": 9.298611167861062e-06, |
| "loss": 0.7007, |
| "num_input_tokens_seen": 3130792, |
| "step": 8310 |
| }, |
| { |
| "epoch": 7.450716845878136, |
| "grad_norm": 0.7194018363952637, |
| "learning_rate": 9.2682056448418e-06, |
| "loss": 0.7235, |
| "num_input_tokens_seen": 3132776, |
| "step": 8315 |
| }, |
| { |
| "epoch": 7.455197132616488, |
| "grad_norm": 0.3086296021938324, |
| "learning_rate": 9.237838599116208e-06, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 3134728, |
| "step": 8320 |
| }, |
| { |
| "epoch": 7.459677419354839, |
| "grad_norm": 0.6527276635169983, |
| "learning_rate": 9.207510104956944e-06, |
| "loss": 0.7155, |
| "num_input_tokens_seen": 3136616, |
| "step": 8325 |
| }, |
| { |
| "epoch": 7.46415770609319, |
| "grad_norm": 0.6955182552337646, |
| "learning_rate": 9.17722023654233e-06, |
| "loss": 0.7086, |
| "num_input_tokens_seen": 3138632, |
| "step": 8330 |
| }, |
| { |
| "epoch": 7.468637992831542, |
| "grad_norm": 0.5681894421577454, |
| "learning_rate": 9.146969067956238e-06, |
| "loss": 0.6894, |
| "num_input_tokens_seen": 3140456, |
| "step": 8335 |
| }, |
| { |
| "epoch": 7.473118279569892, |
| "grad_norm": 0.640601634979248, |
| "learning_rate": 9.116756673187878e-06, |
| "loss": 0.6617, |
| "num_input_tokens_seen": 3142312, |
| "step": 8340 |
| }, |
| { |
| "epoch": 7.477598566308243, |
| "grad_norm": 0.34400156140327454, |
| "learning_rate": 9.08658312613163e-06, |
| "loss": 0.6795, |
| "num_input_tokens_seen": 3144360, |
| "step": 8345 |
| }, |
| { |
| "epoch": 7.482078853046595, |
| "grad_norm": 0.6456219553947449, |
| "learning_rate": 9.056448500586865e-06, |
| "loss": 0.7127, |
| "num_input_tokens_seen": 3146152, |
| "step": 8350 |
| }, |
| { |
| "epoch": 7.486559139784946, |
| "grad_norm": 0.5124385952949524, |
| "learning_rate": 9.026352870257748e-06, |
| "loss": 0.6962, |
| "num_input_tokens_seen": 3148040, |
| "step": 8355 |
| }, |
| { |
| "epoch": 7.491039426523297, |
| "grad_norm": 0.6023479700088501, |
| "learning_rate": 8.996296308753069e-06, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 3149864, |
| "step": 8360 |
| }, |
| { |
| "epoch": 7.495519713261649, |
| "grad_norm": 0.4037036597728729, |
| "learning_rate": 8.966278889586086e-06, |
| "loss": 0.6976, |
| "num_input_tokens_seen": 3151720, |
| "step": 8365 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.7624744772911072, |
| "learning_rate": 8.936300686174268e-06, |
| "loss": 0.7108, |
| "num_input_tokens_seen": 3153640, |
| "step": 8370 |
| }, |
| { |
| "epoch": 7.5, |
| "eval_loss": 0.7001423835754395, |
| "eval_runtime": 5.6376, |
| "eval_samples_per_second": 87.98, |
| "eval_steps_per_second": 21.995, |
| "num_input_tokens_seen": 3153640, |
| "step": 8370 |
| }, |
| { |
| "epoch": 7.504480286738351, |
| "grad_norm": 0.4228692650794983, |
| "learning_rate": 8.906361771839227e-06, |
| "loss": 0.6946, |
| "num_input_tokens_seen": 3155496, |
| "step": 8375 |
| }, |
| { |
| "epoch": 7.508960573476703, |
| "grad_norm": 0.7325205206871033, |
| "learning_rate": 8.876462219806456e-06, |
| "loss": 0.6861, |
| "num_input_tokens_seen": 3157448, |
| "step": 8380 |
| }, |
| { |
| "epoch": 7.513440860215054, |
| "grad_norm": 0.6230577230453491, |
| "learning_rate": 8.846602103205157e-06, |
| "loss": 0.6706, |
| "num_input_tokens_seen": 3159496, |
| "step": 8385 |
| }, |
| { |
| "epoch": 7.517921146953405, |
| "grad_norm": 0.6926305294036865, |
| "learning_rate": 8.816781495068125e-06, |
| "loss": 0.6744, |
| "num_input_tokens_seen": 3161320, |
| "step": 8390 |
| }, |
| { |
| "epoch": 7.522401433691757, |
| "grad_norm": 0.606583297252655, |
| "learning_rate": 8.787000468331463e-06, |
| "loss": 0.7136, |
| "num_input_tokens_seen": 3163144, |
| "step": 8395 |
| }, |
| { |
| "epoch": 7.526881720430108, |
| "grad_norm": 0.5779367089271545, |
| "learning_rate": 8.757259095834525e-06, |
| "loss": 0.7018, |
| "num_input_tokens_seen": 3164904, |
| "step": 8400 |
| }, |
| { |
| "epoch": 7.531362007168459, |
| "grad_norm": 0.588382363319397, |
| "learning_rate": 8.72755745031964e-06, |
| "loss": 0.6922, |
| "num_input_tokens_seen": 3166696, |
| "step": 8405 |
| }, |
| { |
| "epoch": 7.53584229390681, |
| "grad_norm": 0.4489257335662842, |
| "learning_rate": 8.697895604431974e-06, |
| "loss": 0.7202, |
| "num_input_tokens_seen": 3168456, |
| "step": 8410 |
| }, |
| { |
| "epoch": 7.540322580645161, |
| "grad_norm": 0.5579271912574768, |
| "learning_rate": 8.668273630719373e-06, |
| "loss": 0.7056, |
| "num_input_tokens_seen": 3170344, |
| "step": 8415 |
| }, |
| { |
| "epoch": 7.544802867383512, |
| "grad_norm": 0.6759196519851685, |
| "learning_rate": 8.638691601632152e-06, |
| "loss": 0.684, |
| "num_input_tokens_seen": 3172232, |
| "step": 8420 |
| }, |
| { |
| "epoch": 7.549283154121864, |
| "grad_norm": 0.45993009209632874, |
| "learning_rate": 8.609149589522894e-06, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 3174056, |
| "step": 8425 |
| }, |
| { |
| "epoch": 7.553763440860215, |
| "grad_norm": 0.5422730445861816, |
| "learning_rate": 8.579647666646361e-06, |
| "loss": 0.6939, |
| "num_input_tokens_seen": 3175944, |
| "step": 8430 |
| }, |
| { |
| "epoch": 7.558243727598566, |
| "grad_norm": 0.48670727014541626, |
| "learning_rate": 8.550185905159227e-06, |
| "loss": 0.6682, |
| "num_input_tokens_seen": 3177896, |
| "step": 8435 |
| }, |
| { |
| "epoch": 7.562724014336918, |
| "grad_norm": 0.6719427108764648, |
| "learning_rate": 8.520764377119964e-06, |
| "loss": 0.7217, |
| "num_input_tokens_seen": 3179912, |
| "step": 8440 |
| }, |
| { |
| "epoch": 7.567204301075269, |
| "grad_norm": 0.5485569834709167, |
| "learning_rate": 8.491383154488628e-06, |
| "loss": 0.7005, |
| "num_input_tokens_seen": 3181736, |
| "step": 8445 |
| }, |
| { |
| "epoch": 7.57168458781362, |
| "grad_norm": 0.47630730271339417, |
| "learning_rate": 8.462042309126664e-06, |
| "loss": 0.707, |
| "num_input_tokens_seen": 3183592, |
| "step": 8450 |
| }, |
| { |
| "epoch": 7.576164874551972, |
| "grad_norm": 0.6303143501281738, |
| "learning_rate": 8.432741912796821e-06, |
| "loss": 0.6824, |
| "num_input_tokens_seen": 3185448, |
| "step": 8455 |
| }, |
| { |
| "epoch": 7.580645161290323, |
| "grad_norm": 0.5015650391578674, |
| "learning_rate": 8.403482037162873e-06, |
| "loss": 0.688, |
| "num_input_tokens_seen": 3187368, |
| "step": 8460 |
| }, |
| { |
| "epoch": 7.585125448028673, |
| "grad_norm": 0.690214216709137, |
| "learning_rate": 8.374262753789493e-06, |
| "loss": 0.6917, |
| "num_input_tokens_seen": 3189192, |
| "step": 8465 |
| }, |
| { |
| "epoch": 7.589605734767025, |
| "grad_norm": 0.47096434235572815, |
| "learning_rate": 8.345084134142098e-06, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 3191112, |
| "step": 8470 |
| }, |
| { |
| "epoch": 7.594086021505376, |
| "grad_norm": 0.5318554639816284, |
| "learning_rate": 8.31594624958662e-06, |
| "loss": 0.7068, |
| "num_input_tokens_seen": 3192808, |
| "step": 8475 |
| }, |
| { |
| "epoch": 7.598566308243727, |
| "grad_norm": 0.808319628238678, |
| "learning_rate": 8.286849171389366e-06, |
| "loss": 0.6443, |
| "num_input_tokens_seen": 3194632, |
| "step": 8480 |
| }, |
| { |
| "epoch": 7.603046594982079, |
| "grad_norm": 0.44692009687423706, |
| "learning_rate": 8.257792970716846e-06, |
| "loss": 0.7021, |
| "num_input_tokens_seen": 3196488, |
| "step": 8485 |
| }, |
| { |
| "epoch": 7.60752688172043, |
| "grad_norm": 0.5783076882362366, |
| "learning_rate": 8.228777718635575e-06, |
| "loss": 0.6777, |
| "num_input_tokens_seen": 3198408, |
| "step": 8490 |
| }, |
| { |
| "epoch": 7.612007168458781, |
| "grad_norm": 0.5471696257591248, |
| "learning_rate": 8.19980348611194e-06, |
| "loss": 0.6884, |
| "num_input_tokens_seen": 3200264, |
| "step": 8495 |
| }, |
| { |
| "epoch": 7.616487455197133, |
| "grad_norm": 0.5129026174545288, |
| "learning_rate": 8.170870344011982e-06, |
| "loss": 0.7057, |
| "num_input_tokens_seen": 3202120, |
| "step": 8500 |
| }, |
| { |
| "epoch": 7.620967741935484, |
| "grad_norm": 0.5552482008934021, |
| "learning_rate": 8.141978363101243e-06, |
| "loss": 0.7117, |
| "num_input_tokens_seen": 3203976, |
| "step": 8505 |
| }, |
| { |
| "epoch": 7.625448028673835, |
| "grad_norm": 0.7430400252342224, |
| "learning_rate": 8.1131276140446e-06, |
| "loss": 0.7236, |
| "num_input_tokens_seen": 3205832, |
| "step": 8510 |
| }, |
| { |
| "epoch": 7.629928315412187, |
| "grad_norm": 0.5252755880355835, |
| "learning_rate": 8.084318167406063e-06, |
| "loss": 0.699, |
| "num_input_tokens_seen": 3207816, |
| "step": 8515 |
| }, |
| { |
| "epoch": 7.634408602150538, |
| "grad_norm": 0.5251414775848389, |
| "learning_rate": 8.055550093648665e-06, |
| "loss": 0.7018, |
| "num_input_tokens_seen": 3209768, |
| "step": 8520 |
| }, |
| { |
| "epoch": 7.638888888888889, |
| "grad_norm": 0.3986772298812866, |
| "learning_rate": 8.026823463134206e-06, |
| "loss": 0.6745, |
| "num_input_tokens_seen": 3211464, |
| "step": 8525 |
| }, |
| { |
| "epoch": 7.643369175627241, |
| "grad_norm": 0.4646606743335724, |
| "learning_rate": 7.99813834612314e-06, |
| "loss": 0.6786, |
| "num_input_tokens_seen": 3213320, |
| "step": 8530 |
| }, |
| { |
| "epoch": 7.647849462365591, |
| "grad_norm": 0.5674442648887634, |
| "learning_rate": 7.969494812774392e-06, |
| "loss": 0.6596, |
| "num_input_tokens_seen": 3215272, |
| "step": 8535 |
| }, |
| { |
| "epoch": 7.652329749103942, |
| "grad_norm": 0.42505696415901184, |
| "learning_rate": 7.940892933145156e-06, |
| "loss": 0.667, |
| "num_input_tokens_seen": 3217256, |
| "step": 8540 |
| }, |
| { |
| "epoch": 7.656810035842294, |
| "grad_norm": 0.5792336463928223, |
| "learning_rate": 7.91233277719079e-06, |
| "loss": 0.6752, |
| "num_input_tokens_seen": 3219016, |
| "step": 8545 |
| }, |
| { |
| "epoch": 7.661290322580645, |
| "grad_norm": 0.5960796475410461, |
| "learning_rate": 7.883814414764566e-06, |
| "loss": 0.7255, |
| "num_input_tokens_seen": 3220680, |
| "step": 8550 |
| }, |
| { |
| "epoch": 7.665770609318996, |
| "grad_norm": 0.8227483034133911, |
| "learning_rate": 7.855337915617548e-06, |
| "loss": 0.6573, |
| "num_input_tokens_seen": 3222344, |
| "step": 8555 |
| }, |
| { |
| "epoch": 7.670250896057348, |
| "grad_norm": 0.6761976480484009, |
| "learning_rate": 7.82690334939841e-06, |
| "loss": 0.6926, |
| "num_input_tokens_seen": 3224168, |
| "step": 8560 |
| }, |
| { |
| "epoch": 7.674731182795699, |
| "grad_norm": 0.4717724025249481, |
| "learning_rate": 7.798510785653263e-06, |
| "loss": 0.6832, |
| "num_input_tokens_seen": 3225992, |
| "step": 8565 |
| }, |
| { |
| "epoch": 7.67921146953405, |
| "grad_norm": 0.4982898533344269, |
| "learning_rate": 7.770160293825498e-06, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 3227912, |
| "step": 8570 |
| }, |
| { |
| "epoch": 7.683691756272402, |
| "grad_norm": 0.5173410177230835, |
| "learning_rate": 7.741851943255596e-06, |
| "loss": 0.7161, |
| "num_input_tokens_seen": 3229736, |
| "step": 8575 |
| }, |
| { |
| "epoch": 7.688172043010753, |
| "grad_norm": 0.9301579594612122, |
| "learning_rate": 7.713585803180956e-06, |
| "loss": 0.6635, |
| "num_input_tokens_seen": 3231720, |
| "step": 8580 |
| }, |
| { |
| "epoch": 7.692652329749104, |
| "grad_norm": 0.5519044995307922, |
| "learning_rate": 7.685361942735777e-06, |
| "loss": 0.7005, |
| "num_input_tokens_seen": 3233640, |
| "step": 8585 |
| }, |
| { |
| "epoch": 7.697132616487455, |
| "grad_norm": 0.34752702713012695, |
| "learning_rate": 7.657180430950794e-06, |
| "loss": 0.7004, |
| "num_input_tokens_seen": 3235400, |
| "step": 8590 |
| }, |
| { |
| "epoch": 7.701612903225806, |
| "grad_norm": 0.5915330648422241, |
| "learning_rate": 7.629041336753193e-06, |
| "loss": 0.7062, |
| "num_input_tokens_seen": 3237384, |
| "step": 8595 |
| }, |
| { |
| "epoch": 7.706093189964157, |
| "grad_norm": 0.6914758682250977, |
| "learning_rate": 7.600944728966433e-06, |
| "loss": 0.6589, |
| "num_input_tokens_seen": 3239496, |
| "step": 8600 |
| }, |
| { |
| "epoch": 7.710573476702509, |
| "grad_norm": 0.3407343029975891, |
| "learning_rate": 7.572890676310026e-06, |
| "loss": 0.6834, |
| "num_input_tokens_seen": 3241128, |
| "step": 8605 |
| }, |
| { |
| "epoch": 7.71505376344086, |
| "grad_norm": 0.7424318194389343, |
| "learning_rate": 7.544879247399417e-06, |
| "loss": 0.7125, |
| "num_input_tokens_seen": 3242920, |
| "step": 8610 |
| }, |
| { |
| "epoch": 7.719534050179211, |
| "grad_norm": 0.900806725025177, |
| "learning_rate": 7.516910510745795e-06, |
| "loss": 0.6943, |
| "num_input_tokens_seen": 3244680, |
| "step": 8615 |
| }, |
| { |
| "epoch": 7.724014336917563, |
| "grad_norm": 0.6076695322990417, |
| "learning_rate": 7.48898453475593e-06, |
| "loss": 0.6976, |
| "num_input_tokens_seen": 3246728, |
| "step": 8620 |
| }, |
| { |
| "epoch": 7.728494623655914, |
| "grad_norm": 0.4813416600227356, |
| "learning_rate": 7.46110138773202e-06, |
| "loss": 0.6916, |
| "num_input_tokens_seen": 3248712, |
| "step": 8625 |
| }, |
| { |
| "epoch": 7.732974910394265, |
| "grad_norm": 0.4648270905017853, |
| "learning_rate": 7.433261137871497e-06, |
| "loss": 0.6958, |
| "num_input_tokens_seen": 3250568, |
| "step": 8630 |
| }, |
| { |
| "epoch": 7.737455197132617, |
| "grad_norm": 0.49301114678382874, |
| "learning_rate": 7.405463853266869e-06, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 3252328, |
| "step": 8635 |
| }, |
| { |
| "epoch": 7.741935483870968, |
| "grad_norm": 0.7146629691123962, |
| "learning_rate": 7.377709601905594e-06, |
| "loss": 0.6818, |
| "num_input_tokens_seen": 3254248, |
| "step": 8640 |
| }, |
| { |
| "epoch": 7.746415770609319, |
| "grad_norm": 0.49218234419822693, |
| "learning_rate": 7.349998451669812e-06, |
| "loss": 0.6951, |
| "num_input_tokens_seen": 3256040, |
| "step": 8645 |
| }, |
| { |
| "epoch": 7.750896057347671, |
| "grad_norm": 0.43367111682891846, |
| "learning_rate": 7.3223304703363135e-06, |
| "loss": 0.7164, |
| "num_input_tokens_seen": 3257960, |
| "step": 8650 |
| }, |
| { |
| "epoch": 7.755376344086022, |
| "grad_norm": 0.43355652689933777, |
| "learning_rate": 7.294705725576267e-06, |
| "loss": 0.6817, |
| "num_input_tokens_seen": 3259880, |
| "step": 8655 |
| }, |
| { |
| "epoch": 7.759856630824372, |
| "grad_norm": 0.42015159130096436, |
| "learning_rate": 7.2671242849550905e-06, |
| "loss": 0.685, |
| "num_input_tokens_seen": 3261960, |
| "step": 8660 |
| }, |
| { |
| "epoch": 7.764336917562724, |
| "grad_norm": 0.5154469013214111, |
| "learning_rate": 7.239586215932323e-06, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 3263784, |
| "step": 8665 |
| }, |
| { |
| "epoch": 7.768817204301075, |
| "grad_norm": 0.4720630347728729, |
| "learning_rate": 7.212091585861363e-06, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 3265640, |
| "step": 8670 |
| }, |
| { |
| "epoch": 7.773297491039426, |
| "grad_norm": 0.4294196367263794, |
| "learning_rate": 7.184640461989431e-06, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 3267368, |
| "step": 8675 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.6301731467247009, |
| "learning_rate": 7.157232911457293e-06, |
| "loss": 0.709, |
| "num_input_tokens_seen": 3269096, |
| "step": 8680 |
| }, |
| { |
| "epoch": 7.782258064516129, |
| "grad_norm": 0.638538122177124, |
| "learning_rate": 7.12986900129915e-06, |
| "loss": 0.6918, |
| "num_input_tokens_seen": 3270920, |
| "step": 8685 |
| }, |
| { |
| "epoch": 7.78673835125448, |
| "grad_norm": 0.5257901549339294, |
| "learning_rate": 7.10254879844249e-06, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 3272840, |
| "step": 8690 |
| }, |
| { |
| "epoch": 7.791218637992832, |
| "grad_norm": 0.6461403965950012, |
| "learning_rate": 7.075272369707878e-06, |
| "loss": 0.6791, |
| "num_input_tokens_seen": 3274824, |
| "step": 8695 |
| }, |
| { |
| "epoch": 7.795698924731183, |
| "grad_norm": 0.6005759835243225, |
| "learning_rate": 7.048039781808816e-06, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 3276808, |
| "step": 8700 |
| }, |
| { |
| "epoch": 7.800179211469534, |
| "grad_norm": 0.5737102627754211, |
| "learning_rate": 7.020851101351583e-06, |
| "loss": 0.647, |
| "num_input_tokens_seen": 3279144, |
| "step": 8705 |
| }, |
| { |
| "epoch": 7.804659498207886, |
| "grad_norm": 0.6555394530296326, |
| "learning_rate": 6.993706394835062e-06, |
| "loss": 0.6987, |
| "num_input_tokens_seen": 3281128, |
| "step": 8710 |
| }, |
| { |
| "epoch": 7.809139784946236, |
| "grad_norm": 0.6129330396652222, |
| "learning_rate": 6.966605728650602e-06, |
| "loss": 0.7193, |
| "num_input_tokens_seen": 3282952, |
| "step": 8715 |
| }, |
| { |
| "epoch": 7.813620071684587, |
| "grad_norm": 0.4754336476325989, |
| "learning_rate": 6.939549169081827e-06, |
| "loss": 0.6926, |
| "num_input_tokens_seen": 3284904, |
| "step": 8720 |
| }, |
| { |
| "epoch": 7.818100358422939, |
| "grad_norm": 0.6485406756401062, |
| "learning_rate": 6.912536782304454e-06, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 3286760, |
| "step": 8725 |
| }, |
| { |
| "epoch": 7.82258064516129, |
| "grad_norm": 0.5529478788375854, |
| "learning_rate": 6.885568634386217e-06, |
| "loss": 0.68, |
| "num_input_tokens_seen": 3288584, |
| "step": 8730 |
| }, |
| { |
| "epoch": 7.827060931899641, |
| "grad_norm": 0.4761482775211334, |
| "learning_rate": 6.858644791286603e-06, |
| "loss": 0.6578, |
| "num_input_tokens_seen": 3290632, |
| "step": 8735 |
| }, |
| { |
| "epoch": 7.831541218637993, |
| "grad_norm": 0.6546998023986816, |
| "learning_rate": 6.83176531885675e-06, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 3292488, |
| "step": 8740 |
| }, |
| { |
| "epoch": 7.836021505376344, |
| "grad_norm": 0.4401530921459198, |
| "learning_rate": 6.804930282839295e-06, |
| "loss": 0.7002, |
| "num_input_tokens_seen": 3294376, |
| "step": 8745 |
| }, |
| { |
| "epoch": 7.840501792114695, |
| "grad_norm": 0.6239868402481079, |
| "learning_rate": 6.778139748868159e-06, |
| "loss": 0.6898, |
| "num_input_tokens_seen": 3296360, |
| "step": 8750 |
| }, |
| { |
| "epoch": 7.844982078853047, |
| "grad_norm": 0.6737553477287292, |
| "learning_rate": 6.751393782468438e-06, |
| "loss": 0.6658, |
| "num_input_tokens_seen": 3298152, |
| "step": 8755 |
| }, |
| { |
| "epoch": 7.849462365591398, |
| "grad_norm": 0.5571895837783813, |
| "learning_rate": 6.7246924490562135e-06, |
| "loss": 0.6665, |
| "num_input_tokens_seen": 3300104, |
| "step": 8760 |
| }, |
| { |
| "epoch": 7.853942652329749, |
| "grad_norm": 0.4849938750267029, |
| "learning_rate": 6.6980358139384e-06, |
| "loss": 0.6841, |
| "num_input_tokens_seen": 3301928, |
| "step": 8765 |
| }, |
| { |
| "epoch": 7.858422939068101, |
| "grad_norm": 0.4987306296825409, |
| "learning_rate": 6.671423942312608e-06, |
| "loss": 0.7039, |
| "num_input_tokens_seen": 3303816, |
| "step": 8770 |
| }, |
| { |
| "epoch": 7.862903225806452, |
| "grad_norm": 0.45879921317100525, |
| "learning_rate": 6.6448568992669434e-06, |
| "loss": 0.6825, |
| "num_input_tokens_seen": 3305704, |
| "step": 8775 |
| }, |
| { |
| "epoch": 7.867383512544803, |
| "grad_norm": 0.5061188340187073, |
| "learning_rate": 6.6183347497798755e-06, |
| "loss": 0.7011, |
| "num_input_tokens_seen": 3307656, |
| "step": 8780 |
| }, |
| { |
| "epoch": 7.871863799283155, |
| "grad_norm": 0.4768591523170471, |
| "learning_rate": 6.591857558720071e-06, |
| "loss": 0.6836, |
| "num_input_tokens_seen": 3309608, |
| "step": 8785 |
| }, |
| { |
| "epoch": 7.876344086021505, |
| "grad_norm": 0.3989751636981964, |
| "learning_rate": 6.565425390846233e-06, |
| "loss": 0.7049, |
| "num_input_tokens_seen": 3311368, |
| "step": 8790 |
| }, |
| { |
| "epoch": 7.880824372759856, |
| "grad_norm": 0.8028482794761658, |
| "learning_rate": 6.539038310806958e-06, |
| "loss": 0.7282, |
| "num_input_tokens_seen": 3313352, |
| "step": 8795 |
| }, |
| { |
| "epoch": 7.885304659498208, |
| "grad_norm": 0.45631352066993713, |
| "learning_rate": 6.512696383140551e-06, |
| "loss": 0.6603, |
| "num_input_tokens_seen": 3315240, |
| "step": 8800 |
| }, |
| { |
| "epoch": 7.889784946236559, |
| "grad_norm": 0.5273287296295166, |
| "learning_rate": 6.48639967227489e-06, |
| "loss": 0.7098, |
| "num_input_tokens_seen": 3317032, |
| "step": 8805 |
| }, |
| { |
| "epoch": 7.89426523297491, |
| "grad_norm": 0.5012726783752441, |
| "learning_rate": 6.460148242527253e-06, |
| "loss": 0.6982, |
| "num_input_tokens_seen": 3319048, |
| "step": 8810 |
| }, |
| { |
| "epoch": 7.898745519713262, |
| "grad_norm": 0.9831424951553345, |
| "learning_rate": 6.4339421581041725e-06, |
| "loss": 0.699, |
| "num_input_tokens_seen": 3320936, |
| "step": 8815 |
| }, |
| { |
| "epoch": 7.903225806451613, |
| "grad_norm": 0.6299842596054077, |
| "learning_rate": 6.407781483101283e-06, |
| "loss": 0.6759, |
| "num_input_tokens_seen": 3322760, |
| "step": 8820 |
| }, |
| { |
| "epoch": 7.907706093189964, |
| "grad_norm": 0.5512109994888306, |
| "learning_rate": 6.38166628150314e-06, |
| "loss": 0.6783, |
| "num_input_tokens_seen": 3324584, |
| "step": 8825 |
| }, |
| { |
| "epoch": 7.912186379928316, |
| "grad_norm": 0.6959258317947388, |
| "learning_rate": 6.355596617183091e-06, |
| "loss": 0.7322, |
| "num_input_tokens_seen": 3326600, |
| "step": 8830 |
| }, |
| { |
| "epoch": 7.916666666666667, |
| "grad_norm": 0.47916871309280396, |
| "learning_rate": 6.329572553903096e-06, |
| "loss": 0.705, |
| "num_input_tokens_seen": 3328456, |
| "step": 8835 |
| }, |
| { |
| "epoch": 7.921146953405018, |
| "grad_norm": 0.7595999240875244, |
| "learning_rate": 6.303594155313583e-06, |
| "loss": 0.6839, |
| "num_input_tokens_seen": 3330472, |
| "step": 8840 |
| }, |
| { |
| "epoch": 7.925627240143369, |
| "grad_norm": 0.48006659746170044, |
| "learning_rate": 6.277661484953309e-06, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 3332488, |
| "step": 8845 |
| }, |
| { |
| "epoch": 7.93010752688172, |
| "grad_norm": 0.6921666860580444, |
| "learning_rate": 6.251774606249172e-06, |
| "loss": 0.6787, |
| "num_input_tokens_seen": 3334376, |
| "step": 8850 |
| }, |
| { |
| "epoch": 7.934587813620071, |
| "grad_norm": 0.6116976141929626, |
| "learning_rate": 6.225933582516069e-06, |
| "loss": 0.7175, |
| "num_input_tokens_seen": 3336264, |
| "step": 8855 |
| }, |
| { |
| "epoch": 7.939068100358423, |
| "grad_norm": 0.4193064570426941, |
| "learning_rate": 6.200138476956766e-06, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 3338024, |
| "step": 8860 |
| }, |
| { |
| "epoch": 7.943548387096774, |
| "grad_norm": 0.48867300152778625, |
| "learning_rate": 6.174389352661686e-06, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 3340008, |
| "step": 8865 |
| }, |
| { |
| "epoch": 7.948028673835125, |
| "grad_norm": 0.7177107930183411, |
| "learning_rate": 6.148686272608809e-06, |
| "loss": 0.7311, |
| "num_input_tokens_seen": 3341864, |
| "step": 8870 |
| }, |
| { |
| "epoch": 7.952508960573477, |
| "grad_norm": 0.5652689933776855, |
| "learning_rate": 6.12302929966351e-06, |
| "loss": 0.6862, |
| "num_input_tokens_seen": 3343752, |
| "step": 8875 |
| }, |
| { |
| "epoch": 7.956989247311828, |
| "grad_norm": 0.4583767354488373, |
| "learning_rate": 6.097418496578369e-06, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 3345672, |
| "step": 8880 |
| }, |
| { |
| "epoch": 7.961469534050179, |
| "grad_norm": 0.38024917244911194, |
| "learning_rate": 6.0718539259930766e-06, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 3347624, |
| "step": 8885 |
| }, |
| { |
| "epoch": 7.965949820788531, |
| "grad_norm": 0.7006999850273132, |
| "learning_rate": 6.046335650434201e-06, |
| "loss": 0.7096, |
| "num_input_tokens_seen": 3349480, |
| "step": 8890 |
| }, |
| { |
| "epoch": 7.970430107526882, |
| "grad_norm": 0.6173056364059448, |
| "learning_rate": 6.020863732315108e-06, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 3351400, |
| "step": 8895 |
| }, |
| { |
| "epoch": 7.974910394265233, |
| "grad_norm": 0.5322303771972656, |
| "learning_rate": 5.9954382339357905e-06, |
| "loss": 0.6849, |
| "num_input_tokens_seen": 3353352, |
| "step": 8900 |
| }, |
| { |
| "epoch": 7.979390681003585, |
| "grad_norm": 0.5544753670692444, |
| "learning_rate": 5.970059217482685e-06, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 3355176, |
| "step": 8905 |
| }, |
| { |
| "epoch": 7.983870967741936, |
| "grad_norm": 0.5962495803833008, |
| "learning_rate": 5.944726745028545e-06, |
| "loss": 0.6769, |
| "num_input_tokens_seen": 3357224, |
| "step": 8910 |
| }, |
| { |
| "epoch": 7.988351254480286, |
| "grad_norm": 0.7063138484954834, |
| "learning_rate": 5.919440878532312e-06, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 3358984, |
| "step": 8915 |
| }, |
| { |
| "epoch": 7.992831541218638, |
| "grad_norm": 0.6204646825790405, |
| "learning_rate": 5.894201679838885e-06, |
| "loss": 0.6851, |
| "num_input_tokens_seen": 3361032, |
| "step": 8920 |
| }, |
| { |
| "epoch": 7.997311827956989, |
| "grad_norm": 0.5719345808029175, |
| "learning_rate": 5.869009210679074e-06, |
| "loss": 0.7097, |
| "num_input_tokens_seen": 3363048, |
| "step": 8925 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.6994269490242004, |
| "eval_runtime": 5.6261, |
| "eval_samples_per_second": 88.16, |
| "eval_steps_per_second": 22.04, |
| "num_input_tokens_seen": 3363864, |
| "step": 8928 |
| }, |
| { |
| "epoch": 8.001792114695341, |
| "grad_norm": 0.42449355125427246, |
| "learning_rate": 5.8438635326693664e-06, |
| "loss": 0.6755, |
| "num_input_tokens_seen": 3364600, |
| "step": 8930 |
| }, |
| { |
| "epoch": 8.006272401433693, |
| "grad_norm": 0.5668836236000061, |
| "learning_rate": 5.818764707311811e-06, |
| "loss": 0.6986, |
| "num_input_tokens_seen": 3366360, |
| "step": 8935 |
| }, |
| { |
| "epoch": 8.010752688172044, |
| "grad_norm": 0.6535866856575012, |
| "learning_rate": 5.7937127959938806e-06, |
| "loss": 0.7205, |
| "num_input_tokens_seen": 3368312, |
| "step": 8940 |
| }, |
| { |
| "epoch": 8.015232974910393, |
| "grad_norm": 0.507878303527832, |
| "learning_rate": 5.768707859988267e-06, |
| "loss": 0.6678, |
| "num_input_tokens_seen": 3370200, |
| "step": 8945 |
| }, |
| { |
| "epoch": 8.019713261648745, |
| "grad_norm": 0.6113376021385193, |
| "learning_rate": 5.7437499604528125e-06, |
| "loss": 0.7054, |
| "num_input_tokens_seen": 3372056, |
| "step": 8950 |
| }, |
| { |
| "epoch": 8.024193548387096, |
| "grad_norm": 0.6190397143363953, |
| "learning_rate": 5.7188391584302895e-06, |
| "loss": 0.6893, |
| "num_input_tokens_seen": 3373976, |
| "step": 8955 |
| }, |
| { |
| "epoch": 8.028673835125447, |
| "grad_norm": 0.4994731545448303, |
| "learning_rate": 5.693975514848271e-06, |
| "loss": 0.6849, |
| "num_input_tokens_seen": 3375960, |
| "step": 8960 |
| }, |
| { |
| "epoch": 8.033154121863799, |
| "grad_norm": 0.703480064868927, |
| "learning_rate": 5.669159090519019e-06, |
| "loss": 0.7006, |
| "num_input_tokens_seen": 3377880, |
| "step": 8965 |
| }, |
| { |
| "epoch": 8.03763440860215, |
| "grad_norm": 0.559497594833374, |
| "learning_rate": 5.644389946139278e-06, |
| "loss": 0.6633, |
| "num_input_tokens_seen": 3379768, |
| "step": 8970 |
| }, |
| { |
| "epoch": 8.042114695340501, |
| "grad_norm": 0.5711075663566589, |
| "learning_rate": 5.6196681422901634e-06, |
| "loss": 0.6687, |
| "num_input_tokens_seen": 3381560, |
| "step": 8975 |
| }, |
| { |
| "epoch": 8.046594982078853, |
| "grad_norm": 0.6641356348991394, |
| "learning_rate": 5.594993739437007e-06, |
| "loss": 0.6936, |
| "num_input_tokens_seen": 3383544, |
| "step": 8980 |
| }, |
| { |
| "epoch": 8.051075268817204, |
| "grad_norm": 0.7046183347702026, |
| "learning_rate": 5.5703667979291915e-06, |
| "loss": 0.6733, |
| "num_input_tokens_seen": 3385272, |
| "step": 8985 |
| }, |
| { |
| "epoch": 8.055555555555555, |
| "grad_norm": 0.9060787558555603, |
| "learning_rate": 5.545787378000039e-06, |
| "loss": 0.6793, |
| "num_input_tokens_seen": 3387256, |
| "step": 8990 |
| }, |
| { |
| "epoch": 8.060035842293907, |
| "grad_norm": 0.48104485869407654, |
| "learning_rate": 5.521255539766637e-06, |
| "loss": 0.6987, |
| "num_input_tokens_seen": 3389144, |
| "step": 8995 |
| }, |
| { |
| "epoch": 8.064516129032258, |
| "grad_norm": 0.5199661254882812, |
| "learning_rate": 5.4967713432296674e-06, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 3390904, |
| "step": 9000 |
| }, |
| { |
| "epoch": 8.06899641577061, |
| "grad_norm": 0.4922904074192047, |
| "learning_rate": 5.472334848273328e-06, |
| "loss": 0.6865, |
| "num_input_tokens_seen": 3392792, |
| "step": 9005 |
| }, |
| { |
| "epoch": 8.07347670250896, |
| "grad_norm": 0.44281700253486633, |
| "learning_rate": 5.44794611466512e-06, |
| "loss": 0.686, |
| "num_input_tokens_seen": 3394744, |
| "step": 9010 |
| }, |
| { |
| "epoch": 8.077956989247312, |
| "grad_norm": 0.4282800257205963, |
| "learning_rate": 5.4236052020557535e-06, |
| "loss": 0.6553, |
| "num_input_tokens_seen": 3396632, |
| "step": 9015 |
| }, |
| { |
| "epoch": 8.082437275985663, |
| "grad_norm": 0.7372511625289917, |
| "learning_rate": 5.399312169978949e-06, |
| "loss": 0.6899, |
| "num_input_tokens_seen": 3398424, |
| "step": 9020 |
| }, |
| { |
| "epoch": 8.086917562724015, |
| "grad_norm": 0.48595067858695984, |
| "learning_rate": 5.375067077851337e-06, |
| "loss": 0.6586, |
| "num_input_tokens_seen": 3400312, |
| "step": 9025 |
| }, |
| { |
| "epoch": 8.091397849462366, |
| "grad_norm": 0.5720658898353577, |
| "learning_rate": 5.350869984972287e-06, |
| "loss": 0.7233, |
| "num_input_tokens_seen": 3402200, |
| "step": 9030 |
| }, |
| { |
| "epoch": 8.095878136200717, |
| "grad_norm": 0.38736122846603394, |
| "learning_rate": 5.326720950523772e-06, |
| "loss": 0.6877, |
| "num_input_tokens_seen": 3404152, |
| "step": 9035 |
| }, |
| { |
| "epoch": 8.100358422939069, |
| "grad_norm": 0.5513134002685547, |
| "learning_rate": 5.302620033570222e-06, |
| "loss": 0.7067, |
| "num_input_tokens_seen": 3405912, |
| "step": 9040 |
| }, |
| { |
| "epoch": 8.10483870967742, |
| "grad_norm": 0.5928265452384949, |
| "learning_rate": 5.27856729305839e-06, |
| "loss": 0.713, |
| "num_input_tokens_seen": 3407672, |
| "step": 9045 |
| }, |
| { |
| "epoch": 8.109318996415771, |
| "grad_norm": 0.838718056678772, |
| "learning_rate": 5.254562787817183e-06, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 3409496, |
| "step": 9050 |
| }, |
| { |
| "epoch": 8.113799283154123, |
| "grad_norm": 0.5603805184364319, |
| "learning_rate": 5.23060657655754e-06, |
| "loss": 0.6567, |
| "num_input_tokens_seen": 3411352, |
| "step": 9055 |
| }, |
| { |
| "epoch": 8.118279569892474, |
| "grad_norm": 0.5363320112228394, |
| "learning_rate": 5.206698717872277e-06, |
| "loss": 0.6803, |
| "num_input_tokens_seen": 3413432, |
| "step": 9060 |
| }, |
| { |
| "epoch": 8.122759856630825, |
| "grad_norm": 0.5360172390937805, |
| "learning_rate": 5.1828392702359504e-06, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 3415320, |
| "step": 9065 |
| }, |
| { |
| "epoch": 8.127240143369175, |
| "grad_norm": 0.5474600791931152, |
| "learning_rate": 5.159028292004717e-06, |
| "loss": 0.6764, |
| "num_input_tokens_seen": 3417240, |
| "step": 9070 |
| }, |
| { |
| "epoch": 8.131720430107526, |
| "grad_norm": 0.5112743377685547, |
| "learning_rate": 5.1352658414161785e-06, |
| "loss": 0.6878, |
| "num_input_tokens_seen": 3419192, |
| "step": 9075 |
| }, |
| { |
| "epoch": 8.136200716845877, |
| "grad_norm": 0.784028172492981, |
| "learning_rate": 5.111551976589249e-06, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 3421208, |
| "step": 9080 |
| }, |
| { |
| "epoch": 8.140681003584229, |
| "grad_norm": 0.6619753837585449, |
| "learning_rate": 5.087886755524005e-06, |
| "loss": 0.695, |
| "num_input_tokens_seen": 3423064, |
| "step": 9085 |
| }, |
| { |
| "epoch": 8.14516129032258, |
| "grad_norm": 0.4583960771560669, |
| "learning_rate": 5.064270236101548e-06, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 3424984, |
| "step": 9090 |
| }, |
| { |
| "epoch": 8.149641577060931, |
| "grad_norm": 0.6008018851280212, |
| "learning_rate": 5.040702476083883e-06, |
| "loss": 0.6968, |
| "num_input_tokens_seen": 3426936, |
| "step": 9095 |
| }, |
| { |
| "epoch": 8.154121863799283, |
| "grad_norm": 0.4817352592945099, |
| "learning_rate": 5.0171835331137365e-06, |
| "loss": 0.691, |
| "num_input_tokens_seen": 3428696, |
| "step": 9100 |
| }, |
| { |
| "epoch": 8.158602150537634, |
| "grad_norm": 0.5647047162055969, |
| "learning_rate": 4.993713464714433e-06, |
| "loss": 0.6788, |
| "num_input_tokens_seen": 3430744, |
| "step": 9105 |
| }, |
| { |
| "epoch": 8.163082437275985, |
| "grad_norm": 0.503331184387207, |
| "learning_rate": 4.970292328289794e-06, |
| "loss": 0.6684, |
| "num_input_tokens_seen": 3432696, |
| "step": 9110 |
| }, |
| { |
| "epoch": 8.167562724014337, |
| "grad_norm": 0.39130842685699463, |
| "learning_rate": 4.946920181123904e-06, |
| "loss": 0.6576, |
| "num_input_tokens_seen": 3434424, |
| "step": 9115 |
| }, |
| { |
| "epoch": 8.172043010752688, |
| "grad_norm": 0.6829502582550049, |
| "learning_rate": 4.9235970803810845e-06, |
| "loss": 0.7095, |
| "num_input_tokens_seen": 3436312, |
| "step": 9120 |
| }, |
| { |
| "epoch": 8.17652329749104, |
| "grad_norm": 0.5101547241210938, |
| "learning_rate": 4.900323083105668e-06, |
| "loss": 0.6655, |
| "num_input_tokens_seen": 3438328, |
| "step": 9125 |
| }, |
| { |
| "epoch": 8.18100358422939, |
| "grad_norm": 0.3391803205013275, |
| "learning_rate": 4.877098246221881e-06, |
| "loss": 0.6881, |
| "num_input_tokens_seen": 3440088, |
| "step": 9130 |
| }, |
| { |
| "epoch": 8.185483870967742, |
| "grad_norm": 0.4887087643146515, |
| "learning_rate": 4.853922626533749e-06, |
| "loss": 0.6946, |
| "num_input_tokens_seen": 3441912, |
| "step": 9135 |
| }, |
| { |
| "epoch": 8.189964157706093, |
| "grad_norm": 0.48914480209350586, |
| "learning_rate": 4.830796280724873e-06, |
| "loss": 0.6883, |
| "num_input_tokens_seen": 3443832, |
| "step": 9140 |
| }, |
| { |
| "epoch": 8.194444444444445, |
| "grad_norm": 0.4121975302696228, |
| "learning_rate": 4.807719265358377e-06, |
| "loss": 0.6959, |
| "num_input_tokens_seen": 3445720, |
| "step": 9145 |
| }, |
| { |
| "epoch": 8.198924731182796, |
| "grad_norm": 0.4341879189014435, |
| "learning_rate": 4.7846916368767094e-06, |
| "loss": 0.6814, |
| "num_input_tokens_seen": 3447544, |
| "step": 9150 |
| }, |
| { |
| "epoch": 8.203405017921147, |
| "grad_norm": 0.6341235041618347, |
| "learning_rate": 4.761713451601532e-06, |
| "loss": 0.6919, |
| "num_input_tokens_seen": 3449400, |
| "step": 9155 |
| }, |
| { |
| "epoch": 8.207885304659499, |
| "grad_norm": 0.3464028835296631, |
| "learning_rate": 4.738784765733586e-06, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 3451256, |
| "step": 9160 |
| }, |
| { |
| "epoch": 8.21236559139785, |
| "grad_norm": 0.4516923129558563, |
| "learning_rate": 4.715905635352541e-06, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 3453240, |
| "step": 9165 |
| }, |
| { |
| "epoch": 8.216845878136201, |
| "grad_norm": 0.5117191672325134, |
| "learning_rate": 4.6930761164168395e-06, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 3455064, |
| "step": 9170 |
| }, |
| { |
| "epoch": 8.221326164874553, |
| "grad_norm": 0.5112479329109192, |
| "learning_rate": 4.670296264763618e-06, |
| "loss": 0.6783, |
| "num_input_tokens_seen": 3456888, |
| "step": 9175 |
| }, |
| { |
| "epoch": 8.225806451612904, |
| "grad_norm": 0.6857882738113403, |
| "learning_rate": 4.6475661361085195e-06, |
| "loss": 0.7114, |
| "num_input_tokens_seen": 3458776, |
| "step": 9180 |
| }, |
| { |
| "epoch": 8.230286738351255, |
| "grad_norm": 0.6740723848342896, |
| "learning_rate": 4.624885786045563e-06, |
| "loss": 0.6879, |
| "num_input_tokens_seen": 3460600, |
| "step": 9185 |
| }, |
| { |
| "epoch": 8.234767025089607, |
| "grad_norm": 0.5737828016281128, |
| "learning_rate": 4.602255270047048e-06, |
| "loss": 0.68, |
| "num_input_tokens_seen": 3462552, |
| "step": 9190 |
| }, |
| { |
| "epoch": 8.239247311827956, |
| "grad_norm": 0.5475903749465942, |
| "learning_rate": 4.579674643463341e-06, |
| "loss": 0.7221, |
| "num_input_tokens_seen": 3464568, |
| "step": 9195 |
| }, |
| { |
| "epoch": 8.243727598566307, |
| "grad_norm": 0.43242955207824707, |
| "learning_rate": 4.557143961522836e-06, |
| "loss": 0.6986, |
| "num_input_tokens_seen": 3466328, |
| "step": 9200 |
| }, |
| { |
| "epoch": 8.248207885304659, |
| "grad_norm": 0.6515691876411438, |
| "learning_rate": 4.534663279331744e-06, |
| "loss": 0.6631, |
| "num_input_tokens_seen": 3468248, |
| "step": 9205 |
| }, |
| { |
| "epoch": 8.25268817204301, |
| "grad_norm": 0.6092105507850647, |
| "learning_rate": 4.512232651873982e-06, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 3470200, |
| "step": 9210 |
| }, |
| { |
| "epoch": 8.257168458781361, |
| "grad_norm": 0.6402620077133179, |
| "learning_rate": 4.489852134011061e-06, |
| "loss": 0.6802, |
| "num_input_tokens_seen": 3472184, |
| "step": 9215 |
| }, |
| { |
| "epoch": 8.261648745519713, |
| "grad_norm": 0.4998003840446472, |
| "learning_rate": 4.46752178048192e-06, |
| "loss": 0.6781, |
| "num_input_tokens_seen": 3474008, |
| "step": 9220 |
| }, |
| { |
| "epoch": 8.266129032258064, |
| "grad_norm": 0.548748254776001, |
| "learning_rate": 4.445241645902804e-06, |
| "loss": 0.6829, |
| "num_input_tokens_seen": 3475896, |
| "step": 9225 |
| }, |
| { |
| "epoch": 8.270609318996415, |
| "grad_norm": 0.5525373220443726, |
| "learning_rate": 4.423011784767133e-06, |
| "loss": 0.683, |
| "num_input_tokens_seen": 3477880, |
| "step": 9230 |
| }, |
| { |
| "epoch": 8.275089605734767, |
| "grad_norm": 0.7804484963417053, |
| "learning_rate": 4.400832251445361e-06, |
| "loss": 0.6933, |
| "num_input_tokens_seen": 3479832, |
| "step": 9235 |
| }, |
| { |
| "epoch": 8.279569892473118, |
| "grad_norm": 0.5897542834281921, |
| "learning_rate": 4.378703100184869e-06, |
| "loss": 0.677, |
| "num_input_tokens_seen": 3481976, |
| "step": 9240 |
| }, |
| { |
| "epoch": 8.28405017921147, |
| "grad_norm": 0.335835337638855, |
| "learning_rate": 4.35662438510979e-06, |
| "loss": 0.6744, |
| "num_input_tokens_seen": 3483832, |
| "step": 9245 |
| }, |
| { |
| "epoch": 8.28853046594982, |
| "grad_norm": 0.7230558395385742, |
| "learning_rate": 4.334596160220905e-06, |
| "loss": 0.696, |
| "num_input_tokens_seen": 3485720, |
| "step": 9250 |
| }, |
| { |
| "epoch": 8.293010752688172, |
| "grad_norm": 0.5177806615829468, |
| "learning_rate": 4.312618479395506e-06, |
| "loss": 0.717, |
| "num_input_tokens_seen": 3487640, |
| "step": 9255 |
| }, |
| { |
| "epoch": 8.297491039426523, |
| "grad_norm": 0.6857832670211792, |
| "learning_rate": 4.290691396387258e-06, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 3489688, |
| "step": 9260 |
| }, |
| { |
| "epoch": 8.301971326164875, |
| "grad_norm": 0.443286269903183, |
| "learning_rate": 4.268814964826093e-06, |
| "loss": 0.7012, |
| "num_input_tokens_seen": 3491512, |
| "step": 9265 |
| }, |
| { |
| "epoch": 8.306451612903226, |
| "grad_norm": 0.6708531975746155, |
| "learning_rate": 4.24698923821803e-06, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 3493560, |
| "step": 9270 |
| }, |
| { |
| "epoch": 8.310931899641577, |
| "grad_norm": 0.5990952253341675, |
| "learning_rate": 4.225214269945088e-06, |
| "loss": 0.7024, |
| "num_input_tokens_seen": 3495608, |
| "step": 9275 |
| }, |
| { |
| "epoch": 8.315412186379929, |
| "grad_norm": 0.5891165137290955, |
| "learning_rate": 4.203490113265138e-06, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 3497464, |
| "step": 9280 |
| }, |
| { |
| "epoch": 8.31989247311828, |
| "grad_norm": 0.5316137671470642, |
| "learning_rate": 4.181816821311763e-06, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 3499416, |
| "step": 9285 |
| }, |
| { |
| "epoch": 8.324372759856631, |
| "grad_norm": 0.7413282990455627, |
| "learning_rate": 4.160194447094162e-06, |
| "loss": 0.6805, |
| "num_input_tokens_seen": 3501400, |
| "step": 9290 |
| }, |
| { |
| "epoch": 8.328853046594983, |
| "grad_norm": 0.41643762588500977, |
| "learning_rate": 4.138623043496981e-06, |
| "loss": 0.7057, |
| "num_input_tokens_seen": 3503160, |
| "step": 9295 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.5769673585891724, |
| "learning_rate": 4.1171026632802035e-06, |
| "loss": 0.6817, |
| "num_input_tokens_seen": 3504952, |
| "step": 9300 |
| }, |
| { |
| "epoch": 8.337813620071685, |
| "grad_norm": 0.5316203236579895, |
| "learning_rate": 4.095633359079024e-06, |
| "loss": 0.6856, |
| "num_input_tokens_seen": 3506680, |
| "step": 9305 |
| }, |
| { |
| "epoch": 8.342293906810037, |
| "grad_norm": 0.6140154600143433, |
| "learning_rate": 4.074215183403701e-06, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 3508600, |
| "step": 9310 |
| }, |
| { |
| "epoch": 8.346774193548388, |
| "grad_norm": 0.440112441778183, |
| "learning_rate": 4.052848188639452e-06, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 3510520, |
| "step": 9315 |
| }, |
| { |
| "epoch": 8.351254480286737, |
| "grad_norm": 0.4764098823070526, |
| "learning_rate": 4.031532427046322e-06, |
| "loss": 0.6773, |
| "num_input_tokens_seen": 3512312, |
| "step": 9320 |
| }, |
| { |
| "epoch": 8.355734767025089, |
| "grad_norm": 0.7385228276252747, |
| "learning_rate": 4.010267950759025e-06, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 3514200, |
| "step": 9325 |
| }, |
| { |
| "epoch": 8.36021505376344, |
| "grad_norm": 0.41506174206733704, |
| "learning_rate": 3.989054811786874e-06, |
| "loss": 0.7099, |
| "num_input_tokens_seen": 3516024, |
| "step": 9330 |
| }, |
| { |
| "epoch": 8.364695340501791, |
| "grad_norm": 0.5923046469688416, |
| "learning_rate": 3.967893062013581e-06, |
| "loss": 0.6867, |
| "num_input_tokens_seen": 3517912, |
| "step": 9335 |
| }, |
| { |
| "epoch": 8.369175627240143, |
| "grad_norm": 0.470580518245697, |
| "learning_rate": 3.946782753197187e-06, |
| "loss": 0.6977, |
| "num_input_tokens_seen": 3519768, |
| "step": 9340 |
| }, |
| { |
| "epoch": 8.373655913978494, |
| "grad_norm": 0.4065123200416565, |
| "learning_rate": 3.925723936969927e-06, |
| "loss": 0.6779, |
| "num_input_tokens_seen": 3521560, |
| "step": 9345 |
| }, |
| { |
| "epoch": 8.378136200716845, |
| "grad_norm": 0.5575389266014099, |
| "learning_rate": 3.9047166648380844e-06, |
| "loss": 0.7009, |
| "num_input_tokens_seen": 3523448, |
| "step": 9350 |
| }, |
| { |
| "epoch": 8.382616487455197, |
| "grad_norm": 0.5877669453620911, |
| "learning_rate": 3.883760988181867e-06, |
| "loss": 0.6966, |
| "num_input_tokens_seen": 3525176, |
| "step": 9355 |
| }, |
| { |
| "epoch": 8.387096774193548, |
| "grad_norm": 0.6943244338035583, |
| "learning_rate": 3.862856958255304e-06, |
| "loss": 0.7053, |
| "num_input_tokens_seen": 3527128, |
| "step": 9360 |
| }, |
| { |
| "epoch": 8.3915770609319, |
| "grad_norm": 0.5657069087028503, |
| "learning_rate": 3.842004626186085e-06, |
| "loss": 0.6851, |
| "num_input_tokens_seen": 3528856, |
| "step": 9365 |
| }, |
| { |
| "epoch": 8.39605734767025, |
| "grad_norm": 0.40441882610321045, |
| "learning_rate": 3.821204042975482e-06, |
| "loss": 0.7008, |
| "num_input_tokens_seen": 3530648, |
| "step": 9370 |
| }, |
| { |
| "epoch": 8.400537634408602, |
| "grad_norm": 0.43344125151634216, |
| "learning_rate": 3.8004552594981815e-06, |
| "loss": 0.7028, |
| "num_input_tokens_seen": 3532376, |
| "step": 9375 |
| }, |
| { |
| "epoch": 8.405017921146953, |
| "grad_norm": 0.5527206659317017, |
| "learning_rate": 3.77975832650217e-06, |
| "loss": 0.7186, |
| "num_input_tokens_seen": 3534200, |
| "step": 9380 |
| }, |
| { |
| "epoch": 8.409498207885305, |
| "grad_norm": 0.5849681496620178, |
| "learning_rate": 3.7591132946086434e-06, |
| "loss": 0.6592, |
| "num_input_tokens_seen": 3536376, |
| "step": 9385 |
| }, |
| { |
| "epoch": 8.413978494623656, |
| "grad_norm": 0.6648504734039307, |
| "learning_rate": 3.7385202143118192e-06, |
| "loss": 0.681, |
| "num_input_tokens_seen": 3538392, |
| "step": 9390 |
| }, |
| { |
| "epoch": 8.418458781362007, |
| "grad_norm": 0.49053147435188293, |
| "learning_rate": 3.717979135978883e-06, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 3540280, |
| "step": 9395 |
| }, |
| { |
| "epoch": 8.422939068100359, |
| "grad_norm": 0.6155134439468384, |
| "learning_rate": 3.697490109849816e-06, |
| "loss": 0.694, |
| "num_input_tokens_seen": 3542136, |
| "step": 9400 |
| }, |
| { |
| "epoch": 8.42741935483871, |
| "grad_norm": 0.583812415599823, |
| "learning_rate": 3.6770531860372853e-06, |
| "loss": 0.694, |
| "num_input_tokens_seen": 3543992, |
| "step": 9405 |
| }, |
| { |
| "epoch": 8.431899641577061, |
| "grad_norm": 0.4418407082557678, |
| "learning_rate": 3.6566684145265483e-06, |
| "loss": 0.6877, |
| "num_input_tokens_seen": 3545784, |
| "step": 9410 |
| }, |
| { |
| "epoch": 8.436379928315413, |
| "grad_norm": 0.6325966715812683, |
| "learning_rate": 3.636335845175265e-06, |
| "loss": 0.6729, |
| "num_input_tokens_seen": 3547800, |
| "step": 9415 |
| }, |
| { |
| "epoch": 8.440860215053764, |
| "grad_norm": 0.3695630729198456, |
| "learning_rate": 3.616055527713463e-06, |
| "loss": 0.7098, |
| "num_input_tokens_seen": 3549528, |
| "step": 9420 |
| }, |
| { |
| "epoch": 8.445340501792115, |
| "grad_norm": 0.6490418910980225, |
| "learning_rate": 3.595827511743341e-06, |
| "loss": 0.7014, |
| "num_input_tokens_seen": 3551416, |
| "step": 9425 |
| }, |
| { |
| "epoch": 8.449820788530467, |
| "grad_norm": 0.6782343983650208, |
| "learning_rate": 3.575651846739181e-06, |
| "loss": 0.6723, |
| "num_input_tokens_seen": 3553336, |
| "step": 9430 |
| }, |
| { |
| "epoch": 8.454301075268818, |
| "grad_norm": 0.6364790797233582, |
| "learning_rate": 3.5555285820472435e-06, |
| "loss": 0.6909, |
| "num_input_tokens_seen": 3555096, |
| "step": 9435 |
| }, |
| { |
| "epoch": 8.45878136200717, |
| "grad_norm": 0.5496588945388794, |
| "learning_rate": 3.5354577668856083e-06, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 3556952, |
| "step": 9440 |
| }, |
| { |
| "epoch": 8.46326164874552, |
| "grad_norm": 0.6158499717712402, |
| "learning_rate": 3.5154394503440576e-06, |
| "loss": 0.686, |
| "num_input_tokens_seen": 3558776, |
| "step": 9445 |
| }, |
| { |
| "epoch": 8.46774193548387, |
| "grad_norm": 0.5592875480651855, |
| "learning_rate": 3.4954736813840095e-06, |
| "loss": 0.677, |
| "num_input_tokens_seen": 3560792, |
| "step": 9450 |
| }, |
| { |
| "epoch": 8.472222222222221, |
| "grad_norm": 0.5937447547912598, |
| "learning_rate": 3.47556050883833e-06, |
| "loss": 0.7008, |
| "num_input_tokens_seen": 3562680, |
| "step": 9455 |
| }, |
| { |
| "epoch": 8.476702508960573, |
| "grad_norm": 0.45481178164482117, |
| "learning_rate": 3.455699981411259e-06, |
| "loss": 0.6613, |
| "num_input_tokens_seen": 3564696, |
| "step": 9460 |
| }, |
| { |
| "epoch": 8.481182795698924, |
| "grad_norm": 0.61704421043396, |
| "learning_rate": 3.4358921476782714e-06, |
| "loss": 0.6897, |
| "num_input_tokens_seen": 3566520, |
| "step": 9465 |
| }, |
| { |
| "epoch": 8.485663082437275, |
| "grad_norm": 0.8740954995155334, |
| "learning_rate": 3.416137056085944e-06, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 3568536, |
| "step": 9470 |
| }, |
| { |
| "epoch": 8.490143369175627, |
| "grad_norm": 0.5003806352615356, |
| "learning_rate": 3.3964347549518883e-06, |
| "loss": 0.6639, |
| "num_input_tokens_seen": 3570360, |
| "step": 9475 |
| }, |
| { |
| "epoch": 8.494623655913978, |
| "grad_norm": 0.7029122114181519, |
| "learning_rate": 3.376785292464574e-06, |
| "loss": 0.7039, |
| "num_input_tokens_seen": 3572280, |
| "step": 9480 |
| }, |
| { |
| "epoch": 8.49910394265233, |
| "grad_norm": 0.6127151250839233, |
| "learning_rate": 3.3571887166832434e-06, |
| "loss": 0.6701, |
| "num_input_tokens_seen": 3574200, |
| "step": 9485 |
| }, |
| { |
| "epoch": 8.5, |
| "eval_loss": 0.6988206505775452, |
| "eval_runtime": 5.621, |
| "eval_samples_per_second": 88.241, |
| "eval_steps_per_second": 22.06, |
| "num_input_tokens_seen": 3574616, |
| "step": 9486 |
| }, |
| { |
| "epoch": 8.50358422939068, |
| "grad_norm": 0.5318965315818787, |
| "learning_rate": 3.3376450755377958e-06, |
| "loss": 0.6762, |
| "num_input_tokens_seen": 3576344, |
| "step": 9490 |
| }, |
| { |
| "epoch": 8.508064516129032, |
| "grad_norm": 0.4716894030570984, |
| "learning_rate": 3.3181544168286503e-06, |
| "loss": 0.6827, |
| "num_input_tokens_seen": 3578296, |
| "step": 9495 |
| }, |
| { |
| "epoch": 8.512544802867383, |
| "grad_norm": 0.46207571029663086, |
| "learning_rate": 3.298716788226644e-06, |
| "loss": 0.6788, |
| "num_input_tokens_seen": 3580216, |
| "step": 9500 |
| }, |
| { |
| "epoch": 8.517025089605735, |
| "grad_norm": 0.7773041725158691, |
| "learning_rate": 3.2793322372729085e-06, |
| "loss": 0.6783, |
| "num_input_tokens_seen": 3582200, |
| "step": 9505 |
| }, |
| { |
| "epoch": 8.521505376344086, |
| "grad_norm": 0.6034818887710571, |
| "learning_rate": 3.260000811378755e-06, |
| "loss": 0.6997, |
| "num_input_tokens_seen": 3584152, |
| "step": 9510 |
| }, |
| { |
| "epoch": 8.525985663082437, |
| "grad_norm": 0.5097447633743286, |
| "learning_rate": 3.240722557825576e-06, |
| "loss": 0.6898, |
| "num_input_tokens_seen": 3585816, |
| "step": 9515 |
| }, |
| { |
| "epoch": 8.530465949820789, |
| "grad_norm": 0.5300137400627136, |
| "learning_rate": 3.2214975237646937e-06, |
| "loss": 0.6626, |
| "num_input_tokens_seen": 3587896, |
| "step": 9520 |
| }, |
| { |
| "epoch": 8.53494623655914, |
| "grad_norm": 0.6867164373397827, |
| "learning_rate": 3.2023257562172725e-06, |
| "loss": 0.6889, |
| "num_input_tokens_seen": 3589752, |
| "step": 9525 |
| }, |
| { |
| "epoch": 8.539426523297491, |
| "grad_norm": 0.4473750591278076, |
| "learning_rate": 3.1832073020741983e-06, |
| "loss": 0.6506, |
| "num_input_tokens_seen": 3591672, |
| "step": 9530 |
| }, |
| { |
| "epoch": 8.543906810035843, |
| "grad_norm": 0.4385966658592224, |
| "learning_rate": 3.1641422080959465e-06, |
| "loss": 0.6698, |
| "num_input_tokens_seen": 3593656, |
| "step": 9535 |
| }, |
| { |
| "epoch": 8.548387096774194, |
| "grad_norm": 0.5705263018608093, |
| "learning_rate": 3.145130520912515e-06, |
| "loss": 0.6846, |
| "num_input_tokens_seen": 3595832, |
| "step": 9540 |
| }, |
| { |
| "epoch": 8.552867383512545, |
| "grad_norm": 0.4351136088371277, |
| "learning_rate": 3.1261722870232436e-06, |
| "loss": 0.6792, |
| "num_input_tokens_seen": 3597528, |
| "step": 9545 |
| }, |
| { |
| "epoch": 8.557347670250897, |
| "grad_norm": 0.863899827003479, |
| "learning_rate": 3.1072675527967526e-06, |
| "loss": 0.7347, |
| "num_input_tokens_seen": 3599480, |
| "step": 9550 |
| }, |
| { |
| "epoch": 8.561827956989248, |
| "grad_norm": 0.5281751155853271, |
| "learning_rate": 3.0884163644708084e-06, |
| "loss": 0.679, |
| "num_input_tokens_seen": 3601368, |
| "step": 9555 |
| }, |
| { |
| "epoch": 8.5663082437276, |
| "grad_norm": 0.5913388133049011, |
| "learning_rate": 3.069618768152202e-06, |
| "loss": 0.7038, |
| "num_input_tokens_seen": 3603288, |
| "step": 9560 |
| }, |
| { |
| "epoch": 8.57078853046595, |
| "grad_norm": 0.4851022958755493, |
| "learning_rate": 3.050874809816673e-06, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 3605048, |
| "step": 9565 |
| }, |
| { |
| "epoch": 8.575268817204302, |
| "grad_norm": 0.3585127294063568, |
| "learning_rate": 3.0321845353087463e-06, |
| "loss": 0.697, |
| "num_input_tokens_seen": 3606840, |
| "step": 9570 |
| }, |
| { |
| "epoch": 8.579749103942653, |
| "grad_norm": 0.5370261669158936, |
| "learning_rate": 3.0135479903416586e-06, |
| "loss": 0.6821, |
| "num_input_tokens_seen": 3608728, |
| "step": 9575 |
| }, |
| { |
| "epoch": 8.584229390681003, |
| "grad_norm": 0.7784853577613831, |
| "learning_rate": 2.9949652204972254e-06, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 3610552, |
| "step": 9580 |
| }, |
| { |
| "epoch": 8.588709677419354, |
| "grad_norm": 0.5405849814414978, |
| "learning_rate": 2.976436271225741e-06, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 3612472, |
| "step": 9585 |
| }, |
| { |
| "epoch": 8.593189964157705, |
| "grad_norm": 0.4924999475479126, |
| "learning_rate": 2.95796118784587e-06, |
| "loss": 0.6981, |
| "num_input_tokens_seen": 3614360, |
| "step": 9590 |
| }, |
| { |
| "epoch": 8.597670250896057, |
| "grad_norm": 0.8940668106079102, |
| "learning_rate": 2.939540015544523e-06, |
| "loss": 0.7345, |
| "num_input_tokens_seen": 3616216, |
| "step": 9595 |
| }, |
| { |
| "epoch": 8.602150537634408, |
| "grad_norm": 0.5306518077850342, |
| "learning_rate": 2.9211727993767507e-06, |
| "loss": 0.669, |
| "num_input_tokens_seen": 3617880, |
| "step": 9600 |
| }, |
| { |
| "epoch": 8.60663082437276, |
| "grad_norm": 0.5440559387207031, |
| "learning_rate": 2.902859584265649e-06, |
| "loss": 0.6866, |
| "num_input_tokens_seen": 3619736, |
| "step": 9605 |
| }, |
| { |
| "epoch": 8.61111111111111, |
| "grad_norm": 0.5547166466712952, |
| "learning_rate": 2.88460041500222e-06, |
| "loss": 0.6938, |
| "num_input_tokens_seen": 3621560, |
| "step": 9610 |
| }, |
| { |
| "epoch": 8.615591397849462, |
| "grad_norm": 0.27369892597198486, |
| "learning_rate": 2.866395336245284e-06, |
| "loss": 0.6766, |
| "num_input_tokens_seen": 3623224, |
| "step": 9615 |
| }, |
| { |
| "epoch": 8.620071684587813, |
| "grad_norm": 0.48885655403137207, |
| "learning_rate": 2.8482443925213765e-06, |
| "loss": 0.6696, |
| "num_input_tokens_seen": 3625208, |
| "step": 9620 |
| }, |
| { |
| "epoch": 8.624551971326165, |
| "grad_norm": 0.5681203603744507, |
| "learning_rate": 2.8301476282246164e-06, |
| "loss": 0.6538, |
| "num_input_tokens_seen": 3627192, |
| "step": 9625 |
| }, |
| { |
| "epoch": 8.629032258064516, |
| "grad_norm": 0.6536678075790405, |
| "learning_rate": 2.8121050876166096e-06, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 3629112, |
| "step": 9630 |
| }, |
| { |
| "epoch": 8.633512544802867, |
| "grad_norm": 0.5405173897743225, |
| "learning_rate": 2.794116814826342e-06, |
| "loss": 0.686, |
| "num_input_tokens_seen": 3630808, |
| "step": 9635 |
| }, |
| { |
| "epoch": 8.637992831541219, |
| "grad_norm": 0.4284382462501526, |
| "learning_rate": 2.776182853850065e-06, |
| "loss": 0.6639, |
| "num_input_tokens_seen": 3632664, |
| "step": 9640 |
| }, |
| { |
| "epoch": 8.64247311827957, |
| "grad_norm": 0.4499342441558838, |
| "learning_rate": 2.758303248551211e-06, |
| "loss": 0.6603, |
| "num_input_tokens_seen": 3634456, |
| "step": 9645 |
| }, |
| { |
| "epoch": 8.646953405017921, |
| "grad_norm": 0.4879976809024811, |
| "learning_rate": 2.740478042660244e-06, |
| "loss": 0.6835, |
| "num_input_tokens_seen": 3636216, |
| "step": 9650 |
| }, |
| { |
| "epoch": 8.651433691756273, |
| "grad_norm": 0.33876270055770874, |
| "learning_rate": 2.7227072797745833e-06, |
| "loss": 0.6855, |
| "num_input_tokens_seen": 3638200, |
| "step": 9655 |
| }, |
| { |
| "epoch": 8.655913978494624, |
| "grad_norm": 0.7893423438072205, |
| "learning_rate": 2.7049910033585093e-06, |
| "loss": 0.7201, |
| "num_input_tokens_seen": 3640088, |
| "step": 9660 |
| }, |
| { |
| "epoch": 8.660394265232975, |
| "grad_norm": 0.4228385090827942, |
| "learning_rate": 2.6873292567429986e-06, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 3641944, |
| "step": 9665 |
| }, |
| { |
| "epoch": 8.664874551971327, |
| "grad_norm": 0.5193427801132202, |
| "learning_rate": 2.6697220831256974e-06, |
| "loss": 0.6952, |
| "num_input_tokens_seen": 3643896, |
| "step": 9670 |
| }, |
| { |
| "epoch": 8.669354838709678, |
| "grad_norm": 0.5370306372642517, |
| "learning_rate": 2.6521695255707495e-06, |
| "loss": 0.7183, |
| "num_input_tokens_seen": 3645784, |
| "step": 9675 |
| }, |
| { |
| "epoch": 8.67383512544803, |
| "grad_norm": 0.6196435689926147, |
| "learning_rate": 2.6346716270087253e-06, |
| "loss": 0.7019, |
| "num_input_tokens_seen": 3647800, |
| "step": 9680 |
| }, |
| { |
| "epoch": 8.67831541218638, |
| "grad_norm": 0.8368743658065796, |
| "learning_rate": 2.617228430236521e-06, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 3649624, |
| "step": 9685 |
| }, |
| { |
| "epoch": 8.682795698924732, |
| "grad_norm": 0.6965826153755188, |
| "learning_rate": 2.5998399779172123e-06, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 3651416, |
| "step": 9690 |
| }, |
| { |
| "epoch": 8.687275985663083, |
| "grad_norm": 0.6506573557853699, |
| "learning_rate": 2.5825063125800074e-06, |
| "loss": 0.7179, |
| "num_input_tokens_seen": 3653464, |
| "step": 9695 |
| }, |
| { |
| "epoch": 8.691756272401435, |
| "grad_norm": 0.4431875944137573, |
| "learning_rate": 2.565227476620105e-06, |
| "loss": 0.6612, |
| "num_input_tokens_seen": 3655320, |
| "step": 9700 |
| }, |
| { |
| "epoch": 8.696236559139784, |
| "grad_norm": 0.6212185621261597, |
| "learning_rate": 2.5480035122985885e-06, |
| "loss": 0.6481, |
| "num_input_tokens_seen": 3657400, |
| "step": 9705 |
| }, |
| { |
| "epoch": 8.700716845878135, |
| "grad_norm": 0.5748448967933655, |
| "learning_rate": 2.530834461742357e-06, |
| "loss": 0.6894, |
| "num_input_tokens_seen": 3659256, |
| "step": 9710 |
| }, |
| { |
| "epoch": 8.705197132616487, |
| "grad_norm": 0.4338792860507965, |
| "learning_rate": 2.513720366943986e-06, |
| "loss": 0.702, |
| "num_input_tokens_seen": 3661080, |
| "step": 9715 |
| }, |
| { |
| "epoch": 8.709677419354838, |
| "grad_norm": 0.43087977170944214, |
| "learning_rate": 2.4966612697616382e-06, |
| "loss": 0.6779, |
| "num_input_tokens_seen": 3663128, |
| "step": 9720 |
| }, |
| { |
| "epoch": 8.71415770609319, |
| "grad_norm": 0.7099825739860535, |
| "learning_rate": 2.4796572119189647e-06, |
| "loss": 0.7211, |
| "num_input_tokens_seen": 3665144, |
| "step": 9725 |
| }, |
| { |
| "epoch": 8.71863799283154, |
| "grad_norm": 0.4541792869567871, |
| "learning_rate": 2.462708235004996e-06, |
| "loss": 0.7046, |
| "num_input_tokens_seen": 3666968, |
| "step": 9730 |
| }, |
| { |
| "epoch": 8.723118279569892, |
| "grad_norm": 0.5138086676597595, |
| "learning_rate": 2.445814380474057e-06, |
| "loss": 0.6604, |
| "num_input_tokens_seen": 3668952, |
| "step": 9735 |
| }, |
| { |
| "epoch": 8.727598566308243, |
| "grad_norm": 0.4486521780490875, |
| "learning_rate": 2.4289756896456434e-06, |
| "loss": 0.677, |
| "num_input_tokens_seen": 3670744, |
| "step": 9740 |
| }, |
| { |
| "epoch": 8.732078853046595, |
| "grad_norm": 0.6109570264816284, |
| "learning_rate": 2.412192203704311e-06, |
| "loss": 0.6889, |
| "num_input_tokens_seen": 3672600, |
| "step": 9745 |
| }, |
| { |
| "epoch": 8.736559139784946, |
| "grad_norm": 0.5936306715011597, |
| "learning_rate": 2.395463963699629e-06, |
| "loss": 0.6976, |
| "num_input_tokens_seen": 3674360, |
| "step": 9750 |
| }, |
| { |
| "epoch": 8.741039426523297, |
| "grad_norm": 0.6088002324104309, |
| "learning_rate": 2.3787910105460247e-06, |
| "loss": 0.7359, |
| "num_input_tokens_seen": 3676152, |
| "step": 9755 |
| }, |
| { |
| "epoch": 8.745519713261649, |
| "grad_norm": 0.49755457043647766, |
| "learning_rate": 2.362173385022701e-06, |
| "loss": 0.6827, |
| "num_input_tokens_seen": 3678104, |
| "step": 9760 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 0.5336976647377014, |
| "learning_rate": 2.3456111277735506e-06, |
| "loss": 0.7125, |
| "num_input_tokens_seen": 3679864, |
| "step": 9765 |
| }, |
| { |
| "epoch": 8.754480286738351, |
| "grad_norm": 0.42951998114585876, |
| "learning_rate": 2.3291042793070374e-06, |
| "loss": 0.6635, |
| "num_input_tokens_seen": 3681720, |
| "step": 9770 |
| }, |
| { |
| "epoch": 8.758960573476703, |
| "grad_norm": 0.46314573287963867, |
| "learning_rate": 2.3126528799961024e-06, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 3683832, |
| "step": 9775 |
| }, |
| { |
| "epoch": 8.763440860215054, |
| "grad_norm": 0.5346803069114685, |
| "learning_rate": 2.2962569700780726e-06, |
| "loss": 0.646, |
| "num_input_tokens_seen": 3685752, |
| "step": 9780 |
| }, |
| { |
| "epoch": 8.767921146953405, |
| "grad_norm": 0.4398757219314575, |
| "learning_rate": 2.279916589654549e-06, |
| "loss": 0.6684, |
| "num_input_tokens_seen": 3687704, |
| "step": 9785 |
| }, |
| { |
| "epoch": 8.772401433691757, |
| "grad_norm": 0.5858141183853149, |
| "learning_rate": 2.263631778691333e-06, |
| "loss": 0.6911, |
| "num_input_tokens_seen": 3689624, |
| "step": 9790 |
| }, |
| { |
| "epoch": 8.776881720430108, |
| "grad_norm": 0.7604189515113831, |
| "learning_rate": 2.2474025770182982e-06, |
| "loss": 0.6823, |
| "num_input_tokens_seen": 3691544, |
| "step": 9795 |
| }, |
| { |
| "epoch": 8.78136200716846, |
| "grad_norm": 0.46800726652145386, |
| "learning_rate": 2.2312290243293147e-06, |
| "loss": 0.6948, |
| "num_input_tokens_seen": 3693368, |
| "step": 9800 |
| }, |
| { |
| "epoch": 8.78584229390681, |
| "grad_norm": 0.40996742248535156, |
| "learning_rate": 2.21511116018214e-06, |
| "loss": 0.6847, |
| "num_input_tokens_seen": 3695224, |
| "step": 9805 |
| }, |
| { |
| "epoch": 8.790322580645162, |
| "grad_norm": 0.5102288126945496, |
| "learning_rate": 2.199049023998323e-06, |
| "loss": 0.7095, |
| "num_input_tokens_seen": 3697048, |
| "step": 9810 |
| }, |
| { |
| "epoch": 8.794802867383513, |
| "grad_norm": 0.4394807517528534, |
| "learning_rate": 2.1830426550631276e-06, |
| "loss": 0.6739, |
| "num_input_tokens_seen": 3699032, |
| "step": 9815 |
| }, |
| { |
| "epoch": 8.799283154121865, |
| "grad_norm": 0.6569847464561462, |
| "learning_rate": 2.1670920925254053e-06, |
| "loss": 0.664, |
| "num_input_tokens_seen": 3700888, |
| "step": 9820 |
| }, |
| { |
| "epoch": 8.803763440860216, |
| "grad_norm": 0.4514296054840088, |
| "learning_rate": 2.1511973753975208e-06, |
| "loss": 0.6695, |
| "num_input_tokens_seen": 3702680, |
| "step": 9825 |
| }, |
| { |
| "epoch": 8.808243727598565, |
| "grad_norm": 0.7222883105278015, |
| "learning_rate": 2.1353585425552463e-06, |
| "loss": 0.6985, |
| "num_input_tokens_seen": 3704536, |
| "step": 9830 |
| }, |
| { |
| "epoch": 8.812724014336917, |
| "grad_norm": 0.5998217463493347, |
| "learning_rate": 2.1195756327376722e-06, |
| "loss": 0.6851, |
| "num_input_tokens_seen": 3706360, |
| "step": 9835 |
| }, |
| { |
| "epoch": 8.817204301075268, |
| "grad_norm": 0.5000945329666138, |
| "learning_rate": 2.1038486845471215e-06, |
| "loss": 0.6977, |
| "num_input_tokens_seen": 3708088, |
| "step": 9840 |
| }, |
| { |
| "epoch": 8.82168458781362, |
| "grad_norm": 0.48113876581192017, |
| "learning_rate": 2.0881777364490265e-06, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 3710040, |
| "step": 9845 |
| }, |
| { |
| "epoch": 8.82616487455197, |
| "grad_norm": 0.5569098591804504, |
| "learning_rate": 2.0725628267718595e-06, |
| "loss": 0.6841, |
| "num_input_tokens_seen": 3711928, |
| "step": 9850 |
| }, |
| { |
| "epoch": 8.830645161290322, |
| "grad_norm": 0.46733352541923523, |
| "learning_rate": 2.0570039937070463e-06, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 3713720, |
| "step": 9855 |
| }, |
| { |
| "epoch": 8.835125448028673, |
| "grad_norm": 0.5141034126281738, |
| "learning_rate": 2.04150127530883e-06, |
| "loss": 0.7, |
| "num_input_tokens_seen": 3715416, |
| "step": 9860 |
| }, |
| { |
| "epoch": 8.839605734767025, |
| "grad_norm": 0.7939246892929077, |
| "learning_rate": 2.026054709494235e-06, |
| "loss": 0.6841, |
| "num_input_tokens_seen": 3717208, |
| "step": 9865 |
| }, |
| { |
| "epoch": 8.844086021505376, |
| "grad_norm": 0.6357170939445496, |
| "learning_rate": 2.0106643340429332e-06, |
| "loss": 0.7052, |
| "num_input_tokens_seen": 3718936, |
| "step": 9870 |
| }, |
| { |
| "epoch": 8.848566308243727, |
| "grad_norm": 0.6246017813682556, |
| "learning_rate": 1.995330186597158e-06, |
| "loss": 0.668, |
| "num_input_tokens_seen": 3720920, |
| "step": 9875 |
| }, |
| { |
| "epoch": 8.853046594982079, |
| "grad_norm": 0.5736755728721619, |
| "learning_rate": 1.980052304661642e-06, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 3722776, |
| "step": 9880 |
| }, |
| { |
| "epoch": 8.85752688172043, |
| "grad_norm": 0.6197718977928162, |
| "learning_rate": 1.9648307256034697e-06, |
| "loss": 0.6752, |
| "num_input_tokens_seen": 3724792, |
| "step": 9885 |
| }, |
| { |
| "epoch": 8.862007168458781, |
| "grad_norm": 0.718443751335144, |
| "learning_rate": 1.9496654866520414e-06, |
| "loss": 0.6788, |
| "num_input_tokens_seen": 3726712, |
| "step": 9890 |
| }, |
| { |
| "epoch": 8.866487455197133, |
| "grad_norm": 0.6312748789787292, |
| "learning_rate": 1.9345566248989534e-06, |
| "loss": 0.7045, |
| "num_input_tokens_seen": 3728696, |
| "step": 9895 |
| }, |
| { |
| "epoch": 8.870967741935484, |
| "grad_norm": 0.4447576403617859, |
| "learning_rate": 1.9195041772979093e-06, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 3730488, |
| "step": 9900 |
| }, |
| { |
| "epoch": 8.875448028673835, |
| "grad_norm": 0.7500755190849304, |
| "learning_rate": 1.9045081806646436e-06, |
| "loss": 0.7111, |
| "num_input_tokens_seen": 3732440, |
| "step": 9905 |
| }, |
| { |
| "epoch": 8.879928315412187, |
| "grad_norm": 0.5350488424301147, |
| "learning_rate": 1.8895686716768113e-06, |
| "loss": 0.7168, |
| "num_input_tokens_seen": 3734488, |
| "step": 9910 |
| }, |
| { |
| "epoch": 8.884408602150538, |
| "grad_norm": 0.4130159020423889, |
| "learning_rate": 1.8746856868739004e-06, |
| "loss": 0.6988, |
| "num_input_tokens_seen": 3736472, |
| "step": 9915 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.9204648733139038, |
| "learning_rate": 1.8598592626571737e-06, |
| "loss": 0.7053, |
| "num_input_tokens_seen": 3738264, |
| "step": 9920 |
| }, |
| { |
| "epoch": 8.89336917562724, |
| "grad_norm": 0.542391836643219, |
| "learning_rate": 1.8450894352895375e-06, |
| "loss": 0.7031, |
| "num_input_tokens_seen": 3740056, |
| "step": 9925 |
| }, |
| { |
| "epoch": 8.897849462365592, |
| "grad_norm": 0.7514336109161377, |
| "learning_rate": 1.8303762408954761e-06, |
| "loss": 0.6645, |
| "num_input_tokens_seen": 3742008, |
| "step": 9930 |
| }, |
| { |
| "epoch": 8.902329749103943, |
| "grad_norm": 0.6825776696205139, |
| "learning_rate": 1.81571971546097e-06, |
| "loss": 0.6812, |
| "num_input_tokens_seen": 3743864, |
| "step": 9935 |
| }, |
| { |
| "epoch": 8.906810035842295, |
| "grad_norm": 0.5593582987785339, |
| "learning_rate": 1.8011198948333751e-06, |
| "loss": 0.6949, |
| "num_input_tokens_seen": 3745752, |
| "step": 9940 |
| }, |
| { |
| "epoch": 8.911290322580646, |
| "grad_norm": 0.4969135522842407, |
| "learning_rate": 1.7865768147213802e-06, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 3747576, |
| "step": 9945 |
| }, |
| { |
| "epoch": 8.915770609318997, |
| "grad_norm": 0.507438063621521, |
| "learning_rate": 1.7720905106948821e-06, |
| "loss": 0.7065, |
| "num_input_tokens_seen": 3749464, |
| "step": 9950 |
| }, |
| { |
| "epoch": 8.920250896057347, |
| "grad_norm": 0.6009644865989685, |
| "learning_rate": 1.7576610181849113e-06, |
| "loss": 0.6846, |
| "num_input_tokens_seen": 3751352, |
| "step": 9955 |
| }, |
| { |
| "epoch": 8.924731182795698, |
| "grad_norm": 0.5117262005805969, |
| "learning_rate": 1.7432883724835646e-06, |
| "loss": 0.6989, |
| "num_input_tokens_seen": 3753208, |
| "step": 9960 |
| }, |
| { |
| "epoch": 8.92921146953405, |
| "grad_norm": 0.45503732562065125, |
| "learning_rate": 1.7289726087438813e-06, |
| "loss": 0.6954, |
| "num_input_tokens_seen": 3755000, |
| "step": 9965 |
| }, |
| { |
| "epoch": 8.9336917562724, |
| "grad_norm": 0.5579915642738342, |
| "learning_rate": 1.7147137619797888e-06, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 3756856, |
| "step": 9970 |
| }, |
| { |
| "epoch": 8.938172043010752, |
| "grad_norm": 0.3492272198200226, |
| "learning_rate": 1.7005118670659987e-06, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 3758616, |
| "step": 9975 |
| }, |
| { |
| "epoch": 8.942652329749103, |
| "grad_norm": 0.47283822298049927, |
| "learning_rate": 1.6863669587379282e-06, |
| "loss": 0.7192, |
| "num_input_tokens_seen": 3760344, |
| "step": 9980 |
| }, |
| { |
| "epoch": 8.947132616487455, |
| "grad_norm": 0.5391170382499695, |
| "learning_rate": 1.6722790715916231e-06, |
| "loss": 0.6784, |
| "num_input_tokens_seen": 3762232, |
| "step": 9985 |
| }, |
| { |
| "epoch": 8.951612903225806, |
| "grad_norm": 0.6539673805236816, |
| "learning_rate": 1.658248240083657e-06, |
| "loss": 0.6817, |
| "num_input_tokens_seen": 3764088, |
| "step": 9990 |
| }, |
| { |
| "epoch": 8.956093189964157, |
| "grad_norm": 0.6863439083099365, |
| "learning_rate": 1.6442744985310593e-06, |
| "loss": 0.7096, |
| "num_input_tokens_seen": 3765656, |
| "step": 9995 |
| }, |
| { |
| "epoch": 8.960573476702509, |
| "grad_norm": 0.5178156495094299, |
| "learning_rate": 1.6303578811112246e-06, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 3767352, |
| "step": 10000 |
| }, |
| { |
| "epoch": 8.96505376344086, |
| "grad_norm": 0.5608931183815002, |
| "learning_rate": 1.6164984218618285e-06, |
| "loss": 0.7052, |
| "num_input_tokens_seen": 3769240, |
| "step": 10005 |
| }, |
| { |
| "epoch": 8.969534050179211, |
| "grad_norm": 0.5038066506385803, |
| "learning_rate": 1.6026961546807605e-06, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 3771032, |
| "step": 10010 |
| }, |
| { |
| "epoch": 8.974014336917563, |
| "grad_norm": 0.7060883045196533, |
| "learning_rate": 1.5889511133260121e-06, |
| "loss": 0.6875, |
| "num_input_tokens_seen": 3772952, |
| "step": 10015 |
| }, |
| { |
| "epoch": 8.978494623655914, |
| "grad_norm": 0.5578808188438416, |
| "learning_rate": 1.575263331415619e-06, |
| "loss": 0.66, |
| "num_input_tokens_seen": 3774904, |
| "step": 10020 |
| }, |
| { |
| "epoch": 8.982974910394265, |
| "grad_norm": 0.5263607501983643, |
| "learning_rate": 1.5616328424275656e-06, |
| "loss": 0.6953, |
| "num_input_tokens_seen": 3776696, |
| "step": 10025 |
| }, |
| { |
| "epoch": 8.987455197132617, |
| "grad_norm": 0.6655629277229309, |
| "learning_rate": 1.5480596796997094e-06, |
| "loss": 0.6753, |
| "num_input_tokens_seen": 3778648, |
| "step": 10030 |
| }, |
| { |
| "epoch": 8.991935483870968, |
| "grad_norm": 0.5918577909469604, |
| "learning_rate": 1.534543876429706e-06, |
| "loss": 0.7152, |
| "num_input_tokens_seen": 3780568, |
| "step": 10035 |
| }, |
| { |
| "epoch": 8.99641577060932, |
| "grad_norm": 0.5958806276321411, |
| "learning_rate": 1.521085465674904e-06, |
| "loss": 0.7297, |
| "num_input_tokens_seen": 3782488, |
| "step": 10040 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.699591875076294, |
| "eval_runtime": 5.6395, |
| "eval_samples_per_second": 87.951, |
| "eval_steps_per_second": 21.988, |
| "num_input_tokens_seen": 3783840, |
| "step": 10044 |
| }, |
| { |
| "epoch": 9.00089605734767, |
| "grad_norm": 0.6611407399177551, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.6816, |
| "num_input_tokens_seen": 3784384, |
| "step": 10045 |
| }, |
| { |
| "epoch": 9.005376344086022, |
| "grad_norm": 0.45874500274658203, |
| "learning_rate": 1.494340953238399e-06, |
| "loss": 0.6632, |
| "num_input_tokens_seen": 3786240, |
| "step": 10050 |
| }, |
| { |
| "epoch": 9.009856630824373, |
| "grad_norm": 0.5364258289337158, |
| "learning_rate": 1.481054916969221e-06, |
| "loss": 0.6888, |
| "num_input_tokens_seen": 3788032, |
| "step": 10055 |
| }, |
| { |
| "epoch": 9.014336917562725, |
| "grad_norm": 0.5259512662887573, |
| "learning_rate": 1.4678264040401458e-06, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 3789920, |
| "step": 10060 |
| }, |
| { |
| "epoch": 9.018817204301076, |
| "grad_norm": 0.5970426797866821, |
| "learning_rate": 1.4546554468058665e-06, |
| "loss": 0.7168, |
| "num_input_tokens_seen": 3791616, |
| "step": 10065 |
| }, |
| { |
| "epoch": 9.023297491039427, |
| "grad_norm": 0.6536547541618347, |
| "learning_rate": 1.441542077480304e-06, |
| "loss": 0.7067, |
| "num_input_tokens_seen": 3793472, |
| "step": 10070 |
| }, |
| { |
| "epoch": 9.027777777777779, |
| "grad_norm": 0.5501564145088196, |
| "learning_rate": 1.428486328136533e-06, |
| "loss": 0.6724, |
| "num_input_tokens_seen": 3795200, |
| "step": 10075 |
| }, |
| { |
| "epoch": 9.03225806451613, |
| "grad_norm": 0.6372694969177246, |
| "learning_rate": 1.4154882307066907e-06, |
| "loss": 0.7086, |
| "num_input_tokens_seen": 3797152, |
| "step": 10080 |
| }, |
| { |
| "epoch": 9.03673835125448, |
| "grad_norm": 0.4424917697906494, |
| "learning_rate": 1.402547816981914e-06, |
| "loss": 0.6851, |
| "num_input_tokens_seen": 3799008, |
| "step": 10085 |
| }, |
| { |
| "epoch": 9.04121863799283, |
| "grad_norm": 0.5123321413993835, |
| "learning_rate": 1.3896651186122573e-06, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 3801024, |
| "step": 10090 |
| }, |
| { |
| "epoch": 9.045698924731182, |
| "grad_norm": 0.6249075531959534, |
| "learning_rate": 1.3768401671066105e-06, |
| "loss": 0.7141, |
| "num_input_tokens_seen": 3803008, |
| "step": 10095 |
| }, |
| { |
| "epoch": 9.050179211469533, |
| "grad_norm": 0.6043964624404907, |
| "learning_rate": 1.3640729938326213e-06, |
| "loss": 0.7001, |
| "num_input_tokens_seen": 3804992, |
| "step": 10100 |
| }, |
| { |
| "epoch": 9.054659498207885, |
| "grad_norm": 0.36214983463287354, |
| "learning_rate": 1.351363630016622e-06, |
| "loss": 0.6571, |
| "num_input_tokens_seen": 3806848, |
| "step": 10105 |
| }, |
| { |
| "epoch": 9.059139784946236, |
| "grad_norm": 0.594513475894928, |
| "learning_rate": 1.3387121067435588e-06, |
| "loss": 0.6684, |
| "num_input_tokens_seen": 3808704, |
| "step": 10110 |
| }, |
| { |
| "epoch": 9.063620071684587, |
| "grad_norm": 0.5957300662994385, |
| "learning_rate": 1.3261184549569066e-06, |
| "loss": 0.7092, |
| "num_input_tokens_seen": 3810528, |
| "step": 10115 |
| }, |
| { |
| "epoch": 9.068100358422939, |
| "grad_norm": 0.5582435727119446, |
| "learning_rate": 1.3135827054585964e-06, |
| "loss": 0.6886, |
| "num_input_tokens_seen": 3812288, |
| "step": 10120 |
| }, |
| { |
| "epoch": 9.07258064516129, |
| "grad_norm": 0.45786210894584656, |
| "learning_rate": 1.3011048889089355e-06, |
| "loss": 0.6701, |
| "num_input_tokens_seen": 3814048, |
| "step": 10125 |
| }, |
| { |
| "epoch": 9.077060931899641, |
| "grad_norm": 0.6108914613723755, |
| "learning_rate": 1.288685035826548e-06, |
| "loss": 0.712, |
| "num_input_tokens_seen": 3815840, |
| "step": 10130 |
| }, |
| { |
| "epoch": 9.081541218637993, |
| "grad_norm": 0.5366916656494141, |
| "learning_rate": 1.2763231765882732e-06, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 3817632, |
| "step": 10135 |
| }, |
| { |
| "epoch": 9.086021505376344, |
| "grad_norm": 0.6079829931259155, |
| "learning_rate": 1.2640193414291262e-06, |
| "loss": 0.6964, |
| "num_input_tokens_seen": 3819456, |
| "step": 10140 |
| }, |
| { |
| "epoch": 9.090501792114695, |
| "grad_norm": 0.6302862167358398, |
| "learning_rate": 1.2517735604421904e-06, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 3821344, |
| "step": 10145 |
| }, |
| { |
| "epoch": 9.094982078853047, |
| "grad_norm": 0.5367377400398254, |
| "learning_rate": 1.2395858635785602e-06, |
| "loss": 0.6815, |
| "num_input_tokens_seen": 3823296, |
| "step": 10150 |
| }, |
| { |
| "epoch": 9.099462365591398, |
| "grad_norm": 0.5147011876106262, |
| "learning_rate": 1.2274562806472794e-06, |
| "loss": 0.7238, |
| "num_input_tokens_seen": 3825184, |
| "step": 10155 |
| }, |
| { |
| "epoch": 9.10394265232975, |
| "grad_norm": 0.49252960085868835, |
| "learning_rate": 1.2153848413152341e-06, |
| "loss": 0.7037, |
| "num_input_tokens_seen": 3827296, |
| "step": 10160 |
| }, |
| { |
| "epoch": 9.1084229390681, |
| "grad_norm": 0.3270881175994873, |
| "learning_rate": 1.2033715751071206e-06, |
| "loss": 0.6846, |
| "num_input_tokens_seen": 3828992, |
| "step": 10165 |
| }, |
| { |
| "epoch": 9.112903225806452, |
| "grad_norm": 0.6329634785652161, |
| "learning_rate": 1.191416511405341e-06, |
| "loss": 0.6666, |
| "num_input_tokens_seen": 3830880, |
| "step": 10170 |
| }, |
| { |
| "epoch": 9.117383512544803, |
| "grad_norm": 0.4612480401992798, |
| "learning_rate": 1.1795196794499475e-06, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 3832864, |
| "step": 10175 |
| }, |
| { |
| "epoch": 9.121863799283155, |
| "grad_norm": 0.678649365901947, |
| "learning_rate": 1.1676811083385698e-06, |
| "loss": 0.6893, |
| "num_input_tokens_seen": 3834752, |
| "step": 10180 |
| }, |
| { |
| "epoch": 9.126344086021506, |
| "grad_norm": 0.5528131127357483, |
| "learning_rate": 1.155900827026346e-06, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 3836640, |
| "step": 10185 |
| }, |
| { |
| "epoch": 9.130824372759857, |
| "grad_norm": 0.48890024423599243, |
| "learning_rate": 1.1441788643258233e-06, |
| "loss": 0.7156, |
| "num_input_tokens_seen": 3838528, |
| "step": 10190 |
| }, |
| { |
| "epoch": 9.135304659498209, |
| "grad_norm": 0.571433424949646, |
| "learning_rate": 1.1325152489069457e-06, |
| "loss": 0.6852, |
| "num_input_tokens_seen": 3840320, |
| "step": 10195 |
| }, |
| { |
| "epoch": 9.13978494623656, |
| "grad_norm": 0.4964417517185211, |
| "learning_rate": 1.1209100092969244e-06, |
| "loss": 0.6722, |
| "num_input_tokens_seen": 3842496, |
| "step": 10200 |
| }, |
| { |
| "epoch": 9.144265232974911, |
| "grad_norm": 0.39581766724586487, |
| "learning_rate": 1.109363173880204e-06, |
| "loss": 0.7177, |
| "num_input_tokens_seen": 3844352, |
| "step": 10205 |
| }, |
| { |
| "epoch": 9.14874551971326, |
| "grad_norm": 0.5092259049415588, |
| "learning_rate": 1.0978747708983854e-06, |
| "loss": 0.6978, |
| "num_input_tokens_seen": 3846304, |
| "step": 10210 |
| }, |
| { |
| "epoch": 9.153225806451612, |
| "grad_norm": 0.3758569359779358, |
| "learning_rate": 1.0864448284501394e-06, |
| "loss": 0.6779, |
| "num_input_tokens_seen": 3848192, |
| "step": 10215 |
| }, |
| { |
| "epoch": 9.157706093189963, |
| "grad_norm": 0.48183518648147583, |
| "learning_rate": 1.0750733744911674e-06, |
| "loss": 0.722, |
| "num_input_tokens_seen": 3850016, |
| "step": 10220 |
| }, |
| { |
| "epoch": 9.162186379928315, |
| "grad_norm": 0.6768030524253845, |
| "learning_rate": 1.063760436834113e-06, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 3852000, |
| "step": 10225 |
| }, |
| { |
| "epoch": 9.166666666666666, |
| "grad_norm": 0.4611859917640686, |
| "learning_rate": 1.0525060431484907e-06, |
| "loss": 0.6945, |
| "num_input_tokens_seen": 3853760, |
| "step": 10230 |
| }, |
| { |
| "epoch": 9.171146953405017, |
| "grad_norm": 0.593660295009613, |
| "learning_rate": 1.0413102209606424e-06, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 3855488, |
| "step": 10235 |
| }, |
| { |
| "epoch": 9.175627240143369, |
| "grad_norm": 0.35018354654312134, |
| "learning_rate": 1.0301729976536417e-06, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 3857536, |
| "step": 10240 |
| }, |
| { |
| "epoch": 9.18010752688172, |
| "grad_norm": 0.6653251051902771, |
| "learning_rate": 1.0190944004672409e-06, |
| "loss": 0.6665, |
| "num_input_tokens_seen": 3859424, |
| "step": 10245 |
| }, |
| { |
| "epoch": 9.184587813620071, |
| "grad_norm": 0.4804827570915222, |
| "learning_rate": 1.0080744564978068e-06, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 3861248, |
| "step": 10250 |
| }, |
| { |
| "epoch": 9.189068100358423, |
| "grad_norm": 0.5318313241004944, |
| "learning_rate": 9.971131926982458e-07, |
| "loss": 0.7229, |
| "num_input_tokens_seen": 3863168, |
| "step": 10255 |
| }, |
| { |
| "epoch": 9.193548387096774, |
| "grad_norm": 0.8895540237426758, |
| "learning_rate": 9.86210635877949e-07, |
| "loss": 0.6953, |
| "num_input_tokens_seen": 3865312, |
| "step": 10260 |
| }, |
| { |
| "epoch": 9.198028673835125, |
| "grad_norm": 0.5024261474609375, |
| "learning_rate": 9.753668127027133e-07, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 3867328, |
| "step": 10265 |
| }, |
| { |
| "epoch": 9.202508960573477, |
| "grad_norm": 0.6652594208717346, |
| "learning_rate": 9.645817496946903e-07, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 3869056, |
| "step": 10270 |
| }, |
| { |
| "epoch": 9.206989247311828, |
| "grad_norm": 0.5735182762145996, |
| "learning_rate": 9.538554732323041e-07, |
| "loss": 0.6699, |
| "num_input_tokens_seen": 3870976, |
| "step": 10275 |
| }, |
| { |
| "epoch": 9.21146953405018, |
| "grad_norm": 0.5266287326812744, |
| "learning_rate": 9.431880095502027e-07, |
| "loss": 0.6749, |
| "num_input_tokens_seen": 3872960, |
| "step": 10280 |
| }, |
| { |
| "epoch": 9.21594982078853, |
| "grad_norm": 0.4899725914001465, |
| "learning_rate": 9.325793847391962e-07, |
| "loss": 0.6694, |
| "num_input_tokens_seen": 3874976, |
| "step": 10285 |
| }, |
| { |
| "epoch": 9.220430107526882, |
| "grad_norm": 0.5171582698822021, |
| "learning_rate": 9.220296247461707e-07, |
| "loss": 0.7095, |
| "num_input_tokens_seen": 3876800, |
| "step": 10290 |
| }, |
| { |
| "epoch": 9.224910394265233, |
| "grad_norm": 0.5212193131446838, |
| "learning_rate": 9.115387553740473e-07, |
| "loss": 0.6967, |
| "num_input_tokens_seen": 3878560, |
| "step": 10295 |
| }, |
| { |
| "epoch": 9.229390681003585, |
| "grad_norm": 0.7846523523330688, |
| "learning_rate": 9.011068022817065e-07, |
| "loss": 0.6745, |
| "num_input_tokens_seen": 3880544, |
| "step": 10300 |
| }, |
| { |
| "epoch": 9.233870967741936, |
| "grad_norm": 0.46724578738212585, |
| "learning_rate": 8.907337909839275e-07, |
| "loss": 0.6785, |
| "num_input_tokens_seen": 3882368, |
| "step": 10305 |
| }, |
| { |
| "epoch": 9.238351254480287, |
| "grad_norm": 0.4712303578853607, |
| "learning_rate": 8.804197468513436e-07, |
| "loss": 0.6942, |
| "num_input_tokens_seen": 3884288, |
| "step": 10310 |
| }, |
| { |
| "epoch": 9.242831541218639, |
| "grad_norm": 0.5561097860336304, |
| "learning_rate": 8.701646951103425e-07, |
| "loss": 0.6814, |
| "num_input_tokens_seen": 3886176, |
| "step": 10315 |
| }, |
| { |
| "epoch": 9.24731182795699, |
| "grad_norm": 0.6059445738792419, |
| "learning_rate": 8.599686608430413e-07, |
| "loss": 0.6716, |
| "num_input_tokens_seen": 3888192, |
| "step": 10320 |
| }, |
| { |
| "epoch": 9.251792114695341, |
| "grad_norm": 0.4931063652038574, |
| "learning_rate": 8.498316689872055e-07, |
| "loss": 0.6829, |
| "num_input_tokens_seen": 3890048, |
| "step": 10325 |
| }, |
| { |
| "epoch": 9.256272401433693, |
| "grad_norm": 0.5145807266235352, |
| "learning_rate": 8.397537443361913e-07, |
| "loss": 0.7006, |
| "num_input_tokens_seen": 3891968, |
| "step": 10330 |
| }, |
| { |
| "epoch": 9.260752688172044, |
| "grad_norm": 0.3241588771343231, |
| "learning_rate": 8.297349115388903e-07, |
| "loss": 0.6973, |
| "num_input_tokens_seen": 3893696, |
| "step": 10335 |
| }, |
| { |
| "epoch": 9.265232974910393, |
| "grad_norm": 0.4836924076080322, |
| "learning_rate": 8.197751950996619e-07, |
| "loss": 0.7073, |
| "num_input_tokens_seen": 3895616, |
| "step": 10340 |
| }, |
| { |
| "epoch": 9.269713261648745, |
| "grad_norm": 0.667323887348175, |
| "learning_rate": 8.098746193782813e-07, |
| "loss": 0.6645, |
| "num_input_tokens_seen": 3897472, |
| "step": 10345 |
| }, |
| { |
| "epoch": 9.274193548387096, |
| "grad_norm": 0.5213760733604431, |
| "learning_rate": 8.00033208589876e-07, |
| "loss": 0.6747, |
| "num_input_tokens_seen": 3899264, |
| "step": 10350 |
| }, |
| { |
| "epoch": 9.278673835125447, |
| "grad_norm": 0.8510650992393494, |
| "learning_rate": 7.902509868048552e-07, |
| "loss": 0.6796, |
| "num_input_tokens_seen": 3901376, |
| "step": 10355 |
| }, |
| { |
| "epoch": 9.283154121863799, |
| "grad_norm": 0.4620399475097656, |
| "learning_rate": 7.805279779488722e-07, |
| "loss": 0.6795, |
| "num_input_tokens_seen": 3903328, |
| "step": 10360 |
| }, |
| { |
| "epoch": 9.28763440860215, |
| "grad_norm": 0.5646343231201172, |
| "learning_rate": 7.708642058027571e-07, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 3905312, |
| "step": 10365 |
| }, |
| { |
| "epoch": 9.292114695340501, |
| "grad_norm": 0.5708273649215698, |
| "learning_rate": 7.61259694002453e-07, |
| "loss": 0.6887, |
| "num_input_tokens_seen": 3907200, |
| "step": 10370 |
| }, |
| { |
| "epoch": 9.296594982078853, |
| "grad_norm": 0.6553865671157837, |
| "learning_rate": 7.51714466038958e-07, |
| "loss": 0.6791, |
| "num_input_tokens_seen": 3909248, |
| "step": 10375 |
| }, |
| { |
| "epoch": 9.301075268817204, |
| "grad_norm": 0.5654320120811462, |
| "learning_rate": 7.422285452582805e-07, |
| "loss": 0.7134, |
| "num_input_tokens_seen": 3911168, |
| "step": 10380 |
| }, |
| { |
| "epoch": 9.305555555555555, |
| "grad_norm": 0.43947505950927734, |
| "learning_rate": 7.328019548613619e-07, |
| "loss": 0.7043, |
| "num_input_tokens_seen": 3912992, |
| "step": 10385 |
| }, |
| { |
| "epoch": 9.310035842293907, |
| "grad_norm": 0.7046077847480774, |
| "learning_rate": 7.234347179040507e-07, |
| "loss": 0.6869, |
| "num_input_tokens_seen": 3914784, |
| "step": 10390 |
| }, |
| { |
| "epoch": 9.314516129032258, |
| "grad_norm": 0.5752326250076294, |
| "learning_rate": 7.141268572970094e-07, |
| "loss": 0.6428, |
| "num_input_tokens_seen": 3916896, |
| "step": 10395 |
| }, |
| { |
| "epoch": 9.31899641577061, |
| "grad_norm": 0.5495961904525757, |
| "learning_rate": 7.048783958056804e-07, |
| "loss": 0.6925, |
| "num_input_tokens_seen": 3918688, |
| "step": 10400 |
| }, |
| { |
| "epoch": 9.32347670250896, |
| "grad_norm": 0.4289925694465637, |
| "learning_rate": 6.956893560502359e-07, |
| "loss": 0.706, |
| "num_input_tokens_seen": 3920512, |
| "step": 10405 |
| }, |
| { |
| "epoch": 9.327956989247312, |
| "grad_norm": 0.6946158409118652, |
| "learning_rate": 6.865597605054952e-07, |
| "loss": 0.6793, |
| "num_input_tokens_seen": 3922304, |
| "step": 10410 |
| }, |
| { |
| "epoch": 9.332437275985663, |
| "grad_norm": 0.4214330017566681, |
| "learning_rate": 6.774896315008994e-07, |
| "loss": 0.6784, |
| "num_input_tokens_seen": 3924384, |
| "step": 10415 |
| }, |
| { |
| "epoch": 9.336917562724015, |
| "grad_norm": 0.6280259490013123, |
| "learning_rate": 6.68478991220442e-07, |
| "loss": 0.6726, |
| "num_input_tokens_seen": 3926368, |
| "step": 10420 |
| }, |
| { |
| "epoch": 9.341397849462366, |
| "grad_norm": 0.5349095463752747, |
| "learning_rate": 6.595278617026163e-07, |
| "loss": 0.6903, |
| "num_input_tokens_seen": 3928288, |
| "step": 10425 |
| }, |
| { |
| "epoch": 9.345878136200717, |
| "grad_norm": 0.5612955689430237, |
| "learning_rate": 6.50636264840368e-07, |
| "loss": 0.7131, |
| "num_input_tokens_seen": 3930176, |
| "step": 10430 |
| }, |
| { |
| "epoch": 9.350358422939069, |
| "grad_norm": 0.5245122909545898, |
| "learning_rate": 6.418042223810234e-07, |
| "loss": 0.6809, |
| "num_input_tokens_seen": 3932224, |
| "step": 10435 |
| }, |
| { |
| "epoch": 9.35483870967742, |
| "grad_norm": 0.5770167112350464, |
| "learning_rate": 6.33031755926261e-07, |
| "loss": 0.6999, |
| "num_input_tokens_seen": 3934112, |
| "step": 10440 |
| }, |
| { |
| "epoch": 9.359318996415771, |
| "grad_norm": 0.46723732352256775, |
| "learning_rate": 6.243188869320377e-07, |
| "loss": 0.6972, |
| "num_input_tokens_seen": 3936096, |
| "step": 10445 |
| }, |
| { |
| "epoch": 9.363799283154123, |
| "grad_norm": 0.505517840385437, |
| "learning_rate": 6.156656367085539e-07, |
| "loss": 0.6911, |
| "num_input_tokens_seen": 3937952, |
| "step": 10450 |
| }, |
| { |
| "epoch": 9.368279569892474, |
| "grad_norm": 0.7766585350036621, |
| "learning_rate": 6.070720264201857e-07, |
| "loss": 0.6759, |
| "num_input_tokens_seen": 3939872, |
| "step": 10455 |
| }, |
| { |
| "epoch": 9.372759856630825, |
| "grad_norm": 0.6021184325218201, |
| "learning_rate": 5.985380770854476e-07, |
| "loss": 0.6598, |
| "num_input_tokens_seen": 3941920, |
| "step": 10460 |
| }, |
| { |
| "epoch": 9.377240143369175, |
| "grad_norm": 0.5606325268745422, |
| "learning_rate": 5.900638095769185e-07, |
| "loss": 0.6604, |
| "num_input_tokens_seen": 3943712, |
| "step": 10465 |
| }, |
| { |
| "epoch": 9.381720430107526, |
| "grad_norm": 0.4621894657611847, |
| "learning_rate": 5.816492446212213e-07, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 3945440, |
| "step": 10470 |
| }, |
| { |
| "epoch": 9.386200716845877, |
| "grad_norm": 0.681788444519043, |
| "learning_rate": 5.732944027989518e-07, |
| "loss": 0.6627, |
| "num_input_tokens_seen": 3947296, |
| "step": 10475 |
| }, |
| { |
| "epoch": 9.390681003584229, |
| "grad_norm": 0.7300588488578796, |
| "learning_rate": 5.649993045446305e-07, |
| "loss": 0.6905, |
| "num_input_tokens_seen": 3949152, |
| "step": 10480 |
| }, |
| { |
| "epoch": 9.39516129032258, |
| "grad_norm": 0.6331045031547546, |
| "learning_rate": 5.56763970146662e-07, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 3951008, |
| "step": 10485 |
| }, |
| { |
| "epoch": 9.399641577060931, |
| "grad_norm": 0.4790831506252289, |
| "learning_rate": 5.485884197472646e-07, |
| "loss": 0.6956, |
| "num_input_tokens_seen": 3952960, |
| "step": 10490 |
| }, |
| { |
| "epoch": 9.404121863799283, |
| "grad_norm": 0.4457530975341797, |
| "learning_rate": 5.404726733424514e-07, |
| "loss": 0.7039, |
| "num_input_tokens_seen": 3954752, |
| "step": 10495 |
| }, |
| { |
| "epoch": 9.408602150537634, |
| "grad_norm": 0.48740777373313904, |
| "learning_rate": 5.324167507819555e-07, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 3956736, |
| "step": 10500 |
| }, |
| { |
| "epoch": 9.413082437275985, |
| "grad_norm": 0.6508479714393616, |
| "learning_rate": 5.244206717691908e-07, |
| "loss": 0.6699, |
| "num_input_tokens_seen": 3958528, |
| "step": 10505 |
| }, |
| { |
| "epoch": 9.417562724014337, |
| "grad_norm": 0.5810533165931702, |
| "learning_rate": 5.164844558612131e-07, |
| "loss": 0.6726, |
| "num_input_tokens_seen": 3960672, |
| "step": 10510 |
| }, |
| { |
| "epoch": 9.422043010752688, |
| "grad_norm": 0.5137490630149841, |
| "learning_rate": 5.086081224686512e-07, |
| "loss": 0.6818, |
| "num_input_tokens_seen": 3962752, |
| "step": 10515 |
| }, |
| { |
| "epoch": 9.42652329749104, |
| "grad_norm": 0.49306225776672363, |
| "learning_rate": 5.007916908556814e-07, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 3964576, |
| "step": 10520 |
| }, |
| { |
| "epoch": 9.43100358422939, |
| "grad_norm": 0.6570764183998108, |
| "learning_rate": 4.930351801399641e-07, |
| "loss": 0.6514, |
| "num_input_tokens_seen": 3966432, |
| "step": 10525 |
| }, |
| { |
| "epoch": 9.435483870967742, |
| "grad_norm": 0.528634250164032, |
| "learning_rate": 4.853386092926044e-07, |
| "loss": 0.681, |
| "num_input_tokens_seen": 3968256, |
| "step": 10530 |
| }, |
| { |
| "epoch": 9.439964157706093, |
| "grad_norm": 0.5617717504501343, |
| "learning_rate": 4.77701997138108e-07, |
| "loss": 0.6681, |
| "num_input_tokens_seen": 3970048, |
| "step": 10535 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 0.46119117736816406, |
| "learning_rate": 4.701253623543289e-07, |
| "loss": 0.6593, |
| "num_input_tokens_seen": 3971840, |
| "step": 10540 |
| }, |
| { |
| "epoch": 9.448924731182796, |
| "grad_norm": 0.33854037523269653, |
| "learning_rate": 4.626087234724269e-07, |
| "loss": 0.6559, |
| "num_input_tokens_seen": 3973760, |
| "step": 10545 |
| }, |
| { |
| "epoch": 9.453405017921147, |
| "grad_norm": 0.6266850233078003, |
| "learning_rate": 4.5515209887682096e-07, |
| "loss": 0.6831, |
| "num_input_tokens_seen": 3975520, |
| "step": 10550 |
| }, |
| { |
| "epoch": 9.457885304659499, |
| "grad_norm": 0.6646607518196106, |
| "learning_rate": 4.477555068051476e-07, |
| "loss": 0.6685, |
| "num_input_tokens_seen": 3977376, |
| "step": 10555 |
| }, |
| { |
| "epoch": 9.46236559139785, |
| "grad_norm": 0.5167231559753418, |
| "learning_rate": 4.40418965348216e-07, |
| "loss": 0.662, |
| "num_input_tokens_seen": 3979232, |
| "step": 10560 |
| }, |
| { |
| "epoch": 9.466845878136201, |
| "grad_norm": 0.46271848678588867, |
| "learning_rate": 4.3314249244995884e-07, |
| "loss": 0.6683, |
| "num_input_tokens_seen": 3981024, |
| "step": 10565 |
| }, |
| { |
| "epoch": 9.471326164874553, |
| "grad_norm": 0.272670179605484, |
| "learning_rate": 4.259261059073871e-07, |
| "loss": 0.7004, |
| "num_input_tokens_seen": 3982816, |
| "step": 10570 |
| }, |
| { |
| "epoch": 9.475806451612904, |
| "grad_norm": 0.4650828242301941, |
| "learning_rate": 4.1876982337055725e-07, |
| "loss": 0.6686, |
| "num_input_tokens_seen": 3984736, |
| "step": 10575 |
| }, |
| { |
| "epoch": 9.480286738351255, |
| "grad_norm": 0.686195433139801, |
| "learning_rate": 4.1167366234251824e-07, |
| "loss": 0.6862, |
| "num_input_tokens_seen": 3986816, |
| "step": 10580 |
| }, |
| { |
| "epoch": 9.484767025089607, |
| "grad_norm": 0.594731330871582, |
| "learning_rate": 4.0463764017927565e-07, |
| "loss": 0.6631, |
| "num_input_tokens_seen": 3988640, |
| "step": 10585 |
| }, |
| { |
| "epoch": 9.489247311827956, |
| "grad_norm": 0.5093191266059875, |
| "learning_rate": 3.976617740897415e-07, |
| "loss": 0.7095, |
| "num_input_tokens_seen": 3990560, |
| "step": 10590 |
| }, |
| { |
| "epoch": 9.493727598566307, |
| "grad_norm": 0.7626033425331116, |
| "learning_rate": 3.907460811356956e-07, |
| "loss": 0.7275, |
| "num_input_tokens_seen": 3992416, |
| "step": 10595 |
| }, |
| { |
| "epoch": 9.498207885304659, |
| "grad_norm": 0.7179610133171082, |
| "learning_rate": 3.8389057823175754e-07, |
| "loss": 0.6966, |
| "num_input_tokens_seen": 3994240, |
| "step": 10600 |
| }, |
| { |
| "epoch": 9.5, |
| "eval_loss": 0.6984838843345642, |
| "eval_runtime": 5.6358, |
| "eval_samples_per_second": 88.009, |
| "eval_steps_per_second": 22.002, |
| "num_input_tokens_seen": 3994976, |
| "step": 10602 |
| }, |
| { |
| "epoch": 9.50268817204301, |
| "grad_norm": 0.5522060990333557, |
| "learning_rate": 3.7709528214530664e-07, |
| "loss": 0.7008, |
| "num_input_tokens_seen": 3996192, |
| "step": 10605 |
| }, |
| { |
| "epoch": 9.507168458781361, |
| "grad_norm": 0.5137274861335754, |
| "learning_rate": 3.7036020949648974e-07, |
| "loss": 0.6976, |
| "num_input_tokens_seen": 3998144, |
| "step": 10610 |
| }, |
| { |
| "epoch": 9.511648745519713, |
| "grad_norm": 0.5644457936286926, |
| "learning_rate": 3.636853767581494e-07, |
| "loss": 0.697, |
| "num_input_tokens_seen": 4000160, |
| "step": 10615 |
| }, |
| { |
| "epoch": 9.516129032258064, |
| "grad_norm": 0.6362537741661072, |
| "learning_rate": 3.5707080025579045e-07, |
| "loss": 0.7134, |
| "num_input_tokens_seen": 4001888, |
| "step": 10620 |
| }, |
| { |
| "epoch": 9.520609318996415, |
| "grad_norm": 0.45133867859840393, |
| "learning_rate": 3.5051649616754114e-07, |
| "loss": 0.6981, |
| "num_input_tokens_seen": 4003680, |
| "step": 10625 |
| }, |
| { |
| "epoch": 9.525089605734767, |
| "grad_norm": 0.7299519181251526, |
| "learning_rate": 3.440224805241171e-07, |
| "loss": 0.6643, |
| "num_input_tokens_seen": 4005632, |
| "step": 10630 |
| }, |
| { |
| "epoch": 9.529569892473118, |
| "grad_norm": 0.5302301645278931, |
| "learning_rate": 3.3758876920877147e-07, |
| "loss": 0.7006, |
| "num_input_tokens_seen": 4007296, |
| "step": 10635 |
| }, |
| { |
| "epoch": 9.53405017921147, |
| "grad_norm": 0.5000677704811096, |
| "learning_rate": 3.312153779572724e-07, |
| "loss": 0.6945, |
| "num_input_tokens_seen": 4009120, |
| "step": 10640 |
| }, |
| { |
| "epoch": 9.53853046594982, |
| "grad_norm": 0.5188679099082947, |
| "learning_rate": 3.249023223578479e-07, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 4010944, |
| "step": 10645 |
| }, |
| { |
| "epoch": 9.543010752688172, |
| "grad_norm": 0.48496314883232117, |
| "learning_rate": 3.1864961785116054e-07, |
| "loss": 0.696, |
| "num_input_tokens_seen": 4012832, |
| "step": 10650 |
| }, |
| { |
| "epoch": 9.547491039426523, |
| "grad_norm": 0.5330994129180908, |
| "learning_rate": 3.124572797302661e-07, |
| "loss": 0.7141, |
| "num_input_tokens_seen": 4014688, |
| "step": 10655 |
| }, |
| { |
| "epoch": 9.551971326164875, |
| "grad_norm": 0.5689115524291992, |
| "learning_rate": 3.063253231405605e-07, |
| "loss": 0.709, |
| "num_input_tokens_seen": 4016512, |
| "step": 10660 |
| }, |
| { |
| "epoch": 9.556451612903226, |
| "grad_norm": 0.4524476230144501, |
| "learning_rate": 3.002537630797747e-07, |
| "loss": 0.655, |
| "num_input_tokens_seen": 4018368, |
| "step": 10665 |
| }, |
| { |
| "epoch": 9.560931899641577, |
| "grad_norm": 0.6236125230789185, |
| "learning_rate": 2.9424261439791323e-07, |
| "loss": 0.6903, |
| "num_input_tokens_seen": 4020096, |
| "step": 10670 |
| }, |
| { |
| "epoch": 9.565412186379929, |
| "grad_norm": 0.4066424071788788, |
| "learning_rate": 2.8829189179721547e-07, |
| "loss": 0.6701, |
| "num_input_tokens_seen": 4022048, |
| "step": 10675 |
| }, |
| { |
| "epoch": 9.56989247311828, |
| "grad_norm": 0.8376783132553101, |
| "learning_rate": 2.824016098321447e-07, |
| "loss": 0.6913, |
| "num_input_tokens_seen": 4023936, |
| "step": 10680 |
| }, |
| { |
| "epoch": 9.574372759856631, |
| "grad_norm": 0.7089189887046814, |
| "learning_rate": 2.7657178290932396e-07, |
| "loss": 0.6963, |
| "num_input_tokens_seen": 4025984, |
| "step": 10685 |
| }, |
| { |
| "epoch": 9.578853046594983, |
| "grad_norm": 0.6390495896339417, |
| "learning_rate": 2.7080242528751964e-07, |
| "loss": 0.6971, |
| "num_input_tokens_seen": 4027808, |
| "step": 10690 |
| }, |
| { |
| "epoch": 9.583333333333334, |
| "grad_norm": 0.5339614152908325, |
| "learning_rate": 2.650935510776026e-07, |
| "loss": 0.7024, |
| "num_input_tokens_seen": 4029632, |
| "step": 10695 |
| }, |
| { |
| "epoch": 9.587813620071685, |
| "grad_norm": 0.47523030638694763, |
| "learning_rate": 2.594451742425036e-07, |
| "loss": 0.6794, |
| "num_input_tokens_seen": 4031520, |
| "step": 10700 |
| }, |
| { |
| "epoch": 9.592293906810037, |
| "grad_norm": 0.49205970764160156, |
| "learning_rate": 2.538573085971968e-07, |
| "loss": 0.6579, |
| "num_input_tokens_seen": 4033568, |
| "step": 10705 |
| }, |
| { |
| "epoch": 9.596774193548388, |
| "grad_norm": 0.6642840504646301, |
| "learning_rate": 2.4832996780864704e-07, |
| "loss": 0.6687, |
| "num_input_tokens_seen": 4035424, |
| "step": 10710 |
| }, |
| { |
| "epoch": 9.601254480286737, |
| "grad_norm": 0.5492425560951233, |
| "learning_rate": 2.42863165395793e-07, |
| "loss": 0.6616, |
| "num_input_tokens_seen": 4037376, |
| "step": 10715 |
| }, |
| { |
| "epoch": 9.60573476702509, |
| "grad_norm": 0.6824979186058044, |
| "learning_rate": 2.3745691472950026e-07, |
| "loss": 0.7162, |
| "num_input_tokens_seen": 4039264, |
| "step": 10720 |
| }, |
| { |
| "epoch": 9.61021505376344, |
| "grad_norm": 0.6007011532783508, |
| "learning_rate": 2.3211122903254167e-07, |
| "loss": 0.6801, |
| "num_input_tokens_seen": 4040992, |
| "step": 10725 |
| }, |
| { |
| "epoch": 9.614695340501791, |
| "grad_norm": 0.32746192812919617, |
| "learning_rate": 2.2682612137955307e-07, |
| "loss": 0.6677, |
| "num_input_tokens_seen": 4042848, |
| "step": 10730 |
| }, |
| { |
| "epoch": 9.619175627240143, |
| "grad_norm": 0.4873063266277313, |
| "learning_rate": 2.2160160469701097e-07, |
| "loss": 0.6752, |
| "num_input_tokens_seen": 4044608, |
| "step": 10735 |
| }, |
| { |
| "epoch": 9.623655913978494, |
| "grad_norm": 0.5440759062767029, |
| "learning_rate": 2.1643769176319385e-07, |
| "loss": 0.672, |
| "num_input_tokens_seen": 4046528, |
| "step": 10740 |
| }, |
| { |
| "epoch": 9.628136200716845, |
| "grad_norm": 0.5048784017562866, |
| "learning_rate": 2.1133439520815423e-07, |
| "loss": 0.723, |
| "num_input_tokens_seen": 4048448, |
| "step": 10745 |
| }, |
| { |
| "epoch": 9.632616487455197, |
| "grad_norm": 0.48166608810424805, |
| "learning_rate": 2.062917275136883e-07, |
| "loss": 0.6671, |
| "num_input_tokens_seen": 4050304, |
| "step": 10750 |
| }, |
| { |
| "epoch": 9.637096774193548, |
| "grad_norm": 0.5569744110107422, |
| "learning_rate": 2.0130970101330527e-07, |
| "loss": 0.7106, |
| "num_input_tokens_seen": 4052224, |
| "step": 10755 |
| }, |
| { |
| "epoch": 9.6415770609319, |
| "grad_norm": 0.4456222951412201, |
| "learning_rate": 1.963883278921913e-07, |
| "loss": 0.692, |
| "num_input_tokens_seen": 4054208, |
| "step": 10760 |
| }, |
| { |
| "epoch": 9.64605734767025, |
| "grad_norm": 0.5037098526954651, |
| "learning_rate": 1.9152762018719017e-07, |
| "loss": 0.6757, |
| "num_input_tokens_seen": 4056192, |
| "step": 10765 |
| }, |
| { |
| "epoch": 9.650537634408602, |
| "grad_norm": 0.47247257828712463, |
| "learning_rate": 1.867275897867643e-07, |
| "loss": 0.681, |
| "num_input_tokens_seen": 4058208, |
| "step": 10770 |
| }, |
| { |
| "epoch": 9.655017921146953, |
| "grad_norm": 0.5090415477752686, |
| "learning_rate": 1.819882484309754e-07, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 4060096, |
| "step": 10775 |
| }, |
| { |
| "epoch": 9.659498207885305, |
| "grad_norm": 0.5318375825881958, |
| "learning_rate": 1.773096077114428e-07, |
| "loss": 0.7078, |
| "num_input_tokens_seen": 4062016, |
| "step": 10780 |
| }, |
| { |
| "epoch": 9.663978494623656, |
| "grad_norm": 0.5755466222763062, |
| "learning_rate": 1.7269167907132954e-07, |
| "loss": 0.6475, |
| "num_input_tokens_seen": 4063808, |
| "step": 10785 |
| }, |
| { |
| "epoch": 9.668458781362007, |
| "grad_norm": 0.3897082209587097, |
| "learning_rate": 1.681344738053009e-07, |
| "loss": 0.7167, |
| "num_input_tokens_seen": 4065600, |
| "step": 10790 |
| }, |
| { |
| "epoch": 9.672939068100359, |
| "grad_norm": 0.5163991451263428, |
| "learning_rate": 1.636380030595075e-07, |
| "loss": 0.6564, |
| "num_input_tokens_seen": 4067488, |
| "step": 10795 |
| }, |
| { |
| "epoch": 9.67741935483871, |
| "grad_norm": 0.3706883192062378, |
| "learning_rate": 1.5920227783155217e-07, |
| "loss": 0.703, |
| "num_input_tokens_seen": 4069312, |
| "step": 10800 |
| }, |
| { |
| "epoch": 9.681899641577061, |
| "grad_norm": 0.554594874382019, |
| "learning_rate": 1.5482730897046216e-07, |
| "loss": 0.6392, |
| "num_input_tokens_seen": 4071104, |
| "step": 10805 |
| }, |
| { |
| "epoch": 9.686379928315413, |
| "grad_norm": 0.631853461265564, |
| "learning_rate": 1.5051310717666967e-07, |
| "loss": 0.689, |
| "num_input_tokens_seen": 4073184, |
| "step": 10810 |
| }, |
| { |
| "epoch": 9.690860215053764, |
| "grad_norm": 0.6092721819877625, |
| "learning_rate": 1.4625968300197857e-07, |
| "loss": 0.7235, |
| "num_input_tokens_seen": 4075072, |
| "step": 10815 |
| }, |
| { |
| "epoch": 9.695340501792115, |
| "grad_norm": 0.598450779914856, |
| "learning_rate": 1.4206704684953943e-07, |
| "loss": 0.6993, |
| "num_input_tokens_seen": 4077024, |
| "step": 10820 |
| }, |
| { |
| "epoch": 9.699820788530467, |
| "grad_norm": 0.8374021053314209, |
| "learning_rate": 1.3793520897383006e-07, |
| "loss": 0.6696, |
| "num_input_tokens_seen": 4078944, |
| "step": 10825 |
| }, |
| { |
| "epoch": 9.704301075268818, |
| "grad_norm": 0.6128981709480286, |
| "learning_rate": 1.3386417948061947e-07, |
| "loss": 0.6869, |
| "num_input_tokens_seen": 4080704, |
| "step": 10830 |
| }, |
| { |
| "epoch": 9.70878136200717, |
| "grad_norm": 0.6915923953056335, |
| "learning_rate": 1.2985396832695674e-07, |
| "loss": 0.6667, |
| "num_input_tokens_seen": 4082432, |
| "step": 10835 |
| }, |
| { |
| "epoch": 9.713261648745519, |
| "grad_norm": 0.43707898259162903, |
| "learning_rate": 1.259045853211349e-07, |
| "loss": 0.6825, |
| "num_input_tokens_seen": 4084320, |
| "step": 10840 |
| }, |
| { |
| "epoch": 9.717741935483872, |
| "grad_norm": 0.5835666060447693, |
| "learning_rate": 1.2201604012267442e-07, |
| "loss": 0.6845, |
| "num_input_tokens_seen": 4086240, |
| "step": 10845 |
| }, |
| { |
| "epoch": 9.722222222222221, |
| "grad_norm": 0.5008912682533264, |
| "learning_rate": 1.1818834224229525e-07, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 4088096, |
| "step": 10850 |
| }, |
| { |
| "epoch": 9.726702508960573, |
| "grad_norm": 0.6731480956077576, |
| "learning_rate": 1.1442150104189198e-07, |
| "loss": 0.6958, |
| "num_input_tokens_seen": 4089888, |
| "step": 10855 |
| }, |
| { |
| "epoch": 9.731182795698924, |
| "grad_norm": 0.5034038424491882, |
| "learning_rate": 1.1071552573452271e-07, |
| "loss": 0.6914, |
| "num_input_tokens_seen": 4091744, |
| "step": 10860 |
| }, |
| { |
| "epoch": 9.735663082437275, |
| "grad_norm": 0.530077338218689, |
| "learning_rate": 1.0707042538437018e-07, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 4093408, |
| "step": 10865 |
| }, |
| { |
| "epoch": 9.740143369175627, |
| "grad_norm": 0.32035142183303833, |
| "learning_rate": 1.0348620890673067e-07, |
| "loss": 0.6572, |
| "num_input_tokens_seen": 4095392, |
| "step": 10870 |
| }, |
| { |
| "epoch": 9.744623655913978, |
| "grad_norm": 0.5076518058776855, |
| "learning_rate": 9.9962885067989e-08, |
| "loss": 0.6863, |
| "num_input_tokens_seen": 4097280, |
| "step": 10875 |
| }, |
| { |
| "epoch": 9.74910394265233, |
| "grad_norm": 0.508080780506134, |
| "learning_rate": 9.650046248559363e-08, |
| "loss": 0.6726, |
| "num_input_tokens_seen": 4099360, |
| "step": 10880 |
| }, |
| { |
| "epoch": 9.75358422939068, |
| "grad_norm": 0.5304430723190308, |
| "learning_rate": 9.309894962804267e-08, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 4101376, |
| "step": 10885 |
| }, |
| { |
| "epoch": 9.758064516129032, |
| "grad_norm": 0.5922977328300476, |
| "learning_rate": 8.975835481485895e-08, |
| "loss": 0.7022, |
| "num_input_tokens_seen": 4103296, |
| "step": 10890 |
| }, |
| { |
| "epoch": 9.762544802867383, |
| "grad_norm": 0.65008145570755, |
| "learning_rate": 8.647868621656785e-08, |
| "loss": 0.6829, |
| "num_input_tokens_seen": 4105248, |
| "step": 10895 |
| }, |
| { |
| "epoch": 9.767025089605735, |
| "grad_norm": 0.6050592064857483, |
| "learning_rate": 8.325995185468339e-08, |
| "loss": 0.6834, |
| "num_input_tokens_seen": 4107072, |
| "step": 10900 |
| }, |
| { |
| "epoch": 9.771505376344086, |
| "grad_norm": 0.3722684979438782, |
| "learning_rate": 8.010215960168044e-08, |
| "loss": 0.7032, |
| "num_input_tokens_seen": 4108768, |
| "step": 10905 |
| }, |
| { |
| "epoch": 9.775985663082437, |
| "grad_norm": 0.5007041692733765, |
| "learning_rate": 7.700531718098092e-08, |
| "loss": 0.6709, |
| "num_input_tokens_seen": 4110624, |
| "step": 10910 |
| }, |
| { |
| "epoch": 9.780465949820789, |
| "grad_norm": 0.5899874567985535, |
| "learning_rate": 7.396943216693708e-08, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 4112352, |
| "step": 10915 |
| }, |
| { |
| "epoch": 9.78494623655914, |
| "grad_norm": 0.5524150133132935, |
| "learning_rate": 7.099451198480378e-08, |
| "loss": 0.6927, |
| "num_input_tokens_seen": 4114144, |
| "step": 10920 |
| }, |
| { |
| "epoch": 9.789426523297491, |
| "grad_norm": 0.5737338662147522, |
| "learning_rate": 6.808056391073569e-08, |
| "loss": 0.6877, |
| "num_input_tokens_seen": 4115872, |
| "step": 10925 |
| }, |
| { |
| "epoch": 9.793906810035843, |
| "grad_norm": 0.5014054179191589, |
| "learning_rate": 6.522759507175124e-08, |
| "loss": 0.6803, |
| "num_input_tokens_seen": 4117984, |
| "step": 10930 |
| }, |
| { |
| "epoch": 9.798387096774194, |
| "grad_norm": 0.5102945566177368, |
| "learning_rate": 6.243561244572427e-08, |
| "loss": 0.6793, |
| "num_input_tokens_seen": 4119968, |
| "step": 10935 |
| }, |
| { |
| "epoch": 9.802867383512545, |
| "grad_norm": 0.7418368458747864, |
| "learning_rate": 5.970462286137291e-08, |
| "loss": 0.705, |
| "num_input_tokens_seen": 4122048, |
| "step": 10940 |
| }, |
| { |
| "epoch": 9.807347670250897, |
| "grad_norm": 0.728233814239502, |
| "learning_rate": 5.7034632998231865e-08, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 4124032, |
| "step": 10945 |
| }, |
| { |
| "epoch": 9.811827956989248, |
| "grad_norm": 0.4803222417831421, |
| "learning_rate": 5.4425649386644075e-08, |
| "loss": 0.6633, |
| "num_input_tokens_seen": 4125984, |
| "step": 10950 |
| }, |
| { |
| "epoch": 9.8163082437276, |
| "grad_norm": 0.4849235415458679, |
| "learning_rate": 5.187767840773849e-08, |
| "loss": 0.6833, |
| "num_input_tokens_seen": 4128032, |
| "step": 10955 |
| }, |
| { |
| "epoch": 9.82078853046595, |
| "grad_norm": 0.7425673604011536, |
| "learning_rate": 4.939072629341901e-08, |
| "loss": 0.6912, |
| "num_input_tokens_seen": 4129792, |
| "step": 10960 |
| }, |
| { |
| "epoch": 9.825268817204302, |
| "grad_norm": 0.5853078365325928, |
| "learning_rate": 4.696479912634499e-08, |
| "loss": 0.708, |
| "num_input_tokens_seen": 4131808, |
| "step": 10965 |
| }, |
| { |
| "epoch": 9.829749103942653, |
| "grad_norm": 0.6256317496299744, |
| "learning_rate": 4.459990283992577e-08, |
| "loss": 0.6856, |
| "num_input_tokens_seen": 4133696, |
| "step": 10970 |
| }, |
| { |
| "epoch": 9.834229390681003, |
| "grad_norm": 0.5935572981834412, |
| "learning_rate": 4.229604321829561e-08, |
| "loss": 0.6775, |
| "num_input_tokens_seen": 4135616, |
| "step": 10975 |
| }, |
| { |
| "epoch": 9.838709677419354, |
| "grad_norm": 0.6854491233825684, |
| "learning_rate": 4.0053225896299894e-08, |
| "loss": 0.6769, |
| "num_input_tokens_seen": 4137472, |
| "step": 10980 |
| }, |
| { |
| "epoch": 9.843189964157705, |
| "grad_norm": 0.5285246968269348, |
| "learning_rate": 3.787145635948952e-08, |
| "loss": 0.7279, |
| "num_input_tokens_seen": 4139328, |
| "step": 10985 |
| }, |
| { |
| "epoch": 9.847670250896057, |
| "grad_norm": 0.5252229571342468, |
| "learning_rate": 3.575073994410427e-08, |
| "loss": 0.6765, |
| "num_input_tokens_seen": 4141216, |
| "step": 10990 |
| }, |
| { |
| "epoch": 9.852150537634408, |
| "grad_norm": 0.31370288133621216, |
| "learning_rate": 3.369108183705339e-08, |
| "loss": 0.697, |
| "num_input_tokens_seen": 4142976, |
| "step": 10995 |
| }, |
| { |
| "epoch": 9.85663082437276, |
| "grad_norm": 0.42567935585975647, |
| "learning_rate": 3.169248707590999e-08, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 4144672, |
| "step": 11000 |
| }, |
| { |
| "epoch": 9.86111111111111, |
| "grad_norm": 0.48273128271102905, |
| "learning_rate": 2.975496054889726e-08, |
| "loss": 0.706, |
| "num_input_tokens_seen": 4146496, |
| "step": 11005 |
| }, |
| { |
| "epoch": 9.865591397849462, |
| "grad_norm": 0.7682796716690063, |
| "learning_rate": 2.7878506994877263e-08, |
| "loss": 0.7025, |
| "num_input_tokens_seen": 4148320, |
| "step": 11010 |
| }, |
| { |
| "epoch": 9.870071684587813, |
| "grad_norm": 0.3882335424423218, |
| "learning_rate": 2.6063131003337126e-08, |
| "loss": 0.6872, |
| "num_input_tokens_seen": 4150176, |
| "step": 11015 |
| }, |
| { |
| "epoch": 9.874551971326165, |
| "grad_norm": 0.5765259265899658, |
| "learning_rate": 2.4308837014372366e-08, |
| "loss": 0.6708, |
| "num_input_tokens_seen": 4152000, |
| "step": 11020 |
| }, |
| { |
| "epoch": 9.879032258064516, |
| "grad_norm": 0.5910921692848206, |
| "learning_rate": 2.2615629318692434e-08, |
| "loss": 0.697, |
| "num_input_tokens_seen": 4153984, |
| "step": 11025 |
| }, |
| { |
| "epoch": 9.883512544802867, |
| "grad_norm": 0.7944561839103699, |
| "learning_rate": 2.0983512057595743e-08, |
| "loss": 0.6893, |
| "num_input_tokens_seen": 4155904, |
| "step": 11030 |
| }, |
| { |
| "epoch": 9.887992831541219, |
| "grad_norm": 0.7031881213188171, |
| "learning_rate": 1.941248922296135e-08, |
| "loss": 0.6565, |
| "num_input_tokens_seen": 4157760, |
| "step": 11035 |
| }, |
| { |
| "epoch": 9.89247311827957, |
| "grad_norm": 0.5116055607795715, |
| "learning_rate": 1.7902564657246158e-08, |
| "loss": 0.7064, |
| "num_input_tokens_seen": 4159584, |
| "step": 11040 |
| }, |
| { |
| "epoch": 9.896953405017921, |
| "grad_norm": 0.5304086208343506, |
| "learning_rate": 1.6453742053465504e-08, |
| "loss": 0.706, |
| "num_input_tokens_seen": 4161472, |
| "step": 11045 |
| }, |
| { |
| "epoch": 9.901433691756273, |
| "grad_norm": 0.8019806146621704, |
| "learning_rate": 1.506602495519316e-08, |
| "loss": 0.7026, |
| "num_input_tokens_seen": 4163328, |
| "step": 11050 |
| }, |
| { |
| "epoch": 9.905913978494624, |
| "grad_norm": 0.6591276526451111, |
| "learning_rate": 1.3739416756555768e-08, |
| "loss": 0.6832, |
| "num_input_tokens_seen": 4165376, |
| "step": 11055 |
| }, |
| { |
| "epoch": 9.910394265232975, |
| "grad_norm": 0.39249947667121887, |
| "learning_rate": 1.2473920702202325e-08, |
| "loss": 0.6374, |
| "num_input_tokens_seen": 4167168, |
| "step": 11060 |
| }, |
| { |
| "epoch": 9.914874551971327, |
| "grad_norm": 0.534702479839325, |
| "learning_rate": 1.126953988732915e-08, |
| "loss": 0.7048, |
| "num_input_tokens_seen": 4169056, |
| "step": 11065 |
| }, |
| { |
| "epoch": 9.919354838709678, |
| "grad_norm": 0.4636304974555969, |
| "learning_rate": 1.0126277257641037e-08, |
| "loss": 0.6804, |
| "num_input_tokens_seen": 4170976, |
| "step": 11070 |
| }, |
| { |
| "epoch": 9.92383512544803, |
| "grad_norm": 0.5376958250999451, |
| "learning_rate": 9.044135609365124e-09, |
| "loss": 0.7069, |
| "num_input_tokens_seen": 4172704, |
| "step": 11075 |
| }, |
| { |
| "epoch": 9.92831541218638, |
| "grad_norm": 0.5938841700553894, |
| "learning_rate": 8.023117589237017e-09, |
| "loss": 0.6862, |
| "num_input_tokens_seen": 4174688, |
| "step": 11080 |
| }, |
| { |
| "epoch": 9.932795698924732, |
| "grad_norm": 0.6230871677398682, |
| "learning_rate": 7.06322569449247e-09, |
| "loss": 0.6972, |
| "num_input_tokens_seen": 4176480, |
| "step": 11085 |
| }, |
| { |
| "epoch": 9.937275985663083, |
| "grad_norm": 0.5930041074752808, |
| "learning_rate": 6.164462272864602e-09, |
| "loss": 0.6864, |
| "num_input_tokens_seen": 4178432, |
| "step": 11090 |
| }, |
| { |
| "epoch": 9.941756272401435, |
| "grad_norm": 0.7413445115089417, |
| "learning_rate": 5.326829522578347e-09, |
| "loss": 0.7061, |
| "num_input_tokens_seen": 4180256, |
| "step": 11095 |
| }, |
| { |
| "epoch": 9.946236559139784, |
| "grad_norm": 0.5409526824951172, |
| "learning_rate": 4.5503294923338044e-09, |
| "loss": 0.6698, |
| "num_input_tokens_seen": 4182144, |
| "step": 11100 |
| }, |
| { |
| "epoch": 9.950716845878135, |
| "grad_norm": 0.4235214591026306, |
| "learning_rate": 3.834964081325665e-09, |
| "loss": 0.6779, |
| "num_input_tokens_seen": 4184064, |
| "step": 11105 |
| }, |
| { |
| "epoch": 9.955197132616487, |
| "grad_norm": 0.6128969192504883, |
| "learning_rate": 3.1807350392099033e-09, |
| "loss": 0.6741, |
| "num_input_tokens_seen": 4185888, |
| "step": 11110 |
| }, |
| { |
| "epoch": 9.959677419354838, |
| "grad_norm": 0.7141082286834717, |
| "learning_rate": 2.58764396612321e-09, |
| "loss": 0.7148, |
| "num_input_tokens_seen": 4187808, |
| "step": 11115 |
| }, |
| { |
| "epoch": 9.96415770609319, |
| "grad_norm": 0.5248094201087952, |
| "learning_rate": 2.0556923126663353e-09, |
| "loss": 0.7143, |
| "num_input_tokens_seen": 4189440, |
| "step": 11120 |
| }, |
| { |
| "epoch": 9.96863799283154, |
| "grad_norm": 0.566316545009613, |
| "learning_rate": 1.5848813798985396e-09, |
| "loss": 0.7122, |
| "num_input_tokens_seen": 4191328, |
| "step": 11125 |
| }, |
| { |
| "epoch": 9.973118279569892, |
| "grad_norm": 0.5793007612228394, |
| "learning_rate": 1.1752123193459197e-09, |
| "loss": 0.7101, |
| "num_input_tokens_seen": 4193120, |
| "step": 11130 |
| }, |
| { |
| "epoch": 9.977598566308243, |
| "grad_norm": 0.45351698994636536, |
| "learning_rate": 8.266861329903064e-10, |
| "loss": 0.6819, |
| "num_input_tokens_seen": 4195200, |
| "step": 11135 |
| }, |
| { |
| "epoch": 9.982078853046595, |
| "grad_norm": 0.31627628207206726, |
| "learning_rate": 5.393036732637136e-10, |
| "loss": 0.7049, |
| "num_input_tokens_seen": 4197024, |
| "step": 11140 |
| }, |
| { |
| "epoch": 9.986559139784946, |
| "grad_norm": 0.8334797024726868, |
| "learning_rate": 3.130656430594403e-10, |
| "loss": 0.7084, |
| "num_input_tokens_seen": 4198976, |
| "step": 11145 |
| }, |
| { |
| "epoch": 9.991039426523297, |
| "grad_norm": 0.5872827768325806, |
| "learning_rate": 1.4797259571541767e-10, |
| "loss": 0.717, |
| "num_input_tokens_seen": 4200832, |
| "step": 11150 |
| }, |
| { |
| "epoch": 9.995519713261649, |
| "grad_norm": 0.6502652764320374, |
| "learning_rate": 4.402493501975968e-11, |
| "loss": 0.6907, |
| "num_input_tokens_seen": 4202656, |
| "step": 11155 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.1099315881729126, |
| "learning_rate": 1.2229152107634533e-12, |
| "loss": 0.6937, |
| "num_input_tokens_seen": 4204168, |
| "step": 11160 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.699965238571167, |
| "eval_runtime": 5.6351, |
| "eval_samples_per_second": 88.02, |
| "eval_steps_per_second": 22.005, |
| "num_input_tokens_seen": 4204168, |
| "step": 11160 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 4204168, |
| "step": 11160, |
| "total_flos": 1.8931178489059738e+17, |
| "train_loss": 1.13652567756646, |
| "train_runtime": 1257.9718, |
| "train_samples_per_second": 35.462, |
| "train_steps_per_second": 8.871 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 11160, |
| "num_input_tokens_seen": 4204168, |
| "num_train_epochs": 10, |
| "save_steps": 558, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.8931178489059738e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|