diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11158 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 123603, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024271255552049706, + "grad_norm": 15.667192459106445, + "learning_rate": 1.6018121511204597e-06, + "loss": 6.3579, + "mean_token_accuracy": 0.16570602521300315, + "num_tokens": 286328.0, + "step": 100 + }, + { + "epoch": 0.004854251110409941, + "grad_norm": 12.743103981018066, + "learning_rate": 3.219804222959308e-06, + "loss": 4.8459, + "mean_token_accuracy": 0.23822063744068145, + "num_tokens": 566683.0, + "step": 200 + }, + { + "epoch": 0.007281376665614912, + "grad_norm": 9.71696662902832, + "learning_rate": 4.837796294798156e-06, + "loss": 3.9201, + "mean_token_accuracy": 0.3217995056509972, + "num_tokens": 859370.0, + "step": 300 + }, + { + "epoch": 0.009708502220819882, + "grad_norm": 7.237846851348877, + "learning_rate": 6.455788366637004e-06, + "loss": 3.425, + "mean_token_accuracy": 0.37329041689634324, + "num_tokens": 1148220.0, + "step": 400 + }, + { + "epoch": 0.012135627776024853, + "grad_norm": 7.680373668670654, + "learning_rate": 8.073780438475852e-06, + "loss": 3.1271, + "mean_token_accuracy": 0.4050374338030815, + "num_tokens": 1425964.0, + "step": 500 + }, + { + "epoch": 0.014562753331229824, + "grad_norm": 5.841236114501953, + "learning_rate": 9.6917725103147e-06, + "loss": 2.8971, + "mean_token_accuracy": 0.43935800462961194, + "num_tokens": 1691169.0, + "step": 600 + }, + { + "epoch": 0.016989878886434794, + "grad_norm": 6.139180660247803, + "learning_rate": 1.1309764582153549e-05, + "loss": 2.7676, + "mean_token_accuracy": 0.4539221778512001, + "num_tokens": 1971620.0, + "step": 700 + }, + { + "epoch": 0.019417004441639765, + "grad_norm": 7.90980863571167, + "learning_rate": 1.2927756653992395e-05, + "loss": 2.6594, + "mean_token_accuracy": 0.4727063673734665, + "num_tokens": 2256913.0, + "step": 800 + }, + { + "epoch": 0.021844129996844736, + "grad_norm": 6.0803985595703125, + "learning_rate": 1.4545748725831243e-05, + "loss": 2.5591, + "mean_token_accuracy": 0.48720634669065477, + "num_tokens": 2539493.0, + "step": 900 + }, + { + "epoch": 0.024271255552049707, + "grad_norm": 5.513032913208008, + "learning_rate": 1.6163740797670095e-05, + "loss": 2.5159, + "mean_token_accuracy": 0.4934208956360817, + "num_tokens": 2833735.0, + "step": 1000 + }, + { + "epoch": 0.026698381107254678, + "grad_norm": 5.765513896942139, + "learning_rate": 1.778173286950894e-05, + "loss": 2.4417, + "mean_token_accuracy": 0.5049500033259392, + "num_tokens": 3107550.0, + "step": 1100 + }, + { + "epoch": 0.02912550666245965, + "grad_norm": 4.920138835906982, + "learning_rate": 1.9399724941347788e-05, + "loss": 2.3992, + "mean_token_accuracy": 0.5102156156301498, + "num_tokens": 3386017.0, + "step": 1200 + }, + { + "epoch": 0.03155263221766462, + "grad_norm": 5.399600505828857, + "learning_rate": 2.1017717013186636e-05, + "loss": 2.3613, + "mean_token_accuracy": 0.5177968215942382, + "num_tokens": 3677046.0, + "step": 1300 + }, + { + "epoch": 0.03397975777286959, + "grad_norm": 7.163160800933838, + "learning_rate": 2.2635709085025484e-05, + "loss": 2.291, + "mean_token_accuracy": 0.5300437113642693, + "num_tokens": 3950955.0, + "step": 1400 + }, + { + "epoch": 0.03640688332807456, + "grad_norm": 4.863125324249268, + "learning_rate": 2.4253701156864333e-05, + "loss": 2.2754, + "mean_token_accuracy": 0.5317086404561997, + "num_tokens": 4235827.0, + "step": 1500 + }, + { + "epoch": 0.03883400888327953, + "grad_norm": 5.184915542602539, + "learning_rate": 2.587169322870318e-05, + "loss": 2.2208, + "mean_token_accuracy": 0.5397129538655281, + "num_tokens": 4523618.0, + "step": 1600 + }, + { + "epoch": 0.041261134438484504, + "grad_norm": 4.9580841064453125, + "learning_rate": 2.748968530054203e-05, + "loss": 2.1542, + "mean_token_accuracy": 0.552655583024025, + "num_tokens": 4806914.0, + "step": 1700 + }, + { + "epoch": 0.04368825999368947, + "grad_norm": 4.563178062438965, + "learning_rate": 2.9107677372380877e-05, + "loss": 2.1012, + "mean_token_accuracy": 0.5598031800985336, + "num_tokens": 5070432.0, + "step": 1800 + }, + { + "epoch": 0.046115385548894446, + "grad_norm": 5.044736862182617, + "learning_rate": 3.0725669444219726e-05, + "loss": 2.0686, + "mean_token_accuracy": 0.5646976864337921, + "num_tokens": 5348835.0, + "step": 1900 + }, + { + "epoch": 0.04854251110409941, + "grad_norm": 4.853013038635254, + "learning_rate": 3.234366151605857e-05, + "loss": 2.0491, + "mean_token_accuracy": 0.5689935937523842, + "num_tokens": 5644954.0, + "step": 2000 + }, + { + "epoch": 0.05096963665930439, + "grad_norm": 4.354330062866211, + "learning_rate": 3.3961653587897415e-05, + "loss": 2.0365, + "mean_token_accuracy": 0.5719451946020127, + "num_tokens": 5919205.0, + "step": 2100 + }, + { + "epoch": 0.053396762214509355, + "grad_norm": 5.173582553863525, + "learning_rate": 3.5579645659736274e-05, + "loss": 1.9987, + "mean_token_accuracy": 0.5776787069439888, + "num_tokens": 6201450.0, + "step": 2200 + }, + { + "epoch": 0.05582388776971433, + "grad_norm": 4.418239116668701, + "learning_rate": 3.719763773157512e-05, + "loss": 1.9778, + "mean_token_accuracy": 0.5795711416006089, + "num_tokens": 6485924.0, + "step": 2300 + }, + { + "epoch": 0.0582510133249193, + "grad_norm": 5.564678192138672, + "learning_rate": 3.881562980341396e-05, + "loss": 1.9145, + "mean_token_accuracy": 0.5895163154602051, + "num_tokens": 6779709.0, + "step": 2400 + }, + { + "epoch": 0.06067813888012427, + "grad_norm": 4.3705153465271, + "learning_rate": 4.0433621875252815e-05, + "loss": 1.938, + "mean_token_accuracy": 0.5882080298662186, + "num_tokens": 7053740.0, + "step": 2500 + }, + { + "epoch": 0.06310526443532924, + "grad_norm": 3.672412633895874, + "learning_rate": 4.205161394709166e-05, + "loss": 1.8963, + "mean_token_accuracy": 0.5925538367033005, + "num_tokens": 7355267.0, + "step": 2600 + }, + { + "epoch": 0.0655323899905342, + "grad_norm": 4.00920295715332, + "learning_rate": 4.366960601893051e-05, + "loss": 1.8685, + "mean_token_accuracy": 0.59998017847538, + "num_tokens": 7632131.0, + "step": 2700 + }, + { + "epoch": 0.06795951554573917, + "grad_norm": 3.901211977005005, + "learning_rate": 4.5287598090769356e-05, + "loss": 1.8885, + "mean_token_accuracy": 0.5954222363233567, + "num_tokens": 7917324.0, + "step": 2800 + }, + { + "epoch": 0.07038664110094416, + "grad_norm": 3.7556352615356445, + "learning_rate": 4.69055901626082e-05, + "loss": 1.8447, + "mean_token_accuracy": 0.6014421585202218, + "num_tokens": 8197157.0, + "step": 2900 + }, + { + "epoch": 0.07281376665614912, + "grad_norm": 4.258702278137207, + "learning_rate": 4.852358223444705e-05, + "loss": 1.8817, + "mean_token_accuracy": 0.5944454026222229, + "num_tokens": 8486350.0, + "step": 3000 + }, + { + "epoch": 0.07524089221135409, + "grad_norm": 3.6492583751678467, + "learning_rate": 5.01415743062859e-05, + "loss": 1.8413, + "mean_token_accuracy": 0.6042111480236053, + "num_tokens": 8766318.0, + "step": 3100 + }, + { + "epoch": 0.07766801776655906, + "grad_norm": 3.297952175140381, + "learning_rate": 5.175956637812475e-05, + "loss": 1.836, + "mean_token_accuracy": 0.6041852396726608, + "num_tokens": 9041673.0, + "step": 3200 + }, + { + "epoch": 0.08009514332176404, + "grad_norm": 3.869326114654541, + "learning_rate": 5.3377558449963594e-05, + "loss": 1.7743, + "mean_token_accuracy": 0.6153368014097214, + "num_tokens": 9320058.0, + "step": 3300 + }, + { + "epoch": 0.08252226887696901, + "grad_norm": 4.298006534576416, + "learning_rate": 5.499555052180244e-05, + "loss": 1.825, + "mean_token_accuracy": 0.6060752519965171, + "num_tokens": 9607570.0, + "step": 3400 + }, + { + "epoch": 0.08494939443217397, + "grad_norm": 2.8588080406188965, + "learning_rate": 5.661354259364129e-05, + "loss": 1.823, + "mean_token_accuracy": 0.6073589542508125, + "num_tokens": 9891913.0, + "step": 3500 + }, + { + "epoch": 0.08737651998737894, + "grad_norm": 3.600383996963501, + "learning_rate": 5.823153466548015e-05, + "loss": 1.8353, + "mean_token_accuracy": 0.6057981529831886, + "num_tokens": 10171842.0, + "step": 3600 + }, + { + "epoch": 0.08980364554258392, + "grad_norm": 3.0030224323272705, + "learning_rate": 5.9849526737318994e-05, + "loss": 1.7668, + "mean_token_accuracy": 0.6160938149690628, + "num_tokens": 10443178.0, + "step": 3700 + }, + { + "epoch": 0.09223077109778889, + "grad_norm": 3.0406975746154785, + "learning_rate": 6.146751880915784e-05, + "loss": 1.7535, + "mean_token_accuracy": 0.6178180432319641, + "num_tokens": 10732155.0, + "step": 3800 + }, + { + "epoch": 0.09465789665299386, + "grad_norm": 2.917201519012451, + "learning_rate": 6.308551088099668e-05, + "loss": 1.7572, + "mean_token_accuracy": 0.6191618722677231, + "num_tokens": 11009552.0, + "step": 3900 + }, + { + "epoch": 0.09708502220819883, + "grad_norm": 3.206831932067871, + "learning_rate": 6.470350295283554e-05, + "loss": 1.749, + "mean_token_accuracy": 0.6184579783678055, + "num_tokens": 11297388.0, + "step": 4000 + }, + { + "epoch": 0.09951214776340381, + "grad_norm": 3.438751459121704, + "learning_rate": 6.632149502467439e-05, + "loss": 1.7667, + "mean_token_accuracy": 0.6156092411279679, + "num_tokens": 11570862.0, + "step": 4100 + }, + { + "epoch": 0.10193927331860878, + "grad_norm": 8.892073631286621, + "learning_rate": 6.793948709651323e-05, + "loss": 1.7377, + "mean_token_accuracy": 0.6204822993278504, + "num_tokens": 11846985.0, + "step": 4200 + }, + { + "epoch": 0.10436639887381374, + "grad_norm": 3.2197091579437256, + "learning_rate": 6.955747916835208e-05, + "loss": 1.7421, + "mean_token_accuracy": 0.6201369935274124, + "num_tokens": 12125408.0, + "step": 4300 + }, + { + "epoch": 0.10679352442901871, + "grad_norm": 2.925734281539917, + "learning_rate": 7.117547124019092e-05, + "loss": 1.7426, + "mean_token_accuracy": 0.6202259063720703, + "num_tokens": 12413939.0, + "step": 4400 + }, + { + "epoch": 0.10922064998422368, + "grad_norm": 3.6164560317993164, + "learning_rate": 7.279346331202978e-05, + "loss": 1.7318, + "mean_token_accuracy": 0.6214462512731552, + "num_tokens": 12696604.0, + "step": 4500 + }, + { + "epoch": 0.11164777553942866, + "grad_norm": 3.639244556427002, + "learning_rate": 7.441145538386862e-05, + "loss": 1.7056, + "mean_token_accuracy": 0.6253874599933624, + "num_tokens": 12969852.0, + "step": 4600 + }, + { + "epoch": 0.11407490109463363, + "grad_norm": 2.9974870681762695, + "learning_rate": 7.602944745570747e-05, + "loss": 1.7024, + "mean_token_accuracy": 0.62675716817379, + "num_tokens": 13248912.0, + "step": 4700 + }, + { + "epoch": 0.1165020266498386, + "grad_norm": 2.8196988105773926, + "learning_rate": 7.764743952754631e-05, + "loss": 1.7229, + "mean_token_accuracy": 0.6227961453795433, + "num_tokens": 13545940.0, + "step": 4800 + }, + { + "epoch": 0.11892915220504356, + "grad_norm": 3.317641496658325, + "learning_rate": 7.926543159938516e-05, + "loss": 1.7493, + "mean_token_accuracy": 0.6176340329647064, + "num_tokens": 13823732.0, + "step": 4900 + }, + { + "epoch": 0.12135627776024854, + "grad_norm": 2.6800076961517334, + "learning_rate": 8.088342367122402e-05, + "loss": 1.7526, + "mean_token_accuracy": 0.6184008997678757, + "num_tokens": 14101813.0, + "step": 5000 + }, + { + "epoch": 0.12378340331545351, + "grad_norm": 3.6860008239746094, + "learning_rate": 8.250141574306286e-05, + "loss": 1.7355, + "mean_token_accuracy": 0.6187130987644196, + "num_tokens": 14396116.0, + "step": 5100 + }, + { + "epoch": 0.12621052887065848, + "grad_norm": 3.056393623352051, + "learning_rate": 8.41194078149017e-05, + "loss": 1.6988, + "mean_token_accuracy": 0.6259019857645035, + "num_tokens": 14670147.0, + "step": 5200 + }, + { + "epoch": 0.12863765442586345, + "grad_norm": 3.0868637561798096, + "learning_rate": 8.573739988674055e-05, + "loss": 1.6956, + "mean_token_accuracy": 0.627558496594429, + "num_tokens": 14958834.0, + "step": 5300 + }, + { + "epoch": 0.1310647799810684, + "grad_norm": 4.050454139709473, + "learning_rate": 8.73553919585794e-05, + "loss": 1.6534, + "mean_token_accuracy": 0.63445347905159, + "num_tokens": 15237854.0, + "step": 5400 + }, + { + "epoch": 0.13349190553627338, + "grad_norm": 3.293698787689209, + "learning_rate": 8.897338403041825e-05, + "loss": 1.6915, + "mean_token_accuracy": 0.6279270613193512, + "num_tokens": 15516061.0, + "step": 5500 + }, + { + "epoch": 0.13591903109147835, + "grad_norm": 2.8466482162475586, + "learning_rate": 9.059137610225711e-05, + "loss": 1.6734, + "mean_token_accuracy": 0.6312123596668243, + "num_tokens": 15805438.0, + "step": 5600 + }, + { + "epoch": 0.13834615664668334, + "grad_norm": 2.694309949874878, + "learning_rate": 9.220936817409596e-05, + "loss": 1.7153, + "mean_token_accuracy": 0.625066881775856, + "num_tokens": 16080134.0, + "step": 5700 + }, + { + "epoch": 0.1407732822018883, + "grad_norm": 3.828099489212036, + "learning_rate": 9.38273602459348e-05, + "loss": 1.6925, + "mean_token_accuracy": 0.6267722380161286, + "num_tokens": 16342095.0, + "step": 5800 + }, + { + "epoch": 0.14320040775709328, + "grad_norm": 2.3940014839172363, + "learning_rate": 9.544535231777365e-05, + "loss": 1.6671, + "mean_token_accuracy": 0.6302274829149246, + "num_tokens": 16617443.0, + "step": 5900 + }, + { + "epoch": 0.14562753331229825, + "grad_norm": 3.202986240386963, + "learning_rate": 9.706334438961249e-05, + "loss": 1.6587, + "mean_token_accuracy": 0.6335983765125275, + "num_tokens": 16886715.0, + "step": 6000 + }, + { + "epoch": 0.1480546588675032, + "grad_norm": 2.267845630645752, + "learning_rate": 9.868133646145135e-05, + "loss": 1.6594, + "mean_token_accuracy": 0.6337177205085754, + "num_tokens": 17157399.0, + "step": 6100 + }, + { + "epoch": 0.15048178442270818, + "grad_norm": 3.8720905780792236, + "learning_rate": 0.00010029932853329018, + "loss": 1.6464, + "mean_token_accuracy": 0.6355876386165619, + "num_tokens": 17431964.0, + "step": 6200 + }, + { + "epoch": 0.15290890997791315, + "grad_norm": 2.3187131881713867, + "learning_rate": 0.00010191732060512904, + "loss": 1.6635, + "mean_token_accuracy": 0.6327675718069077, + "num_tokens": 17723620.0, + "step": 6300 + }, + { + "epoch": 0.15533603553311812, + "grad_norm": 3.7374343872070312, + "learning_rate": 0.00010353531267696787, + "loss": 1.6735, + "mean_token_accuracy": 0.6294235897064209, + "num_tokens": 18017276.0, + "step": 6400 + }, + { + "epoch": 0.1577631610883231, + "grad_norm": 3.3306126594543457, + "learning_rate": 0.00010515330474880673, + "loss": 1.709, + "mean_token_accuracy": 0.6271786081790924, + "num_tokens": 18286383.0, + "step": 6500 + }, + { + "epoch": 0.16019028664352808, + "grad_norm": 2.7035059928894043, + "learning_rate": 0.00010677129682064559, + "loss": 1.714, + "mean_token_accuracy": 0.6251045203208924, + "num_tokens": 18568579.0, + "step": 6600 + }, + { + "epoch": 0.16261741219873305, + "grad_norm": 5.372984409332275, + "learning_rate": 0.00010838928889248443, + "loss": 1.715, + "mean_token_accuracy": 0.6235057324171066, + "num_tokens": 18859264.0, + "step": 6700 + }, + { + "epoch": 0.16504453775393801, + "grad_norm": 2.6342880725860596, + "learning_rate": 0.00011000728096432329, + "loss": 1.6605, + "mean_token_accuracy": 0.6317295587062836, + "num_tokens": 19155847.0, + "step": 6800 + }, + { + "epoch": 0.16747166330914298, + "grad_norm": 3.171790599822998, + "learning_rate": 0.00011162527303616212, + "loss": 1.6404, + "mean_token_accuracy": 0.6374486458301544, + "num_tokens": 19433838.0, + "step": 6900 + }, + { + "epoch": 0.16989878886434795, + "grad_norm": 2.1029253005981445, + "learning_rate": 0.00011324326510800098, + "loss": 1.6118, + "mean_token_accuracy": 0.642929772734642, + "num_tokens": 19721269.0, + "step": 7000 + }, + { + "epoch": 0.17232591441955292, + "grad_norm": 2.6623852252960205, + "learning_rate": 0.00011486125717983983, + "loss": 1.6338, + "mean_token_accuracy": 0.637419713139534, + "num_tokens": 20013973.0, + "step": 7100 + }, + { + "epoch": 0.17475303997475788, + "grad_norm": 2.519373655319214, + "learning_rate": 0.00011647924925167868, + "loss": 1.6455, + "mean_token_accuracy": 0.6351126056909561, + "num_tokens": 20286212.0, + "step": 7200 + }, + { + "epoch": 0.17718016552996285, + "grad_norm": 2.475982427597046, + "learning_rate": 0.00011809724132351752, + "loss": 1.664, + "mean_token_accuracy": 0.6314303702116013, + "num_tokens": 20578569.0, + "step": 7300 + }, + { + "epoch": 0.17960729108516785, + "grad_norm": 2.241504430770874, + "learning_rate": 0.00011971523339535637, + "loss": 1.6157, + "mean_token_accuracy": 0.6422726994752884, + "num_tokens": 20862395.0, + "step": 7400 + }, + { + "epoch": 0.18203441664037281, + "grad_norm": 2.0290799140930176, + "learning_rate": 0.0001213332254671952, + "loss": 1.6557, + "mean_token_accuracy": 0.633880580663681, + "num_tokens": 21137936.0, + "step": 7500 + }, + { + "epoch": 0.18446154219557778, + "grad_norm": 2.4877192974090576, + "learning_rate": 0.00012295121753903406, + "loss": 1.614, + "mean_token_accuracy": 0.6418092441558838, + "num_tokens": 21430954.0, + "step": 7600 + }, + { + "epoch": 0.18688866775078275, + "grad_norm": 2.3244476318359375, + "learning_rate": 0.0001245692096108729, + "loss": 1.618, + "mean_token_accuracy": 0.6409686493873596, + "num_tokens": 21700944.0, + "step": 7700 + }, + { + "epoch": 0.18931579330598772, + "grad_norm": 5.238540172576904, + "learning_rate": 0.00012618720168271175, + "loss": 1.6292, + "mean_token_accuracy": 0.6387823939323425, + "num_tokens": 21994418.0, + "step": 7800 + }, + { + "epoch": 0.19174291886119268, + "grad_norm": 2.0987367630004883, + "learning_rate": 0.0001278051937545506, + "loss": 1.6474, + "mean_token_accuracy": 0.6356805974245071, + "num_tokens": 22288421.0, + "step": 7900 + }, + { + "epoch": 0.19417004441639765, + "grad_norm": 2.3273093700408936, + "learning_rate": 0.00012942318582638947, + "loss": 1.639, + "mean_token_accuracy": 0.6383429461717606, + "num_tokens": 22557693.0, + "step": 8000 + }, + { + "epoch": 0.19659716997160262, + "grad_norm": 2.5615649223327637, + "learning_rate": 0.0001310411778982283, + "loss": 1.5919, + "mean_token_accuracy": 0.6441801345348358, + "num_tokens": 22840289.0, + "step": 8100 + }, + { + "epoch": 0.19902429552680762, + "grad_norm": 2.6359269618988037, + "learning_rate": 0.00013265916997006716, + "loss": 1.6807, + "mean_token_accuracy": 0.6301674485206604, + "num_tokens": 23128125.0, + "step": 8200 + }, + { + "epoch": 0.20145142108201258, + "grad_norm": 2.7891812324523926, + "learning_rate": 0.000134277162041906, + "loss": 1.6412, + "mean_token_accuracy": 0.635779327750206, + "num_tokens": 23408204.0, + "step": 8300 + }, + { + "epoch": 0.20387854663721755, + "grad_norm": 2.309911012649536, + "learning_rate": 0.00013589515411374485, + "loss": 1.6086, + "mean_token_accuracy": 0.64215003490448, + "num_tokens": 23690150.0, + "step": 8400 + }, + { + "epoch": 0.20630567219242252, + "grad_norm": 2.182452917098999, + "learning_rate": 0.0001375131461855837, + "loss": 1.6174, + "mean_token_accuracy": 0.6406794637441635, + "num_tokens": 23968782.0, + "step": 8500 + }, + { + "epoch": 0.20873279774762749, + "grad_norm": 2.322374105453491, + "learning_rate": 0.00013913113825742254, + "loss": 1.5783, + "mean_token_accuracy": 0.6475563657283783, + "num_tokens": 24255904.0, + "step": 8600 + }, + { + "epoch": 0.21115992330283245, + "grad_norm": 2.552272081375122, + "learning_rate": 0.0001407491303292614, + "loss": 1.5751, + "mean_token_accuracy": 0.6480100709199905, + "num_tokens": 24532151.0, + "step": 8700 + }, + { + "epoch": 0.21358704885803742, + "grad_norm": 1.9336109161376953, + "learning_rate": 0.00014236712240110023, + "loss": 1.583, + "mean_token_accuracy": 0.646995313167572, + "num_tokens": 24820503.0, + "step": 8800 + }, + { + "epoch": 0.2160141744132424, + "grad_norm": 2.162743330001831, + "learning_rate": 0.0001439851144729391, + "loss": 1.5831, + "mean_token_accuracy": 0.646264790892601, + "num_tokens": 25096658.0, + "step": 8900 + }, + { + "epoch": 0.21844129996844736, + "grad_norm": 2.5591320991516113, + "learning_rate": 0.00014560310654477795, + "loss": 1.5794, + "mean_token_accuracy": 0.6471728563308716, + "num_tokens": 25361541.0, + "step": 9000 + }, + { + "epoch": 0.22086842552365235, + "grad_norm": 2.754995822906494, + "learning_rate": 0.0001472210986166168, + "loss": 1.6184, + "mean_token_accuracy": 0.6407477951049805, + "num_tokens": 25643336.0, + "step": 9100 + }, + { + "epoch": 0.22329555107885732, + "grad_norm": 2.716937780380249, + "learning_rate": 0.00014883909068845564, + "loss": 1.5948, + "mean_token_accuracy": 0.6445745784044266, + "num_tokens": 25936458.0, + "step": 9200 + }, + { + "epoch": 0.22572267663406229, + "grad_norm": 2.3055243492126465, + "learning_rate": 0.0001504570827602945, + "loss": 1.622, + "mean_token_accuracy": 0.6412751048803329, + "num_tokens": 26230438.0, + "step": 9300 + }, + { + "epoch": 0.22814980218926725, + "grad_norm": 2.0069618225097656, + "learning_rate": 0.00015207507483213333, + "loss": 1.6072, + "mean_token_accuracy": 0.642265853881836, + "num_tokens": 26519081.0, + "step": 9400 + }, + { + "epoch": 0.23057692774447222, + "grad_norm": 2.1340370178222656, + "learning_rate": 0.00015369306690397218, + "loss": 1.5986, + "mean_token_accuracy": 0.6452283936738968, + "num_tokens": 26794162.0, + "step": 9500 + }, + { + "epoch": 0.2330040532996772, + "grad_norm": 2.440425395965576, + "learning_rate": 0.00015531105897581101, + "loss": 1.6111, + "mean_token_accuracy": 0.6408109438419342, + "num_tokens": 27070997.0, + "step": 9600 + }, + { + "epoch": 0.23543117885488216, + "grad_norm": 4.2947893142700195, + "learning_rate": 0.00015692905104764987, + "loss": 1.5983, + "mean_token_accuracy": 0.6435111856460571, + "num_tokens": 27358764.0, + "step": 9700 + }, + { + "epoch": 0.23785830441008712, + "grad_norm": 2.020979642868042, + "learning_rate": 0.0001585470431194887, + "loss": 1.6451, + "mean_token_accuracy": 0.6358766305446625, + "num_tokens": 27646756.0, + "step": 9800 + }, + { + "epoch": 0.2402854299652921, + "grad_norm": 2.3321073055267334, + "learning_rate": 0.00016016503519132756, + "loss": 1.6128, + "mean_token_accuracy": 0.6417472112178803, + "num_tokens": 27933393.0, + "step": 9900 + }, + { + "epoch": 0.2427125555204971, + "grad_norm": 1.9731568098068237, + "learning_rate": 0.00016178302726316642, + "loss": 1.5553, + "mean_token_accuracy": 0.6521957015991211, + "num_tokens": 28225907.0, + "step": 10000 + }, + { + "epoch": 0.24513968107570205, + "grad_norm": 1.7572153806686401, + "learning_rate": 0.00016340101933500528, + "loss": 1.6428, + "mean_token_accuracy": 0.6366815239191055, + "num_tokens": 28502556.0, + "step": 10100 + }, + { + "epoch": 0.24756680663090702, + "grad_norm": 2.120887041091919, + "learning_rate": 0.0001650190114068441, + "loss": 1.5659, + "mean_token_accuracy": 0.6504863280057908, + "num_tokens": 28787215.0, + "step": 10200 + }, + { + "epoch": 0.249993932186112, + "grad_norm": 1.741084337234497, + "learning_rate": 0.00016663700347868297, + "loss": 1.6045, + "mean_token_accuracy": 0.6414517712593079, + "num_tokens": 29077020.0, + "step": 10300 + }, + { + "epoch": 0.25242105774131696, + "grad_norm": 2.038705587387085, + "learning_rate": 0.00016825499555052183, + "loss": 1.5826, + "mean_token_accuracy": 0.6473750042915344, + "num_tokens": 29347254.0, + "step": 10400 + }, + { + "epoch": 0.2548481832965219, + "grad_norm": 1.8374431133270264, + "learning_rate": 0.00016987298762236066, + "loss": 1.5794, + "mean_token_accuracy": 0.6442887282371521, + "num_tokens": 29628082.0, + "step": 10500 + }, + { + "epoch": 0.2572753088517269, + "grad_norm": 1.7167983055114746, + "learning_rate": 0.00017149097969419952, + "loss": 1.5906, + "mean_token_accuracy": 0.6459539052844048, + "num_tokens": 29901377.0, + "step": 10600 + }, + { + "epoch": 0.25970243440693186, + "grad_norm": 1.7779451608657837, + "learning_rate": 0.00017310897176603835, + "loss": 1.589, + "mean_token_accuracy": 0.6448972916603088, + "num_tokens": 30176021.0, + "step": 10700 + }, + { + "epoch": 0.2621295599621368, + "grad_norm": 1.6588525772094727, + "learning_rate": 0.0001747269638378772, + "loss": 1.6071, + "mean_token_accuracy": 0.6433945900201797, + "num_tokens": 30457966.0, + "step": 10800 + }, + { + "epoch": 0.2645566855173418, + "grad_norm": 2.377293109893799, + "learning_rate": 0.00017634495590971604, + "loss": 1.6186, + "mean_token_accuracy": 0.6401382917165757, + "num_tokens": 30734208.0, + "step": 10900 + }, + { + "epoch": 0.26698381107254676, + "grad_norm": 2.399536609649658, + "learning_rate": 0.0001779629479815549, + "loss": 1.5837, + "mean_token_accuracy": 0.6470131504535676, + "num_tokens": 31020843.0, + "step": 11000 + }, + { + "epoch": 0.26941093662775173, + "grad_norm": 1.5932106971740723, + "learning_rate": 0.00017958094005339376, + "loss": 1.632, + "mean_token_accuracy": 0.6387262684106827, + "num_tokens": 31302898.0, + "step": 11100 + }, + { + "epoch": 0.2718380621829567, + "grad_norm": 2.0135538578033447, + "learning_rate": 0.0001811989321252326, + "loss": 1.5796, + "mean_token_accuracy": 0.6479561811685562, + "num_tokens": 31573395.0, + "step": 11200 + }, + { + "epoch": 0.2742651877381617, + "grad_norm": 2.4460792541503906, + "learning_rate": 0.00018281692419707144, + "loss": 1.5639, + "mean_token_accuracy": 0.6510553556680679, + "num_tokens": 31859176.0, + "step": 11300 + }, + { + "epoch": 0.2766923132933667, + "grad_norm": 1.658430814743042, + "learning_rate": 0.0001844349162689103, + "loss": 1.5697, + "mean_token_accuracy": 0.6498629409074783, + "num_tokens": 32143776.0, + "step": 11400 + }, + { + "epoch": 0.27911943884857165, + "grad_norm": 1.603789210319519, + "learning_rate": 0.00018605290834074913, + "loss": 1.6203, + "mean_token_accuracy": 0.6409107285737992, + "num_tokens": 32419826.0, + "step": 11500 + }, + { + "epoch": 0.2815465644037766, + "grad_norm": 1.9514955282211304, + "learning_rate": 0.000187670900412588, + "loss": 1.5774, + "mean_token_accuracy": 0.6493115669488907, + "num_tokens": 32682366.0, + "step": 11600 + }, + { + "epoch": 0.2839736899589816, + "grad_norm": 4.2015838623046875, + "learning_rate": 0.00018928889248442682, + "loss": 1.56, + "mean_token_accuracy": 0.653014543056488, + "num_tokens": 32943242.0, + "step": 11700 + }, + { + "epoch": 0.28640081551418656, + "grad_norm": 2.5987889766693115, + "learning_rate": 0.00019090688455626568, + "loss": 1.6036, + "mean_token_accuracy": 0.6438838475942612, + "num_tokens": 33224665.0, + "step": 11800 + }, + { + "epoch": 0.2888279410693915, + "grad_norm": 1.8440401554107666, + "learning_rate": 0.00019252487662810451, + "loss": 1.5626, + "mean_token_accuracy": 0.65259059548378, + "num_tokens": 33515048.0, + "step": 11900 + }, + { + "epoch": 0.2912550666245965, + "grad_norm": 1.672186017036438, + "learning_rate": 0.00019414286869994337, + "loss": 1.5928, + "mean_token_accuracy": 0.6442960953712463, + "num_tokens": 33805643.0, + "step": 12000 + }, + { + "epoch": 0.29368219217980146, + "grad_norm": 1.9079508781433105, + "learning_rate": 0.00019576086077178223, + "loss": 1.5616, + "mean_token_accuracy": 0.6513192290067673, + "num_tokens": 34084649.0, + "step": 12100 + }, + { + "epoch": 0.2961093177350064, + "grad_norm": 1.9318201541900635, + "learning_rate": 0.0001973788528436211, + "loss": 1.5428, + "mean_token_accuracy": 0.6533740103244782, + "num_tokens": 34351025.0, + "step": 12200 + }, + { + "epoch": 0.2985364432902114, + "grad_norm": 1.8940328359603882, + "learning_rate": 0.00019899684491545995, + "loss": 1.5848, + "mean_token_accuracy": 0.6487006124854088, + "num_tokens": 34631544.0, + "step": 12300 + }, + { + "epoch": 0.30096356884541636, + "grad_norm": 1.7644509077072144, + "learning_rate": 0.00019999994241633214, + "loss": 1.5272, + "mean_token_accuracy": 0.6580360817909241, + "num_tokens": 34914627.0, + "step": 12400 + }, + { + "epoch": 0.30339069440062133, + "grad_norm": 2.096867322921753, + "learning_rate": 0.0001999992405664207, + "loss": 1.5403, + "mean_token_accuracy": 0.6525964337587357, + "num_tokens": 35193026.0, + "step": 12500 + }, + { + "epoch": 0.3058178199558263, + "grad_norm": 1.6612098217010498, + "learning_rate": 0.00019999774116529276, + "loss": 1.593, + "mean_token_accuracy": 0.6469535285234451, + "num_tokens": 35472360.0, + "step": 12600 + }, + { + "epoch": 0.30824494551103127, + "grad_norm": 1.5085548162460327, + "learning_rate": 0.00019999544422490684, + "loss": 1.6142, + "mean_token_accuracy": 0.6417314970493316, + "num_tokens": 35764712.0, + "step": 12700 + }, + { + "epoch": 0.31067207106623623, + "grad_norm": 2.030015468597412, + "learning_rate": 0.00019999234976358237, + "loss": 1.584, + "mean_token_accuracy": 0.6494407910108566, + "num_tokens": 36041894.0, + "step": 12800 + }, + { + "epoch": 0.3130991966214412, + "grad_norm": 2.0102243423461914, + "learning_rate": 0.00019998845780599945, + "loss": 1.5482, + "mean_token_accuracy": 0.6535135221481323, + "num_tokens": 36307589.0, + "step": 12900 + }, + { + "epoch": 0.3155263221766462, + "grad_norm": 1.5919320583343506, + "learning_rate": 0.0001999837683831987, + "loss": 1.5708, + "mean_token_accuracy": 0.6485945302248001, + "num_tokens": 36586771.0, + "step": 13000 + }, + { + "epoch": 0.3179534477318512, + "grad_norm": 1.6979924440383911, + "learning_rate": 0.0001999782815325809, + "loss": 1.5185, + "mean_token_accuracy": 0.6575907075405121, + "num_tokens": 36855115.0, + "step": 13100 + }, + { + "epoch": 0.32038057328705616, + "grad_norm": 1.7762399911880493, + "learning_rate": 0.00019997199729790684, + "loss": 1.5585, + "mean_token_accuracy": 0.652558314204216, + "num_tokens": 37153394.0, + "step": 13200 + }, + { + "epoch": 0.3228076988422611, + "grad_norm": 1.9209295511245728, + "learning_rate": 0.00019996491572929695, + "loss": 1.4696, + "mean_token_accuracy": 0.6686202174425125, + "num_tokens": 37447493.0, + "step": 13300 + }, + { + "epoch": 0.3252348243974661, + "grad_norm": 1.7967995405197144, + "learning_rate": 0.00019995703688323072, + "loss": 1.5193, + "mean_token_accuracy": 0.6576752889156342, + "num_tokens": 37719538.0, + "step": 13400 + }, + { + "epoch": 0.32766194995267106, + "grad_norm": 1.6311265230178833, + "learning_rate": 0.0001999483608225465, + "loss": 1.4999, + "mean_token_accuracy": 0.6639257234334945, + "num_tokens": 37996152.0, + "step": 13500 + }, + { + "epoch": 0.33008907550787603, + "grad_norm": 1.8700850009918213, + "learning_rate": 0.00019993888761644083, + "loss": 1.5559, + "mean_token_accuracy": 0.6535019212961197, + "num_tokens": 38272368.0, + "step": 13600 + }, + { + "epoch": 0.332516201063081, + "grad_norm": 1.6935664415359497, + "learning_rate": 0.00019992861734046796, + "loss": 1.5198, + "mean_token_accuracy": 0.6603120654821396, + "num_tokens": 38534001.0, + "step": 13700 + }, + { + "epoch": 0.33494332661828596, + "grad_norm": 1.577829360961914, + "learning_rate": 0.0001999175500765392, + "loss": 1.504, + "mean_token_accuracy": 0.6614657628536225, + "num_tokens": 38811884.0, + "step": 13800 + }, + { + "epoch": 0.33737045217349093, + "grad_norm": 1.339965581893921, + "learning_rate": 0.00019990568591292238, + "loss": 1.5434, + "mean_token_accuracy": 0.6548532199859619, + "num_tokens": 39082290.0, + "step": 13900 + }, + { + "epoch": 0.3397975777286959, + "grad_norm": 1.5873082876205444, + "learning_rate": 0.0001998930249442409, + "loss": 1.5079, + "mean_token_accuracy": 0.6613044041395187, + "num_tokens": 39364724.0, + "step": 14000 + }, + { + "epoch": 0.34222470328390087, + "grad_norm": 1.4517920017242432, + "learning_rate": 0.00019987956727147334, + "loss": 1.5268, + "mean_token_accuracy": 0.655518969297409, + "num_tokens": 39663050.0, + "step": 14100 + }, + { + "epoch": 0.34465182883910583, + "grad_norm": 1.920789361000061, + "learning_rate": 0.0001998653130019523, + "loss": 1.4819, + "mean_token_accuracy": 0.6665922373533248, + "num_tokens": 39950704.0, + "step": 14200 + }, + { + "epoch": 0.3470789543943108, + "grad_norm": 1.4261436462402344, + "learning_rate": 0.00019985026224936375, + "loss": 1.4286, + "mean_token_accuracy": 0.6748318231105804, + "num_tokens": 40244181.0, + "step": 14300 + }, + { + "epoch": 0.34950607994951577, + "grad_norm": 1.6467373371124268, + "learning_rate": 0.00019983441513374603, + "loss": 1.4598, + "mean_token_accuracy": 0.6711004292964935, + "num_tokens": 40510184.0, + "step": 14400 + }, + { + "epoch": 0.35193320550472074, + "grad_norm": 1.2494763135910034, + "learning_rate": 0.000199817771781489, + "loss": 1.4752, + "mean_token_accuracy": 0.665248526930809, + "num_tokens": 40784045.0, + "step": 14500 + }, + { + "epoch": 0.3543603310599257, + "grad_norm": 1.2977705001831055, + "learning_rate": 0.00019980033232533292, + "loss": 1.4809, + "mean_token_accuracy": 0.6651502713561058, + "num_tokens": 41075689.0, + "step": 14600 + }, + { + "epoch": 0.3567874566151307, + "grad_norm": 1.475791096687317, + "learning_rate": 0.00019978209690436747, + "loss": 1.397, + "mean_token_accuracy": 0.6825127446651459, + "num_tokens": 41354944.0, + "step": 14700 + }, + { + "epoch": 0.3592145821703357, + "grad_norm": 1.385689377784729, + "learning_rate": 0.00019976306566403052, + "loss": 1.5037, + "mean_token_accuracy": 0.6625669264793396, + "num_tokens": 41637807.0, + "step": 14800 + }, + { + "epoch": 0.36164170772554066, + "grad_norm": 1.500313401222229, + "learning_rate": 0.00019974323875610712, + "loss": 1.515, + "mean_token_accuracy": 0.659018731713295, + "num_tokens": 41921489.0, + "step": 14900 + }, + { + "epoch": 0.36406883328074563, + "grad_norm": 1.7327656745910645, + "learning_rate": 0.00019972261633872826, + "loss": 1.4263, + "mean_token_accuracy": 0.6765751355886459, + "num_tokens": 42203300.0, + "step": 15000 + }, + { + "epoch": 0.3664959588359506, + "grad_norm": 1.9555410146713257, + "learning_rate": 0.00019970119857636948, + "loss": 1.4265, + "mean_token_accuracy": 0.6769354450702667, + "num_tokens": 42482247.0, + "step": 15100 + }, + { + "epoch": 0.36892308439115556, + "grad_norm": 2.1408140659332275, + "learning_rate": 0.00019967898563984978, + "loss": 1.4401, + "mean_token_accuracy": 0.6747738122940063, + "num_tokens": 42755286.0, + "step": 15200 + }, + { + "epoch": 0.37135020994636053, + "grad_norm": 1.3993452787399292, + "learning_rate": 0.00019965597770632993, + "loss": 1.4457, + "mean_token_accuracy": 0.6707744264602661, + "num_tokens": 43036164.0, + "step": 15300 + }, + { + "epoch": 0.3737773355015655, + "grad_norm": 1.423827886581421, + "learning_rate": 0.00019963217495931149, + "loss": 1.3852, + "mean_token_accuracy": 0.682267724275589, + "num_tokens": 43324843.0, + "step": 15400 + }, + { + "epoch": 0.37620446105677047, + "grad_norm": 1.3678686618804932, + "learning_rate": 0.00019960757758863488, + "loss": 1.4738, + "mean_token_accuracy": 0.6674029171466828, + "num_tokens": 43616961.0, + "step": 15500 + }, + { + "epoch": 0.37863158661197543, + "grad_norm": 1.4727699756622314, + "learning_rate": 0.00019958218579047837, + "loss": 1.4272, + "mean_token_accuracy": 0.6761853212118148, + "num_tokens": 43882893.0, + "step": 15600 + }, + { + "epoch": 0.3810587121671804, + "grad_norm": 1.4919428825378418, + "learning_rate": 0.00019955599976735598, + "loss": 1.413, + "mean_token_accuracy": 0.6803244209289551, + "num_tokens": 44155556.0, + "step": 15700 + }, + { + "epoch": 0.38348583772238537, + "grad_norm": 1.481824278831482, + "learning_rate": 0.0001995290197281163, + "loss": 1.4245, + "mean_token_accuracy": 0.6783695650100708, + "num_tokens": 44427379.0, + "step": 15800 + }, + { + "epoch": 0.38591296327759034, + "grad_norm": 1.343738317489624, + "learning_rate": 0.00019950124588794053, + "loss": 1.4281, + "mean_token_accuracy": 0.6756082916259766, + "num_tokens": 44704217.0, + "step": 15900 + }, + { + "epoch": 0.3883400888327953, + "grad_norm": 1.9732495546340942, + "learning_rate": 0.00019947267846834102, + "loss": 1.4164, + "mean_token_accuracy": 0.6776945400238037, + "num_tokens": 44994003.0, + "step": 16000 + }, + { + "epoch": 0.3907672143880003, + "grad_norm": 1.4792394638061523, + "learning_rate": 0.00019944331769715926, + "loss": 1.3882, + "mean_token_accuracy": 0.6841700464487076, + "num_tokens": 45262965.0, + "step": 16100 + }, + { + "epoch": 0.39319433994320524, + "grad_norm": 1.5121392011642456, + "learning_rate": 0.00019941316380856424, + "loss": 1.4293, + "mean_token_accuracy": 0.6759150516986847, + "num_tokens": 45544990.0, + "step": 16200 + }, + { + "epoch": 0.3956214654984102, + "grad_norm": 1.4478973150253296, + "learning_rate": 0.0001993822170430505, + "loss": 1.3958, + "mean_token_accuracy": 0.6820646327733993, + "num_tokens": 45823961.0, + "step": 16300 + }, + { + "epoch": 0.39804859105361523, + "grad_norm": 1.201532244682312, + "learning_rate": 0.0001993504776474362, + "loss": 1.3745, + "mean_token_accuracy": 0.6857378870248795, + "num_tokens": 46093285.0, + "step": 16400 + }, + { + "epoch": 0.4004757166088202, + "grad_norm": 1.3356366157531738, + "learning_rate": 0.0001993179458748612, + "loss": 1.3609, + "mean_token_accuracy": 0.6905792677402496, + "num_tokens": 46387814.0, + "step": 16500 + }, + { + "epoch": 0.40290284216402517, + "grad_norm": 1.2657437324523926, + "learning_rate": 0.00019928462198478503, + "loss": 1.3614, + "mean_token_accuracy": 0.688639857172966, + "num_tokens": 46668922.0, + "step": 16600 + }, + { + "epoch": 0.40532996771923013, + "grad_norm": 1.460304856300354, + "learning_rate": 0.00019925050624298478, + "loss": 1.3555, + "mean_token_accuracy": 0.6907580763101577, + "num_tokens": 46949382.0, + "step": 16700 + }, + { + "epoch": 0.4077570932744351, + "grad_norm": 1.256842017173767, + "learning_rate": 0.00019921559892155304, + "loss": 1.3847, + "mean_token_accuracy": 0.683619334101677, + "num_tokens": 47232718.0, + "step": 16800 + }, + { + "epoch": 0.41018421882964007, + "grad_norm": 1.5061123371124268, + "learning_rate": 0.0001991799002988957, + "loss": 1.3603, + "mean_token_accuracy": 0.6874109321832657, + "num_tokens": 47512467.0, + "step": 16900 + }, + { + "epoch": 0.41261134438484504, + "grad_norm": 1.403568983078003, + "learning_rate": 0.0001991434106597297, + "loss": 1.4669, + "mean_token_accuracy": 0.6719831627607346, + "num_tokens": 47802152.0, + "step": 17000 + }, + { + "epoch": 0.41503846994005, + "grad_norm": 1.540781021118164, + "learning_rate": 0.00019910613029508082, + "loss": 1.4166, + "mean_token_accuracy": 0.6793534135818482, + "num_tokens": 48079701.0, + "step": 17100 + }, + { + "epoch": 0.41746559549525497, + "grad_norm": 1.553440809249878, + "learning_rate": 0.00019906805950228136, + "loss": 1.3399, + "mean_token_accuracy": 0.6919099617004395, + "num_tokens": 48360187.0, + "step": 17200 + }, + { + "epoch": 0.41989272105045994, + "grad_norm": 1.4212487936019897, + "learning_rate": 0.00019902919858496762, + "loss": 1.3215, + "mean_token_accuracy": 0.6969122314453124, + "num_tokens": 48639623.0, + "step": 17300 + }, + { + "epoch": 0.4223198466056649, + "grad_norm": 1.6373494863510132, + "learning_rate": 0.00019898954785307774, + "loss": 1.318, + "mean_token_accuracy": 0.6975502675771713, + "num_tokens": 48920032.0, + "step": 17400 + }, + { + "epoch": 0.4247469721608699, + "grad_norm": 1.618812918663025, + "learning_rate": 0.00019894910762284897, + "loss": 1.3826, + "mean_token_accuracy": 0.6858366417884827, + "num_tokens": 49209460.0, + "step": 17500 + }, + { + "epoch": 0.42717409771607484, + "grad_norm": 1.2190032005310059, + "learning_rate": 0.00019890787821681534, + "loss": 1.3092, + "mean_token_accuracy": 0.6999623441696167, + "num_tokens": 49495904.0, + "step": 17600 + }, + { + "epoch": 0.4296012232712798, + "grad_norm": 1.5213158130645752, + "learning_rate": 0.00019886585996380495, + "loss": 1.337, + "mean_token_accuracy": 0.694312310218811, + "num_tokens": 49769086.0, + "step": 17700 + }, + { + "epoch": 0.4320283488264848, + "grad_norm": 1.2875466346740723, + "learning_rate": 0.00019882305319893746, + "loss": 1.3142, + "mean_token_accuracy": 0.6983224129676819, + "num_tokens": 50056312.0, + "step": 17800 + }, + { + "epoch": 0.43445547438168974, + "grad_norm": 1.364429235458374, + "learning_rate": 0.00019877945826362133, + "loss": 1.2759, + "mean_token_accuracy": 0.7073658388853074, + "num_tokens": 50346471.0, + "step": 17900 + }, + { + "epoch": 0.4368825999368947, + "grad_norm": 1.9261558055877686, + "learning_rate": 0.00019873507550555113, + "loss": 1.3359, + "mean_token_accuracy": 0.6965240675210953, + "num_tokens": 50636340.0, + "step": 18000 + }, + { + "epoch": 0.4393097254920997, + "grad_norm": 2.0433011054992676, + "learning_rate": 0.00019868990527870475, + "loss": 1.3354, + "mean_token_accuracy": 0.695448847413063, + "num_tokens": 50906335.0, + "step": 18100 + }, + { + "epoch": 0.4417368510473047, + "grad_norm": 1.8010625839233398, + "learning_rate": 0.00019864394794334066, + "loss": 1.2788, + "mean_token_accuracy": 0.7048653018474579, + "num_tokens": 51178755.0, + "step": 18200 + }, + { + "epoch": 0.44416397660250967, + "grad_norm": 1.1464262008666992, + "learning_rate": 0.00019859720386599492, + "loss": 1.2752, + "mean_token_accuracy": 0.7063197785615921, + "num_tokens": 51463497.0, + "step": 18300 + }, + { + "epoch": 0.44659110215771464, + "grad_norm": 1.6249265670776367, + "learning_rate": 0.00019854967341947832, + "loss": 1.2929, + "mean_token_accuracy": 0.7024630016088486, + "num_tokens": 51752205.0, + "step": 18400 + }, + { + "epoch": 0.4490182277129196, + "grad_norm": 1.3986220359802246, + "learning_rate": 0.00019850135698287337, + "loss": 1.3208, + "mean_token_accuracy": 0.6973824536800385, + "num_tokens": 52035214.0, + "step": 18500 + }, + { + "epoch": 0.45144535326812457, + "grad_norm": 1.3274478912353516, + "learning_rate": 0.00019845225494153138, + "loss": 1.3032, + "mean_token_accuracy": 0.7015773522853851, + "num_tokens": 52310306.0, + "step": 18600 + }, + { + "epoch": 0.45387247882332954, + "grad_norm": 2.008397102355957, + "learning_rate": 0.00019840236768706916, + "loss": 1.2791, + "mean_token_accuracy": 0.7044518172740937, + "num_tokens": 52580485.0, + "step": 18700 + }, + { + "epoch": 0.4562996043785345, + "grad_norm": 1.4408162832260132, + "learning_rate": 0.00019835169561736624, + "loss": 1.3516, + "mean_token_accuracy": 0.6925643801689148, + "num_tokens": 52853591.0, + "step": 18800 + }, + { + "epoch": 0.4587267299337395, + "grad_norm": 1.2863048315048218, + "learning_rate": 0.00019830023913656132, + "loss": 1.3084, + "mean_token_accuracy": 0.7014599156379699, + "num_tokens": 53123981.0, + "step": 18900 + }, + { + "epoch": 0.46115385548894444, + "grad_norm": 1.5343726873397827, + "learning_rate": 0.00019824799865504936, + "loss": 1.2883, + "mean_token_accuracy": 0.7038233464956284, + "num_tokens": 53404750.0, + "step": 19000 + }, + { + "epoch": 0.4635809810441494, + "grad_norm": 1.0196541547775269, + "learning_rate": 0.00019819497458947808, + "loss": 1.1983, + "mean_token_accuracy": 0.721374859213829, + "num_tokens": 53676184.0, + "step": 19100 + }, + { + "epoch": 0.4660081065993544, + "grad_norm": 1.4527913331985474, + "learning_rate": 0.00019814116736274482, + "loss": 1.2063, + "mean_token_accuracy": 0.719505894780159, + "num_tokens": 53947618.0, + "step": 19200 + }, + { + "epoch": 0.46843523215455934, + "grad_norm": 1.1889349222183228, + "learning_rate": 0.00019808657740399296, + "loss": 1.2401, + "mean_token_accuracy": 0.7134192228317261, + "num_tokens": 54225900.0, + "step": 19300 + }, + { + "epoch": 0.4708623577097643, + "grad_norm": 1.0686372518539429, + "learning_rate": 0.0001980312051486087, + "loss": 1.2694, + "mean_token_accuracy": 0.7079707849025726, + "num_tokens": 54510933.0, + "step": 19400 + }, + { + "epoch": 0.4732894832649693, + "grad_norm": 1.5365800857543945, + "learning_rate": 0.00019797505103821757, + "loss": 1.2088, + "mean_token_accuracy": 0.7209999257326126, + "num_tokens": 54789106.0, + "step": 19500 + }, + { + "epoch": 0.47571660882017425, + "grad_norm": 1.5895075798034668, + "learning_rate": 0.00019791811552068067, + "loss": 1.2628, + "mean_token_accuracy": 0.7107340413331985, + "num_tokens": 55068924.0, + "step": 19600 + }, + { + "epoch": 0.4781437343753792, + "grad_norm": 1.2052088975906372, + "learning_rate": 0.00019786039905009143, + "loss": 1.238, + "mean_token_accuracy": 0.7150384098291397, + "num_tokens": 55345057.0, + "step": 19700 + }, + { + "epoch": 0.4805708599305842, + "grad_norm": 1.183862566947937, + "learning_rate": 0.0001978019020867717, + "loss": 1.2869, + "mean_token_accuracy": 0.7046462643146515, + "num_tokens": 55626375.0, + "step": 19800 + }, + { + "epoch": 0.4829979854857892, + "grad_norm": 0.9647877216339111, + "learning_rate": 0.00019774262509726832, + "loss": 1.2112, + "mean_token_accuracy": 0.7195284950733185, + "num_tokens": 55911211.0, + "step": 19900 + }, + { + "epoch": 0.4854251110409942, + "grad_norm": 1.3305227756500244, + "learning_rate": 0.00019768256855434916, + "loss": 1.2022, + "mean_token_accuracy": 0.7205027377605439, + "num_tokens": 56197537.0, + "step": 20000 + }, + { + "epoch": 0.48785223659619914, + "grad_norm": 1.235066533088684, + "learning_rate": 0.0001976217329369996, + "loss": 1.226, + "mean_token_accuracy": 0.7162840497493744, + "num_tokens": 56452193.0, + "step": 20100 + }, + { + "epoch": 0.4902793621514041, + "grad_norm": 1.370216727256775, + "learning_rate": 0.00019756011873041854, + "loss": 1.237, + "mean_token_accuracy": 0.7148937523365021, + "num_tokens": 56729810.0, + "step": 20200 + }, + { + "epoch": 0.4927064877066091, + "grad_norm": 1.065269947052002, + "learning_rate": 0.0001974977264260145, + "loss": 1.1946, + "mean_token_accuracy": 0.7244079232215881, + "num_tokens": 56995621.0, + "step": 20300 + }, + { + "epoch": 0.49513361326181404, + "grad_norm": 1.1559139490127563, + "learning_rate": 0.00019743455652140195, + "loss": 1.2337, + "mean_token_accuracy": 0.7160074669122696, + "num_tokens": 57270242.0, + "step": 20400 + }, + { + "epoch": 0.497560738817019, + "grad_norm": 1.3941744565963745, + "learning_rate": 0.00019737060952039695, + "loss": 1.1944, + "mean_token_accuracy": 0.7223728567361831, + "num_tokens": 57539201.0, + "step": 20500 + }, + { + "epoch": 0.499987864372224, + "grad_norm": 1.42805814743042, + "learning_rate": 0.00019730588593301355, + "loss": 1.2291, + "mean_token_accuracy": 0.7149619489908219, + "num_tokens": 57827170.0, + "step": 20600 + }, + { + "epoch": 0.502414989927429, + "grad_norm": 1.1617212295532227, + "learning_rate": 0.00019724038627545935, + "loss": 1.2113, + "mean_token_accuracy": 0.7198360604047775, + "num_tokens": 58113276.0, + "step": 20700 + }, + { + "epoch": 0.5048421154826339, + "grad_norm": 1.4672633409500122, + "learning_rate": 0.00019717411107013168, + "loss": 1.2117, + "mean_token_accuracy": 0.7212896418571472, + "num_tokens": 58396273.0, + "step": 20800 + }, + { + "epoch": 0.5072692410378389, + "grad_norm": 1.4573055505752563, + "learning_rate": 0.00019710706084561324, + "loss": 1.1817, + "mean_token_accuracy": 0.7268400448560715, + "num_tokens": 58669420.0, + "step": 20900 + }, + { + "epoch": 0.5096963665930438, + "grad_norm": 1.1941968202590942, + "learning_rate": 0.00019703923613666798, + "loss": 1.1762, + "mean_token_accuracy": 0.727076341509819, + "num_tokens": 58944952.0, + "step": 21000 + }, + { + "epoch": 0.5121234921482488, + "grad_norm": 1.3396928310394287, + "learning_rate": 0.00019697063748423682, + "loss": 1.2, + "mean_token_accuracy": 0.72161645591259, + "num_tokens": 59227700.0, + "step": 21100 + }, + { + "epoch": 0.5145506177034538, + "grad_norm": 1.4734523296356201, + "learning_rate": 0.00019690126543543324, + "loss": 1.1836, + "mean_token_accuracy": 0.7250787276029587, + "num_tokens": 59505761.0, + "step": 21200 + }, + { + "epoch": 0.5169777432586588, + "grad_norm": 1.331443190574646, + "learning_rate": 0.00019683112054353914, + "loss": 1.1275, + "mean_token_accuracy": 0.7360107409954071, + "num_tokens": 59794626.0, + "step": 21300 + }, + { + "epoch": 0.5194048688138637, + "grad_norm": 2.195629596710205, + "learning_rate": 0.00019676020336800013, + "loss": 1.1866, + "mean_token_accuracy": 0.7259843808412552, + "num_tokens": 60077760.0, + "step": 21400 + }, + { + "epoch": 0.5218319943690687, + "grad_norm": 1.1297053098678589, + "learning_rate": 0.00019668851447442138, + "loss": 1.1691, + "mean_token_accuracy": 0.7298132872581482, + "num_tokens": 60357449.0, + "step": 21500 + }, + { + "epoch": 0.5242591199242737, + "grad_norm": 1.1505388021469116, + "learning_rate": 0.0001966160544345628, + "loss": 1.1943, + "mean_token_accuracy": 0.7239989709854125, + "num_tokens": 60659506.0, + "step": 21600 + }, + { + "epoch": 0.5266862454794786, + "grad_norm": 1.6284935474395752, + "learning_rate": 0.0001965428238263348, + "loss": 1.139, + "mean_token_accuracy": 0.7326309901475906, + "num_tokens": 60928432.0, + "step": 21700 + }, + { + "epoch": 0.5291133710346836, + "grad_norm": 1.4899412393569946, + "learning_rate": 0.00019646882323379333, + "loss": 1.1266, + "mean_token_accuracy": 0.7380794870853424, + "num_tokens": 61209379.0, + "step": 21800 + }, + { + "epoch": 0.5315404965898886, + "grad_norm": 1.0630362033843994, + "learning_rate": 0.0001963940532471355, + "loss": 1.1755, + "mean_token_accuracy": 0.7278030133247375, + "num_tokens": 61482390.0, + "step": 21900 + }, + { + "epoch": 0.5339676221450935, + "grad_norm": 1.0710209608078003, + "learning_rate": 0.00019631851446269487, + "loss": 1.1893, + "mean_token_accuracy": 0.7246757209300995, + "num_tokens": 61763958.0, + "step": 22000 + }, + { + "epoch": 0.5363947477002985, + "grad_norm": 1.2997487783432007, + "learning_rate": 0.0001962422074829364, + "loss": 1.1444, + "mean_token_accuracy": 0.7331423789262772, + "num_tokens": 62035789.0, + "step": 22100 + }, + { + "epoch": 0.5388218732555035, + "grad_norm": 1.2150485515594482, + "learning_rate": 0.00019616513291645206, + "loss": 1.1438, + "mean_token_accuracy": 0.7333634513616561, + "num_tokens": 62323217.0, + "step": 22200 + }, + { + "epoch": 0.5412489988107084, + "grad_norm": 1.5231380462646484, + "learning_rate": 0.00019608729137795557, + "loss": 1.1868, + "mean_token_accuracy": 0.7252021282911301, + "num_tokens": 62598509.0, + "step": 22300 + }, + { + "epoch": 0.5436761243659134, + "grad_norm": 0.9912909269332886, + "learning_rate": 0.00019600868348827787, + "loss": 1.1845, + "mean_token_accuracy": 0.7267439430952072, + "num_tokens": 62883504.0, + "step": 22400 + }, + { + "epoch": 0.5461032499211185, + "grad_norm": 0.9855025410652161, + "learning_rate": 0.0001959293098743618, + "loss": 1.1333, + "mean_token_accuracy": 0.7358741110563278, + "num_tokens": 63157450.0, + "step": 22500 + }, + { + "epoch": 0.5485303754763234, + "grad_norm": 1.5229765176773071, + "learning_rate": 0.00019584917116925748, + "loss": 1.1373, + "mean_token_accuracy": 0.7370314878225327, + "num_tokens": 63435535.0, + "step": 22600 + }, + { + "epoch": 0.5509575010315284, + "grad_norm": 1.4351425170898438, + "learning_rate": 0.00019576826801211695, + "loss": 1.1541, + "mean_token_accuracy": 0.7322686624526977, + "num_tokens": 63733409.0, + "step": 22700 + }, + { + "epoch": 0.5533846265867334, + "grad_norm": 1.2064365148544312, + "learning_rate": 0.00019568660104818918, + "loss": 1.0696, + "mean_token_accuracy": 0.748774966597557, + "num_tokens": 64021197.0, + "step": 22800 + }, + { + "epoch": 0.5558117521419383, + "grad_norm": 1.068001627922058, + "learning_rate": 0.00019560417092881504, + "loss": 1.1123, + "mean_token_accuracy": 0.7405225545167923, + "num_tokens": 64306536.0, + "step": 22900 + }, + { + "epoch": 0.5582388776971433, + "grad_norm": 0.9885738492012024, + "learning_rate": 0.00019552097831142194, + "loss": 1.1282, + "mean_token_accuracy": 0.7359859699010849, + "num_tokens": 64585245.0, + "step": 23000 + }, + { + "epoch": 0.5606660032523483, + "grad_norm": 1.1703755855560303, + "learning_rate": 0.0001954370238595186, + "loss": 1.1365, + "mean_token_accuracy": 0.7343781024217606, + "num_tokens": 64864941.0, + "step": 23100 + }, + { + "epoch": 0.5630931288075532, + "grad_norm": 1.25601327419281, + "learning_rate": 0.00019535230824268993, + "loss": 1.0856, + "mean_token_accuracy": 0.7458273768424988, + "num_tokens": 65148069.0, + "step": 23200 + }, + { + "epoch": 0.5655202543627582, + "grad_norm": 1.2828888893127441, + "learning_rate": 0.00019526683213659141, + "loss": 1.14, + "mean_token_accuracy": 0.7356480902433395, + "num_tokens": 65425846.0, + "step": 23300 + }, + { + "epoch": 0.5679473799179632, + "grad_norm": 1.0274039506912231, + "learning_rate": 0.00019518059622294405, + "loss": 1.1439, + "mean_token_accuracy": 0.7343963479995728, + "num_tokens": 65696885.0, + "step": 23400 + }, + { + "epoch": 0.5703745054731681, + "grad_norm": 1.795361042022705, + "learning_rate": 0.00019509360118952852, + "loss": 1.0995, + "mean_token_accuracy": 0.7427353280782699, + "num_tokens": 65967890.0, + "step": 23500 + }, + { + "epoch": 0.5728016310283731, + "grad_norm": 1.314911961555481, + "learning_rate": 0.00019500584773018016, + "loss": 1.1134, + "mean_token_accuracy": 0.7405674111843109, + "num_tokens": 66253601.0, + "step": 23600 + }, + { + "epoch": 0.5752287565835781, + "grad_norm": 1.4357415437698364, + "learning_rate": 0.000194917336544783, + "loss": 1.1355, + "mean_token_accuracy": 0.7366157558560371, + "num_tokens": 66546485.0, + "step": 23700 + }, + { + "epoch": 0.577655882138783, + "grad_norm": 0.9053491950035095, + "learning_rate": 0.00019482806833926445, + "loss": 1.1417, + "mean_token_accuracy": 0.7364838290214538, + "num_tokens": 66832391.0, + "step": 23800 + }, + { + "epoch": 0.580083007693988, + "grad_norm": 0.9933082461357117, + "learning_rate": 0.00019473804382558955, + "loss": 1.1164, + "mean_token_accuracy": 0.7381259691715241, + "num_tokens": 67100199.0, + "step": 23900 + }, + { + "epoch": 0.582510133249193, + "grad_norm": 1.1587954759597778, + "learning_rate": 0.0001946472637217554, + "loss": 1.114, + "mean_token_accuracy": 0.7397536563873292, + "num_tokens": 67372483.0, + "step": 24000 + }, + { + "epoch": 0.584937258804398, + "grad_norm": 1.5403295755386353, + "learning_rate": 0.0001945557287517853, + "loss": 1.116, + "mean_token_accuracy": 0.7395440202951431, + "num_tokens": 67648130.0, + "step": 24100 + }, + { + "epoch": 0.5873643843596029, + "grad_norm": 1.1019656658172607, + "learning_rate": 0.00019446343964572304, + "loss": 1.1711, + "mean_token_accuracy": 0.7280935817956924, + "num_tokens": 67930208.0, + "step": 24200 + }, + { + "epoch": 0.5897915099148079, + "grad_norm": 0.9746718406677246, + "learning_rate": 0.00019437039713962717, + "loss": 1.1329, + "mean_token_accuracy": 0.7364810371398925, + "num_tokens": 68200240.0, + "step": 24300 + }, + { + "epoch": 0.5922186354700129, + "grad_norm": 1.0284191370010376, + "learning_rate": 0.0001942766019755649, + "loss": 1.0529, + "mean_token_accuracy": 0.7535615694522858, + "num_tokens": 68483550.0, + "step": 24400 + }, + { + "epoch": 0.5946457610252178, + "grad_norm": 0.9971494674682617, + "learning_rate": 0.00019418205490160636, + "loss": 1.1004, + "mean_token_accuracy": 0.7428427731990814, + "num_tokens": 68770836.0, + "step": 24500 + }, + { + "epoch": 0.5970728865804228, + "grad_norm": 1.1013323068618774, + "learning_rate": 0.0001940867566718187, + "loss": 1.1384, + "mean_token_accuracy": 0.7362033921480179, + "num_tokens": 69054580.0, + "step": 24600 + }, + { + "epoch": 0.5995000121356278, + "grad_norm": 1.1625392436981201, + "learning_rate": 0.00019399070804625984, + "loss": 1.063, + "mean_token_accuracy": 0.748725289106369, + "num_tokens": 69340258.0, + "step": 24700 + }, + { + "epoch": 0.6019271376908327, + "grad_norm": 1.9240310192108154, + "learning_rate": 0.00019389390979097256, + "loss": 1.0823, + "mean_token_accuracy": 0.7495817458629608, + "num_tokens": 69619650.0, + "step": 24800 + }, + { + "epoch": 0.6043542632460377, + "grad_norm": 1.3660378456115723, + "learning_rate": 0.0001937963626779784, + "loss": 1.0272, + "mean_token_accuracy": 0.7577913010120392, + "num_tokens": 69890767.0, + "step": 24900 + }, + { + "epoch": 0.6067813888012427, + "grad_norm": 1.5367549657821655, + "learning_rate": 0.00019369806748527148, + "loss": 1.0573, + "mean_token_accuracy": 0.750820305943489, + "num_tokens": 70163631.0, + "step": 25000 + }, + { + "epoch": 0.6092085143564476, + "grad_norm": 1.065262794494629, + "learning_rate": 0.0001935990249968122, + "loss": 1.0723, + "mean_token_accuracy": 0.7507969135046005, + "num_tokens": 70453726.0, + "step": 25100 + }, + { + "epoch": 0.6116356399116526, + "grad_norm": 1.0172377824783325, + "learning_rate": 0.00019349923600252123, + "loss": 1.11, + "mean_token_accuracy": 0.7407309454679489, + "num_tokens": 70747381.0, + "step": 25200 + }, + { + "epoch": 0.6140627654668576, + "grad_norm": 1.1468844413757324, + "learning_rate": 0.00019339870129827285, + "loss": 1.0578, + "mean_token_accuracy": 0.7536929363012314, + "num_tokens": 71022840.0, + "step": 25300 + }, + { + "epoch": 0.6164898910220625, + "grad_norm": 1.2578643560409546, + "learning_rate": 0.00019329742168588894, + "loss": 1.0413, + "mean_token_accuracy": 0.7559024041891098, + "num_tokens": 71309539.0, + "step": 25400 + }, + { + "epoch": 0.6189170165772675, + "grad_norm": 1.2665239572525024, + "learning_rate": 0.00019319539797313243, + "loss": 1.0336, + "mean_token_accuracy": 0.7568505030870437, + "num_tokens": 71573364.0, + "step": 25500 + }, + { + "epoch": 0.6213441421324725, + "grad_norm": 1.1429393291473389, + "learning_rate": 0.00019309263097370085, + "loss": 1.0491, + "mean_token_accuracy": 0.7536306053400039, + "num_tokens": 71848695.0, + "step": 25600 + }, + { + "epoch": 0.6237712676876774, + "grad_norm": 1.2720645666122437, + "learning_rate": 0.00019298912150721989, + "loss": 1.0413, + "mean_token_accuracy": 0.7553714007139206, + "num_tokens": 72132905.0, + "step": 25700 + }, + { + "epoch": 0.6261983932428824, + "grad_norm": 1.1925904750823975, + "learning_rate": 0.0001928848703992368, + "loss": 1.057, + "mean_token_accuracy": 0.7516288584470749, + "num_tokens": 72419452.0, + "step": 25800 + }, + { + "epoch": 0.6286255187980875, + "grad_norm": 0.9983070492744446, + "learning_rate": 0.0001927798784812139, + "loss": 1.0228, + "mean_token_accuracy": 0.7606069624423981, + "num_tokens": 72719336.0, + "step": 25900 + }, + { + "epoch": 0.6310526443532924, + "grad_norm": 1.1177300214767456, + "learning_rate": 0.00019267414659052189, + "loss": 1.0434, + "mean_token_accuracy": 0.7564522981643677, + "num_tokens": 72994978.0, + "step": 26000 + }, + { + "epoch": 0.6334797699084974, + "grad_norm": 1.1599113941192627, + "learning_rate": 0.0001925676755704331, + "loss": 1.0577, + "mean_token_accuracy": 0.7541283696889878, + "num_tokens": 73281210.0, + "step": 26100 + }, + { + "epoch": 0.6359068954637024, + "grad_norm": 1.297285556793213, + "learning_rate": 0.00019246046627011495, + "loss": 1.0553, + "mean_token_accuracy": 0.7551116102933884, + "num_tokens": 73576523.0, + "step": 26200 + }, + { + "epoch": 0.6383340210189073, + "grad_norm": 1.4931613206863403, + "learning_rate": 0.00019235251954462294, + "loss": 1.0745, + "mean_token_accuracy": 0.7492081993818283, + "num_tokens": 73848522.0, + "step": 26300 + }, + { + "epoch": 0.6407611465741123, + "grad_norm": 1.0133391618728638, + "learning_rate": 0.00019224383625489412, + "loss": 1.0433, + "mean_token_accuracy": 0.7565659403800964, + "num_tokens": 74124032.0, + "step": 26400 + }, + { + "epoch": 0.6431882721293173, + "grad_norm": 1.2665462493896484, + "learning_rate": 0.00019213441726773992, + "loss": 1.0518, + "mean_token_accuracy": 0.75186956346035, + "num_tokens": 74404428.0, + "step": 26500 + }, + { + "epoch": 0.6456153976845223, + "grad_norm": 0.9439951777458191, + "learning_rate": 0.0001920242634558394, + "loss": 1.0292, + "mean_token_accuracy": 0.7585177063941956, + "num_tokens": 74696666.0, + "step": 26600 + }, + { + "epoch": 0.6480425232397272, + "grad_norm": 1.232807993888855, + "learning_rate": 0.00019191337569773235, + "loss": 0.9591, + "mean_token_accuracy": 0.7707934284210205, + "num_tokens": 74974515.0, + "step": 26700 + }, + { + "epoch": 0.6504696487949322, + "grad_norm": 1.106130599975586, + "learning_rate": 0.00019180175487781216, + "loss": 1.0185, + "mean_token_accuracy": 0.7600296914577485, + "num_tokens": 75251125.0, + "step": 26800 + }, + { + "epoch": 0.6528967743501372, + "grad_norm": 0.9902170300483704, + "learning_rate": 0.00019168940188631875, + "loss": 0.9849, + "mean_token_accuracy": 0.7675464844703674, + "num_tokens": 75543621.0, + "step": 26900 + }, + { + "epoch": 0.6553238999053421, + "grad_norm": 1.3111634254455566, + "learning_rate": 0.00019157631761933163, + "loss": 0.9863, + "mean_token_accuracy": 0.7691891992092132, + "num_tokens": 75808450.0, + "step": 27000 + }, + { + "epoch": 0.6577510254605471, + "grad_norm": 0.9136918783187866, + "learning_rate": 0.00019146250297876255, + "loss": 0.9892, + "mean_token_accuracy": 0.7655776154994964, + "num_tokens": 76091361.0, + "step": 27100 + }, + { + "epoch": 0.6601781510157521, + "grad_norm": 1.9679793119430542, + "learning_rate": 0.0001913479588723485, + "loss": 1.0011, + "mean_token_accuracy": 0.7665372800827026, + "num_tokens": 76372098.0, + "step": 27200 + }, + { + "epoch": 0.662605276570957, + "grad_norm": 1.099344253540039, + "learning_rate": 0.00019123268621364426, + "loss": 1.019, + "mean_token_accuracy": 0.7607282799482346, + "num_tokens": 76654534.0, + "step": 27300 + }, + { + "epoch": 0.665032402126162, + "grad_norm": 0.9826762080192566, + "learning_rate": 0.00019111668592201536, + "loss": 0.9837, + "mean_token_accuracy": 0.7672662001848221, + "num_tokens": 76931744.0, + "step": 27400 + }, + { + "epoch": 0.667459527681367, + "grad_norm": 1.080349087715149, + "learning_rate": 0.00019099995892263056, + "loss": 1.0261, + "mean_token_accuracy": 0.7605299288034439, + "num_tokens": 77201535.0, + "step": 27500 + }, + { + "epoch": 0.6698866532365719, + "grad_norm": 1.5150740146636963, + "learning_rate": 0.00019088250614645452, + "loss": 0.9962, + "mean_token_accuracy": 0.7669206023216247, + "num_tokens": 77485467.0, + "step": 27600 + }, + { + "epoch": 0.6723137787917769, + "grad_norm": 0.8500184416770935, + "learning_rate": 0.00019076432853024043, + "loss": 0.9651, + "mean_token_accuracy": 0.7721401613950729, + "num_tokens": 77764169.0, + "step": 27700 + }, + { + "epoch": 0.6747409043469819, + "grad_norm": 1.0744963884353638, + "learning_rate": 0.0001906454270165224, + "loss": 0.9603, + "mean_token_accuracy": 0.7727346140146255, + "num_tokens": 78047351.0, + "step": 27800 + }, + { + "epoch": 0.6771680299021868, + "grad_norm": 1.2541007995605469, + "learning_rate": 0.00019052580255360813, + "loss": 0.9878, + "mean_token_accuracy": 0.768582199215889, + "num_tokens": 78322847.0, + "step": 27900 + }, + { + "epoch": 0.6795951554573918, + "grad_norm": 0.8931077718734741, + "learning_rate": 0.00019040545609557127, + "loss": 0.9275, + "mean_token_accuracy": 0.7798711568117142, + "num_tokens": 78611878.0, + "step": 28000 + }, + { + "epoch": 0.6820222810125968, + "grad_norm": 1.128255844116211, + "learning_rate": 0.0001902843886022437, + "loss": 0.9168, + "mean_token_accuracy": 0.7815976619720459, + "num_tokens": 78876123.0, + "step": 28100 + }, + { + "epoch": 0.6844494065678017, + "grad_norm": 1.1579947471618652, + "learning_rate": 0.00019016260103920803, + "loss": 0.9599, + "mean_token_accuracy": 0.7729695934057236, + "num_tokens": 79167916.0, + "step": 28200 + }, + { + "epoch": 0.6868765321230067, + "grad_norm": 2.0827414989471436, + "learning_rate": 0.00019004009437778983, + "loss": 0.9421, + "mean_token_accuracy": 0.7752230167388916, + "num_tokens": 79460524.0, + "step": 28300 + }, + { + "epoch": 0.6893036576782117, + "grad_norm": 1.0424773693084717, + "learning_rate": 0.00018991686959504987, + "loss": 0.9958, + "mean_token_accuracy": 0.7665992087125778, + "num_tokens": 79743868.0, + "step": 28400 + }, + { + "epoch": 0.6917307832334166, + "grad_norm": 0.9780852794647217, + "learning_rate": 0.00018979292767377638, + "loss": 0.9604, + "mean_token_accuracy": 0.7735722011327744, + "num_tokens": 80041028.0, + "step": 28500 + }, + { + "epoch": 0.6941579087886216, + "grad_norm": 1.0590211153030396, + "learning_rate": 0.0001896682696024772, + "loss": 0.9759, + "mean_token_accuracy": 0.7699248534440994, + "num_tokens": 80325828.0, + "step": 28600 + }, + { + "epoch": 0.6965850343438266, + "grad_norm": 0.9975331425666809, + "learning_rate": 0.0001895428963753718, + "loss": 0.9459, + "mean_token_accuracy": 0.7763609528541565, + "num_tokens": 80602290.0, + "step": 28700 + }, + { + "epoch": 0.6990121598990315, + "grad_norm": 1.1276270151138306, + "learning_rate": 0.00018941680899238353, + "loss": 0.9968, + "mean_token_accuracy": 0.7691206991672516, + "num_tokens": 80870582.0, + "step": 28800 + }, + { + "epoch": 0.7014392854542365, + "grad_norm": 0.8435887098312378, + "learning_rate": 0.0001892900084591314, + "loss": 0.9097, + "mean_token_accuracy": 0.7834201455116272, + "num_tokens": 81156560.0, + "step": 28900 + }, + { + "epoch": 0.7038664110094415, + "grad_norm": 1.2956368923187256, + "learning_rate": 0.00018916249578692233, + "loss": 0.9028, + "mean_token_accuracy": 0.7854771196842194, + "num_tokens": 81431849.0, + "step": 29000 + }, + { + "epoch": 0.7062935365646464, + "grad_norm": 1.1730372905731201, + "learning_rate": 0.00018903427199274292, + "loss": 0.9284, + "mean_token_accuracy": 0.7804183250665665, + "num_tokens": 81722470.0, + "step": 29100 + }, + { + "epoch": 0.7087206621198514, + "grad_norm": 1.1425467729568481, + "learning_rate": 0.00018890533809925135, + "loss": 0.9165, + "mean_token_accuracy": 0.7836980718374252, + "num_tokens": 81987042.0, + "step": 29200 + }, + { + "epoch": 0.7111477876750564, + "grad_norm": 1.4441839456558228, + "learning_rate": 0.00018877569513476927, + "loss": 0.9604, + "mean_token_accuracy": 0.7747070676088333, + "num_tokens": 82271974.0, + "step": 29300 + }, + { + "epoch": 0.7135749132302615, + "grad_norm": 0.988593339920044, + "learning_rate": 0.00018864534413327356, + "loss": 0.9494, + "mean_token_accuracy": 0.776980972290039, + "num_tokens": 82549932.0, + "step": 29400 + }, + { + "epoch": 0.7160020387854664, + "grad_norm": 1.1505811214447021, + "learning_rate": 0.00018851428613438816, + "loss": 0.8924, + "mean_token_accuracy": 0.7875621300935746, + "num_tokens": 82838862.0, + "step": 29500 + }, + { + "epoch": 0.7184291643406714, + "grad_norm": 1.0360952615737915, + "learning_rate": 0.00018838252218337565, + "loss": 0.9386, + "mean_token_accuracy": 0.7802232497930527, + "num_tokens": 83121770.0, + "step": 29600 + }, + { + "epoch": 0.7208562898958764, + "grad_norm": 1.3923313617706299, + "learning_rate": 0.00018825005333112898, + "loss": 0.972, + "mean_token_accuracy": 0.771819856762886, + "num_tokens": 83398476.0, + "step": 29700 + }, + { + "epoch": 0.7232834154510813, + "grad_norm": 1.662172794342041, + "learning_rate": 0.00018811688063416314, + "loss": 0.9601, + "mean_token_accuracy": 0.7737440180778503, + "num_tokens": 83689236.0, + "step": 29800 + }, + { + "epoch": 0.7257105410062863, + "grad_norm": 0.9064234495162964, + "learning_rate": 0.00018798300515460663, + "loss": 0.8928, + "mean_token_accuracy": 0.787415645122528, + "num_tokens": 83972483.0, + "step": 29900 + }, + { + "epoch": 0.7281376665614913, + "grad_norm": 1.5885084867477417, + "learning_rate": 0.00018784842796019312, + "loss": 0.8973, + "mean_token_accuracy": 0.7884204018115998, + "num_tokens": 84253687.0, + "step": 30000 + }, + { + "epoch": 0.7305647921166962, + "grad_norm": 1.08694326877594, + "learning_rate": 0.0001877131501242528, + "loss": 0.8678, + "mean_token_accuracy": 0.7936598181724548, + "num_tokens": 84535890.0, + "step": 30100 + }, + { + "epoch": 0.7329919176719012, + "grad_norm": 1.0348883867263794, + "learning_rate": 0.00018757717272570383, + "loss": 0.9168, + "mean_token_accuracy": 0.7829198455810547, + "num_tokens": 84824617.0, + "step": 30200 + }, + { + "epoch": 0.7354190432271062, + "grad_norm": 1.0495657920837402, + "learning_rate": 0.00018744049684904393, + "loss": 0.8521, + "mean_token_accuracy": 0.7954359531402588, + "num_tokens": 85100185.0, + "step": 30300 + }, + { + "epoch": 0.7378461687823111, + "grad_norm": 0.7942479252815247, + "learning_rate": 0.00018730312358434148, + "loss": 0.8575, + "mean_token_accuracy": 0.7958718985319138, + "num_tokens": 85394788.0, + "step": 30400 + }, + { + "epoch": 0.7402732943375161, + "grad_norm": 1.0841585397720337, + "learning_rate": 0.00018716505402722688, + "loss": 0.869, + "mean_token_accuracy": 0.7918435066938401, + "num_tokens": 85679932.0, + "step": 30500 + }, + { + "epoch": 0.7427004198927211, + "grad_norm": 1.061931848526001, + "learning_rate": 0.00018702628927888396, + "loss": 0.9062, + "mean_token_accuracy": 0.7857851403951644, + "num_tokens": 85962110.0, + "step": 30600 + }, + { + "epoch": 0.745127545447926, + "grad_norm": 0.9450345635414124, + "learning_rate": 0.00018688683044604113, + "loss": 0.8633, + "mean_token_accuracy": 0.7948847645521164, + "num_tokens": 86240268.0, + "step": 30700 + }, + { + "epoch": 0.747554671003131, + "grad_norm": 1.069827914237976, + "learning_rate": 0.00018674667864096236, + "loss": 0.8936, + "mean_token_accuracy": 0.7883854901790619, + "num_tokens": 86519108.0, + "step": 30800 + }, + { + "epoch": 0.749981796558336, + "grad_norm": 0.970453679561615, + "learning_rate": 0.0001866058349814386, + "loss": 0.8866, + "mean_token_accuracy": 0.7884615308046341, + "num_tokens": 86803385.0, + "step": 30900 + }, + { + "epoch": 0.7524089221135409, + "grad_norm": 1.24496328830719, + "learning_rate": 0.0001864643005907787, + "loss": 0.8966, + "mean_token_accuracy": 0.7896036711335183, + "num_tokens": 87080614.0, + "step": 31000 + }, + { + "epoch": 0.7548360476687459, + "grad_norm": 0.9882015585899353, + "learning_rate": 0.0001863220765978005, + "loss": 0.8813, + "mean_token_accuracy": 0.7906145012378692, + "num_tokens": 87355322.0, + "step": 31100 + }, + { + "epoch": 0.7572631732239509, + "grad_norm": 1.0050567388534546, + "learning_rate": 0.0001861791641368218, + "loss": 0.8674, + "mean_token_accuracy": 0.7941360598802567, + "num_tokens": 87652338.0, + "step": 31200 + }, + { + "epoch": 0.7596902987791558, + "grad_norm": 0.9534517526626587, + "learning_rate": 0.00018603556434765127, + "loss": 0.8625, + "mean_token_accuracy": 0.7951251226663589, + "num_tokens": 87925741.0, + "step": 31300 + }, + { + "epoch": 0.7621174243343608, + "grad_norm": 0.8884904980659485, + "learning_rate": 0.00018589127837557957, + "loss": 0.868, + "mean_token_accuracy": 0.7937871700525284, + "num_tokens": 88199531.0, + "step": 31400 + }, + { + "epoch": 0.7645445498895658, + "grad_norm": 1.0444873571395874, + "learning_rate": 0.0001857463073713699, + "loss": 0.9099, + "mean_token_accuracy": 0.7861129724979401, + "num_tokens": 88484640.0, + "step": 31500 + }, + { + "epoch": 0.7669716754447707, + "grad_norm": 1.642042636871338, + "learning_rate": 0.00018560065249124906, + "loss": 0.8985, + "mean_token_accuracy": 0.7859966540336609, + "num_tokens": 88779524.0, + "step": 31600 + }, + { + "epoch": 0.7693988009999757, + "grad_norm": 0.9597316384315491, + "learning_rate": 0.00018545431489689815, + "loss": 0.8862, + "mean_token_accuracy": 0.7899703568220139, + "num_tokens": 89053357.0, + "step": 31700 + }, + { + "epoch": 0.7718259265551807, + "grad_norm": 1.1189887523651123, + "learning_rate": 0.00018530729575544328, + "loss": 0.8629, + "mean_token_accuracy": 0.7946012103557587, + "num_tokens": 89340495.0, + "step": 31800 + }, + { + "epoch": 0.7742530521103856, + "grad_norm": 1.14096999168396, + "learning_rate": 0.00018515959623944633, + "loss": 0.8709, + "mean_token_accuracy": 0.7942619401216507, + "num_tokens": 89617490.0, + "step": 31900 + }, + { + "epoch": 0.7766801776655906, + "grad_norm": 1.0808043479919434, + "learning_rate": 0.0001850112175268955, + "loss": 0.8721, + "mean_token_accuracy": 0.7936557620763779, + "num_tokens": 89893807.0, + "step": 32000 + }, + { + "epoch": 0.7791073032207956, + "grad_norm": 1.107035517692566, + "learning_rate": 0.00018486216080119608, + "loss": 0.8291, + "mean_token_accuracy": 0.8008069145679474, + "num_tokens": 90167592.0, + "step": 32100 + }, + { + "epoch": 0.7815344287760005, + "grad_norm": 1.0005587339401245, + "learning_rate": 0.0001847124272511607, + "loss": 0.854, + "mean_token_accuracy": 0.7974209851026535, + "num_tokens": 90431679.0, + "step": 32200 + }, + { + "epoch": 0.7839615543312055, + "grad_norm": 0.798092782497406, + "learning_rate": 0.00018456201807100028, + "loss": 0.9238, + "mean_token_accuracy": 0.782583869099617, + "num_tokens": 90707675.0, + "step": 32300 + }, + { + "epoch": 0.7863886798864105, + "grad_norm": 1.3278645277023315, + "learning_rate": 0.00018441093446031417, + "loss": 0.8968, + "mean_token_accuracy": 0.788212314248085, + "num_tokens": 90981022.0, + "step": 32400 + }, + { + "epoch": 0.7888158054416154, + "grad_norm": 1.225932002067566, + "learning_rate": 0.00018425917762408066, + "loss": 0.8619, + "mean_token_accuracy": 0.79517926633358, + "num_tokens": 91259313.0, + "step": 32500 + }, + { + "epoch": 0.7912429309968204, + "grad_norm": 0.8537389039993286, + "learning_rate": 0.00018410674877264744, + "loss": 0.8382, + "mean_token_accuracy": 0.7990948641300202, + "num_tokens": 91539815.0, + "step": 32600 + }, + { + "epoch": 0.7936700565520254, + "grad_norm": 1.2435106039047241, + "learning_rate": 0.00018395364912172191, + "loss": 0.8281, + "mean_token_accuracy": 0.8019495517015457, + "num_tokens": 91824981.0, + "step": 32700 + }, + { + "epoch": 0.7960971821072305, + "grad_norm": 1.0658791065216064, + "learning_rate": 0.00018379987989236148, + "loss": 0.8852, + "mean_token_accuracy": 0.7909339308738709, + "num_tokens": 92099114.0, + "step": 32800 + }, + { + "epoch": 0.7985243076624354, + "grad_norm": 1.2279072999954224, + "learning_rate": 0.00018364544231096384, + "loss": 0.8409, + "mean_token_accuracy": 0.8014136582612992, + "num_tokens": 92378255.0, + "step": 32900 + }, + { + "epoch": 0.8009514332176404, + "grad_norm": 1.0797042846679688, + "learning_rate": 0.00018349033760925709, + "loss": 0.769, + "mean_token_accuracy": 0.8132674318552017, + "num_tokens": 92657534.0, + "step": 33000 + }, + { + "epoch": 0.8033785587728454, + "grad_norm": 1.1501679420471191, + "learning_rate": 0.00018333456702429013, + "loss": 0.8413, + "mean_token_accuracy": 0.7998440200090409, + "num_tokens": 92939422.0, + "step": 33100 + }, + { + "epoch": 0.8058056843280503, + "grad_norm": 0.9171693325042725, + "learning_rate": 0.0001831781317984225, + "loss": 0.8727, + "mean_token_accuracy": 0.7943976658582688, + "num_tokens": 93234393.0, + "step": 33200 + }, + { + "epoch": 0.8082328098832553, + "grad_norm": 0.9114980101585388, + "learning_rate": 0.00018302103317931483, + "loss": 0.8398, + "mean_token_accuracy": 0.7989549374580384, + "num_tokens": 93509369.0, + "step": 33300 + }, + { + "epoch": 0.8106599354384603, + "grad_norm": 1.2949105501174927, + "learning_rate": 0.00018286327241991853, + "loss": 0.8068, + "mean_token_accuracy": 0.8070319950580597, + "num_tokens": 93785744.0, + "step": 33400 + }, + { + "epoch": 0.8130870609936652, + "grad_norm": 1.1802972555160522, + "learning_rate": 0.00018270485077846602, + "loss": 0.8512, + "mean_token_accuracy": 0.7979403012990951, + "num_tokens": 94058237.0, + "step": 33500 + }, + { + "epoch": 0.8155141865488702, + "grad_norm": 1.372752070426941, + "learning_rate": 0.0001825457695184606, + "loss": 0.8358, + "mean_token_accuracy": 0.8009763330221176, + "num_tokens": 94358495.0, + "step": 33600 + }, + { + "epoch": 0.8179413121040752, + "grad_norm": 0.97585129737854, + "learning_rate": 0.00018238602990866646, + "loss": 0.8161, + "mean_token_accuracy": 0.8052691560983658, + "num_tokens": 94633351.0, + "step": 33700 + }, + { + "epoch": 0.8203684376592801, + "grad_norm": 1.0736268758773804, + "learning_rate": 0.0001822256332230985, + "loss": 0.8244, + "mean_token_accuracy": 0.8037301182746888, + "num_tokens": 94915600.0, + "step": 33800 + }, + { + "epoch": 0.8227955632144851, + "grad_norm": 1.2356585264205933, + "learning_rate": 0.00018206458074101204, + "loss": 0.8763, + "mean_token_accuracy": 0.7945786851644516, + "num_tokens": 95201992.0, + "step": 33900 + }, + { + "epoch": 0.8252226887696901, + "grad_norm": 1.520247459411621, + "learning_rate": 0.000181902873746893, + "loss": 0.8785, + "mean_token_accuracy": 0.7953230202198028, + "num_tokens": 95493649.0, + "step": 34000 + }, + { + "epoch": 0.827649814324895, + "grad_norm": 1.2150897979736328, + "learning_rate": 0.0001817405135304472, + "loss": 0.8187, + "mean_token_accuracy": 0.8049602442979813, + "num_tokens": 95764371.0, + "step": 34100 + }, + { + "epoch": 0.8300769398801, + "grad_norm": 1.0239194631576538, + "learning_rate": 0.00018157750138659038, + "loss": 0.8111, + "mean_token_accuracy": 0.8069447958469391, + "num_tokens": 96057388.0, + "step": 34200 + }, + { + "epoch": 0.832504065435305, + "grad_norm": 1.225995421409607, + "learning_rate": 0.00018141383861543773, + "loss": 0.785, + "mean_token_accuracy": 0.81030932366848, + "num_tokens": 96320139.0, + "step": 34300 + }, + { + "epoch": 0.8349311909905099, + "grad_norm": 1.0512681007385254, + "learning_rate": 0.00018124952652229362, + "loss": 0.8081, + "mean_token_accuracy": 0.8077819764614105, + "num_tokens": 96586361.0, + "step": 34400 + }, + { + "epoch": 0.8373583165457149, + "grad_norm": 0.9870871305465698, + "learning_rate": 0.00018108456641764104, + "loss": 0.7915, + "mean_token_accuracy": 0.8107606899738312, + "num_tokens": 96851813.0, + "step": 34500 + }, + { + "epoch": 0.8397854421009199, + "grad_norm": 1.0771749019622803, + "learning_rate": 0.00018091895961713135, + "loss": 0.8465, + "mean_token_accuracy": 0.8007239735126496, + "num_tokens": 97111419.0, + "step": 34600 + }, + { + "epoch": 0.8422125676561248, + "grad_norm": 1.0361714363098145, + "learning_rate": 0.00018075270744157367, + "loss": 0.7617, + "mean_token_accuracy": 0.8161041277647019, + "num_tokens": 97390477.0, + "step": 34700 + }, + { + "epoch": 0.8446396932113298, + "grad_norm": 1.1362801790237427, + "learning_rate": 0.0001805858112169242, + "loss": 0.7813, + "mean_token_accuracy": 0.8107654428482056, + "num_tokens": 97672616.0, + "step": 34800 + }, + { + "epoch": 0.8470668187665348, + "grad_norm": 1.2550112009048462, + "learning_rate": 0.00018041827227427602, + "loss": 0.8187, + "mean_token_accuracy": 0.8066113740205765, + "num_tokens": 97936627.0, + "step": 34900 + }, + { + "epoch": 0.8494939443217397, + "grad_norm": 1.050264835357666, + "learning_rate": 0.00018025009194984817, + "loss": 0.8027, + "mean_token_accuracy": 0.8081069684028626, + "num_tokens": 98211705.0, + "step": 35000 + }, + { + "epoch": 0.8519210698769447, + "grad_norm": 1.0110602378845215, + "learning_rate": 0.00018008127158497498, + "loss": 0.751, + "mean_token_accuracy": 0.8187358456850052, + "num_tokens": 98493819.0, + "step": 35100 + }, + { + "epoch": 0.8543481954321497, + "grad_norm": 0.7575352191925049, + "learning_rate": 0.00017991181252609558, + "loss": 0.8254, + "mean_token_accuracy": 0.8047711282968522, + "num_tokens": 98772156.0, + "step": 35200 + }, + { + "epoch": 0.8567753209873546, + "grad_norm": 1.2765109539031982, + "learning_rate": 0.000179741716124743, + "loss": 0.7522, + "mean_token_accuracy": 0.8180702245235443, + "num_tokens": 99050074.0, + "step": 35300 + }, + { + "epoch": 0.8592024465425596, + "grad_norm": 0.955980658531189, + "learning_rate": 0.00017957098373753348, + "loss": 0.7575, + "mean_token_accuracy": 0.8179322808980942, + "num_tokens": 99344611.0, + "step": 35400 + }, + { + "epoch": 0.8616295720977646, + "grad_norm": 1.1932599544525146, + "learning_rate": 0.00017939961672615564, + "loss": 0.8472, + "mean_token_accuracy": 0.801171503663063, + "num_tokens": 99619715.0, + "step": 35500 + }, + { + "epoch": 0.8640566976529696, + "grad_norm": 1.0699065923690796, + "learning_rate": 0.00017922761645735945, + "loss": 0.7807, + "mean_token_accuracy": 0.8139619028568268, + "num_tokens": 99904024.0, + "step": 35600 + }, + { + "epoch": 0.8664838232081745, + "grad_norm": 0.8689550757408142, + "learning_rate": 0.00017905498430294565, + "loss": 0.7973, + "mean_token_accuracy": 0.8107730603218078, + "num_tokens": 100180461.0, + "step": 35700 + }, + { + "epoch": 0.8689109487633795, + "grad_norm": 1.159509539604187, + "learning_rate": 0.00017888172163975453, + "loss": 0.7626, + "mean_token_accuracy": 0.8172780108451844, + "num_tokens": 100454222.0, + "step": 35800 + }, + { + "epoch": 0.8713380743185845, + "grad_norm": 1.2939770221710205, + "learning_rate": 0.00017870782984965498, + "loss": 0.8335, + "mean_token_accuracy": 0.8025886970758438, + "num_tokens": 100749127.0, + "step": 35900 + }, + { + "epoch": 0.8737651998737894, + "grad_norm": 1.2724905014038086, + "learning_rate": 0.0001785333103195337, + "loss": 0.7812, + "mean_token_accuracy": 0.8140066814422607, + "num_tokens": 101028153.0, + "step": 36000 + }, + { + "epoch": 0.8761923254289944, + "grad_norm": 0.8696538209915161, + "learning_rate": 0.00017835816444128387, + "loss": 0.7961, + "mean_token_accuracy": 0.8110569196939469, + "num_tokens": 101312954.0, + "step": 36100 + }, + { + "epoch": 0.8786194509841994, + "grad_norm": 0.9427557587623596, + "learning_rate": 0.0001781823936117942, + "loss": 0.751, + "mean_token_accuracy": 0.8197416269779205, + "num_tokens": 101593661.0, + "step": 36200 + }, + { + "epoch": 0.8810465765394044, + "grad_norm": 1.0701342821121216, + "learning_rate": 0.0001780059992329377, + "loss": 0.7963, + "mean_token_accuracy": 0.8096175003051758, + "num_tokens": 101867800.0, + "step": 36300 + }, + { + "epoch": 0.8834737020946094, + "grad_norm": 0.8264694809913635, + "learning_rate": 0.00017782898271156058, + "loss": 0.7706, + "mean_token_accuracy": 0.8167759263515473, + "num_tokens": 102150665.0, + "step": 36400 + }, + { + "epoch": 0.8859008276498144, + "grad_norm": 0.8843175768852234, + "learning_rate": 0.000177651345459471, + "loss": 0.7896, + "mean_token_accuracy": 0.8110161870718002, + "num_tokens": 102440416.0, + "step": 36500 + }, + { + "epoch": 0.8883279532050193, + "grad_norm": 0.9907525777816772, + "learning_rate": 0.00017747308889342776, + "loss": 0.7726, + "mean_token_accuracy": 0.8153502416610717, + "num_tokens": 102723528.0, + "step": 36600 + }, + { + "epoch": 0.8907550787602243, + "grad_norm": 1.609206199645996, + "learning_rate": 0.00017729421443512906, + "loss": 0.7236, + "mean_token_accuracy": 0.8254850769042968, + "num_tokens": 102990192.0, + "step": 36700 + }, + { + "epoch": 0.8931822043154293, + "grad_norm": 1.0116157531738281, + "learning_rate": 0.00017711472351120117, + "loss": 0.734, + "mean_token_accuracy": 0.8239784175157547, + "num_tokens": 103268116.0, + "step": 36800 + }, + { + "epoch": 0.8956093298706342, + "grad_norm": 0.8444027304649353, + "learning_rate": 0.000176934617553187, + "loss": 0.7513, + "mean_token_accuracy": 0.819818964600563, + "num_tokens": 103548527.0, + "step": 36900 + }, + { + "epoch": 0.8980364554258392, + "grad_norm": 0.7157046794891357, + "learning_rate": 0.00017675389799753474, + "loss": 0.7635, + "mean_token_accuracy": 0.8183335334062576, + "num_tokens": 103842672.0, + "step": 37000 + }, + { + "epoch": 0.9004635809810442, + "grad_norm": 0.8269189596176147, + "learning_rate": 0.00017657256628558635, + "loss": 0.746, + "mean_token_accuracy": 0.8198094671964645, + "num_tokens": 104117428.0, + "step": 37100 + }, + { + "epoch": 0.9028907065362491, + "grad_norm": 1.2032946348190308, + "learning_rate": 0.00017639062386356604, + "loss": 0.7487, + "mean_token_accuracy": 0.8211363846063614, + "num_tokens": 104403750.0, + "step": 37200 + }, + { + "epoch": 0.9053178320914541, + "grad_norm": 0.7613286972045898, + "learning_rate": 0.00017620807218256889, + "loss": 0.7342, + "mean_token_accuracy": 0.8234205913543701, + "num_tokens": 104680182.0, + "step": 37300 + }, + { + "epoch": 0.9077449576466591, + "grad_norm": 0.8773027658462524, + "learning_rate": 0.00017602491269854906, + "loss": 0.767, + "mean_token_accuracy": 0.817679477930069, + "num_tokens": 104968706.0, + "step": 37400 + }, + { + "epoch": 0.910172083201864, + "grad_norm": 0.8545549511909485, + "learning_rate": 0.00017584114687230838, + "loss": 0.7352, + "mean_token_accuracy": 0.8224795866012573, + "num_tokens": 105262204.0, + "step": 37500 + }, + { + "epoch": 0.912599208757069, + "grad_norm": 0.9424647092819214, + "learning_rate": 0.0001756567761694845, + "loss": 0.6923, + "mean_token_accuracy": 0.834279500246048, + "num_tokens": 105539111.0, + "step": 37600 + }, + { + "epoch": 0.915026334312274, + "grad_norm": 0.7908113598823547, + "learning_rate": 0.00017547180206053945, + "loss": 0.7413, + "mean_token_accuracy": 0.8225541985034943, + "num_tokens": 105823216.0, + "step": 37700 + }, + { + "epoch": 0.917453459867479, + "grad_norm": 1.1390018463134766, + "learning_rate": 0.00017528622602074764, + "loss": 0.7383, + "mean_token_accuracy": 0.8238617664575577, + "num_tokens": 106112129.0, + "step": 37800 + }, + { + "epoch": 0.9198805854226839, + "grad_norm": 0.9344626069068909, + "learning_rate": 0.00017510004953018426, + "loss": 0.7145, + "mean_token_accuracy": 0.8279504466056824, + "num_tokens": 106395386.0, + "step": 37900 + }, + { + "epoch": 0.9223077109778889, + "grad_norm": 0.8515225052833557, + "learning_rate": 0.00017491327407371352, + "loss": 0.6988, + "mean_token_accuracy": 0.830140540599823, + "num_tokens": 106673179.0, + "step": 38000 + }, + { + "epoch": 0.9247348365330939, + "grad_norm": 0.8123283982276917, + "learning_rate": 0.0001747259011409766, + "loss": 0.7336, + "mean_token_accuracy": 0.8250963681936264, + "num_tokens": 106940340.0, + "step": 38100 + }, + { + "epoch": 0.9271619620882988, + "grad_norm": 0.9882307052612305, + "learning_rate": 0.00017453793222637997, + "loss": 0.7434, + "mean_token_accuracy": 0.8226048046350479, + "num_tokens": 107217056.0, + "step": 38200 + }, + { + "epoch": 0.9295890876435038, + "grad_norm": 1.0422884225845337, + "learning_rate": 0.00017434936882908337, + "loss": 0.7875, + "mean_token_accuracy": 0.8118562215566635, + "num_tokens": 107501552.0, + "step": 38300 + }, + { + "epoch": 0.9320162131987088, + "grad_norm": 0.9351255893707275, + "learning_rate": 0.00017416021245298793, + "loss": 0.7388, + "mean_token_accuracy": 0.8256934684514999, + "num_tokens": 107792280.0, + "step": 38400 + }, + { + "epoch": 0.9344433387539137, + "grad_norm": 1.1992120742797852, + "learning_rate": 0.00017397046460672404, + "loss": 0.7575, + "mean_token_accuracy": 0.8187384909391403, + "num_tokens": 108070349.0, + "step": 38500 + }, + { + "epoch": 0.9368704643091187, + "grad_norm": 0.8793713450431824, + "learning_rate": 0.00017378012680363945, + "loss": 0.7247, + "mean_token_accuracy": 0.8275273048877716, + "num_tokens": 108359397.0, + "step": 38600 + }, + { + "epoch": 0.9392975898643237, + "grad_norm": 0.7668092250823975, + "learning_rate": 0.00017358920056178718, + "loss": 0.7654, + "mean_token_accuracy": 0.8186299502849579, + "num_tokens": 108644525.0, + "step": 38700 + }, + { + "epoch": 0.9417247154195286, + "grad_norm": 1.0598175525665283, + "learning_rate": 0.00017339768740391333, + "loss": 0.6535, + "mean_token_accuracy": 0.8406318628787994, + "num_tokens": 108925988.0, + "step": 38800 + }, + { + "epoch": 0.9441518409747336, + "grad_norm": 1.4081029891967773, + "learning_rate": 0.00017320558885744504, + "loss": 0.7174, + "mean_token_accuracy": 0.8275231170654297, + "num_tokens": 109202693.0, + "step": 38900 + }, + { + "epoch": 0.9465789665299386, + "grad_norm": 1.382180094718933, + "learning_rate": 0.00017301290645447824, + "loss": 0.6792, + "mean_token_accuracy": 0.834947235584259, + "num_tokens": 109470412.0, + "step": 39000 + }, + { + "epoch": 0.9490060920851435, + "grad_norm": 0.6567552089691162, + "learning_rate": 0.0001728196417317655, + "loss": 0.7262, + "mean_token_accuracy": 0.8264455366134643, + "num_tokens": 109758785.0, + "step": 39100 + }, + { + "epoch": 0.9514332176403485, + "grad_norm": 0.6902262568473816, + "learning_rate": 0.00017262579623070353, + "loss": 0.7497, + "mean_token_accuracy": 0.8199244272708893, + "num_tokens": 110049715.0, + "step": 39200 + }, + { + "epoch": 0.9538603431955535, + "grad_norm": 1.103183388710022, + "learning_rate": 0.00017243137149732137, + "loss": 0.6898, + "mean_token_accuracy": 0.8338291847705841, + "num_tokens": 110334599.0, + "step": 39300 + }, + { + "epoch": 0.9562874687507584, + "grad_norm": 0.9607908129692078, + "learning_rate": 0.00017223636908226752, + "loss": 0.6989, + "mean_token_accuracy": 0.8327404856681824, + "num_tokens": 110617566.0, + "step": 39400 + }, + { + "epoch": 0.9587145943059634, + "grad_norm": 1.0758519172668457, + "learning_rate": 0.000172040790540798, + "loss": 0.6825, + "mean_token_accuracy": 0.8364751309156417, + "num_tokens": 110900065.0, + "step": 39500 + }, + { + "epoch": 0.9611417198611684, + "grad_norm": 0.8208298683166504, + "learning_rate": 0.00017184463743276365, + "loss": 0.7095, + "mean_token_accuracy": 0.8310269391536713, + "num_tokens": 111178185.0, + "step": 39600 + }, + { + "epoch": 0.9635688454163734, + "grad_norm": 1.049796223640442, + "learning_rate": 0.0001716479113225979, + "loss": 0.6954, + "mean_token_accuracy": 0.8331800949573517, + "num_tokens": 111442052.0, + "step": 39700 + }, + { + "epoch": 0.9659959709715784, + "grad_norm": 0.8910472989082336, + "learning_rate": 0.00017145061377930407, + "loss": 0.6871, + "mean_token_accuracy": 0.8366471952199936, + "num_tokens": 111718863.0, + "step": 39800 + }, + { + "epoch": 0.9684230965267834, + "grad_norm": 0.9564645886421204, + "learning_rate": 0.00017125274637644312, + "loss": 0.7159, + "mean_token_accuracy": 0.8281272131204606, + "num_tokens": 111997357.0, + "step": 39900 + }, + { + "epoch": 0.9708502220819883, + "grad_norm": 0.9473711848258972, + "learning_rate": 0.00017105431069212087, + "loss": 0.7179, + "mean_token_accuracy": 0.8280116510391236, + "num_tokens": 112284668.0, + "step": 40000 + }, + { + "epoch": 0.9732773476371933, + "grad_norm": 0.8862663507461548, + "learning_rate": 0.00017085530830897566, + "loss": 0.7195, + "mean_token_accuracy": 0.8270606195926666, + "num_tokens": 112555426.0, + "step": 40100 + }, + { + "epoch": 0.9757044731923983, + "grad_norm": 1.0798990726470947, + "learning_rate": 0.00017065574081416535, + "loss": 0.6937, + "mean_token_accuracy": 0.8331471282243729, + "num_tokens": 112833652.0, + "step": 40200 + }, + { + "epoch": 0.9781315987476032, + "grad_norm": 1.0329279899597168, + "learning_rate": 0.00017045560979935505, + "loss": 0.6973, + "mean_token_accuracy": 0.8337054681777955, + "num_tokens": 113125979.0, + "step": 40300 + }, + { + "epoch": 0.9805587243028082, + "grad_norm": 1.095126986503601, + "learning_rate": 0.00017025491686070425, + "loss": 0.7148, + "mean_token_accuracy": 0.828719516992569, + "num_tokens": 113395603.0, + "step": 40400 + }, + { + "epoch": 0.9829858498580132, + "grad_norm": 0.6569015383720398, + "learning_rate": 0.00017005366359885407, + "loss": 0.6545, + "mean_token_accuracy": 0.8421603220701218, + "num_tokens": 113677040.0, + "step": 40500 + }, + { + "epoch": 0.9854129754132182, + "grad_norm": 0.8844952583312988, + "learning_rate": 0.0001698518516189145, + "loss": 0.7239, + "mean_token_accuracy": 0.824980406165123, + "num_tokens": 113950074.0, + "step": 40600 + }, + { + "epoch": 0.9878401009684231, + "grad_norm": 0.8346448540687561, + "learning_rate": 0.00016964948253045173, + "loss": 0.6458, + "mean_token_accuracy": 0.8428018325567246, + "num_tokens": 114214049.0, + "step": 40700 + }, + { + "epoch": 0.9902672265236281, + "grad_norm": 0.8526921272277832, + "learning_rate": 0.00016944655794747505, + "loss": 0.6812, + "mean_token_accuracy": 0.8357190907001495, + "num_tokens": 114494222.0, + "step": 40800 + }, + { + "epoch": 0.992694352078833, + "grad_norm": 0.9561910033226013, + "learning_rate": 0.00016924307948842435, + "loss": 0.6848, + "mean_token_accuracy": 0.8342451626062393, + "num_tokens": 114773314.0, + "step": 40900 + }, + { + "epoch": 0.995121477634038, + "grad_norm": 0.70332270860672, + "learning_rate": 0.00016903904877615678, + "loss": 0.702, + "mean_token_accuracy": 0.8312277746200561, + "num_tokens": 115047126.0, + "step": 41000 + }, + { + "epoch": 0.997548603189243, + "grad_norm": 0.9677546620368958, + "learning_rate": 0.0001688344674379342, + "loss": 0.6646, + "mean_token_accuracy": 0.8374815553426742, + "num_tokens": 115324048.0, + "step": 41100 + }, + { + "epoch": 0.999975728744448, + "grad_norm": 0.923065185546875, + "learning_rate": 0.0001686293371054099, + "loss": 0.6843, + "mean_token_accuracy": 0.8363048458099365, + "num_tokens": 115592249.0, + "step": 41200 + }, + { + "epoch": 1.0024028542996528, + "grad_norm": 1.1214321851730347, + "learning_rate": 0.0001684236594146158, + "loss": 0.5588, + "mean_token_accuracy": 0.8611422181129456, + "num_tokens": 115866851.0, + "step": 41300 + }, + { + "epoch": 1.004829979854858, + "grad_norm": 1.639672040939331, + "learning_rate": 0.00016821743600594927, + "loss": 0.5275, + "mean_token_accuracy": 0.8682989192008972, + "num_tokens": 116144462.0, + "step": 41400 + }, + { + "epoch": 1.007257105410063, + "grad_norm": 1.342544674873352, + "learning_rate": 0.0001680106685241601, + "loss": 0.5098, + "mean_token_accuracy": 0.8725023174285889, + "num_tokens": 116419143.0, + "step": 41500 + }, + { + "epoch": 1.0096842309652678, + "grad_norm": 1.2504510879516602, + "learning_rate": 0.00016780335861833737, + "loss": 0.5978, + "mean_token_accuracy": 0.8525243228673935, + "num_tokens": 116714586.0, + "step": 41600 + }, + { + "epoch": 1.012111356520473, + "grad_norm": 0.8218744397163391, + "learning_rate": 0.00016759550794189635, + "loss": 0.5516, + "mean_token_accuracy": 0.8621615475416183, + "num_tokens": 116997028.0, + "step": 41700 + }, + { + "epoch": 1.0145384820756778, + "grad_norm": 0.677018940448761, + "learning_rate": 0.0001673871181525652, + "loss": 0.5528, + "mean_token_accuracy": 0.8615715527534484, + "num_tokens": 117285016.0, + "step": 41800 + }, + { + "epoch": 1.0169656076308828, + "grad_norm": 0.8412028551101685, + "learning_rate": 0.00016717819091237183, + "loss": 0.5825, + "mean_token_accuracy": 0.8563485831022263, + "num_tokens": 117563499.0, + "step": 41900 + }, + { + "epoch": 1.0193927331860877, + "grad_norm": 0.9962805509567261, + "learning_rate": 0.00016696872788763068, + "loss": 0.5559, + "mean_token_accuracy": 0.8621777659654617, + "num_tokens": 117856118.0, + "step": 42000 + }, + { + "epoch": 1.0218198587412928, + "grad_norm": 0.971257209777832, + "learning_rate": 0.00016675873074892928, + "loss": 0.5841, + "mean_token_accuracy": 0.8552743774652481, + "num_tokens": 118152443.0, + "step": 42100 + }, + { + "epoch": 1.0242469842964976, + "grad_norm": 1.0679978132247925, + "learning_rate": 0.00016654820117111517, + "loss": 0.551, + "mean_token_accuracy": 0.864154868721962, + "num_tokens": 118432441.0, + "step": 42200 + }, + { + "epoch": 1.0266741098517027, + "grad_norm": 0.9175618886947632, + "learning_rate": 0.00016633714083328225, + "loss": 0.5584, + "mean_token_accuracy": 0.8621039032936096, + "num_tokens": 118720631.0, + "step": 42300 + }, + { + "epoch": 1.0291012354069076, + "grad_norm": 1.0536086559295654, + "learning_rate": 0.0001661255514187576, + "loss": 0.5703, + "mean_token_accuracy": 0.8585632288455963, + "num_tokens": 119016801.0, + "step": 42400 + }, + { + "epoch": 1.0315283609621126, + "grad_norm": 0.7999328970909119, + "learning_rate": 0.00016591343461508796, + "loss": 0.5328, + "mean_token_accuracy": 0.8685509181022644, + "num_tokens": 119319993.0, + "step": 42500 + }, + { + "epoch": 1.0339554865173175, + "grad_norm": 0.8778619170188904, + "learning_rate": 0.00016570079211402638, + "loss": 0.5603, + "mean_token_accuracy": 0.8621475839614868, + "num_tokens": 119595722.0, + "step": 42600 + }, + { + "epoch": 1.0363826120725226, + "grad_norm": 0.7495865225791931, + "learning_rate": 0.0001654876256115186, + "loss": 0.5493, + "mean_token_accuracy": 0.8641699087619782, + "num_tokens": 119876137.0, + "step": 42700 + }, + { + "epoch": 1.0388097376277274, + "grad_norm": 0.8057670593261719, + "learning_rate": 0.0001652739368076895, + "loss": 0.5335, + "mean_token_accuracy": 0.8662267804145813, + "num_tokens": 120143750.0, + "step": 42800 + }, + { + "epoch": 1.0412368631829325, + "grad_norm": 0.9671706557273865, + "learning_rate": 0.00016505972740682974, + "loss": 0.5354, + "mean_token_accuracy": 0.8661771559715271, + "num_tokens": 120405127.0, + "step": 42900 + }, + { + "epoch": 1.0436639887381374, + "grad_norm": 1.1040537357330322, + "learning_rate": 0.0001648449991173819, + "loss": 0.5471, + "mean_token_accuracy": 0.8631951743364334, + "num_tokens": 120685253.0, + "step": 43000 + }, + { + "epoch": 1.0460911142933425, + "grad_norm": 1.1625252962112427, + "learning_rate": 0.0001646297536519272, + "loss": 0.5464, + "mean_token_accuracy": 0.8636760532855987, + "num_tokens": 120958040.0, + "step": 43100 + }, + { + "epoch": 1.0485182398485473, + "grad_norm": 0.7705463767051697, + "learning_rate": 0.00016441399272717138, + "loss": 0.557, + "mean_token_accuracy": 0.8624409341812134, + "num_tokens": 121234837.0, + "step": 43200 + }, + { + "epoch": 1.0509453654037524, + "grad_norm": 0.8841344714164734, + "learning_rate": 0.00016419771806393144, + "loss": 0.5191, + "mean_token_accuracy": 0.8703099459409713, + "num_tokens": 121513869.0, + "step": 43300 + }, + { + "epoch": 1.0533724909589572, + "grad_norm": 0.9004896283149719, + "learning_rate": 0.00016398093138712168, + "loss": 0.5867, + "mean_token_accuracy": 0.8558615297079086, + "num_tokens": 121785188.0, + "step": 43400 + }, + { + "epoch": 1.0557996165141623, + "grad_norm": 1.055458664894104, + "learning_rate": 0.00016376363442574, + "loss": 0.5247, + "mean_token_accuracy": 0.8702590548992157, + "num_tokens": 122066119.0, + "step": 43500 + }, + { + "epoch": 1.0582267420693672, + "grad_norm": 0.9583166241645813, + "learning_rate": 0.0001635458289128542, + "loss": 0.5368, + "mean_token_accuracy": 0.8678800988197327, + "num_tokens": 122342074.0, + "step": 43600 + }, + { + "epoch": 1.0606538676245723, + "grad_norm": 0.9781296849250793, + "learning_rate": 0.00016332751658558787, + "loss": 0.578, + "mean_token_accuracy": 0.8574618774652482, + "num_tokens": 122628920.0, + "step": 43700 + }, + { + "epoch": 1.0630809931797771, + "grad_norm": 0.9489784836769104, + "learning_rate": 0.00016310869918510692, + "loss": 0.5913, + "mean_token_accuracy": 0.8549783474206925, + "num_tokens": 122904241.0, + "step": 43800 + }, + { + "epoch": 1.0655081187349822, + "grad_norm": 0.9939345717430115, + "learning_rate": 0.0001628893784566054, + "loss": 0.5617, + "mean_token_accuracy": 0.8607113534212112, + "num_tokens": 123181146.0, + "step": 43900 + }, + { + "epoch": 1.067935244290187, + "grad_norm": 0.7742923498153687, + "learning_rate": 0.00016266955614929182, + "loss": 0.5652, + "mean_token_accuracy": 0.8603355962038041, + "num_tokens": 123463197.0, + "step": 44000 + }, + { + "epoch": 1.0703623698453921, + "grad_norm": 1.19265615940094, + "learning_rate": 0.00016244923401637488, + "loss": 0.577, + "mean_token_accuracy": 0.8579962712526321, + "num_tokens": 123741457.0, + "step": 44100 + }, + { + "epoch": 1.072789495400597, + "grad_norm": 0.7991734147071838, + "learning_rate": 0.00016222841381504982, + "loss": 0.6114, + "mean_token_accuracy": 0.8508148151636123, + "num_tokens": 124007415.0, + "step": 44200 + }, + { + "epoch": 1.075216620955802, + "grad_norm": 1.1364731788635254, + "learning_rate": 0.00016200709730648428, + "loss": 0.5637, + "mean_token_accuracy": 0.8609515500068664, + "num_tokens": 124296854.0, + "step": 44300 + }, + { + "epoch": 1.077643746511007, + "grad_norm": 1.1264718770980835, + "learning_rate": 0.00016178528625580404, + "loss": 0.5208, + "mean_token_accuracy": 0.8697484582662582, + "num_tokens": 124592659.0, + "step": 44400 + }, + { + "epoch": 1.080070872066212, + "grad_norm": 0.8853039145469666, + "learning_rate": 0.00016156298243207937, + "loss": 0.5543, + "mean_token_accuracy": 0.8613111680746078, + "num_tokens": 124871204.0, + "step": 44500 + }, + { + "epoch": 1.0824979976214169, + "grad_norm": 0.9990018010139465, + "learning_rate": 0.00016134018760831053, + "loss": 0.5721, + "mean_token_accuracy": 0.8585412526130676, + "num_tokens": 125154712.0, + "step": 44600 + }, + { + "epoch": 1.084925123176622, + "grad_norm": 0.8856194615364075, + "learning_rate": 0.00016111690356141386, + "loss": 0.5179, + "mean_token_accuracy": 0.8699715811014176, + "num_tokens": 125448505.0, + "step": 44700 + }, + { + "epoch": 1.0873522487318268, + "grad_norm": 0.808142900466919, + "learning_rate": 0.00016089313207220756, + "loss": 0.5646, + "mean_token_accuracy": 0.8604241162538528, + "num_tokens": 125724217.0, + "step": 44800 + }, + { + "epoch": 1.0897793742870319, + "grad_norm": 1.2607107162475586, + "learning_rate": 0.00016066887492539734, + "loss": 0.5373, + "mean_token_accuracy": 0.8668778890371323, + "num_tokens": 126005623.0, + "step": 44900 + }, + { + "epoch": 1.092206499842237, + "grad_norm": 0.7924454808235168, + "learning_rate": 0.00016044413390956246, + "loss": 0.542, + "mean_token_accuracy": 0.8655985903739929, + "num_tokens": 126276783.0, + "step": 45000 + }, + { + "epoch": 1.0946336253974418, + "grad_norm": 0.7327313423156738, + "learning_rate": 0.00016021891081714117, + "loss": 0.5363, + "mean_token_accuracy": 0.8668161380290985, + "num_tokens": 126555445.0, + "step": 45100 + }, + { + "epoch": 1.0970607509526469, + "grad_norm": 1.1913338899612427, + "learning_rate": 0.0001599932074444167, + "loss": 0.5347, + "mean_token_accuracy": 0.866722519993782, + "num_tokens": 126849635.0, + "step": 45200 + }, + { + "epoch": 1.0994878765078517, + "grad_norm": 0.9338855743408203, + "learning_rate": 0.0001597670255915026, + "loss": 0.5353, + "mean_token_accuracy": 0.8668248879909516, + "num_tokens": 127123102.0, + "step": 45300 + }, + { + "epoch": 1.1019150020630568, + "grad_norm": 1.211919903755188, + "learning_rate": 0.0001595403670623288, + "loss": 0.5182, + "mean_token_accuracy": 0.8705047732591629, + "num_tokens": 127397810.0, + "step": 45400 + }, + { + "epoch": 1.1043421276182617, + "grad_norm": 0.9928873181343079, + "learning_rate": 0.00015931323366462688, + "loss": 0.5326, + "mean_token_accuracy": 0.8685156333446503, + "num_tokens": 127672480.0, + "step": 45500 + }, + { + "epoch": 1.1067692531734667, + "grad_norm": 0.7845079898834229, + "learning_rate": 0.0001590856272099157, + "loss": 0.5634, + "mean_token_accuracy": 0.8639321917295456, + "num_tokens": 127945894.0, + "step": 45600 + }, + { + "epoch": 1.1091963787286716, + "grad_norm": 0.8590813279151917, + "learning_rate": 0.00015885754951348714, + "loss": 0.572, + "mean_token_accuracy": 0.8595906603336334, + "num_tokens": 128239112.0, + "step": 45700 + }, + { + "epoch": 1.1116235042838767, + "grad_norm": 0.7593941688537598, + "learning_rate": 0.0001586290023943915, + "loss": 0.5291, + "mean_token_accuracy": 0.8698451125621796, + "num_tokens": 128511760.0, + "step": 45800 + }, + { + "epoch": 1.1140506298390815, + "grad_norm": 0.8581081032752991, + "learning_rate": 0.00015839998767542285, + "loss": 0.5695, + "mean_token_accuracy": 0.8599695748090744, + "num_tokens": 128799919.0, + "step": 45900 + }, + { + "epoch": 1.1164777553942866, + "grad_norm": 0.7672972679138184, + "learning_rate": 0.00015817050718310483, + "loss": 0.5171, + "mean_token_accuracy": 0.870343611240387, + "num_tokens": 129080858.0, + "step": 46000 + }, + { + "epoch": 1.1189048809494915, + "grad_norm": 0.9828253984451294, + "learning_rate": 0.0001579405627476757, + "loss": 0.5116, + "mean_token_accuracy": 0.873331093788147, + "num_tokens": 129348368.0, + "step": 46100 + }, + { + "epoch": 1.1213320065046966, + "grad_norm": 1.2287424802780151, + "learning_rate": 0.00015771015620307416, + "loss": 0.5462, + "mean_token_accuracy": 0.8644060462713241, + "num_tokens": 129622627.0, + "step": 46200 + }, + { + "epoch": 1.1237591320599014, + "grad_norm": 0.8262332677841187, + "learning_rate": 0.00015747928938692426, + "loss": 0.5419, + "mean_token_accuracy": 0.8658415484428406, + "num_tokens": 129898767.0, + "step": 46300 + }, + { + "epoch": 1.1261862576151065, + "grad_norm": 0.8397664427757263, + "learning_rate": 0.00015724796414052112, + "loss": 0.5229, + "mean_token_accuracy": 0.8713884443044663, + "num_tokens": 130187829.0, + "step": 46400 + }, + { + "epoch": 1.1286133831703113, + "grad_norm": 0.8683608174324036, + "learning_rate": 0.0001570161823088161, + "loss": 0.5427, + "mean_token_accuracy": 0.8661625987291336, + "num_tokens": 130471642.0, + "step": 46500 + }, + { + "epoch": 1.1310405087255164, + "grad_norm": 0.7690980434417725, + "learning_rate": 0.000156783945740402, + "loss": 0.547, + "mean_token_accuracy": 0.8648913151025772, + "num_tokens": 130742090.0, + "step": 46600 + }, + { + "epoch": 1.1334676342807213, + "grad_norm": 1.0576156377792358, + "learning_rate": 0.0001565512562874985, + "loss": 0.5157, + "mean_token_accuracy": 0.871839565038681, + "num_tokens": 131012670.0, + "step": 46700 + }, + { + "epoch": 1.1358947598359264, + "grad_norm": 0.6827182769775391, + "learning_rate": 0.00015631811580593723, + "loss": 0.5295, + "mean_token_accuracy": 0.8686037886142731, + "num_tokens": 131302863.0, + "step": 46800 + }, + { + "epoch": 1.1383218853911312, + "grad_norm": 1.2057856321334839, + "learning_rate": 0.00015608452615514709, + "loss": 0.5436, + "mean_token_accuracy": 0.8661547493934632, + "num_tokens": 131582193.0, + "step": 46900 + }, + { + "epoch": 1.1407490109463363, + "grad_norm": 0.8514811396598816, + "learning_rate": 0.00015585048919813927, + "loss": 0.5122, + "mean_token_accuracy": 0.8713947582244873, + "num_tokens": 131853932.0, + "step": 47000 + }, + { + "epoch": 1.1431761365015412, + "grad_norm": 0.8594823479652405, + "learning_rate": 0.00015561600680149263, + "loss": 0.5057, + "mean_token_accuracy": 0.8751941341161728, + "num_tokens": 132137048.0, + "step": 47100 + }, + { + "epoch": 1.1456032620567462, + "grad_norm": 1.0555239915847778, + "learning_rate": 0.0001553810808353385, + "loss": 0.5096, + "mean_token_accuracy": 0.8731654924154282, + "num_tokens": 132413595.0, + "step": 47200 + }, + { + "epoch": 1.148030387611951, + "grad_norm": 0.9065008759498596, + "learning_rate": 0.00015514571317334607, + "loss": 0.5027, + "mean_token_accuracy": 0.8751115143299103, + "num_tokens": 132692775.0, + "step": 47300 + }, + { + "epoch": 1.1504575131671562, + "grad_norm": 1.3132802248001099, + "learning_rate": 0.00015490990569270725, + "loss": 0.5114, + "mean_token_accuracy": 0.8728679984807968, + "num_tokens": 132963629.0, + "step": 47400 + }, + { + "epoch": 1.152884638722361, + "grad_norm": 0.9391400814056396, + "learning_rate": 0.00015467366027412167, + "loss": 0.5216, + "mean_token_accuracy": 0.8717411142587662, + "num_tokens": 133225134.0, + "step": 47500 + }, + { + "epoch": 1.155311764277566, + "grad_norm": 0.8264212012290955, + "learning_rate": 0.00015443697880178193, + "loss": 0.5737, + "mean_token_accuracy": 0.8609393298625946, + "num_tokens": 133507440.0, + "step": 47600 + }, + { + "epoch": 1.157738889832771, + "grad_norm": 0.9553474187850952, + "learning_rate": 0.0001541998631633583, + "loss": 0.5605, + "mean_token_accuracy": 0.8621794009208679, + "num_tokens": 133784051.0, + "step": 47700 + }, + { + "epoch": 1.160166015387976, + "grad_norm": 0.7231343388557434, + "learning_rate": 0.0001539623152499838, + "loss": 0.5448, + "mean_token_accuracy": 0.8658911234140396, + "num_tokens": 134065800.0, + "step": 47800 + }, + { + "epoch": 1.162593140943181, + "grad_norm": 0.8352395296096802, + "learning_rate": 0.0001537243369562391, + "loss": 0.549, + "mean_token_accuracy": 0.866152486205101, + "num_tokens": 134365482.0, + "step": 47900 + }, + { + "epoch": 1.165020266498386, + "grad_norm": 0.6428236365318298, + "learning_rate": 0.00015348593018013742, + "loss": 0.49, + "mean_token_accuracy": 0.8778564035892487, + "num_tokens": 134656680.0, + "step": 48000 + }, + { + "epoch": 1.167447392053591, + "grad_norm": 0.7582641243934631, + "learning_rate": 0.00015324709682310925, + "loss": 0.5352, + "mean_token_accuracy": 0.8687029981613159, + "num_tokens": 134943558.0, + "step": 48100 + }, + { + "epoch": 1.169874517608796, + "grad_norm": 1.4312922954559326, + "learning_rate": 0.0001530078387899875, + "loss": 0.5237, + "mean_token_accuracy": 0.870193499326706, + "num_tokens": 135204101.0, + "step": 48200 + }, + { + "epoch": 1.1723016431640008, + "grad_norm": 1.1317497491836548, + "learning_rate": 0.00015276815798899196, + "loss": 0.5042, + "mean_token_accuracy": 0.875024488568306, + "num_tokens": 135482167.0, + "step": 48300 + }, + { + "epoch": 1.1747287687192058, + "grad_norm": 0.759957492351532, + "learning_rate": 0.0001525280563317143, + "loss": 0.5172, + "mean_token_accuracy": 0.8706453287601471, + "num_tokens": 135753938.0, + "step": 48400 + }, + { + "epoch": 1.177155894274411, + "grad_norm": 1.225609302520752, + "learning_rate": 0.00015228753573310274, + "loss": 0.4963, + "mean_token_accuracy": 0.8768988955020904, + "num_tokens": 136036050.0, + "step": 48500 + }, + { + "epoch": 1.1795830198296158, + "grad_norm": 0.5895382761955261, + "learning_rate": 0.00015204659811144685, + "loss": 0.5199, + "mean_token_accuracy": 0.8719930589199066, + "num_tokens": 136327290.0, + "step": 48600 + }, + { + "epoch": 1.1820101453848209, + "grad_norm": 0.7858893871307373, + "learning_rate": 0.00015180524538836214, + "loss": 0.5029, + "mean_token_accuracy": 0.8763481539487838, + "num_tokens": 136614197.0, + "step": 48700 + }, + { + "epoch": 1.1844372709400257, + "grad_norm": 0.7872015833854675, + "learning_rate": 0.00015156347948877481, + "loss": 0.5278, + "mean_token_accuracy": 0.8710852640867234, + "num_tokens": 136885103.0, + "step": 48800 + }, + { + "epoch": 1.1868643964952308, + "grad_norm": 1.455886960029602, + "learning_rate": 0.00015132130234090635, + "loss": 0.4845, + "mean_token_accuracy": 0.8797940272092819, + "num_tokens": 137165613.0, + "step": 48900 + }, + { + "epoch": 1.1892915220504356, + "grad_norm": 0.7938244938850403, + "learning_rate": 0.00015107871587625825, + "loss": 0.469, + "mean_token_accuracy": 0.883348827958107, + "num_tokens": 137444709.0, + "step": 49000 + }, + { + "epoch": 1.1917186476056407, + "grad_norm": 0.6187699437141418, + "learning_rate": 0.00015083572202959645, + "loss": 0.4834, + "mean_token_accuracy": 0.8800075507164001, + "num_tokens": 137733269.0, + "step": 49100 + }, + { + "epoch": 1.1941457731608456, + "grad_norm": 0.6408334374427795, + "learning_rate": 0.00015059232273893612, + "loss": 0.5254, + "mean_token_accuracy": 0.8707186883687973, + "num_tokens": 138005114.0, + "step": 49200 + }, + { + "epoch": 1.1965728987160507, + "grad_norm": 1.040124773979187, + "learning_rate": 0.00015034851994552594, + "loss": 0.5541, + "mean_token_accuracy": 0.8649247759580612, + "num_tokens": 138291510.0, + "step": 49300 + }, + { + "epoch": 1.1990000242712555, + "grad_norm": 0.8435716032981873, + "learning_rate": 0.00015010431559383286, + "loss": 0.5154, + "mean_token_accuracy": 0.8739145505428314, + "num_tokens": 138576071.0, + "step": 49400 + }, + { + "epoch": 1.2014271498264606, + "grad_norm": 0.743602454662323, + "learning_rate": 0.00014985971163152646, + "loss": 0.4815, + "mean_token_accuracy": 0.8803683018684387, + "num_tokens": 138856997.0, + "step": 49500 + }, + { + "epoch": 1.2038542753816655, + "grad_norm": 0.5915963649749756, + "learning_rate": 0.00014961471000946342, + "loss": 0.4817, + "mean_token_accuracy": 0.8809413892030716, + "num_tokens": 139136968.0, + "step": 49600 + }, + { + "epoch": 1.2062814009368705, + "grad_norm": 1.0429223775863647, + "learning_rate": 0.00014936931268167198, + "loss": 0.5032, + "mean_token_accuracy": 0.8750742501020432, + "num_tokens": 139405929.0, + "step": 49700 + }, + { + "epoch": 1.2087085264920754, + "grad_norm": 0.8039318323135376, + "learning_rate": 0.00014912352160533637, + "loss": 0.4694, + "mean_token_accuracy": 0.8825742882490158, + "num_tokens": 139685506.0, + "step": 49800 + }, + { + "epoch": 1.2111356520472805, + "grad_norm": 0.7582560777664185, + "learning_rate": 0.00014887733874078125, + "loss": 0.5195, + "mean_token_accuracy": 0.872511134147644, + "num_tokens": 139964509.0, + "step": 49900 + }, + { + "epoch": 1.2135627776024853, + "grad_norm": 0.9156772494316101, + "learning_rate": 0.0001486307660514559, + "loss": 0.5017, + "mean_token_accuracy": 0.8769380474090576, + "num_tokens": 140239409.0, + "step": 50000 + }, + { + "epoch": 1.2159899031576904, + "grad_norm": 0.8511459827423096, + "learning_rate": 0.0001483838055039188, + "loss": 0.5256, + "mean_token_accuracy": 0.8710427796840667, + "num_tokens": 140535879.0, + "step": 50100 + }, + { + "epoch": 1.2184170287128953, + "grad_norm": 0.977913498878479, + "learning_rate": 0.0001481364590678217, + "loss": 0.4692, + "mean_token_accuracy": 0.8834199732542038, + "num_tokens": 140816971.0, + "step": 50200 + }, + { + "epoch": 1.2208441542681003, + "grad_norm": 0.7674177885055542, + "learning_rate": 0.00014788872871589412, + "loss": 0.5214, + "mean_token_accuracy": 0.8728067207336426, + "num_tokens": 141101347.0, + "step": 50300 + }, + { + "epoch": 1.2232712798233052, + "grad_norm": 1.071616291999817, + "learning_rate": 0.0001476406164239275, + "loss": 0.5249, + "mean_token_accuracy": 0.8710673362016678, + "num_tokens": 141373365.0, + "step": 50400 + }, + { + "epoch": 1.2256984053785103, + "grad_norm": 0.719029426574707, + "learning_rate": 0.00014739212417075944, + "loss": 0.418, + "mean_token_accuracy": 0.8950458431243896, + "num_tokens": 141650751.0, + "step": 50500 + }, + { + "epoch": 1.2281255309337151, + "grad_norm": 0.735320508480072, + "learning_rate": 0.00014714325393825806, + "loss": 0.4815, + "mean_token_accuracy": 0.8804818952083587, + "num_tokens": 141940470.0, + "step": 50600 + }, + { + "epoch": 1.2305526564889202, + "grad_norm": 1.0473617315292358, + "learning_rate": 0.0001468940077113059, + "loss": 0.4744, + "mean_token_accuracy": 0.8823691141605378, + "num_tokens": 142212013.0, + "step": 50700 + }, + { + "epoch": 1.232979782044125, + "grad_norm": 0.8890407681465149, + "learning_rate": 0.00014664438747778445, + "loss": 0.4904, + "mean_token_accuracy": 0.8795123302936554, + "num_tokens": 142488791.0, + "step": 50800 + }, + { + "epoch": 1.2354069075993301, + "grad_norm": 0.7927642464637756, + "learning_rate": 0.00014639439522855802, + "loss": 0.4976, + "mean_token_accuracy": 0.8760099506378174, + "num_tokens": 142773638.0, + "step": 50900 + }, + { + "epoch": 1.237834033154535, + "grad_norm": 0.649486780166626, + "learning_rate": 0.00014614403295745793, + "loss": 0.5262, + "mean_token_accuracy": 0.8704094517230988, + "num_tokens": 143072258.0, + "step": 51000 + }, + { + "epoch": 1.24026115870974, + "grad_norm": 0.8649196624755859, + "learning_rate": 0.00014589330266126673, + "loss": 0.4881, + "mean_token_accuracy": 0.8796616047620773, + "num_tokens": 143353223.0, + "step": 51100 + }, + { + "epoch": 1.242688284264945, + "grad_norm": 1.0805305242538452, + "learning_rate": 0.00014564220633970213, + "loss": 0.4871, + "mean_token_accuracy": 0.8797647720575332, + "num_tokens": 143630606.0, + "step": 51200 + }, + { + "epoch": 1.24511540982015, + "grad_norm": 1.1342017650604248, + "learning_rate": 0.0001453907459954011, + "loss": 0.5004, + "mean_token_accuracy": 0.877112073302269, + "num_tokens": 143909256.0, + "step": 51300 + }, + { + "epoch": 1.2475425353753549, + "grad_norm": 0.7277913689613342, + "learning_rate": 0.0001451389236339039, + "loss": 0.4879, + "mean_token_accuracy": 0.8794431656599044, + "num_tokens": 144184234.0, + "step": 51400 + }, + { + "epoch": 1.24996966093056, + "grad_norm": 1.1246638298034668, + "learning_rate": 0.0001448867412636381, + "loss": 0.5452, + "mean_token_accuracy": 0.8682722455263138, + "num_tokens": 144472017.0, + "step": 51500 + }, + { + "epoch": 1.252396786485765, + "grad_norm": 0.7161349654197693, + "learning_rate": 0.00014463420089590255, + "loss": 0.4728, + "mean_token_accuracy": 0.8832290679216385, + "num_tokens": 144769954.0, + "step": 51600 + }, + { + "epoch": 1.2548239120409699, + "grad_norm": 0.9632692933082581, + "learning_rate": 0.00014438130454485126, + "loss": 0.4891, + "mean_token_accuracy": 0.8817065650224686, + "num_tokens": 145053314.0, + "step": 51700 + }, + { + "epoch": 1.2572510375961747, + "grad_norm": 1.6564607620239258, + "learning_rate": 0.00014412805422747753, + "loss": 0.4838, + "mean_token_accuracy": 0.8807741963863372, + "num_tokens": 145328651.0, + "step": 51800 + }, + { + "epoch": 1.2596781631513798, + "grad_norm": 0.6910829544067383, + "learning_rate": 0.00014387445196359768, + "loss": 0.4659, + "mean_token_accuracy": 0.884465005993843, + "num_tokens": 145616910.0, + "step": 51900 + }, + { + "epoch": 1.262105288706585, + "grad_norm": 0.638996422290802, + "learning_rate": 0.000143620499775835, + "loss": 0.4694, + "mean_token_accuracy": 0.8855656278133393, + "num_tokens": 145893173.0, + "step": 52000 + }, + { + "epoch": 1.2645324142617898, + "grad_norm": 0.835657000541687, + "learning_rate": 0.00014336619968960363, + "loss": 0.5138, + "mean_token_accuracy": 0.8743197238445282, + "num_tokens": 146166823.0, + "step": 52100 + }, + { + "epoch": 1.2669595398169946, + "grad_norm": 0.753837525844574, + "learning_rate": 0.0001431115537330924, + "loss": 0.5221, + "mean_token_accuracy": 0.8723088407516479, + "num_tokens": 146426455.0, + "step": 52200 + }, + { + "epoch": 1.2693866653721997, + "grad_norm": 0.6533814072608948, + "learning_rate": 0.00014285656393724872, + "loss": 0.4719, + "mean_token_accuracy": 0.8833791041374206, + "num_tokens": 146700648.0, + "step": 52300 + }, + { + "epoch": 1.2718137909274048, + "grad_norm": 0.75107741355896, + "learning_rate": 0.00014260123233576216, + "loss": 0.4743, + "mean_token_accuracy": 0.8824715614318848, + "num_tokens": 146986402.0, + "step": 52400 + }, + { + "epoch": 1.2742409164826096, + "grad_norm": 0.9934569597244263, + "learning_rate": 0.00014234556096504852, + "loss": 0.5059, + "mean_token_accuracy": 0.875720181465149, + "num_tokens": 147271442.0, + "step": 52500 + }, + { + "epoch": 1.2766680420378147, + "grad_norm": 0.8318378329277039, + "learning_rate": 0.00014208955186423343, + "loss": 0.4761, + "mean_token_accuracy": 0.8846336156129837, + "num_tokens": 147538750.0, + "step": 52600 + }, + { + "epoch": 1.2790951675930196, + "grad_norm": 0.6633797287940979, + "learning_rate": 0.00014183320707513605, + "loss": 0.4853, + "mean_token_accuracy": 0.8816380894184113, + "num_tokens": 147816277.0, + "step": 52700 + }, + { + "epoch": 1.2815222931482246, + "grad_norm": 0.6250624060630798, + "learning_rate": 0.00014157652864225297, + "loss": 0.492, + "mean_token_accuracy": 0.8797484838962555, + "num_tokens": 148104573.0, + "step": 52800 + }, + { + "epoch": 1.2839494187034295, + "grad_norm": 0.6616709232330322, + "learning_rate": 0.0001413195186127416, + "loss": 0.4848, + "mean_token_accuracy": 0.8814722353219986, + "num_tokens": 148385633.0, + "step": 52900 + }, + { + "epoch": 1.2863765442586346, + "grad_norm": 1.0039057731628418, + "learning_rate": 0.00014106217903640423, + "loss": 0.4626, + "mean_token_accuracy": 0.8860047221183777, + "num_tokens": 148675690.0, + "step": 53000 + }, + { + "epoch": 1.2888036698138394, + "grad_norm": 0.8785421252250671, + "learning_rate": 0.00014080451196567126, + "loss": 0.4661, + "mean_token_accuracy": 0.8858687382936478, + "num_tokens": 148958845.0, + "step": 53100 + }, + { + "epoch": 1.2912307953690445, + "grad_norm": 0.6556010842323303, + "learning_rate": 0.00014054651945558523, + "loss": 0.495, + "mean_token_accuracy": 0.8791573786735535, + "num_tokens": 149223683.0, + "step": 53200 + }, + { + "epoch": 1.2936579209242494, + "grad_norm": 0.6850820183753967, + "learning_rate": 0.00014028820356378412, + "loss": 0.5025, + "mean_token_accuracy": 0.8775439101457596, + "num_tokens": 149506639.0, + "step": 53300 + }, + { + "epoch": 1.2960850464794544, + "grad_norm": 1.0605202913284302, + "learning_rate": 0.00014002956635048518, + "loss": 0.4572, + "mean_token_accuracy": 0.8866003942489624, + "num_tokens": 149782390.0, + "step": 53400 + }, + { + "epoch": 1.2985121720346593, + "grad_norm": 0.7321272492408752, + "learning_rate": 0.00013977060987846823, + "loss": 0.4635, + "mean_token_accuracy": 0.8857458835840225, + "num_tokens": 150060443.0, + "step": 53500 + }, + { + "epoch": 1.3009392975898644, + "grad_norm": 0.8881349563598633, + "learning_rate": 0.00013951133621305955, + "loss": 0.4523, + "mean_token_accuracy": 0.8875695073604584, + "num_tokens": 150325449.0, + "step": 53600 + }, + { + "epoch": 1.3033664231450692, + "grad_norm": 0.715872049331665, + "learning_rate": 0.00013925174742211502, + "loss": 0.46, + "mean_token_accuracy": 0.8867443364858627, + "num_tokens": 150598642.0, + "step": 53700 + }, + { + "epoch": 1.3057935487002743, + "grad_norm": 0.7793150544166565, + "learning_rate": 0.00013899184557600398, + "loss": 0.4517, + "mean_token_accuracy": 0.8886141180992126, + "num_tokens": 150898833.0, + "step": 53800 + }, + { + "epoch": 1.3082206742554792, + "grad_norm": 1.7053370475769043, + "learning_rate": 0.00013873163274759248, + "loss": 0.4616, + "mean_token_accuracy": 0.8852343845367432, + "num_tokens": 151168628.0, + "step": 53900 + }, + { + "epoch": 1.3106477998106842, + "grad_norm": 0.8918945789337158, + "learning_rate": 0.00013847111101222687, + "loss": 0.433, + "mean_token_accuracy": 0.8909021973609924, + "num_tokens": 151450450.0, + "step": 54000 + }, + { + "epoch": 1.313074925365889, + "grad_norm": 1.0090819597244263, + "learning_rate": 0.0001382102824477172, + "loss": 0.4479, + "mean_token_accuracy": 0.8897096848487854, + "num_tokens": 151740252.0, + "step": 54100 + }, + { + "epoch": 1.3155020509210942, + "grad_norm": 0.9295179843902588, + "learning_rate": 0.0001379491491343207, + "loss": 0.4753, + "mean_token_accuracy": 0.8836379379034043, + "num_tokens": 152021939.0, + "step": 54200 + }, + { + "epoch": 1.317929176476299, + "grad_norm": 0.7607155442237854, + "learning_rate": 0.00013768771315472502, + "loss": 0.4402, + "mean_token_accuracy": 0.8909223932027817, + "num_tokens": 152302100.0, + "step": 54300 + }, + { + "epoch": 1.3203563020315041, + "grad_norm": 0.8280419111251831, + "learning_rate": 0.00013742597659403187, + "loss": 0.4331, + "mean_token_accuracy": 0.8921854722499848, + "num_tokens": 152575683.0, + "step": 54400 + }, + { + "epoch": 1.322783427586709, + "grad_norm": 0.7644575238227844, + "learning_rate": 0.0001371639415397402, + "loss": 0.4523, + "mean_token_accuracy": 0.8892707014083863, + "num_tokens": 152864215.0, + "step": 54500 + }, + { + "epoch": 1.325210553141914, + "grad_norm": 0.6379640698432922, + "learning_rate": 0.00013690161008172975, + "loss": 0.4586, + "mean_token_accuracy": 0.8860977566242219, + "num_tokens": 153144000.0, + "step": 54600 + }, + { + "epoch": 1.3276376786971191, + "grad_norm": 1.0883742570877075, + "learning_rate": 0.00013663898431224403, + "loss": 0.4432, + "mean_token_accuracy": 0.8918490415811539, + "num_tokens": 153428731.0, + "step": 54700 + }, + { + "epoch": 1.330064804252324, + "grad_norm": 0.708063542842865, + "learning_rate": 0.000136376066325874, + "loss": 0.4054, + "mean_token_accuracy": 0.8982583904266357, + "num_tokens": 153707438.0, + "step": 54800 + }, + { + "epoch": 1.3324919298075288, + "grad_norm": 0.9368717074394226, + "learning_rate": 0.0001361128582195412, + "loss": 0.4359, + "mean_token_accuracy": 0.8917915511131287, + "num_tokens": 154010417.0, + "step": 54900 + }, + { + "epoch": 1.334919055362734, + "grad_norm": 0.8366745710372925, + "learning_rate": 0.000135849362092481, + "loss": 0.4584, + "mean_token_accuracy": 0.8874488598108292, + "num_tokens": 154284670.0, + "step": 55000 + }, + { + "epoch": 1.337346180917939, + "grad_norm": 0.8448474407196045, + "learning_rate": 0.00013558558004622597, + "loss": 0.4232, + "mean_token_accuracy": 0.8938626998662949, + "num_tokens": 154557986.0, + "step": 55100 + }, + { + "epoch": 1.3397733064731439, + "grad_norm": 0.7955624461174011, + "learning_rate": 0.00013532151418458894, + "loss": 0.4239, + "mean_token_accuracy": 0.8958518970012664, + "num_tokens": 154831844.0, + "step": 55200 + }, + { + "epoch": 1.3422004320283487, + "grad_norm": 0.9134492874145508, + "learning_rate": 0.00013505716661364641, + "loss": 0.4122, + "mean_token_accuracy": 0.8974074327945709, + "num_tokens": 155109417.0, + "step": 55300 + }, + { + "epoch": 1.3446275575835538, + "grad_norm": 0.8050549626350403, + "learning_rate": 0.00013479253944172177, + "loss": 0.4699, + "mean_token_accuracy": 0.8858902537822724, + "num_tokens": 155387541.0, + "step": 55400 + }, + { + "epoch": 1.3470546831387589, + "grad_norm": 0.9216250777244568, + "learning_rate": 0.00013452763477936813, + "loss": 0.4241, + "mean_token_accuracy": 0.8962391257286072, + "num_tokens": 155672639.0, + "step": 55500 + }, + { + "epoch": 1.3494818086939637, + "grad_norm": 0.9659915566444397, + "learning_rate": 0.00013426245473935198, + "loss": 0.4656, + "mean_token_accuracy": 0.8873411613702774, + "num_tokens": 155947597.0, + "step": 55600 + }, + { + "epoch": 1.3519089342491686, + "grad_norm": 0.7852197289466858, + "learning_rate": 0.00013399700143663598, + "loss": 0.4728, + "mean_token_accuracy": 0.8850886446237564, + "num_tokens": 156238718.0, + "step": 55700 + }, + { + "epoch": 1.3543360598043737, + "grad_norm": 0.78695148229599, + "learning_rate": 0.00013373127698836223, + "loss": 0.4239, + "mean_token_accuracy": 0.8962297958135604, + "num_tokens": 156517043.0, + "step": 55800 + }, + { + "epoch": 1.3567631853595787, + "grad_norm": 0.8713340759277344, + "learning_rate": 0.00013346528351383544, + "loss": 0.433, + "mean_token_accuracy": 0.8928613644838334, + "num_tokens": 156799092.0, + "step": 55900 + }, + { + "epoch": 1.3591903109147836, + "grad_norm": 0.6828170418739319, + "learning_rate": 0.00013319902313450594, + "loss": 0.4466, + "mean_token_accuracy": 0.8914445018768311, + "num_tokens": 157093747.0, + "step": 56000 + }, + { + "epoch": 1.3616174364699887, + "grad_norm": 0.9015485048294067, + "learning_rate": 0.00013293249797395268, + "loss": 0.4516, + "mean_token_accuracy": 0.8881697982549668, + "num_tokens": 157384150.0, + "step": 56100 + }, + { + "epoch": 1.3640445620251935, + "grad_norm": 0.7890717387199402, + "learning_rate": 0.0001326657101578665, + "loss": 0.4054, + "mean_token_accuracy": 0.899683158993721, + "num_tokens": 157680572.0, + "step": 56200 + }, + { + "epoch": 1.3664716875803986, + "grad_norm": 0.7425559163093567, + "learning_rate": 0.00013239866181403303, + "loss": 0.4496, + "mean_token_accuracy": 0.8901696348190308, + "num_tokens": 157960621.0, + "step": 56300 + }, + { + "epoch": 1.3688988131356035, + "grad_norm": 0.9881618022918701, + "learning_rate": 0.00013213135507231577, + "loss": 0.488, + "mean_token_accuracy": 0.8811579287052155, + "num_tokens": 158246805.0, + "step": 56400 + }, + { + "epoch": 1.3713259386908085, + "grad_norm": 0.7465482950210571, + "learning_rate": 0.00013186379206463902, + "loss": 0.423, + "mean_token_accuracy": 0.8940630006790161, + "num_tokens": 158514607.0, + "step": 56500 + }, + { + "epoch": 1.3737530642460134, + "grad_norm": 0.8724623918533325, + "learning_rate": 0.00013159597492497113, + "loss": 0.4433, + "mean_token_accuracy": 0.8902081990242005, + "num_tokens": 158792485.0, + "step": 56600 + }, + { + "epoch": 1.3761801898012185, + "grad_norm": 0.741797924041748, + "learning_rate": 0.00013132790578930704, + "loss": 0.455, + "mean_token_accuracy": 0.8891710060834884, + "num_tokens": 159075983.0, + "step": 56700 + }, + { + "epoch": 1.3786073153564233, + "grad_norm": 0.6590899229049683, + "learning_rate": 0.00013105958679565175, + "loss": 0.4431, + "mean_token_accuracy": 0.8901263773441315, + "num_tokens": 159364576.0, + "step": 56800 + }, + { + "epoch": 1.3810344409116284, + "grad_norm": 0.7108921408653259, + "learning_rate": 0.00013079102008400285, + "loss": 0.4451, + "mean_token_accuracy": 0.8915826362371445, + "num_tokens": 159648774.0, + "step": 56900 + }, + { + "epoch": 1.3834615664668333, + "grad_norm": 0.7816290259361267, + "learning_rate": 0.00013052220779633368, + "loss": 0.426, + "mean_token_accuracy": 0.8951281315088272, + "num_tokens": 159934295.0, + "step": 57000 + }, + { + "epoch": 1.3858886920220383, + "grad_norm": 0.5931801199913025, + "learning_rate": 0.00013025315207657625, + "loss": 0.4106, + "mean_token_accuracy": 0.8973771643638611, + "num_tokens": 160207873.0, + "step": 57100 + }, + { + "epoch": 1.3883158175772432, + "grad_norm": 0.9697718024253845, + "learning_rate": 0.00012998385507060394, + "loss": 0.401, + "mean_token_accuracy": 0.9002759510278702, + "num_tokens": 160493828.0, + "step": 57200 + }, + { + "epoch": 1.3907429431324483, + "grad_norm": 0.9094292521476746, + "learning_rate": 0.00012971431892621466, + "loss": 0.4554, + "mean_token_accuracy": 0.8877697956562042, + "num_tokens": 160772760.0, + "step": 57300 + }, + { + "epoch": 1.3931700686876531, + "grad_norm": 0.6295153498649597, + "learning_rate": 0.00012944454579311355, + "loss": 0.4425, + "mean_token_accuracy": 0.8911004400253296, + "num_tokens": 161056385.0, + "step": 57400 + }, + { + "epoch": 1.3955971942428582, + "grad_norm": 0.6608839631080627, + "learning_rate": 0.00012917453782289578, + "loss": 0.3784, + "mean_token_accuracy": 0.904485547542572, + "num_tokens": 161348717.0, + "step": 57500 + }, + { + "epoch": 1.398024319798063, + "grad_norm": 0.8495998978614807, + "learning_rate": 0.00012890429716902964, + "loss": 0.4467, + "mean_token_accuracy": 0.8919394075870514, + "num_tokens": 161627738.0, + "step": 57600 + }, + { + "epoch": 1.4004514453532682, + "grad_norm": 0.593804657459259, + "learning_rate": 0.00012863382598683908, + "loss": 0.4381, + "mean_token_accuracy": 0.8924766594171524, + "num_tokens": 161903764.0, + "step": 57700 + }, + { + "epoch": 1.4028785709084732, + "grad_norm": 0.8975790739059448, + "learning_rate": 0.00012836312643348666, + "loss": 0.4423, + "mean_token_accuracy": 0.8917118191719056, + "num_tokens": 162185851.0, + "step": 57800 + }, + { + "epoch": 1.405305696463678, + "grad_norm": 0.6902527809143066, + "learning_rate": 0.0001280922006679564, + "loss": 0.4134, + "mean_token_accuracy": 0.8982209903001785, + "num_tokens": 162457139.0, + "step": 57900 + }, + { + "epoch": 1.407732822018883, + "grad_norm": 0.48210135102272034, + "learning_rate": 0.00012782105085103648, + "loss": 0.3923, + "mean_token_accuracy": 0.9033814692497253, + "num_tokens": 162748843.0, + "step": 58000 + }, + { + "epoch": 1.410159947574088, + "grad_norm": 0.6424720883369446, + "learning_rate": 0.00012754967914530192, + "loss": 0.4599, + "mean_token_accuracy": 0.8881127738952637, + "num_tokens": 163031004.0, + "step": 58100 + }, + { + "epoch": 1.412587073129293, + "grad_norm": 0.6549175381660461, + "learning_rate": 0.0001272780877150975, + "loss": 0.4222, + "mean_token_accuracy": 0.8963321650028229, + "num_tokens": 163312021.0, + "step": 58200 + }, + { + "epoch": 1.415014198684498, + "grad_norm": 0.7186068296432495, + "learning_rate": 0.0001270062787265205, + "loss": 0.4635, + "mean_token_accuracy": 0.8873461991548538, + "num_tokens": 163592479.0, + "step": 58300 + }, + { + "epoch": 1.4174413242397028, + "grad_norm": 0.9494929909706116, + "learning_rate": 0.00012673425434740315, + "loss": 0.4281, + "mean_token_accuracy": 0.8949847120046616, + "num_tokens": 163875865.0, + "step": 58400 + }, + { + "epoch": 1.419868449794908, + "grad_norm": 0.546734094619751, + "learning_rate": 0.00012646201674729578, + "loss": 0.3853, + "mean_token_accuracy": 0.9039551669359207, + "num_tokens": 164147648.0, + "step": 58500 + }, + { + "epoch": 1.422295575350113, + "grad_norm": 0.598573625087738, + "learning_rate": 0.00012618956809744915, + "loss": 0.3922, + "mean_token_accuracy": 0.9019291293621063, + "num_tokens": 164431800.0, + "step": 58600 + }, + { + "epoch": 1.4247227009053178, + "grad_norm": 0.656909704208374, + "learning_rate": 0.00012591691057079726, + "loss": 0.4027, + "mean_token_accuracy": 0.9010864180326462, + "num_tokens": 164713062.0, + "step": 58700 + }, + { + "epoch": 1.4271498264605227, + "grad_norm": 0.6891266703605652, + "learning_rate": 0.00012564404634194006, + "loss": 0.4059, + "mean_token_accuracy": 0.9008734589815139, + "num_tokens": 165013179.0, + "step": 58800 + }, + { + "epoch": 1.4295769520157278, + "grad_norm": 0.8763130903244019, + "learning_rate": 0.00012537097758712605, + "loss": 0.3738, + "mean_token_accuracy": 0.9075895375013352, + "num_tokens": 165302834.0, + "step": 58900 + }, + { + "epoch": 1.4320040775709328, + "grad_norm": 0.8782544136047363, + "learning_rate": 0.00012509770648423495, + "loss": 0.4009, + "mean_token_accuracy": 0.9022983765602112, + "num_tokens": 165582933.0, + "step": 59000 + }, + { + "epoch": 1.4344312031261377, + "grad_norm": 0.7040051221847534, + "learning_rate": 0.0001248242352127603, + "loss": 0.4491, + "mean_token_accuracy": 0.8912272185087204, + "num_tokens": 165868666.0, + "step": 59100 + }, + { + "epoch": 1.4368583286813426, + "grad_norm": 0.65074622631073, + "learning_rate": 0.00012455056595379212, + "loss": 0.4357, + "mean_token_accuracy": 0.8948434573411942, + "num_tokens": 166145044.0, + "step": 59200 + }, + { + "epoch": 1.4392854542365476, + "grad_norm": 0.7173576354980469, + "learning_rate": 0.0001242767008899995, + "loss": 0.4587, + "mean_token_accuracy": 0.888498705625534, + "num_tokens": 166438123.0, + "step": 59300 + }, + { + "epoch": 1.4417125797917527, + "grad_norm": 0.7104485630989075, + "learning_rate": 0.00012400264220561322, + "loss": 0.41, + "mean_token_accuracy": 0.9003169226646424, + "num_tokens": 166725344.0, + "step": 59400 + }, + { + "epoch": 1.4441397053469576, + "grad_norm": 0.8286243677139282, + "learning_rate": 0.00012372839208640816, + "loss": 0.4163, + "mean_token_accuracy": 0.8978767049312592, + "num_tokens": 167007251.0, + "step": 59500 + }, + { + "epoch": 1.4465668309021626, + "grad_norm": 1.1104589700698853, + "learning_rate": 0.00012345395271968617, + "loss": 0.4176, + "mean_token_accuracy": 0.8967356210947037, + "num_tokens": 167285691.0, + "step": 59600 + }, + { + "epoch": 1.4489939564573675, + "grad_norm": 0.7582440376281738, + "learning_rate": 0.00012317932629425833, + "loss": 0.4242, + "mean_token_accuracy": 0.8963595002889633, + "num_tokens": 167577906.0, + "step": 59700 + }, + { + "epoch": 1.4514210820125726, + "grad_norm": 0.651850163936615, + "learning_rate": 0.00012290451500042768, + "loss": 0.3934, + "mean_token_accuracy": 0.9035976582765579, + "num_tokens": 167857298.0, + "step": 59800 + }, + { + "epoch": 1.4538482075677774, + "grad_norm": 0.5393962860107422, + "learning_rate": 0.00012262952102997168, + "loss": 0.3914, + "mean_token_accuracy": 0.9034919530153275, + "num_tokens": 168138829.0, + "step": 59900 + }, + { + "epoch": 1.4562753331229825, + "grad_norm": 0.6189530491828918, + "learning_rate": 0.00012235434657612482, + "loss": 0.4317, + "mean_token_accuracy": 0.8959889018535614, + "num_tokens": 168426422.0, + "step": 60000 + }, + { + "epoch": 1.4587024586781874, + "grad_norm": 0.7726854085922241, + "learning_rate": 0.00012207899383356085, + "loss": 0.4203, + "mean_token_accuracy": 0.8981534975767136, + "num_tokens": 168707865.0, + "step": 60100 + }, + { + "epoch": 1.4611295842333925, + "grad_norm": 0.7277886867523193, + "learning_rate": 0.00012180346499837567, + "loss": 0.3604, + "mean_token_accuracy": 0.909367219209671, + "num_tokens": 168985511.0, + "step": 60200 + }, + { + "epoch": 1.4635567097885973, + "grad_norm": 0.7043424248695374, + "learning_rate": 0.00012152776226806955, + "loss": 0.3954, + "mean_token_accuracy": 0.9025062710046768, + "num_tokens": 169272720.0, + "step": 60300 + }, + { + "epoch": 1.4659838353438024, + "grad_norm": 0.6475239396095276, + "learning_rate": 0.0001212518878415296, + "loss": 0.402, + "mean_token_accuracy": 0.9011396139860153, + "num_tokens": 169545792.0, + "step": 60400 + }, + { + "epoch": 1.4684109608990072, + "grad_norm": 0.9668689966201782, + "learning_rate": 0.00012097584391901244, + "loss": 0.4272, + "mean_token_accuracy": 0.8966371548175812, + "num_tokens": 169820743.0, + "step": 60500 + }, + { + "epoch": 1.4708380864542123, + "grad_norm": 0.5923229455947876, + "learning_rate": 0.00012069963270212645, + "loss": 0.3847, + "mean_token_accuracy": 0.9052494311332703, + "num_tokens": 170077149.0, + "step": 60600 + }, + { + "epoch": 1.4732652120094172, + "grad_norm": 0.9675098061561584, + "learning_rate": 0.00012042325639381424, + "loss": 0.4158, + "mean_token_accuracy": 0.8990872955322265, + "num_tokens": 170357611.0, + "step": 60700 + }, + { + "epoch": 1.4756923375646223, + "grad_norm": 0.9949508905410767, + "learning_rate": 0.00012014671719833521, + "loss": 0.4143, + "mean_token_accuracy": 0.8991787171363831, + "num_tokens": 170636443.0, + "step": 60800 + }, + { + "epoch": 1.4781194631198271, + "grad_norm": 0.6021668314933777, + "learning_rate": 0.0001198700173212478, + "loss": 0.4384, + "mean_token_accuracy": 0.8946084171533585, + "num_tokens": 170928533.0, + "step": 60900 + }, + { + "epoch": 1.4805465886750322, + "grad_norm": 0.7970176935195923, + "learning_rate": 0.000119593158969392, + "loss": 0.3983, + "mean_token_accuracy": 0.9021727895736694, + "num_tokens": 171211196.0, + "step": 61000 + }, + { + "epoch": 1.482973714230237, + "grad_norm": 0.7371858954429626, + "learning_rate": 0.00011931614435087174, + "loss": 0.4151, + "mean_token_accuracy": 0.8985683709383011, + "num_tokens": 171479735.0, + "step": 61100 + }, + { + "epoch": 1.4854008397854421, + "grad_norm": 0.7278369069099426, + "learning_rate": 0.00011903897567503723, + "loss": 0.3895, + "mean_token_accuracy": 0.904181165099144, + "num_tokens": 171755093.0, + "step": 61200 + }, + { + "epoch": 1.4878279653406472, + "grad_norm": 0.9078586101531982, + "learning_rate": 0.00011876165515246744, + "loss": 0.4234, + "mean_token_accuracy": 0.8975623792409897, + "num_tokens": 172048894.0, + "step": 61300 + }, + { + "epoch": 1.490255090895852, + "grad_norm": 0.6660951375961304, + "learning_rate": 0.0001184841849949524, + "loss": 0.3481, + "mean_token_accuracy": 0.9130461746454239, + "num_tokens": 172356896.0, + "step": 61400 + }, + { + "epoch": 1.492682216451057, + "grad_norm": 0.7461407780647278, + "learning_rate": 0.00011820656741547544, + "loss": 0.3653, + "mean_token_accuracy": 0.9098727852106094, + "num_tokens": 172640726.0, + "step": 61500 + }, + { + "epoch": 1.495109342006262, + "grad_norm": 0.5631619691848755, + "learning_rate": 0.00011792880462819585, + "loss": 0.4279, + "mean_token_accuracy": 0.8971470546722412, + "num_tokens": 172906449.0, + "step": 61600 + }, + { + "epoch": 1.497536467561467, + "grad_norm": 1.2110614776611328, + "learning_rate": 0.00011765089884843091, + "loss": 0.373, + "mean_token_accuracy": 0.9086796522140503, + "num_tokens": 173172381.0, + "step": 61700 + }, + { + "epoch": 1.499963593116672, + "grad_norm": 0.5904493927955627, + "learning_rate": 0.00011737285229263832, + "loss": 0.3914, + "mean_token_accuracy": 0.9038412702083588, + "num_tokens": 173456589.0, + "step": 61800 + }, + { + "epoch": 1.5023907186718768, + "grad_norm": 0.9040527939796448, + "learning_rate": 0.0001170946671783987, + "loss": 0.393, + "mean_token_accuracy": 0.902976735830307, + "num_tokens": 173729202.0, + "step": 61900 + }, + { + "epoch": 1.5048178442270819, + "grad_norm": 0.5546716451644897, + "learning_rate": 0.00011681634572439761, + "loss": 0.4038, + "mean_token_accuracy": 0.901577382683754, + "num_tokens": 174014930.0, + "step": 62000 + }, + { + "epoch": 1.507244969782287, + "grad_norm": 0.8447543978691101, + "learning_rate": 0.00011653789015040805, + "loss": 0.3878, + "mean_token_accuracy": 0.9046182173490525, + "num_tokens": 174290764.0, + "step": 62100 + }, + { + "epoch": 1.5096720953374918, + "grad_norm": 0.8927494883537292, + "learning_rate": 0.00011625930267727276, + "loss": 0.4146, + "mean_token_accuracy": 0.8986788499355316, + "num_tokens": 174582875.0, + "step": 62200 + }, + { + "epoch": 1.5120992208926967, + "grad_norm": 0.9438863396644592, + "learning_rate": 0.00011598058552688633, + "loss": 0.3777, + "mean_token_accuracy": 0.9074046242237092, + "num_tokens": 174867623.0, + "step": 62300 + }, + { + "epoch": 1.5145263464479017, + "grad_norm": 1.016975998878479, + "learning_rate": 0.00011570174092217769, + "loss": 0.4071, + "mean_token_accuracy": 0.9008045589923859, + "num_tokens": 175157062.0, + "step": 62400 + }, + { + "epoch": 1.5169534720031068, + "grad_norm": 0.7987212538719177, + "learning_rate": 0.00011542277108709226, + "loss": 0.3882, + "mean_token_accuracy": 0.904829193353653, + "num_tokens": 175436247.0, + "step": 62500 + }, + { + "epoch": 1.5193805975583117, + "grad_norm": 0.8533787727355957, + "learning_rate": 0.00011514367824657427, + "loss": 0.392, + "mean_token_accuracy": 0.9054911434650421, + "num_tokens": 175720578.0, + "step": 62600 + }, + { + "epoch": 1.5218077231135165, + "grad_norm": 0.760500967502594, + "learning_rate": 0.00011486446462654892, + "loss": 0.4002, + "mean_token_accuracy": 0.9018949067592621, + "num_tokens": 175995977.0, + "step": 62700 + }, + { + "epoch": 1.5242348486687216, + "grad_norm": 0.8798636794090271, + "learning_rate": 0.00011458513245390483, + "loss": 0.3706, + "mean_token_accuracy": 0.9091184973716736, + "num_tokens": 176277514.0, + "step": 62800 + }, + { + "epoch": 1.5266619742239267, + "grad_norm": 0.5968068838119507, + "learning_rate": 0.00011430568395647598, + "loss": 0.3607, + "mean_token_accuracy": 0.9119984036684037, + "num_tokens": 176569080.0, + "step": 62900 + }, + { + "epoch": 1.5290890997791315, + "grad_norm": 0.653016209602356, + "learning_rate": 0.00011402612136302421, + "loss": 0.4304, + "mean_token_accuracy": 0.8962489527463913, + "num_tokens": 176836863.0, + "step": 63000 + }, + { + "epoch": 1.5315162253343364, + "grad_norm": 0.4983295798301697, + "learning_rate": 0.00011374644690322136, + "loss": 0.3713, + "mean_token_accuracy": 0.9084924668073654, + "num_tokens": 177119720.0, + "step": 63100 + }, + { + "epoch": 1.5339433508895415, + "grad_norm": 0.743560254573822, + "learning_rate": 0.00011346666280763134, + "loss": 0.3405, + "mean_token_accuracy": 0.9152973371744156, + "num_tokens": 177396225.0, + "step": 63200 + }, + { + "epoch": 1.5363704764447466, + "grad_norm": 0.5968629121780396, + "learning_rate": 0.00011318677130769261, + "loss": 0.3543, + "mean_token_accuracy": 0.9123020827770233, + "num_tokens": 177670691.0, + "step": 63300 + }, + { + "epoch": 1.5387976019999514, + "grad_norm": 0.7266377210617065, + "learning_rate": 0.00011290677463570024, + "loss": 0.4014, + "mean_token_accuracy": 0.902683122754097, + "num_tokens": 177944237.0, + "step": 63400 + }, + { + "epoch": 1.5412247275551563, + "grad_norm": 0.6020427346229553, + "learning_rate": 0.00011262667502478793, + "loss": 0.3823, + "mean_token_accuracy": 0.9062705504894256, + "num_tokens": 178226347.0, + "step": 63500 + }, + { + "epoch": 1.5436518531103613, + "grad_norm": 0.7702438831329346, + "learning_rate": 0.00011234647470891059, + "loss": 0.3765, + "mean_token_accuracy": 0.907245215177536, + "num_tokens": 178510805.0, + "step": 63600 + }, + { + "epoch": 1.5460789786655664, + "grad_norm": 0.9679001569747925, + "learning_rate": 0.00011206617592282623, + "loss": 0.3457, + "mean_token_accuracy": 0.9141608655452729, + "num_tokens": 178787352.0, + "step": 63700 + }, + { + "epoch": 1.5485061042207713, + "grad_norm": 0.8241939544677734, + "learning_rate": 0.0001117857809020781, + "loss": 0.3556, + "mean_token_accuracy": 0.9122043627500535, + "num_tokens": 179050596.0, + "step": 63800 + }, + { + "epoch": 1.5509332297759764, + "grad_norm": 0.696847677230835, + "learning_rate": 0.00011150529188297716, + "loss": 0.4277, + "mean_token_accuracy": 0.8969793927669525, + "num_tokens": 179332713.0, + "step": 63900 + }, + { + "epoch": 1.5533603553311814, + "grad_norm": 0.6272198557853699, + "learning_rate": 0.00011122471110258401, + "loss": 0.3932, + "mean_token_accuracy": 0.9046736443042755, + "num_tokens": 179613755.0, + "step": 64000 + }, + { + "epoch": 1.5557874808863863, + "grad_norm": 0.7845979332923889, + "learning_rate": 0.000110944040798691, + "loss": 0.3948, + "mean_token_accuracy": 0.9039056050777435, + "num_tokens": 179884755.0, + "step": 64100 + }, + { + "epoch": 1.5582146064415912, + "grad_norm": 1.0909096002578735, + "learning_rate": 0.00011066328320980464, + "loss": 0.3662, + "mean_token_accuracy": 0.9101364880800247, + "num_tokens": 180167093.0, + "step": 64200 + }, + { + "epoch": 1.5606417319967962, + "grad_norm": 0.6839550137519836, + "learning_rate": 0.00011038244057512743, + "loss": 0.4025, + "mean_token_accuracy": 0.9013819855451584, + "num_tokens": 180442997.0, + "step": 64300 + }, + { + "epoch": 1.5630688575520013, + "grad_norm": 0.6133870482444763, + "learning_rate": 0.00011010151513454023, + "loss": 0.328, + "mean_token_accuracy": 0.918245889544487, + "num_tokens": 180733470.0, + "step": 64400 + }, + { + "epoch": 1.5654959831072062, + "grad_norm": 0.576290488243103, + "learning_rate": 0.00010982050912858435, + "loss": 0.3754, + "mean_token_accuracy": 0.9084422850608825, + "num_tokens": 181004342.0, + "step": 64500 + }, + { + "epoch": 1.567923108662411, + "grad_norm": 1.109480857849121, + "learning_rate": 0.00010953942479844364, + "loss": 0.3778, + "mean_token_accuracy": 0.907360492348671, + "num_tokens": 181280978.0, + "step": 64600 + }, + { + "epoch": 1.570350234217616, + "grad_norm": 0.7722584009170532, + "learning_rate": 0.00010925826438592657, + "loss": 0.3491, + "mean_token_accuracy": 0.9142099022865295, + "num_tokens": 181559969.0, + "step": 64700 + }, + { + "epoch": 1.5727773597728212, + "grad_norm": 0.6564502120018005, + "learning_rate": 0.00010897703013344854, + "loss": 0.3711, + "mean_token_accuracy": 0.9086374562978744, + "num_tokens": 181841280.0, + "step": 64800 + }, + { + "epoch": 1.575204485328026, + "grad_norm": 0.5784518122673035, + "learning_rate": 0.00010869572428401371, + "loss": 0.3975, + "mean_token_accuracy": 0.9038605362176895, + "num_tokens": 182121713.0, + "step": 64900 + }, + { + "epoch": 1.577631610883231, + "grad_norm": 0.7998061180114746, + "learning_rate": 0.00010841434908119736, + "loss": 0.3398, + "mean_token_accuracy": 0.9166411167383194, + "num_tokens": 182413653.0, + "step": 65000 + }, + { + "epoch": 1.580058736438436, + "grad_norm": 0.6847976446151733, + "learning_rate": 0.00010813290676912792, + "loss": 0.3515, + "mean_token_accuracy": 0.9136654776334763, + "num_tokens": 182689793.0, + "step": 65100 + }, + { + "epoch": 1.582485861993641, + "grad_norm": 0.5379784107208252, + "learning_rate": 0.000107851399592469, + "loss": 0.3889, + "mean_token_accuracy": 0.905264812707901, + "num_tokens": 182969672.0, + "step": 65200 + }, + { + "epoch": 1.584912987548846, + "grad_norm": 0.9807937145233154, + "learning_rate": 0.00010756982979640154, + "loss": 0.3658, + "mean_token_accuracy": 0.9102151840925217, + "num_tokens": 183245797.0, + "step": 65300 + }, + { + "epoch": 1.5873401131040508, + "grad_norm": 0.9323781132698059, + "learning_rate": 0.00010728819962660598, + "loss": 0.375, + "mean_token_accuracy": 0.9090581947565078, + "num_tokens": 183522770.0, + "step": 65400 + }, + { + "epoch": 1.5897672386592558, + "grad_norm": 0.7109302282333374, + "learning_rate": 0.00010700651132924418, + "loss": 0.3142, + "mean_token_accuracy": 0.9218410032987595, + "num_tokens": 183797170.0, + "step": 65500 + }, + { + "epoch": 1.592194364214461, + "grad_norm": 0.7105025053024292, + "learning_rate": 0.00010672476715094167, + "loss": 0.3766, + "mean_token_accuracy": 0.9082463669776917, + "num_tokens": 184074307.0, + "step": 65600 + }, + { + "epoch": 1.5946214897696658, + "grad_norm": 0.813896119594574, + "learning_rate": 0.0001064429693387696, + "loss": 0.3739, + "mean_token_accuracy": 0.909969937801361, + "num_tokens": 184370954.0, + "step": 65700 + }, + { + "epoch": 1.5970486153248706, + "grad_norm": 0.8465865850448608, + "learning_rate": 0.00010616112014022694, + "loss": 0.353, + "mean_token_accuracy": 0.9132819199562072, + "num_tokens": 184655076.0, + "step": 65800 + }, + { + "epoch": 1.5994757408800757, + "grad_norm": 0.66167151927948, + "learning_rate": 0.0001058792218032225, + "loss": 0.3819, + "mean_token_accuracy": 0.9072164571285248, + "num_tokens": 184934233.0, + "step": 65900 + }, + { + "epoch": 1.6019028664352808, + "grad_norm": 0.49021396040916443, + "learning_rate": 0.00010559727657605694, + "loss": 0.3625, + "mean_token_accuracy": 0.9116124641895295, + "num_tokens": 185211848.0, + "step": 66000 + }, + { + "epoch": 1.6043299919904856, + "grad_norm": 0.6963539719581604, + "learning_rate": 0.00010531528670740492, + "loss": 0.3535, + "mean_token_accuracy": 0.9135640490055085, + "num_tokens": 185484414.0, + "step": 66100 + }, + { + "epoch": 1.6067571175456905, + "grad_norm": 0.7452908754348755, + "learning_rate": 0.00010503325444629717, + "loss": 0.3922, + "mean_token_accuracy": 0.9036144858598709, + "num_tokens": 185773330.0, + "step": 66200 + }, + { + "epoch": 1.6091842431008956, + "grad_norm": 0.7494684457778931, + "learning_rate": 0.0001047511820421025, + "loss": 0.3397, + "mean_token_accuracy": 0.9164110898971558, + "num_tokens": 186065496.0, + "step": 66300 + }, + { + "epoch": 1.6116113686561007, + "grad_norm": 0.5952861309051514, + "learning_rate": 0.00010446907174450983, + "loss": 0.4031, + "mean_token_accuracy": 0.9032100856304168, + "num_tokens": 186349033.0, + "step": 66400 + }, + { + "epoch": 1.6140384942113055, + "grad_norm": 1.3102141618728638, + "learning_rate": 0.00010418692580351046, + "loss": 0.3342, + "mean_token_accuracy": 0.9185534894466401, + "num_tokens": 186650789.0, + "step": 66500 + }, + { + "epoch": 1.6164656197665104, + "grad_norm": 0.7236355543136597, + "learning_rate": 0.0001039047464693798, + "loss": 0.3566, + "mean_token_accuracy": 0.9125622332096099, + "num_tokens": 186923321.0, + "step": 66600 + }, + { + "epoch": 1.6188927453217155, + "grad_norm": 0.8594217896461487, + "learning_rate": 0.00010362253599265963, + "loss": 0.3639, + "mean_token_accuracy": 0.9123637741804123, + "num_tokens": 187201566.0, + "step": 66700 + }, + { + "epoch": 1.6213198708769205, + "grad_norm": 0.710017204284668, + "learning_rate": 0.0001033402966241402, + "loss": 0.4091, + "mean_token_accuracy": 0.9025905507802964, + "num_tokens": 187485117.0, + "step": 66800 + }, + { + "epoch": 1.6237469964321254, + "grad_norm": 0.6511447429656982, + "learning_rate": 0.0001030580306148421, + "loss": 0.3392, + "mean_token_accuracy": 0.9179734051227569, + "num_tokens": 187778358.0, + "step": 66900 + }, + { + "epoch": 1.6261741219873302, + "grad_norm": 0.656478762626648, + "learning_rate": 0.00010277574021599836, + "loss": 0.3134, + "mean_token_accuracy": 0.9212139695882797, + "num_tokens": 188044395.0, + "step": 67000 + }, + { + "epoch": 1.6286012475425353, + "grad_norm": 0.5202460289001465, + "learning_rate": 0.00010249342767903667, + "loss": 0.3384, + "mean_token_accuracy": 0.9168726438283921, + "num_tokens": 188316468.0, + "step": 67100 + }, + { + "epoch": 1.6310283730977404, + "grad_norm": 0.4842274487018585, + "learning_rate": 0.00010221109525556116, + "loss": 0.3457, + "mean_token_accuracy": 0.9164014345407486, + "num_tokens": 188581889.0, + "step": 67200 + }, + { + "epoch": 1.6334554986529453, + "grad_norm": 0.6255847215652466, + "learning_rate": 0.0001019287451973346, + "loss": 0.3154, + "mean_token_accuracy": 0.9217046529054642, + "num_tokens": 188859717.0, + "step": 67300 + }, + { + "epoch": 1.6358826242081503, + "grad_norm": 0.6042391657829285, + "learning_rate": 0.00010164637975626045, + "loss": 0.3318, + "mean_token_accuracy": 0.9182331216335297, + "num_tokens": 189135523.0, + "step": 67400 + }, + { + "epoch": 1.6383097497633554, + "grad_norm": 0.6733984351158142, + "learning_rate": 0.0001013640011843648, + "loss": 0.3288, + "mean_token_accuracy": 0.9198346930742264, + "num_tokens": 189423670.0, + "step": 67500 + }, + { + "epoch": 1.6407368753185603, + "grad_norm": 0.7567030787467957, + "learning_rate": 0.0001010816117337785, + "loss": 0.3475, + "mean_token_accuracy": 0.9150215810537339, + "num_tokens": 189701230.0, + "step": 67600 + }, + { + "epoch": 1.6431640008737651, + "grad_norm": 0.6925593614578247, + "learning_rate": 0.00010079921365671914, + "loss": 0.3163, + "mean_token_accuracy": 0.9217036014795303, + "num_tokens": 189977907.0, + "step": 67700 + }, + { + "epoch": 1.6455911264289702, + "grad_norm": 0.554702639579773, + "learning_rate": 0.00010051680920547313, + "loss": 0.3277, + "mean_token_accuracy": 0.9187106388807297, + "num_tokens": 190256876.0, + "step": 67800 + }, + { + "epoch": 1.6480182519841753, + "grad_norm": 0.9477412700653076, + "learning_rate": 0.0001002344006323777, + "loss": 0.3244, + "mean_token_accuracy": 0.9190926957130432, + "num_tokens": 190538466.0, + "step": 67900 + }, + { + "epoch": 1.6504453775393801, + "grad_norm": 0.7824245691299438, + "learning_rate": 9.9951990189803e-05, + "loss": 0.3529, + "mean_token_accuracy": 0.9144074618816376, + "num_tokens": 190823841.0, + "step": 68000 + }, + { + "epoch": 1.652872503094585, + "grad_norm": 0.6957218050956726, + "learning_rate": 9.9669580130134e-05, + "loss": 0.3402, + "mean_token_accuracy": 0.9166605508327484, + "num_tokens": 191086712.0, + "step": 68100 + }, + { + "epoch": 1.65529962864979, + "grad_norm": 0.4525381624698639, + "learning_rate": 9.938717270575272e-05, + "loss": 0.338, + "mean_token_accuracy": 0.9177921932935714, + "num_tokens": 191373552.0, + "step": 68200 + }, + { + "epoch": 1.6577267542049952, + "grad_norm": 0.4912790358066559, + "learning_rate": 9.910477016902016e-05, + "loss": 0.3639, + "mean_token_accuracy": 0.9108395028114319, + "num_tokens": 191648507.0, + "step": 68300 + }, + { + "epoch": 1.6601538797602, + "grad_norm": 0.6194826364517212, + "learning_rate": 9.882237477225816e-05, + "loss": 0.3618, + "mean_token_accuracy": 0.9116045045852661, + "num_tokens": 191928076.0, + "step": 68400 + }, + { + "epoch": 1.6625810053154049, + "grad_norm": 0.6417632102966309, + "learning_rate": 9.853998876773183e-05, + "loss": 0.34, + "mean_token_accuracy": 0.9173044127225876, + "num_tokens": 192200434.0, + "step": 68500 + }, + { + "epoch": 1.66500813087061, + "grad_norm": 0.7448539733886719, + "learning_rate": 9.825761440763129e-05, + "loss": 0.3352, + "mean_token_accuracy": 0.9177643340826035, + "num_tokens": 192480776.0, + "step": 68600 + }, + { + "epoch": 1.667435256425815, + "grad_norm": 0.6451402306556702, + "learning_rate": 9.797525394405376e-05, + "loss": 0.3339, + "mean_token_accuracy": 0.9174343067407608, + "num_tokens": 192754899.0, + "step": 68700 + }, + { + "epoch": 1.6698623819810199, + "grad_norm": 0.982666552066803, + "learning_rate": 9.769290962898566e-05, + "loss": 0.3221, + "mean_token_accuracy": 0.9207504653930664, + "num_tokens": 193041219.0, + "step": 68800 + }, + { + "epoch": 1.6722895075362247, + "grad_norm": 0.5121039748191833, + "learning_rate": 9.741058371428468e-05, + "loss": 0.3547, + "mean_token_accuracy": 0.9136625772714615, + "num_tokens": 193314947.0, + "step": 68900 + }, + { + "epoch": 1.6747166330914298, + "grad_norm": 0.702271580696106, + "learning_rate": 9.712827845166157e-05, + "loss": 0.3137, + "mean_token_accuracy": 0.9225055646896362, + "num_tokens": 193591255.0, + "step": 69000 + }, + { + "epoch": 1.677143758646635, + "grad_norm": 0.685634970664978, + "learning_rate": 9.684599609266253e-05, + "loss": 0.3345, + "mean_token_accuracy": 0.9185575705766678, + "num_tokens": 193860731.0, + "step": 69100 + }, + { + "epoch": 1.6795708842018398, + "grad_norm": 0.7711082100868225, + "learning_rate": 9.65637388886511e-05, + "loss": 0.3724, + "mean_token_accuracy": 0.9091116571426392, + "num_tokens": 194128841.0, + "step": 69200 + }, + { + "epoch": 1.6819980097570446, + "grad_norm": 1.1533504724502563, + "learning_rate": 9.628150909079007e-05, + "loss": 0.3446, + "mean_token_accuracy": 0.9169007408618927, + "num_tokens": 194407308.0, + "step": 69300 + }, + { + "epoch": 1.6844251353122497, + "grad_norm": 0.5997574925422668, + "learning_rate": 9.599930895002375e-05, + "loss": 0.3207, + "mean_token_accuracy": 0.9212809431552887, + "num_tokens": 194691502.0, + "step": 69400 + }, + { + "epoch": 1.6868522608674548, + "grad_norm": 0.4966389834880829, + "learning_rate": 9.571714071705993e-05, + "loss": 0.3017, + "mean_token_accuracy": 0.92467877805233, + "num_tokens": 194986931.0, + "step": 69500 + }, + { + "epoch": 1.6892793864226596, + "grad_norm": 1.2698211669921875, + "learning_rate": 9.54350066423518e-05, + "loss": 0.3154, + "mean_token_accuracy": 0.9225751388072968, + "num_tokens": 195268236.0, + "step": 69600 + }, + { + "epoch": 1.6917065119778645, + "grad_norm": 0.9074554443359375, + "learning_rate": 9.515290897608021e-05, + "loss": 0.3801, + "mean_token_accuracy": 0.9084242284297943, + "num_tokens": 195545051.0, + "step": 69700 + }, + { + "epoch": 1.6941336375330696, + "grad_norm": 0.6302804946899414, + "learning_rate": 9.487084996813566e-05, + "loss": 0.3175, + "mean_token_accuracy": 0.9217024165391922, + "num_tokens": 195822504.0, + "step": 69800 + }, + { + "epoch": 1.6965607630882746, + "grad_norm": 0.829451322555542, + "learning_rate": 9.458883186810024e-05, + "loss": 0.3278, + "mean_token_accuracy": 0.9191747254133225, + "num_tokens": 196095524.0, + "step": 69900 + }, + { + "epoch": 1.6989878886434795, + "grad_norm": 0.7346980571746826, + "learning_rate": 9.430685692522983e-05, + "loss": 0.3116, + "mean_token_accuracy": 0.9237532925605774, + "num_tokens": 196371157.0, + "step": 70000 + }, + { + "epoch": 1.7014150141986844, + "grad_norm": 0.5194724798202515, + "learning_rate": 9.402492738843615e-05, + "loss": 0.3158, + "mean_token_accuracy": 0.9228990191221237, + "num_tokens": 196661666.0, + "step": 70100 + }, + { + "epoch": 1.7038421397538894, + "grad_norm": 0.8965851068496704, + "learning_rate": 9.374304550626863e-05, + "loss": 0.3175, + "mean_token_accuracy": 0.9222236788272857, + "num_tokens": 196930313.0, + "step": 70200 + }, + { + "epoch": 1.7062692653090945, + "grad_norm": 0.7471325397491455, + "learning_rate": 9.346121352689677e-05, + "loss": 0.3038, + "mean_token_accuracy": 0.9257321381568908, + "num_tokens": 197226205.0, + "step": 70300 + }, + { + "epoch": 1.7086963908642994, + "grad_norm": 1.0250093936920166, + "learning_rate": 9.317943369809208e-05, + "loss": 0.3302, + "mean_token_accuracy": 0.920040818452835, + "num_tokens": 197506788.0, + "step": 70400 + }, + { + "epoch": 1.7111235164195042, + "grad_norm": 0.6335323452949524, + "learning_rate": 9.289770826720997e-05, + "loss": 0.3482, + "mean_token_accuracy": 0.9151440984010697, + "num_tokens": 197793320.0, + "step": 70500 + }, + { + "epoch": 1.7135506419747093, + "grad_norm": 0.745668888092041, + "learning_rate": 9.261603948117215e-05, + "loss": 0.3118, + "mean_token_accuracy": 0.922760015130043, + "num_tokens": 198080716.0, + "step": 70600 + }, + { + "epoch": 1.7159777675299144, + "grad_norm": 0.8320471048355103, + "learning_rate": 9.233442958644854e-05, + "loss": 0.3121, + "mean_token_accuracy": 0.9230057382583619, + "num_tokens": 198366181.0, + "step": 70700 + }, + { + "epoch": 1.7184048930851192, + "grad_norm": 0.48289257287979126, + "learning_rate": 9.20528808290393e-05, + "loss": 0.2983, + "mean_token_accuracy": 0.9262391251325607, + "num_tokens": 198633088.0, + "step": 70800 + }, + { + "epoch": 1.7208320186403243, + "grad_norm": 0.4999804198741913, + "learning_rate": 9.177139545445705e-05, + "loss": 0.3451, + "mean_token_accuracy": 0.9155993515253067, + "num_tokens": 198914414.0, + "step": 70900 + }, + { + "epoch": 1.7232591441955294, + "grad_norm": 0.5464954376220703, + "learning_rate": 9.148997570770889e-05, + "loss": 0.2955, + "mean_token_accuracy": 0.9270257157087326, + "num_tokens": 199205965.0, + "step": 71000 + }, + { + "epoch": 1.7256862697507342, + "grad_norm": 0.7479568719863892, + "learning_rate": 9.12086238332784e-05, + "loss": 0.3156, + "mean_token_accuracy": 0.921921678185463, + "num_tokens": 199478121.0, + "step": 71100 + }, + { + "epoch": 1.728113395305939, + "grad_norm": 0.5852890014648438, + "learning_rate": 9.092734207510796e-05, + "loss": 0.3586, + "mean_token_accuracy": 0.9138590520620347, + "num_tokens": 199758362.0, + "step": 71200 + }, + { + "epoch": 1.7305405208611442, + "grad_norm": 0.5110465884208679, + "learning_rate": 9.064613267658073e-05, + "loss": 0.3286, + "mean_token_accuracy": 0.9191134518384934, + "num_tokens": 200054741.0, + "step": 71300 + }, + { + "epoch": 1.7329676464163493, + "grad_norm": 0.5434827208518982, + "learning_rate": 9.036499788050265e-05, + "loss": 0.313, + "mean_token_accuracy": 0.9231429475545884, + "num_tokens": 200335558.0, + "step": 71400 + }, + { + "epoch": 1.7353947719715541, + "grad_norm": 0.5807352066040039, + "learning_rate": 9.00839399290848e-05, + "loss": 0.281, + "mean_token_accuracy": 0.9307907569408417, + "num_tokens": 200604065.0, + "step": 71500 + }, + { + "epoch": 1.737821897526759, + "grad_norm": 0.5315217971801758, + "learning_rate": 8.98029610639253e-05, + "loss": 0.3414, + "mean_token_accuracy": 0.9182829517126083, + "num_tokens": 200879049.0, + "step": 71600 + }, + { + "epoch": 1.740249023081964, + "grad_norm": 0.7881130576133728, + "learning_rate": 8.952206352599149e-05, + "loss": 0.3491, + "mean_token_accuracy": 0.9159856230020523, + "num_tokens": 201165260.0, + "step": 71700 + }, + { + "epoch": 1.7426761486371691, + "grad_norm": 0.6103109121322632, + "learning_rate": 8.924124955560213e-05, + "loss": 0.3239, + "mean_token_accuracy": 0.9207637649774552, + "num_tokens": 201429653.0, + "step": 71800 + }, + { + "epoch": 1.745103274192374, + "grad_norm": 0.5265923142433167, + "learning_rate": 8.896052139240949e-05, + "loss": 0.3101, + "mean_token_accuracy": 0.9246016055345535, + "num_tokens": 201699010.0, + "step": 71900 + }, + { + "epoch": 1.7475303997475788, + "grad_norm": 0.864488959312439, + "learning_rate": 8.867988127538143e-05, + "loss": 0.3371, + "mean_token_accuracy": 0.9180309230089188, + "num_tokens": 201989268.0, + "step": 72000 + }, + { + "epoch": 1.749957525302784, + "grad_norm": 0.7725526094436646, + "learning_rate": 8.839933144278361e-05, + "loss": 0.3159, + "mean_token_accuracy": 0.9218523550033569, + "num_tokens": 202268722.0, + "step": 72100 + }, + { + "epoch": 1.752384650857989, + "grad_norm": 0.7077100872993469, + "learning_rate": 8.811887413216165e-05, + "loss": 0.3141, + "mean_token_accuracy": 0.9246137911081314, + "num_tokens": 202566420.0, + "step": 72200 + }, + { + "epoch": 1.7548117764131939, + "grad_norm": 0.9547737240791321, + "learning_rate": 8.783851158032325e-05, + "loss": 0.3201, + "mean_token_accuracy": 0.9220850229263305, + "num_tokens": 202850518.0, + "step": 72300 + }, + { + "epoch": 1.7572389019683987, + "grad_norm": 0.5615896582603455, + "learning_rate": 8.755824602332025e-05, + "loss": 0.3197, + "mean_token_accuracy": 0.9238210040330886, + "num_tokens": 203155202.0, + "step": 72400 + }, + { + "epoch": 1.7596660275236038, + "grad_norm": 0.7283564805984497, + "learning_rate": 8.727807969643105e-05, + "loss": 0.2929, + "mean_token_accuracy": 0.9278195726871491, + "num_tokens": 203426340.0, + "step": 72500 + }, + { + "epoch": 1.7620931530788089, + "grad_norm": 0.7179740071296692, + "learning_rate": 8.699801483414258e-05, + "loss": 0.297, + "mean_token_accuracy": 0.9268648004531861, + "num_tokens": 203706287.0, + "step": 72600 + }, + { + "epoch": 1.7645202786340137, + "grad_norm": 0.4863494634628296, + "learning_rate": 8.671805367013247e-05, + "loss": 0.3066, + "mean_token_accuracy": 0.925693176984787, + "num_tokens": 203990745.0, + "step": 72700 + }, + { + "epoch": 1.7669474041892186, + "grad_norm": 0.6124697327613831, + "learning_rate": 8.643819843725137e-05, + "loss": 0.3197, + "mean_token_accuracy": 0.9222535014152526, + "num_tokens": 204263513.0, + "step": 72800 + }, + { + "epoch": 1.7693745297444237, + "grad_norm": 0.4539005756378174, + "learning_rate": 8.615845136750508e-05, + "loss": 0.2946, + "mean_token_accuracy": 0.9275599610805512, + "num_tokens": 204525364.0, + "step": 72900 + }, + { + "epoch": 1.7718016552996287, + "grad_norm": 0.6153349876403809, + "learning_rate": 8.587881469203664e-05, + "loss": 0.3038, + "mean_token_accuracy": 0.9263102424144745, + "num_tokens": 204810853.0, + "step": 73000 + }, + { + "epoch": 1.7742287808548336, + "grad_norm": 0.801893949508667, + "learning_rate": 8.559929064110873e-05, + "loss": 0.3046, + "mean_token_accuracy": 0.9255858755111694, + "num_tokens": 205103416.0, + "step": 73100 + }, + { + "epoch": 1.7766559064100385, + "grad_norm": 0.9156849384307861, + "learning_rate": 8.531988144408576e-05, + "loss": 0.2973, + "mean_token_accuracy": 0.9272646516561508, + "num_tokens": 205382911.0, + "step": 73200 + }, + { + "epoch": 1.7790830319652435, + "grad_norm": 0.5409168004989624, + "learning_rate": 8.504058932941605e-05, + "loss": 0.3031, + "mean_token_accuracy": 0.9254523950815201, + "num_tokens": 205646639.0, + "step": 73300 + }, + { + "epoch": 1.7815101575204486, + "grad_norm": 0.620744526386261, + "learning_rate": 8.476141652461421e-05, + "loss": 0.3098, + "mean_token_accuracy": 0.9240190213918686, + "num_tokens": 205915824.0, + "step": 73400 + }, + { + "epoch": 1.7839372830756535, + "grad_norm": 0.5530264377593994, + "learning_rate": 8.448236525624331e-05, + "loss": 0.3359, + "mean_token_accuracy": 0.9183445227146149, + "num_tokens": 206207345.0, + "step": 73500 + }, + { + "epoch": 1.7863644086308583, + "grad_norm": 0.4213729500770569, + "learning_rate": 8.420343774989691e-05, + "loss": 0.3388, + "mean_token_accuracy": 0.9183846575021744, + "num_tokens": 206493187.0, + "step": 73600 + }, + { + "epoch": 1.7887915341860634, + "grad_norm": 0.47710779309272766, + "learning_rate": 8.392463623018172e-05, + "loss": 0.3611, + "mean_token_accuracy": 0.9139343810081482, + "num_tokens": 206777359.0, + "step": 73700 + }, + { + "epoch": 1.7912186597412685, + "grad_norm": 0.9268105626106262, + "learning_rate": 8.364596292069953e-05, + "loss": 0.3057, + "mean_token_accuracy": 0.9253049170970917, + "num_tokens": 207060316.0, + "step": 73800 + }, + { + "epoch": 1.7936457852964733, + "grad_norm": 0.6438888907432556, + "learning_rate": 8.336742004402954e-05, + "loss": 0.2599, + "mean_token_accuracy": 0.9355102443695068, + "num_tokens": 207333293.0, + "step": 73900 + }, + { + "epoch": 1.7960729108516782, + "grad_norm": 0.4977918565273285, + "learning_rate": 8.308900982171075e-05, + "loss": 0.2811, + "mean_token_accuracy": 0.9305684345960618, + "num_tokens": 207603072.0, + "step": 74000 + }, + { + "epoch": 1.7985000364068833, + "grad_norm": 0.4704795479774475, + "learning_rate": 8.28107344742242e-05, + "loss": 0.2805, + "mean_token_accuracy": 0.9318266582489013, + "num_tokens": 207883751.0, + "step": 74100 + }, + { + "epoch": 1.8009271619620884, + "grad_norm": 0.6442781686782837, + "learning_rate": 8.253259622097505e-05, + "loss": 0.3198, + "mean_token_accuracy": 0.9221637785434723, + "num_tokens": 208156078.0, + "step": 74200 + }, + { + "epoch": 1.8033542875172932, + "grad_norm": 0.6045205593109131, + "learning_rate": 8.225459728027522e-05, + "loss": 0.3355, + "mean_token_accuracy": 0.9196035987138749, + "num_tokens": 208438407.0, + "step": 74300 + }, + { + "epoch": 1.8057814130724983, + "grad_norm": 1.1112462282180786, + "learning_rate": 8.197673986932553e-05, + "loss": 0.3074, + "mean_token_accuracy": 0.9248217070102691, + "num_tokens": 208717797.0, + "step": 74400 + }, + { + "epoch": 1.8082085386277034, + "grad_norm": 1.077713966369629, + "learning_rate": 8.169902620419789e-05, + "loss": 0.2507, + "mean_token_accuracy": 0.9370506715774536, + "num_tokens": 209002743.0, + "step": 74500 + }, + { + "epoch": 1.8106356641829082, + "grad_norm": 0.744132399559021, + "learning_rate": 8.142145849981788e-05, + "loss": 0.2938, + "mean_token_accuracy": 0.9279677730798721, + "num_tokens": 209279733.0, + "step": 74600 + }, + { + "epoch": 1.813062789738113, + "grad_norm": 0.7917492985725403, + "learning_rate": 8.114403896994692e-05, + "loss": 0.3001, + "mean_token_accuracy": 0.9269537425041199, + "num_tokens": 209569692.0, + "step": 74700 + }, + { + "epoch": 1.8154899152933182, + "grad_norm": 0.5599649548530579, + "learning_rate": 8.086676982716463e-05, + "loss": 0.2734, + "mean_token_accuracy": 0.9324289971590042, + "num_tokens": 209851240.0, + "step": 74800 + }, + { + "epoch": 1.8179170408485232, + "grad_norm": 0.7313913106918335, + "learning_rate": 8.058965328285122e-05, + "loss": 0.2901, + "mean_token_accuracy": 0.9296905469894409, + "num_tokens": 210122572.0, + "step": 74900 + }, + { + "epoch": 1.820344166403728, + "grad_norm": 0.6392823457717896, + "learning_rate": 8.03126915471699e-05, + "loss": 0.2558, + "mean_token_accuracy": 0.9362455844879151, + "num_tokens": 210389408.0, + "step": 75000 + }, + { + "epoch": 1.822771291958933, + "grad_norm": 0.6192830204963684, + "learning_rate": 8.003588682904904e-05, + "loss": 0.2875, + "mean_token_accuracy": 0.9278698348999024, + "num_tokens": 210649018.0, + "step": 75100 + }, + { + "epoch": 1.825198417514138, + "grad_norm": 0.876402735710144, + "learning_rate": 7.975924133616489e-05, + "loss": 0.3214, + "mean_token_accuracy": 0.9232221174240113, + "num_tokens": 210920552.0, + "step": 75200 + }, + { + "epoch": 1.827625543069343, + "grad_norm": 0.4038306772708893, + "learning_rate": 7.948275727492365e-05, + "loss": 0.2791, + "mean_token_accuracy": 0.9318163025379181, + "num_tokens": 211213728.0, + "step": 75300 + }, + { + "epoch": 1.830052668624548, + "grad_norm": 0.869914174079895, + "learning_rate": 7.920643685044407e-05, + "loss": 0.2907, + "mean_token_accuracy": 0.9293239808082581, + "num_tokens": 211497350.0, + "step": 75400 + }, + { + "epoch": 1.8324797941797528, + "grad_norm": 0.5940338969230652, + "learning_rate": 7.893028226653982e-05, + "loss": 0.3187, + "mean_token_accuracy": 0.9223272824287414, + "num_tokens": 211784740.0, + "step": 75500 + }, + { + "epoch": 1.834906919734958, + "grad_norm": 0.842191755771637, + "learning_rate": 7.865429572570187e-05, + "loss": 0.2469, + "mean_token_accuracy": 0.9377169865369797, + "num_tokens": 212067327.0, + "step": 75600 + }, + { + "epoch": 1.837334045290163, + "grad_norm": 0.5681836605072021, + "learning_rate": 7.83784794290809e-05, + "loss": 0.2841, + "mean_token_accuracy": 0.9311885517835617, + "num_tokens": 212340490.0, + "step": 75700 + }, + { + "epoch": 1.8397611708453678, + "grad_norm": 0.6963329911231995, + "learning_rate": 7.810283557646988e-05, + "loss": 0.303, + "mean_token_accuracy": 0.926787257194519, + "num_tokens": 212630815.0, + "step": 75800 + }, + { + "epoch": 1.8421882964005727, + "grad_norm": 0.8300701379776001, + "learning_rate": 7.782736636628641e-05, + "loss": 0.3003, + "mean_token_accuracy": 0.9274958628416061, + "num_tokens": 212900625.0, + "step": 75900 + }, + { + "epoch": 1.8446154219557778, + "grad_norm": 0.6039229035377502, + "learning_rate": 7.75520739955552e-05, + "loss": 0.2644, + "mean_token_accuracy": 0.9343709653615951, + "num_tokens": 213192202.0, + "step": 76000 + }, + { + "epoch": 1.8470425475109828, + "grad_norm": 0.7922122478485107, + "learning_rate": 7.727696065989057e-05, + "loss": 0.2747, + "mean_token_accuracy": 0.9328130561113358, + "num_tokens": 213471876.0, + "step": 76100 + }, + { + "epoch": 1.8494696730661877, + "grad_norm": 0.9440976977348328, + "learning_rate": 7.700202855347895e-05, + "loss": 0.2602, + "mean_token_accuracy": 0.9356586462259293, + "num_tokens": 213748520.0, + "step": 76200 + }, + { + "epoch": 1.8518967986213926, + "grad_norm": 0.47323721647262573, + "learning_rate": 7.67272798690614e-05, + "loss": 0.3049, + "mean_token_accuracy": 0.9259419536590576, + "num_tokens": 214033390.0, + "step": 76300 + }, + { + "epoch": 1.8543239241765976, + "grad_norm": 0.5896041989326477, + "learning_rate": 7.645271679791594e-05, + "loss": 0.2809, + "mean_token_accuracy": 0.931108033657074, + "num_tokens": 214315261.0, + "step": 76400 + }, + { + "epoch": 1.8567510497318027, + "grad_norm": 0.6122065186500549, + "learning_rate": 7.617834152984035e-05, + "loss": 0.265, + "mean_token_accuracy": 0.9348021644353867, + "num_tokens": 214612081.0, + "step": 76500 + }, + { + "epoch": 1.8591781752870076, + "grad_norm": 0.4658614993095398, + "learning_rate": 7.590415625313459e-05, + "loss": 0.2978, + "mean_token_accuracy": 0.9276400768756866, + "num_tokens": 214894960.0, + "step": 76600 + }, + { + "epoch": 1.8616053008422124, + "grad_norm": 1.103578805923462, + "learning_rate": 7.56301631545832e-05, + "loss": 0.3092, + "mean_token_accuracy": 0.9259757554531097, + "num_tokens": 215168979.0, + "step": 76700 + }, + { + "epoch": 1.8640324263974175, + "grad_norm": 0.6536977291107178, + "learning_rate": 7.53563644194381e-05, + "loss": 0.2677, + "mean_token_accuracy": 0.9338239932060242, + "num_tokens": 215444157.0, + "step": 76800 + }, + { + "epoch": 1.8664595519526226, + "grad_norm": 0.6290289759635925, + "learning_rate": 7.508276223140105e-05, + "loss": 0.2939, + "mean_token_accuracy": 0.9283382612466812, + "num_tokens": 215724557.0, + "step": 76900 + }, + { + "epoch": 1.8688866775078274, + "grad_norm": 0.5545627474784851, + "learning_rate": 7.480935877260616e-05, + "loss": 0.3151, + "mean_token_accuracy": 0.9245033538341523, + "num_tokens": 216001207.0, + "step": 77000 + }, + { + "epoch": 1.8713138030630323, + "grad_norm": 0.5959055423736572, + "learning_rate": 7.45361562236026e-05, + "loss": 0.2919, + "mean_token_accuracy": 0.9289400810003281, + "num_tokens": 216282783.0, + "step": 77100 + }, + { + "epoch": 1.8737409286182374, + "grad_norm": 0.6361558437347412, + "learning_rate": 7.426315676333719e-05, + "loss": 0.2703, + "mean_token_accuracy": 0.933252164721489, + "num_tokens": 216570897.0, + "step": 77200 + }, + { + "epoch": 1.8761680541734425, + "grad_norm": 0.48569005727767944, + "learning_rate": 7.399036256913696e-05, + "loss": 0.2841, + "mean_token_accuracy": 0.9301125872135162, + "num_tokens": 216850770.0, + "step": 77300 + }, + { + "epoch": 1.8785951797286473, + "grad_norm": 0.7510759830474854, + "learning_rate": 7.371777581669185e-05, + "loss": 0.2619, + "mean_token_accuracy": 0.9353511029481888, + "num_tokens": 217138106.0, + "step": 77400 + }, + { + "epoch": 1.8810223052838522, + "grad_norm": 0.5605314373970032, + "learning_rate": 7.34453986800374e-05, + "loss": 0.2599, + "mean_token_accuracy": 0.9363309639692307, + "num_tokens": 217410591.0, + "step": 77500 + }, + { + "epoch": 1.8834494308390575, + "grad_norm": 0.4645470082759857, + "learning_rate": 7.317323333153717e-05, + "loss": 0.299, + "mean_token_accuracy": 0.9282932883501053, + "num_tokens": 217682755.0, + "step": 77600 + }, + { + "epoch": 1.8858765563942623, + "grad_norm": 0.41098716855049133, + "learning_rate": 7.290128194186574e-05, + "loss": 0.2657, + "mean_token_accuracy": 0.9342436867952347, + "num_tokens": 217955304.0, + "step": 77700 + }, + { + "epoch": 1.8883036819494672, + "grad_norm": 0.7157009243965149, + "learning_rate": 7.262954667999125e-05, + "loss": 0.2413, + "mean_token_accuracy": 0.9404269510507584, + "num_tokens": 218229072.0, + "step": 77800 + }, + { + "epoch": 1.8907308075046723, + "grad_norm": 0.5151275396347046, + "learning_rate": 7.235802971315793e-05, + "loss": 0.2916, + "mean_token_accuracy": 0.9299118751287461, + "num_tokens": 218512265.0, + "step": 77900 + }, + { + "epoch": 1.8931579330598773, + "grad_norm": 0.6814290285110474, + "learning_rate": 7.208673320686916e-05, + "loss": 0.2557, + "mean_token_accuracy": 0.9371042233705521, + "num_tokens": 218815553.0, + "step": 78000 + }, + { + "epoch": 1.8955850586150822, + "grad_norm": 0.4364314377307892, + "learning_rate": 7.181565932487e-05, + "loss": 0.2683, + "mean_token_accuracy": 0.9351461017131806, + "num_tokens": 219109518.0, + "step": 78100 + }, + { + "epoch": 1.898012184170287, + "grad_norm": 0.9119032621383667, + "learning_rate": 7.154481022912985e-05, + "loss": 0.2591, + "mean_token_accuracy": 0.9369248080253602, + "num_tokens": 219400520.0, + "step": 78200 + }, + { + "epoch": 1.9004393097254921, + "grad_norm": 0.5391600131988525, + "learning_rate": 7.127418807982538e-05, + "loss": 0.2814, + "mean_token_accuracy": 0.9312341713905334, + "num_tokens": 219686579.0, + "step": 78300 + }, + { + "epoch": 1.9028664352806972, + "grad_norm": 0.9579749703407288, + "learning_rate": 7.10037950353233e-05, + "loss": 0.2771, + "mean_token_accuracy": 0.9320234954357147, + "num_tokens": 219972043.0, + "step": 78400 + }, + { + "epoch": 1.905293560835902, + "grad_norm": 0.6177464127540588, + "learning_rate": 7.07336332521629e-05, + "loss": 0.2881, + "mean_token_accuracy": 0.9298322373628616, + "num_tokens": 220247450.0, + "step": 78500 + }, + { + "epoch": 1.907720686391107, + "grad_norm": 0.4801132380962372, + "learning_rate": 7.046370488503915e-05, + "loss": 0.256, + "mean_token_accuracy": 0.9365988206863404, + "num_tokens": 220516332.0, + "step": 78600 + }, + { + "epoch": 1.910147811946312, + "grad_norm": 0.7056084871292114, + "learning_rate": 7.019401208678544e-05, + "loss": 0.2763, + "mean_token_accuracy": 0.9320674824714661, + "num_tokens": 220801836.0, + "step": 78700 + }, + { + "epoch": 1.912574937501517, + "grad_norm": 0.5265929698944092, + "learning_rate": 6.992455700835624e-05, + "loss": 0.3129, + "mean_token_accuracy": 0.9254920214414597, + "num_tokens": 221082541.0, + "step": 78800 + }, + { + "epoch": 1.915002063056722, + "grad_norm": 0.6001086831092834, + "learning_rate": 6.965534179881014e-05, + "loss": 0.2677, + "mean_token_accuracy": 0.9343756914138794, + "num_tokens": 221363824.0, + "step": 78900 + }, + { + "epoch": 1.9174291886119268, + "grad_norm": 0.5919141173362732, + "learning_rate": 6.938636860529268e-05, + "loss": 0.2368, + "mean_token_accuracy": 0.940570114850998, + "num_tokens": 221642721.0, + "step": 79000 + }, + { + "epoch": 1.9198563141671319, + "grad_norm": 0.5845506191253662, + "learning_rate": 6.911763957301904e-05, + "loss": 0.2563, + "mean_token_accuracy": 0.9364815109968185, + "num_tokens": 221918380.0, + "step": 79100 + }, + { + "epoch": 1.922283439722337, + "grad_norm": 0.9840309619903564, + "learning_rate": 6.884915684525719e-05, + "loss": 0.2811, + "mean_token_accuracy": 0.9315145230293274, + "num_tokens": 222203368.0, + "step": 79200 + }, + { + "epoch": 1.9247105652775418, + "grad_norm": 0.6356433033943176, + "learning_rate": 6.85809225633107e-05, + "loss": 0.2738, + "mean_token_accuracy": 0.9337194603681565, + "num_tokens": 222472383.0, + "step": 79300 + }, + { + "epoch": 1.9271376908327467, + "grad_norm": 0.5394325256347656, + "learning_rate": 6.831293886650154e-05, + "loss": 0.2904, + "mean_token_accuracy": 0.9293972563743591, + "num_tokens": 222746002.0, + "step": 79400 + }, + { + "epoch": 1.9295648163879517, + "grad_norm": 0.6073657870292664, + "learning_rate": 6.804520789215319e-05, + "loss": 0.2376, + "mean_token_accuracy": 0.9415085107088089, + "num_tokens": 223036552.0, + "step": 79500 + }, + { + "epoch": 1.9319919419431568, + "grad_norm": 0.4766857326030731, + "learning_rate": 6.777773177557357e-05, + "loss": 0.2365, + "mean_token_accuracy": 0.9424629133939743, + "num_tokens": 223326315.0, + "step": 79600 + }, + { + "epoch": 1.9344190674983617, + "grad_norm": 0.8072234392166138, + "learning_rate": 6.751051265003779e-05, + "loss": 0.2577, + "mean_token_accuracy": 0.9369598460197449, + "num_tokens": 223600784.0, + "step": 79700 + }, + { + "epoch": 1.9368461930535665, + "grad_norm": 0.5053612589836121, + "learning_rate": 6.724355264677148e-05, + "loss": 0.243, + "mean_token_accuracy": 0.9401568102836609, + "num_tokens": 223895731.0, + "step": 79800 + }, + { + "epoch": 1.9392733186087716, + "grad_norm": 0.8727260828018188, + "learning_rate": 6.69768538949336e-05, + "loss": 0.2488, + "mean_token_accuracy": 0.9389541667699813, + "num_tokens": 224176472.0, + "step": 79900 + }, + { + "epoch": 1.9417004441639767, + "grad_norm": 0.8288789987564087, + "learning_rate": 6.671041852159937e-05, + "loss": 0.2571, + "mean_token_accuracy": 0.936657503247261, + "num_tokens": 224447801.0, + "step": 80000 + }, + { + "epoch": 1.9441275697191815, + "grad_norm": 0.3764743506908417, + "learning_rate": 6.644424865174354e-05, + "loss": 0.2589, + "mean_token_accuracy": 0.9362627816200256, + "num_tokens": 224750029.0, + "step": 80100 + }, + { + "epoch": 1.9465546952743864, + "grad_norm": 0.5590245723724365, + "learning_rate": 6.617834640822325e-05, + "loss": 0.2438, + "mean_token_accuracy": 0.9403492480516433, + "num_tokens": 225021335.0, + "step": 80200 + }, + { + "epoch": 1.9489818208295915, + "grad_norm": 0.5417003035545349, + "learning_rate": 6.591271391176121e-05, + "loss": 0.2651, + "mean_token_accuracy": 0.935703609585762, + "num_tokens": 225307283.0, + "step": 80300 + }, + { + "epoch": 1.9514089463847966, + "grad_norm": 0.5252110958099365, + "learning_rate": 6.564735328092868e-05, + "loss": 0.2718, + "mean_token_accuracy": 0.9335632401704789, + "num_tokens": 225605260.0, + "step": 80400 + }, + { + "epoch": 1.9538360719400014, + "grad_norm": 0.7686030268669128, + "learning_rate": 6.538226663212868e-05, + "loss": 0.2416, + "mean_token_accuracy": 0.9402110683917999, + "num_tokens": 225875026.0, + "step": 80500 + }, + { + "epoch": 1.9562631974952063, + "grad_norm": 0.33636564016342163, + "learning_rate": 6.511745607957906e-05, + "loss": 0.2678, + "mean_token_accuracy": 0.93425896525383, + "num_tokens": 226153286.0, + "step": 80600 + }, + { + "epoch": 1.9586903230504114, + "grad_norm": 0.8506200313568115, + "learning_rate": 6.485292373529565e-05, + "loss": 0.2372, + "mean_token_accuracy": 0.9409923559427261, + "num_tokens": 226438345.0, + "step": 80700 + }, + { + "epoch": 1.9611174486056164, + "grad_norm": 0.4994686543941498, + "learning_rate": 6.458867170907539e-05, + "loss": 0.3036, + "mean_token_accuracy": 0.9279145467281341, + "num_tokens": 226724818.0, + "step": 80800 + }, + { + "epoch": 1.9635445741608213, + "grad_norm": 0.3879760205745697, + "learning_rate": 6.432470210847958e-05, + "loss": 0.304, + "mean_token_accuracy": 0.9270832985639572, + "num_tokens": 227002630.0, + "step": 80900 + }, + { + "epoch": 1.9659716997160261, + "grad_norm": 0.5716784000396729, + "learning_rate": 6.40610170388169e-05, + "loss": 0.2548, + "mean_token_accuracy": 0.9366704255342484, + "num_tokens": 227297146.0, + "step": 81000 + }, + { + "epoch": 1.9683988252712314, + "grad_norm": 0.7849399447441101, + "learning_rate": 6.379761860312682e-05, + "loss": 0.2394, + "mean_token_accuracy": 0.9409961378574372, + "num_tokens": 227576379.0, + "step": 81100 + }, + { + "epoch": 1.9708259508264363, + "grad_norm": 0.5937269330024719, + "learning_rate": 6.353450890216279e-05, + "loss": 0.2384, + "mean_token_accuracy": 0.9405011647939682, + "num_tokens": 227839735.0, + "step": 81200 + }, + { + "epoch": 1.9732530763816412, + "grad_norm": 1.1839408874511719, + "learning_rate": 6.327169003437527e-05, + "loss": 0.2298, + "mean_token_accuracy": 0.9422443634271622, + "num_tokens": 228113960.0, + "step": 81300 + }, + { + "epoch": 1.9756802019368462, + "grad_norm": 0.4460374712944031, + "learning_rate": 6.300916409589531e-05, + "loss": 0.2479, + "mean_token_accuracy": 0.9397810900211334, + "num_tokens": 228395773.0, + "step": 81400 + }, + { + "epoch": 1.9781073274920513, + "grad_norm": 0.5764565467834473, + "learning_rate": 6.274693318051765e-05, + "loss": 0.2468, + "mean_token_accuracy": 0.9393291866779327, + "num_tokens": 228674065.0, + "step": 81500 + }, + { + "epoch": 1.9805344530472562, + "grad_norm": 0.5391059517860413, + "learning_rate": 6.248499937968397e-05, + "loss": 0.2694, + "mean_token_accuracy": 0.9344400101900101, + "num_tokens": 228950893.0, + "step": 81600 + }, + { + "epoch": 1.982961578602461, + "grad_norm": 0.44228363037109375, + "learning_rate": 6.222336478246636e-05, + "loss": 0.2147, + "mean_token_accuracy": 0.9461823815107345, + "num_tokens": 229227673.0, + "step": 81700 + }, + { + "epoch": 1.985388704157666, + "grad_norm": 0.6836258172988892, + "learning_rate": 6.196203147555066e-05, + "loss": 0.2465, + "mean_token_accuracy": 0.9392766255140305, + "num_tokens": 229499479.0, + "step": 81800 + }, + { + "epoch": 1.9878158297128712, + "grad_norm": 0.8605006337165833, + "learning_rate": 6.170100154321954e-05, + "loss": 0.2592, + "mean_token_accuracy": 0.9367054688930512, + "num_tokens": 229769342.0, + "step": 81900 + }, + { + "epoch": 1.990242955268076, + "grad_norm": 0.7405674457550049, + "learning_rate": 6.144027706733629e-05, + "loss": 0.2554, + "mean_token_accuracy": 0.9385197955369949, + "num_tokens": 230033787.0, + "step": 82000 + }, + { + "epoch": 1.992670080823281, + "grad_norm": 0.5927315950393677, + "learning_rate": 6.117986012732793e-05, + "loss": 0.2853, + "mean_token_accuracy": 0.929689479470253, + "num_tokens": 230313605.0, + "step": 82100 + }, + { + "epoch": 1.995097206378486, + "grad_norm": 0.5205606818199158, + "learning_rate": 6.091975280016868e-05, + "loss": 0.259, + "mean_token_accuracy": 0.9373654896020889, + "num_tokens": 230603964.0, + "step": 82200 + }, + { + "epoch": 1.997524331933691, + "grad_norm": 0.9966670870780945, + "learning_rate": 6.065995716036346e-05, + "loss": 0.2725, + "mean_token_accuracy": 0.9339149141311646, + "num_tokens": 230887154.0, + "step": 82300 + }, + { + "epoch": 1.999951457488896, + "grad_norm": 0.8154625296592712, + "learning_rate": 6.0400475279931315e-05, + "loss": 0.2743, + "mean_token_accuracy": 0.9331579411029816, + "num_tokens": 231184353.0, + "step": 82400 + }, + { + "epoch": 2.0023785830441008, + "grad_norm": 0.5808880925178528, + "learning_rate": 6.014130922838881e-05, + "loss": 0.1519, + "mean_token_accuracy": 0.9608761477470398, + "num_tokens": 231479894.0, + "step": 82500 + }, + { + "epoch": 2.0048057085993056, + "grad_norm": 0.4847000241279602, + "learning_rate": 5.988246107273365e-05, + "loss": 0.1805, + "mean_token_accuracy": 0.9541962110996246, + "num_tokens": 231784377.0, + "step": 82600 + }, + { + "epoch": 2.007232834154511, + "grad_norm": 0.5827914476394653, + "learning_rate": 5.962393287742816e-05, + "loss": 0.1396, + "mean_token_accuracy": 0.9642374736070632, + "num_tokens": 232063461.0, + "step": 82700 + }, + { + "epoch": 2.009659959709716, + "grad_norm": 0.6424087882041931, + "learning_rate": 5.936572670438273e-05, + "loss": 0.1362, + "mean_token_accuracy": 0.9648731410503387, + "num_tokens": 232342507.0, + "step": 82800 + }, + { + "epoch": 2.0120870852649206, + "grad_norm": 0.6409827470779419, + "learning_rate": 5.910784461293948e-05, + "loss": 0.1643, + "mean_token_accuracy": 0.9581991600990295, + "num_tokens": 232625967.0, + "step": 82900 + }, + { + "epoch": 2.014514210820126, + "grad_norm": 0.5854642987251282, + "learning_rate": 5.8850288659855846e-05, + "loss": 0.1559, + "mean_token_accuracy": 0.9603128689527511, + "num_tokens": 232900644.0, + "step": 83000 + }, + { + "epoch": 2.016941336375331, + "grad_norm": 0.521918535232544, + "learning_rate": 5.8593060899287965e-05, + "loss": 0.1492, + "mean_token_accuracy": 0.9615022081136704, + "num_tokens": 233181484.0, + "step": 83100 + }, + { + "epoch": 2.0193684619305357, + "grad_norm": 0.7428982257843018, + "learning_rate": 5.8336163382774586e-05, + "loss": 0.1688, + "mean_token_accuracy": 0.9571781551837921, + "num_tokens": 233459275.0, + "step": 83200 + }, + { + "epoch": 2.0217955874857405, + "grad_norm": 0.44911694526672363, + "learning_rate": 5.8079598159220625e-05, + "loss": 0.1464, + "mean_token_accuracy": 0.9628020924329758, + "num_tokens": 233730444.0, + "step": 83300 + }, + { + "epoch": 2.024222713040946, + "grad_norm": 0.4201023280620575, + "learning_rate": 5.782336727488054e-05, + "loss": 0.1616, + "mean_token_accuracy": 0.9586890661716461, + "num_tokens": 234007650.0, + "step": 83400 + }, + { + "epoch": 2.0266498385961507, + "grad_norm": 0.5022528171539307, + "learning_rate": 5.756747277334249e-05, + "loss": 0.1889, + "mean_token_accuracy": 0.9528667098283767, + "num_tokens": 234297366.0, + "step": 83500 + }, + { + "epoch": 2.0290769641513555, + "grad_norm": 0.29196932911872864, + "learning_rate": 5.731191669551165e-05, + "loss": 0.152, + "mean_token_accuracy": 0.9611919575929642, + "num_tokens": 234582826.0, + "step": 83600 + }, + { + "epoch": 2.0315040897065604, + "grad_norm": 0.2950987219810486, + "learning_rate": 5.705670107959407e-05, + "loss": 0.1749, + "mean_token_accuracy": 0.9554368454217911, + "num_tokens": 234863426.0, + "step": 83700 + }, + { + "epoch": 2.0339312152617657, + "grad_norm": 0.49717605113983154, + "learning_rate": 5.6801827961080536e-05, + "loss": 0.16, + "mean_token_accuracy": 0.9598457747697831, + "num_tokens": 235146656.0, + "step": 83800 + }, + { + "epoch": 2.0363583408169705, + "grad_norm": 0.4079115092754364, + "learning_rate": 5.654729937273009e-05, + "loss": 0.1395, + "mean_token_accuracy": 0.9637018305063247, + "num_tokens": 235426935.0, + "step": 83900 + }, + { + "epoch": 2.0387854663721754, + "grad_norm": 2.313652515411377, + "learning_rate": 5.629311734455403e-05, + "loss": 0.1656, + "mean_token_accuracy": 0.9576411491632462, + "num_tokens": 235714856.0, + "step": 84000 + }, + { + "epoch": 2.0412125919273802, + "grad_norm": 0.5406439900398254, + "learning_rate": 5.6039283903799534e-05, + "loss": 0.173, + "mean_token_accuracy": 0.9562010073661804, + "num_tokens": 235984367.0, + "step": 84100 + }, + { + "epoch": 2.0436397174825855, + "grad_norm": 0.360782653093338, + "learning_rate": 5.5785801074933786e-05, + "loss": 0.1532, + "mean_token_accuracy": 0.9607203018665313, + "num_tokens": 236262299.0, + "step": 84200 + }, + { + "epoch": 2.0460668430377904, + "grad_norm": 0.5406671762466431, + "learning_rate": 5.5532670879627455e-05, + "loss": 0.1522, + "mean_token_accuracy": 0.9607596349716186, + "num_tokens": 236550358.0, + "step": 84300 + }, + { + "epoch": 2.0484939685929953, + "grad_norm": 0.4788415729999542, + "learning_rate": 5.527989533673881e-05, + "loss": 0.1448, + "mean_token_accuracy": 0.9628503209352494, + "num_tokens": 236822868.0, + "step": 84400 + }, + { + "epoch": 2.0509210941482, + "grad_norm": 0.5984004139900208, + "learning_rate": 5.5027476462297635e-05, + "loss": 0.1625, + "mean_token_accuracy": 0.9581832873821259, + "num_tokens": 237108453.0, + "step": 84500 + }, + { + "epoch": 2.0533482197034054, + "grad_norm": 0.41232043504714966, + "learning_rate": 5.4775416269489e-05, + "loss": 0.1541, + "mean_token_accuracy": 0.9605655258893967, + "num_tokens": 237386750.0, + "step": 84600 + }, + { + "epoch": 2.0557753452586103, + "grad_norm": 0.5192024111747742, + "learning_rate": 5.45237167686373e-05, + "loss": 0.1611, + "mean_token_accuracy": 0.9575982117652893, + "num_tokens": 237657392.0, + "step": 84700 + }, + { + "epoch": 2.058202470813815, + "grad_norm": 0.4785330593585968, + "learning_rate": 5.427237996719017e-05, + "loss": 0.1511, + "mean_token_accuracy": 0.9610388678312302, + "num_tokens": 237928389.0, + "step": 84800 + }, + { + "epoch": 2.06062959636902, + "grad_norm": 0.4883587956428528, + "learning_rate": 5.402140786970264e-05, + "loss": 0.1323, + "mean_token_accuracy": 0.9661263453960419, + "num_tokens": 238194140.0, + "step": 84900 + }, + { + "epoch": 2.0630567219242253, + "grad_norm": 0.3393934965133667, + "learning_rate": 5.377080247782088e-05, + "loss": 0.15, + "mean_token_accuracy": 0.9611387658119201, + "num_tokens": 238474238.0, + "step": 85000 + }, + { + "epoch": 2.06548384747943, + "grad_norm": 0.40510058403015137, + "learning_rate": 5.3520565790266386e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9601859641075134, + "num_tokens": 238755497.0, + "step": 85100 + }, + { + "epoch": 2.067910973034635, + "grad_norm": 0.4711453318595886, + "learning_rate": 5.327069980282017e-05, + "loss": 0.1559, + "mean_token_accuracy": 0.9597002917528152, + "num_tokens": 239032904.0, + "step": 85200 + }, + { + "epoch": 2.07033809858984, + "grad_norm": 0.20721179246902466, + "learning_rate": 5.302120650830643e-05, + "loss": 0.1568, + "mean_token_accuracy": 0.9598673725128174, + "num_tokens": 239324806.0, + "step": 85300 + }, + { + "epoch": 2.072765224145045, + "grad_norm": 0.6029618978500366, + "learning_rate": 5.277208789657722e-05, + "loss": 0.1616, + "mean_token_accuracy": 0.959028176665306, + "num_tokens": 239598686.0, + "step": 85400 + }, + { + "epoch": 2.07519234970025, + "grad_norm": 0.8069055080413818, + "learning_rate": 5.252334595449608e-05, + "loss": 0.1614, + "mean_token_accuracy": 0.9587891620397567, + "num_tokens": 239875219.0, + "step": 85500 + }, + { + "epoch": 2.077619475255455, + "grad_norm": 0.576477587223053, + "learning_rate": 5.227498266592249e-05, + "loss": 0.1725, + "mean_token_accuracy": 0.9557231628894806, + "num_tokens": 240146288.0, + "step": 85600 + }, + { + "epoch": 2.0800466008106597, + "grad_norm": 0.8168060183525085, + "learning_rate": 5.202700001169587e-05, + "loss": 0.1486, + "mean_token_accuracy": 0.9620643395185471, + "num_tokens": 240435471.0, + "step": 85700 + }, + { + "epoch": 2.082473726365865, + "grad_norm": 0.49945294857025146, + "learning_rate": 5.177939996961998e-05, + "loss": 0.1776, + "mean_token_accuracy": 0.95517926633358, + "num_tokens": 240731297.0, + "step": 85800 + }, + { + "epoch": 2.08490085192107, + "grad_norm": 0.30427655577659607, + "learning_rate": 5.1532184514446955e-05, + "loss": 0.1584, + "mean_token_accuracy": 0.9593313550949096, + "num_tokens": 241013465.0, + "step": 85900 + }, + { + "epoch": 2.0873279774762747, + "grad_norm": 0.7176592350006104, + "learning_rate": 5.128535561786159e-05, + "loss": 0.1782, + "mean_token_accuracy": 0.95425497174263, + "num_tokens": 241286724.0, + "step": 86000 + }, + { + "epoch": 2.0897551030314796, + "grad_norm": 0.7456308603286743, + "learning_rate": 5.10389152484658e-05, + "loss": 0.1433, + "mean_token_accuracy": 0.9630332779884339, + "num_tokens": 241565348.0, + "step": 86100 + }, + { + "epoch": 2.092182228586685, + "grad_norm": 0.3332330882549286, + "learning_rate": 5.0792865371762534e-05, + "loss": 0.178, + "mean_token_accuracy": 0.9545424026250839, + "num_tokens": 241842815.0, + "step": 86200 + }, + { + "epoch": 2.0946093541418898, + "grad_norm": 0.21731993556022644, + "learning_rate": 5.054720795014061e-05, + "loss": 0.1677, + "mean_token_accuracy": 0.9578067481517791, + "num_tokens": 242130092.0, + "step": 86300 + }, + { + "epoch": 2.0970364796970946, + "grad_norm": 0.5908037424087524, + "learning_rate": 5.030194494285859e-05, + "loss": 0.1718, + "mean_token_accuracy": 0.9564685708284378, + "num_tokens": 242405499.0, + "step": 86400 + }, + { + "epoch": 2.0994636052523, + "grad_norm": 0.3582291007041931, + "learning_rate": 5.005707830602942e-05, + "loss": 0.145, + "mean_token_accuracy": 0.9633462530374527, + "num_tokens": 242688957.0, + "step": 86500 + }, + { + "epoch": 2.1018907308075048, + "grad_norm": 0.630010724067688, + "learning_rate": 4.981260999260484e-05, + "loss": 0.137, + "mean_token_accuracy": 0.96411676466465, + "num_tokens": 242966623.0, + "step": 86600 + }, + { + "epoch": 2.1043178563627096, + "grad_norm": 0.2612079977989197, + "learning_rate": 4.956854195235963e-05, + "loss": 0.1725, + "mean_token_accuracy": 0.9565469098091125, + "num_tokens": 243227363.0, + "step": 86700 + }, + { + "epoch": 2.1067449819179145, + "grad_norm": 0.44951051473617554, + "learning_rate": 4.9324876131876186e-05, + "loss": 0.1574, + "mean_token_accuracy": 0.9592509204149247, + "num_tokens": 243509573.0, + "step": 86800 + }, + { + "epoch": 2.10917210747312, + "grad_norm": 0.601586103439331, + "learning_rate": 4.9081614474528944e-05, + "loss": 0.1843, + "mean_token_accuracy": 0.9530947256088257, + "num_tokens": 243780742.0, + "step": 86900 + }, + { + "epoch": 2.1115992330283246, + "grad_norm": 0.3754229247570038, + "learning_rate": 4.883875892046904e-05, + "loss": 0.159, + "mean_token_accuracy": 0.9592651635408401, + "num_tokens": 244048761.0, + "step": 87000 + }, + { + "epoch": 2.1140263585835295, + "grad_norm": 0.5469896793365479, + "learning_rate": 4.8596311406608555e-05, + "loss": 0.1515, + "mean_token_accuracy": 0.9612551802396774, + "num_tokens": 244323569.0, + "step": 87100 + }, + { + "epoch": 2.1164534841387344, + "grad_norm": 0.33415502309799194, + "learning_rate": 4.8354273866605234e-05, + "loss": 0.1572, + "mean_token_accuracy": 0.9593722647428513, + "num_tokens": 244613096.0, + "step": 87200 + }, + { + "epoch": 2.1188806096939397, + "grad_norm": 0.38363969326019287, + "learning_rate": 4.8112648230847204e-05, + "loss": 0.1724, + "mean_token_accuracy": 0.9562730824947357, + "num_tokens": 244888619.0, + "step": 87300 + }, + { + "epoch": 2.1213077352491445, + "grad_norm": 0.3998253345489502, + "learning_rate": 4.787143642643715e-05, + "loss": 0.1561, + "mean_token_accuracy": 0.9594432032108307, + "num_tokens": 245179499.0, + "step": 87400 + }, + { + "epoch": 2.1237348608043494, + "grad_norm": 0.38273563981056213, + "learning_rate": 4.763064037717745e-05, + "loss": 0.1474, + "mean_token_accuracy": 0.962109939455986, + "num_tokens": 245467289.0, + "step": 87500 + }, + { + "epoch": 2.1261619863595542, + "grad_norm": 0.24416208267211914, + "learning_rate": 4.7390262003554484e-05, + "loss": 0.1514, + "mean_token_accuracy": 0.9603712677955627, + "num_tokens": 245758328.0, + "step": 87600 + }, + { + "epoch": 2.1285891119147595, + "grad_norm": 1.0365910530090332, + "learning_rate": 4.7150303222723434e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.960610026717186, + "num_tokens": 246051662.0, + "step": 87700 + }, + { + "epoch": 2.1310162374699644, + "grad_norm": 0.9765134453773499, + "learning_rate": 4.691076594849308e-05, + "loss": 0.1446, + "mean_token_accuracy": 0.9626650083065033, + "num_tokens": 246339457.0, + "step": 87800 + }, + { + "epoch": 2.1334433630251692, + "grad_norm": 0.6015363931655884, + "learning_rate": 4.6671652091310305e-05, + "loss": 0.179, + "mean_token_accuracy": 0.9543536853790283, + "num_tokens": 246621131.0, + "step": 87900 + }, + { + "epoch": 2.135870488580374, + "grad_norm": 0.3435966968536377, + "learning_rate": 4.64329635582451e-05, + "loss": 0.1693, + "mean_token_accuracy": 0.9579638803005218, + "num_tokens": 246905268.0, + "step": 88000 + }, + { + "epoch": 2.1382976141355794, + "grad_norm": 0.957961916923523, + "learning_rate": 4.6194702252975184e-05, + "loss": 0.1371, + "mean_token_accuracy": 0.9644148916006088, + "num_tokens": 247179877.0, + "step": 88100 + }, + { + "epoch": 2.1407247396907843, + "grad_norm": 0.5038435459136963, + "learning_rate": 4.5956870075771e-05, + "loss": 0.1409, + "mean_token_accuracy": 0.9635371136665344, + "num_tokens": 247460346.0, + "step": 88200 + }, + { + "epoch": 2.143151865245989, + "grad_norm": 0.6354673504829407, + "learning_rate": 4.571946892348032e-05, + "loss": 0.1741, + "mean_token_accuracy": 0.9555548268556595, + "num_tokens": 247731885.0, + "step": 88300 + }, + { + "epoch": 2.145578990801194, + "grad_norm": 0.7258239984512329, + "learning_rate": 4.54825006895133e-05, + "loss": 0.1639, + "mean_token_accuracy": 0.9581418389081955, + "num_tokens": 248011968.0, + "step": 88400 + }, + { + "epoch": 2.1480061163563993, + "grad_norm": 0.31706157326698303, + "learning_rate": 4.524596726382736e-05, + "loss": 0.1597, + "mean_token_accuracy": 0.9591193878650666, + "num_tokens": 248294086.0, + "step": 88500 + }, + { + "epoch": 2.150433241911604, + "grad_norm": 0.6476277709007263, + "learning_rate": 4.500987053291202e-05, + "loss": 0.1445, + "mean_token_accuracy": 0.9631332182884216, + "num_tokens": 248571328.0, + "step": 88600 + }, + { + "epoch": 2.152860367466809, + "grad_norm": 0.39604470133781433, + "learning_rate": 4.477421237977394e-05, + "loss": 0.1386, + "mean_token_accuracy": 0.9638280099630356, + "num_tokens": 248842735.0, + "step": 88700 + }, + { + "epoch": 2.155287493022014, + "grad_norm": 0.2958824634552002, + "learning_rate": 4.453899468392181e-05, + "loss": 0.1247, + "mean_token_accuracy": 0.9681485140323639, + "num_tokens": 249108092.0, + "step": 88800 + }, + { + "epoch": 2.157714618577219, + "grad_norm": 0.6001761555671692, + "learning_rate": 4.430421932135156e-05, + "loss": 0.1384, + "mean_token_accuracy": 0.9643767905235291, + "num_tokens": 249385884.0, + "step": 88900 + }, + { + "epoch": 2.160141744132424, + "grad_norm": 0.9776992201805115, + "learning_rate": 4.406988816453111e-05, + "loss": 0.1633, + "mean_token_accuracy": 0.9592347794771194, + "num_tokens": 249664562.0, + "step": 89000 + }, + { + "epoch": 2.162568869687629, + "grad_norm": 0.3612774908542633, + "learning_rate": 4.3836003082385636e-05, + "loss": 0.1489, + "mean_token_accuracy": 0.9616664987802506, + "num_tokens": 249940419.0, + "step": 89100 + }, + { + "epoch": 2.1649959952428337, + "grad_norm": 0.6092312335968018, + "learning_rate": 4.3602565940282715e-05, + "loss": 0.1508, + "mean_token_accuracy": 0.9607537633180618, + "num_tokens": 250215092.0, + "step": 89200 + }, + { + "epoch": 2.167423120798039, + "grad_norm": 0.46185943484306335, + "learning_rate": 4.3369578600017105e-05, + "loss": 0.1726, + "mean_token_accuracy": 0.9566762536764145, + "num_tokens": 250503318.0, + "step": 89300 + }, + { + "epoch": 2.169850246353244, + "grad_norm": 0.5038064122200012, + "learning_rate": 4.31370429197964e-05, + "loss": 0.1628, + "mean_token_accuracy": 0.958371422290802, + "num_tokens": 250783576.0, + "step": 89400 + }, + { + "epoch": 2.1722773719084487, + "grad_norm": 0.3340010941028595, + "learning_rate": 4.290496075422581e-05, + "loss": 0.1456, + "mean_token_accuracy": 0.9626976943016052, + "num_tokens": 251067459.0, + "step": 89500 + }, + { + "epoch": 2.1747044974636536, + "grad_norm": 0.3565070927143097, + "learning_rate": 4.2673333954293496e-05, + "loss": 0.1524, + "mean_token_accuracy": 0.9609863895177841, + "num_tokens": 251372607.0, + "step": 89600 + }, + { + "epoch": 2.177131623018859, + "grad_norm": 0.3239290118217468, + "learning_rate": 4.244216436735583e-05, + "loss": 0.1696, + "mean_token_accuracy": 0.956989768743515, + "num_tokens": 251643396.0, + "step": 89700 + }, + { + "epoch": 2.1795587485740637, + "grad_norm": 0.8524539470672607, + "learning_rate": 4.221145383712274e-05, + "loss": 0.1486, + "mean_token_accuracy": 0.9617244362831116, + "num_tokens": 251917277.0, + "step": 89800 + }, + { + "epoch": 2.1819858741292686, + "grad_norm": 0.3852197825908661, + "learning_rate": 4.19812042036428e-05, + "loss": 0.1583, + "mean_token_accuracy": 0.9597838073968887, + "num_tokens": 252194410.0, + "step": 89900 + }, + { + "epoch": 2.184412999684474, + "grad_norm": 0.5747129917144775, + "learning_rate": 4.1751417303288666e-05, + "loss": 0.1539, + "mean_token_accuracy": 0.9608057707548141, + "num_tokens": 252465960.0, + "step": 90000 + }, + { + "epoch": 2.1868401252396787, + "grad_norm": 0.5769911408424377, + "learning_rate": 4.15220949687426e-05, + "loss": 0.1549, + "mean_token_accuracy": 0.9605850332975387, + "num_tokens": 252737863.0, + "step": 90100 + }, + { + "epoch": 2.1892672507948836, + "grad_norm": 0.45625218749046326, + "learning_rate": 4.12932390289814e-05, + "loss": 0.1548, + "mean_token_accuracy": 0.960279346704483, + "num_tokens": 253005781.0, + "step": 90200 + }, + { + "epoch": 2.1916943763500885, + "grad_norm": 0.6102867126464844, + "learning_rate": 4.1064851309262374e-05, + "loss": 0.1719, + "mean_token_accuracy": 0.9562070280313492, + "num_tokens": 253282421.0, + "step": 90300 + }, + { + "epoch": 2.1941215019052938, + "grad_norm": 0.418035626411438, + "learning_rate": 4.0836933631108364e-05, + "loss": 0.1544, + "mean_token_accuracy": 0.9601026111841202, + "num_tokens": 253548675.0, + "step": 90400 + }, + { + "epoch": 2.1965486274604986, + "grad_norm": 0.5558539032936096, + "learning_rate": 4.0609487812293344e-05, + "loss": 0.1428, + "mean_token_accuracy": 0.962897977232933, + "num_tokens": 253824242.0, + "step": 90500 + }, + { + "epoch": 2.1989757530157035, + "grad_norm": 0.31991976499557495, + "learning_rate": 4.038251566682806e-05, + "loss": 0.1684, + "mean_token_accuracy": 0.9578196865320205, + "num_tokens": 254108773.0, + "step": 90600 + }, + { + "epoch": 2.2014028785709083, + "grad_norm": 0.35955339670181274, + "learning_rate": 4.0156019004945336e-05, + "loss": 0.1552, + "mean_token_accuracy": 0.960447661280632, + "num_tokens": 254398547.0, + "step": 90700 + }, + { + "epoch": 2.2038300041261136, + "grad_norm": 0.3822309374809265, + "learning_rate": 3.9929999633085757e-05, + "loss": 0.1532, + "mean_token_accuracy": 0.961047391295433, + "num_tokens": 254649735.0, + "step": 90800 + }, + { + "epoch": 2.2062571296813185, + "grad_norm": 0.6658386588096619, + "learning_rate": 3.970445935388325e-05, + "loss": 0.1549, + "mean_token_accuracy": 0.9600626587867737, + "num_tokens": 254930759.0, + "step": 90900 + }, + { + "epoch": 2.2086842552365233, + "grad_norm": 0.4035944938659668, + "learning_rate": 3.947939996615077e-05, + "loss": 0.1532, + "mean_token_accuracy": 0.9611799818277359, + "num_tokens": 255195830.0, + "step": 91000 + }, + { + "epoch": 2.211111380791728, + "grad_norm": 0.3729723393917084, + "learning_rate": 3.9254823264865805e-05, + "loss": 0.145, + "mean_token_accuracy": 0.9629390466213227, + "num_tokens": 255473905.0, + "step": 91100 + }, + { + "epoch": 2.2135385063469335, + "grad_norm": 0.390670508146286, + "learning_rate": 3.9030731041156135e-05, + "loss": 0.1423, + "mean_token_accuracy": 0.9631787258386612, + "num_tokens": 255757741.0, + "step": 91200 + }, + { + "epoch": 2.2159656319021384, + "grad_norm": 0.8064032196998596, + "learning_rate": 3.880712508228568e-05, + "loss": 0.1654, + "mean_token_accuracy": 0.9579305970668792, + "num_tokens": 256037860.0, + "step": 91300 + }, + { + "epoch": 2.218392757457343, + "grad_norm": 0.43664440512657166, + "learning_rate": 3.858400717163991e-05, + "loss": 0.1533, + "mean_token_accuracy": 0.9610221099853515, + "num_tokens": 256309514.0, + "step": 91400 + }, + { + "epoch": 2.220819883012548, + "grad_norm": 0.4479061961174011, + "learning_rate": 3.8361379088712055e-05, + "loss": 0.1518, + "mean_token_accuracy": 0.9613274109363555, + "num_tokens": 256599516.0, + "step": 91500 + }, + { + "epoch": 2.2232470085677534, + "grad_norm": 0.2272028774023056, + "learning_rate": 3.8139242609088535e-05, + "loss": 0.1486, + "mean_token_accuracy": 0.9616603809595108, + "num_tokens": 256883150.0, + "step": 91600 + }, + { + "epoch": 2.2256741341229582, + "grad_norm": 0.6324403285980225, + "learning_rate": 3.7917599504434976e-05, + "loss": 0.1714, + "mean_token_accuracy": 0.9572657948732376, + "num_tokens": 257177448.0, + "step": 91700 + }, + { + "epoch": 2.228101259678163, + "grad_norm": 0.43273621797561646, + "learning_rate": 3.7696451542482136e-05, + "loss": 0.1381, + "mean_token_accuracy": 0.9640599012374877, + "num_tokens": 257449995.0, + "step": 91800 + }, + { + "epoch": 2.230528385233368, + "grad_norm": 0.46854960918426514, + "learning_rate": 3.747580048701165e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9633812409639358, + "num_tokens": 257727340.0, + "step": 91900 + }, + { + "epoch": 2.2329555107885732, + "grad_norm": 0.5193477272987366, + "learning_rate": 3.7255648097842044e-05, + "loss": 0.1529, + "mean_token_accuracy": 0.9609187692403793, + "num_tokens": 257997280.0, + "step": 92000 + }, + { + "epoch": 2.235382636343778, + "grad_norm": 0.9509478807449341, + "learning_rate": 3.7035996130814696e-05, + "loss": 0.1286, + "mean_token_accuracy": 0.9667005783319473, + "num_tokens": 258291657.0, + "step": 92100 + }, + { + "epoch": 2.237809761898983, + "grad_norm": 0.7098160982131958, + "learning_rate": 3.681684633777991e-05, + "loss": 0.1394, + "mean_token_accuracy": 0.9636602461338043, + "num_tokens": 258573145.0, + "step": 92200 + }, + { + "epoch": 2.240236887454188, + "grad_norm": 0.3125475347042084, + "learning_rate": 3.6598200466582744e-05, + "loss": 0.1639, + "mean_token_accuracy": 0.9590049821138382, + "num_tokens": 258859618.0, + "step": 92300 + }, + { + "epoch": 2.242664013009393, + "grad_norm": 0.43968746066093445, + "learning_rate": 3.6380060261049244e-05, + "loss": 0.1455, + "mean_token_accuracy": 0.9627101892232894, + "num_tokens": 259138960.0, + "step": 92400 + }, + { + "epoch": 2.245091138564598, + "grad_norm": 0.3362375497817993, + "learning_rate": 3.616242746097254e-05, + "loss": 0.1606, + "mean_token_accuracy": 0.9594524520635604, + "num_tokens": 259415537.0, + "step": 92500 + }, + { + "epoch": 2.247518264119803, + "grad_norm": 0.41409507393836975, + "learning_rate": 3.594530380209881e-05, + "loss": 0.1436, + "mean_token_accuracy": 0.9630869609117508, + "num_tokens": 259689569.0, + "step": 92600 + }, + { + "epoch": 2.249945389675008, + "grad_norm": 0.23429489135742188, + "learning_rate": 3.57286910161136e-05, + "loss": 0.1385, + "mean_token_accuracy": 0.9647276037931443, + "num_tokens": 259962737.0, + "step": 92700 + }, + { + "epoch": 2.252372515230213, + "grad_norm": 0.3240128457546234, + "learning_rate": 3.551259083062792e-05, + "loss": 0.1479, + "mean_token_accuracy": 0.9617342710494995, + "num_tokens": 260232903.0, + "step": 92800 + }, + { + "epoch": 2.254799640785418, + "grad_norm": 0.31519198417663574, + "learning_rate": 3.5297004969164604e-05, + "loss": 0.1412, + "mean_token_accuracy": 0.9640700775384903, + "num_tokens": 260503493.0, + "step": 92900 + }, + { + "epoch": 2.2572267663406227, + "grad_norm": 0.3446758985519409, + "learning_rate": 3.508193515114423e-05, + "loss": 0.139, + "mean_token_accuracy": 0.9643152779340745, + "num_tokens": 260792452.0, + "step": 93000 + }, + { + "epoch": 2.2596538918958275, + "grad_norm": 0.5381621718406677, + "learning_rate": 3.4867383091871854e-05, + "loss": 0.1531, + "mean_token_accuracy": 0.9607863134145737, + "num_tokens": 261083921.0, + "step": 93100 + }, + { + "epoch": 2.262081017451033, + "grad_norm": 0.5895318388938904, + "learning_rate": 3.46533505025231e-05, + "loss": 0.134, + "mean_token_accuracy": 0.9654397374391556, + "num_tokens": 261369478.0, + "step": 93200 + }, + { + "epoch": 2.2645081430062377, + "grad_norm": 0.49713894724845886, + "learning_rate": 3.443983909013029e-05, + "loss": 0.1392, + "mean_token_accuracy": 0.964362952709198, + "num_tokens": 261640652.0, + "step": 93300 + }, + { + "epoch": 2.2669352685614426, + "grad_norm": 0.561633288860321, + "learning_rate": 3.422685055756936e-05, + "loss": 0.1356, + "mean_token_accuracy": 0.9651344537734985, + "num_tokens": 261918219.0, + "step": 93400 + }, + { + "epoch": 2.269362394116648, + "grad_norm": 0.5383502840995789, + "learning_rate": 3.401438660354579e-05, + "loss": 0.1461, + "mean_token_accuracy": 0.9628910517692566, + "num_tokens": 262208454.0, + "step": 93500 + }, + { + "epoch": 2.2717895196718527, + "grad_norm": 0.4417279064655304, + "learning_rate": 3.38024489225813e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9697069215774536, + "num_tokens": 262493658.0, + "step": 93600 + }, + { + "epoch": 2.2742166452270576, + "grad_norm": 0.38798466324806213, + "learning_rate": 3.359103920500025e-05, + "loss": 0.1556, + "mean_token_accuracy": 0.9601915013790131, + "num_tokens": 262774349.0, + "step": 93700 + }, + { + "epoch": 2.2766437707822624, + "grad_norm": 0.4257045388221741, + "learning_rate": 3.3380159136916287e-05, + "loss": 0.1441, + "mean_token_accuracy": 0.9632410365343094, + "num_tokens": 263049670.0, + "step": 93800 + }, + { + "epoch": 2.2790708963374673, + "grad_norm": 0.45404383540153503, + "learning_rate": 3.3169810400218707e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9688757717609405, + "num_tokens": 263325433.0, + "step": 93900 + }, + { + "epoch": 2.2814980218926726, + "grad_norm": 0.5358030200004578, + "learning_rate": 3.29599946725591e-05, + "loss": 0.1425, + "mean_token_accuracy": 0.9637533491849899, + "num_tokens": 263607707.0, + "step": 94000 + }, + { + "epoch": 2.2839251474478774, + "grad_norm": 0.2688960134983063, + "learning_rate": 3.275071362733818e-05, + "loss": 0.152, + "mean_token_accuracy": 0.9612334263324738, + "num_tokens": 263884293.0, + "step": 94100 + }, + { + "epoch": 2.2863522730030823, + "grad_norm": 0.4777241051197052, + "learning_rate": 3.2541968933692014e-05, + "loss": 0.1398, + "mean_token_accuracy": 0.9639061081409455, + "num_tokens": 264165137.0, + "step": 94200 + }, + { + "epoch": 2.2887793985582876, + "grad_norm": 0.6694493889808655, + "learning_rate": 3.233376225647919e-05, + "loss": 0.1532, + "mean_token_accuracy": 0.960937168598175, + "num_tokens": 264437076.0, + "step": 94300 + }, + { + "epoch": 2.2912065241134925, + "grad_norm": 0.6230403780937195, + "learning_rate": 3.2126095256267166e-05, + "loss": 0.1441, + "mean_token_accuracy": 0.9634155595302581, + "num_tokens": 264710455.0, + "step": 94400 + }, + { + "epoch": 2.2936336496686973, + "grad_norm": 0.48842713236808777, + "learning_rate": 3.191896958931917e-05, + "loss": 0.1471, + "mean_token_accuracy": 0.962595841884613, + "num_tokens": 264992074.0, + "step": 94500 + }, + { + "epoch": 2.296060775223902, + "grad_norm": 0.2296297550201416, + "learning_rate": 3.1712386907581105e-05, + "loss": 0.147, + "mean_token_accuracy": 0.9623468136787414, + "num_tokens": 265277308.0, + "step": 94600 + }, + { + "epoch": 2.2984879007791075, + "grad_norm": 0.2678510844707489, + "learning_rate": 3.150634885866812e-05, + "loss": 0.1658, + "mean_token_accuracy": 0.9582432430982589, + "num_tokens": 265554404.0, + "step": 94700 + }, + { + "epoch": 2.3009150263343123, + "grad_norm": 0.24679508805274963, + "learning_rate": 3.130085708585169e-05, + "loss": 0.1339, + "mean_token_accuracy": 0.9652345758676529, + "num_tokens": 265839961.0, + "step": 94800 + }, + { + "epoch": 2.303342151889517, + "grad_norm": 0.5927382707595825, + "learning_rate": 3.109591322804634e-05, + "loss": 0.1408, + "mean_token_accuracy": 0.9638234466314316, + "num_tokens": 266123252.0, + "step": 94900 + }, + { + "epoch": 2.305769277444722, + "grad_norm": 0.30509328842163086, + "learning_rate": 3.089151891979678e-05, + "loss": 0.124, + "mean_token_accuracy": 0.9677273762226105, + "num_tokens": 266401504.0, + "step": 95000 + }, + { + "epoch": 2.3081964029999273, + "grad_norm": 0.7120338678359985, + "learning_rate": 3.0687675791264656e-05, + "loss": 0.1276, + "mean_token_accuracy": 0.9665586167573929, + "num_tokens": 266681814.0, + "step": 95100 + }, + { + "epoch": 2.310623528555132, + "grad_norm": 0.312296599149704, + "learning_rate": 3.0484385468215637e-05, + "loss": 0.1298, + "mean_token_accuracy": 0.9670355755090714, + "num_tokens": 266966361.0, + "step": 95200 + }, + { + "epoch": 2.313050654110337, + "grad_norm": 0.2584036588668823, + "learning_rate": 3.0281649572006566e-05, + "loss": 0.1374, + "mean_token_accuracy": 0.9641981101036072, + "num_tokens": 267243149.0, + "step": 95300 + }, + { + "epoch": 2.315477779665542, + "grad_norm": 0.9581860303878784, + "learning_rate": 3.0079469719572206e-05, + "loss": 0.1483, + "mean_token_accuracy": 0.9624637293815613, + "num_tokens": 267532595.0, + "step": 95400 + }, + { + "epoch": 2.317904905220747, + "grad_norm": 0.6013193726539612, + "learning_rate": 2.9877847523412784e-05, + "loss": 0.1355, + "mean_token_accuracy": 0.9647196280956268, + "num_tokens": 267805629.0, + "step": 95500 + }, + { + "epoch": 2.320332030775952, + "grad_norm": 0.4272877275943756, + "learning_rate": 2.9676784591580743e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9627549290657044, + "num_tokens": 268072889.0, + "step": 95600 + }, + { + "epoch": 2.322759156331157, + "grad_norm": 0.4217541515827179, + "learning_rate": 2.9476282527668097e-05, + "loss": 0.1322, + "mean_token_accuracy": 0.9650915312767029, + "num_tokens": 268342006.0, + "step": 95700 + }, + { + "epoch": 2.325186281886362, + "grad_norm": 0.5044764876365662, + "learning_rate": 2.927634293079369e-05, + "loss": 0.1358, + "mean_token_accuracy": 0.9652171546220779, + "num_tokens": 268613702.0, + "step": 95800 + }, + { + "epoch": 2.327613407441567, + "grad_norm": 0.7305507063865662, + "learning_rate": 2.9076967395590305e-05, + "loss": 0.1371, + "mean_token_accuracy": 0.9650354409217834, + "num_tokens": 268903603.0, + "step": 95900 + }, + { + "epoch": 2.330040532996772, + "grad_norm": 0.2776518762111664, + "learning_rate": 2.8878157512192017e-05, + "loss": 0.1535, + "mean_token_accuracy": 0.9604264676570893, + "num_tokens": 269200151.0, + "step": 96000 + }, + { + "epoch": 2.332467658551977, + "grad_norm": 0.4138396680355072, + "learning_rate": 2.867991486622148e-05, + "loss": 0.1339, + "mean_token_accuracy": 0.9657935965061187, + "num_tokens": 269489962.0, + "step": 96100 + }, + { + "epoch": 2.334894784107182, + "grad_norm": 0.32120972871780396, + "learning_rate": 2.848224103877739e-05, + "loss": 0.1465, + "mean_token_accuracy": 0.9628925704956055, + "num_tokens": 269766960.0, + "step": 96200 + }, + { + "epoch": 2.337321909662387, + "grad_norm": 0.8308660387992859, + "learning_rate": 2.8285137606421728e-05, + "loss": 0.1427, + "mean_token_accuracy": 0.9637695878744126, + "num_tokens": 270042872.0, + "step": 96300 + }, + { + "epoch": 2.339749035217592, + "grad_norm": 0.3830409646034241, + "learning_rate": 2.8088606141167196e-05, + "loss": 0.1443, + "mean_token_accuracy": 0.9631164449453354, + "num_tokens": 270329997.0, + "step": 96400 + }, + { + "epoch": 2.3421761607727967, + "grad_norm": 0.4517403244972229, + "learning_rate": 2.7892648210464878e-05, + "loss": 0.1466, + "mean_token_accuracy": 0.9625603079795837, + "num_tokens": 270606605.0, + "step": 96500 + }, + { + "epoch": 2.3446032863280015, + "grad_norm": 0.3587762713432312, + "learning_rate": 2.769726537719145e-05, + "loss": 0.1662, + "mean_token_accuracy": 0.9586023986339569, + "num_tokens": 270903974.0, + "step": 96600 + }, + { + "epoch": 2.347030411883207, + "grad_norm": 0.31619253754615784, + "learning_rate": 2.7502459199636908e-05, + "loss": 0.1189, + "mean_token_accuracy": 0.9690315425395966, + "num_tokens": 271194872.0, + "step": 96700 + }, + { + "epoch": 2.3494575374384117, + "grad_norm": 0.5178159475326538, + "learning_rate": 2.7308231231492087e-05, + "loss": 0.1333, + "mean_token_accuracy": 0.9646364021301269, + "num_tokens": 271473378.0, + "step": 96800 + }, + { + "epoch": 2.3518846629936165, + "grad_norm": 0.32339227199554443, + "learning_rate": 2.711458302183635e-05, + "loss": 0.1484, + "mean_token_accuracy": 0.9622978901863098, + "num_tokens": 271766276.0, + "step": 96900 + }, + { + "epoch": 2.354311788548822, + "grad_norm": 0.6405932903289795, + "learning_rate": 2.6921516115124944e-05, + "loss": 0.134, + "mean_token_accuracy": 0.9650397008657455, + "num_tokens": 272051531.0, + "step": 97000 + }, + { + "epoch": 2.3567389141040267, + "grad_norm": 0.26447561383247375, + "learning_rate": 2.672903205117714e-05, + "loss": 0.1352, + "mean_token_accuracy": 0.9650300896167755, + "num_tokens": 272322818.0, + "step": 97100 + }, + { + "epoch": 2.3591660396592316, + "grad_norm": 0.32654133439064026, + "learning_rate": 2.6537132365163542e-05, + "loss": 0.1411, + "mean_token_accuracy": 0.9641784363985062, + "num_tokens": 272588000.0, + "step": 97200 + }, + { + "epoch": 2.3615931652144364, + "grad_norm": 0.38720738887786865, + "learning_rate": 2.634581858759403e-05, + "loss": 0.144, + "mean_token_accuracy": 0.9631247913837433, + "num_tokens": 272861777.0, + "step": 97300 + }, + { + "epoch": 2.3640202907696417, + "grad_norm": 0.5359301567077637, + "learning_rate": 2.615509224430561e-05, + "loss": 0.1471, + "mean_token_accuracy": 0.9627832245826721, + "num_tokens": 273141380.0, + "step": 97400 + }, + { + "epoch": 2.3664474163248466, + "grad_norm": 0.7095302939414978, + "learning_rate": 2.5964954856450087e-05, + "loss": 0.1369, + "mean_token_accuracy": 0.9649500614404678, + "num_tokens": 273425819.0, + "step": 97500 + }, + { + "epoch": 2.3688745418800514, + "grad_norm": 0.2362442910671234, + "learning_rate": 2.577540794048201e-05, + "loss": 0.1422, + "mean_token_accuracy": 0.9634811532497406, + "num_tokens": 273711441.0, + "step": 97600 + }, + { + "epoch": 2.3713016674352563, + "grad_norm": 0.46207818388938904, + "learning_rate": 2.558645300814657e-05, + "loss": 0.1253, + "mean_token_accuracy": 0.9674885600805283, + "num_tokens": 273991541.0, + "step": 97700 + }, + { + "epoch": 2.3737287929904616, + "grad_norm": 0.628451943397522, + "learning_rate": 2.539809156646764e-05, + "loss": 0.1159, + "mean_token_accuracy": 0.9699374061822891, + "num_tokens": 274253602.0, + "step": 97800 + }, + { + "epoch": 2.3761559185456664, + "grad_norm": 0.3115593492984772, + "learning_rate": 2.5210325117735557e-05, + "loss": 0.1381, + "mean_token_accuracy": 0.9645976287126541, + "num_tokens": 274529807.0, + "step": 97900 + }, + { + "epoch": 2.3785830441008713, + "grad_norm": 0.2984253764152527, + "learning_rate": 2.5023155159495282e-05, + "loss": 0.1201, + "mean_token_accuracy": 0.9683735293149948, + "num_tokens": 274822575.0, + "step": 98000 + }, + { + "epoch": 2.381010169656076, + "grad_norm": 0.5238213539123535, + "learning_rate": 2.4836583184534488e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9667371785640717, + "num_tokens": 275099672.0, + "step": 98100 + }, + { + "epoch": 2.3834372952112814, + "grad_norm": 0.6737905740737915, + "learning_rate": 2.4650610680871444e-05, + "loss": 0.1604, + "mean_token_accuracy": 0.959024778008461, + "num_tokens": 275386344.0, + "step": 98200 + }, + { + "epoch": 2.3858644207664863, + "grad_norm": 0.5642903447151184, + "learning_rate": 2.4465239131743446e-05, + "loss": 0.1349, + "mean_token_accuracy": 0.9656519025564194, + "num_tokens": 275656742.0, + "step": 98300 + }, + { + "epoch": 2.388291546321691, + "grad_norm": 0.398689866065979, + "learning_rate": 2.4280470015594748e-05, + "loss": 0.1608, + "mean_token_accuracy": 0.9593168395757675, + "num_tokens": 275952854.0, + "step": 98400 + }, + { + "epoch": 2.390718671876896, + "grad_norm": 0.27904191613197327, + "learning_rate": 2.409630480606484e-05, + "loss": 0.1235, + "mean_token_accuracy": 0.9679072177410126, + "num_tokens": 276222937.0, + "step": 98500 + }, + { + "epoch": 2.3931457974321013, + "grad_norm": 1.006825566291809, + "learning_rate": 2.3912744971976808e-05, + "loss": 0.1306, + "mean_token_accuracy": 0.9661997479200363, + "num_tokens": 276504510.0, + "step": 98600 + }, + { + "epoch": 2.395572922987306, + "grad_norm": 0.3777683675289154, + "learning_rate": 2.3729791977325432e-05, + "loss": 0.1297, + "mean_token_accuracy": 0.9665063762664795, + "num_tokens": 276772770.0, + "step": 98700 + }, + { + "epoch": 2.398000048542511, + "grad_norm": 0.28916266560554504, + "learning_rate": 2.3547447281265635e-05, + "loss": 0.1593, + "mean_token_accuracy": 0.960017095208168, + "num_tokens": 277050087.0, + "step": 98800 + }, + { + "epoch": 2.400427174097716, + "grad_norm": 0.2858158349990845, + "learning_rate": 2.3365712338100755e-05, + "loss": 0.1177, + "mean_token_accuracy": 0.9696782982349396, + "num_tokens": 277350875.0, + "step": 98900 + }, + { + "epoch": 2.402854299652921, + "grad_norm": 0.35637909173965454, + "learning_rate": 2.318458859727112e-05, + "loss": 0.1389, + "mean_token_accuracy": 0.9645104986429215, + "num_tokens": 277641318.0, + "step": 99000 + }, + { + "epoch": 2.405281425208126, + "grad_norm": 0.3532889783382416, + "learning_rate": 2.3004077503342257e-05, + "loss": 0.1209, + "mean_token_accuracy": 0.9687686038017272, + "num_tokens": 277914597.0, + "step": 99100 + }, + { + "epoch": 2.407708550763331, + "grad_norm": 0.4838126301765442, + "learning_rate": 2.2824180495993518e-05, + "loss": 0.1351, + "mean_token_accuracy": 0.9651601392030716, + "num_tokens": 278193197.0, + "step": 99200 + }, + { + "epoch": 2.4101356763185358, + "grad_norm": 0.5864011645317078, + "learning_rate": 2.264489901000665e-05, + "loss": 0.1277, + "mean_token_accuracy": 0.9669712960720063, + "num_tokens": 278469259.0, + "step": 99300 + }, + { + "epoch": 2.412562801873741, + "grad_norm": 0.23830416798591614, + "learning_rate": 2.2466234475254067e-05, + "loss": 0.1412, + "mean_token_accuracy": 0.9635849434137345, + "num_tokens": 278748270.0, + "step": 99400 + }, + { + "epoch": 2.414989927428946, + "grad_norm": 0.2923569977283478, + "learning_rate": 2.228818831668783e-05, + "loss": 0.1327, + "mean_token_accuracy": 0.9659154057502747, + "num_tokens": 279029338.0, + "step": 99500 + }, + { + "epoch": 2.4174170529841508, + "grad_norm": 0.2777637243270874, + "learning_rate": 2.211076195432803e-05, + "loss": 0.117, + "mean_token_accuracy": 0.969894996881485, + "num_tokens": 279314859.0, + "step": 99600 + }, + { + "epoch": 2.419844178539356, + "grad_norm": 0.4714739918708801, + "learning_rate": 2.1933956803251464e-05, + "loss": 0.1283, + "mean_token_accuracy": 0.9672115963697433, + "num_tokens": 279599623.0, + "step": 99700 + }, + { + "epoch": 2.422271304094561, + "grad_norm": 0.17279647290706635, + "learning_rate": 2.1757774273580555e-05, + "loss": 0.1173, + "mean_token_accuracy": 0.9694797676801682, + "num_tokens": 279878335.0, + "step": 99800 + }, + { + "epoch": 2.424698429649766, + "grad_norm": 0.6131448149681091, + "learning_rate": 2.1582215770471813e-05, + "loss": 0.1331, + "mean_token_accuracy": 0.965504584312439, + "num_tokens": 280169261.0, + "step": 99900 + }, + { + "epoch": 2.4271255552049706, + "grad_norm": 0.63509202003479, + "learning_rate": 2.1407282694104945e-05, + "loss": 0.1367, + "mean_token_accuracy": 0.9649469172954559, + "num_tokens": 280457393.0, + "step": 100000 + }, + { + "epoch": 2.4295526807601755, + "grad_norm": 0.6302922964096069, + "learning_rate": 2.1232976439671305e-05, + "loss": 0.1215, + "mean_token_accuracy": 0.9686316031217576, + "num_tokens": 280737937.0, + "step": 100100 + }, + { + "epoch": 2.431979806315381, + "grad_norm": 0.2724493443965912, + "learning_rate": 2.105929839736319e-05, + "loss": 0.1112, + "mean_token_accuracy": 0.9709296381473541, + "num_tokens": 281011603.0, + "step": 100200 + }, + { + "epoch": 2.4344069318705857, + "grad_norm": 0.780457079410553, + "learning_rate": 2.0886249952362423e-05, + "loss": 0.1202, + "mean_token_accuracy": 0.9688206619024277, + "num_tokens": 281297746.0, + "step": 100300 + }, + { + "epoch": 2.4368340574257905, + "grad_norm": 0.23311544954776764, + "learning_rate": 2.0713832484829454e-05, + "loss": 0.1356, + "mean_token_accuracy": 0.9651333147287369, + "num_tokens": 281583883.0, + "step": 100400 + }, + { + "epoch": 2.439261182980996, + "grad_norm": 0.6378100514411926, + "learning_rate": 2.054204736989237e-05, + "loss": 0.141, + "mean_token_accuracy": 0.9639557653665543, + "num_tokens": 281861952.0, + "step": 100500 + }, + { + "epoch": 2.4416883085362007, + "grad_norm": 0.5750208497047424, + "learning_rate": 2.0370895977635827e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9666093420982361, + "num_tokens": 282135836.0, + "step": 100600 + }, + { + "epoch": 2.4441154340914055, + "grad_norm": 0.7432602643966675, + "learning_rate": 2.020037967309022e-05, + "loss": 0.14, + "mean_token_accuracy": 0.9642543375492096, + "num_tokens": 282418573.0, + "step": 100700 + }, + { + "epoch": 2.4465425596466104, + "grad_norm": 0.7422667741775513, + "learning_rate": 2.0030499816220692e-05, + "loss": 0.1231, + "mean_token_accuracy": 0.9678942495584488, + "num_tokens": 282702369.0, + "step": 100800 + }, + { + "epoch": 2.4489696852018157, + "grad_norm": 0.32463338971138, + "learning_rate": 1.9861257761916495e-05, + "loss": 0.1387, + "mean_token_accuracy": 0.9645810687541961, + "num_tokens": 282976614.0, + "step": 100900 + }, + { + "epoch": 2.4513968107570205, + "grad_norm": 0.3175111413002014, + "learning_rate": 1.969265485997982e-05, + "loss": 0.1303, + "mean_token_accuracy": 0.9666304177045822, + "num_tokens": 283259813.0, + "step": 101000 + }, + { + "epoch": 2.4538239363122254, + "grad_norm": 0.3536398708820343, + "learning_rate": 1.9524692455115456e-05, + "loss": 0.1157, + "mean_token_accuracy": 0.969986400604248, + "num_tokens": 283545148.0, + "step": 101100 + }, + { + "epoch": 2.4562510618674303, + "grad_norm": 0.39317232370376587, + "learning_rate": 1.9357371886919762e-05, + "loss": 0.1227, + "mean_token_accuracy": 0.9676568216085434, + "num_tokens": 283827994.0, + "step": 101200 + }, + { + "epoch": 2.4586781874226356, + "grad_norm": 0.7098958492279053, + "learning_rate": 1.9190694489870066e-05, + "loss": 0.1501, + "mean_token_accuracy": 0.9624185985326768, + "num_tokens": 284123970.0, + "step": 101300 + }, + { + "epoch": 2.4611053129778404, + "grad_norm": 0.1925334334373474, + "learning_rate": 1.902466159331413e-05, + "loss": 0.1325, + "mean_token_accuracy": 0.9658237969875336, + "num_tokens": 284404970.0, + "step": 101400 + }, + { + "epoch": 2.4635324385330453, + "grad_norm": 0.7176932692527771, + "learning_rate": 1.885927452145937e-05, + "loss": 0.1465, + "mean_token_accuracy": 0.9626785635948181, + "num_tokens": 284692921.0, + "step": 101500 + }, + { + "epoch": 2.46595956408825, + "grad_norm": 0.4300325810909271, + "learning_rate": 1.86945345933624e-05, + "loss": 0.1295, + "mean_token_accuracy": 0.9664794594049454, + "num_tokens": 284966467.0, + "step": 101600 + }, + { + "epoch": 2.4683866896434554, + "grad_norm": 0.49719369411468506, + "learning_rate": 1.8530443122918483e-05, + "loss": 0.1263, + "mean_token_accuracy": 0.9671731227636338, + "num_tokens": 285260131.0, + "step": 101700 + }, + { + "epoch": 2.4708138151986603, + "grad_norm": 0.6283943057060242, + "learning_rate": 1.8367001418851126e-05, + "loss": 0.1403, + "mean_token_accuracy": 0.9638696962594986, + "num_tokens": 285550527.0, + "step": 101800 + }, + { + "epoch": 2.473240940753865, + "grad_norm": 0.5366162061691284, + "learning_rate": 1.8204210784701492e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9679791039228439, + "num_tokens": 285826548.0, + "step": 101900 + }, + { + "epoch": 2.47566806630907, + "grad_norm": 0.4290122091770172, + "learning_rate": 1.804207251881811e-05, + "loss": 0.1357, + "mean_token_accuracy": 0.9650535964965821, + "num_tokens": 286112898.0, + "step": 102000 + }, + { + "epoch": 2.4780951918642753, + "grad_norm": 0.6725316643714905, + "learning_rate": 1.788058791434658e-05, + "loss": 0.1124, + "mean_token_accuracy": 0.9707834237813949, + "num_tokens": 286388657.0, + "step": 102100 + }, + { + "epoch": 2.48052231741948, + "grad_norm": 0.40707212686538696, + "learning_rate": 1.7719758259219012e-05, + "loss": 0.1442, + "mean_token_accuracy": 0.9627831757068634, + "num_tokens": 286679858.0, + "step": 102200 + }, + { + "epoch": 2.482949442974685, + "grad_norm": 0.48535874485969543, + "learning_rate": 1.7559584836144095e-05, + "loss": 0.1299, + "mean_token_accuracy": 0.9669503039121627, + "num_tokens": 286953076.0, + "step": 102300 + }, + { + "epoch": 2.48537656852989, + "grad_norm": 0.20833340287208557, + "learning_rate": 1.7400068922596603e-05, + "loss": 0.1425, + "mean_token_accuracy": 0.9637488210201264, + "num_tokens": 287233777.0, + "step": 102400 + }, + { + "epoch": 2.487803694085095, + "grad_norm": 0.24462640285491943, + "learning_rate": 1.7241211790807256e-05, + "loss": 0.1299, + "mean_token_accuracy": 0.9663457351922989, + "num_tokens": 287520202.0, + "step": 102500 + }, + { + "epoch": 2.4902308196403, + "grad_norm": 0.3491087853908539, + "learning_rate": 1.7083014707752732e-05, + "loss": 0.1385, + "mean_token_accuracy": 0.9639142185449601, + "num_tokens": 287791973.0, + "step": 102600 + }, + { + "epoch": 2.492657945195505, + "grad_norm": 0.5299773216247559, + "learning_rate": 1.6925478935145335e-05, + "loss": 0.1356, + "mean_token_accuracy": 0.9650074106454849, + "num_tokens": 288075820.0, + "step": 102700 + }, + { + "epoch": 2.4950850707507097, + "grad_norm": 0.5309982299804688, + "learning_rate": 1.676860572942309e-05, + "loss": 0.1328, + "mean_token_accuracy": 0.9659540796279907, + "num_tokens": 288366268.0, + "step": 102800 + }, + { + "epoch": 2.497512196305915, + "grad_norm": 0.9474865794181824, + "learning_rate": 1.6612396341739633e-05, + "loss": 0.1385, + "mean_token_accuracy": 0.9651324534416199, + "num_tokens": 288652494.0, + "step": 102900 + }, + { + "epoch": 2.49993932186112, + "grad_norm": 0.37610140442848206, + "learning_rate": 1.645685201795435e-05, + "loss": 0.1187, + "mean_token_accuracy": 0.9689668655395508, + "num_tokens": 288930537.0, + "step": 103000 + }, + { + "epoch": 2.5023664474163247, + "grad_norm": 0.7315070033073425, + "learning_rate": 1.6301973998622267e-05, + "loss": 0.1377, + "mean_token_accuracy": 0.9647850877046585, + "num_tokens": 289200621.0, + "step": 103100 + }, + { + "epoch": 2.50479357297153, + "grad_norm": 0.4755480885505676, + "learning_rate": 1.6147763518984283e-05, + "loss": 0.125, + "mean_token_accuracy": 0.9684723728895187, + "num_tokens": 289485794.0, + "step": 103200 + }, + { + "epoch": 2.507220698526735, + "grad_norm": 0.20060652494430542, + "learning_rate": 1.5994221808957354e-05, + "loss": 0.1241, + "mean_token_accuracy": 0.9680253058671952, + "num_tokens": 289768904.0, + "step": 103300 + }, + { + "epoch": 2.5096478240819398, + "grad_norm": 0.3017840087413788, + "learning_rate": 1.5841350093124473e-05, + "loss": 0.1304, + "mean_token_accuracy": 0.9663189828395844, + "num_tokens": 290053063.0, + "step": 103400 + }, + { + "epoch": 2.5120749496371446, + "grad_norm": 0.2239573448896408, + "learning_rate": 1.5689149590725205e-05, + "loss": 0.1494, + "mean_token_accuracy": 0.9615071332454681, + "num_tokens": 290332650.0, + "step": 103500 + }, + { + "epoch": 2.5145020751923495, + "grad_norm": 0.2974654734134674, + "learning_rate": 1.5537621515645685e-05, + "loss": 0.1132, + "mean_token_accuracy": 0.9711314821243286, + "num_tokens": 290627061.0, + "step": 103600 + }, + { + "epoch": 2.5169292007475548, + "grad_norm": 0.24081680178642273, + "learning_rate": 1.5386767076409093e-05, + "loss": 0.1138, + "mean_token_accuracy": 0.9703037095069885, + "num_tokens": 290906135.0, + "step": 103700 + }, + { + "epoch": 2.5193563263027596, + "grad_norm": 0.4012824296951294, + "learning_rate": 1.5236587476166009e-05, + "loss": 0.1438, + "mean_token_accuracy": 0.9640105599164963, + "num_tokens": 291205760.0, + "step": 103800 + }, + { + "epoch": 2.5217834518579645, + "grad_norm": 0.1920708417892456, + "learning_rate": 1.508708391268473e-05, + "loss": 0.117, + "mean_token_accuracy": 0.9692934077978134, + "num_tokens": 291501172.0, + "step": 103900 + }, + { + "epoch": 2.52421057741317, + "grad_norm": 0.7257289290428162, + "learning_rate": 1.4938257578341851e-05, + "loss": 0.1271, + "mean_token_accuracy": 0.9669335162639618, + "num_tokens": 291762700.0, + "step": 104000 + }, + { + "epoch": 2.5266377029683746, + "grad_norm": 0.4053400456905365, + "learning_rate": 1.479010966011255e-05, + "loss": 0.118, + "mean_token_accuracy": 0.9697389549016953, + "num_tokens": 292029791.0, + "step": 104100 + }, + { + "epoch": 2.5290648285235795, + "grad_norm": 0.7388555407524109, + "learning_rate": 1.4642641339561392e-05, + "loss": 0.1472, + "mean_token_accuracy": 0.9629099863767624, + "num_tokens": 292297139.0, + "step": 104200 + }, + { + "epoch": 2.5314919540787844, + "grad_norm": 0.44577348232269287, + "learning_rate": 1.4495853792832658e-05, + "loss": 0.1287, + "mean_token_accuracy": 0.9671968942880631, + "num_tokens": 292570449.0, + "step": 104300 + }, + { + "epoch": 2.533919079633989, + "grad_norm": 0.41142553091049194, + "learning_rate": 1.4349748190641088e-05, + "loss": 0.1371, + "mean_token_accuracy": 0.9647838968038559, + "num_tokens": 292846299.0, + "step": 104400 + }, + { + "epoch": 2.5363462051891945, + "grad_norm": 0.519055187702179, + "learning_rate": 1.42043256982626e-05, + "loss": 0.1298, + "mean_token_accuracy": 0.966575453877449, + "num_tokens": 293128911.0, + "step": 104500 + }, + { + "epoch": 2.5387733307443994, + "grad_norm": 0.8179194331169128, + "learning_rate": 1.4059587475524804e-05, + "loss": 0.1254, + "mean_token_accuracy": 0.9670647913217545, + "num_tokens": 293391008.0, + "step": 104600 + }, + { + "epoch": 2.5412004562996042, + "grad_norm": 0.4637313187122345, + "learning_rate": 1.3915534676797948e-05, + "loss": 0.1212, + "mean_token_accuracy": 0.9688729846477508, + "num_tokens": 293667868.0, + "step": 104700 + }, + { + "epoch": 2.5436275818548095, + "grad_norm": 0.45500364899635315, + "learning_rate": 1.3772168450985578e-05, + "loss": 0.1301, + "mean_token_accuracy": 0.9666210097074509, + "num_tokens": 293944089.0, + "step": 104800 + }, + { + "epoch": 2.5460547074100144, + "grad_norm": 0.5273195505142212, + "learning_rate": 1.3629489941515527e-05, + "loss": 0.1299, + "mean_token_accuracy": 0.9664002645015717, + "num_tokens": 294209744.0, + "step": 104900 + }, + { + "epoch": 2.5484818329652192, + "grad_norm": 0.6371331214904785, + "learning_rate": 1.3487500286330524e-05, + "loss": 0.1141, + "mean_token_accuracy": 0.9706168347597122, + "num_tokens": 294481283.0, + "step": 105000 + }, + { + "epoch": 2.550908958520424, + "grad_norm": 0.6027917265892029, + "learning_rate": 1.3346200617879457e-05, + "loss": 0.1051, + "mean_token_accuracy": 0.9724947756528854, + "num_tokens": 294760945.0, + "step": 105100 + }, + { + "epoch": 2.5533360840756294, + "grad_norm": 0.7076954245567322, + "learning_rate": 1.3205592063108108e-05, + "loss": 0.1231, + "mean_token_accuracy": 0.9681785267591476, + "num_tokens": 295040707.0, + "step": 105200 + }, + { + "epoch": 2.5557632096308343, + "grad_norm": 0.3423120081424713, + "learning_rate": 1.306567574345019e-05, + "loss": 0.1149, + "mean_token_accuracy": 0.9702116006612778, + "num_tokens": 295328440.0, + "step": 105300 + }, + { + "epoch": 2.558190335186039, + "grad_norm": 0.3347417116165161, + "learning_rate": 1.2926452774818531e-05, + "loss": 0.1201, + "mean_token_accuracy": 0.9698320901393891, + "num_tokens": 295616342.0, + "step": 105400 + }, + { + "epoch": 2.560617460741244, + "grad_norm": 0.25471988320350647, + "learning_rate": 1.278792426759604e-05, + "loss": 0.115, + "mean_token_accuracy": 0.970246964097023, + "num_tokens": 295885464.0, + "step": 105500 + }, + { + "epoch": 2.5630445862964493, + "grad_norm": 0.18230029940605164, + "learning_rate": 1.2650091326626878e-05, + "loss": 0.1155, + "mean_token_accuracy": 0.9699338835477829, + "num_tokens": 296165406.0, + "step": 105600 + }, + { + "epoch": 2.565471711851654, + "grad_norm": 0.6567848324775696, + "learning_rate": 1.2512955051207698e-05, + "loss": 0.1237, + "mean_token_accuracy": 0.9680297696590423, + "num_tokens": 296436162.0, + "step": 105700 + }, + { + "epoch": 2.567898837406859, + "grad_norm": 0.8726966977119446, + "learning_rate": 1.2376516535078852e-05, + "loss": 0.1186, + "mean_token_accuracy": 0.9697349029779434, + "num_tokens": 296723223.0, + "step": 105800 + }, + { + "epoch": 2.5703259629620643, + "grad_norm": 0.6806580424308777, + "learning_rate": 1.2240776866415649e-05, + "loss": 0.133, + "mean_token_accuracy": 0.9650469082593918, + "num_tokens": 297006971.0, + "step": 105900 + }, + { + "epoch": 2.572753088517269, + "grad_norm": 0.3688866198062897, + "learning_rate": 1.210573712781966e-05, + "loss": 0.1292, + "mean_token_accuracy": 0.9668610745668411, + "num_tokens": 297298957.0, + "step": 106000 + }, + { + "epoch": 2.575180214072474, + "grad_norm": 0.31866592168807983, + "learning_rate": 1.1971398396310219e-05, + "loss": 0.1368, + "mean_token_accuracy": 0.9650294834375381, + "num_tokens": 297575766.0, + "step": 106100 + }, + { + "epoch": 2.577607339627679, + "grad_norm": 0.29714086651802063, + "learning_rate": 1.1837761743315534e-05, + "loss": 0.119, + "mean_token_accuracy": 0.9692506629228592, + "num_tokens": 297853168.0, + "step": 106200 + }, + { + "epoch": 2.5800344651828837, + "grad_norm": 0.3458079397678375, + "learning_rate": 1.1704828234664544e-05, + "loss": 0.1211, + "mean_token_accuracy": 0.9684927022457123, + "num_tokens": 298124718.0, + "step": 106300 + }, + { + "epoch": 2.582461590738089, + "grad_norm": 0.47059133648872375, + "learning_rate": 1.1572598930578083e-05, + "loss": 0.1364, + "mean_token_accuracy": 0.9651955342292786, + "num_tokens": 298399917.0, + "step": 106400 + }, + { + "epoch": 2.584888716293294, + "grad_norm": 0.37811002135276794, + "learning_rate": 1.1441074885660564e-05, + "loss": 0.1244, + "mean_token_accuracy": 0.967754932641983, + "num_tokens": 298679331.0, + "step": 106500 + }, + { + "epoch": 2.5873158418484987, + "grad_norm": 0.3915969133377075, + "learning_rate": 1.1310257148891622e-05, + "loss": 0.1263, + "mean_token_accuracy": 0.9675187277793884, + "num_tokens": 298958742.0, + "step": 106600 + }, + { + "epoch": 2.589742967403704, + "grad_norm": 0.2337946891784668, + "learning_rate": 1.1180146763617604e-05, + "loss": 0.1239, + "mean_token_accuracy": 0.9683219474554062, + "num_tokens": 299244025.0, + "step": 106700 + }, + { + "epoch": 2.592170092958909, + "grad_norm": 0.23497796058654785, + "learning_rate": 1.1050744767543341e-05, + "loss": 0.1313, + "mean_token_accuracy": 0.9658564746379852, + "num_tokens": 299522954.0, + "step": 106800 + }, + { + "epoch": 2.5945972185141137, + "grad_norm": 0.7963089346885681, + "learning_rate": 1.0922052192723864e-05, + "loss": 0.1162, + "mean_token_accuracy": 0.969860834479332, + "num_tokens": 299806975.0, + "step": 106900 + }, + { + "epoch": 2.5970243440693186, + "grad_norm": 0.24625255167484283, + "learning_rate": 1.0794070065556195e-05, + "loss": 0.1468, + "mean_token_accuracy": 0.9626657491922379, + "num_tokens": 300083725.0, + "step": 107000 + }, + { + "epoch": 2.5994514696245234, + "grad_norm": 0.3800683319568634, + "learning_rate": 1.0666799406771078e-05, + "loss": 0.1108, + "mean_token_accuracy": 0.9710573416948318, + "num_tokens": 300344158.0, + "step": 107100 + }, + { + "epoch": 2.6018785951797287, + "grad_norm": 0.6734281182289124, + "learning_rate": 1.0540241231424885e-05, + "loss": 0.1325, + "mean_token_accuracy": 0.9659317535161972, + "num_tokens": 300623724.0, + "step": 107200 + }, + { + "epoch": 2.6043057207349336, + "grad_norm": 0.33141759037971497, + "learning_rate": 1.0414396548891625e-05, + "loss": 0.1079, + "mean_token_accuracy": 0.9718086642026901, + "num_tokens": 300902698.0, + "step": 107300 + }, + { + "epoch": 2.6067328462901385, + "grad_norm": 0.21568359434604645, + "learning_rate": 1.028926636285461e-05, + "loss": 0.1249, + "mean_token_accuracy": 0.9674285870790481, + "num_tokens": 301174551.0, + "step": 107400 + }, + { + "epoch": 2.6091599718453438, + "grad_norm": 0.19938012957572937, + "learning_rate": 1.0164851671298824e-05, + "loss": 0.1146, + "mean_token_accuracy": 0.9703828579187393, + "num_tokens": 301458177.0, + "step": 107500 + }, + { + "epoch": 2.6115870974005486, + "grad_norm": 0.6210383176803589, + "learning_rate": 1.0041153466502662e-05, + "loss": 0.1093, + "mean_token_accuracy": 0.9717652726173401, + "num_tokens": 301740628.0, + "step": 107600 + }, + { + "epoch": 2.6140142229557535, + "grad_norm": 0.581802487373352, + "learning_rate": 9.918172735030129e-06, + "loss": 0.1294, + "mean_token_accuracy": 0.9672193360328675, + "num_tokens": 302027384.0, + "step": 107700 + }, + { + "epoch": 2.6164413485109583, + "grad_norm": 0.27364861965179443, + "learning_rate": 9.795910457723035e-06, + "loss": 0.1168, + "mean_token_accuracy": 0.9696778047084809, + "num_tokens": 302321015.0, + "step": 107800 + }, + { + "epoch": 2.618868474066163, + "grad_norm": 0.6201646327972412, + "learning_rate": 9.674367609693024e-06, + "loss": 0.1381, + "mean_token_accuracy": 0.9657150173187256, + "num_tokens": 302606608.0, + "step": 107900 + }, + { + "epoch": 2.6212955996213685, + "grad_norm": 0.25466471910476685, + "learning_rate": 9.553545160313981e-06, + "loss": 0.1083, + "mean_token_accuracy": 0.9716448175907135, + "num_tokens": 302876241.0, + "step": 108000 + }, + { + "epoch": 2.6237227251765733, + "grad_norm": 0.3042312264442444, + "learning_rate": 9.433444073214071e-06, + "loss": 0.1373, + "mean_token_accuracy": 0.9643556636571884, + "num_tokens": 303166869.0, + "step": 108100 + }, + { + "epoch": 2.626149850731778, + "grad_norm": 0.44712647795677185, + "learning_rate": 9.314065306268293e-06, + "loss": 0.113, + "mean_token_accuracy": 0.9708542251586914, + "num_tokens": 303454686.0, + "step": 108200 + }, + { + "epoch": 2.6285769762869835, + "grad_norm": 0.21764476597309113, + "learning_rate": 9.195409811590671e-06, + "loss": 0.1295, + "mean_token_accuracy": 0.9666325169801712, + "num_tokens": 303747965.0, + "step": 108300 + }, + { + "epoch": 2.6310041018421884, + "grad_norm": 0.2885514795780182, + "learning_rate": 9.07747853552674e-06, + "loss": 0.1141, + "mean_token_accuracy": 0.9705150628089905, + "num_tokens": 304024604.0, + "step": 108400 + }, + { + "epoch": 2.633431227397393, + "grad_norm": 0.2512131631374359, + "learning_rate": 8.960272418645942e-06, + "loss": 0.1168, + "mean_token_accuracy": 0.9695372092723846, + "num_tokens": 304307717.0, + "step": 108500 + }, + { + "epoch": 2.635858352952598, + "grad_norm": 0.3841225802898407, + "learning_rate": 8.84379239573423e-06, + "loss": 0.1289, + "mean_token_accuracy": 0.9664428579807282, + "num_tokens": 304581847.0, + "step": 108600 + }, + { + "epoch": 2.6382854785078034, + "grad_norm": 0.7938812375068665, + "learning_rate": 8.72803939578648e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.966459589600563, + "num_tokens": 304851167.0, + "step": 108700 + }, + { + "epoch": 2.6407126040630082, + "grad_norm": 0.1958301067352295, + "learning_rate": 8.613014341999148e-06, + "loss": 0.1156, + "mean_token_accuracy": 0.9697409337759018, + "num_tokens": 305133812.0, + "step": 108800 + }, + { + "epoch": 2.643139729618213, + "grad_norm": 0.2697140574455261, + "learning_rate": 8.498718151762952e-06, + "loss": 0.1104, + "mean_token_accuracy": 0.971248230934143, + "num_tokens": 305415322.0, + "step": 108900 + }, + { + "epoch": 2.645566855173418, + "grad_norm": 0.6028584241867065, + "learning_rate": 8.385151736655428e-06, + "loss": 0.1246, + "mean_token_accuracy": 0.9677672815322876, + "num_tokens": 305701665.0, + "step": 109000 + }, + { + "epoch": 2.6479939807286232, + "grad_norm": 0.8087351322174072, + "learning_rate": 8.272316002433811e-06, + "loss": 0.1244, + "mean_token_accuracy": 0.9675776773691177, + "num_tokens": 305983629.0, + "step": 109100 + }, + { + "epoch": 2.650421106283828, + "grad_norm": 0.31361258029937744, + "learning_rate": 8.160211849027688e-06, + "loss": 0.1016, + "mean_token_accuracy": 0.9731613707542419, + "num_tokens": 306260513.0, + "step": 109200 + }, + { + "epoch": 2.652848231839033, + "grad_norm": 0.36700600385665894, + "learning_rate": 8.04884017053189e-06, + "loss": 0.1185, + "mean_token_accuracy": 0.9694353520870209, + "num_tokens": 306530485.0, + "step": 109300 + }, + { + "epoch": 2.6552753573942383, + "grad_norm": 0.455331027507782, + "learning_rate": 7.938201855199357e-06, + "loss": 0.1144, + "mean_token_accuracy": 0.9698577016592026, + "num_tokens": 306807912.0, + "step": 109400 + }, + { + "epoch": 2.657702482949443, + "grad_norm": 0.28509321808815, + "learning_rate": 7.828297785434014e-06, + "loss": 0.1076, + "mean_token_accuracy": 0.9716468757390976, + "num_tokens": 307088631.0, + "step": 109500 + }, + { + "epoch": 2.660129608504648, + "grad_norm": 0.5025367140769958, + "learning_rate": 7.719128837783751e-06, + "loss": 0.138, + "mean_token_accuracy": 0.9643630236387253, + "num_tokens": 307376670.0, + "step": 109600 + }, + { + "epoch": 2.662556734059853, + "grad_norm": 0.3343975245952606, + "learning_rate": 7.61069588293345e-06, + "loss": 0.1071, + "mean_token_accuracy": 0.9722916567325592, + "num_tokens": 307651794.0, + "step": 109700 + }, + { + "epoch": 2.6649838596150577, + "grad_norm": 0.3086334466934204, + "learning_rate": 7.50299978569805e-06, + "loss": 0.0937, + "mean_token_accuracy": 0.9749183577299118, + "num_tokens": 307921590.0, + "step": 109800 + }, + { + "epoch": 2.667410985170263, + "grad_norm": 0.21876056492328644, + "learning_rate": 7.3960414050155856e-06, + "loss": 0.1199, + "mean_token_accuracy": 0.9686343175172806, + "num_tokens": 308210040.0, + "step": 109900 + }, + { + "epoch": 2.669838110725468, + "grad_norm": 0.4001712203025818, + "learning_rate": 7.289821593940405e-06, + "loss": 0.1351, + "mean_token_accuracy": 0.9652160155773163, + "num_tokens": 308485810.0, + "step": 110000 + }, + { + "epoch": 2.6722652362806727, + "grad_norm": 0.15460790693759918, + "learning_rate": 7.184341199636368e-06, + "loss": 0.114, + "mean_token_accuracy": 0.9703251016139984, + "num_tokens": 308758085.0, + "step": 110100 + }, + { + "epoch": 2.674692361835878, + "grad_norm": 0.506861686706543, + "learning_rate": 7.079601063369978e-06, + "loss": 0.1153, + "mean_token_accuracy": 0.969694042801857, + "num_tokens": 309024784.0, + "step": 110200 + }, + { + "epoch": 2.677119487391083, + "grad_norm": 0.31870442628860474, + "learning_rate": 6.975602020503846e-06, + "loss": 0.0984, + "mean_token_accuracy": 0.9742819786071777, + "num_tokens": 309312124.0, + "step": 110300 + }, + { + "epoch": 2.6795466129462877, + "grad_norm": 0.4337054193019867, + "learning_rate": 6.872344900489902e-06, + "loss": 0.1076, + "mean_token_accuracy": 0.9716127079725265, + "num_tokens": 309575184.0, + "step": 110400 + }, + { + "epoch": 2.6819737385014926, + "grad_norm": 0.5748428106307983, + "learning_rate": 6.769830526862797e-06, + "loss": 0.1231, + "mean_token_accuracy": 0.9681743580102921, + "num_tokens": 309867557.0, + "step": 110500 + }, + { + "epoch": 2.6844008640566974, + "grad_norm": 0.34900379180908203, + "learning_rate": 6.668059717233399e-06, + "loss": 0.106, + "mean_token_accuracy": 0.9726725119352341, + "num_tokens": 310161974.0, + "step": 110600 + }, + { + "epoch": 2.6868279896119027, + "grad_norm": 0.19695007801055908, + "learning_rate": 6.567033283282187e-06, + "loss": 0.1159, + "mean_token_accuracy": 0.9697107589244842, + "num_tokens": 310447635.0, + "step": 110700 + }, + { + "epoch": 2.6892551151671076, + "grad_norm": 0.7395570874214172, + "learning_rate": 6.466752030752831e-06, + "loss": 0.124, + "mean_token_accuracy": 0.9683857929706573, + "num_tokens": 310731828.0, + "step": 110800 + }, + { + "epoch": 2.6916822407223124, + "grad_norm": 0.26191890239715576, + "learning_rate": 6.36721675944576e-06, + "loss": 0.104, + "mean_token_accuracy": 0.9727838003635406, + "num_tokens": 310998120.0, + "step": 110900 + }, + { + "epoch": 2.6941093662775177, + "grad_norm": 0.24544072151184082, + "learning_rate": 6.268428263211778e-06, + "loss": 0.1191, + "mean_token_accuracy": 0.9691353464126586, + "num_tokens": 311277764.0, + "step": 111000 + }, + { + "epoch": 2.6965364918327226, + "grad_norm": 0.18376019597053528, + "learning_rate": 6.170387329945704e-06, + "loss": 0.1145, + "mean_token_accuracy": 0.9702999091148377, + "num_tokens": 311556366.0, + "step": 111100 + }, + { + "epoch": 2.6989636173879274, + "grad_norm": 0.5311710238456726, + "learning_rate": 6.073094741580143e-06, + "loss": 0.1307, + "mean_token_accuracy": 0.9665592706203461, + "num_tokens": 311845090.0, + "step": 111200 + }, + { + "epoch": 2.7013907429431323, + "grad_norm": 0.31867825984954834, + "learning_rate": 5.976551274079234e-06, + "loss": 0.1135, + "mean_token_accuracy": 0.9705018663406372, + "num_tokens": 312123948.0, + "step": 111300 + }, + { + "epoch": 2.703817868498337, + "grad_norm": 0.24618180096149445, + "learning_rate": 5.880757697432359e-06, + "loss": 0.1306, + "mean_token_accuracy": 0.9667133390903473, + "num_tokens": 312411540.0, + "step": 111400 + }, + { + "epoch": 2.7062449940535425, + "grad_norm": 0.26771923899650574, + "learning_rate": 5.785714775648221e-06, + "loss": 0.1302, + "mean_token_accuracy": 0.966540464758873, + "num_tokens": 312691963.0, + "step": 111500 + }, + { + "epoch": 2.7086721196087473, + "grad_norm": 0.2688523232936859, + "learning_rate": 5.69142326674853e-06, + "loss": 0.1187, + "mean_token_accuracy": 0.9695068788528443, + "num_tokens": 312971327.0, + "step": 111600 + }, + { + "epoch": 2.711099245163952, + "grad_norm": 0.7141240835189819, + "learning_rate": 5.59788392276206e-06, + "loss": 0.1038, + "mean_token_accuracy": 0.9725120347738266, + "num_tokens": 313247248.0, + "step": 111700 + }, + { + "epoch": 2.7135263707191575, + "grad_norm": 0.5409464240074158, + "learning_rate": 5.50509748971868e-06, + "loss": 0.1188, + "mean_token_accuracy": 0.9693516844511032, + "num_tokens": 313516922.0, + "step": 111800 + }, + { + "epoch": 2.7159534962743623, + "grad_norm": 0.3617819845676422, + "learning_rate": 5.413064707643311e-06, + "loss": 0.105, + "mean_token_accuracy": 0.9724315345287323, + "num_tokens": 313788670.0, + "step": 111900 + }, + { + "epoch": 2.718380621829567, + "grad_norm": 0.4758976995944977, + "learning_rate": 5.321786310550147e-06, + "loss": 0.1238, + "mean_token_accuracy": 0.9674117451906205, + "num_tokens": 314086215.0, + "step": 112000 + }, + { + "epoch": 2.7208077473847725, + "grad_norm": 0.5627454519271851, + "learning_rate": 5.231263026436606e-06, + "loss": 0.1153, + "mean_token_accuracy": 0.9697802549600602, + "num_tokens": 314365015.0, + "step": 112100 + }, + { + "epoch": 2.7232348729399773, + "grad_norm": 0.3024059534072876, + "learning_rate": 5.141495577277788e-06, + "loss": 0.1252, + "mean_token_accuracy": 0.9676291835308075, + "num_tokens": 314653127.0, + "step": 112200 + }, + { + "epoch": 2.725661998495182, + "grad_norm": 0.27323031425476074, + "learning_rate": 5.052484679020486e-06, + "loss": 0.1107, + "mean_token_accuracy": 0.9716593581438064, + "num_tokens": 314947107.0, + "step": 112300 + }, + { + "epoch": 2.728089124050387, + "grad_norm": 0.3902416527271271, + "learning_rate": 4.9642310415776046e-06, + "loss": 0.1188, + "mean_token_accuracy": 0.9689583003520965, + "num_tokens": 315235175.0, + "step": 112400 + }, + { + "epoch": 2.730516249605592, + "grad_norm": 0.18243947625160217, + "learning_rate": 4.876735368822427e-06, + "loss": 0.1178, + "mean_token_accuracy": 0.9699918162822724, + "num_tokens": 315517014.0, + "step": 112500 + }, + { + "epoch": 2.732943375160797, + "grad_norm": 0.3229522109031677, + "learning_rate": 4.789998358583081e-06, + "loss": 0.1063, + "mean_token_accuracy": 0.9719061130285263, + "num_tokens": 315800900.0, + "step": 112600 + }, + { + "epoch": 2.735370500716002, + "grad_norm": 0.3077257573604584, + "learning_rate": 4.704020702636902e-06, + "loss": 0.1078, + "mean_token_accuracy": 0.9717499113082886, + "num_tokens": 316069792.0, + "step": 112700 + }, + { + "epoch": 2.737797626271207, + "grad_norm": 0.604117751121521, + "learning_rate": 4.6188030867049035e-06, + "loss": 0.1279, + "mean_token_accuracy": 0.9671464514732361, + "num_tokens": 316351166.0, + "step": 112800 + }, + { + "epoch": 2.7402247518264122, + "grad_norm": 0.5633140206336975, + "learning_rate": 4.534346190446425e-06, + "loss": 0.1267, + "mean_token_accuracy": 0.9674019521474838, + "num_tokens": 316640163.0, + "step": 112900 + }, + { + "epoch": 2.742651877381617, + "grad_norm": 0.2642538249492645, + "learning_rate": 4.4506506874535505e-06, + "loss": 0.1009, + "mean_token_accuracy": 0.9735317534208298, + "num_tokens": 316911400.0, + "step": 113000 + }, + { + "epoch": 2.745079002936822, + "grad_norm": 0.5643557906150818, + "learning_rate": 4.367717245245873e-06, + "loss": 0.1357, + "mean_token_accuracy": 0.9655422532558441, + "num_tokens": 317192284.0, + "step": 113100 + }, + { + "epoch": 2.747506128492027, + "grad_norm": 0.5519342422485352, + "learning_rate": 4.285546525265083e-06, + "loss": 0.1178, + "mean_token_accuracy": 0.9697508090734481, + "num_tokens": 317475335.0, + "step": 113200 + }, + { + "epoch": 2.7499332540472317, + "grad_norm": 0.35835447907447815, + "learning_rate": 4.204139182869715e-06, + "loss": 0.1359, + "mean_token_accuracy": 0.9650754129886627, + "num_tokens": 317759240.0, + "step": 113300 + }, + { + "epoch": 2.752360379602437, + "grad_norm": 0.41857218742370605, + "learning_rate": 4.123495867329996e-06, + "loss": 0.1305, + "mean_token_accuracy": 0.9665146750211716, + "num_tokens": 318050262.0, + "step": 113400 + }, + { + "epoch": 2.754787505157642, + "grad_norm": 0.3513345420360565, + "learning_rate": 4.043617221822527e-06, + "loss": 0.1104, + "mean_token_accuracy": 0.971141186952591, + "num_tokens": 318344518.0, + "step": 113500 + }, + { + "epoch": 2.7572146307128467, + "grad_norm": 0.29043108224868774, + "learning_rate": 3.964503883425253e-06, + "loss": 0.1037, + "mean_token_accuracy": 0.9722469764947891, + "num_tokens": 318624075.0, + "step": 113600 + }, + { + "epoch": 2.759641756268052, + "grad_norm": 0.1855383664369583, + "learning_rate": 3.886156483112346e-06, + "loss": 0.0983, + "mean_token_accuracy": 0.9739230316877365, + "num_tokens": 318887655.0, + "step": 113700 + }, + { + "epoch": 2.762068881823257, + "grad_norm": 0.4655309021472931, + "learning_rate": 3.808575645749235e-06, + "loss": 0.1137, + "mean_token_accuracy": 0.970301919579506, + "num_tokens": 319159124.0, + "step": 113800 + }, + { + "epoch": 2.7644960073784617, + "grad_norm": 0.1889120638370514, + "learning_rate": 3.731761990087501e-06, + "loss": 0.1197, + "mean_token_accuracy": 0.9691438144445419, + "num_tokens": 319445815.0, + "step": 113900 + }, + { + "epoch": 2.7669231329336665, + "grad_norm": 0.478685200214386, + "learning_rate": 3.655716128760045e-06, + "loss": 0.1108, + "mean_token_accuracy": 0.9712050658464432, + "num_tokens": 319719626.0, + "step": 114000 + }, + { + "epoch": 2.7693502584888714, + "grad_norm": 0.37950462102890015, + "learning_rate": 3.580438668276187e-06, + "loss": 0.113, + "mean_token_accuracy": 0.9705785620212555, + "num_tokens": 319995278.0, + "step": 114100 + }, + { + "epoch": 2.7717773840440767, + "grad_norm": 0.5169498920440674, + "learning_rate": 3.50593020901675e-06, + "loss": 0.1332, + "mean_token_accuracy": 0.9659777718782425, + "num_tokens": 320290364.0, + "step": 114200 + }, + { + "epoch": 2.7742045095992816, + "grad_norm": 0.17640161514282227, + "learning_rate": 3.4321913452294076e-06, + "loss": 0.119, + "mean_token_accuracy": 0.9695685094594956, + "num_tokens": 320574116.0, + "step": 114300 + }, + { + "epoch": 2.7766316351544864, + "grad_norm": 0.2589649260044098, + "learning_rate": 3.359222665023809e-06, + "loss": 0.1097, + "mean_token_accuracy": 0.9713182425498963, + "num_tokens": 320864956.0, + "step": 114400 + }, + { + "epoch": 2.7790587607096917, + "grad_norm": 0.47062090039253235, + "learning_rate": 3.287024750366974e-06, + "loss": 0.1069, + "mean_token_accuracy": 0.972015266418457, + "num_tokens": 321149365.0, + "step": 114500 + }, + { + "epoch": 2.7814858862648966, + "grad_norm": 0.40544161200523376, + "learning_rate": 3.215598177078627e-06, + "loss": 0.1115, + "mean_token_accuracy": 0.9708707851171493, + "num_tokens": 321422551.0, + "step": 114600 + }, + { + "epoch": 2.7839130118201014, + "grad_norm": 0.2606247067451477, + "learning_rate": 3.1449435148266036e-06, + "loss": 0.0969, + "mean_token_accuracy": 0.974098664522171, + "num_tokens": 321709417.0, + "step": 114700 + }, + { + "epoch": 2.7863401373753063, + "grad_norm": 0.4302670359611511, + "learning_rate": 3.075061327122286e-06, + "loss": 0.1121, + "mean_token_accuracy": 0.9709526228904725, + "num_tokens": 321989833.0, + "step": 114800 + }, + { + "epoch": 2.788767262930511, + "grad_norm": 0.280659943819046, + "learning_rate": 3.0059521713161507e-06, + "loss": 0.1362, + "mean_token_accuracy": 0.9656502819061279, + "num_tokens": 322258838.0, + "step": 114900 + }, + { + "epoch": 2.7911943884857164, + "grad_norm": 0.3072628676891327, + "learning_rate": 2.9376165985933067e-06, + "loss": 0.1184, + "mean_token_accuracy": 0.9694582325220108, + "num_tokens": 322544203.0, + "step": 115000 + }, + { + "epoch": 2.7936215140409213, + "grad_norm": 0.578826904296875, + "learning_rate": 2.8700551539690757e-06, + "loss": 0.1013, + "mean_token_accuracy": 0.9733384555578232, + "num_tokens": 322836934.0, + "step": 115100 + }, + { + "epoch": 2.796048639596126, + "grad_norm": 0.20911192893981934, + "learning_rate": 2.8032683762846623e-06, + "loss": 0.1415, + "mean_token_accuracy": 0.96405100107193, + "num_tokens": 323122676.0, + "step": 115200 + }, + { + "epoch": 2.7984757651513315, + "grad_norm": 0.5314410924911499, + "learning_rate": 2.737256798202914e-06, + "loss": 0.1256, + "mean_token_accuracy": 0.9679098558425904, + "num_tokens": 323420602.0, + "step": 115300 + }, + { + "epoch": 2.8009028907065363, + "grad_norm": 0.41332700848579407, + "learning_rate": 2.6720209462039126e-06, + "loss": 0.1145, + "mean_token_accuracy": 0.9704767823219299, + "num_tokens": 323701792.0, + "step": 115400 + }, + { + "epoch": 2.803330016261741, + "grad_norm": 0.8187140822410583, + "learning_rate": 2.6075613405809775e-06, + "loss": 0.1099, + "mean_token_accuracy": 0.9712301725149155, + "num_tokens": 323980477.0, + "step": 115500 + }, + { + "epoch": 2.8057571418169465, + "grad_norm": 0.4253501296043396, + "learning_rate": 2.5438784954363693e-06, + "loss": 0.1076, + "mean_token_accuracy": 0.9717456769943237, + "num_tokens": 324265276.0, + "step": 115600 + }, + { + "epoch": 2.8081842673721513, + "grad_norm": 0.40823256969451904, + "learning_rate": 2.4809729186772713e-06, + "loss": 0.1144, + "mean_token_accuracy": 0.9703187984228134, + "num_tokens": 324549587.0, + "step": 115700 + }, + { + "epoch": 2.810611392927356, + "grad_norm": 0.2143670618534088, + "learning_rate": 2.4188451120116473e-06, + "loss": 0.1154, + "mean_token_accuracy": 0.9702856040000916, + "num_tokens": 324826840.0, + "step": 115800 + }, + { + "epoch": 2.813038518482561, + "grad_norm": 0.5098114609718323, + "learning_rate": 2.357495570944368e-06, + "loss": 0.1018, + "mean_token_accuracy": 0.9735467559099198, + "num_tokens": 325106484.0, + "step": 115900 + }, + { + "epoch": 2.815465644037766, + "grad_norm": 0.6569923758506775, + "learning_rate": 2.296924784773169e-06, + "loss": 0.1078, + "mean_token_accuracy": 0.9718588262796402, + "num_tokens": 325383372.0, + "step": 116000 + }, + { + "epoch": 2.817892769592971, + "grad_norm": 0.25976550579071045, + "learning_rate": 2.237133236584732e-06, + "loss": 0.1104, + "mean_token_accuracy": 0.970865985751152, + "num_tokens": 325665856.0, + "step": 116100 + }, + { + "epoch": 2.820319895148176, + "grad_norm": 0.32673880457878113, + "learning_rate": 2.178121403250921e-06, + "loss": 0.1082, + "mean_token_accuracy": 0.9717980152368546, + "num_tokens": 325955490.0, + "step": 116200 + }, + { + "epoch": 2.822747020703381, + "grad_norm": 0.24631185829639435, + "learning_rate": 2.1198897554248866e-06, + "loss": 0.1134, + "mean_token_accuracy": 0.9706428390741348, + "num_tokens": 326229143.0, + "step": 116300 + }, + { + "epoch": 2.825174146258586, + "grad_norm": 0.5602017045021057, + "learning_rate": 2.0624387575373884e-06, + "loss": 0.1253, + "mean_token_accuracy": 0.967502401471138, + "num_tokens": 326501871.0, + "step": 116400 + }, + { + "epoch": 2.827601271813791, + "grad_norm": 0.22061678767204285, + "learning_rate": 2.005768867793023e-06, + "loss": 0.1217, + "mean_token_accuracy": 0.9690318322181701, + "num_tokens": 326780314.0, + "step": 116500 + }, + { + "epoch": 2.830028397368996, + "grad_norm": 0.17672549188137054, + "learning_rate": 1.9498805381666373e-06, + "loss": 0.1036, + "mean_token_accuracy": 0.973377079963684, + "num_tokens": 327053685.0, + "step": 116600 + }, + { + "epoch": 2.8324555229242008, + "grad_norm": 0.28869152069091797, + "learning_rate": 1.894774214399664e-06, + "loss": 0.1211, + "mean_token_accuracy": 0.9686681467294693, + "num_tokens": 327345823.0, + "step": 116700 + }, + { + "epoch": 2.8348826484794056, + "grad_norm": 0.2163640260696411, + "learning_rate": 1.8404503359965685e-06, + "loss": 0.1177, + "mean_token_accuracy": 0.969739454984665, + "num_tokens": 327628687.0, + "step": 116800 + }, + { + "epoch": 2.837309774034611, + "grad_norm": 0.17181088030338287, + "learning_rate": 1.7869093362214428e-06, + "loss": 0.1195, + "mean_token_accuracy": 0.9691154944896698, + "num_tokens": 327909703.0, + "step": 116900 + }, + { + "epoch": 2.839736899589816, + "grad_norm": 0.30073028802871704, + "learning_rate": 1.7341516420943838e-06, + "loss": 0.1421, + "mean_token_accuracy": 0.9641453611850739, + "num_tokens": 328189309.0, + "step": 117000 + }, + { + "epoch": 2.8421640251450206, + "grad_norm": 0.20629960298538208, + "learning_rate": 1.6821776743882412e-06, + "loss": 0.1136, + "mean_token_accuracy": 0.9706392961740494, + "num_tokens": 328481607.0, + "step": 117100 + }, + { + "epoch": 2.844591150700226, + "grad_norm": 0.13787657022476196, + "learning_rate": 1.630987847625165e-06, + "loss": 0.1177, + "mean_token_accuracy": 0.9695311176776886, + "num_tokens": 328760774.0, + "step": 117200 + }, + { + "epoch": 2.847018276255431, + "grad_norm": 0.3100675046443939, + "learning_rate": 1.58058257007333e-06, + "loss": 0.1236, + "mean_token_accuracy": 0.9676383012533187, + "num_tokens": 329044214.0, + "step": 117300 + }, + { + "epoch": 2.8494454018106357, + "grad_norm": 0.3160964846611023, + "learning_rate": 1.5309622437437165e-06, + "loss": 0.1031, + "mean_token_accuracy": 0.9725679057836533, + "num_tokens": 329316724.0, + "step": 117400 + }, + { + "epoch": 2.8518725273658405, + "grad_norm": 0.8570364117622375, + "learning_rate": 1.4821272643868456e-06, + "loss": 0.1245, + "mean_token_accuracy": 0.9679946649074554, + "num_tokens": 329596205.0, + "step": 117500 + }, + { + "epoch": 2.8542996529210454, + "grad_norm": 0.22737392783164978, + "learning_rate": 1.4340780214896377e-06, + "loss": 0.1141, + "mean_token_accuracy": 0.9702414745092391, + "num_tokens": 329879803.0, + "step": 117600 + }, + { + "epoch": 2.8567267784762507, + "grad_norm": 0.21855413913726807, + "learning_rate": 1.386814898272315e-06, + "loss": 0.089, + "mean_token_accuracy": 0.9766771763563156, + "num_tokens": 330172869.0, + "step": 117700 + }, + { + "epoch": 2.8591539040314555, + "grad_norm": 0.20194584131240845, + "learning_rate": 1.3403382716853707e-06, + "loss": 0.1022, + "mean_token_accuracy": 0.9733722984790802, + "num_tokens": 330458642.0, + "step": 117800 + }, + { + "epoch": 2.8615810295866604, + "grad_norm": 0.34213054180145264, + "learning_rate": 1.2946485124065045e-06, + "loss": 0.1123, + "mean_token_accuracy": 0.9709642374515534, + "num_tokens": 330759617.0, + "step": 117900 + }, + { + "epoch": 2.8640081551418657, + "grad_norm": 0.2635839879512787, + "learning_rate": 1.2497459848377357e-06, + "loss": 0.1088, + "mean_token_accuracy": 0.9715518152713776, + "num_tokens": 331047262.0, + "step": 118000 + }, + { + "epoch": 2.8664352806970705, + "grad_norm": 0.2805730998516083, + "learning_rate": 1.205631047102429e-06, + "loss": 0.1164, + "mean_token_accuracy": 0.9698609280586242, + "num_tokens": 331316862.0, + "step": 118100 + }, + { + "epoch": 2.8688624062522754, + "grad_norm": 0.9312669038772583, + "learning_rate": 1.1623040510424733e-06, + "loss": 0.1362, + "mean_token_accuracy": 0.9651178181171417, + "num_tokens": 331590294.0, + "step": 118200 + }, + { + "epoch": 2.8712895318074803, + "grad_norm": 0.48080629110336304, + "learning_rate": 1.119765342215484e-06, + "loss": 0.1112, + "mean_token_accuracy": 0.9709939730167388, + "num_tokens": 331882823.0, + "step": 118300 + }, + { + "epoch": 2.873716657362685, + "grad_norm": 0.34005963802337646, + "learning_rate": 1.0780152598920291e-06, + "loss": 0.1164, + "mean_token_accuracy": 0.9699809849262238, + "num_tokens": 332171535.0, + "step": 118400 + }, + { + "epoch": 2.8761437829178904, + "grad_norm": 0.6795576810836792, + "learning_rate": 1.0370541370529063e-06, + "loss": 0.1318, + "mean_token_accuracy": 0.9663900202512741, + "num_tokens": 332445375.0, + "step": 118500 + }, + { + "epoch": 2.8785709084730953, + "grad_norm": 0.33189335465431213, + "learning_rate": 9.96882300386559e-07, + "loss": 0.1112, + "mean_token_accuracy": 0.9712472832202912, + "num_tokens": 332716597.0, + "step": 118600 + }, + { + "epoch": 2.8809980340283, + "grad_norm": 0.5224782824516296, + "learning_rate": 9.57500070286388e-07, + "loss": 0.1196, + "mean_token_accuracy": 0.9688737159967422, + "num_tokens": 332989752.0, + "step": 118700 + }, + { + "epoch": 2.8834251595835054, + "grad_norm": 0.20184582471847534, + "learning_rate": 9.189077608482088e-07, + "loss": 0.1032, + "mean_token_accuracy": 0.9731306463479996, + "num_tokens": 333287367.0, + "step": 118800 + }, + { + "epoch": 2.8858522851387103, + "grad_norm": 0.2816540002822876, + "learning_rate": 8.811056798678219e-07, + "loss": 0.1252, + "mean_token_accuracy": 0.9680552417039872, + "num_tokens": 333557454.0, + "step": 118900 + }, + { + "epoch": 2.888279410693915, + "grad_norm": 0.7499918341636658, + "learning_rate": 8.440941288384573e-07, + "loss": 0.1041, + "mean_token_accuracy": 0.9726737987995148, + "num_tokens": 333841334.0, + "step": 119000 + }, + { + "epoch": 2.8907065362491204, + "grad_norm": 0.5361903309822083, + "learning_rate": 8.078734029484336e-07, + "loss": 0.1214, + "mean_token_accuracy": 0.9687109112739563, + "num_tokens": 334133058.0, + "step": 119100 + }, + { + "epoch": 2.8931336618043253, + "grad_norm": 0.3385041058063507, + "learning_rate": 7.724437910787918e-07, + "loss": 0.1268, + "mean_token_accuracy": 0.9669984877109528, + "num_tokens": 334413493.0, + "step": 119200 + }, + { + "epoch": 2.89556078735953, + "grad_norm": 0.7476818561553955, + "learning_rate": 7.378055758009761e-07, + "loss": 0.1364, + "mean_token_accuracy": 0.9651480865478516, + "num_tokens": 334683284.0, + "step": 119300 + }, + { + "epoch": 2.897987912914735, + "grad_norm": 0.19367779791355133, + "learning_rate": 7.039590333745794e-07, + "loss": 0.1304, + "mean_token_accuracy": 0.966360149383545, + "num_tokens": 334958476.0, + "step": 119400 + }, + { + "epoch": 2.90041503846994, + "grad_norm": 0.2946068346500397, + "learning_rate": 6.709044337451786e-07, + "loss": 0.116, + "mean_token_accuracy": 0.9700655668973923, + "num_tokens": 335240625.0, + "step": 119500 + }, + { + "epoch": 2.902842164025145, + "grad_norm": 0.2045404314994812, + "learning_rate": 6.386420405421367e-07, + "loss": 0.1196, + "mean_token_accuracy": 0.9686876207590103, + "num_tokens": 335523206.0, + "step": 119600 + }, + { + "epoch": 2.90526928958035, + "grad_norm": 0.27502840757369995, + "learning_rate": 6.071721110765038e-07, + "loss": 0.0995, + "mean_token_accuracy": 0.9739311110973358, + "num_tokens": 335814183.0, + "step": 119700 + }, + { + "epoch": 2.907696415135555, + "grad_norm": 0.43896734714508057, + "learning_rate": 5.764948963389971e-07, + "loss": 0.1198, + "mean_token_accuracy": 0.9692691284418106, + "num_tokens": 336083135.0, + "step": 119800 + }, + { + "epoch": 2.91012354069076, + "grad_norm": 0.1466599702835083, + "learning_rate": 5.46610640997991e-07, + "loss": 0.1114, + "mean_token_accuracy": 0.9710710817575454, + "num_tokens": 336349492.0, + "step": 119900 + }, + { + "epoch": 2.912550666245965, + "grad_norm": 0.5170989036560059, + "learning_rate": 5.175195833975077e-07, + "loss": 0.1317, + "mean_token_accuracy": 0.9662616163492203, + "num_tokens": 336623591.0, + "step": 120000 + }, + { + "epoch": 2.91497779180117, + "grad_norm": 0.22670291364192963, + "learning_rate": 4.892219555554189e-07, + "loss": 0.1206, + "mean_token_accuracy": 0.9689949071407318, + "num_tokens": 336895993.0, + "step": 120100 + }, + { + "epoch": 2.9174049173563747, + "grad_norm": 0.5331772565841675, + "learning_rate": 4.617179831614915e-07, + "loss": 0.116, + "mean_token_accuracy": 0.9697140926122665, + "num_tokens": 337166227.0, + "step": 120200 + }, + { + "epoch": 2.9198320429115796, + "grad_norm": 0.4976193904876709, + "learning_rate": 4.350078855756778e-07, + "loss": 0.1269, + "mean_token_accuracy": 0.9676454555988312, + "num_tokens": 337439450.0, + "step": 120300 + }, + { + "epoch": 2.922259168466785, + "grad_norm": 0.7994723320007324, + "learning_rate": 4.0909187582630625e-07, + "loss": 0.1117, + "mean_token_accuracy": 0.9715138351917267, + "num_tokens": 337722491.0, + "step": 120400 + }, + { + "epoch": 2.9246862940219898, + "grad_norm": 0.3039077818393707, + "learning_rate": 3.839701606083712e-07, + "loss": 0.1238, + "mean_token_accuracy": 0.9683326798677444, + "num_tokens": 338013188.0, + "step": 120500 + }, + { + "epoch": 2.9271134195771946, + "grad_norm": 0.4227394759654999, + "learning_rate": 3.5964294028195676e-07, + "loss": 0.1271, + "mean_token_accuracy": 0.9676005560159683, + "num_tokens": 338299002.0, + "step": 120600 + }, + { + "epoch": 2.9295405451324, + "grad_norm": 0.4402371346950531, + "learning_rate": 3.3611040887058243e-07, + "loss": 0.1078, + "mean_token_accuracy": 0.9718052613735199, + "num_tokens": 338590741.0, + "step": 120700 + }, + { + "epoch": 2.9319676706876048, + "grad_norm": 0.2943148612976074, + "learning_rate": 3.1337275405964874e-07, + "loss": 0.1125, + "mean_token_accuracy": 0.9705305284261704, + "num_tokens": 338893353.0, + "step": 120800 + }, + { + "epoch": 2.9343947962428096, + "grad_norm": 0.3009592294692993, + "learning_rate": 2.91430157194994e-07, + "loss": 0.1046, + "mean_token_accuracy": 0.9726232421398163, + "num_tokens": 339174963.0, + "step": 120900 + }, + { + "epoch": 2.9368219217980145, + "grad_norm": 0.16092665493488312, + "learning_rate": 2.702827932813845e-07, + "loss": 0.1218, + "mean_token_accuracy": 0.9687193125486374, + "num_tokens": 339448824.0, + "step": 121000 + }, + { + "epoch": 2.9392490473532193, + "grad_norm": 0.2029600590467453, + "learning_rate": 2.499308309811599e-07, + "loss": 0.1142, + "mean_token_accuracy": 0.9698378247022629, + "num_tokens": 339724271.0, + "step": 121100 + }, + { + "epoch": 2.9416761729084246, + "grad_norm": 0.25630271434783936, + "learning_rate": 2.303744326128787e-07, + "loss": 0.1213, + "mean_token_accuracy": 0.9688639378547669, + "num_tokens": 340001286.0, + "step": 121200 + }, + { + "epoch": 2.9441032984636295, + "grad_norm": 0.22377553582191467, + "learning_rate": 2.116137541500196e-07, + "loss": 0.1017, + "mean_token_accuracy": 0.9735622280836105, + "num_tokens": 340284957.0, + "step": 121300 + }, + { + "epoch": 2.9465304240188344, + "grad_norm": 0.570862352848053, + "learning_rate": 1.9364894521972654e-07, + "loss": 0.1164, + "mean_token_accuracy": 0.9696594834327698, + "num_tokens": 340561900.0, + "step": 121400 + }, + { + "epoch": 2.9489575495740397, + "grad_norm": 0.24297547340393066, + "learning_rate": 1.7648014910165434e-07, + "loss": 0.1248, + "mean_token_accuracy": 0.9681780362129211, + "num_tokens": 340844943.0, + "step": 121500 + }, + { + "epoch": 2.9513846751292445, + "grad_norm": 0.4155631363391876, + "learning_rate": 1.6010750272678065e-07, + "loss": 0.1016, + "mean_token_accuracy": 0.9732653588056565, + "num_tokens": 341118671.0, + "step": 121600 + }, + { + "epoch": 2.9538118006844494, + "grad_norm": 0.2027473896741867, + "learning_rate": 1.4453113667634022e-07, + "loss": 0.1171, + "mean_token_accuracy": 0.9696639609336853, + "num_tokens": 341405359.0, + "step": 121700 + }, + { + "epoch": 2.9562389262396542, + "grad_norm": 0.3320136070251465, + "learning_rate": 1.2975117518077007e-07, + "loss": 0.1159, + "mean_token_accuracy": 0.9702119541168213, + "num_tokens": 341696526.0, + "step": 121800 + }, + { + "epoch": 2.958666051794859, + "grad_norm": 0.8280718922615051, + "learning_rate": 1.1576773611872149e-07, + "loss": 0.1074, + "mean_token_accuracy": 0.9725628489255905, + "num_tokens": 341991885.0, + "step": 121900 + }, + { + "epoch": 2.9610931773500644, + "grad_norm": 0.3056747615337372, + "learning_rate": 1.0258093101613853e-07, + "loss": 0.098, + "mean_token_accuracy": 0.9740756869316101, + "num_tokens": 342273671.0, + "step": 122000 + }, + { + "epoch": 2.9635203029052692, + "grad_norm": 0.3317147493362427, + "learning_rate": 9.019086504533648e-08, + "loss": 0.1041, + "mean_token_accuracy": 0.9729535073041916, + "num_tokens": 342565263.0, + "step": 122100 + }, + { + "epoch": 2.965947428460474, + "grad_norm": 0.16931921243667603, + "learning_rate": 7.859763702420253e-08, + "loss": 0.1218, + "mean_token_accuracy": 0.9689329898357392, + "num_tokens": 342845461.0, + "step": 122200 + }, + { + "epoch": 2.9683745540156794, + "grad_norm": 0.14930430054664612, + "learning_rate": 6.780133941535205e-08, + "loss": 0.1075, + "mean_token_accuracy": 0.9722885394096374, + "num_tokens": 343124544.0, + "step": 122300 + }, + { + "epoch": 2.9708016795708843, + "grad_norm": 0.17570771276950836, + "learning_rate": 5.780205832545127e-08, + "loss": 0.1289, + "mean_token_accuracy": 0.9669734662771226, + "num_tokens": 343409061.0, + "step": 122400 + }, + { + "epoch": 2.973228805126089, + "grad_norm": 0.27162694931030273, + "learning_rate": 4.859987350449568e-08, + "loss": 0.1088, + "mean_token_accuracy": 0.9716758787631988, + "num_tokens": 343697831.0, + "step": 122500 + }, + { + "epoch": 2.9756559306812944, + "grad_norm": 0.42215201258659363, + "learning_rate": 4.0194858345188324e-08, + "loss": 0.1138, + "mean_token_accuracy": 0.9704602026939392, + "num_tokens": 343979543.0, + "step": 122600 + }, + { + "epoch": 2.9780830562364993, + "grad_norm": 0.18965835869312286, + "learning_rate": 3.258707988231802e-08, + "loss": 0.1147, + "mean_token_accuracy": 0.9699524390697479, + "num_tokens": 344259353.0, + "step": 122700 + }, + { + "epoch": 2.980510181791704, + "grad_norm": 0.424998939037323, + "learning_rate": 2.5776598792282002e-08, + "loss": 0.113, + "mean_token_accuracy": 0.9705224817991257, + "num_tokens": 344541167.0, + "step": 122800 + }, + { + "epoch": 2.982937307346909, + "grad_norm": 0.4654737114906311, + "learning_rate": 1.976346939257523e-08, + "loss": 0.1203, + "mean_token_accuracy": 0.9689262408018112, + "num_tokens": 344820069.0, + "step": 122900 + }, + { + "epoch": 2.985364432902114, + "grad_norm": 0.21616946160793304, + "learning_rate": 1.4547739641335157e-08, + "loss": 0.1163, + "mean_token_accuracy": 0.9700032860040665, + "num_tokens": 345099173.0, + "step": 123000 + }, + { + "epoch": 2.987791558457319, + "grad_norm": 0.41284772753715515, + "learning_rate": 1.0129451137008695e-08, + "loss": 0.0992, + "mean_token_accuracy": 0.9735272574424744, + "num_tokens": 345382836.0, + "step": 123100 + }, + { + "epoch": 2.990218684012524, + "grad_norm": 0.5250483751296997, + "learning_rate": 6.508639117974724e-09, + "loss": 0.1255, + "mean_token_accuracy": 0.9675652432441711, + "num_tokens": 345659959.0, + "step": 123200 + }, + { + "epoch": 2.992645809567729, + "grad_norm": 0.27128636837005615, + "learning_rate": 3.6853324622776463e-09, + "loss": 0.1217, + "mean_token_accuracy": 0.9685781133174897, + "num_tokens": 345952286.0, + "step": 123300 + }, + { + "epoch": 2.995072935122934, + "grad_norm": 0.4348767399787903, + "learning_rate": 1.659553687394233e-09, + "loss": 0.1134, + "mean_token_accuracy": 0.9702136415243149, + "num_tokens": 346215211.0, + "step": 123400 + }, + { + "epoch": 2.997500060678139, + "grad_norm": 0.2931930124759674, + "learning_rate": 4.313189501004011e-10, + "loss": 0.1253, + "mean_token_accuracy": 0.9676681870222091, + "num_tokens": 346504966.0, + "step": 123500 + }, + { + "epoch": 2.999927186233344, + "grad_norm": 0.3851180374622345, + "learning_rate": 6.380462380661811e-13, + "loss": 0.1069, + "mean_token_accuracy": 0.9724932503700257, + "num_tokens": 346775752.0, + "step": 123600 + } + ], + "logging_steps": 100, + "max_steps": 123603, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.932579451067695e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}