{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 123603, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024271255552049706, "grad_norm": 15.667192459106445, "learning_rate": 1.6018121511204597e-06, "loss": 6.3579, "mean_token_accuracy": 0.16570602521300315, "num_tokens": 286328.0, "step": 100 }, { "epoch": 0.004854251110409941, "grad_norm": 12.743103981018066, "learning_rate": 3.219804222959308e-06, "loss": 4.8459, "mean_token_accuracy": 0.23822063744068145, "num_tokens": 566683.0, "step": 200 }, { "epoch": 0.007281376665614912, "grad_norm": 9.71696662902832, "learning_rate": 4.837796294798156e-06, "loss": 3.9201, "mean_token_accuracy": 0.3217995056509972, "num_tokens": 859370.0, "step": 300 }, { "epoch": 0.009708502220819882, "grad_norm": 7.237846851348877, "learning_rate": 6.455788366637004e-06, "loss": 3.425, "mean_token_accuracy": 0.37329041689634324, "num_tokens": 1148220.0, "step": 400 }, { "epoch": 0.012135627776024853, "grad_norm": 7.680373668670654, "learning_rate": 8.073780438475852e-06, "loss": 3.1271, "mean_token_accuracy": 0.4050374338030815, "num_tokens": 1425964.0, "step": 500 }, { "epoch": 0.014562753331229824, "grad_norm": 5.841236114501953, "learning_rate": 9.6917725103147e-06, "loss": 2.8971, "mean_token_accuracy": 0.43935800462961194, "num_tokens": 1691169.0, "step": 600 }, { "epoch": 0.016989878886434794, "grad_norm": 6.139180660247803, "learning_rate": 1.1309764582153549e-05, "loss": 2.7676, "mean_token_accuracy": 0.4539221778512001, "num_tokens": 1971620.0, "step": 700 }, { "epoch": 0.019417004441639765, "grad_norm": 7.90980863571167, "learning_rate": 1.2927756653992395e-05, "loss": 2.6594, "mean_token_accuracy": 0.4727063673734665, "num_tokens": 2256913.0, "step": 800 }, { "epoch": 0.021844129996844736, "grad_norm": 6.0803985595703125, "learning_rate": 1.4545748725831243e-05, "loss": 2.5591, "mean_token_accuracy": 0.48720634669065477, "num_tokens": 2539493.0, "step": 900 }, { "epoch": 0.024271255552049707, "grad_norm": 5.513032913208008, "learning_rate": 1.6163740797670095e-05, "loss": 2.5159, "mean_token_accuracy": 0.4934208956360817, "num_tokens": 2833735.0, "step": 1000 }, { "epoch": 0.026698381107254678, "grad_norm": 5.765513896942139, "learning_rate": 1.778173286950894e-05, "loss": 2.4417, "mean_token_accuracy": 0.5049500033259392, "num_tokens": 3107550.0, "step": 1100 }, { "epoch": 0.02912550666245965, "grad_norm": 4.920138835906982, "learning_rate": 1.9399724941347788e-05, "loss": 2.3992, "mean_token_accuracy": 0.5102156156301498, "num_tokens": 3386017.0, "step": 1200 }, { "epoch": 0.03155263221766462, "grad_norm": 5.399600505828857, "learning_rate": 2.1017717013186636e-05, "loss": 2.3613, "mean_token_accuracy": 0.5177968215942382, "num_tokens": 3677046.0, "step": 1300 }, { "epoch": 0.03397975777286959, "grad_norm": 7.163160800933838, "learning_rate": 2.2635709085025484e-05, "loss": 2.291, "mean_token_accuracy": 0.5300437113642693, "num_tokens": 3950955.0, "step": 1400 }, { "epoch": 0.03640688332807456, "grad_norm": 4.863125324249268, "learning_rate": 2.4253701156864333e-05, "loss": 2.2754, "mean_token_accuracy": 0.5317086404561997, "num_tokens": 4235827.0, "step": 1500 }, { "epoch": 0.03883400888327953, "grad_norm": 5.184915542602539, "learning_rate": 2.587169322870318e-05, "loss": 2.2208, "mean_token_accuracy": 0.5397129538655281, "num_tokens": 4523618.0, "step": 1600 }, { "epoch": 0.041261134438484504, "grad_norm": 4.9580841064453125, "learning_rate": 2.748968530054203e-05, "loss": 2.1542, "mean_token_accuracy": 0.552655583024025, "num_tokens": 4806914.0, "step": 1700 }, { "epoch": 0.04368825999368947, "grad_norm": 4.563178062438965, "learning_rate": 2.9107677372380877e-05, "loss": 2.1012, "mean_token_accuracy": 0.5598031800985336, "num_tokens": 5070432.0, "step": 1800 }, { "epoch": 0.046115385548894446, "grad_norm": 5.044736862182617, "learning_rate": 3.0725669444219726e-05, "loss": 2.0686, "mean_token_accuracy": 0.5646976864337921, "num_tokens": 5348835.0, "step": 1900 }, { "epoch": 0.04854251110409941, "grad_norm": 4.853013038635254, "learning_rate": 3.234366151605857e-05, "loss": 2.0491, "mean_token_accuracy": 0.5689935937523842, "num_tokens": 5644954.0, "step": 2000 }, { "epoch": 0.05096963665930439, "grad_norm": 4.354330062866211, "learning_rate": 3.3961653587897415e-05, "loss": 2.0365, "mean_token_accuracy": 0.5719451946020127, "num_tokens": 5919205.0, "step": 2100 }, { "epoch": 0.053396762214509355, "grad_norm": 5.173582553863525, "learning_rate": 3.5579645659736274e-05, "loss": 1.9987, "mean_token_accuracy": 0.5776787069439888, "num_tokens": 6201450.0, "step": 2200 }, { "epoch": 0.05582388776971433, "grad_norm": 4.418239116668701, "learning_rate": 3.719763773157512e-05, "loss": 1.9778, "mean_token_accuracy": 0.5795711416006089, "num_tokens": 6485924.0, "step": 2300 }, { "epoch": 0.0582510133249193, "grad_norm": 5.564678192138672, "learning_rate": 3.881562980341396e-05, "loss": 1.9145, "mean_token_accuracy": 0.5895163154602051, "num_tokens": 6779709.0, "step": 2400 }, { "epoch": 0.06067813888012427, "grad_norm": 4.3705153465271, "learning_rate": 4.0433621875252815e-05, "loss": 1.938, "mean_token_accuracy": 0.5882080298662186, "num_tokens": 7053740.0, "step": 2500 }, { "epoch": 0.06310526443532924, "grad_norm": 3.672412633895874, "learning_rate": 4.205161394709166e-05, "loss": 1.8963, "mean_token_accuracy": 0.5925538367033005, "num_tokens": 7355267.0, "step": 2600 }, { "epoch": 0.0655323899905342, "grad_norm": 4.00920295715332, "learning_rate": 4.366960601893051e-05, "loss": 1.8685, "mean_token_accuracy": 0.59998017847538, "num_tokens": 7632131.0, "step": 2700 }, { "epoch": 0.06795951554573917, "grad_norm": 3.901211977005005, "learning_rate": 4.5287598090769356e-05, "loss": 1.8885, "mean_token_accuracy": 0.5954222363233567, "num_tokens": 7917324.0, "step": 2800 }, { "epoch": 0.07038664110094416, "grad_norm": 3.7556352615356445, "learning_rate": 4.69055901626082e-05, "loss": 1.8447, "mean_token_accuracy": 0.6014421585202218, "num_tokens": 8197157.0, "step": 2900 }, { "epoch": 0.07281376665614912, "grad_norm": 4.258702278137207, "learning_rate": 4.852358223444705e-05, "loss": 1.8817, "mean_token_accuracy": 0.5944454026222229, "num_tokens": 8486350.0, "step": 3000 }, { "epoch": 0.07524089221135409, "grad_norm": 3.6492583751678467, "learning_rate": 5.01415743062859e-05, "loss": 1.8413, "mean_token_accuracy": 0.6042111480236053, "num_tokens": 8766318.0, "step": 3100 }, { "epoch": 0.07766801776655906, "grad_norm": 3.297952175140381, "learning_rate": 5.175956637812475e-05, "loss": 1.836, "mean_token_accuracy": 0.6041852396726608, "num_tokens": 9041673.0, "step": 3200 }, { "epoch": 0.08009514332176404, "grad_norm": 3.869326114654541, "learning_rate": 5.3377558449963594e-05, "loss": 1.7743, "mean_token_accuracy": 0.6153368014097214, "num_tokens": 9320058.0, "step": 3300 }, { "epoch": 0.08252226887696901, "grad_norm": 4.298006534576416, "learning_rate": 5.499555052180244e-05, "loss": 1.825, "mean_token_accuracy": 0.6060752519965171, "num_tokens": 9607570.0, "step": 3400 }, { "epoch": 0.08494939443217397, "grad_norm": 2.8588080406188965, "learning_rate": 5.661354259364129e-05, "loss": 1.823, "mean_token_accuracy": 0.6073589542508125, "num_tokens": 9891913.0, "step": 3500 }, { "epoch": 0.08737651998737894, "grad_norm": 3.600383996963501, "learning_rate": 5.823153466548015e-05, "loss": 1.8353, "mean_token_accuracy": 0.6057981529831886, "num_tokens": 10171842.0, "step": 3600 }, { "epoch": 0.08980364554258392, "grad_norm": 3.0030224323272705, "learning_rate": 5.9849526737318994e-05, "loss": 1.7668, "mean_token_accuracy": 0.6160938149690628, "num_tokens": 10443178.0, "step": 3700 }, { "epoch": 0.09223077109778889, "grad_norm": 3.0406975746154785, "learning_rate": 6.146751880915784e-05, "loss": 1.7535, "mean_token_accuracy": 0.6178180432319641, "num_tokens": 10732155.0, "step": 3800 }, { "epoch": 0.09465789665299386, "grad_norm": 2.917201519012451, "learning_rate": 6.308551088099668e-05, "loss": 1.7572, "mean_token_accuracy": 0.6191618722677231, "num_tokens": 11009552.0, "step": 3900 }, { "epoch": 0.09708502220819883, "grad_norm": 3.206831932067871, "learning_rate": 6.470350295283554e-05, "loss": 1.749, "mean_token_accuracy": 0.6184579783678055, "num_tokens": 11297388.0, "step": 4000 }, { "epoch": 0.09951214776340381, "grad_norm": 3.438751459121704, "learning_rate": 6.632149502467439e-05, "loss": 1.7667, "mean_token_accuracy": 0.6156092411279679, "num_tokens": 11570862.0, "step": 4100 }, { "epoch": 0.10193927331860878, "grad_norm": 8.892073631286621, "learning_rate": 6.793948709651323e-05, "loss": 1.7377, "mean_token_accuracy": 0.6204822993278504, "num_tokens": 11846985.0, "step": 4200 }, { "epoch": 0.10436639887381374, "grad_norm": 3.2197091579437256, "learning_rate": 6.955747916835208e-05, "loss": 1.7421, "mean_token_accuracy": 0.6201369935274124, "num_tokens": 12125408.0, "step": 4300 }, { "epoch": 0.10679352442901871, "grad_norm": 2.925734281539917, "learning_rate": 7.117547124019092e-05, "loss": 1.7426, "mean_token_accuracy": 0.6202259063720703, "num_tokens": 12413939.0, "step": 4400 }, { "epoch": 0.10922064998422368, "grad_norm": 3.6164560317993164, "learning_rate": 7.279346331202978e-05, "loss": 1.7318, "mean_token_accuracy": 0.6214462512731552, "num_tokens": 12696604.0, "step": 4500 }, { "epoch": 0.11164777553942866, "grad_norm": 3.639244556427002, "learning_rate": 7.441145538386862e-05, "loss": 1.7056, "mean_token_accuracy": 0.6253874599933624, "num_tokens": 12969852.0, "step": 4600 }, { "epoch": 0.11407490109463363, "grad_norm": 2.9974870681762695, "learning_rate": 7.602944745570747e-05, "loss": 1.7024, "mean_token_accuracy": 0.62675716817379, "num_tokens": 13248912.0, "step": 4700 }, { "epoch": 0.1165020266498386, "grad_norm": 2.8196988105773926, "learning_rate": 7.764743952754631e-05, "loss": 1.7229, "mean_token_accuracy": 0.6227961453795433, "num_tokens": 13545940.0, "step": 4800 }, { "epoch": 0.11892915220504356, "grad_norm": 3.317641496658325, "learning_rate": 7.926543159938516e-05, "loss": 1.7493, "mean_token_accuracy": 0.6176340329647064, "num_tokens": 13823732.0, "step": 4900 }, { "epoch": 0.12135627776024854, "grad_norm": 2.6800076961517334, "learning_rate": 8.088342367122402e-05, "loss": 1.7526, "mean_token_accuracy": 0.6184008997678757, "num_tokens": 14101813.0, "step": 5000 }, { "epoch": 0.12378340331545351, "grad_norm": 3.6860008239746094, "learning_rate": 8.250141574306286e-05, "loss": 1.7355, "mean_token_accuracy": 0.6187130987644196, "num_tokens": 14396116.0, "step": 5100 }, { "epoch": 0.12621052887065848, "grad_norm": 3.056393623352051, "learning_rate": 8.41194078149017e-05, "loss": 1.6988, "mean_token_accuracy": 0.6259019857645035, "num_tokens": 14670147.0, "step": 5200 }, { "epoch": 0.12863765442586345, "grad_norm": 3.0868637561798096, "learning_rate": 8.573739988674055e-05, "loss": 1.6956, "mean_token_accuracy": 0.627558496594429, "num_tokens": 14958834.0, "step": 5300 }, { "epoch": 0.1310647799810684, "grad_norm": 4.050454139709473, "learning_rate": 8.73553919585794e-05, "loss": 1.6534, "mean_token_accuracy": 0.63445347905159, "num_tokens": 15237854.0, "step": 5400 }, { "epoch": 0.13349190553627338, "grad_norm": 3.293698787689209, "learning_rate": 8.897338403041825e-05, "loss": 1.6915, "mean_token_accuracy": 0.6279270613193512, "num_tokens": 15516061.0, "step": 5500 }, { "epoch": 0.13591903109147835, "grad_norm": 2.8466482162475586, "learning_rate": 9.059137610225711e-05, "loss": 1.6734, "mean_token_accuracy": 0.6312123596668243, "num_tokens": 15805438.0, "step": 5600 }, { "epoch": 0.13834615664668334, "grad_norm": 2.694309949874878, "learning_rate": 9.220936817409596e-05, "loss": 1.7153, "mean_token_accuracy": 0.625066881775856, "num_tokens": 16080134.0, "step": 5700 }, { "epoch": 0.1407732822018883, "grad_norm": 3.828099489212036, "learning_rate": 9.38273602459348e-05, "loss": 1.6925, "mean_token_accuracy": 0.6267722380161286, "num_tokens": 16342095.0, "step": 5800 }, { "epoch": 0.14320040775709328, "grad_norm": 2.3940014839172363, "learning_rate": 9.544535231777365e-05, "loss": 1.6671, "mean_token_accuracy": 0.6302274829149246, "num_tokens": 16617443.0, "step": 5900 }, { "epoch": 0.14562753331229825, "grad_norm": 3.202986240386963, "learning_rate": 9.706334438961249e-05, "loss": 1.6587, "mean_token_accuracy": 0.6335983765125275, "num_tokens": 16886715.0, "step": 6000 }, { "epoch": 0.1480546588675032, "grad_norm": 2.267845630645752, "learning_rate": 9.868133646145135e-05, "loss": 1.6594, "mean_token_accuracy": 0.6337177205085754, "num_tokens": 17157399.0, "step": 6100 }, { "epoch": 0.15048178442270818, "grad_norm": 3.8720905780792236, "learning_rate": 0.00010029932853329018, "loss": 1.6464, "mean_token_accuracy": 0.6355876386165619, "num_tokens": 17431964.0, "step": 6200 }, { "epoch": 0.15290890997791315, "grad_norm": 2.3187131881713867, "learning_rate": 0.00010191732060512904, "loss": 1.6635, "mean_token_accuracy": 0.6327675718069077, "num_tokens": 17723620.0, "step": 6300 }, { "epoch": 0.15533603553311812, "grad_norm": 3.7374343872070312, "learning_rate": 0.00010353531267696787, "loss": 1.6735, "mean_token_accuracy": 0.6294235897064209, "num_tokens": 18017276.0, "step": 6400 }, { "epoch": 0.1577631610883231, "grad_norm": 3.3306126594543457, "learning_rate": 0.00010515330474880673, "loss": 1.709, "mean_token_accuracy": 0.6271786081790924, "num_tokens": 18286383.0, "step": 6500 }, { "epoch": 0.16019028664352808, "grad_norm": 2.7035059928894043, "learning_rate": 0.00010677129682064559, "loss": 1.714, "mean_token_accuracy": 0.6251045203208924, "num_tokens": 18568579.0, "step": 6600 }, { "epoch": 0.16261741219873305, "grad_norm": 5.372984409332275, "learning_rate": 0.00010838928889248443, "loss": 1.715, "mean_token_accuracy": 0.6235057324171066, "num_tokens": 18859264.0, "step": 6700 }, { "epoch": 0.16504453775393801, "grad_norm": 2.6342880725860596, "learning_rate": 0.00011000728096432329, "loss": 1.6605, "mean_token_accuracy": 0.6317295587062836, "num_tokens": 19155847.0, "step": 6800 }, { "epoch": 0.16747166330914298, "grad_norm": 3.171790599822998, "learning_rate": 0.00011162527303616212, "loss": 1.6404, "mean_token_accuracy": 0.6374486458301544, "num_tokens": 19433838.0, "step": 6900 }, { "epoch": 0.16989878886434795, "grad_norm": 2.1029253005981445, "learning_rate": 0.00011324326510800098, "loss": 1.6118, "mean_token_accuracy": 0.642929772734642, "num_tokens": 19721269.0, "step": 7000 }, { "epoch": 0.17232591441955292, "grad_norm": 2.6623852252960205, "learning_rate": 0.00011486125717983983, "loss": 1.6338, "mean_token_accuracy": 0.637419713139534, "num_tokens": 20013973.0, "step": 7100 }, { "epoch": 0.17475303997475788, "grad_norm": 2.519373655319214, "learning_rate": 0.00011647924925167868, "loss": 1.6455, "mean_token_accuracy": 0.6351126056909561, "num_tokens": 20286212.0, "step": 7200 }, { "epoch": 0.17718016552996285, "grad_norm": 2.475982427597046, "learning_rate": 0.00011809724132351752, "loss": 1.664, "mean_token_accuracy": 0.6314303702116013, "num_tokens": 20578569.0, "step": 7300 }, { "epoch": 0.17960729108516785, "grad_norm": 2.241504430770874, "learning_rate": 0.00011971523339535637, "loss": 1.6157, "mean_token_accuracy": 0.6422726994752884, "num_tokens": 20862395.0, "step": 7400 }, { "epoch": 0.18203441664037281, "grad_norm": 2.0290799140930176, "learning_rate": 0.0001213332254671952, "loss": 1.6557, "mean_token_accuracy": 0.633880580663681, "num_tokens": 21137936.0, "step": 7500 }, { "epoch": 0.18446154219557778, "grad_norm": 2.4877192974090576, "learning_rate": 0.00012295121753903406, "loss": 1.614, "mean_token_accuracy": 0.6418092441558838, "num_tokens": 21430954.0, "step": 7600 }, { "epoch": 0.18688866775078275, "grad_norm": 2.3244476318359375, "learning_rate": 0.0001245692096108729, "loss": 1.618, "mean_token_accuracy": 0.6409686493873596, "num_tokens": 21700944.0, "step": 7700 }, { "epoch": 0.18931579330598772, "grad_norm": 5.238540172576904, "learning_rate": 0.00012618720168271175, "loss": 1.6292, "mean_token_accuracy": 0.6387823939323425, "num_tokens": 21994418.0, "step": 7800 }, { "epoch": 0.19174291886119268, "grad_norm": 2.0987367630004883, "learning_rate": 0.0001278051937545506, "loss": 1.6474, "mean_token_accuracy": 0.6356805974245071, "num_tokens": 22288421.0, "step": 7900 }, { "epoch": 0.19417004441639765, "grad_norm": 2.3273093700408936, "learning_rate": 0.00012942318582638947, "loss": 1.639, "mean_token_accuracy": 0.6383429461717606, "num_tokens": 22557693.0, "step": 8000 }, { "epoch": 0.19659716997160262, "grad_norm": 2.5615649223327637, "learning_rate": 0.0001310411778982283, "loss": 1.5919, "mean_token_accuracy": 0.6441801345348358, "num_tokens": 22840289.0, "step": 8100 }, { "epoch": 0.19902429552680762, "grad_norm": 2.6359269618988037, "learning_rate": 0.00013265916997006716, "loss": 1.6807, "mean_token_accuracy": 0.6301674485206604, "num_tokens": 23128125.0, "step": 8200 }, { "epoch": 0.20145142108201258, "grad_norm": 2.7891812324523926, "learning_rate": 0.000134277162041906, "loss": 1.6412, "mean_token_accuracy": 0.635779327750206, "num_tokens": 23408204.0, "step": 8300 }, { "epoch": 0.20387854663721755, "grad_norm": 2.309911012649536, "learning_rate": 0.00013589515411374485, "loss": 1.6086, "mean_token_accuracy": 0.64215003490448, "num_tokens": 23690150.0, "step": 8400 }, { "epoch": 0.20630567219242252, "grad_norm": 2.182452917098999, "learning_rate": 0.0001375131461855837, "loss": 1.6174, "mean_token_accuracy": 0.6406794637441635, "num_tokens": 23968782.0, "step": 8500 }, { "epoch": 0.20873279774762749, "grad_norm": 2.322374105453491, "learning_rate": 0.00013913113825742254, "loss": 1.5783, "mean_token_accuracy": 0.6475563657283783, "num_tokens": 24255904.0, "step": 8600 }, { "epoch": 0.21115992330283245, "grad_norm": 2.552272081375122, "learning_rate": 0.0001407491303292614, "loss": 1.5751, "mean_token_accuracy": 0.6480100709199905, "num_tokens": 24532151.0, "step": 8700 }, { "epoch": 0.21358704885803742, "grad_norm": 1.9336109161376953, "learning_rate": 0.00014236712240110023, "loss": 1.583, "mean_token_accuracy": 0.646995313167572, "num_tokens": 24820503.0, "step": 8800 }, { "epoch": 0.2160141744132424, "grad_norm": 2.162743330001831, "learning_rate": 0.0001439851144729391, "loss": 1.5831, "mean_token_accuracy": 0.646264790892601, "num_tokens": 25096658.0, "step": 8900 }, { "epoch": 0.21844129996844736, "grad_norm": 2.5591320991516113, "learning_rate": 0.00014560310654477795, "loss": 1.5794, "mean_token_accuracy": 0.6471728563308716, "num_tokens": 25361541.0, "step": 9000 }, { "epoch": 0.22086842552365235, "grad_norm": 2.754995822906494, "learning_rate": 0.0001472210986166168, "loss": 1.6184, "mean_token_accuracy": 0.6407477951049805, "num_tokens": 25643336.0, "step": 9100 }, { "epoch": 0.22329555107885732, "grad_norm": 2.716937780380249, "learning_rate": 0.00014883909068845564, "loss": 1.5948, "mean_token_accuracy": 0.6445745784044266, "num_tokens": 25936458.0, "step": 9200 }, { "epoch": 0.22572267663406229, "grad_norm": 2.3055243492126465, "learning_rate": 0.0001504570827602945, "loss": 1.622, "mean_token_accuracy": 0.6412751048803329, "num_tokens": 26230438.0, "step": 9300 }, { "epoch": 0.22814980218926725, "grad_norm": 2.0069618225097656, "learning_rate": 0.00015207507483213333, "loss": 1.6072, "mean_token_accuracy": 0.642265853881836, "num_tokens": 26519081.0, "step": 9400 }, { "epoch": 0.23057692774447222, "grad_norm": 2.1340370178222656, "learning_rate": 0.00015369306690397218, "loss": 1.5986, "mean_token_accuracy": 0.6452283936738968, "num_tokens": 26794162.0, "step": 9500 }, { "epoch": 0.2330040532996772, "grad_norm": 2.440425395965576, "learning_rate": 0.00015531105897581101, "loss": 1.6111, "mean_token_accuracy": 0.6408109438419342, "num_tokens": 27070997.0, "step": 9600 }, { "epoch": 0.23543117885488216, "grad_norm": 4.2947893142700195, "learning_rate": 0.00015692905104764987, "loss": 1.5983, "mean_token_accuracy": 0.6435111856460571, "num_tokens": 27358764.0, "step": 9700 }, { "epoch": 0.23785830441008712, "grad_norm": 2.020979642868042, "learning_rate": 0.0001585470431194887, "loss": 1.6451, "mean_token_accuracy": 0.6358766305446625, "num_tokens": 27646756.0, "step": 9800 }, { "epoch": 0.2402854299652921, "grad_norm": 2.3321073055267334, "learning_rate": 0.00016016503519132756, "loss": 1.6128, "mean_token_accuracy": 0.6417472112178803, "num_tokens": 27933393.0, "step": 9900 }, { "epoch": 0.2427125555204971, "grad_norm": 1.9731568098068237, "learning_rate": 0.00016178302726316642, "loss": 1.5553, "mean_token_accuracy": 0.6521957015991211, "num_tokens": 28225907.0, "step": 10000 }, { "epoch": 0.24513968107570205, "grad_norm": 1.7572153806686401, "learning_rate": 0.00016340101933500528, "loss": 1.6428, "mean_token_accuracy": 0.6366815239191055, "num_tokens": 28502556.0, "step": 10100 }, { "epoch": 0.24756680663090702, "grad_norm": 2.120887041091919, "learning_rate": 0.0001650190114068441, "loss": 1.5659, "mean_token_accuracy": 0.6504863280057908, "num_tokens": 28787215.0, "step": 10200 }, { "epoch": 0.249993932186112, "grad_norm": 1.741084337234497, "learning_rate": 0.00016663700347868297, "loss": 1.6045, "mean_token_accuracy": 0.6414517712593079, "num_tokens": 29077020.0, "step": 10300 }, { "epoch": 0.25242105774131696, "grad_norm": 2.038705587387085, "learning_rate": 0.00016825499555052183, "loss": 1.5826, "mean_token_accuracy": 0.6473750042915344, "num_tokens": 29347254.0, "step": 10400 }, { "epoch": 0.2548481832965219, "grad_norm": 1.8374431133270264, "learning_rate": 0.00016987298762236066, "loss": 1.5794, "mean_token_accuracy": 0.6442887282371521, "num_tokens": 29628082.0, "step": 10500 }, { "epoch": 0.2572753088517269, "grad_norm": 1.7167983055114746, "learning_rate": 0.00017149097969419952, "loss": 1.5906, "mean_token_accuracy": 0.6459539052844048, "num_tokens": 29901377.0, "step": 10600 }, { "epoch": 0.25970243440693186, "grad_norm": 1.7779451608657837, "learning_rate": 0.00017310897176603835, "loss": 1.589, "mean_token_accuracy": 0.6448972916603088, "num_tokens": 30176021.0, "step": 10700 }, { "epoch": 0.2621295599621368, "grad_norm": 1.6588525772094727, "learning_rate": 0.0001747269638378772, "loss": 1.6071, "mean_token_accuracy": 0.6433945900201797, "num_tokens": 30457966.0, "step": 10800 }, { "epoch": 0.2645566855173418, "grad_norm": 2.377293109893799, "learning_rate": 0.00017634495590971604, "loss": 1.6186, "mean_token_accuracy": 0.6401382917165757, "num_tokens": 30734208.0, "step": 10900 }, { "epoch": 0.26698381107254676, "grad_norm": 2.399536609649658, "learning_rate": 0.0001779629479815549, "loss": 1.5837, "mean_token_accuracy": 0.6470131504535676, "num_tokens": 31020843.0, "step": 11000 }, { "epoch": 0.26941093662775173, "grad_norm": 1.5932106971740723, "learning_rate": 0.00017958094005339376, "loss": 1.632, "mean_token_accuracy": 0.6387262684106827, "num_tokens": 31302898.0, "step": 11100 }, { "epoch": 0.2718380621829567, "grad_norm": 2.0135538578033447, "learning_rate": 0.0001811989321252326, "loss": 1.5796, "mean_token_accuracy": 0.6479561811685562, "num_tokens": 31573395.0, "step": 11200 }, { "epoch": 0.2742651877381617, "grad_norm": 2.4460792541503906, "learning_rate": 0.00018281692419707144, "loss": 1.5639, "mean_token_accuracy": 0.6510553556680679, "num_tokens": 31859176.0, "step": 11300 }, { "epoch": 0.2766923132933667, "grad_norm": 1.658430814743042, "learning_rate": 0.0001844349162689103, "loss": 1.5697, "mean_token_accuracy": 0.6498629409074783, "num_tokens": 32143776.0, "step": 11400 }, { "epoch": 0.27911943884857165, "grad_norm": 1.603789210319519, "learning_rate": 0.00018605290834074913, "loss": 1.6203, "mean_token_accuracy": 0.6409107285737992, "num_tokens": 32419826.0, "step": 11500 }, { "epoch": 0.2815465644037766, "grad_norm": 1.9514955282211304, "learning_rate": 0.000187670900412588, "loss": 1.5774, "mean_token_accuracy": 0.6493115669488907, "num_tokens": 32682366.0, "step": 11600 }, { "epoch": 0.2839736899589816, "grad_norm": 4.2015838623046875, "learning_rate": 0.00018928889248442682, "loss": 1.56, "mean_token_accuracy": 0.653014543056488, "num_tokens": 32943242.0, "step": 11700 }, { "epoch": 0.28640081551418656, "grad_norm": 2.5987889766693115, "learning_rate": 0.00019090688455626568, "loss": 1.6036, "mean_token_accuracy": 0.6438838475942612, "num_tokens": 33224665.0, "step": 11800 }, { "epoch": 0.2888279410693915, "grad_norm": 1.8440401554107666, "learning_rate": 0.00019252487662810451, "loss": 1.5626, "mean_token_accuracy": 0.65259059548378, "num_tokens": 33515048.0, "step": 11900 }, { "epoch": 0.2912550666245965, "grad_norm": 1.672186017036438, "learning_rate": 0.00019414286869994337, "loss": 1.5928, "mean_token_accuracy": 0.6442960953712463, "num_tokens": 33805643.0, "step": 12000 }, { "epoch": 0.29368219217980146, "grad_norm": 1.9079508781433105, "learning_rate": 0.00019576086077178223, "loss": 1.5616, "mean_token_accuracy": 0.6513192290067673, "num_tokens": 34084649.0, "step": 12100 }, { "epoch": 0.2961093177350064, "grad_norm": 1.9318201541900635, "learning_rate": 0.0001973788528436211, "loss": 1.5428, "mean_token_accuracy": 0.6533740103244782, "num_tokens": 34351025.0, "step": 12200 }, { "epoch": 0.2985364432902114, "grad_norm": 1.8940328359603882, "learning_rate": 0.00019899684491545995, "loss": 1.5848, "mean_token_accuracy": 0.6487006124854088, "num_tokens": 34631544.0, "step": 12300 }, { "epoch": 0.30096356884541636, "grad_norm": 1.7644509077072144, "learning_rate": 0.00019999994241633214, "loss": 1.5272, "mean_token_accuracy": 0.6580360817909241, "num_tokens": 34914627.0, "step": 12400 }, { "epoch": 0.30339069440062133, "grad_norm": 2.096867322921753, "learning_rate": 0.0001999992405664207, "loss": 1.5403, "mean_token_accuracy": 0.6525964337587357, "num_tokens": 35193026.0, "step": 12500 }, { "epoch": 0.3058178199558263, "grad_norm": 1.6612098217010498, "learning_rate": 0.00019999774116529276, "loss": 1.593, "mean_token_accuracy": 0.6469535285234451, "num_tokens": 35472360.0, "step": 12600 }, { "epoch": 0.30824494551103127, "grad_norm": 1.5085548162460327, "learning_rate": 0.00019999544422490684, "loss": 1.6142, "mean_token_accuracy": 0.6417314970493316, "num_tokens": 35764712.0, "step": 12700 }, { "epoch": 0.31067207106623623, "grad_norm": 2.030015468597412, "learning_rate": 0.00019999234976358237, "loss": 1.584, "mean_token_accuracy": 0.6494407910108566, "num_tokens": 36041894.0, "step": 12800 }, { "epoch": 0.3130991966214412, "grad_norm": 2.0102243423461914, "learning_rate": 0.00019998845780599945, "loss": 1.5482, "mean_token_accuracy": 0.6535135221481323, "num_tokens": 36307589.0, "step": 12900 }, { "epoch": 0.3155263221766462, "grad_norm": 1.5919320583343506, "learning_rate": 0.0001999837683831987, "loss": 1.5708, "mean_token_accuracy": 0.6485945302248001, "num_tokens": 36586771.0, "step": 13000 }, { "epoch": 0.3179534477318512, "grad_norm": 1.6979924440383911, "learning_rate": 0.0001999782815325809, "loss": 1.5185, "mean_token_accuracy": 0.6575907075405121, "num_tokens": 36855115.0, "step": 13100 }, { "epoch": 0.32038057328705616, "grad_norm": 1.7762399911880493, "learning_rate": 0.00019997199729790684, "loss": 1.5585, "mean_token_accuracy": 0.652558314204216, "num_tokens": 37153394.0, "step": 13200 }, { "epoch": 0.3228076988422611, "grad_norm": 1.9209295511245728, "learning_rate": 0.00019996491572929695, "loss": 1.4696, "mean_token_accuracy": 0.6686202174425125, "num_tokens": 37447493.0, "step": 13300 }, { "epoch": 0.3252348243974661, "grad_norm": 1.7967995405197144, "learning_rate": 0.00019995703688323072, "loss": 1.5193, "mean_token_accuracy": 0.6576752889156342, "num_tokens": 37719538.0, "step": 13400 }, { "epoch": 0.32766194995267106, "grad_norm": 1.6311265230178833, "learning_rate": 0.0001999483608225465, "loss": 1.4999, "mean_token_accuracy": 0.6639257234334945, "num_tokens": 37996152.0, "step": 13500 }, { "epoch": 0.33008907550787603, "grad_norm": 1.8700850009918213, "learning_rate": 0.00019993888761644083, "loss": 1.5559, "mean_token_accuracy": 0.6535019212961197, "num_tokens": 38272368.0, "step": 13600 }, { "epoch": 0.332516201063081, "grad_norm": 1.6935664415359497, "learning_rate": 0.00019992861734046796, "loss": 1.5198, "mean_token_accuracy": 0.6603120654821396, "num_tokens": 38534001.0, "step": 13700 }, { "epoch": 0.33494332661828596, "grad_norm": 1.577829360961914, "learning_rate": 0.0001999175500765392, "loss": 1.504, "mean_token_accuracy": 0.6614657628536225, "num_tokens": 38811884.0, "step": 13800 }, { "epoch": 0.33737045217349093, "grad_norm": 1.339965581893921, "learning_rate": 0.00019990568591292238, "loss": 1.5434, "mean_token_accuracy": 0.6548532199859619, "num_tokens": 39082290.0, "step": 13900 }, { "epoch": 0.3397975777286959, "grad_norm": 1.5873082876205444, "learning_rate": 0.0001998930249442409, "loss": 1.5079, "mean_token_accuracy": 0.6613044041395187, "num_tokens": 39364724.0, "step": 14000 }, { "epoch": 0.34222470328390087, "grad_norm": 1.4517920017242432, "learning_rate": 0.00019987956727147334, "loss": 1.5268, "mean_token_accuracy": 0.655518969297409, "num_tokens": 39663050.0, "step": 14100 }, { "epoch": 0.34465182883910583, "grad_norm": 1.920789361000061, "learning_rate": 0.0001998653130019523, "loss": 1.4819, "mean_token_accuracy": 0.6665922373533248, "num_tokens": 39950704.0, "step": 14200 }, { "epoch": 0.3470789543943108, "grad_norm": 1.4261436462402344, "learning_rate": 0.00019985026224936375, "loss": 1.4286, "mean_token_accuracy": 0.6748318231105804, "num_tokens": 40244181.0, "step": 14300 }, { "epoch": 0.34950607994951577, "grad_norm": 1.6467373371124268, "learning_rate": 0.00019983441513374603, "loss": 1.4598, "mean_token_accuracy": 0.6711004292964935, "num_tokens": 40510184.0, "step": 14400 }, { "epoch": 0.35193320550472074, "grad_norm": 1.2494763135910034, "learning_rate": 0.000199817771781489, "loss": 1.4752, "mean_token_accuracy": 0.665248526930809, "num_tokens": 40784045.0, "step": 14500 }, { "epoch": 0.3543603310599257, "grad_norm": 1.2977705001831055, "learning_rate": 0.00019980033232533292, "loss": 1.4809, "mean_token_accuracy": 0.6651502713561058, "num_tokens": 41075689.0, "step": 14600 }, { "epoch": 0.3567874566151307, "grad_norm": 1.475791096687317, "learning_rate": 0.00019978209690436747, "loss": 1.397, "mean_token_accuracy": 0.6825127446651459, "num_tokens": 41354944.0, "step": 14700 }, { "epoch": 0.3592145821703357, "grad_norm": 1.385689377784729, "learning_rate": 0.00019976306566403052, "loss": 1.5037, "mean_token_accuracy": 0.6625669264793396, "num_tokens": 41637807.0, "step": 14800 }, { "epoch": 0.36164170772554066, "grad_norm": 1.500313401222229, "learning_rate": 0.00019974323875610712, "loss": 1.515, "mean_token_accuracy": 0.659018731713295, "num_tokens": 41921489.0, "step": 14900 }, { "epoch": 0.36406883328074563, "grad_norm": 1.7327656745910645, "learning_rate": 0.00019972261633872826, "loss": 1.4263, "mean_token_accuracy": 0.6765751355886459, "num_tokens": 42203300.0, "step": 15000 }, { "epoch": 0.3664959588359506, "grad_norm": 1.9555410146713257, "learning_rate": 0.00019970119857636948, "loss": 1.4265, "mean_token_accuracy": 0.6769354450702667, "num_tokens": 42482247.0, "step": 15100 }, { "epoch": 0.36892308439115556, "grad_norm": 2.1408140659332275, "learning_rate": 0.00019967898563984978, "loss": 1.4401, "mean_token_accuracy": 0.6747738122940063, "num_tokens": 42755286.0, "step": 15200 }, { "epoch": 0.37135020994636053, "grad_norm": 1.3993452787399292, "learning_rate": 0.00019965597770632993, "loss": 1.4457, "mean_token_accuracy": 0.6707744264602661, "num_tokens": 43036164.0, "step": 15300 }, { "epoch": 0.3737773355015655, "grad_norm": 1.423827886581421, "learning_rate": 0.00019963217495931149, "loss": 1.3852, "mean_token_accuracy": 0.682267724275589, "num_tokens": 43324843.0, "step": 15400 }, { "epoch": 0.37620446105677047, "grad_norm": 1.3678686618804932, "learning_rate": 0.00019960757758863488, "loss": 1.4738, "mean_token_accuracy": 0.6674029171466828, "num_tokens": 43616961.0, "step": 15500 }, { "epoch": 0.37863158661197543, "grad_norm": 1.4727699756622314, "learning_rate": 0.00019958218579047837, "loss": 1.4272, "mean_token_accuracy": 0.6761853212118148, "num_tokens": 43882893.0, "step": 15600 }, { "epoch": 0.3810587121671804, "grad_norm": 1.4919428825378418, "learning_rate": 0.00019955599976735598, "loss": 1.413, "mean_token_accuracy": 0.6803244209289551, "num_tokens": 44155556.0, "step": 15700 }, { "epoch": 0.38348583772238537, "grad_norm": 1.481824278831482, "learning_rate": 0.0001995290197281163, "loss": 1.4245, "mean_token_accuracy": 0.6783695650100708, "num_tokens": 44427379.0, "step": 15800 }, { "epoch": 0.38591296327759034, "grad_norm": 1.343738317489624, "learning_rate": 0.00019950124588794053, "loss": 1.4281, "mean_token_accuracy": 0.6756082916259766, "num_tokens": 44704217.0, "step": 15900 }, { "epoch": 0.3883400888327953, "grad_norm": 1.9732495546340942, "learning_rate": 0.00019947267846834102, "loss": 1.4164, "mean_token_accuracy": 0.6776945400238037, "num_tokens": 44994003.0, "step": 16000 }, { "epoch": 0.3907672143880003, "grad_norm": 1.4792394638061523, "learning_rate": 0.00019944331769715926, "loss": 1.3882, "mean_token_accuracy": 0.6841700464487076, "num_tokens": 45262965.0, "step": 16100 }, { "epoch": 0.39319433994320524, "grad_norm": 1.5121392011642456, "learning_rate": 0.00019941316380856424, "loss": 1.4293, "mean_token_accuracy": 0.6759150516986847, "num_tokens": 45544990.0, "step": 16200 }, { "epoch": 0.3956214654984102, "grad_norm": 1.4478973150253296, "learning_rate": 0.0001993822170430505, "loss": 1.3958, "mean_token_accuracy": 0.6820646327733993, "num_tokens": 45823961.0, "step": 16300 }, { "epoch": 0.39804859105361523, "grad_norm": 1.201532244682312, "learning_rate": 0.0001993504776474362, "loss": 1.3745, "mean_token_accuracy": 0.6857378870248795, "num_tokens": 46093285.0, "step": 16400 }, { "epoch": 0.4004757166088202, "grad_norm": 1.3356366157531738, "learning_rate": 0.0001993179458748612, "loss": 1.3609, "mean_token_accuracy": 0.6905792677402496, "num_tokens": 46387814.0, "step": 16500 }, { "epoch": 0.40290284216402517, "grad_norm": 1.2657437324523926, "learning_rate": 0.00019928462198478503, "loss": 1.3614, "mean_token_accuracy": 0.688639857172966, "num_tokens": 46668922.0, "step": 16600 }, { "epoch": 0.40532996771923013, "grad_norm": 1.460304856300354, "learning_rate": 0.00019925050624298478, "loss": 1.3555, "mean_token_accuracy": 0.6907580763101577, "num_tokens": 46949382.0, "step": 16700 }, { "epoch": 0.4077570932744351, "grad_norm": 1.256842017173767, "learning_rate": 0.00019921559892155304, "loss": 1.3847, "mean_token_accuracy": 0.683619334101677, "num_tokens": 47232718.0, "step": 16800 }, { "epoch": 0.41018421882964007, "grad_norm": 1.5061123371124268, "learning_rate": 0.0001991799002988957, "loss": 1.3603, "mean_token_accuracy": 0.6874109321832657, "num_tokens": 47512467.0, "step": 16900 }, { "epoch": 0.41261134438484504, "grad_norm": 1.403568983078003, "learning_rate": 0.0001991434106597297, "loss": 1.4669, "mean_token_accuracy": 0.6719831627607346, "num_tokens": 47802152.0, "step": 17000 }, { "epoch": 0.41503846994005, "grad_norm": 1.540781021118164, "learning_rate": 0.00019910613029508082, "loss": 1.4166, "mean_token_accuracy": 0.6793534135818482, "num_tokens": 48079701.0, "step": 17100 }, { "epoch": 0.41746559549525497, "grad_norm": 1.553440809249878, "learning_rate": 0.00019906805950228136, "loss": 1.3399, "mean_token_accuracy": 0.6919099617004395, "num_tokens": 48360187.0, "step": 17200 }, { "epoch": 0.41989272105045994, "grad_norm": 1.4212487936019897, "learning_rate": 0.00019902919858496762, "loss": 1.3215, "mean_token_accuracy": 0.6969122314453124, "num_tokens": 48639623.0, "step": 17300 }, { "epoch": 0.4223198466056649, "grad_norm": 1.6373494863510132, "learning_rate": 0.00019898954785307774, "loss": 1.318, "mean_token_accuracy": 0.6975502675771713, "num_tokens": 48920032.0, "step": 17400 }, { "epoch": 0.4247469721608699, "grad_norm": 1.618812918663025, "learning_rate": 0.00019894910762284897, "loss": 1.3826, "mean_token_accuracy": 0.6858366417884827, "num_tokens": 49209460.0, "step": 17500 }, { "epoch": 0.42717409771607484, "grad_norm": 1.2190032005310059, "learning_rate": 0.00019890787821681534, "loss": 1.3092, "mean_token_accuracy": 0.6999623441696167, "num_tokens": 49495904.0, "step": 17600 }, { "epoch": 0.4296012232712798, "grad_norm": 1.5213158130645752, "learning_rate": 0.00019886585996380495, "loss": 1.337, "mean_token_accuracy": 0.694312310218811, "num_tokens": 49769086.0, "step": 17700 }, { "epoch": 0.4320283488264848, "grad_norm": 1.2875466346740723, "learning_rate": 0.00019882305319893746, "loss": 1.3142, "mean_token_accuracy": 0.6983224129676819, "num_tokens": 50056312.0, "step": 17800 }, { "epoch": 0.43445547438168974, "grad_norm": 1.364429235458374, "learning_rate": 0.00019877945826362133, "loss": 1.2759, "mean_token_accuracy": 0.7073658388853074, "num_tokens": 50346471.0, "step": 17900 }, { "epoch": 0.4368825999368947, "grad_norm": 1.9261558055877686, "learning_rate": 0.00019873507550555113, "loss": 1.3359, "mean_token_accuracy": 0.6965240675210953, "num_tokens": 50636340.0, "step": 18000 }, { "epoch": 0.4393097254920997, "grad_norm": 2.0433011054992676, "learning_rate": 0.00019868990527870475, "loss": 1.3354, "mean_token_accuracy": 0.695448847413063, "num_tokens": 50906335.0, "step": 18100 }, { "epoch": 0.4417368510473047, "grad_norm": 1.8010625839233398, "learning_rate": 0.00019864394794334066, "loss": 1.2788, "mean_token_accuracy": 0.7048653018474579, "num_tokens": 51178755.0, "step": 18200 }, { "epoch": 0.44416397660250967, "grad_norm": 1.1464262008666992, "learning_rate": 0.00019859720386599492, "loss": 1.2752, "mean_token_accuracy": 0.7063197785615921, "num_tokens": 51463497.0, "step": 18300 }, { "epoch": 0.44659110215771464, "grad_norm": 1.6249265670776367, "learning_rate": 0.00019854967341947832, "loss": 1.2929, "mean_token_accuracy": 0.7024630016088486, "num_tokens": 51752205.0, "step": 18400 }, { "epoch": 0.4490182277129196, "grad_norm": 1.3986220359802246, "learning_rate": 0.00019850135698287337, "loss": 1.3208, "mean_token_accuracy": 0.6973824536800385, "num_tokens": 52035214.0, "step": 18500 }, { "epoch": 0.45144535326812457, "grad_norm": 1.3274478912353516, "learning_rate": 0.00019845225494153138, "loss": 1.3032, "mean_token_accuracy": 0.7015773522853851, "num_tokens": 52310306.0, "step": 18600 }, { "epoch": 0.45387247882332954, "grad_norm": 2.008397102355957, "learning_rate": 0.00019840236768706916, "loss": 1.2791, "mean_token_accuracy": 0.7044518172740937, "num_tokens": 52580485.0, "step": 18700 }, { "epoch": 0.4562996043785345, "grad_norm": 1.4408162832260132, "learning_rate": 0.00019835169561736624, "loss": 1.3516, "mean_token_accuracy": 0.6925643801689148, "num_tokens": 52853591.0, "step": 18800 }, { "epoch": 0.4587267299337395, "grad_norm": 1.2863048315048218, "learning_rate": 0.00019830023913656132, "loss": 1.3084, "mean_token_accuracy": 0.7014599156379699, "num_tokens": 53123981.0, "step": 18900 }, { "epoch": 0.46115385548894444, "grad_norm": 1.5343726873397827, "learning_rate": 0.00019824799865504936, "loss": 1.2883, "mean_token_accuracy": 0.7038233464956284, "num_tokens": 53404750.0, "step": 19000 }, { "epoch": 0.4635809810441494, "grad_norm": 1.0196541547775269, "learning_rate": 0.00019819497458947808, "loss": 1.1983, "mean_token_accuracy": 0.721374859213829, "num_tokens": 53676184.0, "step": 19100 }, { "epoch": 0.4660081065993544, "grad_norm": 1.4527913331985474, "learning_rate": 0.00019814116736274482, "loss": 1.2063, "mean_token_accuracy": 0.719505894780159, "num_tokens": 53947618.0, "step": 19200 }, { "epoch": 0.46843523215455934, "grad_norm": 1.1889349222183228, "learning_rate": 0.00019808657740399296, "loss": 1.2401, "mean_token_accuracy": 0.7134192228317261, "num_tokens": 54225900.0, "step": 19300 }, { "epoch": 0.4708623577097643, "grad_norm": 1.0686372518539429, "learning_rate": 0.0001980312051486087, "loss": 1.2694, "mean_token_accuracy": 0.7079707849025726, "num_tokens": 54510933.0, "step": 19400 }, { "epoch": 0.4732894832649693, "grad_norm": 1.5365800857543945, "learning_rate": 0.00019797505103821757, "loss": 1.2088, "mean_token_accuracy": 0.7209999257326126, "num_tokens": 54789106.0, "step": 19500 }, { "epoch": 0.47571660882017425, "grad_norm": 1.5895075798034668, "learning_rate": 0.00019791811552068067, "loss": 1.2628, "mean_token_accuracy": 0.7107340413331985, "num_tokens": 55068924.0, "step": 19600 }, { "epoch": 0.4781437343753792, "grad_norm": 1.2052088975906372, "learning_rate": 0.00019786039905009143, "loss": 1.238, "mean_token_accuracy": 0.7150384098291397, "num_tokens": 55345057.0, "step": 19700 }, { "epoch": 0.4805708599305842, "grad_norm": 1.183862566947937, "learning_rate": 0.0001978019020867717, "loss": 1.2869, "mean_token_accuracy": 0.7046462643146515, "num_tokens": 55626375.0, "step": 19800 }, { "epoch": 0.4829979854857892, "grad_norm": 0.9647877216339111, "learning_rate": 0.00019774262509726832, "loss": 1.2112, "mean_token_accuracy": 0.7195284950733185, "num_tokens": 55911211.0, "step": 19900 }, { "epoch": 0.4854251110409942, "grad_norm": 1.3305227756500244, "learning_rate": 0.00019768256855434916, "loss": 1.2022, "mean_token_accuracy": 0.7205027377605439, "num_tokens": 56197537.0, "step": 20000 }, { "epoch": 0.48785223659619914, "grad_norm": 1.235066533088684, "learning_rate": 0.0001976217329369996, "loss": 1.226, "mean_token_accuracy": 0.7162840497493744, "num_tokens": 56452193.0, "step": 20100 }, { "epoch": 0.4902793621514041, "grad_norm": 1.370216727256775, "learning_rate": 0.00019756011873041854, "loss": 1.237, "mean_token_accuracy": 0.7148937523365021, "num_tokens": 56729810.0, "step": 20200 }, { "epoch": 0.4927064877066091, "grad_norm": 1.065269947052002, "learning_rate": 0.0001974977264260145, "loss": 1.1946, "mean_token_accuracy": 0.7244079232215881, "num_tokens": 56995621.0, "step": 20300 }, { "epoch": 0.49513361326181404, "grad_norm": 1.1559139490127563, "learning_rate": 0.00019743455652140195, "loss": 1.2337, "mean_token_accuracy": 0.7160074669122696, "num_tokens": 57270242.0, "step": 20400 }, { "epoch": 0.497560738817019, "grad_norm": 1.3941744565963745, "learning_rate": 0.00019737060952039695, "loss": 1.1944, "mean_token_accuracy": 0.7223728567361831, "num_tokens": 57539201.0, "step": 20500 }, { "epoch": 0.499987864372224, "grad_norm": 1.42805814743042, "learning_rate": 0.00019730588593301355, "loss": 1.2291, "mean_token_accuracy": 0.7149619489908219, "num_tokens": 57827170.0, "step": 20600 }, { "epoch": 0.502414989927429, "grad_norm": 1.1617212295532227, "learning_rate": 0.00019724038627545935, "loss": 1.2113, "mean_token_accuracy": 0.7198360604047775, "num_tokens": 58113276.0, "step": 20700 }, { "epoch": 0.5048421154826339, "grad_norm": 1.4672633409500122, "learning_rate": 0.00019717411107013168, "loss": 1.2117, "mean_token_accuracy": 0.7212896418571472, "num_tokens": 58396273.0, "step": 20800 }, { "epoch": 0.5072692410378389, "grad_norm": 1.4573055505752563, "learning_rate": 0.00019710706084561324, "loss": 1.1817, "mean_token_accuracy": 0.7268400448560715, "num_tokens": 58669420.0, "step": 20900 }, { "epoch": 0.5096963665930438, "grad_norm": 1.1941968202590942, "learning_rate": 0.00019703923613666798, "loss": 1.1762, "mean_token_accuracy": 0.727076341509819, "num_tokens": 58944952.0, "step": 21000 }, { "epoch": 0.5121234921482488, "grad_norm": 1.3396928310394287, "learning_rate": 0.00019697063748423682, "loss": 1.2, "mean_token_accuracy": 0.72161645591259, "num_tokens": 59227700.0, "step": 21100 }, { "epoch": 0.5145506177034538, "grad_norm": 1.4734523296356201, "learning_rate": 0.00019690126543543324, "loss": 1.1836, "mean_token_accuracy": 0.7250787276029587, "num_tokens": 59505761.0, "step": 21200 }, { "epoch": 0.5169777432586588, "grad_norm": 1.331443190574646, "learning_rate": 0.00019683112054353914, "loss": 1.1275, "mean_token_accuracy": 0.7360107409954071, "num_tokens": 59794626.0, "step": 21300 }, { "epoch": 0.5194048688138637, "grad_norm": 2.195629596710205, "learning_rate": 0.00019676020336800013, "loss": 1.1866, "mean_token_accuracy": 0.7259843808412552, "num_tokens": 60077760.0, "step": 21400 }, { "epoch": 0.5218319943690687, "grad_norm": 1.1297053098678589, "learning_rate": 0.00019668851447442138, "loss": 1.1691, "mean_token_accuracy": 0.7298132872581482, "num_tokens": 60357449.0, "step": 21500 }, { "epoch": 0.5242591199242737, "grad_norm": 1.1505388021469116, "learning_rate": 0.0001966160544345628, "loss": 1.1943, "mean_token_accuracy": 0.7239989709854125, "num_tokens": 60659506.0, "step": 21600 }, { "epoch": 0.5266862454794786, "grad_norm": 1.6284935474395752, "learning_rate": 0.0001965428238263348, "loss": 1.139, "mean_token_accuracy": 0.7326309901475906, "num_tokens": 60928432.0, "step": 21700 }, { "epoch": 0.5291133710346836, "grad_norm": 1.4899412393569946, "learning_rate": 0.00019646882323379333, "loss": 1.1266, "mean_token_accuracy": 0.7380794870853424, "num_tokens": 61209379.0, "step": 21800 }, { "epoch": 0.5315404965898886, "grad_norm": 1.0630362033843994, "learning_rate": 0.0001963940532471355, "loss": 1.1755, "mean_token_accuracy": 0.7278030133247375, "num_tokens": 61482390.0, "step": 21900 }, { "epoch": 0.5339676221450935, "grad_norm": 1.0710209608078003, "learning_rate": 0.00019631851446269487, "loss": 1.1893, "mean_token_accuracy": 0.7246757209300995, "num_tokens": 61763958.0, "step": 22000 }, { "epoch": 0.5363947477002985, "grad_norm": 1.2997487783432007, "learning_rate": 0.0001962422074829364, "loss": 1.1444, "mean_token_accuracy": 0.7331423789262772, "num_tokens": 62035789.0, "step": 22100 }, { "epoch": 0.5388218732555035, "grad_norm": 1.2150485515594482, "learning_rate": 0.00019616513291645206, "loss": 1.1438, "mean_token_accuracy": 0.7333634513616561, "num_tokens": 62323217.0, "step": 22200 }, { "epoch": 0.5412489988107084, "grad_norm": 1.5231380462646484, "learning_rate": 0.00019608729137795557, "loss": 1.1868, "mean_token_accuracy": 0.7252021282911301, "num_tokens": 62598509.0, "step": 22300 }, { "epoch": 0.5436761243659134, "grad_norm": 0.9912909269332886, "learning_rate": 0.00019600868348827787, "loss": 1.1845, "mean_token_accuracy": 0.7267439430952072, "num_tokens": 62883504.0, "step": 22400 }, { "epoch": 0.5461032499211185, "grad_norm": 0.9855025410652161, "learning_rate": 0.0001959293098743618, "loss": 1.1333, "mean_token_accuracy": 0.7358741110563278, "num_tokens": 63157450.0, "step": 22500 }, { "epoch": 0.5485303754763234, "grad_norm": 1.5229765176773071, "learning_rate": 0.00019584917116925748, "loss": 1.1373, "mean_token_accuracy": 0.7370314878225327, "num_tokens": 63435535.0, "step": 22600 }, { "epoch": 0.5509575010315284, "grad_norm": 1.4351425170898438, "learning_rate": 0.00019576826801211695, "loss": 1.1541, "mean_token_accuracy": 0.7322686624526977, "num_tokens": 63733409.0, "step": 22700 }, { "epoch": 0.5533846265867334, "grad_norm": 1.2064365148544312, "learning_rate": 0.00019568660104818918, "loss": 1.0696, "mean_token_accuracy": 0.748774966597557, "num_tokens": 64021197.0, "step": 22800 }, { "epoch": 0.5558117521419383, "grad_norm": 1.068001627922058, "learning_rate": 0.00019560417092881504, "loss": 1.1123, "mean_token_accuracy": 0.7405225545167923, "num_tokens": 64306536.0, "step": 22900 }, { "epoch": 0.5582388776971433, "grad_norm": 0.9885738492012024, "learning_rate": 0.00019552097831142194, "loss": 1.1282, "mean_token_accuracy": 0.7359859699010849, "num_tokens": 64585245.0, "step": 23000 }, { "epoch": 0.5606660032523483, "grad_norm": 1.1703755855560303, "learning_rate": 0.0001954370238595186, "loss": 1.1365, "mean_token_accuracy": 0.7343781024217606, "num_tokens": 64864941.0, "step": 23100 }, { "epoch": 0.5630931288075532, "grad_norm": 1.25601327419281, "learning_rate": 0.00019535230824268993, "loss": 1.0856, "mean_token_accuracy": 0.7458273768424988, "num_tokens": 65148069.0, "step": 23200 }, { "epoch": 0.5655202543627582, "grad_norm": 1.2828888893127441, "learning_rate": 0.00019526683213659141, "loss": 1.14, "mean_token_accuracy": 0.7356480902433395, "num_tokens": 65425846.0, "step": 23300 }, { "epoch": 0.5679473799179632, "grad_norm": 1.0274039506912231, "learning_rate": 0.00019518059622294405, "loss": 1.1439, "mean_token_accuracy": 0.7343963479995728, "num_tokens": 65696885.0, "step": 23400 }, { "epoch": 0.5703745054731681, "grad_norm": 1.795361042022705, "learning_rate": 0.00019509360118952852, "loss": 1.0995, "mean_token_accuracy": 0.7427353280782699, "num_tokens": 65967890.0, "step": 23500 }, { "epoch": 0.5728016310283731, "grad_norm": 1.314911961555481, "learning_rate": 0.00019500584773018016, "loss": 1.1134, "mean_token_accuracy": 0.7405674111843109, "num_tokens": 66253601.0, "step": 23600 }, { "epoch": 0.5752287565835781, "grad_norm": 1.4357415437698364, "learning_rate": 0.000194917336544783, "loss": 1.1355, "mean_token_accuracy": 0.7366157558560371, "num_tokens": 66546485.0, "step": 23700 }, { "epoch": 0.577655882138783, "grad_norm": 0.9053491950035095, "learning_rate": 0.00019482806833926445, "loss": 1.1417, "mean_token_accuracy": 0.7364838290214538, "num_tokens": 66832391.0, "step": 23800 }, { "epoch": 0.580083007693988, "grad_norm": 0.9933082461357117, "learning_rate": 0.00019473804382558955, "loss": 1.1164, "mean_token_accuracy": 0.7381259691715241, "num_tokens": 67100199.0, "step": 23900 }, { "epoch": 0.582510133249193, "grad_norm": 1.1587954759597778, "learning_rate": 0.0001946472637217554, "loss": 1.114, "mean_token_accuracy": 0.7397536563873292, "num_tokens": 67372483.0, "step": 24000 }, { "epoch": 0.584937258804398, "grad_norm": 1.5403295755386353, "learning_rate": 0.0001945557287517853, "loss": 1.116, "mean_token_accuracy": 0.7395440202951431, "num_tokens": 67648130.0, "step": 24100 }, { "epoch": 0.5873643843596029, "grad_norm": 1.1019656658172607, "learning_rate": 0.00019446343964572304, "loss": 1.1711, "mean_token_accuracy": 0.7280935817956924, "num_tokens": 67930208.0, "step": 24200 }, { "epoch": 0.5897915099148079, "grad_norm": 0.9746718406677246, "learning_rate": 0.00019437039713962717, "loss": 1.1329, "mean_token_accuracy": 0.7364810371398925, "num_tokens": 68200240.0, "step": 24300 }, { "epoch": 0.5922186354700129, "grad_norm": 1.0284191370010376, "learning_rate": 0.0001942766019755649, "loss": 1.0529, "mean_token_accuracy": 0.7535615694522858, "num_tokens": 68483550.0, "step": 24400 }, { "epoch": 0.5946457610252178, "grad_norm": 0.9971494674682617, "learning_rate": 0.00019418205490160636, "loss": 1.1004, "mean_token_accuracy": 0.7428427731990814, "num_tokens": 68770836.0, "step": 24500 }, { "epoch": 0.5970728865804228, "grad_norm": 1.1013323068618774, "learning_rate": 0.0001940867566718187, "loss": 1.1384, "mean_token_accuracy": 0.7362033921480179, "num_tokens": 69054580.0, "step": 24600 }, { "epoch": 0.5995000121356278, "grad_norm": 1.1625392436981201, "learning_rate": 0.00019399070804625984, "loss": 1.063, "mean_token_accuracy": 0.748725289106369, "num_tokens": 69340258.0, "step": 24700 }, { "epoch": 0.6019271376908327, "grad_norm": 1.9240310192108154, "learning_rate": 0.00019389390979097256, "loss": 1.0823, "mean_token_accuracy": 0.7495817458629608, "num_tokens": 69619650.0, "step": 24800 }, { "epoch": 0.6043542632460377, "grad_norm": 1.3660378456115723, "learning_rate": 0.0001937963626779784, "loss": 1.0272, "mean_token_accuracy": 0.7577913010120392, "num_tokens": 69890767.0, "step": 24900 }, { "epoch": 0.6067813888012427, "grad_norm": 1.5367549657821655, "learning_rate": 0.00019369806748527148, "loss": 1.0573, "mean_token_accuracy": 0.750820305943489, "num_tokens": 70163631.0, "step": 25000 }, { "epoch": 0.6092085143564476, "grad_norm": 1.065262794494629, "learning_rate": 0.0001935990249968122, "loss": 1.0723, "mean_token_accuracy": 0.7507969135046005, "num_tokens": 70453726.0, "step": 25100 }, { "epoch": 0.6116356399116526, "grad_norm": 1.0172377824783325, "learning_rate": 0.00019349923600252123, "loss": 1.11, "mean_token_accuracy": 0.7407309454679489, "num_tokens": 70747381.0, "step": 25200 }, { "epoch": 0.6140627654668576, "grad_norm": 1.1468844413757324, "learning_rate": 0.00019339870129827285, "loss": 1.0578, "mean_token_accuracy": 0.7536929363012314, "num_tokens": 71022840.0, "step": 25300 }, { "epoch": 0.6164898910220625, "grad_norm": 1.2578643560409546, "learning_rate": 0.00019329742168588894, "loss": 1.0413, "mean_token_accuracy": 0.7559024041891098, "num_tokens": 71309539.0, "step": 25400 }, { "epoch": 0.6189170165772675, "grad_norm": 1.2665239572525024, "learning_rate": 0.00019319539797313243, "loss": 1.0336, "mean_token_accuracy": 0.7568505030870437, "num_tokens": 71573364.0, "step": 25500 }, { "epoch": 0.6213441421324725, "grad_norm": 1.1429393291473389, "learning_rate": 0.00019309263097370085, "loss": 1.0491, "mean_token_accuracy": 0.7536306053400039, "num_tokens": 71848695.0, "step": 25600 }, { "epoch": 0.6237712676876774, "grad_norm": 1.2720645666122437, "learning_rate": 0.00019298912150721989, "loss": 1.0413, "mean_token_accuracy": 0.7553714007139206, "num_tokens": 72132905.0, "step": 25700 }, { "epoch": 0.6261983932428824, "grad_norm": 1.1925904750823975, "learning_rate": 0.0001928848703992368, "loss": 1.057, "mean_token_accuracy": 0.7516288584470749, "num_tokens": 72419452.0, "step": 25800 }, { "epoch": 0.6286255187980875, "grad_norm": 0.9983070492744446, "learning_rate": 0.0001927798784812139, "loss": 1.0228, "mean_token_accuracy": 0.7606069624423981, "num_tokens": 72719336.0, "step": 25900 }, { "epoch": 0.6310526443532924, "grad_norm": 1.1177300214767456, "learning_rate": 0.00019267414659052189, "loss": 1.0434, "mean_token_accuracy": 0.7564522981643677, "num_tokens": 72994978.0, "step": 26000 }, { "epoch": 0.6334797699084974, "grad_norm": 1.1599113941192627, "learning_rate": 0.0001925676755704331, "loss": 1.0577, "mean_token_accuracy": 0.7541283696889878, "num_tokens": 73281210.0, "step": 26100 }, { "epoch": 0.6359068954637024, "grad_norm": 1.297285556793213, "learning_rate": 0.00019246046627011495, "loss": 1.0553, "mean_token_accuracy": 0.7551116102933884, "num_tokens": 73576523.0, "step": 26200 }, { "epoch": 0.6383340210189073, "grad_norm": 1.4931613206863403, "learning_rate": 0.00019235251954462294, "loss": 1.0745, "mean_token_accuracy": 0.7492081993818283, "num_tokens": 73848522.0, "step": 26300 }, { "epoch": 0.6407611465741123, "grad_norm": 1.0133391618728638, "learning_rate": 0.00019224383625489412, "loss": 1.0433, "mean_token_accuracy": 0.7565659403800964, "num_tokens": 74124032.0, "step": 26400 }, { "epoch": 0.6431882721293173, "grad_norm": 1.2665462493896484, "learning_rate": 0.00019213441726773992, "loss": 1.0518, "mean_token_accuracy": 0.75186956346035, "num_tokens": 74404428.0, "step": 26500 }, { "epoch": 0.6456153976845223, "grad_norm": 0.9439951777458191, "learning_rate": 0.0001920242634558394, "loss": 1.0292, "mean_token_accuracy": 0.7585177063941956, "num_tokens": 74696666.0, "step": 26600 }, { "epoch": 0.6480425232397272, "grad_norm": 1.232807993888855, "learning_rate": 0.00019191337569773235, "loss": 0.9591, "mean_token_accuracy": 0.7707934284210205, "num_tokens": 74974515.0, "step": 26700 }, { "epoch": 0.6504696487949322, "grad_norm": 1.106130599975586, "learning_rate": 0.00019180175487781216, "loss": 1.0185, "mean_token_accuracy": 0.7600296914577485, "num_tokens": 75251125.0, "step": 26800 }, { "epoch": 0.6528967743501372, "grad_norm": 0.9902170300483704, "learning_rate": 0.00019168940188631875, "loss": 0.9849, "mean_token_accuracy": 0.7675464844703674, "num_tokens": 75543621.0, "step": 26900 }, { "epoch": 0.6553238999053421, "grad_norm": 1.3111634254455566, "learning_rate": 0.00019157631761933163, "loss": 0.9863, "mean_token_accuracy": 0.7691891992092132, "num_tokens": 75808450.0, "step": 27000 }, { "epoch": 0.6577510254605471, "grad_norm": 0.9136918783187866, "learning_rate": 0.00019146250297876255, "loss": 0.9892, "mean_token_accuracy": 0.7655776154994964, "num_tokens": 76091361.0, "step": 27100 }, { "epoch": 0.6601781510157521, "grad_norm": 1.9679793119430542, "learning_rate": 0.0001913479588723485, "loss": 1.0011, "mean_token_accuracy": 0.7665372800827026, "num_tokens": 76372098.0, "step": 27200 }, { "epoch": 0.662605276570957, "grad_norm": 1.099344253540039, "learning_rate": 0.00019123268621364426, "loss": 1.019, "mean_token_accuracy": 0.7607282799482346, "num_tokens": 76654534.0, "step": 27300 }, { "epoch": 0.665032402126162, "grad_norm": 0.9826762080192566, "learning_rate": 0.00019111668592201536, "loss": 0.9837, "mean_token_accuracy": 0.7672662001848221, "num_tokens": 76931744.0, "step": 27400 }, { "epoch": 0.667459527681367, "grad_norm": 1.080349087715149, "learning_rate": 0.00019099995892263056, "loss": 1.0261, "mean_token_accuracy": 0.7605299288034439, "num_tokens": 77201535.0, "step": 27500 }, { "epoch": 0.6698866532365719, "grad_norm": 1.5150740146636963, "learning_rate": 0.00019088250614645452, "loss": 0.9962, "mean_token_accuracy": 0.7669206023216247, "num_tokens": 77485467.0, "step": 27600 }, { "epoch": 0.6723137787917769, "grad_norm": 0.8500184416770935, "learning_rate": 0.00019076432853024043, "loss": 0.9651, "mean_token_accuracy": 0.7721401613950729, "num_tokens": 77764169.0, "step": 27700 }, { "epoch": 0.6747409043469819, "grad_norm": 1.0744963884353638, "learning_rate": 0.0001906454270165224, "loss": 0.9603, "mean_token_accuracy": 0.7727346140146255, "num_tokens": 78047351.0, "step": 27800 }, { "epoch": 0.6771680299021868, "grad_norm": 1.2541007995605469, "learning_rate": 0.00019052580255360813, "loss": 0.9878, "mean_token_accuracy": 0.768582199215889, "num_tokens": 78322847.0, "step": 27900 }, { "epoch": 0.6795951554573918, "grad_norm": 0.8931077718734741, "learning_rate": 0.00019040545609557127, "loss": 0.9275, "mean_token_accuracy": 0.7798711568117142, "num_tokens": 78611878.0, "step": 28000 }, { "epoch": 0.6820222810125968, "grad_norm": 1.128255844116211, "learning_rate": 0.0001902843886022437, "loss": 0.9168, "mean_token_accuracy": 0.7815976619720459, "num_tokens": 78876123.0, "step": 28100 }, { "epoch": 0.6844494065678017, "grad_norm": 1.1579947471618652, "learning_rate": 0.00019016260103920803, "loss": 0.9599, "mean_token_accuracy": 0.7729695934057236, "num_tokens": 79167916.0, "step": 28200 }, { "epoch": 0.6868765321230067, "grad_norm": 2.0827414989471436, "learning_rate": 0.00019004009437778983, "loss": 0.9421, "mean_token_accuracy": 0.7752230167388916, "num_tokens": 79460524.0, "step": 28300 }, { "epoch": 0.6893036576782117, "grad_norm": 1.0424773693084717, "learning_rate": 0.00018991686959504987, "loss": 0.9958, "mean_token_accuracy": 0.7665992087125778, "num_tokens": 79743868.0, "step": 28400 }, { "epoch": 0.6917307832334166, "grad_norm": 0.9780852794647217, "learning_rate": 0.00018979292767377638, "loss": 0.9604, "mean_token_accuracy": 0.7735722011327744, "num_tokens": 80041028.0, "step": 28500 }, { "epoch": 0.6941579087886216, "grad_norm": 1.0590211153030396, "learning_rate": 0.0001896682696024772, "loss": 0.9759, "mean_token_accuracy": 0.7699248534440994, "num_tokens": 80325828.0, "step": 28600 }, { "epoch": 0.6965850343438266, "grad_norm": 0.9975331425666809, "learning_rate": 0.0001895428963753718, "loss": 0.9459, "mean_token_accuracy": 0.7763609528541565, "num_tokens": 80602290.0, "step": 28700 }, { "epoch": 0.6990121598990315, "grad_norm": 1.1276270151138306, "learning_rate": 0.00018941680899238353, "loss": 0.9968, "mean_token_accuracy": 0.7691206991672516, "num_tokens": 80870582.0, "step": 28800 }, { "epoch": 0.7014392854542365, "grad_norm": 0.8435887098312378, "learning_rate": 0.0001892900084591314, "loss": 0.9097, "mean_token_accuracy": 0.7834201455116272, "num_tokens": 81156560.0, "step": 28900 }, { "epoch": 0.7038664110094415, "grad_norm": 1.2956368923187256, "learning_rate": 0.00018916249578692233, "loss": 0.9028, "mean_token_accuracy": 0.7854771196842194, "num_tokens": 81431849.0, "step": 29000 }, { "epoch": 0.7062935365646464, "grad_norm": 1.1730372905731201, "learning_rate": 0.00018903427199274292, "loss": 0.9284, "mean_token_accuracy": 0.7804183250665665, "num_tokens": 81722470.0, "step": 29100 }, { "epoch": 0.7087206621198514, "grad_norm": 1.1425467729568481, "learning_rate": 0.00018890533809925135, "loss": 0.9165, "mean_token_accuracy": 0.7836980718374252, "num_tokens": 81987042.0, "step": 29200 }, { "epoch": 0.7111477876750564, "grad_norm": 1.4441839456558228, "learning_rate": 0.00018877569513476927, "loss": 0.9604, "mean_token_accuracy": 0.7747070676088333, "num_tokens": 82271974.0, "step": 29300 }, { "epoch": 0.7135749132302615, "grad_norm": 0.988593339920044, "learning_rate": 0.00018864534413327356, "loss": 0.9494, "mean_token_accuracy": 0.776980972290039, "num_tokens": 82549932.0, "step": 29400 }, { "epoch": 0.7160020387854664, "grad_norm": 1.1505811214447021, "learning_rate": 0.00018851428613438816, "loss": 0.8924, "mean_token_accuracy": 0.7875621300935746, "num_tokens": 82838862.0, "step": 29500 }, { "epoch": 0.7184291643406714, "grad_norm": 1.0360952615737915, "learning_rate": 0.00018838252218337565, "loss": 0.9386, "mean_token_accuracy": 0.7802232497930527, "num_tokens": 83121770.0, "step": 29600 }, { "epoch": 0.7208562898958764, "grad_norm": 1.3923313617706299, "learning_rate": 0.00018825005333112898, "loss": 0.972, "mean_token_accuracy": 0.771819856762886, "num_tokens": 83398476.0, "step": 29700 }, { "epoch": 0.7232834154510813, "grad_norm": 1.662172794342041, "learning_rate": 0.00018811688063416314, "loss": 0.9601, "mean_token_accuracy": 0.7737440180778503, "num_tokens": 83689236.0, "step": 29800 }, { "epoch": 0.7257105410062863, "grad_norm": 0.9064234495162964, "learning_rate": 0.00018798300515460663, "loss": 0.8928, "mean_token_accuracy": 0.787415645122528, "num_tokens": 83972483.0, "step": 29900 }, { "epoch": 0.7281376665614913, "grad_norm": 1.5885084867477417, "learning_rate": 0.00018784842796019312, "loss": 0.8973, "mean_token_accuracy": 0.7884204018115998, "num_tokens": 84253687.0, "step": 30000 }, { "epoch": 0.7305647921166962, "grad_norm": 1.08694326877594, "learning_rate": 0.0001877131501242528, "loss": 0.8678, "mean_token_accuracy": 0.7936598181724548, "num_tokens": 84535890.0, "step": 30100 }, { "epoch": 0.7329919176719012, "grad_norm": 1.0348883867263794, "learning_rate": 0.00018757717272570383, "loss": 0.9168, "mean_token_accuracy": 0.7829198455810547, "num_tokens": 84824617.0, "step": 30200 }, { "epoch": 0.7354190432271062, "grad_norm": 1.0495657920837402, "learning_rate": 0.00018744049684904393, "loss": 0.8521, "mean_token_accuracy": 0.7954359531402588, "num_tokens": 85100185.0, "step": 30300 }, { "epoch": 0.7378461687823111, "grad_norm": 0.7942479252815247, "learning_rate": 0.00018730312358434148, "loss": 0.8575, "mean_token_accuracy": 0.7958718985319138, "num_tokens": 85394788.0, "step": 30400 }, { "epoch": 0.7402732943375161, "grad_norm": 1.0841585397720337, "learning_rate": 0.00018716505402722688, "loss": 0.869, "mean_token_accuracy": 0.7918435066938401, "num_tokens": 85679932.0, "step": 30500 }, { "epoch": 0.7427004198927211, "grad_norm": 1.061931848526001, "learning_rate": 0.00018702628927888396, "loss": 0.9062, "mean_token_accuracy": 0.7857851403951644, "num_tokens": 85962110.0, "step": 30600 }, { "epoch": 0.745127545447926, "grad_norm": 0.9450345635414124, "learning_rate": 0.00018688683044604113, "loss": 0.8633, "mean_token_accuracy": 0.7948847645521164, "num_tokens": 86240268.0, "step": 30700 }, { "epoch": 0.747554671003131, "grad_norm": 1.069827914237976, "learning_rate": 0.00018674667864096236, "loss": 0.8936, "mean_token_accuracy": 0.7883854901790619, "num_tokens": 86519108.0, "step": 30800 }, { "epoch": 0.749981796558336, "grad_norm": 0.970453679561615, "learning_rate": 0.0001866058349814386, "loss": 0.8866, "mean_token_accuracy": 0.7884615308046341, "num_tokens": 86803385.0, "step": 30900 }, { "epoch": 0.7524089221135409, "grad_norm": 1.24496328830719, "learning_rate": 0.0001864643005907787, "loss": 0.8966, "mean_token_accuracy": 0.7896036711335183, "num_tokens": 87080614.0, "step": 31000 }, { "epoch": 0.7548360476687459, "grad_norm": 0.9882015585899353, "learning_rate": 0.0001863220765978005, "loss": 0.8813, "mean_token_accuracy": 0.7906145012378692, "num_tokens": 87355322.0, "step": 31100 }, { "epoch": 0.7572631732239509, "grad_norm": 1.0050567388534546, "learning_rate": 0.0001861791641368218, "loss": 0.8674, "mean_token_accuracy": 0.7941360598802567, "num_tokens": 87652338.0, "step": 31200 }, { "epoch": 0.7596902987791558, "grad_norm": 0.9534517526626587, "learning_rate": 0.00018603556434765127, "loss": 0.8625, "mean_token_accuracy": 0.7951251226663589, "num_tokens": 87925741.0, "step": 31300 }, { "epoch": 0.7621174243343608, "grad_norm": 0.8884904980659485, "learning_rate": 0.00018589127837557957, "loss": 0.868, "mean_token_accuracy": 0.7937871700525284, "num_tokens": 88199531.0, "step": 31400 }, { "epoch": 0.7645445498895658, "grad_norm": 1.0444873571395874, "learning_rate": 0.0001857463073713699, "loss": 0.9099, "mean_token_accuracy": 0.7861129724979401, "num_tokens": 88484640.0, "step": 31500 }, { "epoch": 0.7669716754447707, "grad_norm": 1.642042636871338, "learning_rate": 0.00018560065249124906, "loss": 0.8985, "mean_token_accuracy": 0.7859966540336609, "num_tokens": 88779524.0, "step": 31600 }, { "epoch": 0.7693988009999757, "grad_norm": 0.9597316384315491, "learning_rate": 0.00018545431489689815, "loss": 0.8862, "mean_token_accuracy": 0.7899703568220139, "num_tokens": 89053357.0, "step": 31700 }, { "epoch": 0.7718259265551807, "grad_norm": 1.1189887523651123, "learning_rate": 0.00018530729575544328, "loss": 0.8629, "mean_token_accuracy": 0.7946012103557587, "num_tokens": 89340495.0, "step": 31800 }, { "epoch": 0.7742530521103856, "grad_norm": 1.14096999168396, "learning_rate": 0.00018515959623944633, "loss": 0.8709, "mean_token_accuracy": 0.7942619401216507, "num_tokens": 89617490.0, "step": 31900 }, { "epoch": 0.7766801776655906, "grad_norm": 1.0808043479919434, "learning_rate": 0.0001850112175268955, "loss": 0.8721, "mean_token_accuracy": 0.7936557620763779, "num_tokens": 89893807.0, "step": 32000 }, { "epoch": 0.7791073032207956, "grad_norm": 1.107035517692566, "learning_rate": 0.00018486216080119608, "loss": 0.8291, "mean_token_accuracy": 0.8008069145679474, "num_tokens": 90167592.0, "step": 32100 }, { "epoch": 0.7815344287760005, "grad_norm": 1.0005587339401245, "learning_rate": 0.0001847124272511607, "loss": 0.854, "mean_token_accuracy": 0.7974209851026535, "num_tokens": 90431679.0, "step": 32200 }, { "epoch": 0.7839615543312055, "grad_norm": 0.798092782497406, "learning_rate": 0.00018456201807100028, "loss": 0.9238, "mean_token_accuracy": 0.782583869099617, "num_tokens": 90707675.0, "step": 32300 }, { "epoch": 0.7863886798864105, "grad_norm": 1.3278645277023315, "learning_rate": 0.00018441093446031417, "loss": 0.8968, "mean_token_accuracy": 0.788212314248085, "num_tokens": 90981022.0, "step": 32400 }, { "epoch": 0.7888158054416154, "grad_norm": 1.225932002067566, "learning_rate": 0.00018425917762408066, "loss": 0.8619, "mean_token_accuracy": 0.79517926633358, "num_tokens": 91259313.0, "step": 32500 }, { "epoch": 0.7912429309968204, "grad_norm": 0.8537389039993286, "learning_rate": 0.00018410674877264744, "loss": 0.8382, "mean_token_accuracy": 0.7990948641300202, "num_tokens": 91539815.0, "step": 32600 }, { "epoch": 0.7936700565520254, "grad_norm": 1.2435106039047241, "learning_rate": 0.00018395364912172191, "loss": 0.8281, "mean_token_accuracy": 0.8019495517015457, "num_tokens": 91824981.0, "step": 32700 }, { "epoch": 0.7960971821072305, "grad_norm": 1.0658791065216064, "learning_rate": 0.00018379987989236148, "loss": 0.8852, "mean_token_accuracy": 0.7909339308738709, "num_tokens": 92099114.0, "step": 32800 }, { "epoch": 0.7985243076624354, "grad_norm": 1.2279072999954224, "learning_rate": 0.00018364544231096384, "loss": 0.8409, "mean_token_accuracy": 0.8014136582612992, "num_tokens": 92378255.0, "step": 32900 }, { "epoch": 0.8009514332176404, "grad_norm": 1.0797042846679688, "learning_rate": 0.00018349033760925709, "loss": 0.769, "mean_token_accuracy": 0.8132674318552017, "num_tokens": 92657534.0, "step": 33000 }, { "epoch": 0.8033785587728454, "grad_norm": 1.1501679420471191, "learning_rate": 0.00018333456702429013, "loss": 0.8413, "mean_token_accuracy": 0.7998440200090409, "num_tokens": 92939422.0, "step": 33100 }, { "epoch": 0.8058056843280503, "grad_norm": 0.9171693325042725, "learning_rate": 0.0001831781317984225, "loss": 0.8727, "mean_token_accuracy": 0.7943976658582688, "num_tokens": 93234393.0, "step": 33200 }, { "epoch": 0.8082328098832553, "grad_norm": 0.9114980101585388, "learning_rate": 0.00018302103317931483, "loss": 0.8398, "mean_token_accuracy": 0.7989549374580384, "num_tokens": 93509369.0, "step": 33300 }, { "epoch": 0.8106599354384603, "grad_norm": 1.2949105501174927, "learning_rate": 0.00018286327241991853, "loss": 0.8068, "mean_token_accuracy": 0.8070319950580597, "num_tokens": 93785744.0, "step": 33400 }, { "epoch": 0.8130870609936652, "grad_norm": 1.1802972555160522, "learning_rate": 0.00018270485077846602, "loss": 0.8512, "mean_token_accuracy": 0.7979403012990951, "num_tokens": 94058237.0, "step": 33500 }, { "epoch": 0.8155141865488702, "grad_norm": 1.372752070426941, "learning_rate": 0.0001825457695184606, "loss": 0.8358, "mean_token_accuracy": 0.8009763330221176, "num_tokens": 94358495.0, "step": 33600 }, { "epoch": 0.8179413121040752, "grad_norm": 0.97585129737854, "learning_rate": 0.00018238602990866646, "loss": 0.8161, "mean_token_accuracy": 0.8052691560983658, "num_tokens": 94633351.0, "step": 33700 }, { "epoch": 0.8203684376592801, "grad_norm": 1.0736268758773804, "learning_rate": 0.0001822256332230985, "loss": 0.8244, "mean_token_accuracy": 0.8037301182746888, "num_tokens": 94915600.0, "step": 33800 }, { "epoch": 0.8227955632144851, "grad_norm": 1.2356585264205933, "learning_rate": 0.00018206458074101204, "loss": 0.8763, "mean_token_accuracy": 0.7945786851644516, "num_tokens": 95201992.0, "step": 33900 }, { "epoch": 0.8252226887696901, "grad_norm": 1.520247459411621, "learning_rate": 0.000181902873746893, "loss": 0.8785, "mean_token_accuracy": 0.7953230202198028, "num_tokens": 95493649.0, "step": 34000 }, { "epoch": 0.827649814324895, "grad_norm": 1.2150897979736328, "learning_rate": 0.0001817405135304472, "loss": 0.8187, "mean_token_accuracy": 0.8049602442979813, "num_tokens": 95764371.0, "step": 34100 }, { "epoch": 0.8300769398801, "grad_norm": 1.0239194631576538, "learning_rate": 0.00018157750138659038, "loss": 0.8111, "mean_token_accuracy": 0.8069447958469391, "num_tokens": 96057388.0, "step": 34200 }, { "epoch": 0.832504065435305, "grad_norm": 1.225995421409607, "learning_rate": 0.00018141383861543773, "loss": 0.785, "mean_token_accuracy": 0.81030932366848, "num_tokens": 96320139.0, "step": 34300 }, { "epoch": 0.8349311909905099, "grad_norm": 1.0512681007385254, "learning_rate": 0.00018124952652229362, "loss": 0.8081, "mean_token_accuracy": 0.8077819764614105, "num_tokens": 96586361.0, "step": 34400 }, { "epoch": 0.8373583165457149, "grad_norm": 0.9870871305465698, "learning_rate": 0.00018108456641764104, "loss": 0.7915, "mean_token_accuracy": 0.8107606899738312, "num_tokens": 96851813.0, "step": 34500 }, { "epoch": 0.8397854421009199, "grad_norm": 1.0771749019622803, "learning_rate": 0.00018091895961713135, "loss": 0.8465, "mean_token_accuracy": 0.8007239735126496, "num_tokens": 97111419.0, "step": 34600 }, { "epoch": 0.8422125676561248, "grad_norm": 1.0361714363098145, "learning_rate": 0.00018075270744157367, "loss": 0.7617, "mean_token_accuracy": 0.8161041277647019, "num_tokens": 97390477.0, "step": 34700 }, { "epoch": 0.8446396932113298, "grad_norm": 1.1362801790237427, "learning_rate": 0.0001805858112169242, "loss": 0.7813, "mean_token_accuracy": 0.8107654428482056, "num_tokens": 97672616.0, "step": 34800 }, { "epoch": 0.8470668187665348, "grad_norm": 1.2550112009048462, "learning_rate": 0.00018041827227427602, "loss": 0.8187, "mean_token_accuracy": 0.8066113740205765, "num_tokens": 97936627.0, "step": 34900 }, { "epoch": 0.8494939443217397, "grad_norm": 1.050264835357666, "learning_rate": 0.00018025009194984817, "loss": 0.8027, "mean_token_accuracy": 0.8081069684028626, "num_tokens": 98211705.0, "step": 35000 }, { "epoch": 0.8519210698769447, "grad_norm": 1.0110602378845215, "learning_rate": 0.00018008127158497498, "loss": 0.751, "mean_token_accuracy": 0.8187358456850052, "num_tokens": 98493819.0, "step": 35100 }, { "epoch": 0.8543481954321497, "grad_norm": 0.7575352191925049, "learning_rate": 0.00017991181252609558, "loss": 0.8254, "mean_token_accuracy": 0.8047711282968522, "num_tokens": 98772156.0, "step": 35200 }, { "epoch": 0.8567753209873546, "grad_norm": 1.2765109539031982, "learning_rate": 0.000179741716124743, "loss": 0.7522, "mean_token_accuracy": 0.8180702245235443, "num_tokens": 99050074.0, "step": 35300 }, { "epoch": 0.8592024465425596, "grad_norm": 0.955980658531189, "learning_rate": 0.00017957098373753348, "loss": 0.7575, "mean_token_accuracy": 0.8179322808980942, "num_tokens": 99344611.0, "step": 35400 }, { "epoch": 0.8616295720977646, "grad_norm": 1.1932599544525146, "learning_rate": 0.00017939961672615564, "loss": 0.8472, "mean_token_accuracy": 0.801171503663063, "num_tokens": 99619715.0, "step": 35500 }, { "epoch": 0.8640566976529696, "grad_norm": 1.0699065923690796, "learning_rate": 0.00017922761645735945, "loss": 0.7807, "mean_token_accuracy": 0.8139619028568268, "num_tokens": 99904024.0, "step": 35600 }, { "epoch": 0.8664838232081745, "grad_norm": 0.8689550757408142, "learning_rate": 0.00017905498430294565, "loss": 0.7973, "mean_token_accuracy": 0.8107730603218078, "num_tokens": 100180461.0, "step": 35700 }, { "epoch": 0.8689109487633795, "grad_norm": 1.159509539604187, "learning_rate": 0.00017888172163975453, "loss": 0.7626, "mean_token_accuracy": 0.8172780108451844, "num_tokens": 100454222.0, "step": 35800 }, { "epoch": 0.8713380743185845, "grad_norm": 1.2939770221710205, "learning_rate": 0.00017870782984965498, "loss": 0.8335, "mean_token_accuracy": 0.8025886970758438, "num_tokens": 100749127.0, "step": 35900 }, { "epoch": 0.8737651998737894, "grad_norm": 1.2724905014038086, "learning_rate": 0.0001785333103195337, "loss": 0.7812, "mean_token_accuracy": 0.8140066814422607, "num_tokens": 101028153.0, "step": 36000 }, { "epoch": 0.8761923254289944, "grad_norm": 0.8696538209915161, "learning_rate": 0.00017835816444128387, "loss": 0.7961, "mean_token_accuracy": 0.8110569196939469, "num_tokens": 101312954.0, "step": 36100 }, { "epoch": 0.8786194509841994, "grad_norm": 0.9427557587623596, "learning_rate": 0.0001781823936117942, "loss": 0.751, "mean_token_accuracy": 0.8197416269779205, "num_tokens": 101593661.0, "step": 36200 }, { "epoch": 0.8810465765394044, "grad_norm": 1.0701342821121216, "learning_rate": 0.0001780059992329377, "loss": 0.7963, "mean_token_accuracy": 0.8096175003051758, "num_tokens": 101867800.0, "step": 36300 }, { "epoch": 0.8834737020946094, "grad_norm": 0.8264694809913635, "learning_rate": 0.00017782898271156058, "loss": 0.7706, "mean_token_accuracy": 0.8167759263515473, "num_tokens": 102150665.0, "step": 36400 }, { "epoch": 0.8859008276498144, "grad_norm": 0.8843175768852234, "learning_rate": 0.000177651345459471, "loss": 0.7896, "mean_token_accuracy": 0.8110161870718002, "num_tokens": 102440416.0, "step": 36500 }, { "epoch": 0.8883279532050193, "grad_norm": 0.9907525777816772, "learning_rate": 0.00017747308889342776, "loss": 0.7726, "mean_token_accuracy": 0.8153502416610717, "num_tokens": 102723528.0, "step": 36600 }, { "epoch": 0.8907550787602243, "grad_norm": 1.609206199645996, "learning_rate": 0.00017729421443512906, "loss": 0.7236, "mean_token_accuracy": 0.8254850769042968, "num_tokens": 102990192.0, "step": 36700 }, { "epoch": 0.8931822043154293, "grad_norm": 1.0116157531738281, "learning_rate": 0.00017711472351120117, "loss": 0.734, "mean_token_accuracy": 0.8239784175157547, "num_tokens": 103268116.0, "step": 36800 }, { "epoch": 0.8956093298706342, "grad_norm": 0.8444027304649353, "learning_rate": 0.000176934617553187, "loss": 0.7513, "mean_token_accuracy": 0.819818964600563, "num_tokens": 103548527.0, "step": 36900 }, { "epoch": 0.8980364554258392, "grad_norm": 0.7157046794891357, "learning_rate": 0.00017675389799753474, "loss": 0.7635, "mean_token_accuracy": 0.8183335334062576, "num_tokens": 103842672.0, "step": 37000 }, { "epoch": 0.9004635809810442, "grad_norm": 0.8269189596176147, "learning_rate": 0.00017657256628558635, "loss": 0.746, "mean_token_accuracy": 0.8198094671964645, "num_tokens": 104117428.0, "step": 37100 }, { "epoch": 0.9028907065362491, "grad_norm": 1.2032946348190308, "learning_rate": 0.00017639062386356604, "loss": 0.7487, "mean_token_accuracy": 0.8211363846063614, "num_tokens": 104403750.0, "step": 37200 }, { "epoch": 0.9053178320914541, "grad_norm": 0.7613286972045898, "learning_rate": 0.00017620807218256889, "loss": 0.7342, "mean_token_accuracy": 0.8234205913543701, "num_tokens": 104680182.0, "step": 37300 }, { "epoch": 0.9077449576466591, "grad_norm": 0.8773027658462524, "learning_rate": 0.00017602491269854906, "loss": 0.767, "mean_token_accuracy": 0.817679477930069, "num_tokens": 104968706.0, "step": 37400 }, { "epoch": 0.910172083201864, "grad_norm": 0.8545549511909485, "learning_rate": 0.00017584114687230838, "loss": 0.7352, "mean_token_accuracy": 0.8224795866012573, "num_tokens": 105262204.0, "step": 37500 }, { "epoch": 0.912599208757069, "grad_norm": 0.9424647092819214, "learning_rate": 0.0001756567761694845, "loss": 0.6923, "mean_token_accuracy": 0.834279500246048, "num_tokens": 105539111.0, "step": 37600 }, { "epoch": 0.915026334312274, "grad_norm": 0.7908113598823547, "learning_rate": 0.00017547180206053945, "loss": 0.7413, "mean_token_accuracy": 0.8225541985034943, "num_tokens": 105823216.0, "step": 37700 }, { "epoch": 0.917453459867479, "grad_norm": 1.1390018463134766, "learning_rate": 0.00017528622602074764, "loss": 0.7383, "mean_token_accuracy": 0.8238617664575577, "num_tokens": 106112129.0, "step": 37800 }, { "epoch": 0.9198805854226839, "grad_norm": 0.9344626069068909, "learning_rate": 0.00017510004953018426, "loss": 0.7145, "mean_token_accuracy": 0.8279504466056824, "num_tokens": 106395386.0, "step": 37900 }, { "epoch": 0.9223077109778889, "grad_norm": 0.8515225052833557, "learning_rate": 0.00017491327407371352, "loss": 0.6988, "mean_token_accuracy": 0.830140540599823, "num_tokens": 106673179.0, "step": 38000 }, { "epoch": 0.9247348365330939, "grad_norm": 0.8123283982276917, "learning_rate": 0.0001747259011409766, "loss": 0.7336, "mean_token_accuracy": 0.8250963681936264, "num_tokens": 106940340.0, "step": 38100 }, { "epoch": 0.9271619620882988, "grad_norm": 0.9882307052612305, "learning_rate": 0.00017453793222637997, "loss": 0.7434, "mean_token_accuracy": 0.8226048046350479, "num_tokens": 107217056.0, "step": 38200 }, { "epoch": 0.9295890876435038, "grad_norm": 1.0422884225845337, "learning_rate": 0.00017434936882908337, "loss": 0.7875, "mean_token_accuracy": 0.8118562215566635, "num_tokens": 107501552.0, "step": 38300 }, { "epoch": 0.9320162131987088, "grad_norm": 0.9351255893707275, "learning_rate": 0.00017416021245298793, "loss": 0.7388, "mean_token_accuracy": 0.8256934684514999, "num_tokens": 107792280.0, "step": 38400 }, { "epoch": 0.9344433387539137, "grad_norm": 1.1992120742797852, "learning_rate": 0.00017397046460672404, "loss": 0.7575, "mean_token_accuracy": 0.8187384909391403, "num_tokens": 108070349.0, "step": 38500 }, { "epoch": 0.9368704643091187, "grad_norm": 0.8793713450431824, "learning_rate": 0.00017378012680363945, "loss": 0.7247, "mean_token_accuracy": 0.8275273048877716, "num_tokens": 108359397.0, "step": 38600 }, { "epoch": 0.9392975898643237, "grad_norm": 0.7668092250823975, "learning_rate": 0.00017358920056178718, "loss": 0.7654, "mean_token_accuracy": 0.8186299502849579, "num_tokens": 108644525.0, "step": 38700 }, { "epoch": 0.9417247154195286, "grad_norm": 1.0598175525665283, "learning_rate": 0.00017339768740391333, "loss": 0.6535, "mean_token_accuracy": 0.8406318628787994, "num_tokens": 108925988.0, "step": 38800 }, { "epoch": 0.9441518409747336, "grad_norm": 1.4081029891967773, "learning_rate": 0.00017320558885744504, "loss": 0.7174, "mean_token_accuracy": 0.8275231170654297, "num_tokens": 109202693.0, "step": 38900 }, { "epoch": 0.9465789665299386, "grad_norm": 1.382180094718933, "learning_rate": 0.00017301290645447824, "loss": 0.6792, "mean_token_accuracy": 0.834947235584259, "num_tokens": 109470412.0, "step": 39000 }, { "epoch": 0.9490060920851435, "grad_norm": 0.6567552089691162, "learning_rate": 0.0001728196417317655, "loss": 0.7262, "mean_token_accuracy": 0.8264455366134643, "num_tokens": 109758785.0, "step": 39100 }, { "epoch": 0.9514332176403485, "grad_norm": 0.6902262568473816, "learning_rate": 0.00017262579623070353, "loss": 0.7497, "mean_token_accuracy": 0.8199244272708893, "num_tokens": 110049715.0, "step": 39200 }, { "epoch": 0.9538603431955535, "grad_norm": 1.103183388710022, "learning_rate": 0.00017243137149732137, "loss": 0.6898, "mean_token_accuracy": 0.8338291847705841, "num_tokens": 110334599.0, "step": 39300 }, { "epoch": 0.9562874687507584, "grad_norm": 0.9607908129692078, "learning_rate": 0.00017223636908226752, "loss": 0.6989, "mean_token_accuracy": 0.8327404856681824, "num_tokens": 110617566.0, "step": 39400 }, { "epoch": 0.9587145943059634, "grad_norm": 1.0758519172668457, "learning_rate": 0.000172040790540798, "loss": 0.6825, "mean_token_accuracy": 0.8364751309156417, "num_tokens": 110900065.0, "step": 39500 }, { "epoch": 0.9611417198611684, "grad_norm": 0.8208298683166504, "learning_rate": 0.00017184463743276365, "loss": 0.7095, "mean_token_accuracy": 0.8310269391536713, "num_tokens": 111178185.0, "step": 39600 }, { "epoch": 0.9635688454163734, "grad_norm": 1.049796223640442, "learning_rate": 0.0001716479113225979, "loss": 0.6954, "mean_token_accuracy": 0.8331800949573517, "num_tokens": 111442052.0, "step": 39700 }, { "epoch": 0.9659959709715784, "grad_norm": 0.8910472989082336, "learning_rate": 0.00017145061377930407, "loss": 0.6871, "mean_token_accuracy": 0.8366471952199936, "num_tokens": 111718863.0, "step": 39800 }, { "epoch": 0.9684230965267834, "grad_norm": 0.9564645886421204, "learning_rate": 0.00017125274637644312, "loss": 0.7159, "mean_token_accuracy": 0.8281272131204606, "num_tokens": 111997357.0, "step": 39900 }, { "epoch": 0.9708502220819883, "grad_norm": 0.9473711848258972, "learning_rate": 0.00017105431069212087, "loss": 0.7179, "mean_token_accuracy": 0.8280116510391236, "num_tokens": 112284668.0, "step": 40000 }, { "epoch": 0.9732773476371933, "grad_norm": 0.8862663507461548, "learning_rate": 0.00017085530830897566, "loss": 0.7195, "mean_token_accuracy": 0.8270606195926666, "num_tokens": 112555426.0, "step": 40100 }, { "epoch": 0.9757044731923983, "grad_norm": 1.0798990726470947, "learning_rate": 0.00017065574081416535, "loss": 0.6937, "mean_token_accuracy": 0.8331471282243729, "num_tokens": 112833652.0, "step": 40200 }, { "epoch": 0.9781315987476032, "grad_norm": 1.0329279899597168, "learning_rate": 0.00017045560979935505, "loss": 0.6973, "mean_token_accuracy": 0.8337054681777955, "num_tokens": 113125979.0, "step": 40300 }, { "epoch": 0.9805587243028082, "grad_norm": 1.095126986503601, "learning_rate": 0.00017025491686070425, "loss": 0.7148, "mean_token_accuracy": 0.828719516992569, "num_tokens": 113395603.0, "step": 40400 }, { "epoch": 0.9829858498580132, "grad_norm": 0.6569015383720398, "learning_rate": 0.00017005366359885407, "loss": 0.6545, "mean_token_accuracy": 0.8421603220701218, "num_tokens": 113677040.0, "step": 40500 }, { "epoch": 0.9854129754132182, "grad_norm": 0.8844952583312988, "learning_rate": 0.0001698518516189145, "loss": 0.7239, "mean_token_accuracy": 0.824980406165123, "num_tokens": 113950074.0, "step": 40600 }, { "epoch": 0.9878401009684231, "grad_norm": 0.8346448540687561, "learning_rate": 0.00016964948253045173, "loss": 0.6458, "mean_token_accuracy": 0.8428018325567246, "num_tokens": 114214049.0, "step": 40700 }, { "epoch": 0.9902672265236281, "grad_norm": 0.8526921272277832, "learning_rate": 0.00016944655794747505, "loss": 0.6812, "mean_token_accuracy": 0.8357190907001495, "num_tokens": 114494222.0, "step": 40800 }, { "epoch": 0.992694352078833, "grad_norm": 0.9561910033226013, "learning_rate": 0.00016924307948842435, "loss": 0.6848, "mean_token_accuracy": 0.8342451626062393, "num_tokens": 114773314.0, "step": 40900 }, { "epoch": 0.995121477634038, "grad_norm": 0.70332270860672, "learning_rate": 0.00016903904877615678, "loss": 0.702, "mean_token_accuracy": 0.8312277746200561, "num_tokens": 115047126.0, "step": 41000 }, { "epoch": 0.997548603189243, "grad_norm": 0.9677546620368958, "learning_rate": 0.0001688344674379342, "loss": 0.6646, "mean_token_accuracy": 0.8374815553426742, "num_tokens": 115324048.0, "step": 41100 }, { "epoch": 0.999975728744448, "grad_norm": 0.923065185546875, "learning_rate": 0.0001686293371054099, "loss": 0.6843, "mean_token_accuracy": 0.8363048458099365, "num_tokens": 115592249.0, "step": 41200 }, { "epoch": 1.0024028542996528, "grad_norm": 1.1214321851730347, "learning_rate": 0.0001684236594146158, "loss": 0.5588, "mean_token_accuracy": 0.8611422181129456, "num_tokens": 115866851.0, "step": 41300 }, { "epoch": 1.004829979854858, "grad_norm": 1.639672040939331, "learning_rate": 0.00016821743600594927, "loss": 0.5275, "mean_token_accuracy": 0.8682989192008972, "num_tokens": 116144462.0, "step": 41400 }, { "epoch": 1.007257105410063, "grad_norm": 1.342544674873352, "learning_rate": 0.0001680106685241601, "loss": 0.5098, "mean_token_accuracy": 0.8725023174285889, "num_tokens": 116419143.0, "step": 41500 }, { "epoch": 1.0096842309652678, "grad_norm": 1.2504510879516602, "learning_rate": 0.00016780335861833737, "loss": 0.5978, "mean_token_accuracy": 0.8525243228673935, "num_tokens": 116714586.0, "step": 41600 }, { "epoch": 1.012111356520473, "grad_norm": 0.8218744397163391, "learning_rate": 0.00016759550794189635, "loss": 0.5516, "mean_token_accuracy": 0.8621615475416183, "num_tokens": 116997028.0, "step": 41700 }, { "epoch": 1.0145384820756778, "grad_norm": 0.677018940448761, "learning_rate": 0.0001673871181525652, "loss": 0.5528, "mean_token_accuracy": 0.8615715527534484, "num_tokens": 117285016.0, "step": 41800 }, { "epoch": 1.0169656076308828, "grad_norm": 0.8412028551101685, "learning_rate": 0.00016717819091237183, "loss": 0.5825, "mean_token_accuracy": 0.8563485831022263, "num_tokens": 117563499.0, "step": 41900 }, { "epoch": 1.0193927331860877, "grad_norm": 0.9962805509567261, "learning_rate": 0.00016696872788763068, "loss": 0.5559, "mean_token_accuracy": 0.8621777659654617, "num_tokens": 117856118.0, "step": 42000 }, { "epoch": 1.0218198587412928, "grad_norm": 0.971257209777832, "learning_rate": 0.00016675873074892928, "loss": 0.5841, "mean_token_accuracy": 0.8552743774652481, "num_tokens": 118152443.0, "step": 42100 }, { "epoch": 1.0242469842964976, "grad_norm": 1.0679978132247925, "learning_rate": 0.00016654820117111517, "loss": 0.551, "mean_token_accuracy": 0.864154868721962, "num_tokens": 118432441.0, "step": 42200 }, { "epoch": 1.0266741098517027, "grad_norm": 0.9175618886947632, "learning_rate": 0.00016633714083328225, "loss": 0.5584, "mean_token_accuracy": 0.8621039032936096, "num_tokens": 118720631.0, "step": 42300 }, { "epoch": 1.0291012354069076, "grad_norm": 1.0536086559295654, "learning_rate": 0.0001661255514187576, "loss": 0.5703, "mean_token_accuracy": 0.8585632288455963, "num_tokens": 119016801.0, "step": 42400 }, { "epoch": 1.0315283609621126, "grad_norm": 0.7999328970909119, "learning_rate": 0.00016591343461508796, "loss": 0.5328, "mean_token_accuracy": 0.8685509181022644, "num_tokens": 119319993.0, "step": 42500 }, { "epoch": 1.0339554865173175, "grad_norm": 0.8778619170188904, "learning_rate": 0.00016570079211402638, "loss": 0.5603, "mean_token_accuracy": 0.8621475839614868, "num_tokens": 119595722.0, "step": 42600 }, { "epoch": 1.0363826120725226, "grad_norm": 0.7495865225791931, "learning_rate": 0.0001654876256115186, "loss": 0.5493, "mean_token_accuracy": 0.8641699087619782, "num_tokens": 119876137.0, "step": 42700 }, { "epoch": 1.0388097376277274, "grad_norm": 0.8057670593261719, "learning_rate": 0.0001652739368076895, "loss": 0.5335, "mean_token_accuracy": 0.8662267804145813, "num_tokens": 120143750.0, "step": 42800 }, { "epoch": 1.0412368631829325, "grad_norm": 0.9671706557273865, "learning_rate": 0.00016505972740682974, "loss": 0.5354, "mean_token_accuracy": 0.8661771559715271, "num_tokens": 120405127.0, "step": 42900 }, { "epoch": 1.0436639887381374, "grad_norm": 1.1040537357330322, "learning_rate": 0.0001648449991173819, "loss": 0.5471, "mean_token_accuracy": 0.8631951743364334, "num_tokens": 120685253.0, "step": 43000 }, { "epoch": 1.0460911142933425, "grad_norm": 1.1625252962112427, "learning_rate": 0.0001646297536519272, "loss": 0.5464, "mean_token_accuracy": 0.8636760532855987, "num_tokens": 120958040.0, "step": 43100 }, { "epoch": 1.0485182398485473, "grad_norm": 0.7705463767051697, "learning_rate": 0.00016441399272717138, "loss": 0.557, "mean_token_accuracy": 0.8624409341812134, "num_tokens": 121234837.0, "step": 43200 }, { "epoch": 1.0509453654037524, "grad_norm": 0.8841344714164734, "learning_rate": 0.00016419771806393144, "loss": 0.5191, "mean_token_accuracy": 0.8703099459409713, "num_tokens": 121513869.0, "step": 43300 }, { "epoch": 1.0533724909589572, "grad_norm": 0.9004896283149719, "learning_rate": 0.00016398093138712168, "loss": 0.5867, "mean_token_accuracy": 0.8558615297079086, "num_tokens": 121785188.0, "step": 43400 }, { "epoch": 1.0557996165141623, "grad_norm": 1.055458664894104, "learning_rate": 0.00016376363442574, "loss": 0.5247, "mean_token_accuracy": 0.8702590548992157, "num_tokens": 122066119.0, "step": 43500 }, { "epoch": 1.0582267420693672, "grad_norm": 0.9583166241645813, "learning_rate": 0.0001635458289128542, "loss": 0.5368, "mean_token_accuracy": 0.8678800988197327, "num_tokens": 122342074.0, "step": 43600 }, { "epoch": 1.0606538676245723, "grad_norm": 0.9781296849250793, "learning_rate": 0.00016332751658558787, "loss": 0.578, "mean_token_accuracy": 0.8574618774652482, "num_tokens": 122628920.0, "step": 43700 }, { "epoch": 1.0630809931797771, "grad_norm": 0.9489784836769104, "learning_rate": 0.00016310869918510692, "loss": 0.5913, "mean_token_accuracy": 0.8549783474206925, "num_tokens": 122904241.0, "step": 43800 }, { "epoch": 1.0655081187349822, "grad_norm": 0.9939345717430115, "learning_rate": 0.0001628893784566054, "loss": 0.5617, "mean_token_accuracy": 0.8607113534212112, "num_tokens": 123181146.0, "step": 43900 }, { "epoch": 1.067935244290187, "grad_norm": 0.7742923498153687, "learning_rate": 0.00016266955614929182, "loss": 0.5652, "mean_token_accuracy": 0.8603355962038041, "num_tokens": 123463197.0, "step": 44000 }, { "epoch": 1.0703623698453921, "grad_norm": 1.19265615940094, "learning_rate": 0.00016244923401637488, "loss": 0.577, "mean_token_accuracy": 0.8579962712526321, "num_tokens": 123741457.0, "step": 44100 }, { "epoch": 1.072789495400597, "grad_norm": 0.7991734147071838, "learning_rate": 0.00016222841381504982, "loss": 0.6114, "mean_token_accuracy": 0.8508148151636123, "num_tokens": 124007415.0, "step": 44200 }, { "epoch": 1.075216620955802, "grad_norm": 1.1364731788635254, "learning_rate": 0.00016200709730648428, "loss": 0.5637, "mean_token_accuracy": 0.8609515500068664, "num_tokens": 124296854.0, "step": 44300 }, { "epoch": 1.077643746511007, "grad_norm": 1.1264718770980835, "learning_rate": 0.00016178528625580404, "loss": 0.5208, "mean_token_accuracy": 0.8697484582662582, "num_tokens": 124592659.0, "step": 44400 }, { "epoch": 1.080070872066212, "grad_norm": 0.8853039145469666, "learning_rate": 0.00016156298243207937, "loss": 0.5543, "mean_token_accuracy": 0.8613111680746078, "num_tokens": 124871204.0, "step": 44500 }, { "epoch": 1.0824979976214169, "grad_norm": 0.9990018010139465, "learning_rate": 0.00016134018760831053, "loss": 0.5721, "mean_token_accuracy": 0.8585412526130676, "num_tokens": 125154712.0, "step": 44600 }, { "epoch": 1.084925123176622, "grad_norm": 0.8856194615364075, "learning_rate": 0.00016111690356141386, "loss": 0.5179, "mean_token_accuracy": 0.8699715811014176, "num_tokens": 125448505.0, "step": 44700 }, { "epoch": 1.0873522487318268, "grad_norm": 0.808142900466919, "learning_rate": 0.00016089313207220756, "loss": 0.5646, "mean_token_accuracy": 0.8604241162538528, "num_tokens": 125724217.0, "step": 44800 }, { "epoch": 1.0897793742870319, "grad_norm": 1.2607107162475586, "learning_rate": 0.00016066887492539734, "loss": 0.5373, "mean_token_accuracy": 0.8668778890371323, "num_tokens": 126005623.0, "step": 44900 }, { "epoch": 1.092206499842237, "grad_norm": 0.7924454808235168, "learning_rate": 0.00016044413390956246, "loss": 0.542, "mean_token_accuracy": 0.8655985903739929, "num_tokens": 126276783.0, "step": 45000 }, { "epoch": 1.0946336253974418, "grad_norm": 0.7327313423156738, "learning_rate": 0.00016021891081714117, "loss": 0.5363, "mean_token_accuracy": 0.8668161380290985, "num_tokens": 126555445.0, "step": 45100 }, { "epoch": 1.0970607509526469, "grad_norm": 1.1913338899612427, "learning_rate": 0.0001599932074444167, "loss": 0.5347, "mean_token_accuracy": 0.866722519993782, "num_tokens": 126849635.0, "step": 45200 }, { "epoch": 1.0994878765078517, "grad_norm": 0.9338855743408203, "learning_rate": 0.0001597670255915026, "loss": 0.5353, "mean_token_accuracy": 0.8668248879909516, "num_tokens": 127123102.0, "step": 45300 }, { "epoch": 1.1019150020630568, "grad_norm": 1.211919903755188, "learning_rate": 0.0001595403670623288, "loss": 0.5182, "mean_token_accuracy": 0.8705047732591629, "num_tokens": 127397810.0, "step": 45400 }, { "epoch": 1.1043421276182617, "grad_norm": 0.9928873181343079, "learning_rate": 0.00015931323366462688, "loss": 0.5326, "mean_token_accuracy": 0.8685156333446503, "num_tokens": 127672480.0, "step": 45500 }, { "epoch": 1.1067692531734667, "grad_norm": 0.7845079898834229, "learning_rate": 0.0001590856272099157, "loss": 0.5634, "mean_token_accuracy": 0.8639321917295456, "num_tokens": 127945894.0, "step": 45600 }, { "epoch": 1.1091963787286716, "grad_norm": 0.8590813279151917, "learning_rate": 0.00015885754951348714, "loss": 0.572, "mean_token_accuracy": 0.8595906603336334, "num_tokens": 128239112.0, "step": 45700 }, { "epoch": 1.1116235042838767, "grad_norm": 0.7593941688537598, "learning_rate": 0.0001586290023943915, "loss": 0.5291, "mean_token_accuracy": 0.8698451125621796, "num_tokens": 128511760.0, "step": 45800 }, { "epoch": 1.1140506298390815, "grad_norm": 0.8581081032752991, "learning_rate": 0.00015839998767542285, "loss": 0.5695, "mean_token_accuracy": 0.8599695748090744, "num_tokens": 128799919.0, "step": 45900 }, { "epoch": 1.1164777553942866, "grad_norm": 0.7672972679138184, "learning_rate": 0.00015817050718310483, "loss": 0.5171, "mean_token_accuracy": 0.870343611240387, "num_tokens": 129080858.0, "step": 46000 }, { "epoch": 1.1189048809494915, "grad_norm": 0.9828253984451294, "learning_rate": 0.0001579405627476757, "loss": 0.5116, "mean_token_accuracy": 0.873331093788147, "num_tokens": 129348368.0, "step": 46100 }, { "epoch": 1.1213320065046966, "grad_norm": 1.2287424802780151, "learning_rate": 0.00015771015620307416, "loss": 0.5462, "mean_token_accuracy": 0.8644060462713241, "num_tokens": 129622627.0, "step": 46200 }, { "epoch": 1.1237591320599014, "grad_norm": 0.8262332677841187, "learning_rate": 0.00015747928938692426, "loss": 0.5419, "mean_token_accuracy": 0.8658415484428406, "num_tokens": 129898767.0, "step": 46300 }, { "epoch": 1.1261862576151065, "grad_norm": 0.8397664427757263, "learning_rate": 0.00015724796414052112, "loss": 0.5229, "mean_token_accuracy": 0.8713884443044663, "num_tokens": 130187829.0, "step": 46400 }, { "epoch": 1.1286133831703113, "grad_norm": 0.8683608174324036, "learning_rate": 0.0001570161823088161, "loss": 0.5427, "mean_token_accuracy": 0.8661625987291336, "num_tokens": 130471642.0, "step": 46500 }, { "epoch": 1.1310405087255164, "grad_norm": 0.7690980434417725, "learning_rate": 0.000156783945740402, "loss": 0.547, "mean_token_accuracy": 0.8648913151025772, "num_tokens": 130742090.0, "step": 46600 }, { "epoch": 1.1334676342807213, "grad_norm": 1.0576156377792358, "learning_rate": 0.0001565512562874985, "loss": 0.5157, "mean_token_accuracy": 0.871839565038681, "num_tokens": 131012670.0, "step": 46700 }, { "epoch": 1.1358947598359264, "grad_norm": 0.6827182769775391, "learning_rate": 0.00015631811580593723, "loss": 0.5295, "mean_token_accuracy": 0.8686037886142731, "num_tokens": 131302863.0, "step": 46800 }, { "epoch": 1.1383218853911312, "grad_norm": 1.2057856321334839, "learning_rate": 0.00015608452615514709, "loss": 0.5436, "mean_token_accuracy": 0.8661547493934632, "num_tokens": 131582193.0, "step": 46900 }, { "epoch": 1.1407490109463363, "grad_norm": 0.8514811396598816, "learning_rate": 0.00015585048919813927, "loss": 0.5122, "mean_token_accuracy": 0.8713947582244873, "num_tokens": 131853932.0, "step": 47000 }, { "epoch": 1.1431761365015412, "grad_norm": 0.8594823479652405, "learning_rate": 0.00015561600680149263, "loss": 0.5057, "mean_token_accuracy": 0.8751941341161728, "num_tokens": 132137048.0, "step": 47100 }, { "epoch": 1.1456032620567462, "grad_norm": 1.0555239915847778, "learning_rate": 0.0001553810808353385, "loss": 0.5096, "mean_token_accuracy": 0.8731654924154282, "num_tokens": 132413595.0, "step": 47200 }, { "epoch": 1.148030387611951, "grad_norm": 0.9065008759498596, "learning_rate": 0.00015514571317334607, "loss": 0.5027, "mean_token_accuracy": 0.8751115143299103, "num_tokens": 132692775.0, "step": 47300 }, { "epoch": 1.1504575131671562, "grad_norm": 1.3132802248001099, "learning_rate": 0.00015490990569270725, "loss": 0.5114, "mean_token_accuracy": 0.8728679984807968, "num_tokens": 132963629.0, "step": 47400 }, { "epoch": 1.152884638722361, "grad_norm": 0.9391400814056396, "learning_rate": 0.00015467366027412167, "loss": 0.5216, "mean_token_accuracy": 0.8717411142587662, "num_tokens": 133225134.0, "step": 47500 }, { "epoch": 1.155311764277566, "grad_norm": 0.8264212012290955, "learning_rate": 0.00015443697880178193, "loss": 0.5737, "mean_token_accuracy": 0.8609393298625946, "num_tokens": 133507440.0, "step": 47600 }, { "epoch": 1.157738889832771, "grad_norm": 0.9553474187850952, "learning_rate": 0.0001541998631633583, "loss": 0.5605, "mean_token_accuracy": 0.8621794009208679, "num_tokens": 133784051.0, "step": 47700 }, { "epoch": 1.160166015387976, "grad_norm": 0.7231343388557434, "learning_rate": 0.0001539623152499838, "loss": 0.5448, "mean_token_accuracy": 0.8658911234140396, "num_tokens": 134065800.0, "step": 47800 }, { "epoch": 1.162593140943181, "grad_norm": 0.8352395296096802, "learning_rate": 0.0001537243369562391, "loss": 0.549, "mean_token_accuracy": 0.866152486205101, "num_tokens": 134365482.0, "step": 47900 }, { "epoch": 1.165020266498386, "grad_norm": 0.6428236365318298, "learning_rate": 0.00015348593018013742, "loss": 0.49, "mean_token_accuracy": 0.8778564035892487, "num_tokens": 134656680.0, "step": 48000 }, { "epoch": 1.167447392053591, "grad_norm": 0.7582641243934631, "learning_rate": 0.00015324709682310925, "loss": 0.5352, "mean_token_accuracy": 0.8687029981613159, "num_tokens": 134943558.0, "step": 48100 }, { "epoch": 1.169874517608796, "grad_norm": 1.4312922954559326, "learning_rate": 0.0001530078387899875, "loss": 0.5237, "mean_token_accuracy": 0.870193499326706, "num_tokens": 135204101.0, "step": 48200 }, { "epoch": 1.1723016431640008, "grad_norm": 1.1317497491836548, "learning_rate": 0.00015276815798899196, "loss": 0.5042, "mean_token_accuracy": 0.875024488568306, "num_tokens": 135482167.0, "step": 48300 }, { "epoch": 1.1747287687192058, "grad_norm": 0.759957492351532, "learning_rate": 0.0001525280563317143, "loss": 0.5172, "mean_token_accuracy": 0.8706453287601471, "num_tokens": 135753938.0, "step": 48400 }, { "epoch": 1.177155894274411, "grad_norm": 1.225609302520752, "learning_rate": 0.00015228753573310274, "loss": 0.4963, "mean_token_accuracy": 0.8768988955020904, "num_tokens": 136036050.0, "step": 48500 }, { "epoch": 1.1795830198296158, "grad_norm": 0.5895382761955261, "learning_rate": 0.00015204659811144685, "loss": 0.5199, "mean_token_accuracy": 0.8719930589199066, "num_tokens": 136327290.0, "step": 48600 }, { "epoch": 1.1820101453848209, "grad_norm": 0.7858893871307373, "learning_rate": 0.00015180524538836214, "loss": 0.5029, "mean_token_accuracy": 0.8763481539487838, "num_tokens": 136614197.0, "step": 48700 }, { "epoch": 1.1844372709400257, "grad_norm": 0.7872015833854675, "learning_rate": 0.00015156347948877481, "loss": 0.5278, "mean_token_accuracy": 0.8710852640867234, "num_tokens": 136885103.0, "step": 48800 }, { "epoch": 1.1868643964952308, "grad_norm": 1.455886960029602, "learning_rate": 0.00015132130234090635, "loss": 0.4845, "mean_token_accuracy": 0.8797940272092819, "num_tokens": 137165613.0, "step": 48900 }, { "epoch": 1.1892915220504356, "grad_norm": 0.7938244938850403, "learning_rate": 0.00015107871587625825, "loss": 0.469, "mean_token_accuracy": 0.883348827958107, "num_tokens": 137444709.0, "step": 49000 }, { "epoch": 1.1917186476056407, "grad_norm": 0.6187699437141418, "learning_rate": 0.00015083572202959645, "loss": 0.4834, "mean_token_accuracy": 0.8800075507164001, "num_tokens": 137733269.0, "step": 49100 }, { "epoch": 1.1941457731608456, "grad_norm": 0.6408334374427795, "learning_rate": 0.00015059232273893612, "loss": 0.5254, "mean_token_accuracy": 0.8707186883687973, "num_tokens": 138005114.0, "step": 49200 }, { "epoch": 1.1965728987160507, "grad_norm": 1.040124773979187, "learning_rate": 0.00015034851994552594, "loss": 0.5541, "mean_token_accuracy": 0.8649247759580612, "num_tokens": 138291510.0, "step": 49300 }, { "epoch": 1.1990000242712555, "grad_norm": 0.8435716032981873, "learning_rate": 0.00015010431559383286, "loss": 0.5154, "mean_token_accuracy": 0.8739145505428314, "num_tokens": 138576071.0, "step": 49400 }, { "epoch": 1.2014271498264606, "grad_norm": 0.743602454662323, "learning_rate": 0.00014985971163152646, "loss": 0.4815, "mean_token_accuracy": 0.8803683018684387, "num_tokens": 138856997.0, "step": 49500 }, { "epoch": 1.2038542753816655, "grad_norm": 0.5915963649749756, "learning_rate": 0.00014961471000946342, "loss": 0.4817, "mean_token_accuracy": 0.8809413892030716, "num_tokens": 139136968.0, "step": 49600 }, { "epoch": 1.2062814009368705, "grad_norm": 1.0429223775863647, "learning_rate": 0.00014936931268167198, "loss": 0.5032, "mean_token_accuracy": 0.8750742501020432, "num_tokens": 139405929.0, "step": 49700 }, { "epoch": 1.2087085264920754, "grad_norm": 0.8039318323135376, "learning_rate": 0.00014912352160533637, "loss": 0.4694, "mean_token_accuracy": 0.8825742882490158, "num_tokens": 139685506.0, "step": 49800 }, { "epoch": 1.2111356520472805, "grad_norm": 0.7582560777664185, "learning_rate": 0.00014887733874078125, "loss": 0.5195, "mean_token_accuracy": 0.872511134147644, "num_tokens": 139964509.0, "step": 49900 }, { "epoch": 1.2135627776024853, "grad_norm": 0.9156772494316101, "learning_rate": 0.0001486307660514559, "loss": 0.5017, "mean_token_accuracy": 0.8769380474090576, "num_tokens": 140239409.0, "step": 50000 }, { "epoch": 1.2159899031576904, "grad_norm": 0.8511459827423096, "learning_rate": 0.0001483838055039188, "loss": 0.5256, "mean_token_accuracy": 0.8710427796840667, "num_tokens": 140535879.0, "step": 50100 }, { "epoch": 1.2184170287128953, "grad_norm": 0.977913498878479, "learning_rate": 0.0001481364590678217, "loss": 0.4692, "mean_token_accuracy": 0.8834199732542038, "num_tokens": 140816971.0, "step": 50200 }, { "epoch": 1.2208441542681003, "grad_norm": 0.7674177885055542, "learning_rate": 0.00014788872871589412, "loss": 0.5214, "mean_token_accuracy": 0.8728067207336426, "num_tokens": 141101347.0, "step": 50300 }, { "epoch": 1.2232712798233052, "grad_norm": 1.071616291999817, "learning_rate": 0.0001476406164239275, "loss": 0.5249, "mean_token_accuracy": 0.8710673362016678, "num_tokens": 141373365.0, "step": 50400 }, { "epoch": 1.2256984053785103, "grad_norm": 0.719029426574707, "learning_rate": 0.00014739212417075944, "loss": 0.418, "mean_token_accuracy": 0.8950458431243896, "num_tokens": 141650751.0, "step": 50500 }, { "epoch": 1.2281255309337151, "grad_norm": 0.735320508480072, "learning_rate": 0.00014714325393825806, "loss": 0.4815, "mean_token_accuracy": 0.8804818952083587, "num_tokens": 141940470.0, "step": 50600 }, { "epoch": 1.2305526564889202, "grad_norm": 1.0473617315292358, "learning_rate": 0.0001468940077113059, "loss": 0.4744, "mean_token_accuracy": 0.8823691141605378, "num_tokens": 142212013.0, "step": 50700 }, { "epoch": 1.232979782044125, "grad_norm": 0.8890407681465149, "learning_rate": 0.00014664438747778445, "loss": 0.4904, "mean_token_accuracy": 0.8795123302936554, "num_tokens": 142488791.0, "step": 50800 }, { "epoch": 1.2354069075993301, "grad_norm": 0.7927642464637756, "learning_rate": 0.00014639439522855802, "loss": 0.4976, "mean_token_accuracy": 0.8760099506378174, "num_tokens": 142773638.0, "step": 50900 }, { "epoch": 1.237834033154535, "grad_norm": 0.649486780166626, "learning_rate": 0.00014614403295745793, "loss": 0.5262, "mean_token_accuracy": 0.8704094517230988, "num_tokens": 143072258.0, "step": 51000 }, { "epoch": 1.24026115870974, "grad_norm": 0.8649196624755859, "learning_rate": 0.00014589330266126673, "loss": 0.4881, "mean_token_accuracy": 0.8796616047620773, "num_tokens": 143353223.0, "step": 51100 }, { "epoch": 1.242688284264945, "grad_norm": 1.0805305242538452, "learning_rate": 0.00014564220633970213, "loss": 0.4871, "mean_token_accuracy": 0.8797647720575332, "num_tokens": 143630606.0, "step": 51200 }, { "epoch": 1.24511540982015, "grad_norm": 1.1342017650604248, "learning_rate": 0.0001453907459954011, "loss": 0.5004, "mean_token_accuracy": 0.877112073302269, "num_tokens": 143909256.0, "step": 51300 }, { "epoch": 1.2475425353753549, "grad_norm": 0.7277913689613342, "learning_rate": 0.0001451389236339039, "loss": 0.4879, "mean_token_accuracy": 0.8794431656599044, "num_tokens": 144184234.0, "step": 51400 }, { "epoch": 1.24996966093056, "grad_norm": 1.1246638298034668, "learning_rate": 0.0001448867412636381, "loss": 0.5452, "mean_token_accuracy": 0.8682722455263138, "num_tokens": 144472017.0, "step": 51500 }, { "epoch": 1.252396786485765, "grad_norm": 0.7161349654197693, "learning_rate": 0.00014463420089590255, "loss": 0.4728, "mean_token_accuracy": 0.8832290679216385, "num_tokens": 144769954.0, "step": 51600 }, { "epoch": 1.2548239120409699, "grad_norm": 0.9632692933082581, "learning_rate": 0.00014438130454485126, "loss": 0.4891, "mean_token_accuracy": 0.8817065650224686, "num_tokens": 145053314.0, "step": 51700 }, { "epoch": 1.2572510375961747, "grad_norm": 1.6564607620239258, "learning_rate": 0.00014412805422747753, "loss": 0.4838, "mean_token_accuracy": 0.8807741963863372, "num_tokens": 145328651.0, "step": 51800 }, { "epoch": 1.2596781631513798, "grad_norm": 0.6910829544067383, "learning_rate": 0.00014387445196359768, "loss": 0.4659, "mean_token_accuracy": 0.884465005993843, "num_tokens": 145616910.0, "step": 51900 }, { "epoch": 1.262105288706585, "grad_norm": 0.638996422290802, "learning_rate": 0.000143620499775835, "loss": 0.4694, "mean_token_accuracy": 0.8855656278133393, "num_tokens": 145893173.0, "step": 52000 }, { "epoch": 1.2645324142617898, "grad_norm": 0.835657000541687, "learning_rate": 0.00014336619968960363, "loss": 0.5138, "mean_token_accuracy": 0.8743197238445282, "num_tokens": 146166823.0, "step": 52100 }, { "epoch": 1.2669595398169946, "grad_norm": 0.753837525844574, "learning_rate": 0.0001431115537330924, "loss": 0.5221, "mean_token_accuracy": 0.8723088407516479, "num_tokens": 146426455.0, "step": 52200 }, { "epoch": 1.2693866653721997, "grad_norm": 0.6533814072608948, "learning_rate": 0.00014285656393724872, "loss": 0.4719, "mean_token_accuracy": 0.8833791041374206, "num_tokens": 146700648.0, "step": 52300 }, { "epoch": 1.2718137909274048, "grad_norm": 0.75107741355896, "learning_rate": 0.00014260123233576216, "loss": 0.4743, "mean_token_accuracy": 0.8824715614318848, "num_tokens": 146986402.0, "step": 52400 }, { "epoch": 1.2742409164826096, "grad_norm": 0.9934569597244263, "learning_rate": 0.00014234556096504852, "loss": 0.5059, "mean_token_accuracy": 0.875720181465149, "num_tokens": 147271442.0, "step": 52500 }, { "epoch": 1.2766680420378147, "grad_norm": 0.8318378329277039, "learning_rate": 0.00014208955186423343, "loss": 0.4761, "mean_token_accuracy": 0.8846336156129837, "num_tokens": 147538750.0, "step": 52600 }, { "epoch": 1.2790951675930196, "grad_norm": 0.6633797287940979, "learning_rate": 0.00014183320707513605, "loss": 0.4853, "mean_token_accuracy": 0.8816380894184113, "num_tokens": 147816277.0, "step": 52700 }, { "epoch": 1.2815222931482246, "grad_norm": 0.6250624060630798, "learning_rate": 0.00014157652864225297, "loss": 0.492, "mean_token_accuracy": 0.8797484838962555, "num_tokens": 148104573.0, "step": 52800 }, { "epoch": 1.2839494187034295, "grad_norm": 0.6616709232330322, "learning_rate": 0.0001413195186127416, "loss": 0.4848, "mean_token_accuracy": 0.8814722353219986, "num_tokens": 148385633.0, "step": 52900 }, { "epoch": 1.2863765442586346, "grad_norm": 1.0039057731628418, "learning_rate": 0.00014106217903640423, "loss": 0.4626, "mean_token_accuracy": 0.8860047221183777, "num_tokens": 148675690.0, "step": 53000 }, { "epoch": 1.2888036698138394, "grad_norm": 0.8785421252250671, "learning_rate": 0.00014080451196567126, "loss": 0.4661, "mean_token_accuracy": 0.8858687382936478, "num_tokens": 148958845.0, "step": 53100 }, { "epoch": 1.2912307953690445, "grad_norm": 0.6556010842323303, "learning_rate": 0.00014054651945558523, "loss": 0.495, "mean_token_accuracy": 0.8791573786735535, "num_tokens": 149223683.0, "step": 53200 }, { "epoch": 1.2936579209242494, "grad_norm": 0.6850820183753967, "learning_rate": 0.00014028820356378412, "loss": 0.5025, "mean_token_accuracy": 0.8775439101457596, "num_tokens": 149506639.0, "step": 53300 }, { "epoch": 1.2960850464794544, "grad_norm": 1.0605202913284302, "learning_rate": 0.00014002956635048518, "loss": 0.4572, "mean_token_accuracy": 0.8866003942489624, "num_tokens": 149782390.0, "step": 53400 }, { "epoch": 1.2985121720346593, "grad_norm": 0.7321272492408752, "learning_rate": 0.00013977060987846823, "loss": 0.4635, "mean_token_accuracy": 0.8857458835840225, "num_tokens": 150060443.0, "step": 53500 }, { "epoch": 1.3009392975898644, "grad_norm": 0.8881349563598633, "learning_rate": 0.00013951133621305955, "loss": 0.4523, "mean_token_accuracy": 0.8875695073604584, "num_tokens": 150325449.0, "step": 53600 }, { "epoch": 1.3033664231450692, "grad_norm": 0.715872049331665, "learning_rate": 0.00013925174742211502, "loss": 0.46, "mean_token_accuracy": 0.8867443364858627, "num_tokens": 150598642.0, "step": 53700 }, { "epoch": 1.3057935487002743, "grad_norm": 0.7793150544166565, "learning_rate": 0.00013899184557600398, "loss": 0.4517, "mean_token_accuracy": 0.8886141180992126, "num_tokens": 150898833.0, "step": 53800 }, { "epoch": 1.3082206742554792, "grad_norm": 1.7053370475769043, "learning_rate": 0.00013873163274759248, "loss": 0.4616, "mean_token_accuracy": 0.8852343845367432, "num_tokens": 151168628.0, "step": 53900 }, { "epoch": 1.3106477998106842, "grad_norm": 0.8918945789337158, "learning_rate": 0.00013847111101222687, "loss": 0.433, "mean_token_accuracy": 0.8909021973609924, "num_tokens": 151450450.0, "step": 54000 }, { "epoch": 1.313074925365889, "grad_norm": 1.0090819597244263, "learning_rate": 0.0001382102824477172, "loss": 0.4479, "mean_token_accuracy": 0.8897096848487854, "num_tokens": 151740252.0, "step": 54100 }, { "epoch": 1.3155020509210942, "grad_norm": 0.9295179843902588, "learning_rate": 0.0001379491491343207, "loss": 0.4753, "mean_token_accuracy": 0.8836379379034043, "num_tokens": 152021939.0, "step": 54200 }, { "epoch": 1.317929176476299, "grad_norm": 0.7607155442237854, "learning_rate": 0.00013768771315472502, "loss": 0.4402, "mean_token_accuracy": 0.8909223932027817, "num_tokens": 152302100.0, "step": 54300 }, { "epoch": 1.3203563020315041, "grad_norm": 0.8280419111251831, "learning_rate": 0.00013742597659403187, "loss": 0.4331, "mean_token_accuracy": 0.8921854722499848, "num_tokens": 152575683.0, "step": 54400 }, { "epoch": 1.322783427586709, "grad_norm": 0.7644575238227844, "learning_rate": 0.0001371639415397402, "loss": 0.4523, "mean_token_accuracy": 0.8892707014083863, "num_tokens": 152864215.0, "step": 54500 }, { "epoch": 1.325210553141914, "grad_norm": 0.6379640698432922, "learning_rate": 0.00013690161008172975, "loss": 0.4586, "mean_token_accuracy": 0.8860977566242219, "num_tokens": 153144000.0, "step": 54600 }, { "epoch": 1.3276376786971191, "grad_norm": 1.0883742570877075, "learning_rate": 0.00013663898431224403, "loss": 0.4432, "mean_token_accuracy": 0.8918490415811539, "num_tokens": 153428731.0, "step": 54700 }, { "epoch": 1.330064804252324, "grad_norm": 0.708063542842865, "learning_rate": 0.000136376066325874, "loss": 0.4054, "mean_token_accuracy": 0.8982583904266357, "num_tokens": 153707438.0, "step": 54800 }, { "epoch": 1.3324919298075288, "grad_norm": 0.9368717074394226, "learning_rate": 0.0001361128582195412, "loss": 0.4359, "mean_token_accuracy": 0.8917915511131287, "num_tokens": 154010417.0, "step": 54900 }, { "epoch": 1.334919055362734, "grad_norm": 0.8366745710372925, "learning_rate": 0.000135849362092481, "loss": 0.4584, "mean_token_accuracy": 0.8874488598108292, "num_tokens": 154284670.0, "step": 55000 }, { "epoch": 1.337346180917939, "grad_norm": 0.8448474407196045, "learning_rate": 0.00013558558004622597, "loss": 0.4232, "mean_token_accuracy": 0.8938626998662949, "num_tokens": 154557986.0, "step": 55100 }, { "epoch": 1.3397733064731439, "grad_norm": 0.7955624461174011, "learning_rate": 0.00013532151418458894, "loss": 0.4239, "mean_token_accuracy": 0.8958518970012664, "num_tokens": 154831844.0, "step": 55200 }, { "epoch": 1.3422004320283487, "grad_norm": 0.9134492874145508, "learning_rate": 0.00013505716661364641, "loss": 0.4122, "mean_token_accuracy": 0.8974074327945709, "num_tokens": 155109417.0, "step": 55300 }, { "epoch": 1.3446275575835538, "grad_norm": 0.8050549626350403, "learning_rate": 0.00013479253944172177, "loss": 0.4699, "mean_token_accuracy": 0.8858902537822724, "num_tokens": 155387541.0, "step": 55400 }, { "epoch": 1.3470546831387589, "grad_norm": 0.9216250777244568, "learning_rate": 0.00013452763477936813, "loss": 0.4241, "mean_token_accuracy": 0.8962391257286072, "num_tokens": 155672639.0, "step": 55500 }, { "epoch": 1.3494818086939637, "grad_norm": 0.9659915566444397, "learning_rate": 0.00013426245473935198, "loss": 0.4656, "mean_token_accuracy": 0.8873411613702774, "num_tokens": 155947597.0, "step": 55600 }, { "epoch": 1.3519089342491686, "grad_norm": 0.7852197289466858, "learning_rate": 0.00013399700143663598, "loss": 0.4728, "mean_token_accuracy": 0.8850886446237564, "num_tokens": 156238718.0, "step": 55700 }, { "epoch": 1.3543360598043737, "grad_norm": 0.78695148229599, "learning_rate": 0.00013373127698836223, "loss": 0.4239, "mean_token_accuracy": 0.8962297958135604, "num_tokens": 156517043.0, "step": 55800 }, { "epoch": 1.3567631853595787, "grad_norm": 0.8713340759277344, "learning_rate": 0.00013346528351383544, "loss": 0.433, "mean_token_accuracy": 0.8928613644838334, "num_tokens": 156799092.0, "step": 55900 }, { "epoch": 1.3591903109147836, "grad_norm": 0.6828170418739319, "learning_rate": 0.00013319902313450594, "loss": 0.4466, "mean_token_accuracy": 0.8914445018768311, "num_tokens": 157093747.0, "step": 56000 }, { "epoch": 1.3616174364699887, "grad_norm": 0.9015485048294067, "learning_rate": 0.00013293249797395268, "loss": 0.4516, "mean_token_accuracy": 0.8881697982549668, "num_tokens": 157384150.0, "step": 56100 }, { "epoch": 1.3640445620251935, "grad_norm": 0.7890717387199402, "learning_rate": 0.0001326657101578665, "loss": 0.4054, "mean_token_accuracy": 0.899683158993721, "num_tokens": 157680572.0, "step": 56200 }, { "epoch": 1.3664716875803986, "grad_norm": 0.7425559163093567, "learning_rate": 0.00013239866181403303, "loss": 0.4496, "mean_token_accuracy": 0.8901696348190308, "num_tokens": 157960621.0, "step": 56300 }, { "epoch": 1.3688988131356035, "grad_norm": 0.9881618022918701, "learning_rate": 0.00013213135507231577, "loss": 0.488, "mean_token_accuracy": 0.8811579287052155, "num_tokens": 158246805.0, "step": 56400 }, { "epoch": 1.3713259386908085, "grad_norm": 0.7465482950210571, "learning_rate": 0.00013186379206463902, "loss": 0.423, "mean_token_accuracy": 0.8940630006790161, "num_tokens": 158514607.0, "step": 56500 }, { "epoch": 1.3737530642460134, "grad_norm": 0.8724623918533325, "learning_rate": 0.00013159597492497113, "loss": 0.4433, "mean_token_accuracy": 0.8902081990242005, "num_tokens": 158792485.0, "step": 56600 }, { "epoch": 1.3761801898012185, "grad_norm": 0.741797924041748, "learning_rate": 0.00013132790578930704, "loss": 0.455, "mean_token_accuracy": 0.8891710060834884, "num_tokens": 159075983.0, "step": 56700 }, { "epoch": 1.3786073153564233, "grad_norm": 0.6590899229049683, "learning_rate": 0.00013105958679565175, "loss": 0.4431, "mean_token_accuracy": 0.8901263773441315, "num_tokens": 159364576.0, "step": 56800 }, { "epoch": 1.3810344409116284, "grad_norm": 0.7108921408653259, "learning_rate": 0.00013079102008400285, "loss": 0.4451, "mean_token_accuracy": 0.8915826362371445, "num_tokens": 159648774.0, "step": 56900 }, { "epoch": 1.3834615664668333, "grad_norm": 0.7816290259361267, "learning_rate": 0.00013052220779633368, "loss": 0.426, "mean_token_accuracy": 0.8951281315088272, "num_tokens": 159934295.0, "step": 57000 }, { "epoch": 1.3858886920220383, "grad_norm": 0.5931801199913025, "learning_rate": 0.00013025315207657625, "loss": 0.4106, "mean_token_accuracy": 0.8973771643638611, "num_tokens": 160207873.0, "step": 57100 }, { "epoch": 1.3883158175772432, "grad_norm": 0.9697718024253845, "learning_rate": 0.00012998385507060394, "loss": 0.401, "mean_token_accuracy": 0.9002759510278702, "num_tokens": 160493828.0, "step": 57200 }, { "epoch": 1.3907429431324483, "grad_norm": 0.9094292521476746, "learning_rate": 0.00012971431892621466, "loss": 0.4554, "mean_token_accuracy": 0.8877697956562042, "num_tokens": 160772760.0, "step": 57300 }, { "epoch": 1.3931700686876531, "grad_norm": 0.6295153498649597, "learning_rate": 0.00012944454579311355, "loss": 0.4425, "mean_token_accuracy": 0.8911004400253296, "num_tokens": 161056385.0, "step": 57400 }, { "epoch": 1.3955971942428582, "grad_norm": 0.6608839631080627, "learning_rate": 0.00012917453782289578, "loss": 0.3784, "mean_token_accuracy": 0.904485547542572, "num_tokens": 161348717.0, "step": 57500 }, { "epoch": 1.398024319798063, "grad_norm": 0.8495998978614807, "learning_rate": 0.00012890429716902964, "loss": 0.4467, "mean_token_accuracy": 0.8919394075870514, "num_tokens": 161627738.0, "step": 57600 }, { "epoch": 1.4004514453532682, "grad_norm": 0.593804657459259, "learning_rate": 0.00012863382598683908, "loss": 0.4381, "mean_token_accuracy": 0.8924766594171524, "num_tokens": 161903764.0, "step": 57700 }, { "epoch": 1.4028785709084732, "grad_norm": 0.8975790739059448, "learning_rate": 0.00012836312643348666, "loss": 0.4423, "mean_token_accuracy": 0.8917118191719056, "num_tokens": 162185851.0, "step": 57800 }, { "epoch": 1.405305696463678, "grad_norm": 0.6902527809143066, "learning_rate": 0.0001280922006679564, "loss": 0.4134, "mean_token_accuracy": 0.8982209903001785, "num_tokens": 162457139.0, "step": 57900 }, { "epoch": 1.407732822018883, "grad_norm": 0.48210135102272034, "learning_rate": 0.00012782105085103648, "loss": 0.3923, "mean_token_accuracy": 0.9033814692497253, "num_tokens": 162748843.0, "step": 58000 }, { "epoch": 1.410159947574088, "grad_norm": 0.6424720883369446, "learning_rate": 0.00012754967914530192, "loss": 0.4599, "mean_token_accuracy": 0.8881127738952637, "num_tokens": 163031004.0, "step": 58100 }, { "epoch": 1.412587073129293, "grad_norm": 0.6549175381660461, "learning_rate": 0.0001272780877150975, "loss": 0.4222, "mean_token_accuracy": 0.8963321650028229, "num_tokens": 163312021.0, "step": 58200 }, { "epoch": 1.415014198684498, "grad_norm": 0.7186068296432495, "learning_rate": 0.0001270062787265205, "loss": 0.4635, "mean_token_accuracy": 0.8873461991548538, "num_tokens": 163592479.0, "step": 58300 }, { "epoch": 1.4174413242397028, "grad_norm": 0.9494929909706116, "learning_rate": 0.00012673425434740315, "loss": 0.4281, "mean_token_accuracy": 0.8949847120046616, "num_tokens": 163875865.0, "step": 58400 }, { "epoch": 1.419868449794908, "grad_norm": 0.546734094619751, "learning_rate": 0.00012646201674729578, "loss": 0.3853, "mean_token_accuracy": 0.9039551669359207, "num_tokens": 164147648.0, "step": 58500 }, { "epoch": 1.422295575350113, "grad_norm": 0.598573625087738, "learning_rate": 0.00012618956809744915, "loss": 0.3922, "mean_token_accuracy": 0.9019291293621063, "num_tokens": 164431800.0, "step": 58600 }, { "epoch": 1.4247227009053178, "grad_norm": 0.656909704208374, "learning_rate": 0.00012591691057079726, "loss": 0.4027, "mean_token_accuracy": 0.9010864180326462, "num_tokens": 164713062.0, "step": 58700 }, { "epoch": 1.4271498264605227, "grad_norm": 0.6891266703605652, "learning_rate": 0.00012564404634194006, "loss": 0.4059, "mean_token_accuracy": 0.9008734589815139, "num_tokens": 165013179.0, "step": 58800 }, { "epoch": 1.4295769520157278, "grad_norm": 0.8763130903244019, "learning_rate": 0.00012537097758712605, "loss": 0.3738, "mean_token_accuracy": 0.9075895375013352, "num_tokens": 165302834.0, "step": 58900 }, { "epoch": 1.4320040775709328, "grad_norm": 0.8782544136047363, "learning_rate": 0.00012509770648423495, "loss": 0.4009, "mean_token_accuracy": 0.9022983765602112, "num_tokens": 165582933.0, "step": 59000 }, { "epoch": 1.4344312031261377, "grad_norm": 0.7040051221847534, "learning_rate": 0.0001248242352127603, "loss": 0.4491, "mean_token_accuracy": 0.8912272185087204, "num_tokens": 165868666.0, "step": 59100 }, { "epoch": 1.4368583286813426, "grad_norm": 0.65074622631073, "learning_rate": 0.00012455056595379212, "loss": 0.4357, "mean_token_accuracy": 0.8948434573411942, "num_tokens": 166145044.0, "step": 59200 }, { "epoch": 1.4392854542365476, "grad_norm": 0.7173576354980469, "learning_rate": 0.0001242767008899995, "loss": 0.4587, "mean_token_accuracy": 0.888498705625534, "num_tokens": 166438123.0, "step": 59300 }, { "epoch": 1.4417125797917527, "grad_norm": 0.7104485630989075, "learning_rate": 0.00012400264220561322, "loss": 0.41, "mean_token_accuracy": 0.9003169226646424, "num_tokens": 166725344.0, "step": 59400 }, { "epoch": 1.4441397053469576, "grad_norm": 0.8286243677139282, "learning_rate": 0.00012372839208640816, "loss": 0.4163, "mean_token_accuracy": 0.8978767049312592, "num_tokens": 167007251.0, "step": 59500 }, { "epoch": 1.4465668309021626, "grad_norm": 1.1104589700698853, "learning_rate": 0.00012345395271968617, "loss": 0.4176, "mean_token_accuracy": 0.8967356210947037, "num_tokens": 167285691.0, "step": 59600 }, { "epoch": 1.4489939564573675, "grad_norm": 0.7582440376281738, "learning_rate": 0.00012317932629425833, "loss": 0.4242, "mean_token_accuracy": 0.8963595002889633, "num_tokens": 167577906.0, "step": 59700 }, { "epoch": 1.4514210820125726, "grad_norm": 0.651850163936615, "learning_rate": 0.00012290451500042768, "loss": 0.3934, "mean_token_accuracy": 0.9035976582765579, "num_tokens": 167857298.0, "step": 59800 }, { "epoch": 1.4538482075677774, "grad_norm": 0.5393962860107422, "learning_rate": 0.00012262952102997168, "loss": 0.3914, "mean_token_accuracy": 0.9034919530153275, "num_tokens": 168138829.0, "step": 59900 }, { "epoch": 1.4562753331229825, "grad_norm": 0.6189530491828918, "learning_rate": 0.00012235434657612482, "loss": 0.4317, "mean_token_accuracy": 0.8959889018535614, "num_tokens": 168426422.0, "step": 60000 }, { "epoch": 1.4587024586781874, "grad_norm": 0.7726854085922241, "learning_rate": 0.00012207899383356085, "loss": 0.4203, "mean_token_accuracy": 0.8981534975767136, "num_tokens": 168707865.0, "step": 60100 }, { "epoch": 1.4611295842333925, "grad_norm": 0.7277886867523193, "learning_rate": 0.00012180346499837567, "loss": 0.3604, "mean_token_accuracy": 0.909367219209671, "num_tokens": 168985511.0, "step": 60200 }, { "epoch": 1.4635567097885973, "grad_norm": 0.7043424248695374, "learning_rate": 0.00012152776226806955, "loss": 0.3954, "mean_token_accuracy": 0.9025062710046768, "num_tokens": 169272720.0, "step": 60300 }, { "epoch": 1.4659838353438024, "grad_norm": 0.6475239396095276, "learning_rate": 0.0001212518878415296, "loss": 0.402, "mean_token_accuracy": 0.9011396139860153, "num_tokens": 169545792.0, "step": 60400 }, { "epoch": 1.4684109608990072, "grad_norm": 0.9668689966201782, "learning_rate": 0.00012097584391901244, "loss": 0.4272, "mean_token_accuracy": 0.8966371548175812, "num_tokens": 169820743.0, "step": 60500 }, { "epoch": 1.4708380864542123, "grad_norm": 0.5923229455947876, "learning_rate": 0.00012069963270212645, "loss": 0.3847, "mean_token_accuracy": 0.9052494311332703, "num_tokens": 170077149.0, "step": 60600 }, { "epoch": 1.4732652120094172, "grad_norm": 0.9675098061561584, "learning_rate": 0.00012042325639381424, "loss": 0.4158, "mean_token_accuracy": 0.8990872955322265, "num_tokens": 170357611.0, "step": 60700 }, { "epoch": 1.4756923375646223, "grad_norm": 0.9949508905410767, "learning_rate": 0.00012014671719833521, "loss": 0.4143, "mean_token_accuracy": 0.8991787171363831, "num_tokens": 170636443.0, "step": 60800 }, { "epoch": 1.4781194631198271, "grad_norm": 0.6021668314933777, "learning_rate": 0.0001198700173212478, "loss": 0.4384, "mean_token_accuracy": 0.8946084171533585, "num_tokens": 170928533.0, "step": 60900 }, { "epoch": 1.4805465886750322, "grad_norm": 0.7970176935195923, "learning_rate": 0.000119593158969392, "loss": 0.3983, "mean_token_accuracy": 0.9021727895736694, "num_tokens": 171211196.0, "step": 61000 }, { "epoch": 1.482973714230237, "grad_norm": 0.7371858954429626, "learning_rate": 0.00011931614435087174, "loss": 0.4151, "mean_token_accuracy": 0.8985683709383011, "num_tokens": 171479735.0, "step": 61100 }, { "epoch": 1.4854008397854421, "grad_norm": 0.7278369069099426, "learning_rate": 0.00011903897567503723, "loss": 0.3895, "mean_token_accuracy": 0.904181165099144, "num_tokens": 171755093.0, "step": 61200 }, { "epoch": 1.4878279653406472, "grad_norm": 0.9078586101531982, "learning_rate": 0.00011876165515246744, "loss": 0.4234, "mean_token_accuracy": 0.8975623792409897, "num_tokens": 172048894.0, "step": 61300 }, { "epoch": 1.490255090895852, "grad_norm": 0.6660951375961304, "learning_rate": 0.0001184841849949524, "loss": 0.3481, "mean_token_accuracy": 0.9130461746454239, "num_tokens": 172356896.0, "step": 61400 }, { "epoch": 1.492682216451057, "grad_norm": 0.7461407780647278, "learning_rate": 0.00011820656741547544, "loss": 0.3653, "mean_token_accuracy": 0.9098727852106094, "num_tokens": 172640726.0, "step": 61500 }, { "epoch": 1.495109342006262, "grad_norm": 0.5631619691848755, "learning_rate": 0.00011792880462819585, "loss": 0.4279, "mean_token_accuracy": 0.8971470546722412, "num_tokens": 172906449.0, "step": 61600 }, { "epoch": 1.497536467561467, "grad_norm": 1.2110614776611328, "learning_rate": 0.00011765089884843091, "loss": 0.373, "mean_token_accuracy": 0.9086796522140503, "num_tokens": 173172381.0, "step": 61700 }, { "epoch": 1.499963593116672, "grad_norm": 0.5904493927955627, "learning_rate": 0.00011737285229263832, "loss": 0.3914, "mean_token_accuracy": 0.9038412702083588, "num_tokens": 173456589.0, "step": 61800 }, { "epoch": 1.5023907186718768, "grad_norm": 0.9040527939796448, "learning_rate": 0.0001170946671783987, "loss": 0.393, "mean_token_accuracy": 0.902976735830307, "num_tokens": 173729202.0, "step": 61900 }, { "epoch": 1.5048178442270819, "grad_norm": 0.5546716451644897, "learning_rate": 0.00011681634572439761, "loss": 0.4038, "mean_token_accuracy": 0.901577382683754, "num_tokens": 174014930.0, "step": 62000 }, { "epoch": 1.507244969782287, "grad_norm": 0.8447543978691101, "learning_rate": 0.00011653789015040805, "loss": 0.3878, "mean_token_accuracy": 0.9046182173490525, "num_tokens": 174290764.0, "step": 62100 }, { "epoch": 1.5096720953374918, "grad_norm": 0.8927494883537292, "learning_rate": 0.00011625930267727276, "loss": 0.4146, "mean_token_accuracy": 0.8986788499355316, "num_tokens": 174582875.0, "step": 62200 }, { "epoch": 1.5120992208926967, "grad_norm": 0.9438863396644592, "learning_rate": 0.00011598058552688633, "loss": 0.3777, "mean_token_accuracy": 0.9074046242237092, "num_tokens": 174867623.0, "step": 62300 }, { "epoch": 1.5145263464479017, "grad_norm": 1.016975998878479, "learning_rate": 0.00011570174092217769, "loss": 0.4071, "mean_token_accuracy": 0.9008045589923859, "num_tokens": 175157062.0, "step": 62400 }, { "epoch": 1.5169534720031068, "grad_norm": 0.7987212538719177, "learning_rate": 0.00011542277108709226, "loss": 0.3882, "mean_token_accuracy": 0.904829193353653, "num_tokens": 175436247.0, "step": 62500 }, { "epoch": 1.5193805975583117, "grad_norm": 0.8533787727355957, "learning_rate": 0.00011514367824657427, "loss": 0.392, "mean_token_accuracy": 0.9054911434650421, "num_tokens": 175720578.0, "step": 62600 }, { "epoch": 1.5218077231135165, "grad_norm": 0.760500967502594, "learning_rate": 0.00011486446462654892, "loss": 0.4002, "mean_token_accuracy": 0.9018949067592621, "num_tokens": 175995977.0, "step": 62700 }, { "epoch": 1.5242348486687216, "grad_norm": 0.8798636794090271, "learning_rate": 0.00011458513245390483, "loss": 0.3706, "mean_token_accuracy": 0.9091184973716736, "num_tokens": 176277514.0, "step": 62800 }, { "epoch": 1.5266619742239267, "grad_norm": 0.5968068838119507, "learning_rate": 0.00011430568395647598, "loss": 0.3607, "mean_token_accuracy": 0.9119984036684037, "num_tokens": 176569080.0, "step": 62900 }, { "epoch": 1.5290890997791315, "grad_norm": 0.653016209602356, "learning_rate": 0.00011402612136302421, "loss": 0.4304, "mean_token_accuracy": 0.8962489527463913, "num_tokens": 176836863.0, "step": 63000 }, { "epoch": 1.5315162253343364, "grad_norm": 0.4983295798301697, "learning_rate": 0.00011374644690322136, "loss": 0.3713, "mean_token_accuracy": 0.9084924668073654, "num_tokens": 177119720.0, "step": 63100 }, { "epoch": 1.5339433508895415, "grad_norm": 0.743560254573822, "learning_rate": 0.00011346666280763134, "loss": 0.3405, "mean_token_accuracy": 0.9152973371744156, "num_tokens": 177396225.0, "step": 63200 }, { "epoch": 1.5363704764447466, "grad_norm": 0.5968629121780396, "learning_rate": 0.00011318677130769261, "loss": 0.3543, "mean_token_accuracy": 0.9123020827770233, "num_tokens": 177670691.0, "step": 63300 }, { "epoch": 1.5387976019999514, "grad_norm": 0.7266377210617065, "learning_rate": 0.00011290677463570024, "loss": 0.4014, "mean_token_accuracy": 0.902683122754097, "num_tokens": 177944237.0, "step": 63400 }, { "epoch": 1.5412247275551563, "grad_norm": 0.6020427346229553, "learning_rate": 0.00011262667502478793, "loss": 0.3823, "mean_token_accuracy": 0.9062705504894256, "num_tokens": 178226347.0, "step": 63500 }, { "epoch": 1.5436518531103613, "grad_norm": 0.7702438831329346, "learning_rate": 0.00011234647470891059, "loss": 0.3765, "mean_token_accuracy": 0.907245215177536, "num_tokens": 178510805.0, "step": 63600 }, { "epoch": 1.5460789786655664, "grad_norm": 0.9679001569747925, "learning_rate": 0.00011206617592282623, "loss": 0.3457, "mean_token_accuracy": 0.9141608655452729, "num_tokens": 178787352.0, "step": 63700 }, { "epoch": 1.5485061042207713, "grad_norm": 0.8241939544677734, "learning_rate": 0.0001117857809020781, "loss": 0.3556, "mean_token_accuracy": 0.9122043627500535, "num_tokens": 179050596.0, "step": 63800 }, { "epoch": 1.5509332297759764, "grad_norm": 0.696847677230835, "learning_rate": 0.00011150529188297716, "loss": 0.4277, "mean_token_accuracy": 0.8969793927669525, "num_tokens": 179332713.0, "step": 63900 }, { "epoch": 1.5533603553311814, "grad_norm": 0.6272198557853699, "learning_rate": 0.00011122471110258401, "loss": 0.3932, "mean_token_accuracy": 0.9046736443042755, "num_tokens": 179613755.0, "step": 64000 }, { "epoch": 1.5557874808863863, "grad_norm": 0.7845979332923889, "learning_rate": 0.000110944040798691, "loss": 0.3948, "mean_token_accuracy": 0.9039056050777435, "num_tokens": 179884755.0, "step": 64100 }, { "epoch": 1.5582146064415912, "grad_norm": 1.0909096002578735, "learning_rate": 0.00011066328320980464, "loss": 0.3662, "mean_token_accuracy": 0.9101364880800247, "num_tokens": 180167093.0, "step": 64200 }, { "epoch": 1.5606417319967962, "grad_norm": 0.6839550137519836, "learning_rate": 0.00011038244057512743, "loss": 0.4025, "mean_token_accuracy": 0.9013819855451584, "num_tokens": 180442997.0, "step": 64300 }, { "epoch": 1.5630688575520013, "grad_norm": 0.6133870482444763, "learning_rate": 0.00011010151513454023, "loss": 0.328, "mean_token_accuracy": 0.918245889544487, "num_tokens": 180733470.0, "step": 64400 }, { "epoch": 1.5654959831072062, "grad_norm": 0.576290488243103, "learning_rate": 0.00010982050912858435, "loss": 0.3754, "mean_token_accuracy": 0.9084422850608825, "num_tokens": 181004342.0, "step": 64500 }, { "epoch": 1.567923108662411, "grad_norm": 1.109480857849121, "learning_rate": 0.00010953942479844364, "loss": 0.3778, "mean_token_accuracy": 0.907360492348671, "num_tokens": 181280978.0, "step": 64600 }, { "epoch": 1.570350234217616, "grad_norm": 0.7722584009170532, "learning_rate": 0.00010925826438592657, "loss": 0.3491, "mean_token_accuracy": 0.9142099022865295, "num_tokens": 181559969.0, "step": 64700 }, { "epoch": 1.5727773597728212, "grad_norm": 0.6564502120018005, "learning_rate": 0.00010897703013344854, "loss": 0.3711, "mean_token_accuracy": 0.9086374562978744, "num_tokens": 181841280.0, "step": 64800 }, { "epoch": 1.575204485328026, "grad_norm": 0.5784518122673035, "learning_rate": 0.00010869572428401371, "loss": 0.3975, "mean_token_accuracy": 0.9038605362176895, "num_tokens": 182121713.0, "step": 64900 }, { "epoch": 1.577631610883231, "grad_norm": 0.7998061180114746, "learning_rate": 0.00010841434908119736, "loss": 0.3398, "mean_token_accuracy": 0.9166411167383194, "num_tokens": 182413653.0, "step": 65000 }, { "epoch": 1.580058736438436, "grad_norm": 0.6847976446151733, "learning_rate": 0.00010813290676912792, "loss": 0.3515, "mean_token_accuracy": 0.9136654776334763, "num_tokens": 182689793.0, "step": 65100 }, { "epoch": 1.582485861993641, "grad_norm": 0.5379784107208252, "learning_rate": 0.000107851399592469, "loss": 0.3889, "mean_token_accuracy": 0.905264812707901, "num_tokens": 182969672.0, "step": 65200 }, { "epoch": 1.584912987548846, "grad_norm": 0.9807937145233154, "learning_rate": 0.00010756982979640154, "loss": 0.3658, "mean_token_accuracy": 0.9102151840925217, "num_tokens": 183245797.0, "step": 65300 }, { "epoch": 1.5873401131040508, "grad_norm": 0.9323781132698059, "learning_rate": 0.00010728819962660598, "loss": 0.375, "mean_token_accuracy": 0.9090581947565078, "num_tokens": 183522770.0, "step": 65400 }, { "epoch": 1.5897672386592558, "grad_norm": 0.7109302282333374, "learning_rate": 0.00010700651132924418, "loss": 0.3142, "mean_token_accuracy": 0.9218410032987595, "num_tokens": 183797170.0, "step": 65500 }, { "epoch": 1.592194364214461, "grad_norm": 0.7105025053024292, "learning_rate": 0.00010672476715094167, "loss": 0.3766, "mean_token_accuracy": 0.9082463669776917, "num_tokens": 184074307.0, "step": 65600 }, { "epoch": 1.5946214897696658, "grad_norm": 0.813896119594574, "learning_rate": 0.0001064429693387696, "loss": 0.3739, "mean_token_accuracy": 0.909969937801361, "num_tokens": 184370954.0, "step": 65700 }, { "epoch": 1.5970486153248706, "grad_norm": 0.8465865850448608, "learning_rate": 0.00010616112014022694, "loss": 0.353, "mean_token_accuracy": 0.9132819199562072, "num_tokens": 184655076.0, "step": 65800 }, { "epoch": 1.5994757408800757, "grad_norm": 0.66167151927948, "learning_rate": 0.0001058792218032225, "loss": 0.3819, "mean_token_accuracy": 0.9072164571285248, "num_tokens": 184934233.0, "step": 65900 }, { "epoch": 1.6019028664352808, "grad_norm": 0.49021396040916443, "learning_rate": 0.00010559727657605694, "loss": 0.3625, "mean_token_accuracy": 0.9116124641895295, "num_tokens": 185211848.0, "step": 66000 }, { "epoch": 1.6043299919904856, "grad_norm": 0.6963539719581604, "learning_rate": 0.00010531528670740492, "loss": 0.3535, "mean_token_accuracy": 0.9135640490055085, "num_tokens": 185484414.0, "step": 66100 }, { "epoch": 1.6067571175456905, "grad_norm": 0.7452908754348755, "learning_rate": 0.00010503325444629717, "loss": 0.3922, "mean_token_accuracy": 0.9036144858598709, "num_tokens": 185773330.0, "step": 66200 }, { "epoch": 1.6091842431008956, "grad_norm": 0.7494684457778931, "learning_rate": 0.0001047511820421025, "loss": 0.3397, "mean_token_accuracy": 0.9164110898971558, "num_tokens": 186065496.0, "step": 66300 }, { "epoch": 1.6116113686561007, "grad_norm": 0.5952861309051514, "learning_rate": 0.00010446907174450983, "loss": 0.4031, "mean_token_accuracy": 0.9032100856304168, "num_tokens": 186349033.0, "step": 66400 }, { "epoch": 1.6140384942113055, "grad_norm": 1.3102141618728638, "learning_rate": 0.00010418692580351046, "loss": 0.3342, "mean_token_accuracy": 0.9185534894466401, "num_tokens": 186650789.0, "step": 66500 }, { "epoch": 1.6164656197665104, "grad_norm": 0.7236355543136597, "learning_rate": 0.0001039047464693798, "loss": 0.3566, "mean_token_accuracy": 0.9125622332096099, "num_tokens": 186923321.0, "step": 66600 }, { "epoch": 1.6188927453217155, "grad_norm": 0.8594217896461487, "learning_rate": 0.00010362253599265963, "loss": 0.3639, "mean_token_accuracy": 0.9123637741804123, "num_tokens": 187201566.0, "step": 66700 }, { "epoch": 1.6213198708769205, "grad_norm": 0.710017204284668, "learning_rate": 0.0001033402966241402, "loss": 0.4091, "mean_token_accuracy": 0.9025905507802964, "num_tokens": 187485117.0, "step": 66800 }, { "epoch": 1.6237469964321254, "grad_norm": 0.6511447429656982, "learning_rate": 0.0001030580306148421, "loss": 0.3392, "mean_token_accuracy": 0.9179734051227569, "num_tokens": 187778358.0, "step": 66900 }, { "epoch": 1.6261741219873302, "grad_norm": 0.656478762626648, "learning_rate": 0.00010277574021599836, "loss": 0.3134, "mean_token_accuracy": 0.9212139695882797, "num_tokens": 188044395.0, "step": 67000 }, { "epoch": 1.6286012475425353, "grad_norm": 0.5202460289001465, "learning_rate": 0.00010249342767903667, "loss": 0.3384, "mean_token_accuracy": 0.9168726438283921, "num_tokens": 188316468.0, "step": 67100 }, { "epoch": 1.6310283730977404, "grad_norm": 0.4842274487018585, "learning_rate": 0.00010221109525556116, "loss": 0.3457, "mean_token_accuracy": 0.9164014345407486, "num_tokens": 188581889.0, "step": 67200 }, { "epoch": 1.6334554986529453, "grad_norm": 0.6255847215652466, "learning_rate": 0.0001019287451973346, "loss": 0.3154, "mean_token_accuracy": 0.9217046529054642, "num_tokens": 188859717.0, "step": 67300 }, { "epoch": 1.6358826242081503, "grad_norm": 0.6042391657829285, "learning_rate": 0.00010164637975626045, "loss": 0.3318, "mean_token_accuracy": 0.9182331216335297, "num_tokens": 189135523.0, "step": 67400 }, { "epoch": 1.6383097497633554, "grad_norm": 0.6733984351158142, "learning_rate": 0.0001013640011843648, "loss": 0.3288, "mean_token_accuracy": 0.9198346930742264, "num_tokens": 189423670.0, "step": 67500 }, { "epoch": 1.6407368753185603, "grad_norm": 0.7567030787467957, "learning_rate": 0.0001010816117337785, "loss": 0.3475, "mean_token_accuracy": 0.9150215810537339, "num_tokens": 189701230.0, "step": 67600 }, { "epoch": 1.6431640008737651, "grad_norm": 0.6925593614578247, "learning_rate": 0.00010079921365671914, "loss": 0.3163, "mean_token_accuracy": 0.9217036014795303, "num_tokens": 189977907.0, "step": 67700 }, { "epoch": 1.6455911264289702, "grad_norm": 0.554702639579773, "learning_rate": 0.00010051680920547313, "loss": 0.3277, "mean_token_accuracy": 0.9187106388807297, "num_tokens": 190256876.0, "step": 67800 }, { "epoch": 1.6480182519841753, "grad_norm": 0.9477412700653076, "learning_rate": 0.0001002344006323777, "loss": 0.3244, "mean_token_accuracy": 0.9190926957130432, "num_tokens": 190538466.0, "step": 67900 }, { "epoch": 1.6504453775393801, "grad_norm": 0.7824245691299438, "learning_rate": 9.9951990189803e-05, "loss": 0.3529, "mean_token_accuracy": 0.9144074618816376, "num_tokens": 190823841.0, "step": 68000 }, { "epoch": 1.652872503094585, "grad_norm": 0.6957218050956726, "learning_rate": 9.9669580130134e-05, "loss": 0.3402, "mean_token_accuracy": 0.9166605508327484, "num_tokens": 191086712.0, "step": 68100 }, { "epoch": 1.65529962864979, "grad_norm": 0.4525381624698639, "learning_rate": 9.938717270575272e-05, "loss": 0.338, "mean_token_accuracy": 0.9177921932935714, "num_tokens": 191373552.0, "step": 68200 }, { "epoch": 1.6577267542049952, "grad_norm": 0.4912790358066559, "learning_rate": 9.910477016902016e-05, "loss": 0.3639, "mean_token_accuracy": 0.9108395028114319, "num_tokens": 191648507.0, "step": 68300 }, { "epoch": 1.6601538797602, "grad_norm": 0.6194826364517212, "learning_rate": 9.882237477225816e-05, "loss": 0.3618, "mean_token_accuracy": 0.9116045045852661, "num_tokens": 191928076.0, "step": 68400 }, { "epoch": 1.6625810053154049, "grad_norm": 0.6417632102966309, "learning_rate": 9.853998876773183e-05, "loss": 0.34, "mean_token_accuracy": 0.9173044127225876, "num_tokens": 192200434.0, "step": 68500 }, { "epoch": 1.66500813087061, "grad_norm": 0.7448539733886719, "learning_rate": 9.825761440763129e-05, "loss": 0.3352, "mean_token_accuracy": 0.9177643340826035, "num_tokens": 192480776.0, "step": 68600 }, { "epoch": 1.667435256425815, "grad_norm": 0.6451402306556702, "learning_rate": 9.797525394405376e-05, "loss": 0.3339, "mean_token_accuracy": 0.9174343067407608, "num_tokens": 192754899.0, "step": 68700 }, { "epoch": 1.6698623819810199, "grad_norm": 0.982666552066803, "learning_rate": 9.769290962898566e-05, "loss": 0.3221, "mean_token_accuracy": 0.9207504653930664, "num_tokens": 193041219.0, "step": 68800 }, { "epoch": 1.6722895075362247, "grad_norm": 0.5121039748191833, "learning_rate": 9.741058371428468e-05, "loss": 0.3547, "mean_token_accuracy": 0.9136625772714615, "num_tokens": 193314947.0, "step": 68900 }, { "epoch": 1.6747166330914298, "grad_norm": 0.702271580696106, "learning_rate": 9.712827845166157e-05, "loss": 0.3137, "mean_token_accuracy": 0.9225055646896362, "num_tokens": 193591255.0, "step": 69000 }, { "epoch": 1.677143758646635, "grad_norm": 0.685634970664978, "learning_rate": 9.684599609266253e-05, "loss": 0.3345, "mean_token_accuracy": 0.9185575705766678, "num_tokens": 193860731.0, "step": 69100 }, { "epoch": 1.6795708842018398, "grad_norm": 0.7711082100868225, "learning_rate": 9.65637388886511e-05, "loss": 0.3724, "mean_token_accuracy": 0.9091116571426392, "num_tokens": 194128841.0, "step": 69200 }, { "epoch": 1.6819980097570446, "grad_norm": 1.1533504724502563, "learning_rate": 9.628150909079007e-05, "loss": 0.3446, "mean_token_accuracy": 0.9169007408618927, "num_tokens": 194407308.0, "step": 69300 }, { "epoch": 1.6844251353122497, "grad_norm": 0.5997574925422668, "learning_rate": 9.599930895002375e-05, "loss": 0.3207, "mean_token_accuracy": 0.9212809431552887, "num_tokens": 194691502.0, "step": 69400 }, { "epoch": 1.6868522608674548, "grad_norm": 0.4966389834880829, "learning_rate": 9.571714071705993e-05, "loss": 0.3017, "mean_token_accuracy": 0.92467877805233, "num_tokens": 194986931.0, "step": 69500 }, { "epoch": 1.6892793864226596, "grad_norm": 1.2698211669921875, "learning_rate": 9.54350066423518e-05, "loss": 0.3154, "mean_token_accuracy": 0.9225751388072968, "num_tokens": 195268236.0, "step": 69600 }, { "epoch": 1.6917065119778645, "grad_norm": 0.9074554443359375, "learning_rate": 9.515290897608021e-05, "loss": 0.3801, "mean_token_accuracy": 0.9084242284297943, "num_tokens": 195545051.0, "step": 69700 }, { "epoch": 1.6941336375330696, "grad_norm": 0.6302804946899414, "learning_rate": 9.487084996813566e-05, "loss": 0.3175, "mean_token_accuracy": 0.9217024165391922, "num_tokens": 195822504.0, "step": 69800 }, { "epoch": 1.6965607630882746, "grad_norm": 0.829451322555542, "learning_rate": 9.458883186810024e-05, "loss": 0.3278, "mean_token_accuracy": 0.9191747254133225, "num_tokens": 196095524.0, "step": 69900 }, { "epoch": 1.6989878886434795, "grad_norm": 0.7346980571746826, "learning_rate": 9.430685692522983e-05, "loss": 0.3116, "mean_token_accuracy": 0.9237532925605774, "num_tokens": 196371157.0, "step": 70000 }, { "epoch": 1.7014150141986844, "grad_norm": 0.5194724798202515, "learning_rate": 9.402492738843615e-05, "loss": 0.3158, "mean_token_accuracy": 0.9228990191221237, "num_tokens": 196661666.0, "step": 70100 }, { "epoch": 1.7038421397538894, "grad_norm": 0.8965851068496704, "learning_rate": 9.374304550626863e-05, "loss": 0.3175, "mean_token_accuracy": 0.9222236788272857, "num_tokens": 196930313.0, "step": 70200 }, { "epoch": 1.7062692653090945, "grad_norm": 0.7471325397491455, "learning_rate": 9.346121352689677e-05, "loss": 0.3038, "mean_token_accuracy": 0.9257321381568908, "num_tokens": 197226205.0, "step": 70300 }, { "epoch": 1.7086963908642994, "grad_norm": 1.0250093936920166, "learning_rate": 9.317943369809208e-05, "loss": 0.3302, "mean_token_accuracy": 0.920040818452835, "num_tokens": 197506788.0, "step": 70400 }, { "epoch": 1.7111235164195042, "grad_norm": 0.6335323452949524, "learning_rate": 9.289770826720997e-05, "loss": 0.3482, "mean_token_accuracy": 0.9151440984010697, "num_tokens": 197793320.0, "step": 70500 }, { "epoch": 1.7135506419747093, "grad_norm": 0.745668888092041, "learning_rate": 9.261603948117215e-05, "loss": 0.3118, "mean_token_accuracy": 0.922760015130043, "num_tokens": 198080716.0, "step": 70600 }, { "epoch": 1.7159777675299144, "grad_norm": 0.8320471048355103, "learning_rate": 9.233442958644854e-05, "loss": 0.3121, "mean_token_accuracy": 0.9230057382583619, "num_tokens": 198366181.0, "step": 70700 }, { "epoch": 1.7184048930851192, "grad_norm": 0.48289257287979126, "learning_rate": 9.20528808290393e-05, "loss": 0.2983, "mean_token_accuracy": 0.9262391251325607, "num_tokens": 198633088.0, "step": 70800 }, { "epoch": 1.7208320186403243, "grad_norm": 0.4999804198741913, "learning_rate": 9.177139545445705e-05, "loss": 0.3451, "mean_token_accuracy": 0.9155993515253067, "num_tokens": 198914414.0, "step": 70900 }, { "epoch": 1.7232591441955294, "grad_norm": 0.5464954376220703, "learning_rate": 9.148997570770889e-05, "loss": 0.2955, "mean_token_accuracy": 0.9270257157087326, "num_tokens": 199205965.0, "step": 71000 }, { "epoch": 1.7256862697507342, "grad_norm": 0.7479568719863892, "learning_rate": 9.12086238332784e-05, "loss": 0.3156, "mean_token_accuracy": 0.921921678185463, "num_tokens": 199478121.0, "step": 71100 }, { "epoch": 1.728113395305939, "grad_norm": 0.5852890014648438, "learning_rate": 9.092734207510796e-05, "loss": 0.3586, "mean_token_accuracy": 0.9138590520620347, "num_tokens": 199758362.0, "step": 71200 }, { "epoch": 1.7305405208611442, "grad_norm": 0.5110465884208679, "learning_rate": 9.064613267658073e-05, "loss": 0.3286, "mean_token_accuracy": 0.9191134518384934, "num_tokens": 200054741.0, "step": 71300 }, { "epoch": 1.7329676464163493, "grad_norm": 0.5434827208518982, "learning_rate": 9.036499788050265e-05, "loss": 0.313, "mean_token_accuracy": 0.9231429475545884, "num_tokens": 200335558.0, "step": 71400 }, { "epoch": 1.7353947719715541, "grad_norm": 0.5807352066040039, "learning_rate": 9.00839399290848e-05, "loss": 0.281, "mean_token_accuracy": 0.9307907569408417, "num_tokens": 200604065.0, "step": 71500 }, { "epoch": 1.737821897526759, "grad_norm": 0.5315217971801758, "learning_rate": 8.98029610639253e-05, "loss": 0.3414, "mean_token_accuracy": 0.9182829517126083, "num_tokens": 200879049.0, "step": 71600 }, { "epoch": 1.740249023081964, "grad_norm": 0.7881130576133728, "learning_rate": 8.952206352599149e-05, "loss": 0.3491, "mean_token_accuracy": 0.9159856230020523, "num_tokens": 201165260.0, "step": 71700 }, { "epoch": 1.7426761486371691, "grad_norm": 0.6103109121322632, "learning_rate": 8.924124955560213e-05, "loss": 0.3239, "mean_token_accuracy": 0.9207637649774552, "num_tokens": 201429653.0, "step": 71800 }, { "epoch": 1.745103274192374, "grad_norm": 0.5265923142433167, "learning_rate": 8.896052139240949e-05, "loss": 0.3101, "mean_token_accuracy": 0.9246016055345535, "num_tokens": 201699010.0, "step": 71900 }, { "epoch": 1.7475303997475788, "grad_norm": 0.864488959312439, "learning_rate": 8.867988127538143e-05, "loss": 0.3371, "mean_token_accuracy": 0.9180309230089188, "num_tokens": 201989268.0, "step": 72000 }, { "epoch": 1.749957525302784, "grad_norm": 0.7725526094436646, "learning_rate": 8.839933144278361e-05, "loss": 0.3159, "mean_token_accuracy": 0.9218523550033569, "num_tokens": 202268722.0, "step": 72100 }, { "epoch": 1.752384650857989, "grad_norm": 0.7077100872993469, "learning_rate": 8.811887413216165e-05, "loss": 0.3141, "mean_token_accuracy": 0.9246137911081314, "num_tokens": 202566420.0, "step": 72200 }, { "epoch": 1.7548117764131939, "grad_norm": 0.9547737240791321, "learning_rate": 8.783851158032325e-05, "loss": 0.3201, "mean_token_accuracy": 0.9220850229263305, "num_tokens": 202850518.0, "step": 72300 }, { "epoch": 1.7572389019683987, "grad_norm": 0.5615896582603455, "learning_rate": 8.755824602332025e-05, "loss": 0.3197, "mean_token_accuracy": 0.9238210040330886, "num_tokens": 203155202.0, "step": 72400 }, { "epoch": 1.7596660275236038, "grad_norm": 0.7283564805984497, "learning_rate": 8.727807969643105e-05, "loss": 0.2929, "mean_token_accuracy": 0.9278195726871491, "num_tokens": 203426340.0, "step": 72500 }, { "epoch": 1.7620931530788089, "grad_norm": 0.7179740071296692, "learning_rate": 8.699801483414258e-05, "loss": 0.297, "mean_token_accuracy": 0.9268648004531861, "num_tokens": 203706287.0, "step": 72600 }, { "epoch": 1.7645202786340137, "grad_norm": 0.4863494634628296, "learning_rate": 8.671805367013247e-05, "loss": 0.3066, "mean_token_accuracy": 0.925693176984787, "num_tokens": 203990745.0, "step": 72700 }, { "epoch": 1.7669474041892186, "grad_norm": 0.6124697327613831, "learning_rate": 8.643819843725137e-05, "loss": 0.3197, "mean_token_accuracy": 0.9222535014152526, "num_tokens": 204263513.0, "step": 72800 }, { "epoch": 1.7693745297444237, "grad_norm": 0.4539005756378174, "learning_rate": 8.615845136750508e-05, "loss": 0.2946, "mean_token_accuracy": 0.9275599610805512, "num_tokens": 204525364.0, "step": 72900 }, { "epoch": 1.7718016552996287, "grad_norm": 0.6153349876403809, "learning_rate": 8.587881469203664e-05, "loss": 0.3038, "mean_token_accuracy": 0.9263102424144745, "num_tokens": 204810853.0, "step": 73000 }, { "epoch": 1.7742287808548336, "grad_norm": 0.801893949508667, "learning_rate": 8.559929064110873e-05, "loss": 0.3046, "mean_token_accuracy": 0.9255858755111694, "num_tokens": 205103416.0, "step": 73100 }, { "epoch": 1.7766559064100385, "grad_norm": 0.9156849384307861, "learning_rate": 8.531988144408576e-05, "loss": 0.2973, "mean_token_accuracy": 0.9272646516561508, "num_tokens": 205382911.0, "step": 73200 }, { "epoch": 1.7790830319652435, "grad_norm": 0.5409168004989624, "learning_rate": 8.504058932941605e-05, "loss": 0.3031, "mean_token_accuracy": 0.9254523950815201, "num_tokens": 205646639.0, "step": 73300 }, { "epoch": 1.7815101575204486, "grad_norm": 0.620744526386261, "learning_rate": 8.476141652461421e-05, "loss": 0.3098, "mean_token_accuracy": 0.9240190213918686, "num_tokens": 205915824.0, "step": 73400 }, { "epoch": 1.7839372830756535, "grad_norm": 0.5530264377593994, "learning_rate": 8.448236525624331e-05, "loss": 0.3359, "mean_token_accuracy": 0.9183445227146149, "num_tokens": 206207345.0, "step": 73500 }, { "epoch": 1.7863644086308583, "grad_norm": 0.4213729500770569, "learning_rate": 8.420343774989691e-05, "loss": 0.3388, "mean_token_accuracy": 0.9183846575021744, "num_tokens": 206493187.0, "step": 73600 }, { "epoch": 1.7887915341860634, "grad_norm": 0.47710779309272766, "learning_rate": 8.392463623018172e-05, "loss": 0.3611, "mean_token_accuracy": 0.9139343810081482, "num_tokens": 206777359.0, "step": 73700 }, { "epoch": 1.7912186597412685, "grad_norm": 0.9268105626106262, "learning_rate": 8.364596292069953e-05, "loss": 0.3057, "mean_token_accuracy": 0.9253049170970917, "num_tokens": 207060316.0, "step": 73800 }, { "epoch": 1.7936457852964733, "grad_norm": 0.6438888907432556, "learning_rate": 8.336742004402954e-05, "loss": 0.2599, "mean_token_accuracy": 0.9355102443695068, "num_tokens": 207333293.0, "step": 73900 }, { "epoch": 1.7960729108516782, "grad_norm": 0.4977918565273285, "learning_rate": 8.308900982171075e-05, "loss": 0.2811, "mean_token_accuracy": 0.9305684345960618, "num_tokens": 207603072.0, "step": 74000 }, { "epoch": 1.7985000364068833, "grad_norm": 0.4704795479774475, "learning_rate": 8.28107344742242e-05, "loss": 0.2805, "mean_token_accuracy": 0.9318266582489013, "num_tokens": 207883751.0, "step": 74100 }, { "epoch": 1.8009271619620884, "grad_norm": 0.6442781686782837, "learning_rate": 8.253259622097505e-05, "loss": 0.3198, "mean_token_accuracy": 0.9221637785434723, "num_tokens": 208156078.0, "step": 74200 }, { "epoch": 1.8033542875172932, "grad_norm": 0.6045205593109131, "learning_rate": 8.225459728027522e-05, "loss": 0.3355, "mean_token_accuracy": 0.9196035987138749, "num_tokens": 208438407.0, "step": 74300 }, { "epoch": 1.8057814130724983, "grad_norm": 1.1112462282180786, "learning_rate": 8.197673986932553e-05, "loss": 0.3074, "mean_token_accuracy": 0.9248217070102691, "num_tokens": 208717797.0, "step": 74400 }, { "epoch": 1.8082085386277034, "grad_norm": 1.077713966369629, "learning_rate": 8.169902620419789e-05, "loss": 0.2507, "mean_token_accuracy": 0.9370506715774536, "num_tokens": 209002743.0, "step": 74500 }, { "epoch": 1.8106356641829082, "grad_norm": 0.744132399559021, "learning_rate": 8.142145849981788e-05, "loss": 0.2938, "mean_token_accuracy": 0.9279677730798721, "num_tokens": 209279733.0, "step": 74600 }, { "epoch": 1.813062789738113, "grad_norm": 0.7917492985725403, "learning_rate": 8.114403896994692e-05, "loss": 0.3001, "mean_token_accuracy": 0.9269537425041199, "num_tokens": 209569692.0, "step": 74700 }, { "epoch": 1.8154899152933182, "grad_norm": 0.5599649548530579, "learning_rate": 8.086676982716463e-05, "loss": 0.2734, "mean_token_accuracy": 0.9324289971590042, "num_tokens": 209851240.0, "step": 74800 }, { "epoch": 1.8179170408485232, "grad_norm": 0.7313913106918335, "learning_rate": 8.058965328285122e-05, "loss": 0.2901, "mean_token_accuracy": 0.9296905469894409, "num_tokens": 210122572.0, "step": 74900 }, { "epoch": 1.820344166403728, "grad_norm": 0.6392823457717896, "learning_rate": 8.03126915471699e-05, "loss": 0.2558, "mean_token_accuracy": 0.9362455844879151, "num_tokens": 210389408.0, "step": 75000 }, { "epoch": 1.822771291958933, "grad_norm": 0.6192830204963684, "learning_rate": 8.003588682904904e-05, "loss": 0.2875, "mean_token_accuracy": 0.9278698348999024, "num_tokens": 210649018.0, "step": 75100 }, { "epoch": 1.825198417514138, "grad_norm": 0.876402735710144, "learning_rate": 7.975924133616489e-05, "loss": 0.3214, "mean_token_accuracy": 0.9232221174240113, "num_tokens": 210920552.0, "step": 75200 }, { "epoch": 1.827625543069343, "grad_norm": 0.4038306772708893, "learning_rate": 7.948275727492365e-05, "loss": 0.2791, "mean_token_accuracy": 0.9318163025379181, "num_tokens": 211213728.0, "step": 75300 }, { "epoch": 1.830052668624548, "grad_norm": 0.869914174079895, "learning_rate": 7.920643685044407e-05, "loss": 0.2907, "mean_token_accuracy": 0.9293239808082581, "num_tokens": 211497350.0, "step": 75400 }, { "epoch": 1.8324797941797528, "grad_norm": 0.5940338969230652, "learning_rate": 7.893028226653982e-05, "loss": 0.3187, "mean_token_accuracy": 0.9223272824287414, "num_tokens": 211784740.0, "step": 75500 }, { "epoch": 1.834906919734958, "grad_norm": 0.842191755771637, "learning_rate": 7.865429572570187e-05, "loss": 0.2469, "mean_token_accuracy": 0.9377169865369797, "num_tokens": 212067327.0, "step": 75600 }, { "epoch": 1.837334045290163, "grad_norm": 0.5681836605072021, "learning_rate": 7.83784794290809e-05, "loss": 0.2841, "mean_token_accuracy": 0.9311885517835617, "num_tokens": 212340490.0, "step": 75700 }, { "epoch": 1.8397611708453678, "grad_norm": 0.6963329911231995, "learning_rate": 7.810283557646988e-05, "loss": 0.303, "mean_token_accuracy": 0.926787257194519, "num_tokens": 212630815.0, "step": 75800 }, { "epoch": 1.8421882964005727, "grad_norm": 0.8300701379776001, "learning_rate": 7.782736636628641e-05, "loss": 0.3003, "mean_token_accuracy": 0.9274958628416061, "num_tokens": 212900625.0, "step": 75900 }, { "epoch": 1.8446154219557778, "grad_norm": 0.6039229035377502, "learning_rate": 7.75520739955552e-05, "loss": 0.2644, "mean_token_accuracy": 0.9343709653615951, "num_tokens": 213192202.0, "step": 76000 }, { "epoch": 1.8470425475109828, "grad_norm": 0.7922122478485107, "learning_rate": 7.727696065989057e-05, "loss": 0.2747, "mean_token_accuracy": 0.9328130561113358, "num_tokens": 213471876.0, "step": 76100 }, { "epoch": 1.8494696730661877, "grad_norm": 0.9440976977348328, "learning_rate": 7.700202855347895e-05, "loss": 0.2602, "mean_token_accuracy": 0.9356586462259293, "num_tokens": 213748520.0, "step": 76200 }, { "epoch": 1.8518967986213926, "grad_norm": 0.47323721647262573, "learning_rate": 7.67272798690614e-05, "loss": 0.3049, "mean_token_accuracy": 0.9259419536590576, "num_tokens": 214033390.0, "step": 76300 }, { "epoch": 1.8543239241765976, "grad_norm": 0.5896041989326477, "learning_rate": 7.645271679791594e-05, "loss": 0.2809, "mean_token_accuracy": 0.931108033657074, "num_tokens": 214315261.0, "step": 76400 }, { "epoch": 1.8567510497318027, "grad_norm": 0.6122065186500549, "learning_rate": 7.617834152984035e-05, "loss": 0.265, "mean_token_accuracy": 0.9348021644353867, "num_tokens": 214612081.0, "step": 76500 }, { "epoch": 1.8591781752870076, "grad_norm": 0.4658614993095398, "learning_rate": 7.590415625313459e-05, "loss": 0.2978, "mean_token_accuracy": 0.9276400768756866, "num_tokens": 214894960.0, "step": 76600 }, { "epoch": 1.8616053008422124, "grad_norm": 1.103578805923462, "learning_rate": 7.56301631545832e-05, "loss": 0.3092, "mean_token_accuracy": 0.9259757554531097, "num_tokens": 215168979.0, "step": 76700 }, { "epoch": 1.8640324263974175, "grad_norm": 0.6536977291107178, "learning_rate": 7.53563644194381e-05, "loss": 0.2677, "mean_token_accuracy": 0.9338239932060242, "num_tokens": 215444157.0, "step": 76800 }, { "epoch": 1.8664595519526226, "grad_norm": 0.6290289759635925, "learning_rate": 7.508276223140105e-05, "loss": 0.2939, "mean_token_accuracy": 0.9283382612466812, "num_tokens": 215724557.0, "step": 76900 }, { "epoch": 1.8688866775078274, "grad_norm": 0.5545627474784851, "learning_rate": 7.480935877260616e-05, "loss": 0.3151, "mean_token_accuracy": 0.9245033538341523, "num_tokens": 216001207.0, "step": 77000 }, { "epoch": 1.8713138030630323, "grad_norm": 0.5959055423736572, "learning_rate": 7.45361562236026e-05, "loss": 0.2919, "mean_token_accuracy": 0.9289400810003281, "num_tokens": 216282783.0, "step": 77100 }, { "epoch": 1.8737409286182374, "grad_norm": 0.6361558437347412, "learning_rate": 7.426315676333719e-05, "loss": 0.2703, "mean_token_accuracy": 0.933252164721489, "num_tokens": 216570897.0, "step": 77200 }, { "epoch": 1.8761680541734425, "grad_norm": 0.48569005727767944, "learning_rate": 7.399036256913696e-05, "loss": 0.2841, "mean_token_accuracy": 0.9301125872135162, "num_tokens": 216850770.0, "step": 77300 }, { "epoch": 1.8785951797286473, "grad_norm": 0.7510759830474854, "learning_rate": 7.371777581669185e-05, "loss": 0.2619, "mean_token_accuracy": 0.9353511029481888, "num_tokens": 217138106.0, "step": 77400 }, { "epoch": 1.8810223052838522, "grad_norm": 0.5605314373970032, "learning_rate": 7.34453986800374e-05, "loss": 0.2599, "mean_token_accuracy": 0.9363309639692307, "num_tokens": 217410591.0, "step": 77500 }, { "epoch": 1.8834494308390575, "grad_norm": 0.4645470082759857, "learning_rate": 7.317323333153717e-05, "loss": 0.299, "mean_token_accuracy": 0.9282932883501053, "num_tokens": 217682755.0, "step": 77600 }, { "epoch": 1.8858765563942623, "grad_norm": 0.41098716855049133, "learning_rate": 7.290128194186574e-05, "loss": 0.2657, "mean_token_accuracy": 0.9342436867952347, "num_tokens": 217955304.0, "step": 77700 }, { "epoch": 1.8883036819494672, "grad_norm": 0.7157009243965149, "learning_rate": 7.262954667999125e-05, "loss": 0.2413, "mean_token_accuracy": 0.9404269510507584, "num_tokens": 218229072.0, "step": 77800 }, { "epoch": 1.8907308075046723, "grad_norm": 0.5151275396347046, "learning_rate": 7.235802971315793e-05, "loss": 0.2916, "mean_token_accuracy": 0.9299118751287461, "num_tokens": 218512265.0, "step": 77900 }, { "epoch": 1.8931579330598773, "grad_norm": 0.6814290285110474, "learning_rate": 7.208673320686916e-05, "loss": 0.2557, "mean_token_accuracy": 0.9371042233705521, "num_tokens": 218815553.0, "step": 78000 }, { "epoch": 1.8955850586150822, "grad_norm": 0.4364314377307892, "learning_rate": 7.181565932487e-05, "loss": 0.2683, "mean_token_accuracy": 0.9351461017131806, "num_tokens": 219109518.0, "step": 78100 }, { "epoch": 1.898012184170287, "grad_norm": 0.9119032621383667, "learning_rate": 7.154481022912985e-05, "loss": 0.2591, "mean_token_accuracy": 0.9369248080253602, "num_tokens": 219400520.0, "step": 78200 }, { "epoch": 1.9004393097254921, "grad_norm": 0.5391600131988525, "learning_rate": 7.127418807982538e-05, "loss": 0.2814, "mean_token_accuracy": 0.9312341713905334, "num_tokens": 219686579.0, "step": 78300 }, { "epoch": 1.9028664352806972, "grad_norm": 0.9579749703407288, "learning_rate": 7.10037950353233e-05, "loss": 0.2771, "mean_token_accuracy": 0.9320234954357147, "num_tokens": 219972043.0, "step": 78400 }, { "epoch": 1.905293560835902, "grad_norm": 0.6177464127540588, "learning_rate": 7.07336332521629e-05, "loss": 0.2881, "mean_token_accuracy": 0.9298322373628616, "num_tokens": 220247450.0, "step": 78500 }, { "epoch": 1.907720686391107, "grad_norm": 0.4801132380962372, "learning_rate": 7.046370488503915e-05, "loss": 0.256, "mean_token_accuracy": 0.9365988206863404, "num_tokens": 220516332.0, "step": 78600 }, { "epoch": 1.910147811946312, "grad_norm": 0.7056084871292114, "learning_rate": 7.019401208678544e-05, "loss": 0.2763, "mean_token_accuracy": 0.9320674824714661, "num_tokens": 220801836.0, "step": 78700 }, { "epoch": 1.912574937501517, "grad_norm": 0.5265929698944092, "learning_rate": 6.992455700835624e-05, "loss": 0.3129, "mean_token_accuracy": 0.9254920214414597, "num_tokens": 221082541.0, "step": 78800 }, { "epoch": 1.915002063056722, "grad_norm": 0.6001086831092834, "learning_rate": 6.965534179881014e-05, "loss": 0.2677, "mean_token_accuracy": 0.9343756914138794, "num_tokens": 221363824.0, "step": 78900 }, { "epoch": 1.9174291886119268, "grad_norm": 0.5919141173362732, "learning_rate": 6.938636860529268e-05, "loss": 0.2368, "mean_token_accuracy": 0.940570114850998, "num_tokens": 221642721.0, "step": 79000 }, { "epoch": 1.9198563141671319, "grad_norm": 0.5845506191253662, "learning_rate": 6.911763957301904e-05, "loss": 0.2563, "mean_token_accuracy": 0.9364815109968185, "num_tokens": 221918380.0, "step": 79100 }, { "epoch": 1.922283439722337, "grad_norm": 0.9840309619903564, "learning_rate": 6.884915684525719e-05, "loss": 0.2811, "mean_token_accuracy": 0.9315145230293274, "num_tokens": 222203368.0, "step": 79200 }, { "epoch": 1.9247105652775418, "grad_norm": 0.6356433033943176, "learning_rate": 6.85809225633107e-05, "loss": 0.2738, "mean_token_accuracy": 0.9337194603681565, "num_tokens": 222472383.0, "step": 79300 }, { "epoch": 1.9271376908327467, "grad_norm": 0.5394325256347656, "learning_rate": 6.831293886650154e-05, "loss": 0.2904, "mean_token_accuracy": 0.9293972563743591, "num_tokens": 222746002.0, "step": 79400 }, { "epoch": 1.9295648163879517, "grad_norm": 0.6073657870292664, "learning_rate": 6.804520789215319e-05, "loss": 0.2376, "mean_token_accuracy": 0.9415085107088089, "num_tokens": 223036552.0, "step": 79500 }, { "epoch": 1.9319919419431568, "grad_norm": 0.4766857326030731, "learning_rate": 6.777773177557357e-05, "loss": 0.2365, "mean_token_accuracy": 0.9424629133939743, "num_tokens": 223326315.0, "step": 79600 }, { "epoch": 1.9344190674983617, "grad_norm": 0.8072234392166138, "learning_rate": 6.751051265003779e-05, "loss": 0.2577, "mean_token_accuracy": 0.9369598460197449, "num_tokens": 223600784.0, "step": 79700 }, { "epoch": 1.9368461930535665, "grad_norm": 0.5053612589836121, "learning_rate": 6.724355264677148e-05, "loss": 0.243, "mean_token_accuracy": 0.9401568102836609, "num_tokens": 223895731.0, "step": 79800 }, { "epoch": 1.9392733186087716, "grad_norm": 0.8727260828018188, "learning_rate": 6.69768538949336e-05, "loss": 0.2488, "mean_token_accuracy": 0.9389541667699813, "num_tokens": 224176472.0, "step": 79900 }, { "epoch": 1.9417004441639767, "grad_norm": 0.8288789987564087, "learning_rate": 6.671041852159937e-05, "loss": 0.2571, "mean_token_accuracy": 0.936657503247261, "num_tokens": 224447801.0, "step": 80000 }, { "epoch": 1.9441275697191815, "grad_norm": 0.3764743506908417, "learning_rate": 6.644424865174354e-05, "loss": 0.2589, "mean_token_accuracy": 0.9362627816200256, "num_tokens": 224750029.0, "step": 80100 }, { "epoch": 1.9465546952743864, "grad_norm": 0.5590245723724365, "learning_rate": 6.617834640822325e-05, "loss": 0.2438, "mean_token_accuracy": 0.9403492480516433, "num_tokens": 225021335.0, "step": 80200 }, { "epoch": 1.9489818208295915, "grad_norm": 0.5417003035545349, "learning_rate": 6.591271391176121e-05, "loss": 0.2651, "mean_token_accuracy": 0.935703609585762, "num_tokens": 225307283.0, "step": 80300 }, { "epoch": 1.9514089463847966, "grad_norm": 0.5252110958099365, "learning_rate": 6.564735328092868e-05, "loss": 0.2718, "mean_token_accuracy": 0.9335632401704789, "num_tokens": 225605260.0, "step": 80400 }, { "epoch": 1.9538360719400014, "grad_norm": 0.7686030268669128, "learning_rate": 6.538226663212868e-05, "loss": 0.2416, "mean_token_accuracy": 0.9402110683917999, "num_tokens": 225875026.0, "step": 80500 }, { "epoch": 1.9562631974952063, "grad_norm": 0.33636564016342163, "learning_rate": 6.511745607957906e-05, "loss": 0.2678, "mean_token_accuracy": 0.93425896525383, "num_tokens": 226153286.0, "step": 80600 }, { "epoch": 1.9586903230504114, "grad_norm": 0.8506200313568115, "learning_rate": 6.485292373529565e-05, "loss": 0.2372, "mean_token_accuracy": 0.9409923559427261, "num_tokens": 226438345.0, "step": 80700 }, { "epoch": 1.9611174486056164, "grad_norm": 0.4994686543941498, "learning_rate": 6.458867170907539e-05, "loss": 0.3036, "mean_token_accuracy": 0.9279145467281341, "num_tokens": 226724818.0, "step": 80800 }, { "epoch": 1.9635445741608213, "grad_norm": 0.3879760205745697, "learning_rate": 6.432470210847958e-05, "loss": 0.304, "mean_token_accuracy": 0.9270832985639572, "num_tokens": 227002630.0, "step": 80900 }, { "epoch": 1.9659716997160261, "grad_norm": 0.5716784000396729, "learning_rate": 6.40610170388169e-05, "loss": 0.2548, "mean_token_accuracy": 0.9366704255342484, "num_tokens": 227297146.0, "step": 81000 }, { "epoch": 1.9683988252712314, "grad_norm": 0.7849399447441101, "learning_rate": 6.379761860312682e-05, "loss": 0.2394, "mean_token_accuracy": 0.9409961378574372, "num_tokens": 227576379.0, "step": 81100 }, { "epoch": 1.9708259508264363, "grad_norm": 0.5937269330024719, "learning_rate": 6.353450890216279e-05, "loss": 0.2384, "mean_token_accuracy": 0.9405011647939682, "num_tokens": 227839735.0, "step": 81200 }, { "epoch": 1.9732530763816412, "grad_norm": 1.1839408874511719, "learning_rate": 6.327169003437527e-05, "loss": 0.2298, "mean_token_accuracy": 0.9422443634271622, "num_tokens": 228113960.0, "step": 81300 }, { "epoch": 1.9756802019368462, "grad_norm": 0.4460374712944031, "learning_rate": 6.300916409589531e-05, "loss": 0.2479, "mean_token_accuracy": 0.9397810900211334, "num_tokens": 228395773.0, "step": 81400 }, { "epoch": 1.9781073274920513, "grad_norm": 0.5764565467834473, "learning_rate": 6.274693318051765e-05, "loss": 0.2468, "mean_token_accuracy": 0.9393291866779327, "num_tokens": 228674065.0, "step": 81500 }, { "epoch": 1.9805344530472562, "grad_norm": 0.5391059517860413, "learning_rate": 6.248499937968397e-05, "loss": 0.2694, "mean_token_accuracy": 0.9344400101900101, "num_tokens": 228950893.0, "step": 81600 }, { "epoch": 1.982961578602461, "grad_norm": 0.44228363037109375, "learning_rate": 6.222336478246636e-05, "loss": 0.2147, "mean_token_accuracy": 0.9461823815107345, "num_tokens": 229227673.0, "step": 81700 }, { "epoch": 1.985388704157666, "grad_norm": 0.6836258172988892, "learning_rate": 6.196203147555066e-05, "loss": 0.2465, "mean_token_accuracy": 0.9392766255140305, "num_tokens": 229499479.0, "step": 81800 }, { "epoch": 1.9878158297128712, "grad_norm": 0.8605006337165833, "learning_rate": 6.170100154321954e-05, "loss": 0.2592, "mean_token_accuracy": 0.9367054688930512, "num_tokens": 229769342.0, "step": 81900 }, { "epoch": 1.990242955268076, "grad_norm": 0.7405674457550049, "learning_rate": 6.144027706733629e-05, "loss": 0.2554, "mean_token_accuracy": 0.9385197955369949, "num_tokens": 230033787.0, "step": 82000 }, { "epoch": 1.992670080823281, "grad_norm": 0.5927315950393677, "learning_rate": 6.117986012732793e-05, "loss": 0.2853, "mean_token_accuracy": 0.929689479470253, "num_tokens": 230313605.0, "step": 82100 }, { "epoch": 1.995097206378486, "grad_norm": 0.5205606818199158, "learning_rate": 6.091975280016868e-05, "loss": 0.259, "mean_token_accuracy": 0.9373654896020889, "num_tokens": 230603964.0, "step": 82200 }, { "epoch": 1.997524331933691, "grad_norm": 0.9966670870780945, "learning_rate": 6.065995716036346e-05, "loss": 0.2725, "mean_token_accuracy": 0.9339149141311646, "num_tokens": 230887154.0, "step": 82300 }, { "epoch": 1.999951457488896, "grad_norm": 0.8154625296592712, "learning_rate": 6.0400475279931315e-05, "loss": 0.2743, "mean_token_accuracy": 0.9331579411029816, "num_tokens": 231184353.0, "step": 82400 }, { "epoch": 2.0023785830441008, "grad_norm": 0.5808880925178528, "learning_rate": 6.014130922838881e-05, "loss": 0.1519, "mean_token_accuracy": 0.9608761477470398, "num_tokens": 231479894.0, "step": 82500 }, { "epoch": 2.0048057085993056, "grad_norm": 0.4847000241279602, "learning_rate": 5.988246107273365e-05, "loss": 0.1805, "mean_token_accuracy": 0.9541962110996246, "num_tokens": 231784377.0, "step": 82600 }, { "epoch": 2.007232834154511, "grad_norm": 0.5827914476394653, "learning_rate": 5.962393287742816e-05, "loss": 0.1396, "mean_token_accuracy": 0.9642374736070632, "num_tokens": 232063461.0, "step": 82700 }, { "epoch": 2.009659959709716, "grad_norm": 0.6424087882041931, "learning_rate": 5.936572670438273e-05, "loss": 0.1362, "mean_token_accuracy": 0.9648731410503387, "num_tokens": 232342507.0, "step": 82800 }, { "epoch": 2.0120870852649206, "grad_norm": 0.6409827470779419, "learning_rate": 5.910784461293948e-05, "loss": 0.1643, "mean_token_accuracy": 0.9581991600990295, "num_tokens": 232625967.0, "step": 82900 }, { "epoch": 2.014514210820126, "grad_norm": 0.5854642987251282, "learning_rate": 5.8850288659855846e-05, "loss": 0.1559, "mean_token_accuracy": 0.9603128689527511, "num_tokens": 232900644.0, "step": 83000 }, { "epoch": 2.016941336375331, "grad_norm": 0.521918535232544, "learning_rate": 5.8593060899287965e-05, "loss": 0.1492, "mean_token_accuracy": 0.9615022081136704, "num_tokens": 233181484.0, "step": 83100 }, { "epoch": 2.0193684619305357, "grad_norm": 0.7428982257843018, "learning_rate": 5.8336163382774586e-05, "loss": 0.1688, "mean_token_accuracy": 0.9571781551837921, "num_tokens": 233459275.0, "step": 83200 }, { "epoch": 2.0217955874857405, "grad_norm": 0.44911694526672363, "learning_rate": 5.8079598159220625e-05, "loss": 0.1464, "mean_token_accuracy": 0.9628020924329758, "num_tokens": 233730444.0, "step": 83300 }, { "epoch": 2.024222713040946, "grad_norm": 0.4201023280620575, "learning_rate": 5.782336727488054e-05, "loss": 0.1616, "mean_token_accuracy": 0.9586890661716461, "num_tokens": 234007650.0, "step": 83400 }, { "epoch": 2.0266498385961507, "grad_norm": 0.5022528171539307, "learning_rate": 5.756747277334249e-05, "loss": 0.1889, "mean_token_accuracy": 0.9528667098283767, "num_tokens": 234297366.0, "step": 83500 }, { "epoch": 2.0290769641513555, "grad_norm": 0.29196932911872864, "learning_rate": 5.731191669551165e-05, "loss": 0.152, "mean_token_accuracy": 0.9611919575929642, "num_tokens": 234582826.0, "step": 83600 }, { "epoch": 2.0315040897065604, "grad_norm": 0.2950987219810486, "learning_rate": 5.705670107959407e-05, "loss": 0.1749, "mean_token_accuracy": 0.9554368454217911, "num_tokens": 234863426.0, "step": 83700 }, { "epoch": 2.0339312152617657, "grad_norm": 0.49717605113983154, "learning_rate": 5.6801827961080536e-05, "loss": 0.16, "mean_token_accuracy": 0.9598457747697831, "num_tokens": 235146656.0, "step": 83800 }, { "epoch": 2.0363583408169705, "grad_norm": 0.4079115092754364, "learning_rate": 5.654729937273009e-05, "loss": 0.1395, "mean_token_accuracy": 0.9637018305063247, "num_tokens": 235426935.0, "step": 83900 }, { "epoch": 2.0387854663721754, "grad_norm": 2.313652515411377, "learning_rate": 5.629311734455403e-05, "loss": 0.1656, "mean_token_accuracy": 0.9576411491632462, "num_tokens": 235714856.0, "step": 84000 }, { "epoch": 2.0412125919273802, "grad_norm": 0.5406439900398254, "learning_rate": 5.6039283903799534e-05, "loss": 0.173, "mean_token_accuracy": 0.9562010073661804, "num_tokens": 235984367.0, "step": 84100 }, { "epoch": 2.0436397174825855, "grad_norm": 0.360782653093338, "learning_rate": 5.5785801074933786e-05, "loss": 0.1532, "mean_token_accuracy": 0.9607203018665313, "num_tokens": 236262299.0, "step": 84200 }, { "epoch": 2.0460668430377904, "grad_norm": 0.5406671762466431, "learning_rate": 5.5532670879627455e-05, "loss": 0.1522, "mean_token_accuracy": 0.9607596349716186, "num_tokens": 236550358.0, "step": 84300 }, { "epoch": 2.0484939685929953, "grad_norm": 0.4788415729999542, "learning_rate": 5.527989533673881e-05, "loss": 0.1448, "mean_token_accuracy": 0.9628503209352494, "num_tokens": 236822868.0, "step": 84400 }, { "epoch": 2.0509210941482, "grad_norm": 0.5984004139900208, "learning_rate": 5.5027476462297635e-05, "loss": 0.1625, "mean_token_accuracy": 0.9581832873821259, "num_tokens": 237108453.0, "step": 84500 }, { "epoch": 2.0533482197034054, "grad_norm": 0.41232043504714966, "learning_rate": 5.4775416269489e-05, "loss": 0.1541, "mean_token_accuracy": 0.9605655258893967, "num_tokens": 237386750.0, "step": 84600 }, { "epoch": 2.0557753452586103, "grad_norm": 0.5192024111747742, "learning_rate": 5.45237167686373e-05, "loss": 0.1611, "mean_token_accuracy": 0.9575982117652893, "num_tokens": 237657392.0, "step": 84700 }, { "epoch": 2.058202470813815, "grad_norm": 0.4785330593585968, "learning_rate": 5.427237996719017e-05, "loss": 0.1511, "mean_token_accuracy": 0.9610388678312302, "num_tokens": 237928389.0, "step": 84800 }, { "epoch": 2.06062959636902, "grad_norm": 0.4883587956428528, "learning_rate": 5.402140786970264e-05, "loss": 0.1323, "mean_token_accuracy": 0.9661263453960419, "num_tokens": 238194140.0, "step": 84900 }, { "epoch": 2.0630567219242253, "grad_norm": 0.3393934965133667, "learning_rate": 5.377080247782088e-05, "loss": 0.15, "mean_token_accuracy": 0.9611387658119201, "num_tokens": 238474238.0, "step": 85000 }, { "epoch": 2.06548384747943, "grad_norm": 0.40510058403015137, "learning_rate": 5.3520565790266386e-05, "loss": 0.1544, "mean_token_accuracy": 0.9601859641075134, "num_tokens": 238755497.0, "step": 85100 }, { "epoch": 2.067910973034635, "grad_norm": 0.4711453318595886, "learning_rate": 5.327069980282017e-05, "loss": 0.1559, "mean_token_accuracy": 0.9597002917528152, "num_tokens": 239032904.0, "step": 85200 }, { "epoch": 2.07033809858984, "grad_norm": 0.20721179246902466, "learning_rate": 5.302120650830643e-05, "loss": 0.1568, "mean_token_accuracy": 0.9598673725128174, "num_tokens": 239324806.0, "step": 85300 }, { "epoch": 2.072765224145045, "grad_norm": 0.6029618978500366, "learning_rate": 5.277208789657722e-05, "loss": 0.1616, "mean_token_accuracy": 0.959028176665306, "num_tokens": 239598686.0, "step": 85400 }, { "epoch": 2.07519234970025, "grad_norm": 0.8069055080413818, "learning_rate": 5.252334595449608e-05, "loss": 0.1614, "mean_token_accuracy": 0.9587891620397567, "num_tokens": 239875219.0, "step": 85500 }, { "epoch": 2.077619475255455, "grad_norm": 0.576477587223053, "learning_rate": 5.227498266592249e-05, "loss": 0.1725, "mean_token_accuracy": 0.9557231628894806, "num_tokens": 240146288.0, "step": 85600 }, { "epoch": 2.0800466008106597, "grad_norm": 0.8168060183525085, "learning_rate": 5.202700001169587e-05, "loss": 0.1486, "mean_token_accuracy": 0.9620643395185471, "num_tokens": 240435471.0, "step": 85700 }, { "epoch": 2.082473726365865, "grad_norm": 0.49945294857025146, "learning_rate": 5.177939996961998e-05, "loss": 0.1776, "mean_token_accuracy": 0.95517926633358, "num_tokens": 240731297.0, "step": 85800 }, { "epoch": 2.08490085192107, "grad_norm": 0.30427655577659607, "learning_rate": 5.1532184514446955e-05, "loss": 0.1584, "mean_token_accuracy": 0.9593313550949096, "num_tokens": 241013465.0, "step": 85900 }, { "epoch": 2.0873279774762747, "grad_norm": 0.7176592350006104, "learning_rate": 5.128535561786159e-05, "loss": 0.1782, "mean_token_accuracy": 0.95425497174263, "num_tokens": 241286724.0, "step": 86000 }, { "epoch": 2.0897551030314796, "grad_norm": 0.7456308603286743, "learning_rate": 5.10389152484658e-05, "loss": 0.1433, "mean_token_accuracy": 0.9630332779884339, "num_tokens": 241565348.0, "step": 86100 }, { "epoch": 2.092182228586685, "grad_norm": 0.3332330882549286, "learning_rate": 5.0792865371762534e-05, "loss": 0.178, "mean_token_accuracy": 0.9545424026250839, "num_tokens": 241842815.0, "step": 86200 }, { "epoch": 2.0946093541418898, "grad_norm": 0.21731993556022644, "learning_rate": 5.054720795014061e-05, "loss": 0.1677, "mean_token_accuracy": 0.9578067481517791, "num_tokens": 242130092.0, "step": 86300 }, { "epoch": 2.0970364796970946, "grad_norm": 0.5908037424087524, "learning_rate": 5.030194494285859e-05, "loss": 0.1718, "mean_token_accuracy": 0.9564685708284378, "num_tokens": 242405499.0, "step": 86400 }, { "epoch": 2.0994636052523, "grad_norm": 0.3582291007041931, "learning_rate": 5.005707830602942e-05, "loss": 0.145, "mean_token_accuracy": 0.9633462530374527, "num_tokens": 242688957.0, "step": 86500 }, { "epoch": 2.1018907308075048, "grad_norm": 0.630010724067688, "learning_rate": 4.981260999260484e-05, "loss": 0.137, "mean_token_accuracy": 0.96411676466465, "num_tokens": 242966623.0, "step": 86600 }, { "epoch": 2.1043178563627096, "grad_norm": 0.2612079977989197, "learning_rate": 4.956854195235963e-05, "loss": 0.1725, "mean_token_accuracy": 0.9565469098091125, "num_tokens": 243227363.0, "step": 86700 }, { "epoch": 2.1067449819179145, "grad_norm": 0.44951051473617554, "learning_rate": 4.9324876131876186e-05, "loss": 0.1574, "mean_token_accuracy": 0.9592509204149247, "num_tokens": 243509573.0, "step": 86800 }, { "epoch": 2.10917210747312, "grad_norm": 0.601586103439331, "learning_rate": 4.9081614474528944e-05, "loss": 0.1843, "mean_token_accuracy": 0.9530947256088257, "num_tokens": 243780742.0, "step": 86900 }, { "epoch": 2.1115992330283246, "grad_norm": 0.3754229247570038, "learning_rate": 4.883875892046904e-05, "loss": 0.159, "mean_token_accuracy": 0.9592651635408401, "num_tokens": 244048761.0, "step": 87000 }, { "epoch": 2.1140263585835295, "grad_norm": 0.5469896793365479, "learning_rate": 4.8596311406608555e-05, "loss": 0.1515, "mean_token_accuracy": 0.9612551802396774, "num_tokens": 244323569.0, "step": 87100 }, { "epoch": 2.1164534841387344, "grad_norm": 0.33415502309799194, "learning_rate": 4.8354273866605234e-05, "loss": 0.1572, "mean_token_accuracy": 0.9593722647428513, "num_tokens": 244613096.0, "step": 87200 }, { "epoch": 2.1188806096939397, "grad_norm": 0.38363969326019287, "learning_rate": 4.8112648230847204e-05, "loss": 0.1724, "mean_token_accuracy": 0.9562730824947357, "num_tokens": 244888619.0, "step": 87300 }, { "epoch": 2.1213077352491445, "grad_norm": 0.3998253345489502, "learning_rate": 4.787143642643715e-05, "loss": 0.1561, "mean_token_accuracy": 0.9594432032108307, "num_tokens": 245179499.0, "step": 87400 }, { "epoch": 2.1237348608043494, "grad_norm": 0.38273563981056213, "learning_rate": 4.763064037717745e-05, "loss": 0.1474, "mean_token_accuracy": 0.962109939455986, "num_tokens": 245467289.0, "step": 87500 }, { "epoch": 2.1261619863595542, "grad_norm": 0.24416208267211914, "learning_rate": 4.7390262003554484e-05, "loss": 0.1514, "mean_token_accuracy": 0.9603712677955627, "num_tokens": 245758328.0, "step": 87600 }, { "epoch": 2.1285891119147595, "grad_norm": 1.0365910530090332, "learning_rate": 4.7150303222723434e-05, "loss": 0.1556, "mean_token_accuracy": 0.960610026717186, "num_tokens": 246051662.0, "step": 87700 }, { "epoch": 2.1310162374699644, "grad_norm": 0.9765134453773499, "learning_rate": 4.691076594849308e-05, "loss": 0.1446, "mean_token_accuracy": 0.9626650083065033, "num_tokens": 246339457.0, "step": 87800 }, { "epoch": 2.1334433630251692, "grad_norm": 0.6015363931655884, "learning_rate": 4.6671652091310305e-05, "loss": 0.179, "mean_token_accuracy": 0.9543536853790283, "num_tokens": 246621131.0, "step": 87900 }, { "epoch": 2.135870488580374, "grad_norm": 0.3435966968536377, "learning_rate": 4.64329635582451e-05, "loss": 0.1693, "mean_token_accuracy": 0.9579638803005218, "num_tokens": 246905268.0, "step": 88000 }, { "epoch": 2.1382976141355794, "grad_norm": 0.957961916923523, "learning_rate": 4.6194702252975184e-05, "loss": 0.1371, "mean_token_accuracy": 0.9644148916006088, "num_tokens": 247179877.0, "step": 88100 }, { "epoch": 2.1407247396907843, "grad_norm": 0.5038435459136963, "learning_rate": 4.5956870075771e-05, "loss": 0.1409, "mean_token_accuracy": 0.9635371136665344, "num_tokens": 247460346.0, "step": 88200 }, { "epoch": 2.143151865245989, "grad_norm": 0.6354673504829407, "learning_rate": 4.571946892348032e-05, "loss": 0.1741, "mean_token_accuracy": 0.9555548268556595, "num_tokens": 247731885.0, "step": 88300 }, { "epoch": 2.145578990801194, "grad_norm": 0.7258239984512329, "learning_rate": 4.54825006895133e-05, "loss": 0.1639, "mean_token_accuracy": 0.9581418389081955, "num_tokens": 248011968.0, "step": 88400 }, { "epoch": 2.1480061163563993, "grad_norm": 0.31706157326698303, "learning_rate": 4.524596726382736e-05, "loss": 0.1597, "mean_token_accuracy": 0.9591193878650666, "num_tokens": 248294086.0, "step": 88500 }, { "epoch": 2.150433241911604, "grad_norm": 0.6476277709007263, "learning_rate": 4.500987053291202e-05, "loss": 0.1445, "mean_token_accuracy": 0.9631332182884216, "num_tokens": 248571328.0, "step": 88600 }, { "epoch": 2.152860367466809, "grad_norm": 0.39604470133781433, "learning_rate": 4.477421237977394e-05, "loss": 0.1386, "mean_token_accuracy": 0.9638280099630356, "num_tokens": 248842735.0, "step": 88700 }, { "epoch": 2.155287493022014, "grad_norm": 0.2958824634552002, "learning_rate": 4.453899468392181e-05, "loss": 0.1247, "mean_token_accuracy": 0.9681485140323639, "num_tokens": 249108092.0, "step": 88800 }, { "epoch": 2.157714618577219, "grad_norm": 0.6001761555671692, "learning_rate": 4.430421932135156e-05, "loss": 0.1384, "mean_token_accuracy": 0.9643767905235291, "num_tokens": 249385884.0, "step": 88900 }, { "epoch": 2.160141744132424, "grad_norm": 0.9776992201805115, "learning_rate": 4.406988816453111e-05, "loss": 0.1633, "mean_token_accuracy": 0.9592347794771194, "num_tokens": 249664562.0, "step": 89000 }, { "epoch": 2.162568869687629, "grad_norm": 0.3612774908542633, "learning_rate": 4.3836003082385636e-05, "loss": 0.1489, "mean_token_accuracy": 0.9616664987802506, "num_tokens": 249940419.0, "step": 89100 }, { "epoch": 2.1649959952428337, "grad_norm": 0.6092312335968018, "learning_rate": 4.3602565940282715e-05, "loss": 0.1508, "mean_token_accuracy": 0.9607537633180618, "num_tokens": 250215092.0, "step": 89200 }, { "epoch": 2.167423120798039, "grad_norm": 0.46185943484306335, "learning_rate": 4.3369578600017105e-05, "loss": 0.1726, "mean_token_accuracy": 0.9566762536764145, "num_tokens": 250503318.0, "step": 89300 }, { "epoch": 2.169850246353244, "grad_norm": 0.5038064122200012, "learning_rate": 4.31370429197964e-05, "loss": 0.1628, "mean_token_accuracy": 0.958371422290802, "num_tokens": 250783576.0, "step": 89400 }, { "epoch": 2.1722773719084487, "grad_norm": 0.3340010941028595, "learning_rate": 4.290496075422581e-05, "loss": 0.1456, "mean_token_accuracy": 0.9626976943016052, "num_tokens": 251067459.0, "step": 89500 }, { "epoch": 2.1747044974636536, "grad_norm": 0.3565070927143097, "learning_rate": 4.2673333954293496e-05, "loss": 0.1524, "mean_token_accuracy": 0.9609863895177841, "num_tokens": 251372607.0, "step": 89600 }, { "epoch": 2.177131623018859, "grad_norm": 0.3239290118217468, "learning_rate": 4.244216436735583e-05, "loss": 0.1696, "mean_token_accuracy": 0.956989768743515, "num_tokens": 251643396.0, "step": 89700 }, { "epoch": 2.1795587485740637, "grad_norm": 0.8524539470672607, "learning_rate": 4.221145383712274e-05, "loss": 0.1486, "mean_token_accuracy": 0.9617244362831116, "num_tokens": 251917277.0, "step": 89800 }, { "epoch": 2.1819858741292686, "grad_norm": 0.3852197825908661, "learning_rate": 4.19812042036428e-05, "loss": 0.1583, "mean_token_accuracy": 0.9597838073968887, "num_tokens": 252194410.0, "step": 89900 }, { "epoch": 2.184412999684474, "grad_norm": 0.5747129917144775, "learning_rate": 4.1751417303288666e-05, "loss": 0.1539, "mean_token_accuracy": 0.9608057707548141, "num_tokens": 252465960.0, "step": 90000 }, { "epoch": 2.1868401252396787, "grad_norm": 0.5769911408424377, "learning_rate": 4.15220949687426e-05, "loss": 0.1549, "mean_token_accuracy": 0.9605850332975387, "num_tokens": 252737863.0, "step": 90100 }, { "epoch": 2.1892672507948836, "grad_norm": 0.45625218749046326, "learning_rate": 4.12932390289814e-05, "loss": 0.1548, "mean_token_accuracy": 0.960279346704483, "num_tokens": 253005781.0, "step": 90200 }, { "epoch": 2.1916943763500885, "grad_norm": 0.6102867126464844, "learning_rate": 4.1064851309262374e-05, "loss": 0.1719, "mean_token_accuracy": 0.9562070280313492, "num_tokens": 253282421.0, "step": 90300 }, { "epoch": 2.1941215019052938, "grad_norm": 0.418035626411438, "learning_rate": 4.0836933631108364e-05, "loss": 0.1544, "mean_token_accuracy": 0.9601026111841202, "num_tokens": 253548675.0, "step": 90400 }, { "epoch": 2.1965486274604986, "grad_norm": 0.5558539032936096, "learning_rate": 4.0609487812293344e-05, "loss": 0.1428, "mean_token_accuracy": 0.962897977232933, "num_tokens": 253824242.0, "step": 90500 }, { "epoch": 2.1989757530157035, "grad_norm": 0.31991976499557495, "learning_rate": 4.038251566682806e-05, "loss": 0.1684, "mean_token_accuracy": 0.9578196865320205, "num_tokens": 254108773.0, "step": 90600 }, { "epoch": 2.2014028785709083, "grad_norm": 0.35955339670181274, "learning_rate": 4.0156019004945336e-05, "loss": 0.1552, "mean_token_accuracy": 0.960447661280632, "num_tokens": 254398547.0, "step": 90700 }, { "epoch": 2.2038300041261136, "grad_norm": 0.3822309374809265, "learning_rate": 3.9929999633085757e-05, "loss": 0.1532, "mean_token_accuracy": 0.961047391295433, "num_tokens": 254649735.0, "step": 90800 }, { "epoch": 2.2062571296813185, "grad_norm": 0.6658386588096619, "learning_rate": 3.970445935388325e-05, "loss": 0.1549, "mean_token_accuracy": 0.9600626587867737, "num_tokens": 254930759.0, "step": 90900 }, { "epoch": 2.2086842552365233, "grad_norm": 0.4035944938659668, "learning_rate": 3.947939996615077e-05, "loss": 0.1532, "mean_token_accuracy": 0.9611799818277359, "num_tokens": 255195830.0, "step": 91000 }, { "epoch": 2.211111380791728, "grad_norm": 0.3729723393917084, "learning_rate": 3.9254823264865805e-05, "loss": 0.145, "mean_token_accuracy": 0.9629390466213227, "num_tokens": 255473905.0, "step": 91100 }, { "epoch": 2.2135385063469335, "grad_norm": 0.390670508146286, "learning_rate": 3.9030731041156135e-05, "loss": 0.1423, "mean_token_accuracy": 0.9631787258386612, "num_tokens": 255757741.0, "step": 91200 }, { "epoch": 2.2159656319021384, "grad_norm": 0.8064032196998596, "learning_rate": 3.880712508228568e-05, "loss": 0.1654, "mean_token_accuracy": 0.9579305970668792, "num_tokens": 256037860.0, "step": 91300 }, { "epoch": 2.218392757457343, "grad_norm": 0.43664440512657166, "learning_rate": 3.858400717163991e-05, "loss": 0.1533, "mean_token_accuracy": 0.9610221099853515, "num_tokens": 256309514.0, "step": 91400 }, { "epoch": 2.220819883012548, "grad_norm": 0.4479061961174011, "learning_rate": 3.8361379088712055e-05, "loss": 0.1518, "mean_token_accuracy": 0.9613274109363555, "num_tokens": 256599516.0, "step": 91500 }, { "epoch": 2.2232470085677534, "grad_norm": 0.2272028774023056, "learning_rate": 3.8139242609088535e-05, "loss": 0.1486, "mean_token_accuracy": 0.9616603809595108, "num_tokens": 256883150.0, "step": 91600 }, { "epoch": 2.2256741341229582, "grad_norm": 0.6324403285980225, "learning_rate": 3.7917599504434976e-05, "loss": 0.1714, "mean_token_accuracy": 0.9572657948732376, "num_tokens": 257177448.0, "step": 91700 }, { "epoch": 2.228101259678163, "grad_norm": 0.43273621797561646, "learning_rate": 3.7696451542482136e-05, "loss": 0.1381, "mean_token_accuracy": 0.9640599012374877, "num_tokens": 257449995.0, "step": 91800 }, { "epoch": 2.230528385233368, "grad_norm": 0.46854960918426514, "learning_rate": 3.747580048701165e-05, "loss": 0.1443, "mean_token_accuracy": 0.9633812409639358, "num_tokens": 257727340.0, "step": 91900 }, { "epoch": 2.2329555107885732, "grad_norm": 0.5193477272987366, "learning_rate": 3.7255648097842044e-05, "loss": 0.1529, "mean_token_accuracy": 0.9609187692403793, "num_tokens": 257997280.0, "step": 92000 }, { "epoch": 2.235382636343778, "grad_norm": 0.9509478807449341, "learning_rate": 3.7035996130814696e-05, "loss": 0.1286, "mean_token_accuracy": 0.9667005783319473, "num_tokens": 258291657.0, "step": 92100 }, { "epoch": 2.237809761898983, "grad_norm": 0.7098160982131958, "learning_rate": 3.681684633777991e-05, "loss": 0.1394, "mean_token_accuracy": 0.9636602461338043, "num_tokens": 258573145.0, "step": 92200 }, { "epoch": 2.240236887454188, "grad_norm": 0.3125475347042084, "learning_rate": 3.6598200466582744e-05, "loss": 0.1639, "mean_token_accuracy": 0.9590049821138382, "num_tokens": 258859618.0, "step": 92300 }, { "epoch": 2.242664013009393, "grad_norm": 0.43968746066093445, "learning_rate": 3.6380060261049244e-05, "loss": 0.1455, "mean_token_accuracy": 0.9627101892232894, "num_tokens": 259138960.0, "step": 92400 }, { "epoch": 2.245091138564598, "grad_norm": 0.3362375497817993, "learning_rate": 3.616242746097254e-05, "loss": 0.1606, "mean_token_accuracy": 0.9594524520635604, "num_tokens": 259415537.0, "step": 92500 }, { "epoch": 2.247518264119803, "grad_norm": 0.41409507393836975, "learning_rate": 3.594530380209881e-05, "loss": 0.1436, "mean_token_accuracy": 0.9630869609117508, "num_tokens": 259689569.0, "step": 92600 }, { "epoch": 2.249945389675008, "grad_norm": 0.23429489135742188, "learning_rate": 3.57286910161136e-05, "loss": 0.1385, "mean_token_accuracy": 0.9647276037931443, "num_tokens": 259962737.0, "step": 92700 }, { "epoch": 2.252372515230213, "grad_norm": 0.3240128457546234, "learning_rate": 3.551259083062792e-05, "loss": 0.1479, "mean_token_accuracy": 0.9617342710494995, "num_tokens": 260232903.0, "step": 92800 }, { "epoch": 2.254799640785418, "grad_norm": 0.31519198417663574, "learning_rate": 3.5297004969164604e-05, "loss": 0.1412, "mean_token_accuracy": 0.9640700775384903, "num_tokens": 260503493.0, "step": 92900 }, { "epoch": 2.2572267663406227, "grad_norm": 0.3446758985519409, "learning_rate": 3.508193515114423e-05, "loss": 0.139, "mean_token_accuracy": 0.9643152779340745, "num_tokens": 260792452.0, "step": 93000 }, { "epoch": 2.2596538918958275, "grad_norm": 0.5381621718406677, "learning_rate": 3.4867383091871854e-05, "loss": 0.1531, "mean_token_accuracy": 0.9607863134145737, "num_tokens": 261083921.0, "step": 93100 }, { "epoch": 2.262081017451033, "grad_norm": 0.5895318388938904, "learning_rate": 3.46533505025231e-05, "loss": 0.134, "mean_token_accuracy": 0.9654397374391556, "num_tokens": 261369478.0, "step": 93200 }, { "epoch": 2.2645081430062377, "grad_norm": 0.49713894724845886, "learning_rate": 3.443983909013029e-05, "loss": 0.1392, "mean_token_accuracy": 0.964362952709198, "num_tokens": 261640652.0, "step": 93300 }, { "epoch": 2.2669352685614426, "grad_norm": 0.561633288860321, "learning_rate": 3.422685055756936e-05, "loss": 0.1356, "mean_token_accuracy": 0.9651344537734985, "num_tokens": 261918219.0, "step": 93400 }, { "epoch": 2.269362394116648, "grad_norm": 0.5383502840995789, "learning_rate": 3.401438660354579e-05, "loss": 0.1461, "mean_token_accuracy": 0.9628910517692566, "num_tokens": 262208454.0, "step": 93500 }, { "epoch": 2.2717895196718527, "grad_norm": 0.4417279064655304, "learning_rate": 3.38024489225813e-05, "loss": 0.1177, "mean_token_accuracy": 0.9697069215774536, "num_tokens": 262493658.0, "step": 93600 }, { "epoch": 2.2742166452270576, "grad_norm": 0.38798466324806213, "learning_rate": 3.359103920500025e-05, "loss": 0.1556, "mean_token_accuracy": 0.9601915013790131, "num_tokens": 262774349.0, "step": 93700 }, { "epoch": 2.2766437707822624, "grad_norm": 0.4257045388221741, "learning_rate": 3.3380159136916287e-05, "loss": 0.1441, "mean_token_accuracy": 0.9632410365343094, "num_tokens": 263049670.0, "step": 93800 }, { "epoch": 2.2790708963374673, "grad_norm": 0.45404383540153503, "learning_rate": 3.3169810400218707e-05, "loss": 0.1187, "mean_token_accuracy": 0.9688757717609405, "num_tokens": 263325433.0, "step": 93900 }, { "epoch": 2.2814980218926726, "grad_norm": 0.5358030200004578, "learning_rate": 3.29599946725591e-05, "loss": 0.1425, "mean_token_accuracy": 0.9637533491849899, "num_tokens": 263607707.0, "step": 94000 }, { "epoch": 2.2839251474478774, "grad_norm": 0.2688960134983063, "learning_rate": 3.275071362733818e-05, "loss": 0.152, "mean_token_accuracy": 0.9612334263324738, "num_tokens": 263884293.0, "step": 94100 }, { "epoch": 2.2863522730030823, "grad_norm": 0.4777241051197052, "learning_rate": 3.2541968933692014e-05, "loss": 0.1398, "mean_token_accuracy": 0.9639061081409455, "num_tokens": 264165137.0, "step": 94200 }, { "epoch": 2.2887793985582876, "grad_norm": 0.6694493889808655, "learning_rate": 3.233376225647919e-05, "loss": 0.1532, "mean_token_accuracy": 0.960937168598175, "num_tokens": 264437076.0, "step": 94300 }, { "epoch": 2.2912065241134925, "grad_norm": 0.6230403780937195, "learning_rate": 3.2126095256267166e-05, "loss": 0.1441, "mean_token_accuracy": 0.9634155595302581, "num_tokens": 264710455.0, "step": 94400 }, { "epoch": 2.2936336496686973, "grad_norm": 0.48842713236808777, "learning_rate": 3.191896958931917e-05, "loss": 0.1471, "mean_token_accuracy": 0.962595841884613, "num_tokens": 264992074.0, "step": 94500 }, { "epoch": 2.296060775223902, "grad_norm": 0.2296297550201416, "learning_rate": 3.1712386907581105e-05, "loss": 0.147, "mean_token_accuracy": 0.9623468136787414, "num_tokens": 265277308.0, "step": 94600 }, { "epoch": 2.2984879007791075, "grad_norm": 0.2678510844707489, "learning_rate": 3.150634885866812e-05, "loss": 0.1658, "mean_token_accuracy": 0.9582432430982589, "num_tokens": 265554404.0, "step": 94700 }, { "epoch": 2.3009150263343123, "grad_norm": 0.24679508805274963, "learning_rate": 3.130085708585169e-05, "loss": 0.1339, "mean_token_accuracy": 0.9652345758676529, "num_tokens": 265839961.0, "step": 94800 }, { "epoch": 2.303342151889517, "grad_norm": 0.5927382707595825, "learning_rate": 3.109591322804634e-05, "loss": 0.1408, "mean_token_accuracy": 0.9638234466314316, "num_tokens": 266123252.0, "step": 94900 }, { "epoch": 2.305769277444722, "grad_norm": 0.30509328842163086, "learning_rate": 3.089151891979678e-05, "loss": 0.124, "mean_token_accuracy": 0.9677273762226105, "num_tokens": 266401504.0, "step": 95000 }, { "epoch": 2.3081964029999273, "grad_norm": 0.7120338678359985, "learning_rate": 3.0687675791264656e-05, "loss": 0.1276, "mean_token_accuracy": 0.9665586167573929, "num_tokens": 266681814.0, "step": 95100 }, { "epoch": 2.310623528555132, "grad_norm": 0.312296599149704, "learning_rate": 3.0484385468215637e-05, "loss": 0.1298, "mean_token_accuracy": 0.9670355755090714, "num_tokens": 266966361.0, "step": 95200 }, { "epoch": 2.313050654110337, "grad_norm": 0.2584036588668823, "learning_rate": 3.0281649572006566e-05, "loss": 0.1374, "mean_token_accuracy": 0.9641981101036072, "num_tokens": 267243149.0, "step": 95300 }, { "epoch": 2.315477779665542, "grad_norm": 0.9581860303878784, "learning_rate": 3.0079469719572206e-05, "loss": 0.1483, "mean_token_accuracy": 0.9624637293815613, "num_tokens": 267532595.0, "step": 95400 }, { "epoch": 2.317904905220747, "grad_norm": 0.6013193726539612, "learning_rate": 2.9877847523412784e-05, "loss": 0.1355, "mean_token_accuracy": 0.9647196280956268, "num_tokens": 267805629.0, "step": 95500 }, { "epoch": 2.320332030775952, "grad_norm": 0.4272877275943756, "learning_rate": 2.9676784591580743e-05, "loss": 0.1443, "mean_token_accuracy": 0.9627549290657044, "num_tokens": 268072889.0, "step": 95600 }, { "epoch": 2.322759156331157, "grad_norm": 0.4217541515827179, "learning_rate": 2.9476282527668097e-05, "loss": 0.1322, "mean_token_accuracy": 0.9650915312767029, "num_tokens": 268342006.0, "step": 95700 }, { "epoch": 2.325186281886362, "grad_norm": 0.5044764876365662, "learning_rate": 2.927634293079369e-05, "loss": 0.1358, "mean_token_accuracy": 0.9652171546220779, "num_tokens": 268613702.0, "step": 95800 }, { "epoch": 2.327613407441567, "grad_norm": 0.7305507063865662, "learning_rate": 2.9076967395590305e-05, "loss": 0.1371, "mean_token_accuracy": 0.9650354409217834, "num_tokens": 268903603.0, "step": 95900 }, { "epoch": 2.330040532996772, "grad_norm": 0.2776518762111664, "learning_rate": 2.8878157512192017e-05, "loss": 0.1535, "mean_token_accuracy": 0.9604264676570893, "num_tokens": 269200151.0, "step": 96000 }, { "epoch": 2.332467658551977, "grad_norm": 0.4138396680355072, "learning_rate": 2.867991486622148e-05, "loss": 0.1339, "mean_token_accuracy": 0.9657935965061187, "num_tokens": 269489962.0, "step": 96100 }, { "epoch": 2.334894784107182, "grad_norm": 0.32120972871780396, "learning_rate": 2.848224103877739e-05, "loss": 0.1465, "mean_token_accuracy": 0.9628925704956055, "num_tokens": 269766960.0, "step": 96200 }, { "epoch": 2.337321909662387, "grad_norm": 0.8308660387992859, "learning_rate": 2.8285137606421728e-05, "loss": 0.1427, "mean_token_accuracy": 0.9637695878744126, "num_tokens": 270042872.0, "step": 96300 }, { "epoch": 2.339749035217592, "grad_norm": 0.3830409646034241, "learning_rate": 2.8088606141167196e-05, "loss": 0.1443, "mean_token_accuracy": 0.9631164449453354, "num_tokens": 270329997.0, "step": 96400 }, { "epoch": 2.3421761607727967, "grad_norm": 0.4517403244972229, "learning_rate": 2.7892648210464878e-05, "loss": 0.1466, "mean_token_accuracy": 0.9625603079795837, "num_tokens": 270606605.0, "step": 96500 }, { "epoch": 2.3446032863280015, "grad_norm": 0.3587762713432312, "learning_rate": 2.769726537719145e-05, "loss": 0.1662, "mean_token_accuracy": 0.9586023986339569, "num_tokens": 270903974.0, "step": 96600 }, { "epoch": 2.347030411883207, "grad_norm": 0.31619253754615784, "learning_rate": 2.7502459199636908e-05, "loss": 0.1189, "mean_token_accuracy": 0.9690315425395966, "num_tokens": 271194872.0, "step": 96700 }, { "epoch": 2.3494575374384117, "grad_norm": 0.5178159475326538, "learning_rate": 2.7308231231492087e-05, "loss": 0.1333, "mean_token_accuracy": 0.9646364021301269, "num_tokens": 271473378.0, "step": 96800 }, { "epoch": 2.3518846629936165, "grad_norm": 0.32339227199554443, "learning_rate": 2.711458302183635e-05, "loss": 0.1484, "mean_token_accuracy": 0.9622978901863098, "num_tokens": 271766276.0, "step": 96900 }, { "epoch": 2.354311788548822, "grad_norm": 0.6405932903289795, "learning_rate": 2.6921516115124944e-05, "loss": 0.134, "mean_token_accuracy": 0.9650397008657455, "num_tokens": 272051531.0, "step": 97000 }, { "epoch": 2.3567389141040267, "grad_norm": 0.26447561383247375, "learning_rate": 2.672903205117714e-05, "loss": 0.1352, "mean_token_accuracy": 0.9650300896167755, "num_tokens": 272322818.0, "step": 97100 }, { "epoch": 2.3591660396592316, "grad_norm": 0.32654133439064026, "learning_rate": 2.6537132365163542e-05, "loss": 0.1411, "mean_token_accuracy": 0.9641784363985062, "num_tokens": 272588000.0, "step": 97200 }, { "epoch": 2.3615931652144364, "grad_norm": 0.38720738887786865, "learning_rate": 2.634581858759403e-05, "loss": 0.144, "mean_token_accuracy": 0.9631247913837433, "num_tokens": 272861777.0, "step": 97300 }, { "epoch": 2.3640202907696417, "grad_norm": 0.5359301567077637, "learning_rate": 2.615509224430561e-05, "loss": 0.1471, "mean_token_accuracy": 0.9627832245826721, "num_tokens": 273141380.0, "step": 97400 }, { "epoch": 2.3664474163248466, "grad_norm": 0.7095302939414978, "learning_rate": 2.5964954856450087e-05, "loss": 0.1369, "mean_token_accuracy": 0.9649500614404678, "num_tokens": 273425819.0, "step": 97500 }, { "epoch": 2.3688745418800514, "grad_norm": 0.2362442910671234, "learning_rate": 2.577540794048201e-05, "loss": 0.1422, "mean_token_accuracy": 0.9634811532497406, "num_tokens": 273711441.0, "step": 97600 }, { "epoch": 2.3713016674352563, "grad_norm": 0.46207818388938904, "learning_rate": 2.558645300814657e-05, "loss": 0.1253, "mean_token_accuracy": 0.9674885600805283, "num_tokens": 273991541.0, "step": 97700 }, { "epoch": 2.3737287929904616, "grad_norm": 0.628451943397522, "learning_rate": 2.539809156646764e-05, "loss": 0.1159, "mean_token_accuracy": 0.9699374061822891, "num_tokens": 274253602.0, "step": 97800 }, { "epoch": 2.3761559185456664, "grad_norm": 0.3115593492984772, "learning_rate": 2.5210325117735557e-05, "loss": 0.1381, "mean_token_accuracy": 0.9645976287126541, "num_tokens": 274529807.0, "step": 97900 }, { "epoch": 2.3785830441008713, "grad_norm": 0.2984253764152527, "learning_rate": 2.5023155159495282e-05, "loss": 0.1201, "mean_token_accuracy": 0.9683735293149948, "num_tokens": 274822575.0, "step": 98000 }, { "epoch": 2.381010169656076, "grad_norm": 0.5238213539123535, "learning_rate": 2.4836583184534488e-05, "loss": 0.1287, "mean_token_accuracy": 0.9667371785640717, "num_tokens": 275099672.0, "step": 98100 }, { "epoch": 2.3834372952112814, "grad_norm": 0.6737905740737915, "learning_rate": 2.4650610680871444e-05, "loss": 0.1604, "mean_token_accuracy": 0.959024778008461, "num_tokens": 275386344.0, "step": 98200 }, { "epoch": 2.3858644207664863, "grad_norm": 0.5642903447151184, "learning_rate": 2.4465239131743446e-05, "loss": 0.1349, "mean_token_accuracy": 0.9656519025564194, "num_tokens": 275656742.0, "step": 98300 }, { "epoch": 2.388291546321691, "grad_norm": 0.398689866065979, "learning_rate": 2.4280470015594748e-05, "loss": 0.1608, "mean_token_accuracy": 0.9593168395757675, "num_tokens": 275952854.0, "step": 98400 }, { "epoch": 2.390718671876896, "grad_norm": 0.27904191613197327, "learning_rate": 2.409630480606484e-05, "loss": 0.1235, "mean_token_accuracy": 0.9679072177410126, "num_tokens": 276222937.0, "step": 98500 }, { "epoch": 2.3931457974321013, "grad_norm": 1.006825566291809, "learning_rate": 2.3912744971976808e-05, "loss": 0.1306, "mean_token_accuracy": 0.9661997479200363, "num_tokens": 276504510.0, "step": 98600 }, { "epoch": 2.395572922987306, "grad_norm": 0.3777683675289154, "learning_rate": 2.3729791977325432e-05, "loss": 0.1297, "mean_token_accuracy": 0.9665063762664795, "num_tokens": 276772770.0, "step": 98700 }, { "epoch": 2.398000048542511, "grad_norm": 0.28916266560554504, "learning_rate": 2.3547447281265635e-05, "loss": 0.1593, "mean_token_accuracy": 0.960017095208168, "num_tokens": 277050087.0, "step": 98800 }, { "epoch": 2.400427174097716, "grad_norm": 0.2858158349990845, "learning_rate": 2.3365712338100755e-05, "loss": 0.1177, "mean_token_accuracy": 0.9696782982349396, "num_tokens": 277350875.0, "step": 98900 }, { "epoch": 2.402854299652921, "grad_norm": 0.35637909173965454, "learning_rate": 2.318458859727112e-05, "loss": 0.1389, "mean_token_accuracy": 0.9645104986429215, "num_tokens": 277641318.0, "step": 99000 }, { "epoch": 2.405281425208126, "grad_norm": 0.3532889783382416, "learning_rate": 2.3004077503342257e-05, "loss": 0.1209, "mean_token_accuracy": 0.9687686038017272, "num_tokens": 277914597.0, "step": 99100 }, { "epoch": 2.407708550763331, "grad_norm": 0.4838126301765442, "learning_rate": 2.2824180495993518e-05, "loss": 0.1351, "mean_token_accuracy": 0.9651601392030716, "num_tokens": 278193197.0, "step": 99200 }, { "epoch": 2.4101356763185358, "grad_norm": 0.5864011645317078, "learning_rate": 2.264489901000665e-05, "loss": 0.1277, "mean_token_accuracy": 0.9669712960720063, "num_tokens": 278469259.0, "step": 99300 }, { "epoch": 2.412562801873741, "grad_norm": 0.23830416798591614, "learning_rate": 2.2466234475254067e-05, "loss": 0.1412, "mean_token_accuracy": 0.9635849434137345, "num_tokens": 278748270.0, "step": 99400 }, { "epoch": 2.414989927428946, "grad_norm": 0.2923569977283478, "learning_rate": 2.228818831668783e-05, "loss": 0.1327, "mean_token_accuracy": 0.9659154057502747, "num_tokens": 279029338.0, "step": 99500 }, { "epoch": 2.4174170529841508, "grad_norm": 0.2777637243270874, "learning_rate": 2.211076195432803e-05, "loss": 0.117, "mean_token_accuracy": 0.969894996881485, "num_tokens": 279314859.0, "step": 99600 }, { "epoch": 2.419844178539356, "grad_norm": 0.4714739918708801, "learning_rate": 2.1933956803251464e-05, "loss": 0.1283, "mean_token_accuracy": 0.9672115963697433, "num_tokens": 279599623.0, "step": 99700 }, { "epoch": 2.422271304094561, "grad_norm": 0.17279647290706635, "learning_rate": 2.1757774273580555e-05, "loss": 0.1173, "mean_token_accuracy": 0.9694797676801682, "num_tokens": 279878335.0, "step": 99800 }, { "epoch": 2.424698429649766, "grad_norm": 0.6131448149681091, "learning_rate": 2.1582215770471813e-05, "loss": 0.1331, "mean_token_accuracy": 0.965504584312439, "num_tokens": 280169261.0, "step": 99900 }, { "epoch": 2.4271255552049706, "grad_norm": 0.63509202003479, "learning_rate": 2.1407282694104945e-05, "loss": 0.1367, "mean_token_accuracy": 0.9649469172954559, "num_tokens": 280457393.0, "step": 100000 }, { "epoch": 2.4295526807601755, "grad_norm": 0.6302922964096069, "learning_rate": 2.1232976439671305e-05, "loss": 0.1215, "mean_token_accuracy": 0.9686316031217576, "num_tokens": 280737937.0, "step": 100100 }, { "epoch": 2.431979806315381, "grad_norm": 0.2724493443965912, "learning_rate": 2.105929839736319e-05, "loss": 0.1112, "mean_token_accuracy": 0.9709296381473541, "num_tokens": 281011603.0, "step": 100200 }, { "epoch": 2.4344069318705857, "grad_norm": 0.780457079410553, "learning_rate": 2.0886249952362423e-05, "loss": 0.1202, "mean_token_accuracy": 0.9688206619024277, "num_tokens": 281297746.0, "step": 100300 }, { "epoch": 2.4368340574257905, "grad_norm": 0.23311544954776764, "learning_rate": 2.0713832484829454e-05, "loss": 0.1356, "mean_token_accuracy": 0.9651333147287369, "num_tokens": 281583883.0, "step": 100400 }, { "epoch": 2.439261182980996, "grad_norm": 0.6378100514411926, "learning_rate": 2.054204736989237e-05, "loss": 0.141, "mean_token_accuracy": 0.9639557653665543, "num_tokens": 281861952.0, "step": 100500 }, { "epoch": 2.4416883085362007, "grad_norm": 0.5750208497047424, "learning_rate": 2.0370895977635827e-05, "loss": 0.1295, "mean_token_accuracy": 0.9666093420982361, "num_tokens": 282135836.0, "step": 100600 }, { "epoch": 2.4441154340914055, "grad_norm": 0.7432602643966675, "learning_rate": 2.020037967309022e-05, "loss": 0.14, "mean_token_accuracy": 0.9642543375492096, "num_tokens": 282418573.0, "step": 100700 }, { "epoch": 2.4465425596466104, "grad_norm": 0.7422667741775513, "learning_rate": 2.0030499816220692e-05, "loss": 0.1231, "mean_token_accuracy": 0.9678942495584488, "num_tokens": 282702369.0, "step": 100800 }, { "epoch": 2.4489696852018157, "grad_norm": 0.32463338971138, "learning_rate": 1.9861257761916495e-05, "loss": 0.1387, "mean_token_accuracy": 0.9645810687541961, "num_tokens": 282976614.0, "step": 100900 }, { "epoch": 2.4513968107570205, "grad_norm": 0.3175111413002014, "learning_rate": 1.969265485997982e-05, "loss": 0.1303, "mean_token_accuracy": 0.9666304177045822, "num_tokens": 283259813.0, "step": 101000 }, { "epoch": 2.4538239363122254, "grad_norm": 0.3536398708820343, "learning_rate": 1.9524692455115456e-05, "loss": 0.1157, "mean_token_accuracy": 0.969986400604248, "num_tokens": 283545148.0, "step": 101100 }, { "epoch": 2.4562510618674303, "grad_norm": 0.39317232370376587, "learning_rate": 1.9357371886919762e-05, "loss": 0.1227, "mean_token_accuracy": 0.9676568216085434, "num_tokens": 283827994.0, "step": 101200 }, { "epoch": 2.4586781874226356, "grad_norm": 0.7098958492279053, "learning_rate": 1.9190694489870066e-05, "loss": 0.1501, "mean_token_accuracy": 0.9624185985326768, "num_tokens": 284123970.0, "step": 101300 }, { "epoch": 2.4611053129778404, "grad_norm": 0.1925334334373474, "learning_rate": 1.902466159331413e-05, "loss": 0.1325, "mean_token_accuracy": 0.9658237969875336, "num_tokens": 284404970.0, "step": 101400 }, { "epoch": 2.4635324385330453, "grad_norm": 0.7176932692527771, "learning_rate": 1.885927452145937e-05, "loss": 0.1465, "mean_token_accuracy": 0.9626785635948181, "num_tokens": 284692921.0, "step": 101500 }, { "epoch": 2.46595956408825, "grad_norm": 0.4300325810909271, "learning_rate": 1.86945345933624e-05, "loss": 0.1295, "mean_token_accuracy": 0.9664794594049454, "num_tokens": 284966467.0, "step": 101600 }, { "epoch": 2.4683866896434554, "grad_norm": 0.49719369411468506, "learning_rate": 1.8530443122918483e-05, "loss": 0.1263, "mean_token_accuracy": 0.9671731227636338, "num_tokens": 285260131.0, "step": 101700 }, { "epoch": 2.4708138151986603, "grad_norm": 0.6283943057060242, "learning_rate": 1.8367001418851126e-05, "loss": 0.1403, "mean_token_accuracy": 0.9638696962594986, "num_tokens": 285550527.0, "step": 101800 }, { "epoch": 2.473240940753865, "grad_norm": 0.5366162061691284, "learning_rate": 1.8204210784701492e-05, "loss": 0.1239, "mean_token_accuracy": 0.9679791039228439, "num_tokens": 285826548.0, "step": 101900 }, { "epoch": 2.47566806630907, "grad_norm": 0.4290122091770172, "learning_rate": 1.804207251881811e-05, "loss": 0.1357, "mean_token_accuracy": 0.9650535964965821, "num_tokens": 286112898.0, "step": 102000 }, { "epoch": 2.4780951918642753, "grad_norm": 0.6725316643714905, "learning_rate": 1.788058791434658e-05, "loss": 0.1124, "mean_token_accuracy": 0.9707834237813949, "num_tokens": 286388657.0, "step": 102100 }, { "epoch": 2.48052231741948, "grad_norm": 0.40707212686538696, "learning_rate": 1.7719758259219012e-05, "loss": 0.1442, "mean_token_accuracy": 0.9627831757068634, "num_tokens": 286679858.0, "step": 102200 }, { "epoch": 2.482949442974685, "grad_norm": 0.48535874485969543, "learning_rate": 1.7559584836144095e-05, "loss": 0.1299, "mean_token_accuracy": 0.9669503039121627, "num_tokens": 286953076.0, "step": 102300 }, { "epoch": 2.48537656852989, "grad_norm": 0.20833340287208557, "learning_rate": 1.7400068922596603e-05, "loss": 0.1425, "mean_token_accuracy": 0.9637488210201264, "num_tokens": 287233777.0, "step": 102400 }, { "epoch": 2.487803694085095, "grad_norm": 0.24462640285491943, "learning_rate": 1.7241211790807256e-05, "loss": 0.1299, "mean_token_accuracy": 0.9663457351922989, "num_tokens": 287520202.0, "step": 102500 }, { "epoch": 2.4902308196403, "grad_norm": 0.3491087853908539, "learning_rate": 1.7083014707752732e-05, "loss": 0.1385, "mean_token_accuracy": 0.9639142185449601, "num_tokens": 287791973.0, "step": 102600 }, { "epoch": 2.492657945195505, "grad_norm": 0.5299773216247559, "learning_rate": 1.6925478935145335e-05, "loss": 0.1356, "mean_token_accuracy": 0.9650074106454849, "num_tokens": 288075820.0, "step": 102700 }, { "epoch": 2.4950850707507097, "grad_norm": 0.5309982299804688, "learning_rate": 1.676860572942309e-05, "loss": 0.1328, "mean_token_accuracy": 0.9659540796279907, "num_tokens": 288366268.0, "step": 102800 }, { "epoch": 2.497512196305915, "grad_norm": 0.9474865794181824, "learning_rate": 1.6612396341739633e-05, "loss": 0.1385, "mean_token_accuracy": 0.9651324534416199, "num_tokens": 288652494.0, "step": 102900 }, { "epoch": 2.49993932186112, "grad_norm": 0.37610140442848206, "learning_rate": 1.645685201795435e-05, "loss": 0.1187, "mean_token_accuracy": 0.9689668655395508, "num_tokens": 288930537.0, "step": 103000 }, { "epoch": 2.5023664474163247, "grad_norm": 0.7315070033073425, "learning_rate": 1.6301973998622267e-05, "loss": 0.1377, "mean_token_accuracy": 0.9647850877046585, "num_tokens": 289200621.0, "step": 103100 }, { "epoch": 2.50479357297153, "grad_norm": 0.4755480885505676, "learning_rate": 1.6147763518984283e-05, "loss": 0.125, "mean_token_accuracy": 0.9684723728895187, "num_tokens": 289485794.0, "step": 103200 }, { "epoch": 2.507220698526735, "grad_norm": 0.20060652494430542, "learning_rate": 1.5994221808957354e-05, "loss": 0.1241, "mean_token_accuracy": 0.9680253058671952, "num_tokens": 289768904.0, "step": 103300 }, { "epoch": 2.5096478240819398, "grad_norm": 0.3017840087413788, "learning_rate": 1.5841350093124473e-05, "loss": 0.1304, "mean_token_accuracy": 0.9663189828395844, "num_tokens": 290053063.0, "step": 103400 }, { "epoch": 2.5120749496371446, "grad_norm": 0.2239573448896408, "learning_rate": 1.5689149590725205e-05, "loss": 0.1494, "mean_token_accuracy": 0.9615071332454681, "num_tokens": 290332650.0, "step": 103500 }, { "epoch": 2.5145020751923495, "grad_norm": 0.2974654734134674, "learning_rate": 1.5537621515645685e-05, "loss": 0.1132, "mean_token_accuracy": 0.9711314821243286, "num_tokens": 290627061.0, "step": 103600 }, { "epoch": 2.5169292007475548, "grad_norm": 0.24081680178642273, "learning_rate": 1.5386767076409093e-05, "loss": 0.1138, "mean_token_accuracy": 0.9703037095069885, "num_tokens": 290906135.0, "step": 103700 }, { "epoch": 2.5193563263027596, "grad_norm": 0.4012824296951294, "learning_rate": 1.5236587476166009e-05, "loss": 0.1438, "mean_token_accuracy": 0.9640105599164963, "num_tokens": 291205760.0, "step": 103800 }, { "epoch": 2.5217834518579645, "grad_norm": 0.1920708417892456, "learning_rate": 1.508708391268473e-05, "loss": 0.117, "mean_token_accuracy": 0.9692934077978134, "num_tokens": 291501172.0, "step": 103900 }, { "epoch": 2.52421057741317, "grad_norm": 0.7257289290428162, "learning_rate": 1.4938257578341851e-05, "loss": 0.1271, "mean_token_accuracy": 0.9669335162639618, "num_tokens": 291762700.0, "step": 104000 }, { "epoch": 2.5266377029683746, "grad_norm": 0.4053400456905365, "learning_rate": 1.479010966011255e-05, "loss": 0.118, "mean_token_accuracy": 0.9697389549016953, "num_tokens": 292029791.0, "step": 104100 }, { "epoch": 2.5290648285235795, "grad_norm": 0.7388555407524109, "learning_rate": 1.4642641339561392e-05, "loss": 0.1472, "mean_token_accuracy": 0.9629099863767624, "num_tokens": 292297139.0, "step": 104200 }, { "epoch": 2.5314919540787844, "grad_norm": 0.44577348232269287, "learning_rate": 1.4495853792832658e-05, "loss": 0.1287, "mean_token_accuracy": 0.9671968942880631, "num_tokens": 292570449.0, "step": 104300 }, { "epoch": 2.533919079633989, "grad_norm": 0.41142553091049194, "learning_rate": 1.4349748190641088e-05, "loss": 0.1371, "mean_token_accuracy": 0.9647838968038559, "num_tokens": 292846299.0, "step": 104400 }, { "epoch": 2.5363462051891945, "grad_norm": 0.519055187702179, "learning_rate": 1.42043256982626e-05, "loss": 0.1298, "mean_token_accuracy": 0.966575453877449, "num_tokens": 293128911.0, "step": 104500 }, { "epoch": 2.5387733307443994, "grad_norm": 0.8179194331169128, "learning_rate": 1.4059587475524804e-05, "loss": 0.1254, "mean_token_accuracy": 0.9670647913217545, "num_tokens": 293391008.0, "step": 104600 }, { "epoch": 2.5412004562996042, "grad_norm": 0.4637313187122345, "learning_rate": 1.3915534676797948e-05, "loss": 0.1212, "mean_token_accuracy": 0.9688729846477508, "num_tokens": 293667868.0, "step": 104700 }, { "epoch": 2.5436275818548095, "grad_norm": 0.45500364899635315, "learning_rate": 1.3772168450985578e-05, "loss": 0.1301, "mean_token_accuracy": 0.9666210097074509, "num_tokens": 293944089.0, "step": 104800 }, { "epoch": 2.5460547074100144, "grad_norm": 0.5273195505142212, "learning_rate": 1.3629489941515527e-05, "loss": 0.1299, "mean_token_accuracy": 0.9664002645015717, "num_tokens": 294209744.0, "step": 104900 }, { "epoch": 2.5484818329652192, "grad_norm": 0.6371331214904785, "learning_rate": 1.3487500286330524e-05, "loss": 0.1141, "mean_token_accuracy": 0.9706168347597122, "num_tokens": 294481283.0, "step": 105000 }, { "epoch": 2.550908958520424, "grad_norm": 0.6027917265892029, "learning_rate": 1.3346200617879457e-05, "loss": 0.1051, "mean_token_accuracy": 0.9724947756528854, "num_tokens": 294760945.0, "step": 105100 }, { "epoch": 2.5533360840756294, "grad_norm": 0.7076954245567322, "learning_rate": 1.3205592063108108e-05, "loss": 0.1231, "mean_token_accuracy": 0.9681785267591476, "num_tokens": 295040707.0, "step": 105200 }, { "epoch": 2.5557632096308343, "grad_norm": 0.3423120081424713, "learning_rate": 1.306567574345019e-05, "loss": 0.1149, "mean_token_accuracy": 0.9702116006612778, "num_tokens": 295328440.0, "step": 105300 }, { "epoch": 2.558190335186039, "grad_norm": 0.3347417116165161, "learning_rate": 1.2926452774818531e-05, "loss": 0.1201, "mean_token_accuracy": 0.9698320901393891, "num_tokens": 295616342.0, "step": 105400 }, { "epoch": 2.560617460741244, "grad_norm": 0.25471988320350647, "learning_rate": 1.278792426759604e-05, "loss": 0.115, "mean_token_accuracy": 0.970246964097023, "num_tokens": 295885464.0, "step": 105500 }, { "epoch": 2.5630445862964493, "grad_norm": 0.18230029940605164, "learning_rate": 1.2650091326626878e-05, "loss": 0.1155, "mean_token_accuracy": 0.9699338835477829, "num_tokens": 296165406.0, "step": 105600 }, { "epoch": 2.565471711851654, "grad_norm": 0.6567848324775696, "learning_rate": 1.2512955051207698e-05, "loss": 0.1237, "mean_token_accuracy": 0.9680297696590423, "num_tokens": 296436162.0, "step": 105700 }, { "epoch": 2.567898837406859, "grad_norm": 0.8726966977119446, "learning_rate": 1.2376516535078852e-05, "loss": 0.1186, "mean_token_accuracy": 0.9697349029779434, "num_tokens": 296723223.0, "step": 105800 }, { "epoch": 2.5703259629620643, "grad_norm": 0.6806580424308777, "learning_rate": 1.2240776866415649e-05, "loss": 0.133, "mean_token_accuracy": 0.9650469082593918, "num_tokens": 297006971.0, "step": 105900 }, { "epoch": 2.572753088517269, "grad_norm": 0.3688866198062897, "learning_rate": 1.210573712781966e-05, "loss": 0.1292, "mean_token_accuracy": 0.9668610745668411, "num_tokens": 297298957.0, "step": 106000 }, { "epoch": 2.575180214072474, "grad_norm": 0.31866592168807983, "learning_rate": 1.1971398396310219e-05, "loss": 0.1368, "mean_token_accuracy": 0.9650294834375381, "num_tokens": 297575766.0, "step": 106100 }, { "epoch": 2.577607339627679, "grad_norm": 0.29714086651802063, "learning_rate": 1.1837761743315534e-05, "loss": 0.119, "mean_token_accuracy": 0.9692506629228592, "num_tokens": 297853168.0, "step": 106200 }, { "epoch": 2.5800344651828837, "grad_norm": 0.3458079397678375, "learning_rate": 1.1704828234664544e-05, "loss": 0.1211, "mean_token_accuracy": 0.9684927022457123, "num_tokens": 298124718.0, "step": 106300 }, { "epoch": 2.582461590738089, "grad_norm": 0.47059133648872375, "learning_rate": 1.1572598930578083e-05, "loss": 0.1364, "mean_token_accuracy": 0.9651955342292786, "num_tokens": 298399917.0, "step": 106400 }, { "epoch": 2.584888716293294, "grad_norm": 0.37811002135276794, "learning_rate": 1.1441074885660564e-05, "loss": 0.1244, "mean_token_accuracy": 0.967754932641983, "num_tokens": 298679331.0, "step": 106500 }, { "epoch": 2.5873158418484987, "grad_norm": 0.3915969133377075, "learning_rate": 1.1310257148891622e-05, "loss": 0.1263, "mean_token_accuracy": 0.9675187277793884, "num_tokens": 298958742.0, "step": 106600 }, { "epoch": 2.589742967403704, "grad_norm": 0.2337946891784668, "learning_rate": 1.1180146763617604e-05, "loss": 0.1239, "mean_token_accuracy": 0.9683219474554062, "num_tokens": 299244025.0, "step": 106700 }, { "epoch": 2.592170092958909, "grad_norm": 0.23497796058654785, "learning_rate": 1.1050744767543341e-05, "loss": 0.1313, "mean_token_accuracy": 0.9658564746379852, "num_tokens": 299522954.0, "step": 106800 }, { "epoch": 2.5945972185141137, "grad_norm": 0.7963089346885681, "learning_rate": 1.0922052192723864e-05, "loss": 0.1162, "mean_token_accuracy": 0.969860834479332, "num_tokens": 299806975.0, "step": 106900 }, { "epoch": 2.5970243440693186, "grad_norm": 0.24625255167484283, "learning_rate": 1.0794070065556195e-05, "loss": 0.1468, "mean_token_accuracy": 0.9626657491922379, "num_tokens": 300083725.0, "step": 107000 }, { "epoch": 2.5994514696245234, "grad_norm": 0.3800683319568634, "learning_rate": 1.0666799406771078e-05, "loss": 0.1108, "mean_token_accuracy": 0.9710573416948318, "num_tokens": 300344158.0, "step": 107100 }, { "epoch": 2.6018785951797287, "grad_norm": 0.6734281182289124, "learning_rate": 1.0540241231424885e-05, "loss": 0.1325, "mean_token_accuracy": 0.9659317535161972, "num_tokens": 300623724.0, "step": 107200 }, { "epoch": 2.6043057207349336, "grad_norm": 0.33141759037971497, "learning_rate": 1.0414396548891625e-05, "loss": 0.1079, "mean_token_accuracy": 0.9718086642026901, "num_tokens": 300902698.0, "step": 107300 }, { "epoch": 2.6067328462901385, "grad_norm": 0.21568359434604645, "learning_rate": 1.028926636285461e-05, "loss": 0.1249, "mean_token_accuracy": 0.9674285870790481, "num_tokens": 301174551.0, "step": 107400 }, { "epoch": 2.6091599718453438, "grad_norm": 0.19938012957572937, "learning_rate": 1.0164851671298824e-05, "loss": 0.1146, "mean_token_accuracy": 0.9703828579187393, "num_tokens": 301458177.0, "step": 107500 }, { "epoch": 2.6115870974005486, "grad_norm": 0.6210383176803589, "learning_rate": 1.0041153466502662e-05, "loss": 0.1093, "mean_token_accuracy": 0.9717652726173401, "num_tokens": 301740628.0, "step": 107600 }, { "epoch": 2.6140142229557535, "grad_norm": 0.581802487373352, "learning_rate": 9.918172735030129e-06, "loss": 0.1294, "mean_token_accuracy": 0.9672193360328675, "num_tokens": 302027384.0, "step": 107700 }, { "epoch": 2.6164413485109583, "grad_norm": 0.27364861965179443, "learning_rate": 9.795910457723035e-06, "loss": 0.1168, "mean_token_accuracy": 0.9696778047084809, "num_tokens": 302321015.0, "step": 107800 }, { "epoch": 2.618868474066163, "grad_norm": 0.6201646327972412, "learning_rate": 9.674367609693024e-06, "loss": 0.1381, "mean_token_accuracy": 0.9657150173187256, "num_tokens": 302606608.0, "step": 107900 }, { "epoch": 2.6212955996213685, "grad_norm": 0.25466471910476685, "learning_rate": 9.553545160313981e-06, "loss": 0.1083, "mean_token_accuracy": 0.9716448175907135, "num_tokens": 302876241.0, "step": 108000 }, { "epoch": 2.6237227251765733, "grad_norm": 0.3042312264442444, "learning_rate": 9.433444073214071e-06, "loss": 0.1373, "mean_token_accuracy": 0.9643556636571884, "num_tokens": 303166869.0, "step": 108100 }, { "epoch": 2.626149850731778, "grad_norm": 0.44712647795677185, "learning_rate": 9.314065306268293e-06, "loss": 0.113, "mean_token_accuracy": 0.9708542251586914, "num_tokens": 303454686.0, "step": 108200 }, { "epoch": 2.6285769762869835, "grad_norm": 0.21764476597309113, "learning_rate": 9.195409811590671e-06, "loss": 0.1295, "mean_token_accuracy": 0.9666325169801712, "num_tokens": 303747965.0, "step": 108300 }, { "epoch": 2.6310041018421884, "grad_norm": 0.2885514795780182, "learning_rate": 9.07747853552674e-06, "loss": 0.1141, "mean_token_accuracy": 0.9705150628089905, "num_tokens": 304024604.0, "step": 108400 }, { "epoch": 2.633431227397393, "grad_norm": 0.2512131631374359, "learning_rate": 8.960272418645942e-06, "loss": 0.1168, "mean_token_accuracy": 0.9695372092723846, "num_tokens": 304307717.0, "step": 108500 }, { "epoch": 2.635858352952598, "grad_norm": 0.3841225802898407, "learning_rate": 8.84379239573423e-06, "loss": 0.1289, "mean_token_accuracy": 0.9664428579807282, "num_tokens": 304581847.0, "step": 108600 }, { "epoch": 2.6382854785078034, "grad_norm": 0.7938812375068665, "learning_rate": 8.72803939578648e-06, "loss": 0.1311, "mean_token_accuracy": 0.966459589600563, "num_tokens": 304851167.0, "step": 108700 }, { "epoch": 2.6407126040630082, "grad_norm": 0.1958301067352295, "learning_rate": 8.613014341999148e-06, "loss": 0.1156, "mean_token_accuracy": 0.9697409337759018, "num_tokens": 305133812.0, "step": 108800 }, { "epoch": 2.643139729618213, "grad_norm": 0.2697140574455261, "learning_rate": 8.498718151762952e-06, "loss": 0.1104, "mean_token_accuracy": 0.971248230934143, "num_tokens": 305415322.0, "step": 108900 }, { "epoch": 2.645566855173418, "grad_norm": 0.6028584241867065, "learning_rate": 8.385151736655428e-06, "loss": 0.1246, "mean_token_accuracy": 0.9677672815322876, "num_tokens": 305701665.0, "step": 109000 }, { "epoch": 2.6479939807286232, "grad_norm": 0.8087351322174072, "learning_rate": 8.272316002433811e-06, "loss": 0.1244, "mean_token_accuracy": 0.9675776773691177, "num_tokens": 305983629.0, "step": 109100 }, { "epoch": 2.650421106283828, "grad_norm": 0.31361258029937744, "learning_rate": 8.160211849027688e-06, "loss": 0.1016, "mean_token_accuracy": 0.9731613707542419, "num_tokens": 306260513.0, "step": 109200 }, { "epoch": 2.652848231839033, "grad_norm": 0.36700600385665894, "learning_rate": 8.04884017053189e-06, "loss": 0.1185, "mean_token_accuracy": 0.9694353520870209, "num_tokens": 306530485.0, "step": 109300 }, { "epoch": 2.6552753573942383, "grad_norm": 0.455331027507782, "learning_rate": 7.938201855199357e-06, "loss": 0.1144, "mean_token_accuracy": 0.9698577016592026, "num_tokens": 306807912.0, "step": 109400 }, { "epoch": 2.657702482949443, "grad_norm": 0.28509321808815, "learning_rate": 7.828297785434014e-06, "loss": 0.1076, "mean_token_accuracy": 0.9716468757390976, "num_tokens": 307088631.0, "step": 109500 }, { "epoch": 2.660129608504648, "grad_norm": 0.5025367140769958, "learning_rate": 7.719128837783751e-06, "loss": 0.138, "mean_token_accuracy": 0.9643630236387253, "num_tokens": 307376670.0, "step": 109600 }, { "epoch": 2.662556734059853, "grad_norm": 0.3343975245952606, "learning_rate": 7.61069588293345e-06, "loss": 0.1071, "mean_token_accuracy": 0.9722916567325592, "num_tokens": 307651794.0, "step": 109700 }, { "epoch": 2.6649838596150577, "grad_norm": 0.3086334466934204, "learning_rate": 7.50299978569805e-06, "loss": 0.0937, "mean_token_accuracy": 0.9749183577299118, "num_tokens": 307921590.0, "step": 109800 }, { "epoch": 2.667410985170263, "grad_norm": 0.21876056492328644, "learning_rate": 7.3960414050155856e-06, "loss": 0.1199, "mean_token_accuracy": 0.9686343175172806, "num_tokens": 308210040.0, "step": 109900 }, { "epoch": 2.669838110725468, "grad_norm": 0.4001712203025818, "learning_rate": 7.289821593940405e-06, "loss": 0.1351, "mean_token_accuracy": 0.9652160155773163, "num_tokens": 308485810.0, "step": 110000 }, { "epoch": 2.6722652362806727, "grad_norm": 0.15460790693759918, "learning_rate": 7.184341199636368e-06, "loss": 0.114, "mean_token_accuracy": 0.9703251016139984, "num_tokens": 308758085.0, "step": 110100 }, { "epoch": 2.674692361835878, "grad_norm": 0.506861686706543, "learning_rate": 7.079601063369978e-06, "loss": 0.1153, "mean_token_accuracy": 0.969694042801857, "num_tokens": 309024784.0, "step": 110200 }, { "epoch": 2.677119487391083, "grad_norm": 0.31870442628860474, "learning_rate": 6.975602020503846e-06, "loss": 0.0984, "mean_token_accuracy": 0.9742819786071777, "num_tokens": 309312124.0, "step": 110300 }, { "epoch": 2.6795466129462877, "grad_norm": 0.4337054193019867, "learning_rate": 6.872344900489902e-06, "loss": 0.1076, "mean_token_accuracy": 0.9716127079725265, "num_tokens": 309575184.0, "step": 110400 }, { "epoch": 2.6819737385014926, "grad_norm": 0.5748428106307983, "learning_rate": 6.769830526862797e-06, "loss": 0.1231, "mean_token_accuracy": 0.9681743580102921, "num_tokens": 309867557.0, "step": 110500 }, { "epoch": 2.6844008640566974, "grad_norm": 0.34900379180908203, "learning_rate": 6.668059717233399e-06, "loss": 0.106, "mean_token_accuracy": 0.9726725119352341, "num_tokens": 310161974.0, "step": 110600 }, { "epoch": 2.6868279896119027, "grad_norm": 0.19695007801055908, "learning_rate": 6.567033283282187e-06, "loss": 0.1159, "mean_token_accuracy": 0.9697107589244842, "num_tokens": 310447635.0, "step": 110700 }, { "epoch": 2.6892551151671076, "grad_norm": 0.7395570874214172, "learning_rate": 6.466752030752831e-06, "loss": 0.124, "mean_token_accuracy": 0.9683857929706573, "num_tokens": 310731828.0, "step": 110800 }, { "epoch": 2.6916822407223124, "grad_norm": 0.26191890239715576, "learning_rate": 6.36721675944576e-06, "loss": 0.104, "mean_token_accuracy": 0.9727838003635406, "num_tokens": 310998120.0, "step": 110900 }, { "epoch": 2.6941093662775177, "grad_norm": 0.24544072151184082, "learning_rate": 6.268428263211778e-06, "loss": 0.1191, "mean_token_accuracy": 0.9691353464126586, "num_tokens": 311277764.0, "step": 111000 }, { "epoch": 2.6965364918327226, "grad_norm": 0.18376019597053528, "learning_rate": 6.170387329945704e-06, "loss": 0.1145, "mean_token_accuracy": 0.9702999091148377, "num_tokens": 311556366.0, "step": 111100 }, { "epoch": 2.6989636173879274, "grad_norm": 0.5311710238456726, "learning_rate": 6.073094741580143e-06, "loss": 0.1307, "mean_token_accuracy": 0.9665592706203461, "num_tokens": 311845090.0, "step": 111200 }, { "epoch": 2.7013907429431323, "grad_norm": 0.31867825984954834, "learning_rate": 5.976551274079234e-06, "loss": 0.1135, "mean_token_accuracy": 0.9705018663406372, "num_tokens": 312123948.0, "step": 111300 }, { "epoch": 2.703817868498337, "grad_norm": 0.24618180096149445, "learning_rate": 5.880757697432359e-06, "loss": 0.1306, "mean_token_accuracy": 0.9667133390903473, "num_tokens": 312411540.0, "step": 111400 }, { "epoch": 2.7062449940535425, "grad_norm": 0.26771923899650574, "learning_rate": 5.785714775648221e-06, "loss": 0.1302, "mean_token_accuracy": 0.966540464758873, "num_tokens": 312691963.0, "step": 111500 }, { "epoch": 2.7086721196087473, "grad_norm": 0.2688523232936859, "learning_rate": 5.69142326674853e-06, "loss": 0.1187, "mean_token_accuracy": 0.9695068788528443, "num_tokens": 312971327.0, "step": 111600 }, { "epoch": 2.711099245163952, "grad_norm": 0.7141240835189819, "learning_rate": 5.59788392276206e-06, "loss": 0.1038, "mean_token_accuracy": 0.9725120347738266, "num_tokens": 313247248.0, "step": 111700 }, { "epoch": 2.7135263707191575, "grad_norm": 0.5409464240074158, "learning_rate": 5.50509748971868e-06, "loss": 0.1188, "mean_token_accuracy": 0.9693516844511032, "num_tokens": 313516922.0, "step": 111800 }, { "epoch": 2.7159534962743623, "grad_norm": 0.3617819845676422, "learning_rate": 5.413064707643311e-06, "loss": 0.105, "mean_token_accuracy": 0.9724315345287323, "num_tokens": 313788670.0, "step": 111900 }, { "epoch": 2.718380621829567, "grad_norm": 0.4758976995944977, "learning_rate": 5.321786310550147e-06, "loss": 0.1238, "mean_token_accuracy": 0.9674117451906205, "num_tokens": 314086215.0, "step": 112000 }, { "epoch": 2.7208077473847725, "grad_norm": 0.5627454519271851, "learning_rate": 5.231263026436606e-06, "loss": 0.1153, "mean_token_accuracy": 0.9697802549600602, "num_tokens": 314365015.0, "step": 112100 }, { "epoch": 2.7232348729399773, "grad_norm": 0.3024059534072876, "learning_rate": 5.141495577277788e-06, "loss": 0.1252, "mean_token_accuracy": 0.9676291835308075, "num_tokens": 314653127.0, "step": 112200 }, { "epoch": 2.725661998495182, "grad_norm": 0.27323031425476074, "learning_rate": 5.052484679020486e-06, "loss": 0.1107, "mean_token_accuracy": 0.9716593581438064, "num_tokens": 314947107.0, "step": 112300 }, { "epoch": 2.728089124050387, "grad_norm": 0.3902416527271271, "learning_rate": 4.9642310415776046e-06, "loss": 0.1188, "mean_token_accuracy": 0.9689583003520965, "num_tokens": 315235175.0, "step": 112400 }, { "epoch": 2.730516249605592, "grad_norm": 0.18243947625160217, "learning_rate": 4.876735368822427e-06, "loss": 0.1178, "mean_token_accuracy": 0.9699918162822724, "num_tokens": 315517014.0, "step": 112500 }, { "epoch": 2.732943375160797, "grad_norm": 0.3229522109031677, "learning_rate": 4.789998358583081e-06, "loss": 0.1063, "mean_token_accuracy": 0.9719061130285263, "num_tokens": 315800900.0, "step": 112600 }, { "epoch": 2.735370500716002, "grad_norm": 0.3077257573604584, "learning_rate": 4.704020702636902e-06, "loss": 0.1078, "mean_token_accuracy": 0.9717499113082886, "num_tokens": 316069792.0, "step": 112700 }, { "epoch": 2.737797626271207, "grad_norm": 0.604117751121521, "learning_rate": 4.6188030867049035e-06, "loss": 0.1279, "mean_token_accuracy": 0.9671464514732361, "num_tokens": 316351166.0, "step": 112800 }, { "epoch": 2.7402247518264122, "grad_norm": 0.5633140206336975, "learning_rate": 4.534346190446425e-06, "loss": 0.1267, "mean_token_accuracy": 0.9674019521474838, "num_tokens": 316640163.0, "step": 112900 }, { "epoch": 2.742651877381617, "grad_norm": 0.2642538249492645, "learning_rate": 4.4506506874535505e-06, "loss": 0.1009, "mean_token_accuracy": 0.9735317534208298, "num_tokens": 316911400.0, "step": 113000 }, { "epoch": 2.745079002936822, "grad_norm": 0.5643557906150818, "learning_rate": 4.367717245245873e-06, "loss": 0.1357, "mean_token_accuracy": 0.9655422532558441, "num_tokens": 317192284.0, "step": 113100 }, { "epoch": 2.747506128492027, "grad_norm": 0.5519342422485352, "learning_rate": 4.285546525265083e-06, "loss": 0.1178, "mean_token_accuracy": 0.9697508090734481, "num_tokens": 317475335.0, "step": 113200 }, { "epoch": 2.7499332540472317, "grad_norm": 0.35835447907447815, "learning_rate": 4.204139182869715e-06, "loss": 0.1359, "mean_token_accuracy": 0.9650754129886627, "num_tokens": 317759240.0, "step": 113300 }, { "epoch": 2.752360379602437, "grad_norm": 0.41857218742370605, "learning_rate": 4.123495867329996e-06, "loss": 0.1305, "mean_token_accuracy": 0.9665146750211716, "num_tokens": 318050262.0, "step": 113400 }, { "epoch": 2.754787505157642, "grad_norm": 0.3513345420360565, "learning_rate": 4.043617221822527e-06, "loss": 0.1104, "mean_token_accuracy": 0.971141186952591, "num_tokens": 318344518.0, "step": 113500 }, { "epoch": 2.7572146307128467, "grad_norm": 0.29043108224868774, "learning_rate": 3.964503883425253e-06, "loss": 0.1037, "mean_token_accuracy": 0.9722469764947891, "num_tokens": 318624075.0, "step": 113600 }, { "epoch": 2.759641756268052, "grad_norm": 0.1855383664369583, "learning_rate": 3.886156483112346e-06, "loss": 0.0983, "mean_token_accuracy": 0.9739230316877365, "num_tokens": 318887655.0, "step": 113700 }, { "epoch": 2.762068881823257, "grad_norm": 0.4655309021472931, "learning_rate": 3.808575645749235e-06, "loss": 0.1137, "mean_token_accuracy": 0.970301919579506, "num_tokens": 319159124.0, "step": 113800 }, { "epoch": 2.7644960073784617, "grad_norm": 0.1889120638370514, "learning_rate": 3.731761990087501e-06, "loss": 0.1197, "mean_token_accuracy": 0.9691438144445419, "num_tokens": 319445815.0, "step": 113900 }, { "epoch": 2.7669231329336665, "grad_norm": 0.478685200214386, "learning_rate": 3.655716128760045e-06, "loss": 0.1108, "mean_token_accuracy": 0.9712050658464432, "num_tokens": 319719626.0, "step": 114000 }, { "epoch": 2.7693502584888714, "grad_norm": 0.37950462102890015, "learning_rate": 3.580438668276187e-06, "loss": 0.113, "mean_token_accuracy": 0.9705785620212555, "num_tokens": 319995278.0, "step": 114100 }, { "epoch": 2.7717773840440767, "grad_norm": 0.5169498920440674, "learning_rate": 3.50593020901675e-06, "loss": 0.1332, "mean_token_accuracy": 0.9659777718782425, "num_tokens": 320290364.0, "step": 114200 }, { "epoch": 2.7742045095992816, "grad_norm": 0.17640161514282227, "learning_rate": 3.4321913452294076e-06, "loss": 0.119, "mean_token_accuracy": 0.9695685094594956, "num_tokens": 320574116.0, "step": 114300 }, { "epoch": 2.7766316351544864, "grad_norm": 0.2589649260044098, "learning_rate": 3.359222665023809e-06, "loss": 0.1097, "mean_token_accuracy": 0.9713182425498963, "num_tokens": 320864956.0, "step": 114400 }, { "epoch": 2.7790587607096917, "grad_norm": 0.47062090039253235, "learning_rate": 3.287024750366974e-06, "loss": 0.1069, "mean_token_accuracy": 0.972015266418457, "num_tokens": 321149365.0, "step": 114500 }, { "epoch": 2.7814858862648966, "grad_norm": 0.40544161200523376, "learning_rate": 3.215598177078627e-06, "loss": 0.1115, "mean_token_accuracy": 0.9708707851171493, "num_tokens": 321422551.0, "step": 114600 }, { "epoch": 2.7839130118201014, "grad_norm": 0.2606247067451477, "learning_rate": 3.1449435148266036e-06, "loss": 0.0969, "mean_token_accuracy": 0.974098664522171, "num_tokens": 321709417.0, "step": 114700 }, { "epoch": 2.7863401373753063, "grad_norm": 0.4302670359611511, "learning_rate": 3.075061327122286e-06, "loss": 0.1121, "mean_token_accuracy": 0.9709526228904725, "num_tokens": 321989833.0, "step": 114800 }, { "epoch": 2.788767262930511, "grad_norm": 0.280659943819046, "learning_rate": 3.0059521713161507e-06, "loss": 0.1362, "mean_token_accuracy": 0.9656502819061279, "num_tokens": 322258838.0, "step": 114900 }, { "epoch": 2.7911943884857164, "grad_norm": 0.3072628676891327, "learning_rate": 2.9376165985933067e-06, "loss": 0.1184, "mean_token_accuracy": 0.9694582325220108, "num_tokens": 322544203.0, "step": 115000 }, { "epoch": 2.7936215140409213, "grad_norm": 0.578826904296875, "learning_rate": 2.8700551539690757e-06, "loss": 0.1013, "mean_token_accuracy": 0.9733384555578232, "num_tokens": 322836934.0, "step": 115100 }, { "epoch": 2.796048639596126, "grad_norm": 0.20911192893981934, "learning_rate": 2.8032683762846623e-06, "loss": 0.1415, "mean_token_accuracy": 0.96405100107193, "num_tokens": 323122676.0, "step": 115200 }, { "epoch": 2.7984757651513315, "grad_norm": 0.5314410924911499, "learning_rate": 2.737256798202914e-06, "loss": 0.1256, "mean_token_accuracy": 0.9679098558425904, "num_tokens": 323420602.0, "step": 115300 }, { "epoch": 2.8009028907065363, "grad_norm": 0.41332700848579407, "learning_rate": 2.6720209462039126e-06, "loss": 0.1145, "mean_token_accuracy": 0.9704767823219299, "num_tokens": 323701792.0, "step": 115400 }, { "epoch": 2.803330016261741, "grad_norm": 0.8187140822410583, "learning_rate": 2.6075613405809775e-06, "loss": 0.1099, "mean_token_accuracy": 0.9712301725149155, "num_tokens": 323980477.0, "step": 115500 }, { "epoch": 2.8057571418169465, "grad_norm": 0.4253501296043396, "learning_rate": 2.5438784954363693e-06, "loss": 0.1076, "mean_token_accuracy": 0.9717456769943237, "num_tokens": 324265276.0, "step": 115600 }, { "epoch": 2.8081842673721513, "grad_norm": 0.40823256969451904, "learning_rate": 2.4809729186772713e-06, "loss": 0.1144, "mean_token_accuracy": 0.9703187984228134, "num_tokens": 324549587.0, "step": 115700 }, { "epoch": 2.810611392927356, "grad_norm": 0.2143670618534088, "learning_rate": 2.4188451120116473e-06, "loss": 0.1154, "mean_token_accuracy": 0.9702856040000916, "num_tokens": 324826840.0, "step": 115800 }, { "epoch": 2.813038518482561, "grad_norm": 0.5098114609718323, "learning_rate": 2.357495570944368e-06, "loss": 0.1018, "mean_token_accuracy": 0.9735467559099198, "num_tokens": 325106484.0, "step": 115900 }, { "epoch": 2.815465644037766, "grad_norm": 0.6569923758506775, "learning_rate": 2.296924784773169e-06, "loss": 0.1078, "mean_token_accuracy": 0.9718588262796402, "num_tokens": 325383372.0, "step": 116000 }, { "epoch": 2.817892769592971, "grad_norm": 0.25976550579071045, "learning_rate": 2.237133236584732e-06, "loss": 0.1104, "mean_token_accuracy": 0.970865985751152, "num_tokens": 325665856.0, "step": 116100 }, { "epoch": 2.820319895148176, "grad_norm": 0.32673880457878113, "learning_rate": 2.178121403250921e-06, "loss": 0.1082, "mean_token_accuracy": 0.9717980152368546, "num_tokens": 325955490.0, "step": 116200 }, { "epoch": 2.822747020703381, "grad_norm": 0.24631185829639435, "learning_rate": 2.1198897554248866e-06, "loss": 0.1134, "mean_token_accuracy": 0.9706428390741348, "num_tokens": 326229143.0, "step": 116300 }, { "epoch": 2.825174146258586, "grad_norm": 0.5602017045021057, "learning_rate": 2.0624387575373884e-06, "loss": 0.1253, "mean_token_accuracy": 0.967502401471138, "num_tokens": 326501871.0, "step": 116400 }, { "epoch": 2.827601271813791, "grad_norm": 0.22061678767204285, "learning_rate": 2.005768867793023e-06, "loss": 0.1217, "mean_token_accuracy": 0.9690318322181701, "num_tokens": 326780314.0, "step": 116500 }, { "epoch": 2.830028397368996, "grad_norm": 0.17672549188137054, "learning_rate": 1.9498805381666373e-06, "loss": 0.1036, "mean_token_accuracy": 0.973377079963684, "num_tokens": 327053685.0, "step": 116600 }, { "epoch": 2.8324555229242008, "grad_norm": 0.28869152069091797, "learning_rate": 1.894774214399664e-06, "loss": 0.1211, "mean_token_accuracy": 0.9686681467294693, "num_tokens": 327345823.0, "step": 116700 }, { "epoch": 2.8348826484794056, "grad_norm": 0.2163640260696411, "learning_rate": 1.8404503359965685e-06, "loss": 0.1177, "mean_token_accuracy": 0.969739454984665, "num_tokens": 327628687.0, "step": 116800 }, { "epoch": 2.837309774034611, "grad_norm": 0.17181088030338287, "learning_rate": 1.7869093362214428e-06, "loss": 0.1195, "mean_token_accuracy": 0.9691154944896698, "num_tokens": 327909703.0, "step": 116900 }, { "epoch": 2.839736899589816, "grad_norm": 0.30073028802871704, "learning_rate": 1.7341516420943838e-06, "loss": 0.1421, "mean_token_accuracy": 0.9641453611850739, "num_tokens": 328189309.0, "step": 117000 }, { "epoch": 2.8421640251450206, "grad_norm": 0.20629960298538208, "learning_rate": 1.6821776743882412e-06, "loss": 0.1136, "mean_token_accuracy": 0.9706392961740494, "num_tokens": 328481607.0, "step": 117100 }, { "epoch": 2.844591150700226, "grad_norm": 0.13787657022476196, "learning_rate": 1.630987847625165e-06, "loss": 0.1177, "mean_token_accuracy": 0.9695311176776886, "num_tokens": 328760774.0, "step": 117200 }, { "epoch": 2.847018276255431, "grad_norm": 0.3100675046443939, "learning_rate": 1.58058257007333e-06, "loss": 0.1236, "mean_token_accuracy": 0.9676383012533187, "num_tokens": 329044214.0, "step": 117300 }, { "epoch": 2.8494454018106357, "grad_norm": 0.3160964846611023, "learning_rate": 1.5309622437437165e-06, "loss": 0.1031, "mean_token_accuracy": 0.9725679057836533, "num_tokens": 329316724.0, "step": 117400 }, { "epoch": 2.8518725273658405, "grad_norm": 0.8570364117622375, "learning_rate": 1.4821272643868456e-06, "loss": 0.1245, "mean_token_accuracy": 0.9679946649074554, "num_tokens": 329596205.0, "step": 117500 }, { "epoch": 2.8542996529210454, "grad_norm": 0.22737392783164978, "learning_rate": 1.4340780214896377e-06, "loss": 0.1141, "mean_token_accuracy": 0.9702414745092391, "num_tokens": 329879803.0, "step": 117600 }, { "epoch": 2.8567267784762507, "grad_norm": 0.21855413913726807, "learning_rate": 1.386814898272315e-06, "loss": 0.089, "mean_token_accuracy": 0.9766771763563156, "num_tokens": 330172869.0, "step": 117700 }, { "epoch": 2.8591539040314555, "grad_norm": 0.20194584131240845, "learning_rate": 1.3403382716853707e-06, "loss": 0.1022, "mean_token_accuracy": 0.9733722984790802, "num_tokens": 330458642.0, "step": 117800 }, { "epoch": 2.8615810295866604, "grad_norm": 0.34213054180145264, "learning_rate": 1.2946485124065045e-06, "loss": 0.1123, "mean_token_accuracy": 0.9709642374515534, "num_tokens": 330759617.0, "step": 117900 }, { "epoch": 2.8640081551418657, "grad_norm": 0.2635839879512787, "learning_rate": 1.2497459848377357e-06, "loss": 0.1088, "mean_token_accuracy": 0.9715518152713776, "num_tokens": 331047262.0, "step": 118000 }, { "epoch": 2.8664352806970705, "grad_norm": 0.2805730998516083, "learning_rate": 1.205631047102429e-06, "loss": 0.1164, "mean_token_accuracy": 0.9698609280586242, "num_tokens": 331316862.0, "step": 118100 }, { "epoch": 2.8688624062522754, "grad_norm": 0.9312669038772583, "learning_rate": 1.1623040510424733e-06, "loss": 0.1362, "mean_token_accuracy": 0.9651178181171417, "num_tokens": 331590294.0, "step": 118200 }, { "epoch": 2.8712895318074803, "grad_norm": 0.48080629110336304, "learning_rate": 1.119765342215484e-06, "loss": 0.1112, "mean_token_accuracy": 0.9709939730167388, "num_tokens": 331882823.0, "step": 118300 }, { "epoch": 2.873716657362685, "grad_norm": 0.34005963802337646, "learning_rate": 1.0780152598920291e-06, "loss": 0.1164, "mean_token_accuracy": 0.9699809849262238, "num_tokens": 332171535.0, "step": 118400 }, { "epoch": 2.8761437829178904, "grad_norm": 0.6795576810836792, "learning_rate": 1.0370541370529063e-06, "loss": 0.1318, "mean_token_accuracy": 0.9663900202512741, "num_tokens": 332445375.0, "step": 118500 }, { "epoch": 2.8785709084730953, "grad_norm": 0.33189335465431213, "learning_rate": 9.96882300386559e-07, "loss": 0.1112, "mean_token_accuracy": 0.9712472832202912, "num_tokens": 332716597.0, "step": 118600 }, { "epoch": 2.8809980340283, "grad_norm": 0.5224782824516296, "learning_rate": 9.57500070286388e-07, "loss": 0.1196, "mean_token_accuracy": 0.9688737159967422, "num_tokens": 332989752.0, "step": 118700 }, { "epoch": 2.8834251595835054, "grad_norm": 0.20184582471847534, "learning_rate": 9.189077608482088e-07, "loss": 0.1032, "mean_token_accuracy": 0.9731306463479996, "num_tokens": 333287367.0, "step": 118800 }, { "epoch": 2.8858522851387103, "grad_norm": 0.2816540002822876, "learning_rate": 8.811056798678219e-07, "loss": 0.1252, "mean_token_accuracy": 0.9680552417039872, "num_tokens": 333557454.0, "step": 118900 }, { "epoch": 2.888279410693915, "grad_norm": 0.7499918341636658, "learning_rate": 8.440941288384573e-07, "loss": 0.1041, "mean_token_accuracy": 0.9726737987995148, "num_tokens": 333841334.0, "step": 119000 }, { "epoch": 2.8907065362491204, "grad_norm": 0.5361903309822083, "learning_rate": 8.078734029484336e-07, "loss": 0.1214, "mean_token_accuracy": 0.9687109112739563, "num_tokens": 334133058.0, "step": 119100 }, { "epoch": 2.8931336618043253, "grad_norm": 0.3385041058063507, "learning_rate": 7.724437910787918e-07, "loss": 0.1268, "mean_token_accuracy": 0.9669984877109528, "num_tokens": 334413493.0, "step": 119200 }, { "epoch": 2.89556078735953, "grad_norm": 0.7476818561553955, "learning_rate": 7.378055758009761e-07, "loss": 0.1364, "mean_token_accuracy": 0.9651480865478516, "num_tokens": 334683284.0, "step": 119300 }, { "epoch": 2.897987912914735, "grad_norm": 0.19367779791355133, "learning_rate": 7.039590333745794e-07, "loss": 0.1304, "mean_token_accuracy": 0.966360149383545, "num_tokens": 334958476.0, "step": 119400 }, { "epoch": 2.90041503846994, "grad_norm": 0.2946068346500397, "learning_rate": 6.709044337451786e-07, "loss": 0.116, "mean_token_accuracy": 0.9700655668973923, "num_tokens": 335240625.0, "step": 119500 }, { "epoch": 2.902842164025145, "grad_norm": 0.2045404314994812, "learning_rate": 6.386420405421367e-07, "loss": 0.1196, "mean_token_accuracy": 0.9686876207590103, "num_tokens": 335523206.0, "step": 119600 }, { "epoch": 2.90526928958035, "grad_norm": 0.27502840757369995, "learning_rate": 6.071721110765038e-07, "loss": 0.0995, "mean_token_accuracy": 0.9739311110973358, "num_tokens": 335814183.0, "step": 119700 }, { "epoch": 2.907696415135555, "grad_norm": 0.43896734714508057, "learning_rate": 5.764948963389971e-07, "loss": 0.1198, "mean_token_accuracy": 0.9692691284418106, "num_tokens": 336083135.0, "step": 119800 }, { "epoch": 2.91012354069076, "grad_norm": 0.1466599702835083, "learning_rate": 5.46610640997991e-07, "loss": 0.1114, "mean_token_accuracy": 0.9710710817575454, "num_tokens": 336349492.0, "step": 119900 }, { "epoch": 2.912550666245965, "grad_norm": 0.5170989036560059, "learning_rate": 5.175195833975077e-07, "loss": 0.1317, "mean_token_accuracy": 0.9662616163492203, "num_tokens": 336623591.0, "step": 120000 }, { "epoch": 2.91497779180117, "grad_norm": 0.22670291364192963, "learning_rate": 4.892219555554189e-07, "loss": 0.1206, "mean_token_accuracy": 0.9689949071407318, "num_tokens": 336895993.0, "step": 120100 }, { "epoch": 2.9174049173563747, "grad_norm": 0.5331772565841675, "learning_rate": 4.617179831614915e-07, "loss": 0.116, "mean_token_accuracy": 0.9697140926122665, "num_tokens": 337166227.0, "step": 120200 }, { "epoch": 2.9198320429115796, "grad_norm": 0.4976193904876709, "learning_rate": 4.350078855756778e-07, "loss": 0.1269, "mean_token_accuracy": 0.9676454555988312, "num_tokens": 337439450.0, "step": 120300 }, { "epoch": 2.922259168466785, "grad_norm": 0.7994723320007324, "learning_rate": 4.0909187582630625e-07, "loss": 0.1117, "mean_token_accuracy": 0.9715138351917267, "num_tokens": 337722491.0, "step": 120400 }, { "epoch": 2.9246862940219898, "grad_norm": 0.3039077818393707, "learning_rate": 3.839701606083712e-07, "loss": 0.1238, "mean_token_accuracy": 0.9683326798677444, "num_tokens": 338013188.0, "step": 120500 }, { "epoch": 2.9271134195771946, "grad_norm": 0.4227394759654999, "learning_rate": 3.5964294028195676e-07, "loss": 0.1271, "mean_token_accuracy": 0.9676005560159683, "num_tokens": 338299002.0, "step": 120600 }, { "epoch": 2.9295405451324, "grad_norm": 0.4402371346950531, "learning_rate": 3.3611040887058243e-07, "loss": 0.1078, "mean_token_accuracy": 0.9718052613735199, "num_tokens": 338590741.0, "step": 120700 }, { "epoch": 2.9319676706876048, "grad_norm": 0.2943148612976074, "learning_rate": 3.1337275405964874e-07, "loss": 0.1125, "mean_token_accuracy": 0.9705305284261704, "num_tokens": 338893353.0, "step": 120800 }, { "epoch": 2.9343947962428096, "grad_norm": 0.3009592294692993, "learning_rate": 2.91430157194994e-07, "loss": 0.1046, "mean_token_accuracy": 0.9726232421398163, "num_tokens": 339174963.0, "step": 120900 }, { "epoch": 2.9368219217980145, "grad_norm": 0.16092665493488312, "learning_rate": 2.702827932813845e-07, "loss": 0.1218, "mean_token_accuracy": 0.9687193125486374, "num_tokens": 339448824.0, "step": 121000 }, { "epoch": 2.9392490473532193, "grad_norm": 0.2029600590467453, "learning_rate": 2.499308309811599e-07, "loss": 0.1142, "mean_token_accuracy": 0.9698378247022629, "num_tokens": 339724271.0, "step": 121100 }, { "epoch": 2.9416761729084246, "grad_norm": 0.25630271434783936, "learning_rate": 2.303744326128787e-07, "loss": 0.1213, "mean_token_accuracy": 0.9688639378547669, "num_tokens": 340001286.0, "step": 121200 }, { "epoch": 2.9441032984636295, "grad_norm": 0.22377553582191467, "learning_rate": 2.116137541500196e-07, "loss": 0.1017, "mean_token_accuracy": 0.9735622280836105, "num_tokens": 340284957.0, "step": 121300 }, { "epoch": 2.9465304240188344, "grad_norm": 0.570862352848053, "learning_rate": 1.9364894521972654e-07, "loss": 0.1164, "mean_token_accuracy": 0.9696594834327698, "num_tokens": 340561900.0, "step": 121400 }, { "epoch": 2.9489575495740397, "grad_norm": 0.24297547340393066, "learning_rate": 1.7648014910165434e-07, "loss": 0.1248, "mean_token_accuracy": 0.9681780362129211, "num_tokens": 340844943.0, "step": 121500 }, { "epoch": 2.9513846751292445, "grad_norm": 0.4155631363391876, "learning_rate": 1.6010750272678065e-07, "loss": 0.1016, "mean_token_accuracy": 0.9732653588056565, "num_tokens": 341118671.0, "step": 121600 }, { "epoch": 2.9538118006844494, "grad_norm": 0.2027473896741867, "learning_rate": 1.4453113667634022e-07, "loss": 0.1171, "mean_token_accuracy": 0.9696639609336853, "num_tokens": 341405359.0, "step": 121700 }, { "epoch": 2.9562389262396542, "grad_norm": 0.3320136070251465, "learning_rate": 1.2975117518077007e-07, "loss": 0.1159, "mean_token_accuracy": 0.9702119541168213, "num_tokens": 341696526.0, "step": 121800 }, { "epoch": 2.958666051794859, "grad_norm": 0.8280718922615051, "learning_rate": 1.1576773611872149e-07, "loss": 0.1074, "mean_token_accuracy": 0.9725628489255905, "num_tokens": 341991885.0, "step": 121900 }, { "epoch": 2.9610931773500644, "grad_norm": 0.3056747615337372, "learning_rate": 1.0258093101613853e-07, "loss": 0.098, "mean_token_accuracy": 0.9740756869316101, "num_tokens": 342273671.0, "step": 122000 }, { "epoch": 2.9635203029052692, "grad_norm": 0.3317147493362427, "learning_rate": 9.019086504533648e-08, "loss": 0.1041, "mean_token_accuracy": 0.9729535073041916, "num_tokens": 342565263.0, "step": 122100 }, { "epoch": 2.965947428460474, "grad_norm": 0.16931921243667603, "learning_rate": 7.859763702420253e-08, "loss": 0.1218, "mean_token_accuracy": 0.9689329898357392, "num_tokens": 342845461.0, "step": 122200 }, { "epoch": 2.9683745540156794, "grad_norm": 0.14930430054664612, "learning_rate": 6.780133941535205e-08, "loss": 0.1075, "mean_token_accuracy": 0.9722885394096374, "num_tokens": 343124544.0, "step": 122300 }, { "epoch": 2.9708016795708843, "grad_norm": 0.17570771276950836, "learning_rate": 5.780205832545127e-08, "loss": 0.1289, "mean_token_accuracy": 0.9669734662771226, "num_tokens": 343409061.0, "step": 122400 }, { "epoch": 2.973228805126089, "grad_norm": 0.27162694931030273, "learning_rate": 4.859987350449568e-08, "loss": 0.1088, "mean_token_accuracy": 0.9716758787631988, "num_tokens": 343697831.0, "step": 122500 }, { "epoch": 2.9756559306812944, "grad_norm": 0.42215201258659363, "learning_rate": 4.0194858345188324e-08, "loss": 0.1138, "mean_token_accuracy": 0.9704602026939392, "num_tokens": 343979543.0, "step": 122600 }, { "epoch": 2.9780830562364993, "grad_norm": 0.18965835869312286, "learning_rate": 3.258707988231802e-08, "loss": 0.1147, "mean_token_accuracy": 0.9699524390697479, "num_tokens": 344259353.0, "step": 122700 }, { "epoch": 2.980510181791704, "grad_norm": 0.424998939037323, "learning_rate": 2.5776598792282002e-08, "loss": 0.113, "mean_token_accuracy": 0.9705224817991257, "num_tokens": 344541167.0, "step": 122800 }, { "epoch": 2.982937307346909, "grad_norm": 0.4654737114906311, "learning_rate": 1.976346939257523e-08, "loss": 0.1203, "mean_token_accuracy": 0.9689262408018112, "num_tokens": 344820069.0, "step": 122900 }, { "epoch": 2.985364432902114, "grad_norm": 0.21616946160793304, "learning_rate": 1.4547739641335157e-08, "loss": 0.1163, "mean_token_accuracy": 0.9700032860040665, "num_tokens": 345099173.0, "step": 123000 }, { "epoch": 2.987791558457319, "grad_norm": 0.41284772753715515, "learning_rate": 1.0129451137008695e-08, "loss": 0.0992, "mean_token_accuracy": 0.9735272574424744, "num_tokens": 345382836.0, "step": 123100 }, { "epoch": 2.990218684012524, "grad_norm": 0.5250483751296997, "learning_rate": 6.508639117974724e-09, "loss": 0.1255, "mean_token_accuracy": 0.9675652432441711, "num_tokens": 345659959.0, "step": 123200 }, { "epoch": 2.992645809567729, "grad_norm": 0.27128636837005615, "learning_rate": 3.6853324622776463e-09, "loss": 0.1217, "mean_token_accuracy": 0.9685781133174897, "num_tokens": 345952286.0, "step": 123300 }, { "epoch": 2.995072935122934, "grad_norm": 0.4348767399787903, "learning_rate": 1.659553687394233e-09, "loss": 0.1134, "mean_token_accuracy": 0.9702136415243149, "num_tokens": 346215211.0, "step": 123400 }, { "epoch": 2.997500060678139, "grad_norm": 0.2931930124759674, "learning_rate": 4.313189501004011e-10, "loss": 0.1253, "mean_token_accuracy": 0.9676681870222091, "num_tokens": 346504966.0, "step": 123500 }, { "epoch": 2.999927186233344, "grad_norm": 0.3851180374622345, "learning_rate": 6.380462380661811e-13, "loss": 0.1069, "mean_token_accuracy": 0.9724932503700257, "num_tokens": 346775752.0, "step": 123600 } ], "logging_steps": 100, "max_steps": 123603, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.932579451067695e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }