| { |
| "best_global_step": 678, |
| "best_metric": 0.19718654453754425, |
| "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_cb_1757340190/checkpoint-678", |
| "epoch": 20.0, |
| "eval_steps": 113, |
| "global_step": 2260, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.04424778761061947, |
| "grad_norm": 124.13675689697266, |
| "learning_rate": 8.849557522123894e-07, |
| "loss": 9.9574, |
| "num_input_tokens_seen": 1296, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.08849557522123894, |
| "grad_norm": 99.57308959960938, |
| "learning_rate": 1.991150442477876e-06, |
| "loss": 9.1928, |
| "num_input_tokens_seen": 2944, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.13274336283185842, |
| "grad_norm": 97.25202178955078, |
| "learning_rate": 3.097345132743363e-06, |
| "loss": 8.3882, |
| "num_input_tokens_seen": 4544, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.17699115044247787, |
| "grad_norm": 74.67471313476562, |
| "learning_rate": 4.2035398230088504e-06, |
| "loss": 6.6802, |
| "num_input_tokens_seen": 6048, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.22123893805309736, |
| "grad_norm": 97.10627746582031, |
| "learning_rate": 5.3097345132743365e-06, |
| "loss": 5.33, |
| "num_input_tokens_seen": 7296, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.26548672566371684, |
| "grad_norm": 102.13815307617188, |
| "learning_rate": 6.415929203539823e-06, |
| "loss": 3.7174, |
| "num_input_tokens_seen": 8944, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.30973451327433627, |
| "grad_norm": 41.23496627807617, |
| "learning_rate": 7.52212389380531e-06, |
| "loss": 2.1391, |
| "num_input_tokens_seen": 10848, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.35398230088495575, |
| "grad_norm": 36.60578918457031, |
| "learning_rate": 8.628318584070797e-06, |
| "loss": 1.1691, |
| "num_input_tokens_seen": 12416, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.39823008849557523, |
| "grad_norm": 24.01310157775879, |
| "learning_rate": 9.734513274336284e-06, |
| "loss": 0.3791, |
| "num_input_tokens_seen": 14064, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.4424778761061947, |
| "grad_norm": 48.26759338378906, |
| "learning_rate": 1.0840707964601771e-05, |
| "loss": 1.4857, |
| "num_input_tokens_seen": 15504, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.48672566371681414, |
| "grad_norm": 106.74925231933594, |
| "learning_rate": 1.1946902654867258e-05, |
| "loss": 0.8824, |
| "num_input_tokens_seen": 16608, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.5309734513274337, |
| "grad_norm": 23.899396896362305, |
| "learning_rate": 1.3053097345132745e-05, |
| "loss": 0.7704, |
| "num_input_tokens_seen": 17904, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.5752212389380531, |
| "grad_norm": 28.427888870239258, |
| "learning_rate": 1.415929203539823e-05, |
| "loss": 0.2696, |
| "num_input_tokens_seen": 19440, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.6194690265486725, |
| "grad_norm": 40.272621154785156, |
| "learning_rate": 1.5265486725663717e-05, |
| "loss": 0.4162, |
| "num_input_tokens_seen": 21216, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.6637168141592921, |
| "grad_norm": 31.2790584564209, |
| "learning_rate": 1.6371681415929206e-05, |
| "loss": 0.2749, |
| "num_input_tokens_seen": 22400, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.7079646017699115, |
| "grad_norm": 18.20651626586914, |
| "learning_rate": 1.747787610619469e-05, |
| "loss": 0.2068, |
| "num_input_tokens_seen": 24016, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.7522123893805309, |
| "grad_norm": 34.98295593261719, |
| "learning_rate": 1.858407079646018e-05, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 25360, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.7964601769911505, |
| "grad_norm": 71.71989440917969, |
| "learning_rate": 1.9690265486725665e-05, |
| "loss": 0.5576, |
| "num_input_tokens_seen": 26528, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.8407079646017699, |
| "grad_norm": 34.36787033081055, |
| "learning_rate": 2.079646017699115e-05, |
| "loss": 0.9594, |
| "num_input_tokens_seen": 27728, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.8849557522123894, |
| "grad_norm": 19.029399871826172, |
| "learning_rate": 2.190265486725664e-05, |
| "loss": 0.3092, |
| "num_input_tokens_seen": 29024, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.9292035398230089, |
| "grad_norm": 25.1923770904541, |
| "learning_rate": 2.3008849557522124e-05, |
| "loss": 0.2401, |
| "num_input_tokens_seen": 30128, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.9734513274336283, |
| "grad_norm": 7.299718379974365, |
| "learning_rate": 2.411504424778761e-05, |
| "loss": 0.4348, |
| "num_input_tokens_seen": 31312, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6302770972251892, |
| "eval_runtime": 0.6841, |
| "eval_samples_per_second": 36.543, |
| "eval_steps_per_second": 19.002, |
| "num_input_tokens_seen": 31992, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.0176991150442478, |
| "grad_norm": 0.8405959606170654, |
| "learning_rate": 2.5221238938053098e-05, |
| "loss": 0.2195, |
| "num_input_tokens_seen": 32664, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.0619469026548674, |
| "grad_norm": 18.019176483154297, |
| "learning_rate": 2.6327433628318586e-05, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 33992, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.1061946902654867, |
| "grad_norm": 4.267525672912598, |
| "learning_rate": 2.743362831858407e-05, |
| "loss": 0.2452, |
| "num_input_tokens_seen": 35528, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.1504424778761062, |
| "grad_norm": 35.18408203125, |
| "learning_rate": 2.853982300884956e-05, |
| "loss": 0.4569, |
| "num_input_tokens_seen": 36984, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.1946902654867257, |
| "grad_norm": 11.760722160339355, |
| "learning_rate": 2.964601769911505e-05, |
| "loss": 0.434, |
| "num_input_tokens_seen": 38152, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.238938053097345, |
| "grad_norm": 11.829302787780762, |
| "learning_rate": 3.075221238938053e-05, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 39528, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.2831858407079646, |
| "grad_norm": 20.860637664794922, |
| "learning_rate": 3.185840707964602e-05, |
| "loss": 0.4061, |
| "num_input_tokens_seen": 41288, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.3274336283185841, |
| "grad_norm": 11.283042907714844, |
| "learning_rate": 3.296460176991151e-05, |
| "loss": 0.235, |
| "num_input_tokens_seen": 42744, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.3716814159292037, |
| "grad_norm": 25.772708892822266, |
| "learning_rate": 3.407079646017699e-05, |
| "loss": 0.6785, |
| "num_input_tokens_seen": 43976, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.415929203539823, |
| "grad_norm": 4.896139621734619, |
| "learning_rate": 3.517699115044248e-05, |
| "loss": 0.7029, |
| "num_input_tokens_seen": 45576, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.4601769911504425, |
| "grad_norm": 2.0146803855895996, |
| "learning_rate": 3.628318584070797e-05, |
| "loss": 0.2659, |
| "num_input_tokens_seen": 47208, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.504424778761062, |
| "grad_norm": 0.1884833574295044, |
| "learning_rate": 3.7389380530973455e-05, |
| "loss": 0.5641, |
| "num_input_tokens_seen": 48488, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.5486725663716814, |
| "grad_norm": 23.161418914794922, |
| "learning_rate": 3.849557522123894e-05, |
| "loss": 1.1083, |
| "num_input_tokens_seen": 49736, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.592920353982301, |
| "grad_norm": 11.689558982849121, |
| "learning_rate": 3.9601769911504426e-05, |
| "loss": 0.4059, |
| "num_input_tokens_seen": 50920, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.6371681415929205, |
| "grad_norm": 0.553432047367096, |
| "learning_rate": 4.0707964601769914e-05, |
| "loss": 0.3867, |
| "num_input_tokens_seen": 52296, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.6814159292035398, |
| "grad_norm": 2.754047393798828, |
| "learning_rate": 4.1814159292035396e-05, |
| "loss": 0.3848, |
| "num_input_tokens_seen": 54120, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.7256637168141593, |
| "grad_norm": 3.9011263847351074, |
| "learning_rate": 4.2920353982300885e-05, |
| "loss": 0.3852, |
| "num_input_tokens_seen": 55640, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.7699115044247788, |
| "grad_norm": 0.6502580046653748, |
| "learning_rate": 4.4026548672566373e-05, |
| "loss": 0.1387, |
| "num_input_tokens_seen": 56728, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.8141592920353982, |
| "grad_norm": 9.314152717590332, |
| "learning_rate": 4.5132743362831855e-05, |
| "loss": 0.6029, |
| "num_input_tokens_seen": 58344, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.8584070796460177, |
| "grad_norm": 10.491519927978516, |
| "learning_rate": 4.6238938053097344e-05, |
| "loss": 0.4839, |
| "num_input_tokens_seen": 59512, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.9026548672566372, |
| "grad_norm": 7.456204891204834, |
| "learning_rate": 4.734513274336283e-05, |
| "loss": 0.3309, |
| "num_input_tokens_seen": 60968, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.9469026548672566, |
| "grad_norm": 20.350399017333984, |
| "learning_rate": 4.845132743362832e-05, |
| "loss": 0.6613, |
| "num_input_tokens_seen": 62152, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.991150442477876, |
| "grad_norm": 4.571124076843262, |
| "learning_rate": 4.955752212389381e-05, |
| "loss": 0.5809, |
| "num_input_tokens_seen": 63432, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.24710243940353394, |
| "eval_runtime": 0.6846, |
| "eval_samples_per_second": 36.517, |
| "eval_steps_per_second": 18.989, |
| "num_input_tokens_seen": 63544, |
| "step": 226 |
| }, |
| { |
| "epoch": 2.0353982300884956, |
| "grad_norm": 4.01071310043335, |
| "learning_rate": 4.9999731620342936e-05, |
| "loss": 0.1231, |
| "num_input_tokens_seen": 64520, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.079646017699115, |
| "grad_norm": 2.9519360065460205, |
| "learning_rate": 4.9998091543305845e-05, |
| "loss": 0.1799, |
| "num_input_tokens_seen": 66104, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.1238938053097347, |
| "grad_norm": 1.984820008277893, |
| "learning_rate": 4.999496058673635e-05, |
| "loss": 0.1095, |
| "num_input_tokens_seen": 67464, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.168141592920354, |
| "grad_norm": 0.9182684421539307, |
| "learning_rate": 4.999033893736386e-05, |
| "loss": 0.3554, |
| "num_input_tokens_seen": 68824, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.2123893805309733, |
| "grad_norm": 0.19130097329616547, |
| "learning_rate": 4.99842268708223e-05, |
| "loss": 0.3142, |
| "num_input_tokens_seen": 70040, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.256637168141593, |
| "grad_norm": 9.770721435546875, |
| "learning_rate": 4.9976624751633725e-05, |
| "loss": 0.7187, |
| "num_input_tokens_seen": 71784, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.3008849557522124, |
| "grad_norm": 1.7178610563278198, |
| "learning_rate": 4.996753303318648e-05, |
| "loss": 0.3055, |
| "num_input_tokens_seen": 73096, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.3451327433628317, |
| "grad_norm": 5.27429723739624, |
| "learning_rate": 4.995695225770825e-05, |
| "loss": 0.4558, |
| "num_input_tokens_seen": 74328, |
| "step": 265 |
| }, |
| { |
| "epoch": 2.3893805309734515, |
| "grad_norm": 6.162055969238281, |
| "learning_rate": 4.994488305623365e-05, |
| "loss": 0.3365, |
| "num_input_tokens_seen": 75656, |
| "step": 270 |
| }, |
| { |
| "epoch": 2.433628318584071, |
| "grad_norm": 6.64207649230957, |
| "learning_rate": 4.993132614856666e-05, |
| "loss": 0.4052, |
| "num_input_tokens_seen": 77016, |
| "step": 275 |
| }, |
| { |
| "epoch": 2.47787610619469, |
| "grad_norm": 20.863346099853516, |
| "learning_rate": 4.991628234323765e-05, |
| "loss": 0.6785, |
| "num_input_tokens_seen": 78376, |
| "step": 280 |
| }, |
| { |
| "epoch": 2.52212389380531, |
| "grad_norm": 10.060355186462402, |
| "learning_rate": 4.9899752537455166e-05, |
| "loss": 0.6746, |
| "num_input_tokens_seen": 79784, |
| "step": 285 |
| }, |
| { |
| "epoch": 2.566371681415929, |
| "grad_norm": 6.53773832321167, |
| "learning_rate": 4.9881737717052436e-05, |
| "loss": 0.3809, |
| "num_input_tokens_seen": 81192, |
| "step": 290 |
| }, |
| { |
| "epoch": 2.6106194690265485, |
| "grad_norm": 6.66525411605835, |
| "learning_rate": 4.9862238956428556e-05, |
| "loss": 0.2737, |
| "num_input_tokens_seen": 82552, |
| "step": 295 |
| }, |
| { |
| "epoch": 2.6548672566371683, |
| "grad_norm": 1.6459722518920898, |
| "learning_rate": 4.984125741848441e-05, |
| "loss": 0.2936, |
| "num_input_tokens_seen": 84280, |
| "step": 300 |
| }, |
| { |
| "epoch": 2.6991150442477876, |
| "grad_norm": 6.932806015014648, |
| "learning_rate": 4.981879435455336e-05, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 85752, |
| "step": 305 |
| }, |
| { |
| "epoch": 2.7433628318584073, |
| "grad_norm": 10.215063095092773, |
| "learning_rate": 4.9794851104326554e-05, |
| "loss": 0.1801, |
| "num_input_tokens_seen": 87448, |
| "step": 310 |
| }, |
| { |
| "epoch": 2.7876106194690267, |
| "grad_norm": 2.963151454925537, |
| "learning_rate": 4.976942909577307e-05, |
| "loss": 0.2619, |
| "num_input_tokens_seen": 89080, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.831858407079646, |
| "grad_norm": 9.370272636413574, |
| "learning_rate": 4.974252984505475e-05, |
| "loss": 0.3157, |
| "num_input_tokens_seen": 90456, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.8761061946902657, |
| "grad_norm": 3.291374683380127, |
| "learning_rate": 4.971415495643574e-05, |
| "loss": 0.4967, |
| "num_input_tokens_seen": 91624, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.920353982300885, |
| "grad_norm": 1.0663483142852783, |
| "learning_rate": 4.968430612218687e-05, |
| "loss": 0.1591, |
| "num_input_tokens_seen": 92792, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.9646017699115044, |
| "grad_norm": 6.0550384521484375, |
| "learning_rate": 4.965298512248466e-05, |
| "loss": 0.2713, |
| "num_input_tokens_seen": 94280, |
| "step": 335 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.27530211210250854, |
| "eval_runtime": 0.6865, |
| "eval_samples_per_second": 36.415, |
| "eval_steps_per_second": 18.936, |
| "num_input_tokens_seen": 95320, |
| "step": 339 |
| }, |
| { |
| "epoch": 3.0088495575221237, |
| "grad_norm": 0.40051472187042236, |
| "learning_rate": 4.962019382530521e-05, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 95752, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.0530973451327434, |
| "grad_norm": 0.08082585781812668, |
| "learning_rate": 4.958593418631275e-05, |
| "loss": 0.3302, |
| "num_input_tokens_seen": 97000, |
| "step": 345 |
| }, |
| { |
| "epoch": 3.0973451327433628, |
| "grad_norm": 0.02288784831762314, |
| "learning_rate": 4.955020824874307e-05, |
| "loss": 0.0965, |
| "num_input_tokens_seen": 98248, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.1415929203539825, |
| "grad_norm": 12.415743827819824, |
| "learning_rate": 4.951301814328157e-05, |
| "loss": 0.0867, |
| "num_input_tokens_seen": 99800, |
| "step": 355 |
| }, |
| { |
| "epoch": 3.185840707964602, |
| "grad_norm": 3.7518856525421143, |
| "learning_rate": 4.947436608793624e-05, |
| "loss": 0.2178, |
| "num_input_tokens_seen": 101080, |
| "step": 360 |
| }, |
| { |
| "epoch": 3.230088495575221, |
| "grad_norm": 0.6995785236358643, |
| "learning_rate": 4.9434254387905395e-05, |
| "loss": 0.2807, |
| "num_input_tokens_seen": 102488, |
| "step": 365 |
| }, |
| { |
| "epoch": 3.274336283185841, |
| "grad_norm": 0.8097388744354248, |
| "learning_rate": 4.9392685435440154e-05, |
| "loss": 0.111, |
| "num_input_tokens_seen": 103912, |
| "step": 370 |
| }, |
| { |
| "epoch": 3.3185840707964602, |
| "grad_norm": 0.5394904613494873, |
| "learning_rate": 4.93496617097018e-05, |
| "loss": 0.0641, |
| "num_input_tokens_seen": 105400, |
| "step": 375 |
| }, |
| { |
| "epoch": 3.3628318584070795, |
| "grad_norm": 0.05450151860713959, |
| "learning_rate": 4.930518577661388e-05, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 106920, |
| "step": 380 |
| }, |
| { |
| "epoch": 3.4070796460176993, |
| "grad_norm": 7.512651443481445, |
| "learning_rate": 4.925926028870923e-05, |
| "loss": 0.5528, |
| "num_input_tokens_seen": 108280, |
| "step": 385 |
| }, |
| { |
| "epoch": 3.4513274336283186, |
| "grad_norm": 6.470648288726807, |
| "learning_rate": 4.921188798497173e-05, |
| "loss": 0.3068, |
| "num_input_tokens_seen": 110024, |
| "step": 390 |
| }, |
| { |
| "epoch": 3.495575221238938, |
| "grad_norm": 0.24640507996082306, |
| "learning_rate": 4.9163071690672973e-05, |
| "loss": 0.1243, |
| "num_input_tokens_seen": 111304, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.5398230088495577, |
| "grad_norm": 4.358043193817139, |
| "learning_rate": 4.911281431720378e-05, |
| "loss": 0.3414, |
| "num_input_tokens_seen": 112696, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.584070796460177, |
| "grad_norm": 1.6343210935592651, |
| "learning_rate": 4.9061118861900537e-05, |
| "loss": 0.1262, |
| "num_input_tokens_seen": 114248, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.6283185840707963, |
| "grad_norm": 0.2647433280944824, |
| "learning_rate": 4.900798840786645e-05, |
| "loss": 0.1359, |
| "num_input_tokens_seen": 115400, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.672566371681416, |
| "grad_norm": 4.174383640289307, |
| "learning_rate": 4.8953426123787674e-05, |
| "loss": 0.2879, |
| "num_input_tokens_seen": 116808, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.7168141592920354, |
| "grad_norm": 1.054221510887146, |
| "learning_rate": 4.889743526374432e-05, |
| "loss": 0.6858, |
| "num_input_tokens_seen": 118056, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.7610619469026547, |
| "grad_norm": 0.40787625312805176, |
| "learning_rate": 4.884001916701639e-05, |
| "loss": 0.1859, |
| "num_input_tokens_seen": 119512, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.8053097345132745, |
| "grad_norm": 29.34970474243164, |
| "learning_rate": 4.878118125788462e-05, |
| "loss": 0.1844, |
| "num_input_tokens_seen": 120872, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.849557522123894, |
| "grad_norm": 3.395024061203003, |
| "learning_rate": 4.872092504542629e-05, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 122376, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.893805309734513, |
| "grad_norm": 1.026843547821045, |
| "learning_rate": 4.865925412330586e-05, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 123544, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.938053097345133, |
| "grad_norm": 0.7162097096443176, |
| "learning_rate": 4.859617216956074e-05, |
| "loss": 0.2767, |
| "num_input_tokens_seen": 125720, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.982300884955752, |
| "grad_norm": 1.510809063911438, |
| "learning_rate": 4.8531682946381874e-05, |
| "loss": 0.5372, |
| "num_input_tokens_seen": 126952, |
| "step": 450 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.22356851398944855, |
| "eval_runtime": 0.692, |
| "eval_samples_per_second": 36.129, |
| "eval_steps_per_second": 18.787, |
| "num_input_tokens_seen": 127288, |
| "step": 452 |
| }, |
| { |
| "epoch": 4.0265486725663715, |
| "grad_norm": 2.590193033218384, |
| "learning_rate": 4.846579029988939e-05, |
| "loss": 0.2322, |
| "num_input_tokens_seen": 128488, |
| "step": 455 |
| }, |
| { |
| "epoch": 4.070796460176991, |
| "grad_norm": 3.3609442710876465, |
| "learning_rate": 4.8398498159903194e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 129880, |
| "step": 460 |
| }, |
| { |
| "epoch": 4.115044247787611, |
| "grad_norm": 3.177797317504883, |
| "learning_rate": 4.8329810539708625e-05, |
| "loss": 0.2224, |
| "num_input_tokens_seen": 131176, |
| "step": 465 |
| }, |
| { |
| "epoch": 4.15929203539823, |
| "grad_norm": 0.27489417791366577, |
| "learning_rate": 4.825973153581709e-05, |
| "loss": 0.3069, |
| "num_input_tokens_seen": 132376, |
| "step": 470 |
| }, |
| { |
| "epoch": 4.20353982300885, |
| "grad_norm": 0.0791027620434761, |
| "learning_rate": 4.818826532772174e-05, |
| "loss": 0.2894, |
| "num_input_tokens_seen": 133608, |
| "step": 475 |
| }, |
| { |
| "epoch": 4.247787610619469, |
| "grad_norm": 1.5238655805587769, |
| "learning_rate": 4.8115416177648234e-05, |
| "loss": 0.2254, |
| "num_input_tokens_seen": 135080, |
| "step": 480 |
| }, |
| { |
| "epoch": 4.292035398230088, |
| "grad_norm": 0.9858080148696899, |
| "learning_rate": 4.804118843030049e-05, |
| "loss": 0.0772, |
| "num_input_tokens_seen": 136264, |
| "step": 485 |
| }, |
| { |
| "epoch": 4.336283185840708, |
| "grad_norm": 0.17767775058746338, |
| "learning_rate": 4.796558651260165e-05, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 137480, |
| "step": 490 |
| }, |
| { |
| "epoch": 4.380530973451328, |
| "grad_norm": 4.733283042907715, |
| "learning_rate": 4.7888614933429955e-05, |
| "loss": 0.5112, |
| "num_input_tokens_seen": 138536, |
| "step": 495 |
| }, |
| { |
| "epoch": 4.424778761061947, |
| "grad_norm": 0.43331053853034973, |
| "learning_rate": 4.781027828334994e-05, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 140360, |
| "step": 500 |
| }, |
| { |
| "epoch": 4.469026548672566, |
| "grad_norm": 0.32186347246170044, |
| "learning_rate": 4.773058123433857e-05, |
| "loss": 0.1443, |
| "num_input_tokens_seen": 141784, |
| "step": 505 |
| }, |
| { |
| "epoch": 4.513274336283186, |
| "grad_norm": 3.078956127166748, |
| "learning_rate": 4.7649528539506673e-05, |
| "loss": 0.2569, |
| "num_input_tokens_seen": 143192, |
| "step": 510 |
| }, |
| { |
| "epoch": 4.557522123893805, |
| "grad_norm": 3.7797951698303223, |
| "learning_rate": 4.7567125032815394e-05, |
| "loss": 0.1434, |
| "num_input_tokens_seen": 144440, |
| "step": 515 |
| }, |
| { |
| "epoch": 4.601769911504425, |
| "grad_norm": 0.16947458684444427, |
| "learning_rate": 4.7483375628787975e-05, |
| "loss": 0.2527, |
| "num_input_tokens_seen": 145768, |
| "step": 520 |
| }, |
| { |
| "epoch": 4.646017699115045, |
| "grad_norm": 0.18396349251270294, |
| "learning_rate": 4.739828532221661e-05, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 147128, |
| "step": 525 |
| }, |
| { |
| "epoch": 4.6902654867256635, |
| "grad_norm": 0.8579729199409485, |
| "learning_rate": 4.731185918786453e-05, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 148184, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.734513274336283, |
| "grad_norm": 6.984184265136719, |
| "learning_rate": 4.722410238016343e-05, |
| "loss": 0.2366, |
| "num_input_tokens_seen": 149880, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.778761061946903, |
| "grad_norm": 0.4492368698120117, |
| "learning_rate": 4.7135020132905985e-05, |
| "loss": 0.2717, |
| "num_input_tokens_seen": 151528, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.823008849557522, |
| "grad_norm": 0.18383368849754333, |
| "learning_rate": 4.7044617758933714e-05, |
| "loss": 0.284, |
| "num_input_tokens_seen": 153080, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.867256637168142, |
| "grad_norm": 0.3099288046360016, |
| "learning_rate": 4.695290064982018e-05, |
| "loss": 0.117, |
| "num_input_tokens_seen": 154952, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.911504424778761, |
| "grad_norm": 0.1691308617591858, |
| "learning_rate": 4.6859874275549376e-05, |
| "loss": 0.1019, |
| "num_input_tokens_seen": 156136, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.95575221238938, |
| "grad_norm": 0.10711519420146942, |
| "learning_rate": 4.676554418418953e-05, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 157560, |
| "step": 560 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 12.879459381103516, |
| "learning_rate": 4.66699160015622e-05, |
| "loss": 0.1443, |
| "num_input_tokens_seen": 158768, |
| "step": 565 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.3391265869140625, |
| "eval_runtime": 0.6894, |
| "eval_samples_per_second": 36.266, |
| "eval_steps_per_second": 18.858, |
| "num_input_tokens_seen": 158768, |
| "step": 565 |
| }, |
| { |
| "epoch": 5.04424778761062, |
| "grad_norm": 0.3337021470069885, |
| "learning_rate": 4.6572995430906784e-05, |
| "loss": 0.3145, |
| "num_input_tokens_seen": 160016, |
| "step": 570 |
| }, |
| { |
| "epoch": 5.088495575221239, |
| "grad_norm": 0.10985293984413147, |
| "learning_rate": 4.6474788252540323e-05, |
| "loss": 0.2231, |
| "num_input_tokens_seen": 161360, |
| "step": 575 |
| }, |
| { |
| "epoch": 5.132743362831858, |
| "grad_norm": 0.20570193231105804, |
| "learning_rate": 4.637530032351284e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 163104, |
| "step": 580 |
| }, |
| { |
| "epoch": 5.176991150442478, |
| "grad_norm": 0.3489188551902771, |
| "learning_rate": 4.627453757725796e-05, |
| "loss": 0.1658, |
| "num_input_tokens_seen": 164368, |
| "step": 585 |
| }, |
| { |
| "epoch": 5.221238938053097, |
| "grad_norm": 0.5157064199447632, |
| "learning_rate": 4.617250602323907e-05, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 165936, |
| "step": 590 |
| }, |
| { |
| "epoch": 5.265486725663717, |
| "grad_norm": 5.129257678985596, |
| "learning_rate": 4.6069211746590926e-05, |
| "loss": 0.2893, |
| "num_input_tokens_seen": 167360, |
| "step": 595 |
| }, |
| { |
| "epoch": 5.3097345132743365, |
| "grad_norm": 10.236189842224121, |
| "learning_rate": 4.596466090775672e-05, |
| "loss": 0.1541, |
| "num_input_tokens_seen": 168944, |
| "step": 600 |
| }, |
| { |
| "epoch": 5.353982300884955, |
| "grad_norm": 0.2498709112405777, |
| "learning_rate": 4.585885974212068e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 170672, |
| "step": 605 |
| }, |
| { |
| "epoch": 5.398230088495575, |
| "grad_norm": 0.10966335982084274, |
| "learning_rate": 4.575181455963619e-05, |
| "loss": 0.0358, |
| "num_input_tokens_seen": 172016, |
| "step": 610 |
| }, |
| { |
| "epoch": 5.442477876106195, |
| "grad_norm": 1.829491138458252, |
| "learning_rate": 4.5643531744449474e-05, |
| "loss": 0.0655, |
| "num_input_tokens_seen": 173184, |
| "step": 615 |
| }, |
| { |
| "epoch": 5.486725663716814, |
| "grad_norm": 0.10314864665269852, |
| "learning_rate": 4.553401775451882e-05, |
| "loss": 0.0682, |
| "num_input_tokens_seen": 174352, |
| "step": 620 |
| }, |
| { |
| "epoch": 5.530973451327434, |
| "grad_norm": 15.532014846801758, |
| "learning_rate": 4.542327912122949e-05, |
| "loss": 0.213, |
| "num_input_tokens_seen": 175968, |
| "step": 625 |
| }, |
| { |
| "epoch": 5.575221238938053, |
| "grad_norm": 4.856271266937256, |
| "learning_rate": 4.531132244900411e-05, |
| "loss": 0.3143, |
| "num_input_tokens_seen": 177408, |
| "step": 630 |
| }, |
| { |
| "epoch": 5.619469026548672, |
| "grad_norm": 0.02352801524102688, |
| "learning_rate": 4.519815441490884e-05, |
| "loss": 0.2459, |
| "num_input_tokens_seen": 178608, |
| "step": 635 |
| }, |
| { |
| "epoch": 5.663716814159292, |
| "grad_norm": 0.03717682883143425, |
| "learning_rate": 4.508378176825516e-05, |
| "loss": 0.1598, |
| "num_input_tokens_seen": 179824, |
| "step": 640 |
| }, |
| { |
| "epoch": 5.707964601769912, |
| "grad_norm": 0.13317222893238068, |
| "learning_rate": 4.496821133019728e-05, |
| "loss": 0.3235, |
| "num_input_tokens_seen": 181088, |
| "step": 645 |
| }, |
| { |
| "epoch": 5.752212389380531, |
| "grad_norm": 15.621245384216309, |
| "learning_rate": 4.485144999332541e-05, |
| "loss": 0.2219, |
| "num_input_tokens_seen": 182816, |
| "step": 650 |
| }, |
| { |
| "epoch": 5.79646017699115, |
| "grad_norm": 7.962857246398926, |
| "learning_rate": 4.4733504721254625e-05, |
| "loss": 0.2013, |
| "num_input_tokens_seen": 184224, |
| "step": 655 |
| }, |
| { |
| "epoch": 5.84070796460177, |
| "grad_norm": 0.0713452622294426, |
| "learning_rate": 4.461438254820959e-05, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 185776, |
| "step": 660 |
| }, |
| { |
| "epoch": 5.88495575221239, |
| "grad_norm": 0.3040229082107544, |
| "learning_rate": 4.449409057860504e-05, |
| "loss": 0.1937, |
| "num_input_tokens_seen": 187184, |
| "step": 665 |
| }, |
| { |
| "epoch": 5.929203539823009, |
| "grad_norm": 0.3687923848628998, |
| "learning_rate": 4.4372635986622044e-05, |
| "loss": 0.3286, |
| "num_input_tokens_seen": 188528, |
| "step": 670 |
| }, |
| { |
| "epoch": 5.9734513274336285, |
| "grad_norm": 4.886104583740234, |
| "learning_rate": 4.425002601578017e-05, |
| "loss": 0.161, |
| "num_input_tokens_seen": 189680, |
| "step": 675 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.19718654453754425, |
| "eval_runtime": 0.6843, |
| "eval_samples_per_second": 36.536, |
| "eval_steps_per_second": 18.999, |
| "num_input_tokens_seen": 190304, |
| "step": 678 |
| }, |
| { |
| "epoch": 6.017699115044247, |
| "grad_norm": 4.4775872230529785, |
| "learning_rate": 4.4126267978505486e-05, |
| "loss": 0.0549, |
| "num_input_tokens_seen": 191008, |
| "step": 680 |
| }, |
| { |
| "epoch": 6.061946902654867, |
| "grad_norm": 4.587867736816406, |
| "learning_rate": 4.4001369255694416e-05, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 192512, |
| "step": 685 |
| }, |
| { |
| "epoch": 6.106194690265487, |
| "grad_norm": 4.515646457672119, |
| "learning_rate": 4.387533729627359e-05, |
| "loss": 0.0554, |
| "num_input_tokens_seen": 193904, |
| "step": 690 |
| }, |
| { |
| "epoch": 6.150442477876107, |
| "grad_norm": 0.49409228563308716, |
| "learning_rate": 4.374817961675553e-05, |
| "loss": 0.692, |
| "num_input_tokens_seen": 195616, |
| "step": 695 |
| }, |
| { |
| "epoch": 6.1946902654867255, |
| "grad_norm": 0.2975232005119324, |
| "learning_rate": 4.3619903800790465e-05, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 196832, |
| "step": 700 |
| }, |
| { |
| "epoch": 6.238938053097345, |
| "grad_norm": 0.4422145187854767, |
| "learning_rate": 4.3490517498713924e-05, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 198592, |
| "step": 705 |
| }, |
| { |
| "epoch": 6.283185840707965, |
| "grad_norm": 2.669018507003784, |
| "learning_rate": 4.336002842709057e-05, |
| "loss": 0.3438, |
| "num_input_tokens_seen": 199664, |
| "step": 710 |
| }, |
| { |
| "epoch": 6.327433628318584, |
| "grad_norm": 0.12243154644966125, |
| "learning_rate": 4.3228444368253925e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 201456, |
| "step": 715 |
| }, |
| { |
| "epoch": 6.371681415929204, |
| "grad_norm": 6.243483066558838, |
| "learning_rate": 4.309577316984228e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 202752, |
| "step": 720 |
| }, |
| { |
| "epoch": 6.415929203539823, |
| "grad_norm": 0.019561927765607834, |
| "learning_rate": 4.2962022744330616e-05, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 204336, |
| "step": 725 |
| }, |
| { |
| "epoch": 6.460176991150442, |
| "grad_norm": 0.36098241806030273, |
| "learning_rate": 4.282720106855876e-05, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 205488, |
| "step": 730 |
| }, |
| { |
| "epoch": 6.504424778761062, |
| "grad_norm": 0.0058883423916995525, |
| "learning_rate": 4.269131618325559e-05, |
| "loss": 0.0434, |
| "num_input_tokens_seen": 207008, |
| "step": 735 |
| }, |
| { |
| "epoch": 6.548672566371682, |
| "grad_norm": 0.006368419621139765, |
| "learning_rate": 4.255437619255955e-05, |
| "loss": 0.0553, |
| "num_input_tokens_seen": 208512, |
| "step": 740 |
| }, |
| { |
| "epoch": 6.592920353982301, |
| "grad_norm": 0.28119367361068726, |
| "learning_rate": 4.241638926353526e-05, |
| "loss": 0.334, |
| "num_input_tokens_seen": 209888, |
| "step": 745 |
| }, |
| { |
| "epoch": 6.6371681415929205, |
| "grad_norm": 0.049205418676137924, |
| "learning_rate": 4.2277363625686475e-05, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 211008, |
| "step": 750 |
| }, |
| { |
| "epoch": 6.68141592920354, |
| "grad_norm": 0.033452827483415604, |
| "learning_rate": 4.213730757046528e-05, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 212240, |
| "step": 755 |
| }, |
| { |
| "epoch": 6.725663716814159, |
| "grad_norm": 3.038914918899536, |
| "learning_rate": 4.199622945077755e-05, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 213456, |
| "step": 760 |
| }, |
| { |
| "epoch": 6.769911504424779, |
| "grad_norm": 0.26061660051345825, |
| "learning_rate": 4.185413768048483e-05, |
| "loss": 0.0512, |
| "num_input_tokens_seen": 214768, |
| "step": 765 |
| }, |
| { |
| "epoch": 6.814159292035399, |
| "grad_norm": 0.022336583584547043, |
| "learning_rate": 4.1711040733902526e-05, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 216000, |
| "step": 770 |
| }, |
| { |
| "epoch": 6.8584070796460175, |
| "grad_norm": 0.05515335872769356, |
| "learning_rate": 4.1566947145294474e-05, |
| "loss": 0.5409, |
| "num_input_tokens_seen": 217520, |
| "step": 775 |
| }, |
| { |
| "epoch": 6.902654867256637, |
| "grad_norm": 14.747519493103027, |
| "learning_rate": 4.142186550836399e-05, |
| "loss": 0.1545, |
| "num_input_tokens_seen": 219040, |
| "step": 780 |
| }, |
| { |
| "epoch": 6.946902654867257, |
| "grad_norm": 0.026229240000247955, |
| "learning_rate": 4.127580447574131e-05, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 220688, |
| "step": 785 |
| }, |
| { |
| "epoch": 6.991150442477876, |
| "grad_norm": 0.01179127674549818, |
| "learning_rate": 4.1128772758467604e-05, |
| "loss": 0.1019, |
| "num_input_tokens_seen": 222304, |
| "step": 790 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.20210684835910797, |
| "eval_runtime": 0.6906, |
| "eval_samples_per_second": 36.202, |
| "eval_steps_per_second": 18.825, |
| "num_input_tokens_seen": 222400, |
| "step": 791 |
| }, |
| { |
| "epoch": 7.035398230088496, |
| "grad_norm": 0.1927720159292221, |
| "learning_rate": 4.098077912547536e-05, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 223584, |
| "step": 795 |
| }, |
| { |
| "epoch": 7.079646017699115, |
| "grad_norm": 0.011334261856973171, |
| "learning_rate": 4.0831832403065526e-05, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 224816, |
| "step": 800 |
| }, |
| { |
| "epoch": 7.123893805309734, |
| "grad_norm": 0.003715845989063382, |
| "learning_rate": 4.068194147438101e-05, |
| "loss": 0.006, |
| "num_input_tokens_seen": 226192, |
| "step": 805 |
| }, |
| { |
| "epoch": 7.168141592920354, |
| "grad_norm": 0.004327030386775732, |
| "learning_rate": 4.0531115278876934e-05, |
| "loss": 0.0034, |
| "num_input_tokens_seen": 227440, |
| "step": 810 |
| }, |
| { |
| "epoch": 7.212389380530974, |
| "grad_norm": 0.0020909528248012066, |
| "learning_rate": 4.0379362811787504e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 228928, |
| "step": 815 |
| }, |
| { |
| "epoch": 7.256637168141593, |
| "grad_norm": 0.7747687697410583, |
| "learning_rate": 4.022669312358949e-05, |
| "loss": 0.0103, |
| "num_input_tokens_seen": 230416, |
| "step": 820 |
| }, |
| { |
| "epoch": 7.300884955752212, |
| "grad_norm": 0.040525127202272415, |
| "learning_rate": 4.007311531946252e-05, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 231536, |
| "step": 825 |
| }, |
| { |
| "epoch": 7.345132743362832, |
| "grad_norm": 0.10391277074813843, |
| "learning_rate": 3.9918638558745966e-05, |
| "loss": 0.1358, |
| "num_input_tokens_seen": 233072, |
| "step": 830 |
| }, |
| { |
| "epoch": 7.389380530973451, |
| "grad_norm": 0.0045501478016376495, |
| "learning_rate": 3.976327205439279e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 234416, |
| "step": 835 |
| }, |
| { |
| "epoch": 7.433628318584071, |
| "grad_norm": 0.7568858861923218, |
| "learning_rate": 3.9607025072419986e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 235776, |
| "step": 840 |
| }, |
| { |
| "epoch": 7.477876106194691, |
| "grad_norm": 0.028789546340703964, |
| "learning_rate": 3.9449906931356005e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 237248, |
| "step": 845 |
| }, |
| { |
| "epoch": 7.522123893805309, |
| "grad_norm": 0.003365432843565941, |
| "learning_rate": 3.929192700168501e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 238544, |
| "step": 850 |
| }, |
| { |
| "epoch": 7.566371681415929, |
| "grad_norm": 0.010612981393933296, |
| "learning_rate": 3.9133094705287984e-05, |
| "loss": 0.1846, |
| "num_input_tokens_seen": 239840, |
| "step": 855 |
| }, |
| { |
| "epoch": 7.610619469026549, |
| "grad_norm": 0.06699702143669128, |
| "learning_rate": 3.897341951488087e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 241440, |
| "step": 860 |
| }, |
| { |
| "epoch": 7.654867256637168, |
| "grad_norm": 1.6820495128631592, |
| "learning_rate": 3.8812910953449555e-05, |
| "loss": 0.013, |
| "num_input_tokens_seen": 243216, |
| "step": 865 |
| }, |
| { |
| "epoch": 7.699115044247788, |
| "grad_norm": 20.158716201782227, |
| "learning_rate": 3.865157859368196e-05, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 244400, |
| "step": 870 |
| }, |
| { |
| "epoch": 7.743362831858407, |
| "grad_norm": 0.03708009421825409, |
| "learning_rate": 3.848943205739711e-05, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 245520, |
| "step": 875 |
| }, |
| { |
| "epoch": 7.787610619469026, |
| "grad_norm": 0.009764638729393482, |
| "learning_rate": 3.832648101497134e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 246688, |
| "step": 880 |
| }, |
| { |
| "epoch": 7.831858407079646, |
| "grad_norm": 23.658842086791992, |
| "learning_rate": 3.8162735184761476e-05, |
| "loss": 0.2551, |
| "num_input_tokens_seen": 248320, |
| "step": 885 |
| }, |
| { |
| "epoch": 7.876106194690266, |
| "grad_norm": 0.1080569326877594, |
| "learning_rate": 3.799820433252529e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 249616, |
| "step": 890 |
| }, |
| { |
| "epoch": 7.920353982300885, |
| "grad_norm": 1.0283671617507935, |
| "learning_rate": 3.783289827083905e-05, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 251264, |
| "step": 895 |
| }, |
| { |
| "epoch": 7.964601769911504, |
| "grad_norm": 0.022368498146533966, |
| "learning_rate": 3.766682685851234e-05, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 252768, |
| "step": 900 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.3095754384994507, |
| "eval_runtime": 0.6903, |
| "eval_samples_per_second": 36.214, |
| "eval_steps_per_second": 18.831, |
| "num_input_tokens_seen": 253488, |
| "step": 904 |
| }, |
| { |
| "epoch": 8.008849557522124, |
| "grad_norm": 0.030048973858356476, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 254096, |
| "step": 905 |
| }, |
| { |
| "epoch": 8.053097345132743, |
| "grad_norm": 0.007619574666023254, |
| "learning_rate": 3.733242764481154e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 255408, |
| "step": 910 |
| }, |
| { |
| "epoch": 8.097345132743364, |
| "grad_norm": 0.060429807752370834, |
| "learning_rate": 3.716411978691766e-05, |
| "loss": 0.1043, |
| "num_input_tokens_seen": 257008, |
| "step": 915 |
| }, |
| { |
| "epoch": 8.141592920353983, |
| "grad_norm": 0.004338063299655914, |
| "learning_rate": 3.699508646415424e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 258176, |
| "step": 920 |
| }, |
| { |
| "epoch": 8.185840707964601, |
| "grad_norm": 0.0030190609395503998, |
| "learning_rate": 3.6825337757623696e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 259552, |
| "step": 925 |
| }, |
| { |
| "epoch": 8.230088495575222, |
| "grad_norm": 0.3338499963283539, |
| "learning_rate": 3.665488379109377e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 261104, |
| "step": 930 |
| }, |
| { |
| "epoch": 8.274336283185841, |
| "grad_norm": 0.25098684430122375, |
| "learning_rate": 3.648373473039368e-05, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 262496, |
| "step": 935 |
| }, |
| { |
| "epoch": 8.31858407079646, |
| "grad_norm": 0.013036888092756271, |
| "learning_rate": 3.631190078280791e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 263744, |
| "step": 940 |
| }, |
| { |
| "epoch": 8.36283185840708, |
| "grad_norm": 0.29324567317962646, |
| "learning_rate": 3.613939219646739e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 265024, |
| "step": 945 |
| }, |
| { |
| "epoch": 8.4070796460177, |
| "grad_norm": 0.005112594924867153, |
| "learning_rate": 3.596621925973835e-05, |
| "loss": 0.1552, |
| "num_input_tokens_seen": 266320, |
| "step": 950 |
| }, |
| { |
| "epoch": 8.451327433628318, |
| "grad_norm": 6.880082130432129, |
| "learning_rate": 3.579239230060867e-05, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 267648, |
| "step": 955 |
| }, |
| { |
| "epoch": 8.495575221238939, |
| "grad_norm": 0.0326354093849659, |
| "learning_rate": 3.5617921686071995e-05, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 269264, |
| "step": 960 |
| }, |
| { |
| "epoch": 8.539823008849558, |
| "grad_norm": 0.0023343353532254696, |
| "learning_rate": 3.544281782150936e-05, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 270816, |
| "step": 965 |
| }, |
| { |
| "epoch": 8.584070796460177, |
| "grad_norm": 4.35261869430542, |
| "learning_rate": 3.526709115006871e-05, |
| "loss": 0.0098, |
| "num_input_tokens_seen": 272304, |
| "step": 970 |
| }, |
| { |
| "epoch": 8.628318584070797, |
| "grad_norm": 0.0024698644410818815, |
| "learning_rate": 3.5090752152041975e-05, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 273824, |
| "step": 975 |
| }, |
| { |
| "epoch": 8.672566371681416, |
| "grad_norm": 23.142860412597656, |
| "learning_rate": 3.491381134424012e-05, |
| "loss": 0.055, |
| "num_input_tokens_seen": 275024, |
| "step": 980 |
| }, |
| { |
| "epoch": 8.716814159292035, |
| "grad_norm": 0.0028112020809203386, |
| "learning_rate": 3.4736279279365876e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 276256, |
| "step": 985 |
| }, |
| { |
| "epoch": 8.761061946902656, |
| "grad_norm": 0.0034610675647854805, |
| "learning_rate": 3.455816654538438e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 277536, |
| "step": 990 |
| }, |
| { |
| "epoch": 8.805309734513274, |
| "grad_norm": 0.013308167457580566, |
| "learning_rate": 3.437948376489172e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 279104, |
| "step": 995 |
| }, |
| { |
| "epoch": 8.849557522123893, |
| "grad_norm": 0.015844911336898804, |
| "learning_rate": 3.420024159448142e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 280480, |
| "step": 1000 |
| }, |
| { |
| "epoch": 8.893805309734514, |
| "grad_norm": 14.841525077819824, |
| "learning_rate": 3.402045072410886e-05, |
| "loss": 0.0381, |
| "num_input_tokens_seen": 281664, |
| "step": 1005 |
| }, |
| { |
| "epoch": 8.938053097345133, |
| "grad_norm": 0.0037512825801968575, |
| "learning_rate": 3.3840121876453734e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 283248, |
| "step": 1010 |
| }, |
| { |
| "epoch": 8.982300884955752, |
| "grad_norm": 0.0376361683011055, |
| "learning_rate": 3.365926580628057e-05, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 284400, |
| "step": 1015 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.300649493932724, |
| "eval_runtime": 0.6904, |
| "eval_samples_per_second": 36.209, |
| "eval_steps_per_second": 18.829, |
| "num_input_tokens_seen": 284920, |
| "step": 1017 |
| }, |
| { |
| "epoch": 9.026548672566372, |
| "grad_norm": 0.001532257185317576, |
| "learning_rate": 3.3477893299797304e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 285880, |
| "step": 1020 |
| }, |
| { |
| "epoch": 9.070796460176991, |
| "grad_norm": 0.1362602561712265, |
| "learning_rate": 3.3296015174011984e-05, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 287480, |
| "step": 1025 |
| }, |
| { |
| "epoch": 9.11504424778761, |
| "grad_norm": 0.0020374886225908995, |
| "learning_rate": 3.311364227608768e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 289144, |
| "step": 1030 |
| }, |
| { |
| "epoch": 9.15929203539823, |
| "grad_norm": 0.002412878442555666, |
| "learning_rate": 3.293078548269553e-05, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 290504, |
| "step": 1035 |
| }, |
| { |
| "epoch": 9.20353982300885, |
| "grad_norm": 0.006883608177304268, |
| "learning_rate": 3.2747455699366056e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 291784, |
| "step": 1040 |
| }, |
| { |
| "epoch": 9.247787610619469, |
| "grad_norm": 0.0028563826344907284, |
| "learning_rate": 3.256366385983879e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 292904, |
| "step": 1045 |
| }, |
| { |
| "epoch": 9.29203539823009, |
| "grad_norm": 0.003951858263462782, |
| "learning_rate": 3.237942092541018e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 294360, |
| "step": 1050 |
| }, |
| { |
| "epoch": 9.336283185840708, |
| "grad_norm": 0.014090816490352154, |
| "learning_rate": 3.219473788427984e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 295992, |
| "step": 1055 |
| }, |
| { |
| "epoch": 9.380530973451327, |
| "grad_norm": 0.012684347108006477, |
| "learning_rate": 3.2009625750895224e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 297272, |
| "step": 1060 |
| }, |
| { |
| "epoch": 9.424778761061948, |
| "grad_norm": 0.0035236075054854155, |
| "learning_rate": 3.182409556529476e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 298360, |
| "step": 1065 |
| }, |
| { |
| "epoch": 9.469026548672566, |
| "grad_norm": 0.0013637221418321133, |
| "learning_rate": 3.163815839244937e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 299656, |
| "step": 1070 |
| }, |
| { |
| "epoch": 9.513274336283185, |
| "grad_norm": 0.0017074494389817119, |
| "learning_rate": 3.14518253216026e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 301432, |
| "step": 1075 |
| }, |
| { |
| "epoch": 9.557522123893806, |
| "grad_norm": 0.002450718777254224, |
| "learning_rate": 3.126510746560925e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 302952, |
| "step": 1080 |
| }, |
| { |
| "epoch": 9.601769911504425, |
| "grad_norm": 0.009025582112371922, |
| "learning_rate": 3.107801596027261e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 304296, |
| "step": 1085 |
| }, |
| { |
| "epoch": 9.646017699115044, |
| "grad_norm": 0.0015989805106073618, |
| "learning_rate": 3.0890561963680306e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 305784, |
| "step": 1090 |
| }, |
| { |
| "epoch": 9.690265486725664, |
| "grad_norm": 0.0018171430565416813, |
| "learning_rate": 3.0702756655538835e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 307592, |
| "step": 1095 |
| }, |
| { |
| "epoch": 9.734513274336283, |
| "grad_norm": 0.004553630482405424, |
| "learning_rate": 3.051461123650685e-05, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 308760, |
| "step": 1100 |
| }, |
| { |
| "epoch": 9.778761061946902, |
| "grad_norm": 0.03179660439491272, |
| "learning_rate": 3.032613692752711e-05, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 310040, |
| "step": 1105 |
| }, |
| { |
| "epoch": 9.823008849557523, |
| "grad_norm": 0.0016913724830374122, |
| "learning_rate": 3.0137344969157284e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 311752, |
| "step": 1110 |
| }, |
| { |
| "epoch": 9.867256637168142, |
| "grad_norm": 0.0025879093445837498, |
| "learning_rate": 2.9948246620899557e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 313128, |
| "step": 1115 |
| }, |
| { |
| "epoch": 9.91150442477876, |
| "grad_norm": 0.017512066289782524, |
| "learning_rate": 2.9758853160529148e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 314568, |
| "step": 1120 |
| }, |
| { |
| "epoch": 9.955752212389381, |
| "grad_norm": 0.002485684584826231, |
| "learning_rate": 2.9569175883421672e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 315800, |
| "step": 1125 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.011832266114652157, |
| "learning_rate": 2.93792261018795e-05, |
| "loss": 0.0903, |
| "num_input_tokens_seen": 316840, |
| "step": 1130 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.33689600229263306, |
| "eval_runtime": 0.688, |
| "eval_samples_per_second": 36.335, |
| "eval_steps_per_second": 18.894, |
| "num_input_tokens_seen": 316840, |
| "step": 1130 |
| }, |
| { |
| "epoch": 10.044247787610619, |
| "grad_norm": 0.0016681695124134421, |
| "learning_rate": 2.9189015144457087e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 318328, |
| "step": 1135 |
| }, |
| { |
| "epoch": 10.08849557522124, |
| "grad_norm": 0.26901766657829285, |
| "learning_rate": 2.8998554355285355e-05, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 319640, |
| "step": 1140 |
| }, |
| { |
| "epoch": 10.132743362831858, |
| "grad_norm": 0.0016187011497095227, |
| "learning_rate": 2.8807855093395126e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 320888, |
| "step": 1145 |
| }, |
| { |
| "epoch": 10.176991150442477, |
| "grad_norm": 0.038544315844774246, |
| "learning_rate": 2.8616928732039684e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 322296, |
| "step": 1150 |
| }, |
| { |
| "epoch": 10.221238938053098, |
| "grad_norm": 0.0028821935411542654, |
| "learning_rate": 2.8425786658016423e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 323560, |
| "step": 1155 |
| }, |
| { |
| "epoch": 10.265486725663717, |
| "grad_norm": 0.002713311230763793, |
| "learning_rate": 2.8234440270987837e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 324936, |
| "step": 1160 |
| }, |
| { |
| "epoch": 10.309734513274336, |
| "grad_norm": 0.003134116530418396, |
| "learning_rate": 2.804290098280155e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 326248, |
| "step": 1165 |
| }, |
| { |
| "epoch": 10.353982300884956, |
| "grad_norm": 0.0010184970451518893, |
| "learning_rate": 2.7851180216809796e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 327400, |
| "step": 1170 |
| }, |
| { |
| "epoch": 10.398230088495575, |
| "grad_norm": 0.0012547356309369206, |
| "learning_rate": 2.765928940718806e-05, |
| "loss": 0.0502, |
| "num_input_tokens_seen": 329048, |
| "step": 1175 |
| }, |
| { |
| "epoch": 10.442477876106194, |
| "grad_norm": 0.0076978872530162334, |
| "learning_rate": 2.7467239998253214e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 330376, |
| "step": 1180 |
| }, |
| { |
| "epoch": 10.486725663716815, |
| "grad_norm": 0.0014734775759279728, |
| "learning_rate": 2.7275043443780934e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 331848, |
| "step": 1185 |
| }, |
| { |
| "epoch": 10.530973451327434, |
| "grad_norm": 0.0013326797634363174, |
| "learning_rate": 2.708271120632262e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 332984, |
| "step": 1190 |
| }, |
| { |
| "epoch": 10.575221238938052, |
| "grad_norm": 0.06982653588056564, |
| "learning_rate": 2.6890254756521778e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 334184, |
| "step": 1195 |
| }, |
| { |
| "epoch": 10.619469026548673, |
| "grad_norm": 0.001104174298234284, |
| "learning_rate": 2.6697685572429886e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 335400, |
| "step": 1200 |
| }, |
| { |
| "epoch": 10.663716814159292, |
| "grad_norm": 0.0052129547111690044, |
| "learning_rate": 2.65050151388219e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 336696, |
| "step": 1205 |
| }, |
| { |
| "epoch": 10.70796460176991, |
| "grad_norm": 0.068180613219738, |
| "learning_rate": 2.6312254946511217e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 338312, |
| "step": 1210 |
| }, |
| { |
| "epoch": 10.752212389380531, |
| "grad_norm": 0.0005940846749581397, |
| "learning_rate": 2.6119416491664472e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 339864, |
| "step": 1215 |
| }, |
| { |
| "epoch": 10.79646017699115, |
| "grad_norm": 0.0007898228941485286, |
| "learning_rate": 2.5926511275115827e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 341304, |
| "step": 1220 |
| }, |
| { |
| "epoch": 10.84070796460177, |
| "grad_norm": 0.0007905475795269012, |
| "learning_rate": 2.57335508016811e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 343000, |
| "step": 1225 |
| }, |
| { |
| "epoch": 10.88495575221239, |
| "grad_norm": 0.0005679920432157815, |
| "learning_rate": 2.5540546579471624e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 344376, |
| "step": 1230 |
| }, |
| { |
| "epoch": 10.929203539823009, |
| "grad_norm": 0.0008169038337655365, |
| "learning_rate": 2.5347510119207878e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 345992, |
| "step": 1235 |
| }, |
| { |
| "epoch": 10.973451327433628, |
| "grad_norm": 0.0018366064177826047, |
| "learning_rate": 2.515445293353304e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 347416, |
| "step": 1240 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.30118870735168457, |
| "eval_runtime": 0.686, |
| "eval_samples_per_second": 36.442, |
| "eval_steps_per_second": 18.95, |
| "num_input_tokens_seen": 348312, |
| "step": 1243 |
| }, |
| { |
| "epoch": 11.017699115044248, |
| "grad_norm": 0.0014641822781413794, |
| "learning_rate": 2.4961386536326307e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 348776, |
| "step": 1245 |
| }, |
| { |
| "epoch": 11.061946902654867, |
| "grad_norm": 0.0008946527959778905, |
| "learning_rate": 2.4768322442016278e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 350008, |
| "step": 1250 |
| }, |
| { |
| "epoch": 11.106194690265486, |
| "grad_norm": 0.0006893971585668623, |
| "learning_rate": 2.457527216489421e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 351480, |
| "step": 1255 |
| }, |
| { |
| "epoch": 11.150442477876107, |
| "grad_norm": 0.0011873808689415455, |
| "learning_rate": 2.438224721842728e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 352568, |
| "step": 1260 |
| }, |
| { |
| "epoch": 11.194690265486726, |
| "grad_norm": 0.0006970542017370462, |
| "learning_rate": 2.4189259114571984e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 353816, |
| "step": 1265 |
| }, |
| { |
| "epoch": 11.238938053097344, |
| "grad_norm": 0.022083770483732224, |
| "learning_rate": 2.39963193630875e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 355432, |
| "step": 1270 |
| }, |
| { |
| "epoch": 11.283185840707965, |
| "grad_norm": 0.0006550828693434596, |
| "learning_rate": 2.3803439470849335e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 357000, |
| "step": 1275 |
| }, |
| { |
| "epoch": 11.327433628318584, |
| "grad_norm": 0.0011878663208335638, |
| "learning_rate": 2.361063094116293e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 358696, |
| "step": 1280 |
| }, |
| { |
| "epoch": 11.371681415929203, |
| "grad_norm": 0.0005949685000814497, |
| "learning_rate": 2.3417905273077756e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 360440, |
| "step": 1285 |
| }, |
| { |
| "epoch": 11.415929203539823, |
| "grad_norm": 0.00303826411254704, |
| "learning_rate": 2.32252739607014e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 361992, |
| "step": 1290 |
| }, |
| { |
| "epoch": 11.460176991150442, |
| "grad_norm": 0.0020150241907685995, |
| "learning_rate": 2.3032748492514116e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 363304, |
| "step": 1295 |
| }, |
| { |
| "epoch": 11.504424778761061, |
| "grad_norm": 0.004308300092816353, |
| "learning_rate": 2.2840340350683622e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 364728, |
| "step": 1300 |
| }, |
| { |
| "epoch": 11.548672566371682, |
| "grad_norm": 0.0012285029515624046, |
| "learning_rate": 2.2648061010380346e-05, |
| "loss": 0.0469, |
| "num_input_tokens_seen": 365992, |
| "step": 1305 |
| }, |
| { |
| "epoch": 11.5929203539823, |
| "grad_norm": 0.0020916915964335203, |
| "learning_rate": 2.2455921939093e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 367528, |
| "step": 1310 |
| }, |
| { |
| "epoch": 11.63716814159292, |
| "grad_norm": 0.00835400726646185, |
| "learning_rate": 2.2263934595944716e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 369096, |
| "step": 1315 |
| }, |
| { |
| "epoch": 11.68141592920354, |
| "grad_norm": 0.004979610443115234, |
| "learning_rate": 2.207211043100958e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 370392, |
| "step": 1320 |
| }, |
| { |
| "epoch": 11.725663716814159, |
| "grad_norm": 0.0009325013379566371, |
| "learning_rate": 2.188046088462979e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 371544, |
| "step": 1325 |
| }, |
| { |
| "epoch": 11.769911504424778, |
| "grad_norm": 0.004648986738175154, |
| "learning_rate": 2.1688997386733316e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 372600, |
| "step": 1330 |
| }, |
| { |
| "epoch": 11.814159292035399, |
| "grad_norm": 0.00304238754324615, |
| "learning_rate": 2.1497731356152286e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 374072, |
| "step": 1335 |
| }, |
| { |
| "epoch": 11.858407079646017, |
| "grad_norm": 0.0009990925900638103, |
| "learning_rate": 2.1306674199941872e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 375336, |
| "step": 1340 |
| }, |
| { |
| "epoch": 11.902654867256636, |
| "grad_norm": 0.009212859906256199, |
| "learning_rate": 2.1115837312700088e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 376680, |
| "step": 1345 |
| }, |
| { |
| "epoch": 11.946902654867257, |
| "grad_norm": 0.003240500343963504, |
| "learning_rate": 2.0925232075888143e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 378280, |
| "step": 1350 |
| }, |
| { |
| "epoch": 11.991150442477876, |
| "grad_norm": 0.004589646123349667, |
| "learning_rate": 2.0734869857151666e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 379800, |
| "step": 1355 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.34491831064224243, |
| "eval_runtime": 0.6888, |
| "eval_samples_per_second": 36.295, |
| "eval_steps_per_second": 18.873, |
| "num_input_tokens_seen": 379920, |
| "step": 1356 |
| }, |
| { |
| "epoch": 12.035398230088495, |
| "grad_norm": 0.0009743705159053206, |
| "learning_rate": 2.054476200964278e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 381456, |
| "step": 1360 |
| }, |
| { |
| "epoch": 12.079646017699115, |
| "grad_norm": 0.000763334333896637, |
| "learning_rate": 2.035491987134294e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 382672, |
| "step": 1365 |
| }, |
| { |
| "epoch": 12.123893805309734, |
| "grad_norm": 0.003923848737031221, |
| "learning_rate": 2.0165354764386807e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 383840, |
| "step": 1370 |
| }, |
| { |
| "epoch": 12.168141592920353, |
| "grad_norm": 0.00451483391225338, |
| "learning_rate": 1.997607799438694e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 385376, |
| "step": 1375 |
| }, |
| { |
| "epoch": 12.212389380530974, |
| "grad_norm": 0.0026476862840354443, |
| "learning_rate": 1.978710084975959e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 386944, |
| "step": 1380 |
| }, |
| { |
| "epoch": 12.256637168141593, |
| "grad_norm": 0.0006807944155298173, |
| "learning_rate": 1.9598434601051386e-05, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 388544, |
| "step": 1385 |
| }, |
| { |
| "epoch": 12.300884955752213, |
| "grad_norm": 0.003704607719555497, |
| "learning_rate": 1.941009050026726e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 389824, |
| "step": 1390 |
| }, |
| { |
| "epoch": 12.345132743362832, |
| "grad_norm": 0.003849397413432598, |
| "learning_rate": 1.922207978019928e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 391072, |
| "step": 1395 |
| }, |
| { |
| "epoch": 12.389380530973451, |
| "grad_norm": 0.00124372320715338, |
| "learning_rate": 1.903441365375681e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 392416, |
| "step": 1400 |
| }, |
| { |
| "epoch": 12.43362831858407, |
| "grad_norm": 0.0008837588829919696, |
| "learning_rate": 1.884710331329772e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 394176, |
| "step": 1405 |
| }, |
| { |
| "epoch": 12.47787610619469, |
| "grad_norm": 0.001560322241857648, |
| "learning_rate": 1.8660159929960914e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 395536, |
| "step": 1410 |
| }, |
| { |
| "epoch": 12.52212389380531, |
| "grad_norm": 0.00227886950597167, |
| "learning_rate": 1.847359465300006e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 396896, |
| "step": 1415 |
| }, |
| { |
| "epoch": 12.56637168141593, |
| "grad_norm": 0.0012985066277906299, |
| "learning_rate": 1.828741860911867e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 398400, |
| "step": 1420 |
| }, |
| { |
| "epoch": 12.610619469026549, |
| "grad_norm": 0.000994998263195157, |
| "learning_rate": 1.8101642901806486e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 399664, |
| "step": 1425 |
| }, |
| { |
| "epoch": 12.654867256637168, |
| "grad_norm": 0.0008799034985713661, |
| "learning_rate": 1.791627861067731e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 401120, |
| "step": 1430 |
| }, |
| { |
| "epoch": 12.699115044247787, |
| "grad_norm": 0.0050034355372190475, |
| "learning_rate": 1.7731336790808146e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 402672, |
| "step": 1435 |
| }, |
| { |
| "epoch": 12.743362831858407, |
| "grad_norm": 0.0028873751871287823, |
| "learning_rate": 1.7546828472079992e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 404112, |
| "step": 1440 |
| }, |
| { |
| "epoch": 12.787610619469026, |
| "grad_norm": 0.0010144957341253757, |
| "learning_rate": 1.7362764658519877e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 405232, |
| "step": 1445 |
| }, |
| { |
| "epoch": 12.831858407079647, |
| "grad_norm": 0.0014636432752013206, |
| "learning_rate": 1.7179156327644724e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 406640, |
| "step": 1450 |
| }, |
| { |
| "epoch": 12.876106194690266, |
| "grad_norm": 0.0007868976681493223, |
| "learning_rate": 1.699601442980655e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 407968, |
| "step": 1455 |
| }, |
| { |
| "epoch": 12.920353982300885, |
| "grad_norm": 0.0021187462843954563, |
| "learning_rate": 1.6813349887539443e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 409504, |
| "step": 1460 |
| }, |
| { |
| "epoch": 12.964601769911503, |
| "grad_norm": 0.0018395393854007125, |
| "learning_rate": 1.663117359490814e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 410784, |
| "step": 1465 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.3428613841533661, |
| "eval_runtime": 0.6875, |
| "eval_samples_per_second": 36.361, |
| "eval_steps_per_second": 18.908, |
| "num_input_tokens_seen": 411568, |
| "step": 1469 |
| }, |
| { |
| "epoch": 13.008849557522124, |
| "grad_norm": 0.022588098421692848, |
| "learning_rate": 1.6449496416858284e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 411840, |
| "step": 1470 |
| }, |
| { |
| "epoch": 13.053097345132743, |
| "grad_norm": 0.004319458268582821, |
| "learning_rate": 1.6268329188568468e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 413568, |
| "step": 1475 |
| }, |
| { |
| "epoch": 13.097345132743364, |
| "grad_norm": 0.001528352964669466, |
| "learning_rate": 1.6087682714804002e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 415168, |
| "step": 1480 |
| }, |
| { |
| "epoch": 13.141592920353983, |
| "grad_norm": 0.0008898127125576138, |
| "learning_rate": 1.5907567769272568e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 416384, |
| "step": 1485 |
| }, |
| { |
| "epoch": 13.185840707964601, |
| "grad_norm": 0.0009187717805616558, |
| "learning_rate": 1.5727995093981598e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 417584, |
| "step": 1490 |
| }, |
| { |
| "epoch": 13.230088495575222, |
| "grad_norm": 0.006485578138381243, |
| "learning_rate": 1.5548975398597718e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 418800, |
| "step": 1495 |
| }, |
| { |
| "epoch": 13.274336283185841, |
| "grad_norm": 0.004421485122293234, |
| "learning_rate": 1.537051935980794e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 420336, |
| "step": 1500 |
| }, |
| { |
| "epoch": 13.31858407079646, |
| "grad_norm": 0.001087520970031619, |
| "learning_rate": 1.5192637620682981e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 421744, |
| "step": 1505 |
| }, |
| { |
| "epoch": 13.36283185840708, |
| "grad_norm": 0.00048730187700130045, |
| "learning_rate": 1.5015340790042446e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 423088, |
| "step": 1510 |
| }, |
| { |
| "epoch": 13.4070796460177, |
| "grad_norm": 0.000995234469883144, |
| "learning_rate": 1.4838639441822183e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 424208, |
| "step": 1515 |
| }, |
| { |
| "epoch": 13.451327433628318, |
| "grad_norm": 0.0006783698918297887, |
| "learning_rate": 1.46625441144436e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 426272, |
| "step": 1520 |
| }, |
| { |
| "epoch": 13.495575221238939, |
| "grad_norm": 0.0011923641432076693, |
| "learning_rate": 1.4487065310185202e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 427600, |
| "step": 1525 |
| }, |
| { |
| "epoch": 13.539823008849558, |
| "grad_norm": 0.005615973379462957, |
| "learning_rate": 1.4312213494556218e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 428752, |
| "step": 1530 |
| }, |
| { |
| "epoch": 13.584070796460177, |
| "grad_norm": 0.000547691248357296, |
| "learning_rate": 1.4137999095672444e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 430032, |
| "step": 1535 |
| }, |
| { |
| "epoch": 13.628318584070797, |
| "grad_norm": 0.0041353595443069935, |
| "learning_rate": 1.3964432503634281e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 431392, |
| "step": 1540 |
| }, |
| { |
| "epoch": 13.672566371681416, |
| "grad_norm": 0.0010105908149853349, |
| "learning_rate": 1.3791524069907141e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 433008, |
| "step": 1545 |
| }, |
| { |
| "epoch": 13.716814159292035, |
| "grad_norm": 0.0003498003934510052, |
| "learning_rate": 1.361928410670403e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 434416, |
| "step": 1550 |
| }, |
| { |
| "epoch": 13.761061946902656, |
| "grad_norm": 0.0037561210338026285, |
| "learning_rate": 1.3447722886370565e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 436032, |
| "step": 1555 |
| }, |
| { |
| "epoch": 13.805309734513274, |
| "grad_norm": 0.0003635417378973216, |
| "learning_rate": 1.3276850640772288e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 437456, |
| "step": 1560 |
| }, |
| { |
| "epoch": 13.849557522123893, |
| "grad_norm": 0.010033582337200642, |
| "learning_rate": 1.3106677560684494e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 439008, |
| "step": 1565 |
| }, |
| { |
| "epoch": 13.893805309734514, |
| "grad_norm": 0.0005096778040751815, |
| "learning_rate": 1.2937213795184434e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 440624, |
| "step": 1570 |
| }, |
| { |
| "epoch": 13.938053097345133, |
| "grad_norm": 0.002165464451536536, |
| "learning_rate": 1.2768469451046029e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 442080, |
| "step": 1575 |
| }, |
| { |
| "epoch": 13.982300884955752, |
| "grad_norm": 0.0005349584389477968, |
| "learning_rate": 1.2600454592137062e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 443168, |
| "step": 1580 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.34805554151535034, |
| "eval_runtime": 0.6885, |
| "eval_samples_per_second": 36.311, |
| "eval_steps_per_second": 18.882, |
| "num_input_tokens_seen": 443536, |
| "step": 1582 |
| }, |
| { |
| "epoch": 14.026548672566372, |
| "grad_norm": 0.0013768604258075356, |
| "learning_rate": 1.2433179238819077e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 444304, |
| "step": 1585 |
| }, |
| { |
| "epoch": 14.070796460176991, |
| "grad_norm": 0.0010517387418076396, |
| "learning_rate": 1.2266653367349657e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 445648, |
| "step": 1590 |
| }, |
| { |
| "epoch": 14.11504424778761, |
| "grad_norm": 0.0005446486757136881, |
| "learning_rate": 1.2100886909287478e-05, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 447248, |
| "step": 1595 |
| }, |
| { |
| "epoch": 14.15929203539823, |
| "grad_norm": 0.004769509192556143, |
| "learning_rate": 1.1935889750900034e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 448496, |
| "step": 1600 |
| }, |
| { |
| "epoch": 14.20353982300885, |
| "grad_norm": 0.000676127034239471, |
| "learning_rate": 1.1771671732573976e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 449856, |
| "step": 1605 |
| }, |
| { |
| "epoch": 14.247787610619469, |
| "grad_norm": 0.0003581287164706737, |
| "learning_rate": 1.1608242648228257e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 451120, |
| "step": 1610 |
| }, |
| { |
| "epoch": 14.29203539823009, |
| "grad_norm": 0.0007504928507842124, |
| "learning_rate": 1.1445612244729984e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 452416, |
| "step": 1615 |
| }, |
| { |
| "epoch": 14.336283185840708, |
| "grad_norm": 0.00384066766127944, |
| "learning_rate": 1.1283790221313208e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 453968, |
| "step": 1620 |
| }, |
| { |
| "epoch": 14.380530973451327, |
| "grad_norm": 0.0008372140000574291, |
| "learning_rate": 1.1122786229000356e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 455360, |
| "step": 1625 |
| }, |
| { |
| "epoch": 14.424778761061948, |
| "grad_norm": 0.0009218368795700371, |
| "learning_rate": 1.0962609870026724e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 457136, |
| "step": 1630 |
| }, |
| { |
| "epoch": 14.469026548672566, |
| "grad_norm": 0.0007669464102946222, |
| "learning_rate": 1.0803270697267764e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 458416, |
| "step": 1635 |
| }, |
| { |
| "epoch": 14.513274336283185, |
| "grad_norm": 0.0012133128475397825, |
| "learning_rate": 1.0644778213669385e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 459504, |
| "step": 1640 |
| }, |
| { |
| "epoch": 14.557522123893806, |
| "grad_norm": 0.002074818592518568, |
| "learning_rate": 1.0487141871681142e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 461040, |
| "step": 1645 |
| }, |
| { |
| "epoch": 14.601769911504425, |
| "grad_norm": 0.0013546557165682316, |
| "learning_rate": 1.0330371072692565e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 462496, |
| "step": 1650 |
| }, |
| { |
| "epoch": 14.646017699115044, |
| "grad_norm": 0.0005924918223172426, |
| "learning_rate": 1.0174475166472417e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 463744, |
| "step": 1655 |
| }, |
| { |
| "epoch": 14.690265486725664, |
| "grad_norm": 0.0008239169255830348, |
| "learning_rate": 1.0019463450611103e-05, |
| "loss": 0.0, |
| "num_input_tokens_seen": 465792, |
| "step": 1660 |
| }, |
| { |
| "epoch": 14.734513274336283, |
| "grad_norm": 0.002870932687073946, |
| "learning_rate": 9.865345169966114e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 467504, |
| "step": 1665 |
| }, |
| { |
| "epoch": 14.778761061946902, |
| "grad_norm": 0.00171751924790442, |
| "learning_rate": 9.71212951611074e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 468720, |
| "step": 1670 |
| }, |
| { |
| "epoch": 14.823008849557523, |
| "grad_norm": 0.007390305399894714, |
| "learning_rate": 9.559825626785837e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 469968, |
| "step": 1675 |
| }, |
| { |
| "epoch": 14.867256637168142, |
| "grad_norm": 0.00038980445242486894, |
| "learning_rate": 9.40844258535487e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 471488, |
| "step": 1680 |
| }, |
| { |
| "epoch": 14.91150442477876, |
| "grad_norm": 0.0027684990782290697, |
| "learning_rate": 9.257989420262151e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 472816, |
| "step": 1685 |
| }, |
| { |
| "epoch": 14.955752212389381, |
| "grad_norm": 0.0006087150541134179, |
| "learning_rate": 9.108475104494475e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 473920, |
| "step": 1690 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.0006553462008014321, |
| "learning_rate": 8.959908555045846e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 475000, |
| "step": 1695 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.34205034375190735, |
| "eval_runtime": 0.696, |
| "eval_samples_per_second": 35.922, |
| "eval_steps_per_second": 18.679, |
| "num_input_tokens_seen": 475000, |
| "step": 1695 |
| }, |
| { |
| "epoch": 15.044247787610619, |
| "grad_norm": 0.0009022035519592464, |
| "learning_rate": 8.812298632385784e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 476392, |
| "step": 1700 |
| }, |
| { |
| "epoch": 15.08849557522124, |
| "grad_norm": 0.0018264963291585445, |
| "learning_rate": 8.66565413993082e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 477960, |
| "step": 1705 |
| }, |
| { |
| "epoch": 15.132743362831858, |
| "grad_norm": 0.002551417564973235, |
| "learning_rate": 8.519983823519496e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 479560, |
| "step": 1710 |
| }, |
| { |
| "epoch": 15.176991150442477, |
| "grad_norm": 0.005302222911268473, |
| "learning_rate": 8.375296370890749e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 481080, |
| "step": 1715 |
| }, |
| { |
| "epoch": 15.221238938053098, |
| "grad_norm": 0.001917452784255147, |
| "learning_rate": 8.231600411165757e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 482504, |
| "step": 1720 |
| }, |
| { |
| "epoch": 15.265486725663717, |
| "grad_norm": 0.0006103357882238925, |
| "learning_rate": 8.088904514333384e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 483768, |
| "step": 1725 |
| }, |
| { |
| "epoch": 15.309734513274336, |
| "grad_norm": 0.0009480651351623237, |
| "learning_rate": 7.947217190738945e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 485048, |
| "step": 1730 |
| }, |
| { |
| "epoch": 15.353982300884956, |
| "grad_norm": 0.001061386545188725, |
| "learning_rate": 7.806546890576753e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 486680, |
| "step": 1735 |
| }, |
| { |
| "epoch": 15.398230088495575, |
| "grad_norm": 0.0004345499910414219, |
| "learning_rate": 7.666902003386104e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 488184, |
| "step": 1740 |
| }, |
| { |
| "epoch": 15.442477876106194, |
| "grad_norm": 0.002841173205524683, |
| "learning_rate": 7.528290857550943e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 489560, |
| "step": 1745 |
| }, |
| { |
| "epoch": 15.486725663716815, |
| "grad_norm": 0.00136508047580719, |
| "learning_rate": 7.390721719803137e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 490712, |
| "step": 1750 |
| }, |
| { |
| "epoch": 15.530973451327434, |
| "grad_norm": 0.0014027354773133993, |
| "learning_rate": 7.254202794729484e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 492072, |
| "step": 1755 |
| }, |
| { |
| "epoch": 15.575221238938052, |
| "grad_norm": 0.0007366806967183948, |
| "learning_rate": 7.11874222428238e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 493512, |
| "step": 1760 |
| }, |
| { |
| "epoch": 15.619469026548673, |
| "grad_norm": 0.0016336990520358086, |
| "learning_rate": 6.9843480872942294e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 494744, |
| "step": 1765 |
| }, |
| { |
| "epoch": 15.663716814159292, |
| "grad_norm": 0.0013420904288068414, |
| "learning_rate": 6.851028398995607e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 496232, |
| "step": 1770 |
| }, |
| { |
| "epoch": 15.70796460176991, |
| "grad_norm": 0.00037971933488734066, |
| "learning_rate": 6.718791110537287e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 497368, |
| "step": 1775 |
| }, |
| { |
| "epoch": 15.752212389380531, |
| "grad_norm": 0.0004303538298700005, |
| "learning_rate": 6.587644108515986e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 498920, |
| "step": 1780 |
| }, |
| { |
| "epoch": 15.79646017699115, |
| "grad_norm": 0.005797963589429855, |
| "learning_rate": 6.457595214504042e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 500328, |
| "step": 1785 |
| }, |
| { |
| "epoch": 15.84070796460177, |
| "grad_norm": 0.0007089356076903641, |
| "learning_rate": 6.328652184582884e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 501848, |
| "step": 1790 |
| }, |
| { |
| "epoch": 15.88495575221239, |
| "grad_norm": 0.0006735983188264072, |
| "learning_rate": 6.200822708880563e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 503304, |
| "step": 1795 |
| }, |
| { |
| "epoch": 15.929203539823009, |
| "grad_norm": 0.0011508794268593192, |
| "learning_rate": 6.074114411112997e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 504776, |
| "step": 1800 |
| }, |
| { |
| "epoch": 15.973451327433628, |
| "grad_norm": 0.0010680333944037557, |
| "learning_rate": 5.948534848129378e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 506104, |
| "step": 1805 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.3372170031070709, |
| "eval_runtime": 0.6893, |
| "eval_samples_per_second": 36.268, |
| "eval_steps_per_second": 18.859, |
| "num_input_tokens_seen": 506880, |
| "step": 1808 |
| }, |
| { |
| "epoch": 16.01769911504425, |
| "grad_norm": 0.0023007013369351625, |
| "learning_rate": 5.824091509461449e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 507328, |
| "step": 1810 |
| }, |
| { |
| "epoch": 16.061946902654867, |
| "grad_norm": 0.00177915976382792, |
| "learning_rate": 5.7007918168768405e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 508384, |
| "step": 1815 |
| }, |
| { |
| "epoch": 16.106194690265486, |
| "grad_norm": 0.0008482469129376113, |
| "learning_rate": 5.5786431239364365e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 510064, |
| "step": 1820 |
| }, |
| { |
| "epoch": 16.150442477876105, |
| "grad_norm": 0.0009070294327102602, |
| "learning_rate": 5.457652715555781e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 511744, |
| "step": 1825 |
| }, |
| { |
| "epoch": 16.194690265486727, |
| "grad_norm": 0.0029111511539667845, |
| "learning_rate": 5.337827807570689e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 513424, |
| "step": 1830 |
| }, |
| { |
| "epoch": 16.238938053097346, |
| "grad_norm": 0.0016602243995293975, |
| "learning_rate": 5.219175546306784e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 514992, |
| "step": 1835 |
| }, |
| { |
| "epoch": 16.283185840707965, |
| "grad_norm": 0.0005636098794639111, |
| "learning_rate": 5.1017030081533914e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 516192, |
| "step": 1840 |
| }, |
| { |
| "epoch": 16.327433628318584, |
| "grad_norm": 0.002352345734834671, |
| "learning_rate": 4.985417199141443e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 517936, |
| "step": 1845 |
| }, |
| { |
| "epoch": 16.371681415929203, |
| "grad_norm": 0.000550171360373497, |
| "learning_rate": 4.870325054525673e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 519184, |
| "step": 1850 |
| }, |
| { |
| "epoch": 16.41592920353982, |
| "grad_norm": 0.0004805954813491553, |
| "learning_rate": 4.7564334383709745e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 520624, |
| "step": 1855 |
| }, |
| { |
| "epoch": 16.460176991150444, |
| "grad_norm": 0.000510231708176434, |
| "learning_rate": 4.6437491431430556e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 522048, |
| "step": 1860 |
| }, |
| { |
| "epoch": 16.504424778761063, |
| "grad_norm": 0.0012865742901340127, |
| "learning_rate": 4.5322788893033155e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 523872, |
| "step": 1865 |
| }, |
| { |
| "epoch": 16.548672566371682, |
| "grad_norm": 0.00048671747208572924, |
| "learning_rate": 4.422029324908061e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 524928, |
| "step": 1870 |
| }, |
| { |
| "epoch": 16.5929203539823, |
| "grad_norm": 0.000755130487959832, |
| "learning_rate": 4.313007025211985e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 526160, |
| "step": 1875 |
| }, |
| { |
| "epoch": 16.63716814159292, |
| "grad_norm": 0.0006898596766404808, |
| "learning_rate": 4.205218492276055e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 527488, |
| "step": 1880 |
| }, |
| { |
| "epoch": 16.68141592920354, |
| "grad_norm": 0.00509675731882453, |
| "learning_rate": 4.098670154579715e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 529136, |
| "step": 1885 |
| }, |
| { |
| "epoch": 16.72566371681416, |
| "grad_norm": 0.0004100928199477494, |
| "learning_rate": 3.9933683666374986e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 530384, |
| "step": 1890 |
| }, |
| { |
| "epoch": 16.76991150442478, |
| "grad_norm": 0.0004588266892824322, |
| "learning_rate": 3.889319408620021e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 531712, |
| "step": 1895 |
| }, |
| { |
| "epoch": 16.8141592920354, |
| "grad_norm": 0.00041784465429373085, |
| "learning_rate": 3.7865294859794926e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 533072, |
| "step": 1900 |
| }, |
| { |
| "epoch": 16.858407079646017, |
| "grad_norm": 0.001727156457491219, |
| "learning_rate": 3.68500472907955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 534544, |
| "step": 1905 |
| }, |
| { |
| "epoch": 16.902654867256636, |
| "grad_norm": 0.00045313002192415297, |
| "learning_rate": 3.584751192829705e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 536096, |
| "step": 1910 |
| }, |
| { |
| "epoch": 16.946902654867255, |
| "grad_norm": 0.0015837351093068719, |
| "learning_rate": 3.4857748563242006e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 537376, |
| "step": 1915 |
| }, |
| { |
| "epoch": 16.991150442477878, |
| "grad_norm": 0.003061944153159857, |
| "learning_rate": 3.388081622485431e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 538464, |
| "step": 1920 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.3425106704235077, |
| "eval_runtime": 0.6899, |
| "eval_samples_per_second": 36.236, |
| "eval_steps_per_second": 18.843, |
| "num_input_tokens_seen": 538552, |
| "step": 1921 |
| }, |
| { |
| "epoch": 17.035398230088497, |
| "grad_norm": 0.0013424725038930774, |
| "learning_rate": 3.2916773177118778e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 539864, |
| "step": 1925 |
| }, |
| { |
| "epoch": 17.079646017699115, |
| "grad_norm": 0.0013487042160704732, |
| "learning_rate": 3.1965676915306384e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 541240, |
| "step": 1930 |
| }, |
| { |
| "epoch": 17.123893805309734, |
| "grad_norm": 0.0023580584675073624, |
| "learning_rate": 3.102758416254545e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 542296, |
| "step": 1935 |
| }, |
| { |
| "epoch": 17.168141592920353, |
| "grad_norm": 0.0017332157585769892, |
| "learning_rate": 3.010255086643818e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 543784, |
| "step": 1940 |
| }, |
| { |
| "epoch": 17.212389380530972, |
| "grad_norm": 0.001323725562542677, |
| "learning_rate": 2.919063219572438e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 545432, |
| "step": 1945 |
| }, |
| { |
| "epoch": 17.256637168141594, |
| "grad_norm": 0.0005653653061017394, |
| "learning_rate": 2.829188253699111e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 546680, |
| "step": 1950 |
| }, |
| { |
| "epoch": 17.300884955752213, |
| "grad_norm": 0.0038920126389712095, |
| "learning_rate": 2.7406355491429086e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 547848, |
| "step": 1955 |
| }, |
| { |
| "epoch": 17.345132743362832, |
| "grad_norm": 0.00043903160258196294, |
| "learning_rate": 2.653410387163574e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 549192, |
| "step": 1960 |
| }, |
| { |
| "epoch": 17.38938053097345, |
| "grad_norm": 0.0006057664868421853, |
| "learning_rate": 2.567517969846575e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 550568, |
| "step": 1965 |
| }, |
| { |
| "epoch": 17.43362831858407, |
| "grad_norm": 0.0003931774408556521, |
| "learning_rate": 2.482963419792844e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 552104, |
| "step": 1970 |
| }, |
| { |
| "epoch": 17.47787610619469, |
| "grad_norm": 0.0007057964103296399, |
| "learning_rate": 2.399751779813264e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 553416, |
| "step": 1975 |
| }, |
| { |
| "epoch": 17.52212389380531, |
| "grad_norm": 0.0005147880292497575, |
| "learning_rate": 2.317888012627914e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 554888, |
| "step": 1980 |
| }, |
| { |
| "epoch": 17.56637168141593, |
| "grad_norm": 0.0005328193074092269, |
| "learning_rate": 2.2373770005700955e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 556728, |
| "step": 1985 |
| }, |
| { |
| "epoch": 17.61061946902655, |
| "grad_norm": 0.0030065446626394987, |
| "learning_rate": 2.1582235452951682e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 557896, |
| "step": 1990 |
| }, |
| { |
| "epoch": 17.654867256637168, |
| "grad_norm": 0.0003217308840248734, |
| "learning_rate": 2.0804323674941563e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 559528, |
| "step": 1995 |
| }, |
| { |
| "epoch": 17.699115044247787, |
| "grad_norm": 0.0005141164292581379, |
| "learning_rate": 2.0040081066122043e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 561144, |
| "step": 2000 |
| }, |
| { |
| "epoch": 17.743362831858406, |
| "grad_norm": 0.0004922666703350842, |
| "learning_rate": 1.9289553205719317e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 562328, |
| "step": 2005 |
| }, |
| { |
| "epoch": 17.787610619469028, |
| "grad_norm": 0.0009286728454753757, |
| "learning_rate": 1.8552784855015215e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 563560, |
| "step": 2010 |
| }, |
| { |
| "epoch": 17.831858407079647, |
| "grad_norm": 0.0006457404233515263, |
| "learning_rate": 1.7829819954678361e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 565000, |
| "step": 2015 |
| }, |
| { |
| "epoch": 17.876106194690266, |
| "grad_norm": 0.0003511160612106323, |
| "learning_rate": 1.7120701622143132e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 566664, |
| "step": 2020 |
| }, |
| { |
| "epoch": 17.920353982300885, |
| "grad_norm": 0.0007439504261128604, |
| "learning_rate": 1.6425472149038361e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 568024, |
| "step": 2025 |
| }, |
| { |
| "epoch": 17.964601769911503, |
| "grad_norm": 0.0003757936938200146, |
| "learning_rate": 1.5744172998664902e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 569464, |
| "step": 2030 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.3449106216430664, |
| "eval_runtime": 0.6874, |
| "eval_samples_per_second": 36.367, |
| "eval_steps_per_second": 18.911, |
| "num_input_tokens_seen": 570488, |
| "step": 2034 |
| }, |
| { |
| "epoch": 18.008849557522122, |
| "grad_norm": 0.0026148678734898567, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 570664, |
| "step": 2035 |
| }, |
| { |
| "epoch": 18.053097345132745, |
| "grad_norm": 0.0012115227291360497, |
| "learning_rate": 1.4423527362888546e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 571896, |
| "step": 2040 |
| }, |
| { |
| "epoch": 18.097345132743364, |
| "grad_norm": 0.0015595826553180814, |
| "learning_rate": 1.3784259640440279e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 573368, |
| "step": 2045 |
| }, |
| { |
| "epoch": 18.141592920353983, |
| "grad_norm": 0.0014978301478549838, |
| "learning_rate": 1.3159079761934923e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 574472, |
| "step": 2050 |
| }, |
| { |
| "epoch": 18.1858407079646, |
| "grad_norm": 0.0022592353634536266, |
| "learning_rate": 1.2548025012934367e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 575672, |
| "step": 2055 |
| }, |
| { |
| "epoch": 18.23008849557522, |
| "grad_norm": 0.0030029607005417347, |
| "learning_rate": 1.195113183658131e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 577160, |
| "step": 2060 |
| }, |
| { |
| "epoch": 18.27433628318584, |
| "grad_norm": 0.0006508956430479884, |
| "learning_rate": 1.1368435831426021e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 578456, |
| "step": 2065 |
| }, |
| { |
| "epoch": 18.31858407079646, |
| "grad_norm": 0.0003759461687877774, |
| "learning_rate": 1.0799971749303333e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 580168, |
| "step": 2070 |
| }, |
| { |
| "epoch": 18.36283185840708, |
| "grad_norm": 0.0006137562450021505, |
| "learning_rate": 1.0245773493259946e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 581624, |
| "step": 2075 |
| }, |
| { |
| "epoch": 18.4070796460177, |
| "grad_norm": 0.00028311816276982427, |
| "learning_rate": 9.705874115532532e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 583352, |
| "step": 2080 |
| }, |
| { |
| "epoch": 18.451327433628318, |
| "grad_norm": 0.0006366329034790397, |
| "learning_rate": 9.180305815576301e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 584600, |
| "step": 2085 |
| }, |
| { |
| "epoch": 18.495575221238937, |
| "grad_norm": 0.0011967526515945792, |
| "learning_rate": 8.669099938144992e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 585832, |
| "step": 2090 |
| }, |
| { |
| "epoch": 18.539823008849556, |
| "grad_norm": 0.0007272835355252028, |
| "learning_rate": 8.172286971421167e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 587416, |
| "step": 2095 |
| }, |
| { |
| "epoch": 18.58407079646018, |
| "grad_norm": 0.001816938747651875, |
| "learning_rate": 7.689896545198111e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 588760, |
| "step": 2100 |
| }, |
| { |
| "epoch": 18.628318584070797, |
| "grad_norm": 0.0007159130764193833, |
| "learning_rate": 7.221957429112469e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 590104, |
| "step": 2105 |
| }, |
| { |
| "epoch": 18.672566371681416, |
| "grad_norm": 0.0004972445894964039, |
| "learning_rate": 6.768497530928785e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 591432, |
| "step": 2110 |
| }, |
| { |
| "epoch": 18.716814159292035, |
| "grad_norm": 0.0005601092125289142, |
| "learning_rate": 6.329543894874779e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 592760, |
| "step": 2115 |
| }, |
| { |
| "epoch": 18.761061946902654, |
| "grad_norm": 0.00032141589326784015, |
| "learning_rate": 5.905122700028576e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 594280, |
| "step": 2120 |
| }, |
| { |
| "epoch": 18.805309734513273, |
| "grad_norm": 0.00047725485637784004, |
| "learning_rate": 5.49525925875738e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 595784, |
| "step": 2125 |
| }, |
| { |
| "epoch": 18.849557522123895, |
| "grad_norm": 0.0004924829117953777, |
| "learning_rate": 5.099978015207868e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 597400, |
| "step": 2130 |
| }, |
| { |
| "epoch": 18.893805309734514, |
| "grad_norm": 0.0003631446452345699, |
| "learning_rate": 4.719302543848225e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 598712, |
| "step": 2135 |
| }, |
| { |
| "epoch": 18.938053097345133, |
| "grad_norm": 0.0006079651066102087, |
| "learning_rate": 4.3532555480624295e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 600136, |
| "step": 2140 |
| }, |
| { |
| "epoch": 18.98230088495575, |
| "grad_norm": 0.0007797577418386936, |
| "learning_rate": 4.001858858795893e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 601560, |
| "step": 2145 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.3529210686683655, |
| "eval_runtime": 0.6901, |
| "eval_samples_per_second": 36.229, |
| "eval_steps_per_second": 18.839, |
| "num_input_tokens_seen": 602176, |
| "step": 2147 |
| }, |
| { |
| "epoch": 19.02654867256637, |
| "grad_norm": 0.0006274338811635971, |
| "learning_rate": 3.665133433253809e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 603072, |
| "step": 2150 |
| }, |
| { |
| "epoch": 19.07079646017699, |
| "grad_norm": 0.0008976224344223738, |
| "learning_rate": 3.34309935365093e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 604128, |
| "step": 2155 |
| }, |
| { |
| "epoch": 19.115044247787612, |
| "grad_norm": 0.0009988279780372977, |
| "learning_rate": 3.03577582601422e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 605488, |
| "step": 2160 |
| }, |
| { |
| "epoch": 19.15929203539823, |
| "grad_norm": 0.0005080733681097627, |
| "learning_rate": 2.743181179037047e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 606800, |
| "step": 2165 |
| }, |
| { |
| "epoch": 19.20353982300885, |
| "grad_norm": 0.0024150812532752752, |
| "learning_rate": 2.465332862986447e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 608272, |
| "step": 2170 |
| }, |
| { |
| "epoch": 19.24778761061947, |
| "grad_norm": 0.001215559197589755, |
| "learning_rate": 2.2022474486620427e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 609360, |
| "step": 2175 |
| }, |
| { |
| "epoch": 19.292035398230087, |
| "grad_norm": 0.000782918359618634, |
| "learning_rate": 1.953940626408024e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 610592, |
| "step": 2180 |
| }, |
| { |
| "epoch": 19.336283185840706, |
| "grad_norm": 0.0004457279574126005, |
| "learning_rate": 1.720427205177233e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 611840, |
| "step": 2185 |
| }, |
| { |
| "epoch": 19.38053097345133, |
| "grad_norm": 0.00041979391244240105, |
| "learning_rate": 1.5017211116479802e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 613472, |
| "step": 2190 |
| }, |
| { |
| "epoch": 19.424778761061948, |
| "grad_norm": 0.0004557992215268314, |
| "learning_rate": 1.297835389393598e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 614848, |
| "step": 2195 |
| }, |
| { |
| "epoch": 19.469026548672566, |
| "grad_norm": 0.0030613532289862633, |
| "learning_rate": 1.1087821981042856e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 616320, |
| "step": 2200 |
| }, |
| { |
| "epoch": 19.513274336283185, |
| "grad_norm": 0.0012249136343598366, |
| "learning_rate": 9.345728128621611e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 617664, |
| "step": 2205 |
| }, |
| { |
| "epoch": 19.557522123893804, |
| "grad_norm": 0.0005657924339175224, |
| "learning_rate": 7.752176234685771e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 619200, |
| "step": 2210 |
| }, |
| { |
| "epoch": 19.601769911504427, |
| "grad_norm": 0.0010923200752586126, |
| "learning_rate": 6.307261338246718e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 620688, |
| "step": 2215 |
| }, |
| { |
| "epoch": 19.646017699115045, |
| "grad_norm": 0.003021995071321726, |
| "learning_rate": 5.011069613644892e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 622032, |
| "step": 2220 |
| }, |
| { |
| "epoch": 19.690265486725664, |
| "grad_norm": 0.002445510122925043, |
| "learning_rate": 3.8636783654100174e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 623248, |
| "step": 2225 |
| }, |
| { |
| "epoch": 19.734513274336283, |
| "grad_norm": 0.0032096211798489094, |
| "learning_rate": 2.865156023650617e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 624944, |
| "step": 2230 |
| }, |
| { |
| "epoch": 19.778761061946902, |
| "grad_norm": 0.0004213061183691025, |
| "learning_rate": 2.0155621399742254e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 626176, |
| "step": 2235 |
| }, |
| { |
| "epoch": 19.82300884955752, |
| "grad_norm": 0.000989592750556767, |
| "learning_rate": 1.31494738393384e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 627408, |
| "step": 2240 |
| }, |
| { |
| "epoch": 19.86725663716814, |
| "grad_norm": 0.007174751255661249, |
| "learning_rate": 7.633535400070057e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 629216, |
| "step": 2245 |
| }, |
| { |
| "epoch": 19.911504424778762, |
| "grad_norm": 0.0005223340122029185, |
| "learning_rate": 3.6081350510447365e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 630592, |
| "step": 2250 |
| }, |
| { |
| "epoch": 19.95575221238938, |
| "grad_norm": 0.0004655419907066971, |
| "learning_rate": 1.0735128660649406e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 632208, |
| "step": 2255 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.0004798208246938884, |
| "learning_rate": 2.982000932294504e-11, |
| "loss": 0.0, |
| "num_input_tokens_seen": 633448, |
| "step": 2260 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.34241023659706116, |
| "eval_runtime": 0.6874, |
| "eval_samples_per_second": 36.368, |
| "eval_steps_per_second": 18.911, |
| "num_input_tokens_seen": 633448, |
| "step": 2260 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 633448, |
| "step": 2260, |
| "total_flos": 2.8523877141774336e+16, |
| "train_loss": 0.20182705866069886, |
| "train_runtime": 250.2247, |
| "train_samples_per_second": 17.984, |
| "train_steps_per_second": 9.032 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 2260, |
| "num_input_tokens_seen": 633448, |
| "num_train_epochs": 20, |
| "save_steps": 113, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.8523877141774336e+16, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|