| { | |
| "best_global_step": 490, | |
| "best_metric": 1.0370746850967407, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.0, | |
| "eval_steps": 10, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.007787227630615, | |
| "learning_rate": 5e-05, | |
| "loss": 5.2601, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 4.682629585266113, | |
| "eval_mean_token_accuracy": 0.37830855041742323, | |
| "eval_num_tokens": 6333.0, | |
| "eval_runtime": 97.2723, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 8.059301376342773, | |
| "learning_rate": 9.999927156177032e-05, | |
| "loss": 3.7743, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 2.5042078495025635, | |
| "eval_mean_token_accuracy": 0.6304361110925675, | |
| "eval_num_tokens": 12685.0, | |
| "eval_runtime": 97.2264, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 2.4571871757507324, | |
| "learning_rate": 9.991188465340766e-05, | |
| "loss": 2.0475, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 1.8001786470413208, | |
| "eval_mean_token_accuracy": 0.6825134682655335, | |
| "eval_num_tokens": 19121.0, | |
| "eval_runtime": 97.26, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.8676475286483765, | |
| "learning_rate": 9.967910180154889e-05, | |
| "loss": 1.6727, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.611749529838562, | |
| "eval_mean_token_accuracy": 0.6887511330842971, | |
| "eval_num_tokens": 25515.0, | |
| "eval_runtime": 97.2714, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.9121489524841309, | |
| "learning_rate": 9.930160111487716e-05, | |
| "loss": 1.5602, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 1.519342303276062, | |
| "eval_mean_token_accuracy": 0.6936497485637665, | |
| "eval_num_tokens": 31902.0, | |
| "eval_runtime": 97.2536, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.4662973880767822, | |
| "learning_rate": 9.87804822727352e-05, | |
| "loss": 1.4916, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 1.4659887552261353, | |
| "eval_mean_token_accuracy": 0.6948034042119979, | |
| "eval_num_tokens": 38297.0, | |
| "eval_runtime": 97.2339, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.132735252380371, | |
| "learning_rate": 9.811726332170153e-05, | |
| "loss": 1.4291, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "eval_loss": 1.422730565071106, | |
| "eval_mean_token_accuracy": 0.6967671060562134, | |
| "eval_num_tokens": 44694.0, | |
| "eval_runtime": 97.2271, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.3734078407287598, | |
| "learning_rate": 9.731387625344104e-05, | |
| "loss": 1.3832, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.3986200094223022, | |
| "eval_mean_token_accuracy": 0.6979268860816955, | |
| "eval_num_tokens": 51069.0, | |
| "eval_runtime": 97.2072, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 1.4604731798171997, | |
| "learning_rate": 9.637266137671177e-05, | |
| "loss": 1.3693, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": 1.3532851934432983, | |
| "eval_mean_token_accuracy": 0.7013956385850907, | |
| "eval_num_tokens": 57445.0, | |
| "eval_runtime": 97.2573, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3173726797103882, | |
| "learning_rate": 9.529636049992234e-05, | |
| "loss": 1.3143, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.2985877990722656, | |
| "eval_mean_token_accuracy": 0.7077123075723648, | |
| "eval_num_tokens": 63853.0, | |
| "eval_runtime": 97.2409, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.3028841018676758, | |
| "learning_rate": 9.408810894410009e-05, | |
| "loss": 1.2416, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "eval_loss": 1.2666221857070923, | |
| "eval_mean_token_accuracy": 0.7159780770540237, | |
| "eval_num_tokens": 70269.0, | |
| "eval_runtime": 97.2354, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.479728102684021, | |
| "learning_rate": 9.2751426409536e-05, | |
| "loss": 1.208, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 1.2293524742126465, | |
| "eval_mean_token_accuracy": 0.7204430556297302, | |
| "eval_num_tokens": 76691.0, | |
| "eval_runtime": 97.2278, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 1.6283237934112549, | |
| "learning_rate": 9.129020672271283e-05, | |
| "loss": 1.1881, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "eval_loss": 1.2158491611480713, | |
| "eval_mean_token_accuracy": 0.7250880861282348, | |
| "eval_num_tokens": 83083.0, | |
| "eval_runtime": 97.221, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.2515714168548584, | |
| "learning_rate": 8.970870649338387e-05, | |
| "loss": 1.1893, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_loss": 1.2067173719406128, | |
| "eval_mean_token_accuracy": 0.7283873379230499, | |
| "eval_num_tokens": 89470.0, | |
| "eval_runtime": 97.217, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.7679073810577393, | |
| "learning_rate": 8.801153271484502e-05, | |
| "loss": 1.1742, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 1.2018429040908813, | |
| "eval_mean_token_accuracy": 0.7298205083608628, | |
| "eval_num_tokens": 95793.0, | |
| "eval_runtime": 97.2454, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.4671496152877808, | |
| "learning_rate": 8.620362934352109e-05, | |
| "loss": 1.1713, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 1.2024760246276855, | |
| "eval_mean_token_accuracy": 0.7248463779687881, | |
| "eval_num_tokens": 102197.0, | |
| "eval_runtime": 97.2434, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 1.3356602191925049, | |
| "learning_rate": 8.429026289696091e-05, | |
| "loss": 1.1616, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "eval_loss": 1.1988317966461182, | |
| "eval_mean_token_accuracy": 0.7268992912769318, | |
| "eval_num_tokens": 108568.0, | |
| "eval_runtime": 97.2357, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.8510276079177856, | |
| "learning_rate": 8.227700711219493e-05, | |
| "loss": 1.1541, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "eval_loss": 1.1827600002288818, | |
| "eval_mean_token_accuracy": 0.7310593771934509, | |
| "eval_num_tokens": 114964.0, | |
| "eval_runtime": 97.2104, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 1.4240201711654663, | |
| "learning_rate": 8.016972670914624e-05, | |
| "loss": 1.1708, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_loss": 1.1701805591583252, | |
| "eval_mean_token_accuracy": 0.7321947473287582, | |
| "eval_num_tokens": 121333.0, | |
| "eval_runtime": 97.2429, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.8542824983596802, | |
| "learning_rate": 7.797456030639313e-05, | |
| "loss": 1.1509, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.157992959022522, | |
| "eval_mean_token_accuracy": 0.7394243097305297, | |
| "eval_num_tokens": 127706.0, | |
| "eval_runtime": 97.2191, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 1.5795457363128662, | |
| "learning_rate": 7.569790253905059e-05, | |
| "loss": 1.0941, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "eval_loss": 1.148593544960022, | |
| "eval_mean_token_accuracy": 0.7429650634527206, | |
| "eval_num_tokens": 134095.0, | |
| "eval_runtime": 97.2383, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.4866969585418701, | |
| "learning_rate": 7.334638543086203e-05, | |
| "loss": 1.0931, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "eval_loss": 1.1364799737930298, | |
| "eval_mean_token_accuracy": 0.7470852738618851, | |
| "eval_num_tokens": 140527.0, | |
| "eval_runtime": 97.2225, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.651310920715332, | |
| "learning_rate": 7.092685907476558e-05, | |
| "loss": 1.0897, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "eval_loss": 1.1412357091903687, | |
| "eval_mean_token_accuracy": 0.7455077153444291, | |
| "eval_num_tokens": 146939.0, | |
| "eval_runtime": 97.2224, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.3703193664550781, | |
| "learning_rate": 6.844637167821326e-05, | |
| "loss": 1.0945, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 1.1335656642913818, | |
| "eval_mean_token_accuracy": 0.7461127752065658, | |
| "eval_num_tokens": 153360.0, | |
| "eval_runtime": 97.2102, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.7046772241592407, | |
| "learning_rate": 6.59121490313722e-05, | |
| "loss": 1.0803, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 1.1254699230194092, | |
| "eval_mean_token_accuracy": 0.7490965259075165, | |
| "eval_num_tokens": 159731.0, | |
| "eval_runtime": 97.2222, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.6481680870056152, | |
| "learning_rate": 6.333157345801809e-05, | |
| "loss": 1.0859, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "eval_loss": 1.1334922313690186, | |
| "eval_mean_token_accuracy": 0.7479477733373642, | |
| "eval_num_tokens": 166113.0, | |
| "eval_runtime": 97.2341, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 1.7207422256469727, | |
| "learning_rate": 6.071216231043799e-05, | |
| "loss": 1.0934, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "eval_loss": 1.1130690574645996, | |
| "eval_mean_token_accuracy": 0.7543548595905304, | |
| "eval_num_tokens": 172481.0, | |
| "eval_runtime": 97.2173, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.7942646741867065, | |
| "learning_rate": 5.8061546070987994e-05, | |
| "loss": 1.0733, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_loss": 1.1070621013641357, | |
| "eval_mean_token_accuracy": 0.7557503712177277, | |
| "eval_num_tokens": 178813.0, | |
| "eval_runtime": 97.2529, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.5024192333221436, | |
| "learning_rate": 5.538744612409701e-05, | |
| "loss": 1.0767, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "eval_loss": 1.109175205230713, | |
| "eval_mean_token_accuracy": 0.7545353853702546, | |
| "eval_num_tokens": 185220.0, | |
| "eval_runtime": 97.25, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.7613085508346558, | |
| "learning_rate": 5.2697652263468125e-05, | |
| "loss": 1.0574, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.0976698398590088, | |
| "eval_mean_token_accuracy": 0.759365046620369, | |
| "eval_num_tokens": 191559.0, | |
| "eval_runtime": 97.2105, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 1.6670249700546265, | |
| "learning_rate": 5e-05, | |
| "loss": 1.0182, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "eval_loss": 1.094779372215271, | |
| "eval_mean_token_accuracy": 0.76161092877388, | |
| "eval_num_tokens": 197915.0, | |
| "eval_runtime": 97.2406, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.7392572164535522, | |
| "learning_rate": 4.730234773653188e-05, | |
| "loss": 1.007, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 1.0834596157073975, | |
| "eval_mean_token_accuracy": 0.7656834137439728, | |
| "eval_num_tokens": 204252.0, | |
| "eval_runtime": 97.2394, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 2.0105206966400146, | |
| "learning_rate": 4.461255387590299e-05, | |
| "loss": 1.0119, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "eval_loss": 1.0809874534606934, | |
| "eval_mean_token_accuracy": 0.7655541002750397, | |
| "eval_num_tokens": 210574.0, | |
| "eval_runtime": 97.2409, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 1.759490966796875, | |
| "learning_rate": 4.193845392901201e-05, | |
| "loss": 0.9974, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "eval_loss": 1.0805025100708008, | |
| "eval_mean_token_accuracy": 0.7660670423507691, | |
| "eval_num_tokens": 216977.0, | |
| "eval_runtime": 97.206, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 1.8894627094268799, | |
| "learning_rate": 3.9287837689562016e-05, | |
| "loss": 1.0107, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "eval_loss": 1.0740913152694702, | |
| "eval_mean_token_accuracy": 0.7694036465883255, | |
| "eval_num_tokens": 223412.0, | |
| "eval_runtime": 97.2057, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 2.0457475185394287, | |
| "learning_rate": 3.666842654198191e-05, | |
| "loss": 1.0004, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "eval_loss": 1.0669925212860107, | |
| "eval_mean_token_accuracy": 0.7717740494012832, | |
| "eval_num_tokens": 229731.0, | |
| "eval_runtime": 97.1971, | |
| "eval_samples_per_second": 2.058, | |
| "eval_steps_per_second": 1.029, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 1.5154274702072144, | |
| "learning_rate": 3.408785096862782e-05, | |
| "loss": 0.9902, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "eval_loss": 1.0629109144210815, | |
| "eval_mean_token_accuracy": 0.7715724587440491, | |
| "eval_num_tokens": 236188.0, | |
| "eval_runtime": 97.203, | |
| "eval_samples_per_second": 2.058, | |
| "eval_steps_per_second": 1.029, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 2.06585693359375, | |
| "learning_rate": 3.1553628321786745e-05, | |
| "loss": 1.0053, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_loss": 1.0563435554504395, | |
| "eval_mean_token_accuracy": 0.7732643353939056, | |
| "eval_num_tokens": 242558.0, | |
| "eval_runtime": 97.2143, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 1.730677843093872, | |
| "learning_rate": 2.907314092523442e-05, | |
| "loss": 0.9813, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "eval_loss": 1.0525062084197998, | |
| "eval_mean_token_accuracy": 0.7765388804674148, | |
| "eval_num_tokens": 249000.0, | |
| "eval_runtime": 97.2525, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.5716614723205566, | |
| "learning_rate": 2.6653614569137968e-05, | |
| "loss": 0.9852, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 1.0519495010375977, | |
| "eval_mean_token_accuracy": 0.7759864777326584, | |
| "eval_num_tokens": 255412.0, | |
| "eval_runtime": 97.2138, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 1.607945203781128, | |
| "learning_rate": 2.430209746094943e-05, | |
| "loss": 0.9515, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "eval_loss": 1.0613422393798828, | |
| "eval_mean_token_accuracy": 0.7760749870538711, | |
| "eval_num_tokens": 261847.0, | |
| "eval_runtime": 97.2595, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 1.7474143505096436, | |
| "learning_rate": 2.2025439693606882e-05, | |
| "loss": 0.9453, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "eval_loss": 1.0487031936645508, | |
| "eval_mean_token_accuracy": 0.7783306258916854, | |
| "eval_num_tokens": 268220.0, | |
| "eval_runtime": 97.2251, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.029, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 1.3773257732391357, | |
| "learning_rate": 1.983027329085377e-05, | |
| "loss": 0.9398, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "eval_loss": 1.0485966205596924, | |
| "eval_mean_token_accuracy": 0.7771646714210511, | |
| "eval_num_tokens": 274612.0, | |
| "eval_runtime": 97.2492, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 1.5962660312652588, | |
| "learning_rate": 1.772299288780508e-05, | |
| "loss": 0.9283, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "eval_loss": 1.049791693687439, | |
| "eval_mean_token_accuracy": 0.778401963710785, | |
| "eval_num_tokens": 281009.0, | |
| "eval_runtime": 97.2759, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 2.201416015625, | |
| "learning_rate": 1.5709737103039103e-05, | |
| "loss": 0.944, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 1.0448309183120728, | |
| "eval_mean_token_accuracy": 0.7783755934238434, | |
| "eval_num_tokens": 287341.0, | |
| "eval_runtime": 97.2585, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 1.6953043937683105, | |
| "learning_rate": 1.3796370656478935e-05, | |
| "loss": 0.9357, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "eval_loss": 1.0441021919250488, | |
| "eval_mean_token_accuracy": 0.7798552727699279, | |
| "eval_num_tokens": 293667.0, | |
| "eval_runtime": 97.2712, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 2.495961904525757, | |
| "learning_rate": 1.1988467285154987e-05, | |
| "loss": 0.9543, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "eval_loss": 1.0410436391830444, | |
| "eval_mean_token_accuracy": 0.7808067119121551, | |
| "eval_num_tokens": 300075.0, | |
| "eval_runtime": 97.273, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 2.314516067504883, | |
| "learning_rate": 1.0291293506616133e-05, | |
| "loss": 0.9453, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "eval_loss": 1.040062427520752, | |
| "eval_mean_token_accuracy": 0.7821026688814163, | |
| "eval_num_tokens": 306434.0, | |
| "eval_runtime": 97.2666, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 2.006592035293579, | |
| "learning_rate": 8.70979327728718e-06, | |
| "loss": 0.9376, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "eval_loss": 1.0370746850967407, | |
| "eval_mean_token_accuracy": 0.7834529572725296, | |
| "eval_num_tokens": 312881.0, | |
| "eval_runtime": 97.2721, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 1.7751625776290894, | |
| "learning_rate": 7.248573590464014e-06, | |
| "loss": 0.9288, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 1.0378228425979614, | |
| "eval_mean_token_accuracy": 0.7834442704916, | |
| "eval_num_tokens": 319265.0, | |
| "eval_runtime": 97.2917, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "grad_norm": 1.7418688535690308, | |
| "learning_rate": 5.91189105589992e-06, | |
| "loss": 0.9145, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.1, | |
| "eval_loss": 1.0420323610305786, | |
| "eval_mean_token_accuracy": 0.7838906270265579, | |
| "eval_num_tokens": 325626.0, | |
| "eval_runtime": 97.2737, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 1.826409101486206, | |
| "learning_rate": 4.703639500077656e-06, | |
| "loss": 0.9237, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "eval_loss": 1.0423357486724854, | |
| "eval_mean_token_accuracy": 0.7844956815242767, | |
| "eval_num_tokens": 331976.0, | |
| "eval_runtime": 97.2685, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "grad_norm": 1.7615423202514648, | |
| "learning_rate": 3.6273386232882343e-06, | |
| "loss": 0.9201, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.3, | |
| "eval_loss": 1.0414623022079468, | |
| "eval_mean_token_accuracy": 0.7839177978038788, | |
| "eval_num_tokens": 338413.0, | |
| "eval_runtime": 97.2327, | |
| "eval_samples_per_second": 2.057, | |
| "eval_steps_per_second": 1.028, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "grad_norm": 1.5898215770721436, | |
| "learning_rate": 2.686123746558961e-06, | |
| "loss": 0.9002, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "eval_loss": 1.0424439907073975, | |
| "eval_mean_token_accuracy": 0.7836556518077851, | |
| "eval_num_tokens": 344800.0, | |
| "eval_runtime": 97.2686, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 1.721126675605774, | |
| "learning_rate": 1.8827366782984913e-06, | |
| "loss": 0.9025, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "eval_loss": 1.043124794960022, | |
| "eval_mean_token_accuracy": 0.7836492872238159, | |
| "eval_num_tokens": 351242.0, | |
| "eval_runtime": 97.3002, | |
| "eval_samples_per_second": 2.055, | |
| "eval_steps_per_second": 1.028, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 1.763380527496338, | |
| "learning_rate": 1.2195177272648127e-06, | |
| "loss": 0.9187, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "eval_loss": 1.0419065952301025, | |
| "eval_mean_token_accuracy": 0.7836943608522415, | |
| "eval_num_tokens": 357626.0, | |
| "eval_runtime": 97.2575, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "grad_norm": 1.7301427125930786, | |
| "learning_rate": 6.983988851228473e-07, | |
| "loss": 0.9192, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 5.7, | |
| "eval_loss": 1.0414124727249146, | |
| "eval_mean_token_accuracy": 0.784173795580864, | |
| "eval_num_tokens": 363989.0, | |
| "eval_runtime": 97.2552, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "grad_norm": 1.9148461818695068, | |
| "learning_rate": 3.208981984511195e-07, | |
| "loss": 0.9075, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "eval_loss": 1.0412297248840332, | |
| "eval_mean_token_accuracy": 0.7838809263706207, | |
| "eval_num_tokens": 370407.0, | |
| "eval_runtime": 97.2933, | |
| "eval_samples_per_second": 2.056, | |
| "eval_steps_per_second": 1.028, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 5.9, | |
| "grad_norm": 1.3000129461288452, | |
| "learning_rate": 8.811534659234899e-08, | |
| "loss": 0.9122, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 5.9, | |
| "eval_loss": 1.041398048400879, | |
| "eval_mean_token_accuracy": 0.7838524436950683, | |
| "eval_num_tokens": 376689.0, | |
| "eval_runtime": 97.3246, | |
| "eval_samples_per_second": 2.055, | |
| "eval_steps_per_second": 1.027, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 1.879518747329712, | |
| "learning_rate": 7.284382296801617e-10, | |
| "loss": 0.9134, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 1.0410884618759155, | |
| "eval_mean_token_accuracy": 0.7832156884670257, | |
| "eval_num_tokens": 383118.0, | |
| "eval_runtime": 97.3129, | |
| "eval_samples_per_second": 2.055, | |
| "eval_steps_per_second": 1.028, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.777512987967488e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |