{ "best_global_step": 190, "best_metric": 6.224213600158691, "best_model_checkpoint": "/kaggle/working/qwen-model-finetuned/checkpoint-190", "epoch": 20.0, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5194805194805194, "grad_norm": 504.0, "learning_rate": 9.800000000000001e-06, "loss": 10.9305, "mean_token_accuracy": 0.022239863348659128, "num_tokens": 81920.0, "step": 5 }, { "epoch": 1.0, "grad_norm": 528.0, "learning_rate": 9.55e-06, "loss": 9.7037, "mean_token_accuracy": 0.03139729846923335, "num_tokens": 157696.0, "step": 10 }, { "epoch": 1.0, "eval_loss": 9.371186256408691, "eval_mean_token_accuracy": 0.025891548436548974, "eval_num_tokens": 157696.0, "eval_runtime": 10.0705, "eval_samples_per_second": 0.894, "eval_steps_per_second": 0.894, "step": 10 }, { "epoch": 1.5194805194805194, "grad_norm": 374.0, "learning_rate": 9.3e-06, "loss": 9.0276, "mean_token_accuracy": 0.04255007305182516, "num_tokens": 239616.0, "step": 15 }, { "epoch": 2.0, "grad_norm": 3504.0, "learning_rate": 9.050000000000001e-06, "loss": 8.5915, "mean_token_accuracy": 0.06423375010490417, "num_tokens": 315392.0, "step": 20 }, { "epoch": 2.0, "eval_loss": 8.596755981445312, "eval_mean_token_accuracy": 0.06323617200056712, "eval_num_tokens": 315392.0, "eval_runtime": 9.9888, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 20 }, { "epoch": 2.5194805194805197, "grad_norm": 502.0, "learning_rate": 8.8e-06, "loss": 8.3693, "mean_token_accuracy": 0.0784318515099585, "num_tokens": 397312.0, "step": 25 }, { "epoch": 3.0, "grad_norm": 288.0, "learning_rate": 8.550000000000001e-06, "loss": 8.2333, "mean_token_accuracy": 0.07881012428048495, "num_tokens": 473088.0, "step": 30 }, { "epoch": 3.0, "eval_loss": 8.20586109161377, "eval_mean_token_accuracy": 0.06926124874088499, "eval_num_tokens": 473088.0, "eval_runtime": 9.9908, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 30 }, { "epoch": 3.5194805194805197, "grad_norm": 249.0, "learning_rate": 8.3e-06, "loss": 8.0225, "mean_token_accuracy": 0.09137762561440468, "num_tokens": 555008.0, "step": 35 }, { "epoch": 4.0, "grad_norm": 218.0, "learning_rate": 8.050000000000001e-06, "loss": 7.8343, "mean_token_accuracy": 0.09395423753036035, "num_tokens": 630784.0, "step": 40 }, { "epoch": 4.0, "eval_loss": 7.793511390686035, "eval_mean_token_accuracy": 0.0885306414630678, "eval_num_tokens": 630784.0, "eval_runtime": 9.9692, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.903, "step": 40 }, { "epoch": 4.51948051948052, "grad_norm": 124.5, "learning_rate": 7.800000000000002e-06, "loss": 7.667, "mean_token_accuracy": 0.10079384371638297, "num_tokens": 712704.0, "step": 45 }, { "epoch": 5.0, "grad_norm": 199.0, "learning_rate": 7.5500000000000006e-06, "loss": 7.5062, "mean_token_accuracy": 0.10965288692229502, "num_tokens": 788480.0, "step": 50 }, { "epoch": 5.0, "eval_loss": 7.532417297363281, "eval_mean_token_accuracy": 0.11339086873663796, "eval_num_tokens": 788480.0, "eval_runtime": 9.9854, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 50 }, { "epoch": 5.51948051948052, "grad_norm": 189.0, "learning_rate": 7.3e-06, "loss": 7.4228, "mean_token_accuracy": 0.14055935498327016, "num_tokens": 870400.0, "step": 55 }, { "epoch": 6.0, "grad_norm": 179.0, "learning_rate": 7.05e-06, "loss": 7.1983, "mean_token_accuracy": 0.1601288616657257, "num_tokens": 946176.0, "step": 60 }, { "epoch": 6.0, "eval_loss": 7.2402825355529785, "eval_mean_token_accuracy": 0.15719481143686506, "eval_num_tokens": 946176.0, "eval_runtime": 9.9843, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 60 }, { "epoch": 6.51948051948052, "grad_norm": 212.0, "learning_rate": 6.800000000000001e-06, "loss": 7.1554, "mean_token_accuracy": 0.16111382581293582, "num_tokens": 1028096.0, "step": 65 }, { "epoch": 7.0, "grad_norm": 240.0, "learning_rate": 6.550000000000001e-06, "loss": 7.0146, "mean_token_accuracy": 0.16839409096015467, "num_tokens": 1103872.0, "step": 70 }, { "epoch": 7.0, "eval_loss": 7.037937641143799, "eval_mean_token_accuracy": 0.16300277080800799, "eval_num_tokens": 1103872.0, "eval_runtime": 9.9771, "eval_samples_per_second": 0.902, "eval_steps_per_second": 0.902, "step": 70 }, { "epoch": 7.51948051948052, "grad_norm": 89.0, "learning_rate": 6.300000000000001e-06, "loss": 6.9988, "mean_token_accuracy": 0.1658891063183546, "num_tokens": 1185792.0, "step": 75 }, { "epoch": 8.0, "grad_norm": 99.5, "learning_rate": 6.0500000000000005e-06, "loss": 6.8182, "mean_token_accuracy": 0.1801449720923965, "num_tokens": 1261568.0, "step": 80 }, { "epoch": 8.0, "eval_loss": 6.879114627838135, "eval_mean_token_accuracy": 0.17081908716095817, "eval_num_tokens": 1261568.0, "eval_runtime": 9.9794, "eval_samples_per_second": 0.902, "eval_steps_per_second": 0.902, "step": 80 }, { "epoch": 8.519480519480519, "grad_norm": 79.0, "learning_rate": 5.8e-06, "loss": 6.7599, "mean_token_accuracy": 0.18248656746000053, "num_tokens": 1343488.0, "step": 85 }, { "epoch": 9.0, "grad_norm": 244.0, "learning_rate": 5.550000000000001e-06, "loss": 6.7712, "mean_token_accuracy": 0.18557150158527735, "num_tokens": 1419264.0, "step": 90 }, { "epoch": 9.0, "eval_loss": 6.76102352142334, "eval_mean_token_accuracy": 0.18694023622406852, "eval_num_tokens": 1419264.0, "eval_runtime": 9.9905, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 90 }, { "epoch": 9.519480519480519, "grad_norm": 122.0, "learning_rate": 5.300000000000001e-06, "loss": 6.7039, "mean_token_accuracy": 0.18946018554270266, "num_tokens": 1501184.0, "step": 95 }, { "epoch": 10.0, "grad_norm": 182.0, "learning_rate": 5.050000000000001e-06, "loss": 6.618, "mean_token_accuracy": 0.1998177944002925, "num_tokens": 1576960.0, "step": 100 }, { "epoch": 10.0, "eval_loss": 6.640429496765137, "eval_mean_token_accuracy": 0.19361667500601876, "eval_num_tokens": 1576960.0, "eval_runtime": 9.9854, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 100 }, { "epoch": 10.519480519480519, "grad_norm": 103.5, "learning_rate": 4.800000000000001e-06, "loss": 6.5581, "mean_token_accuracy": 0.19838788434863092, "num_tokens": 1658880.0, "step": 105 }, { "epoch": 11.0, "grad_norm": 132.0, "learning_rate": 4.5500000000000005e-06, "loss": 6.5207, "mean_token_accuracy": 0.19835223559592222, "num_tokens": 1734656.0, "step": 110 }, { "epoch": 11.0, "eval_loss": 6.540436744689941, "eval_mean_token_accuracy": 0.19757911231782702, "eval_num_tokens": 1734656.0, "eval_runtime": 9.9902, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 110 }, { "epoch": 11.519480519480519, "grad_norm": 82.0, "learning_rate": 4.3e-06, "loss": 6.487, "mean_token_accuracy": 0.1987176351249218, "num_tokens": 1816576.0, "step": 115 }, { "epoch": 12.0, "grad_norm": 55.0, "learning_rate": 4.05e-06, "loss": 6.381, "mean_token_accuracy": 0.20363352991439201, "num_tokens": 1892352.0, "step": 120 }, { "epoch": 12.0, "eval_loss": 6.44816255569458, "eval_mean_token_accuracy": 0.19850187169180977, "eval_num_tokens": 1892352.0, "eval_runtime": 9.9827, "eval_samples_per_second": 0.902, "eval_steps_per_second": 0.902, "step": 120 }, { "epoch": 12.519480519480519, "grad_norm": 109.5, "learning_rate": 3.8000000000000005e-06, "loss": 6.3201, "mean_token_accuracy": 0.207901806011796, "num_tokens": 1974272.0, "step": 125 }, { "epoch": 13.0, "grad_norm": 81.0, "learning_rate": 3.5500000000000003e-06, "loss": 6.3889, "mean_token_accuracy": 0.1981937969858582, "num_tokens": 2050048.0, "step": 130 }, { "epoch": 13.0, "eval_loss": 6.373791217803955, "eval_mean_token_accuracy": 0.19828475183910793, "eval_num_tokens": 2050048.0, "eval_runtime": 9.9649, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.903, "step": 130 }, { "epoch": 13.519480519480519, "grad_norm": 57.75, "learning_rate": 3.3000000000000006e-06, "loss": 6.3537, "mean_token_accuracy": 0.20020762123167515, "num_tokens": 2131968.0, "step": 135 }, { "epoch": 14.0, "grad_norm": 131.0, "learning_rate": 3.05e-06, "loss": 6.2238, "mean_token_accuracy": 0.20939014046578794, "num_tokens": 2207744.0, "step": 140 }, { "epoch": 14.0, "eval_loss": 6.317364692687988, "eval_mean_token_accuracy": 0.20094447003470528, "eval_num_tokens": 2207744.0, "eval_runtime": 9.9793, "eval_samples_per_second": 0.902, "eval_steps_per_second": 0.902, "step": 140 }, { "epoch": 14.519480519480519, "grad_norm": 111.0, "learning_rate": 2.8000000000000003e-06, "loss": 6.2564, "mean_token_accuracy": 0.2047997061163187, "num_tokens": 2289664.0, "step": 145 }, { "epoch": 15.0, "grad_norm": 60.5, "learning_rate": 2.55e-06, "loss": 6.237, "mean_token_accuracy": 0.2071191845713435, "num_tokens": 2365440.0, "step": 150 }, { "epoch": 15.0, "eval_loss": 6.276952743530273, "eval_mean_token_accuracy": 0.20354990826712716, "eval_num_tokens": 2365440.0, "eval_runtime": 9.9884, "eval_samples_per_second": 0.901, "eval_steps_per_second": 0.901, "step": 150 }, { "epoch": 15.519480519480519, "grad_norm": 57.0, "learning_rate": 2.3000000000000004e-06, "loss": 6.1936, "mean_token_accuracy": 0.2097093306481838, "num_tokens": 2447360.0, "step": 155 }, { "epoch": 16.0, "grad_norm": 92.5, "learning_rate": 2.05e-06, "loss": 6.2567, "mean_token_accuracy": 0.20516510428609075, "num_tokens": 2523136.0, "step": 160 }, { "epoch": 16.0, "eval_loss": 6.257329940795898, "eval_mean_token_accuracy": 0.20631818804475996, "eval_num_tokens": 2523136.0, "eval_runtime": 10.0306, "eval_samples_per_second": 0.897, "eval_steps_per_second": 0.897, "step": 160 }, { "epoch": 16.51948051948052, "grad_norm": 57.75, "learning_rate": 1.8000000000000001e-06, "loss": 6.1925, "mean_token_accuracy": 0.2081704933196306, "num_tokens": 2605056.0, "step": 165 }, { "epoch": 17.0, "grad_norm": 67.5, "learning_rate": 1.5500000000000002e-06, "loss": 6.2109, "mean_token_accuracy": 0.2084659144685075, "num_tokens": 2680832.0, "step": 170 }, { "epoch": 17.0, "eval_loss": 6.232978343963623, "eval_mean_token_accuracy": 0.20924930771191916, "eval_num_tokens": 2680832.0, "eval_runtime": 9.9679, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.903, "step": 170 }, { "epoch": 17.51948051948052, "grad_norm": 66.0, "learning_rate": 1.3e-06, "loss": 6.1513, "mean_token_accuracy": 0.21311675421893597, "num_tokens": 2762752.0, "step": 175 }, { "epoch": 18.0, "grad_norm": 62.25, "learning_rate": 1.0500000000000001e-06, "loss": 6.2227, "mean_token_accuracy": 0.20644582042822968, "num_tokens": 2838528.0, "step": 180 }, { "epoch": 18.0, "eval_loss": 6.226585388183594, "eval_mean_token_accuracy": 0.19757911231782702, "eval_num_tokens": 2838528.0, "eval_runtime": 9.9755, "eval_samples_per_second": 0.902, "eval_steps_per_second": 0.902, "step": 180 }, { "epoch": 18.51948051948052, "grad_norm": 47.75, "learning_rate": 8.000000000000001e-07, "loss": 6.1354, "mean_token_accuracy": 0.20610649585723878, "num_tokens": 2920448.0, "step": 185 }, { "epoch": 19.0, "grad_norm": 58.25, "learning_rate": 5.5e-07, "loss": 6.2115, "mean_token_accuracy": 0.20702676193134203, "num_tokens": 2996224.0, "step": 190 }, { "epoch": 19.0, "eval_loss": 6.224213600158691, "eval_mean_token_accuracy": 0.20870650642448002, "eval_num_tokens": 2996224.0, "eval_runtime": 9.9648, "eval_samples_per_second": 0.903, "eval_steps_per_second": 0.903, "step": 190 }, { "epoch": 19.51948051948052, "grad_norm": 58.75, "learning_rate": 3.0000000000000004e-07, "loss": 6.1279, "mean_token_accuracy": 0.21613336391746998, "num_tokens": 3078144.0, "step": 195 }, { "epoch": 20.0, "grad_norm": 57.75, "learning_rate": 5.0000000000000004e-08, "loss": 6.2165, "mean_token_accuracy": 0.20515190266274116, "num_tokens": 3153920.0, "step": 200 }, { "epoch": 20.0, "eval_loss": 6.226177215576172, "eval_mean_token_accuracy": 0.20892362627718183, "eval_num_tokens": 3153920.0, "eval_runtime": 9.9552, "eval_samples_per_second": 0.904, "eval_steps_per_second": 0.904, "step": 200 } ], "logging_steps": 5, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8335194712965120.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }