{
  "best_global_step": 190,
  "best_metric": 6.224213600158691,
  "best_model_checkpoint": "/kaggle/working/qwen-model-finetuned/checkpoint-190",
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 504.0,
      "learning_rate": 9.800000000000001e-06,
      "loss": 10.9305,
      "mean_token_accuracy": 0.022239863348659128,
      "num_tokens": 81920.0,
      "step": 5
    },
    {
      "epoch": 1.0,
      "grad_norm": 528.0,
      "learning_rate": 9.55e-06,
      "loss": 9.7037,
      "mean_token_accuracy": 0.03139729846923335,
      "num_tokens": 157696.0,
      "step": 10
    },
    {
      "epoch": 1.0,
      "eval_loss": 9.371186256408691,
      "eval_mean_token_accuracy": 0.025891548436548974,
      "eval_num_tokens": 157696.0,
      "eval_runtime": 10.0705,
      "eval_samples_per_second": 0.894,
      "eval_steps_per_second": 0.894,
      "step": 10
    },
    {
      "epoch": 1.5194805194805194,
      "grad_norm": 374.0,
      "learning_rate": 9.3e-06,
      "loss": 9.0276,
      "mean_token_accuracy": 0.04255007305182516,
      "num_tokens": 239616.0,
      "step": 15
    },
    {
      "epoch": 2.0,
      "grad_norm": 3504.0,
      "learning_rate": 9.050000000000001e-06,
      "loss": 8.5915,
      "mean_token_accuracy": 0.06423375010490417,
      "num_tokens": 315392.0,
      "step": 20
    },
    {
      "epoch": 2.0,
      "eval_loss": 8.596755981445312,
      "eval_mean_token_accuracy": 0.06323617200056712,
      "eval_num_tokens": 315392.0,
      "eval_runtime": 9.9888,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 20
    },
    {
      "epoch": 2.5194805194805197,
      "grad_norm": 502.0,
      "learning_rate": 8.8e-06,
      "loss": 8.3693,
      "mean_token_accuracy": 0.0784318515099585,
      "num_tokens": 397312.0,
      "step": 25
    },
    {
      "epoch": 3.0,
      "grad_norm": 288.0,
      "learning_rate": 8.550000000000001e-06,
      "loss": 8.2333,
      "mean_token_accuracy": 0.07881012428048495,
      "num_tokens": 473088.0,
      "step": 30
    },
    {
      "epoch": 3.0,
      "eval_loss": 8.20586109161377,
      "eval_mean_token_accuracy": 0.06926124874088499,
      "eval_num_tokens": 473088.0,
      "eval_runtime": 9.9908,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 30
    },
    {
      "epoch": 3.5194805194805197,
      "grad_norm": 249.0,
      "learning_rate": 8.3e-06,
      "loss": 8.0225,
      "mean_token_accuracy": 0.09137762561440468,
      "num_tokens": 555008.0,
      "step": 35
    },
    {
      "epoch": 4.0,
      "grad_norm": 218.0,
      "learning_rate": 8.050000000000001e-06,
      "loss": 7.8343,
      "mean_token_accuracy": 0.09395423753036035,
      "num_tokens": 630784.0,
      "step": 40
    },
    {
      "epoch": 4.0,
      "eval_loss": 7.793511390686035,
      "eval_mean_token_accuracy": 0.0885306414630678,
      "eval_num_tokens": 630784.0,
      "eval_runtime": 9.9692,
      "eval_samples_per_second": 0.903,
      "eval_steps_per_second": 0.903,
      "step": 40
    },
    {
      "epoch": 4.51948051948052,
      "grad_norm": 124.5,
      "learning_rate": 7.800000000000002e-06,
      "loss": 7.667,
      "mean_token_accuracy": 0.10079384371638297,
      "num_tokens": 712704.0,
      "step": 45
    },
    {
      "epoch": 5.0,
      "grad_norm": 199.0,
      "learning_rate": 7.5500000000000006e-06,
      "loss": 7.5062,
      "mean_token_accuracy": 0.10965288692229502,
      "num_tokens": 788480.0,
      "step": 50
    },
    {
      "epoch": 5.0,
      "eval_loss": 7.532417297363281,
      "eval_mean_token_accuracy": 0.11339086873663796,
      "eval_num_tokens": 788480.0,
      "eval_runtime": 9.9854,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 50
    },
    {
      "epoch": 5.51948051948052,
      "grad_norm": 189.0,
      "learning_rate": 7.3e-06,
      "loss": 7.4228,
      "mean_token_accuracy": 0.14055935498327016,
      "num_tokens": 870400.0,
      "step": 55
    },
    {
      "epoch": 6.0,
      "grad_norm": 179.0,
      "learning_rate": 7.05e-06,
      "loss": 7.1983,
      "mean_token_accuracy": 0.1601288616657257,
      "num_tokens": 946176.0,
      "step": 60
    },
    {
      "epoch": 6.0,
      "eval_loss": 7.2402825355529785,
      "eval_mean_token_accuracy": 0.15719481143686506,
      "eval_num_tokens": 946176.0,
      "eval_runtime": 9.9843,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 60
    },
    {
      "epoch": 6.51948051948052,
      "grad_norm": 212.0,
      "learning_rate": 6.800000000000001e-06,
      "loss": 7.1554,
      "mean_token_accuracy": 0.16111382581293582,
      "num_tokens": 1028096.0,
      "step": 65
    },
    {
      "epoch": 7.0,
      "grad_norm": 240.0,
      "learning_rate": 6.550000000000001e-06,
      "loss": 7.0146,
      "mean_token_accuracy": 0.16839409096015467,
      "num_tokens": 1103872.0,
      "step": 70
    },
    {
      "epoch": 7.0,
      "eval_loss": 7.037937641143799,
      "eval_mean_token_accuracy": 0.16300277080800799,
      "eval_num_tokens": 1103872.0,
      "eval_runtime": 9.9771,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.902,
      "step": 70
    },
    {
      "epoch": 7.51948051948052,
      "grad_norm": 89.0,
      "learning_rate": 6.300000000000001e-06,
      "loss": 6.9988,
      "mean_token_accuracy": 0.1658891063183546,
      "num_tokens": 1185792.0,
      "step": 75
    },
    {
      "epoch": 8.0,
      "grad_norm": 99.5,
      "learning_rate": 6.0500000000000005e-06,
      "loss": 6.8182,
      "mean_token_accuracy": 0.1801449720923965,
      "num_tokens": 1261568.0,
      "step": 80
    },
    {
      "epoch": 8.0,
      "eval_loss": 6.879114627838135,
      "eval_mean_token_accuracy": 0.17081908716095817,
      "eval_num_tokens": 1261568.0,
      "eval_runtime": 9.9794,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.902,
      "step": 80
    },
    {
      "epoch": 8.519480519480519,
      "grad_norm": 79.0,
      "learning_rate": 5.8e-06,
      "loss": 6.7599,
      "mean_token_accuracy": 0.18248656746000053,
      "num_tokens": 1343488.0,
      "step": 85
    },
    {
      "epoch": 9.0,
      "grad_norm": 244.0,
      "learning_rate": 5.550000000000001e-06,
      "loss": 6.7712,
      "mean_token_accuracy": 0.18557150158527735,
      "num_tokens": 1419264.0,
      "step": 90
    },
    {
      "epoch": 9.0,
      "eval_loss": 6.76102352142334,
      "eval_mean_token_accuracy": 0.18694023622406852,
      "eval_num_tokens": 1419264.0,
      "eval_runtime": 9.9905,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 90
    },
    {
      "epoch": 9.519480519480519,
      "grad_norm": 122.0,
      "learning_rate": 5.300000000000001e-06,
      "loss": 6.7039,
      "mean_token_accuracy": 0.18946018554270266,
      "num_tokens": 1501184.0,
      "step": 95
    },
    {
      "epoch": 10.0,
      "grad_norm": 182.0,
      "learning_rate": 5.050000000000001e-06,
      "loss": 6.618,
      "mean_token_accuracy": 0.1998177944002925,
      "num_tokens": 1576960.0,
      "step": 100
    },
    {
      "epoch": 10.0,
      "eval_loss": 6.640429496765137,
      "eval_mean_token_accuracy": 0.19361667500601876,
      "eval_num_tokens": 1576960.0,
      "eval_runtime": 9.9854,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 100
    },
    {
      "epoch": 10.519480519480519,
      "grad_norm": 103.5,
      "learning_rate": 4.800000000000001e-06,
      "loss": 6.5581,
      "mean_token_accuracy": 0.19838788434863092,
      "num_tokens": 1658880.0,
      "step": 105
    },
    {
      "epoch": 11.0,
      "grad_norm": 132.0,
      "learning_rate": 4.5500000000000005e-06,
      "loss": 6.5207,
      "mean_token_accuracy": 0.19835223559592222,
      "num_tokens": 1734656.0,
      "step": 110
    },
    {
      "epoch": 11.0,
      "eval_loss": 6.540436744689941,
      "eval_mean_token_accuracy": 0.19757911231782702,
      "eval_num_tokens": 1734656.0,
      "eval_runtime": 9.9902,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 110
    },
    {
      "epoch": 11.519480519480519,
      "grad_norm": 82.0,
      "learning_rate": 4.3e-06,
      "loss": 6.487,
      "mean_token_accuracy": 0.1987176351249218,
      "num_tokens": 1816576.0,
      "step": 115
    },
    {
      "epoch": 12.0,
      "grad_norm": 55.0,
      "learning_rate": 4.05e-06,
      "loss": 6.381,
      "mean_token_accuracy": 0.20363352991439201,
      "num_tokens": 1892352.0,
      "step": 120
    },
    {
      "epoch": 12.0,
      "eval_loss": 6.44816255569458,
      "eval_mean_token_accuracy": 0.19850187169180977,
      "eval_num_tokens": 1892352.0,
      "eval_runtime": 9.9827,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.902,
      "step": 120
    },
    {
      "epoch": 12.519480519480519,
      "grad_norm": 109.5,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 6.3201,
      "mean_token_accuracy": 0.207901806011796,
      "num_tokens": 1974272.0,
      "step": 125
    },
    {
      "epoch": 13.0,
      "grad_norm": 81.0,
      "learning_rate": 3.5500000000000003e-06,
      "loss": 6.3889,
      "mean_token_accuracy": 0.1981937969858582,
      "num_tokens": 2050048.0,
      "step": 130
    },
    {
      "epoch": 13.0,
      "eval_loss": 6.373791217803955,
      "eval_mean_token_accuracy": 0.19828475183910793,
      "eval_num_tokens": 2050048.0,
      "eval_runtime": 9.9649,
      "eval_samples_per_second": 0.903,
      "eval_steps_per_second": 0.903,
      "step": 130
    },
    {
      "epoch": 13.519480519480519,
      "grad_norm": 57.75,
      "learning_rate": 3.3000000000000006e-06,
      "loss": 6.3537,
      "mean_token_accuracy": 0.20020762123167515,
      "num_tokens": 2131968.0,
      "step": 135
    },
    {
      "epoch": 14.0,
      "grad_norm": 131.0,
      "learning_rate": 3.05e-06,
      "loss": 6.2238,
      "mean_token_accuracy": 0.20939014046578794,
      "num_tokens": 2207744.0,
      "step": 140
    },
    {
      "epoch": 14.0,
      "eval_loss": 6.317364692687988,
      "eval_mean_token_accuracy": 0.20094447003470528,
      "eval_num_tokens": 2207744.0,
      "eval_runtime": 9.9793,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.902,
      "step": 140
    },
    {
      "epoch": 14.519480519480519,
      "grad_norm": 111.0,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 6.2564,
      "mean_token_accuracy": 0.2047997061163187,
      "num_tokens": 2289664.0,
      "step": 145
    },
    {
      "epoch": 15.0,
      "grad_norm": 60.5,
      "learning_rate": 2.55e-06,
      "loss": 6.237,
      "mean_token_accuracy": 0.2071191845713435,
      "num_tokens": 2365440.0,
      "step": 150
    },
    {
      "epoch": 15.0,
      "eval_loss": 6.276952743530273,
      "eval_mean_token_accuracy": 0.20354990826712716,
      "eval_num_tokens": 2365440.0,
      "eval_runtime": 9.9884,
      "eval_samples_per_second": 0.901,
      "eval_steps_per_second": 0.901,
      "step": 150
    },
    {
      "epoch": 15.519480519480519,
      "grad_norm": 57.0,
      "learning_rate": 2.3000000000000004e-06,
      "loss": 6.1936,
      "mean_token_accuracy": 0.2097093306481838,
      "num_tokens": 2447360.0,
      "step": 155
    },
    {
      "epoch": 16.0,
      "grad_norm": 92.5,
      "learning_rate": 2.05e-06,
      "loss": 6.2567,
      "mean_token_accuracy": 0.20516510428609075,
      "num_tokens": 2523136.0,
      "step": 160
    },
    {
      "epoch": 16.0,
      "eval_loss": 6.257329940795898,
      "eval_mean_token_accuracy": 0.20631818804475996,
      "eval_num_tokens": 2523136.0,
      "eval_runtime": 10.0306,
      "eval_samples_per_second": 0.897,
      "eval_steps_per_second": 0.897,
      "step": 160
    },
    {
      "epoch": 16.51948051948052,
      "grad_norm": 57.75,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 6.1925,
      "mean_token_accuracy": 0.2081704933196306,
      "num_tokens": 2605056.0,
      "step": 165
    },
    {
      "epoch": 17.0,
      "grad_norm": 67.5,
      "learning_rate": 1.5500000000000002e-06,
      "loss": 6.2109,
      "mean_token_accuracy": 0.2084659144685075,
      "num_tokens": 2680832.0,
      "step": 170
    },
    {
      "epoch": 17.0,
      "eval_loss": 6.232978343963623,
      "eval_mean_token_accuracy": 0.20924930771191916,
      "eval_num_tokens": 2680832.0,
      "eval_runtime": 9.9679,
      "eval_samples_per_second": 0.903,
      "eval_steps_per_second": 0.903,
      "step": 170
    },
    {
      "epoch": 17.51948051948052,
      "grad_norm": 66.0,
      "learning_rate": 1.3e-06,
      "loss": 6.1513,
      "mean_token_accuracy": 0.21311675421893597,
      "num_tokens": 2762752.0,
      "step": 175
    },
    {
      "epoch": 18.0,
      "grad_norm": 62.25,
      "learning_rate": 1.0500000000000001e-06,
      "loss": 6.2227,
      "mean_token_accuracy": 0.20644582042822968,
      "num_tokens": 2838528.0,
      "step": 180
    },
    {
      "epoch": 18.0,
      "eval_loss": 6.226585388183594,
      "eval_mean_token_accuracy": 0.19757911231782702,
      "eval_num_tokens": 2838528.0,
      "eval_runtime": 9.9755,
      "eval_samples_per_second": 0.902,
      "eval_steps_per_second": 0.902,
      "step": 180
    },
    {
      "epoch": 18.51948051948052,
      "grad_norm": 47.75,
      "learning_rate": 8.000000000000001e-07,
      "loss": 6.1354,
      "mean_token_accuracy": 0.20610649585723878,
      "num_tokens": 2920448.0,
      "step": 185
    },
    {
      "epoch": 19.0,
      "grad_norm": 58.25,
      "learning_rate": 5.5e-07,
      "loss": 6.2115,
      "mean_token_accuracy": 0.20702676193134203,
      "num_tokens": 2996224.0,
      "step": 190
    },
    {
      "epoch": 19.0,
      "eval_loss": 6.224213600158691,
      "eval_mean_token_accuracy": 0.20870650642448002,
      "eval_num_tokens": 2996224.0,
      "eval_runtime": 9.9648,
      "eval_samples_per_second": 0.903,
      "eval_steps_per_second": 0.903,
      "step": 190
    },
    {
      "epoch": 19.51948051948052,
      "grad_norm": 58.75,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 6.1279,
      "mean_token_accuracy": 0.21613336391746998,
      "num_tokens": 3078144.0,
      "step": 195
    },
    {
      "epoch": 20.0,
      "grad_norm": 57.75,
      "learning_rate": 5.0000000000000004e-08,
      "loss": 6.2165,
      "mean_token_accuracy": 0.20515190266274116,
      "num_tokens": 3153920.0,
      "step": 200
    },
    {
      "epoch": 20.0,
      "eval_loss": 6.226177215576172,
      "eval_mean_token_accuracy": 0.20892362627718183,
      "eval_num_tokens": 3153920.0,
      "eval_runtime": 9.9552,
      "eval_samples_per_second": 0.904,
      "eval_steps_per_second": 0.904,
      "step": 200
    }
  ],
  "logging_steps": 5,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8335194712965120.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}