{
  "best_global_step": 2500,
  "best_metric": 3.794926643371582,
  "best_model_checkpoint": "./qwen3moe_tinystories_sft_global_balance/checkpoint-2500",
  "epoch": 0.9997473684210526,
  "eval_steps": 500,
  "global_step": 2968,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03368421052631579,
      "grad_norm": 492097.03125,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 11.6957,
      "step": 100
    },
    {
      "epoch": 0.06736842105263158,
      "grad_norm": 526197.25,
      "learning_rate": 3.35016835016835e-05,
      "loss": 10.5167,
      "step": 200
    },
    {
      "epoch": 0.10105263157894737,
      "grad_norm": 486177.34375,
      "learning_rate": 4.999993082936328e-05,
      "loss": 8.9635,
      "step": 300
    },
    {
      "epoch": 0.13473684210526315,
      "grad_norm": 394882.59375,
      "learning_rate": 4.982030277845304e-05,
      "loss": 7.2522,
      "step": 400
    },
    {
      "epoch": 0.16842105263157894,
      "grad_norm": 279043.53125,
      "learning_rate": 4.9297703006544226e-05,
      "loss": 6.0898,
      "step": 500
    },
    {
      "epoch": 0.16842105263157894,
      "eval_loss": 5.723438262939453,
      "eval_runtime": 135.9602,
      "eval_samples_per_second": 36.775,
      "eval_steps_per_second": 2.302,
      "step": 500
    },
    {
      "epoch": 0.20210526315789473,
      "grad_norm": 258667.03125,
      "learning_rate": 4.843935289787076e-05,
      "loss": 5.4572,
      "step": 600
    },
    {
      "epoch": 0.23578947368421052,
      "grad_norm": 308293.84375,
      "learning_rate": 4.725711329944238e-05,
      "loss": 5.051,
      "step": 700
    },
    {
      "epoch": 0.2694736842105263,
      "grad_norm": 280231.3125,
      "learning_rate": 4.5767320625577836e-05,
      "loss": 4.7862,
      "step": 800
    },
    {
      "epoch": 0.3031578947368421,
      "grad_norm": 383468.1875,
      "learning_rate": 4.399056111818752e-05,
      "loss": 4.5928,
      "step": 900
    },
    {
      "epoch": 0.3368421052631579,
      "grad_norm": 332836.21875,
      "learning_rate": 4.19513863821205e-05,
      "loss": 4.4388,
      "step": 1000
    },
    {
      "epoch": 0.3368421052631579,
      "eval_loss": 4.37507438659668,
      "eval_runtime": 134.9276,
      "eval_samples_per_second": 37.057,
      "eval_steps_per_second": 2.32,
      "step": 1000
    },
    {
      "epoch": 0.3705263157894737,
      "grad_norm": 378902.09375,
      "learning_rate": 3.967797412636315e-05,
      "loss": 4.3341,
      "step": 1100
    },
    {
      "epoch": 0.40421052631578946,
      "grad_norm": 377249.0,
      "learning_rate": 3.7201738799033065e-05,
      "loss": 4.2245,
      "step": 1200
    },
    {
      "epoch": 0.4378947368421053,
      "grad_norm": 357383.0625,
      "learning_rate": 3.4556897496488504e-05,
      "loss": 4.1398,
      "step": 1300
    },
    {
      "epoch": 0.47157894736842104,
      "grad_norm": 475150.9375,
      "learning_rate": 3.177999714490516e-05,
      "loss": 4.1092,
      "step": 1400
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 475546.25,
      "learning_rate": 2.890940948781592e-05,
      "loss": 4.0588,
      "step": 1500
    },
    {
      "epoch": 0.5052631578947369,
      "eval_loss": 4.014543533325195,
      "eval_runtime": 136.1182,
      "eval_samples_per_second": 36.733,
      "eval_steps_per_second": 2.299,
      "step": 1500
    },
    {
      "epoch": 0.5389473684210526,
      "grad_norm": 510377.0625,
      "learning_rate": 2.5984800857973353e-05,
      "loss": 3.9888,
      "step": 1600
    },
    {
      "epoch": 0.5726315789473684,
      "grad_norm": 433969.625,
      "learning_rate": 2.3046584060329007e-05,
      "loss": 3.9544,
      "step": 1700
    },
    {
      "epoch": 0.6063157894736843,
      "grad_norm": 430580.25,
      "learning_rate": 2.0135359940116327e-05,
      "loss": 3.9086,
      "step": 1800
    },
    {
      "epoch": 0.64,
      "grad_norm": 443697.03125,
      "learning_rate": 1.729135635255667e-05,
      "loss": 3.9023,
      "step": 1900
    },
    {
      "epoch": 0.6736842105263158,
      "grad_norm": 382403.0,
      "learning_rate": 1.455387228661314e-05,
      "loss": 3.8641,
      "step": 2000
    },
    {
      "epoch": 0.6736842105263158,
      "eval_loss": 3.8544445037841797,
      "eval_runtime": 134.3764,
      "eval_samples_per_second": 37.209,
      "eval_steps_per_second": 2.329,
      "step": 2000
    },
    {
      "epoch": 0.7073684210526315,
      "grad_norm": 408598.40625,
      "learning_rate": 1.1960734823997168e-05,
      "loss": 3.8511,
      "step": 2100
    },
    {
      "epoch": 0.7410526315789474,
      "grad_norm": 461325.09375,
      "learning_rate": 9.547776437272746e-06,
      "loss": 3.832,
      "step": 2200
    },
    {
      "epoch": 0.7747368421052632,
      "grad_norm": 427811.28125,
      "learning_rate": 7.348339849853858e-06,
      "loss": 3.8217,
      "step": 2300
    },
    {
      "epoch": 0.8084210526315789,
      "grad_norm": 465533.09375,
      "learning_rate": 5.39281729983474e-06,
      "loss": 3.81,
      "step": 2400
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 420344.15625,
      "learning_rate": 3.7082305741943213e-06,
      "loss": 3.7949,
      "step": 2500
    },
    {
      "epoch": 0.8421052631578947,
      "eval_loss": 3.794926643371582,
      "eval_runtime": 133.9027,
      "eval_samples_per_second": 37.341,
      "eval_steps_per_second": 2.338,
      "step": 2500
    },
    {
      "epoch": 0.8757894736842106,
      "grad_norm": 433467.875,
      "learning_rate": 2.3178576165427735e-06,
      "loss": 3.7939,
      "step": 2600
    },
    {
      "epoch": 0.9094736842105263,
      "grad_norm": 398190.46875,
      "learning_rate": 1.2409108680163734e-06,
      "loss": 3.7832,
      "step": 2700
    },
    {
      "epoch": 0.9431578947368421,
      "grad_norm": 369212.90625,
      "learning_rate": 4.922717860680298e-07,
      "loss": 3.7862,
      "step": 2800
    },
    {
      "epoch": 0.9768421052631578,
      "grad_norm": 457141.46875,
      "learning_rate": 8.228520962394182e-08,
      "loss": 3.8013,
      "step": 2900
    }
  ],
  "logging_steps": 100,
  "max_steps": 2968,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9101710322884608.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}