WilliamHH's picture
Training in progress, step 2000
7a4f829 verified
{
"best_global_step": 2000,
"best_metric": 0.34400272369384766,
"best_model_checkpoint": "Assignment4_Distilled_ModernBERT/run-2/checkpoint-2000",
"epoch": 4.1928721174004195,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": true,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.20964360587002095,
"grad_norm": 15.277114868164062,
"learning_rate": 5.99213045445736e-05,
"loss": 6.7857,
"step": 100
},
{
"epoch": 0.20964360587002095,
"eval_accuracy": 0.7109677419354838,
"eval_loss": 3.844492197036743,
"eval_runtime": 24.4798,
"eval_samples_per_second": 126.635,
"eval_steps_per_second": 15.85,
"step": 100
},
{
"epoch": 0.4192872117400419,
"grad_norm": 9.93507194519043,
"learning_rate": 5.968245320843699e-05,
"loss": 2.8413,
"step": 200
},
{
"epoch": 0.4192872117400419,
"eval_accuracy": 0.8938709677419355,
"eval_loss": 1.8548765182495117,
"eval_runtime": 24.5612,
"eval_samples_per_second": 126.215,
"eval_steps_per_second": 15.797,
"step": 200
},
{
"epoch": 0.6289308176100629,
"grad_norm": 3.672398805618286,
"learning_rate": 5.928471649139723e-05,
"loss": 1.4945,
"step": 300
},
{
"epoch": 0.6289308176100629,
"eval_accuracy": 0.94,
"eval_loss": 1.1517655849456787,
"eval_runtime": 24.6593,
"eval_samples_per_second": 125.713,
"eval_steps_per_second": 15.734,
"step": 300
},
{
"epoch": 0.8385744234800838,
"grad_norm": 9.015824317932129,
"learning_rate": 5.87302234139009e-05,
"loss": 1.0247,
"step": 400
},
{
"epoch": 0.8385744234800838,
"eval_accuracy": 0.9532258064516129,
"eval_loss": 0.9113911390304565,
"eval_runtime": 24.4669,
"eval_samples_per_second": 126.702,
"eval_steps_per_second": 15.858,
"step": 400
},
{
"epoch": 1.0482180293501049,
"grad_norm": 13.825676918029785,
"learning_rate": 5.802194208788969e-05,
"loss": 0.7584,
"step": 500
},
{
"epoch": 1.0482180293501049,
"eval_accuracy": 0.9558064516129032,
"eval_loss": 0.7309688329696655,
"eval_runtime": 24.9176,
"eval_samples_per_second": 124.41,
"eval_steps_per_second": 15.571,
"step": 500
},
{
"epoch": 1.2578616352201257,
"grad_norm": 2.16816782951355,
"learning_rate": 5.716366382897622e-05,
"loss": 0.5289,
"step": 600
},
{
"epoch": 1.2578616352201257,
"eval_accuracy": 0.9616129032258065,
"eval_loss": 0.6186437606811523,
"eval_runtime": 24.8855,
"eval_samples_per_second": 124.571,
"eval_steps_per_second": 15.591,
"step": 600
},
{
"epoch": 1.4675052410901468,
"grad_norm": 3.3350861072540283,
"learning_rate": 5.6159982862143515e-05,
"loss": 0.4857,
"step": 700
},
{
"epoch": 1.4675052410901468,
"eval_accuracy": 0.96,
"eval_loss": 0.5846336483955383,
"eval_runtime": 24.8682,
"eval_samples_per_second": 124.657,
"eval_steps_per_second": 15.602,
"step": 700
},
{
"epoch": 1.6771488469601676,
"grad_norm": 1.4833406209945679,
"learning_rate": 5.5016271729600304e-05,
"loss": 0.4169,
"step": 800
},
{
"epoch": 1.6771488469601676,
"eval_accuracy": 0.9638709677419355,
"eval_loss": 0.5170760750770569,
"eval_runtime": 31.1132,
"eval_samples_per_second": 99.636,
"eval_steps_per_second": 12.471,
"step": 800
},
{
"epoch": 1.8867924528301887,
"grad_norm": 8.999539375305176,
"learning_rate": 5.3738652532429715e-05,
"loss": 0.3953,
"step": 900
},
{
"epoch": 1.8867924528301887,
"eval_accuracy": 0.9683870967741935,
"eval_loss": 0.4950183629989624,
"eval_runtime": 25.9065,
"eval_samples_per_second": 119.661,
"eval_steps_per_second": 14.977,
"step": 900
},
{
"epoch": 2.0964360587002098,
"grad_norm": 1.5073931217193604,
"learning_rate": 5.2333964159970384e-05,
"loss": 0.3393,
"step": 1000
},
{
"epoch": 2.0964360587002098,
"eval_accuracy": 0.9674193548387097,
"eval_loss": 0.45995765924453735,
"eval_runtime": 25.1127,
"eval_samples_per_second": 123.444,
"eval_steps_per_second": 15.45,
"step": 1000
},
{
"epoch": 2.3060796645702304,
"grad_norm": 1.204892873764038,
"learning_rate": 5.080972568234569e-05,
"loss": 0.2747,
"step": 1100
},
{
"epoch": 2.3060796645702304,
"eval_accuracy": 0.9661290322580646,
"eval_loss": 0.44675561785697937,
"eval_runtime": 24.863,
"eval_samples_per_second": 124.683,
"eval_steps_per_second": 15.606,
"step": 1100
},
{
"epoch": 2.5157232704402515,
"grad_norm": 1.581334114074707,
"learning_rate": 4.9174096102095087e-05,
"loss": 0.2703,
"step": 1200
},
{
"epoch": 2.5157232704402515,
"eval_accuracy": 0.9664516129032258,
"eval_loss": 0.43446871638298035,
"eval_runtime": 24.788,
"eval_samples_per_second": 125.061,
"eval_steps_per_second": 15.653,
"step": 1200
},
{
"epoch": 2.7253668763102725,
"grad_norm": 1.4468449354171753,
"learning_rate": 4.7435830680350456e-05,
"loss": 0.2482,
"step": 1300
},
{
"epoch": 2.7253668763102725,
"eval_accuracy": 0.967741935483871,
"eval_loss": 0.4083913564682007,
"eval_runtime": 30.9276,
"eval_samples_per_second": 100.234,
"eval_steps_per_second": 12.545,
"step": 1300
},
{
"epoch": 2.9350104821802936,
"grad_norm": 1.0635446310043335,
"learning_rate": 4.5604234071336463e-05,
"loss": 0.2342,
"step": 1400
},
{
"epoch": 2.9350104821802936,
"eval_accuracy": 0.9674193548387097,
"eval_loss": 0.3973832428455353,
"eval_runtime": 25.2744,
"eval_samples_per_second": 122.654,
"eval_steps_per_second": 15.351,
"step": 1400
},
{
"epoch": 3.1446540880503147,
"grad_norm": 1.0511268377304077,
"learning_rate": 4.368911051605842e-05,
"loss": 0.2073,
"step": 1500
},
{
"epoch": 3.1446540880503147,
"eval_accuracy": 0.9680645161290322,
"eval_loss": 0.3778529465198517,
"eval_runtime": 24.3903,
"eval_samples_per_second": 127.1,
"eval_steps_per_second": 15.908,
"step": 1500
},
{
"epoch": 3.3542976939203353,
"grad_norm": 1.3159152269363403,
"learning_rate": 4.1700711361782675e-05,
"loss": 0.1973,
"step": 1600
},
{
"epoch": 3.3542976939203353,
"eval_accuracy": 0.9696774193548388,
"eval_loss": 0.3735784590244293,
"eval_runtime": 24.7557,
"eval_samples_per_second": 125.224,
"eval_steps_per_second": 15.673,
"step": 1600
},
{
"epoch": 3.5639412997903563,
"grad_norm": 0.8998211622238159,
"learning_rate": 3.9649680188229416e-05,
"loss": 0.1876,
"step": 1700
},
{
"epoch": 3.5639412997903563,
"eval_accuracy": 0.9696774193548388,
"eval_loss": 0.3719204366207123,
"eval_runtime": 25.0523,
"eval_samples_per_second": 123.741,
"eval_steps_per_second": 15.488,
"step": 1700
},
{
"epoch": 3.7735849056603774,
"grad_norm": 1.0484949350357056,
"learning_rate": 3.754699583420843e-05,
"loss": 0.1836,
"step": 1800
},
{
"epoch": 3.7735849056603774,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.35943037271499634,
"eval_runtime": 29.7226,
"eval_samples_per_second": 104.298,
"eval_steps_per_second": 13.054,
"step": 1800
},
{
"epoch": 3.9832285115303985,
"grad_norm": 1.2159234285354614,
"learning_rate": 3.5403913629667045e-05,
"loss": 0.184,
"step": 1900
},
{
"epoch": 3.9832285115303985,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.35340631008148193,
"eval_runtime": 26.1611,
"eval_samples_per_second": 118.496,
"eval_steps_per_second": 14.831,
"step": 1900
},
{
"epoch": 4.1928721174004195,
"grad_norm": 0.8908376097679138,
"learning_rate": 3.323190514772574e-05,
"loss": 0.1558,
"step": 2000
},
{
"epoch": 4.1928721174004195,
"eval_accuracy": 0.97,
"eval_loss": 0.34400272369384766,
"eval_runtime": 24.8773,
"eval_samples_per_second": 124.612,
"eval_steps_per_second": 15.597,
"step": 2000
}
],
"logging_steps": 100,
"max_steps": 4293,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 896601881570736.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": {
"alpha": 0.497477897443408,
"num_train_epochs": 9,
"temperature": 7
}
}