WilliamHH's picture
Training in progress, step 500
3c9dd4a verified
{
"best_global_step": 3900,
"best_metric": 0.3069113492965698,
"best_model_checkpoint": "Assignment4_Distilled_ModernBERT/run-2/checkpoint-3000",
"epoch": 9.0,
"eval_steps": 100,
"global_step": 4293,
"is_hyper_param_search": true,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.20964360587002095,
"grad_norm": 15.277114868164062,
"learning_rate": 5.99213045445736e-05,
"loss": 6.7857,
"step": 100
},
{
"epoch": 0.20964360587002095,
"eval_accuracy": 0.7109677419354838,
"eval_loss": 3.844492197036743,
"eval_runtime": 24.4798,
"eval_samples_per_second": 126.635,
"eval_steps_per_second": 15.85,
"step": 100
},
{
"epoch": 0.4192872117400419,
"grad_norm": 9.93507194519043,
"learning_rate": 5.968245320843699e-05,
"loss": 2.8413,
"step": 200
},
{
"epoch": 0.4192872117400419,
"eval_accuracy": 0.8938709677419355,
"eval_loss": 1.8548765182495117,
"eval_runtime": 24.5612,
"eval_samples_per_second": 126.215,
"eval_steps_per_second": 15.797,
"step": 200
},
{
"epoch": 0.6289308176100629,
"grad_norm": 3.672398805618286,
"learning_rate": 5.928471649139723e-05,
"loss": 1.4945,
"step": 300
},
{
"epoch": 0.6289308176100629,
"eval_accuracy": 0.94,
"eval_loss": 1.1517655849456787,
"eval_runtime": 24.6593,
"eval_samples_per_second": 125.713,
"eval_steps_per_second": 15.734,
"step": 300
},
{
"epoch": 0.8385744234800838,
"grad_norm": 9.015824317932129,
"learning_rate": 5.87302234139009e-05,
"loss": 1.0247,
"step": 400
},
{
"epoch": 0.8385744234800838,
"eval_accuracy": 0.9532258064516129,
"eval_loss": 0.9113911390304565,
"eval_runtime": 24.4669,
"eval_samples_per_second": 126.702,
"eval_steps_per_second": 15.858,
"step": 400
},
{
"epoch": 1.0482180293501049,
"grad_norm": 13.825676918029785,
"learning_rate": 5.802194208788969e-05,
"loss": 0.7584,
"step": 500
},
{
"epoch": 1.0482180293501049,
"eval_accuracy": 0.9558064516129032,
"eval_loss": 0.7309688329696655,
"eval_runtime": 24.9176,
"eval_samples_per_second": 124.41,
"eval_steps_per_second": 15.571,
"step": 500
},
{
"epoch": 1.2578616352201257,
"grad_norm": 2.16816782951355,
"learning_rate": 5.716366382897622e-05,
"loss": 0.5289,
"step": 600
},
{
"epoch": 1.2578616352201257,
"eval_accuracy": 0.9616129032258065,
"eval_loss": 0.6186437606811523,
"eval_runtime": 24.8855,
"eval_samples_per_second": 124.571,
"eval_steps_per_second": 15.591,
"step": 600
},
{
"epoch": 1.4675052410901468,
"grad_norm": 3.3350861072540283,
"learning_rate": 5.6159982862143515e-05,
"loss": 0.4857,
"step": 700
},
{
"epoch": 1.4675052410901468,
"eval_accuracy": 0.96,
"eval_loss": 0.5846336483955383,
"eval_runtime": 24.8682,
"eval_samples_per_second": 124.657,
"eval_steps_per_second": 15.602,
"step": 700
},
{
"epoch": 1.6771488469601676,
"grad_norm": 1.4833406209945679,
"learning_rate": 5.5016271729600304e-05,
"loss": 0.4169,
"step": 800
},
{
"epoch": 1.6771488469601676,
"eval_accuracy": 0.9638709677419355,
"eval_loss": 0.5170760750770569,
"eval_runtime": 31.1132,
"eval_samples_per_second": 99.636,
"eval_steps_per_second": 12.471,
"step": 800
},
{
"epoch": 1.8867924528301887,
"grad_norm": 8.999539375305176,
"learning_rate": 5.3738652532429715e-05,
"loss": 0.3953,
"step": 900
},
{
"epoch": 1.8867924528301887,
"eval_accuracy": 0.9683870967741935,
"eval_loss": 0.4950183629989624,
"eval_runtime": 25.9065,
"eval_samples_per_second": 119.661,
"eval_steps_per_second": 14.977,
"step": 900
},
{
"epoch": 2.0964360587002098,
"grad_norm": 1.5073931217193604,
"learning_rate": 5.2333964159970384e-05,
"loss": 0.3393,
"step": 1000
},
{
"epoch": 2.0964360587002098,
"eval_accuracy": 0.9674193548387097,
"eval_loss": 0.45995765924453735,
"eval_runtime": 25.1127,
"eval_samples_per_second": 123.444,
"eval_steps_per_second": 15.45,
"step": 1000
},
{
"epoch": 2.3060796645702304,
"grad_norm": 1.204892873764038,
"learning_rate": 5.080972568234569e-05,
"loss": 0.2747,
"step": 1100
},
{
"epoch": 2.3060796645702304,
"eval_accuracy": 0.9661290322580646,
"eval_loss": 0.44675561785697937,
"eval_runtime": 24.863,
"eval_samples_per_second": 124.683,
"eval_steps_per_second": 15.606,
"step": 1100
},
{
"epoch": 2.5157232704402515,
"grad_norm": 1.581334114074707,
"learning_rate": 4.9174096102095087e-05,
"loss": 0.2703,
"step": 1200
},
{
"epoch": 2.5157232704402515,
"eval_accuracy": 0.9664516129032258,
"eval_loss": 0.43446871638298035,
"eval_runtime": 24.788,
"eval_samples_per_second": 125.061,
"eval_steps_per_second": 15.653,
"step": 1200
},
{
"epoch": 2.7253668763102725,
"grad_norm": 1.4468449354171753,
"learning_rate": 4.7435830680350456e-05,
"loss": 0.2482,
"step": 1300
},
{
"epoch": 2.7253668763102725,
"eval_accuracy": 0.967741935483871,
"eval_loss": 0.4083913564682007,
"eval_runtime": 30.9276,
"eval_samples_per_second": 100.234,
"eval_steps_per_second": 12.545,
"step": 1300
},
{
"epoch": 2.9350104821802936,
"grad_norm": 1.0635446310043335,
"learning_rate": 4.5604234071336463e-05,
"loss": 0.2342,
"step": 1400
},
{
"epoch": 2.9350104821802936,
"eval_accuracy": 0.9674193548387097,
"eval_loss": 0.3973832428455353,
"eval_runtime": 25.2744,
"eval_samples_per_second": 122.654,
"eval_steps_per_second": 15.351,
"step": 1400
},
{
"epoch": 3.1446540880503147,
"grad_norm": 1.0511268377304077,
"learning_rate": 4.368911051605842e-05,
"loss": 0.2073,
"step": 1500
},
{
"epoch": 3.1446540880503147,
"eval_accuracy": 0.9680645161290322,
"eval_loss": 0.3778529465198517,
"eval_runtime": 24.3903,
"eval_samples_per_second": 127.1,
"eval_steps_per_second": 15.908,
"step": 1500
},
{
"epoch": 3.3542976939203353,
"grad_norm": 1.3159152269363403,
"learning_rate": 4.1700711361782675e-05,
"loss": 0.1973,
"step": 1600
},
{
"epoch": 3.3542976939203353,
"eval_accuracy": 0.9696774193548388,
"eval_loss": 0.3735784590244293,
"eval_runtime": 24.7557,
"eval_samples_per_second": 125.224,
"eval_steps_per_second": 15.673,
"step": 1600
},
{
"epoch": 3.5639412997903563,
"grad_norm": 0.8998211622238159,
"learning_rate": 3.9649680188229416e-05,
"loss": 0.1876,
"step": 1700
},
{
"epoch": 3.5639412997903563,
"eval_accuracy": 0.9696774193548388,
"eval_loss": 0.3719204366207123,
"eval_runtime": 25.0523,
"eval_samples_per_second": 123.741,
"eval_steps_per_second": 15.488,
"step": 1700
},
{
"epoch": 3.7735849056603774,
"grad_norm": 1.0484949350357056,
"learning_rate": 3.754699583420843e-05,
"loss": 0.1836,
"step": 1800
},
{
"epoch": 3.7735849056603774,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.35943037271499634,
"eval_runtime": 29.7226,
"eval_samples_per_second": 104.298,
"eval_steps_per_second": 13.054,
"step": 1800
},
{
"epoch": 3.9832285115303985,
"grad_norm": 1.2159234285354614,
"learning_rate": 3.5403913629667045e-05,
"loss": 0.184,
"step": 1900
},
{
"epoch": 3.9832285115303985,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.35340631008148193,
"eval_runtime": 26.1611,
"eval_samples_per_second": 118.496,
"eval_steps_per_second": 14.831,
"step": 1900
},
{
"epoch": 4.1928721174004195,
"grad_norm": 0.8908376097679138,
"learning_rate": 3.323190514772574e-05,
"loss": 0.1558,
"step": 2000
},
{
"epoch": 4.1928721174004195,
"eval_accuracy": 0.97,
"eval_loss": 0.34400272369384766,
"eval_runtime": 24.8773,
"eval_samples_per_second": 124.612,
"eval_steps_per_second": 15.597,
"step": 2000
},
{
"epoch": 4.40251572327044,
"grad_norm": 0.9733275175094604,
"learning_rate": 3.1042596799198844e-05,
"loss": 0.1496,
"step": 2100
},
{
"epoch": 4.40251572327044,
"eval_accuracy": 0.9716129032258064,
"eval_loss": 0.33769491314888,
"eval_runtime": 24.7885,
"eval_samples_per_second": 125.058,
"eval_steps_per_second": 15.652,
"step": 2100
},
{
"epoch": 4.612159329140461,
"grad_norm": 0.7378196120262146,
"learning_rate": 2.8847707598294083e-05,
"loss": 0.1461,
"step": 2200
},
{
"epoch": 4.612159329140461,
"eval_accuracy": 0.9706451612903226,
"eval_loss": 0.3369283974170685,
"eval_runtime": 24.7525,
"eval_samples_per_second": 125.24,
"eval_steps_per_second": 15.675,
"step": 2200
},
{
"epoch": 4.821802935010482,
"grad_norm": 1.0811831951141357,
"learning_rate": 2.665898643262115e-05,
"loss": 0.1457,
"step": 2300
},
{
"epoch": 4.821802935010482,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.3287930190563202,
"eval_runtime": 27.4125,
"eval_samples_per_second": 113.087,
"eval_steps_per_second": 14.154,
"step": 2300
},
{
"epoch": 5.031446540880503,
"grad_norm": 0.89784175157547,
"learning_rate": 2.448814917329275e-05,
"loss": 0.1406,
"step": 2400
},
{
"epoch": 5.031446540880503,
"eval_accuracy": 0.97,
"eval_loss": 0.3258771300315857,
"eval_runtime": 26.8839,
"eval_samples_per_second": 115.311,
"eval_steps_per_second": 14.432,
"step": 2400
},
{
"epoch": 5.241090146750524,
"grad_norm": 0.6693010330200195,
"learning_rate": 2.23468159617576e-05,
"loss": 0.1254,
"step": 2500
},
{
"epoch": 5.241090146750524,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.3203785717487335,
"eval_runtime": 24.583,
"eval_samples_per_second": 126.104,
"eval_steps_per_second": 15.783,
"step": 2500
},
{
"epoch": 5.450733752620545,
"grad_norm": 0.6166190505027771,
"learning_rate": 2.024644900905892e-05,
"loss": 0.1245,
"step": 2600
},
{
"epoch": 5.450733752620545,
"eval_accuracy": 0.97,
"eval_loss": 0.3191240429878235,
"eval_runtime": 25.4943,
"eval_samples_per_second": 121.596,
"eval_steps_per_second": 15.219,
"step": 2600
},
{
"epoch": 5.660377358490566,
"grad_norm": 0.5683078169822693,
"learning_rate": 1.8198291240468836e-05,
"loss": 0.1221,
"step": 2700
},
{
"epoch": 5.660377358490566,
"eval_accuracy": 0.9719354838709677,
"eval_loss": 0.3174494206905365,
"eval_runtime": 25.2702,
"eval_samples_per_second": 122.674,
"eval_steps_per_second": 15.354,
"step": 2700
},
{
"epoch": 5.870020964360587,
"grad_norm": 0.6431116461753845,
"learning_rate": 1.621330611392417e-05,
"loss": 0.1209,
"step": 2800
},
{
"epoch": 5.870020964360587,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.3135199248790741,
"eval_runtime": 27.0594,
"eval_samples_per_second": 114.563,
"eval_steps_per_second": 14.339,
"step": 2800
},
{
"epoch": 6.079664570230608,
"grad_norm": 0.5339530110359192,
"learning_rate": 1.4302118934405711e-05,
"loss": 0.1173,
"step": 2900
},
{
"epoch": 6.079664570230608,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.31273338198661804,
"eval_runtime": 28.6742,
"eval_samples_per_second": 108.111,
"eval_steps_per_second": 13.531,
"step": 2900
},
{
"epoch": 6.289308176100629,
"grad_norm": 0.4836990237236023,
"learning_rate": 1.2474959978395775e-05,
"loss": 0.1113,
"step": 3000
},
{
"epoch": 6.289308176100629,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.31234365701675415,
"eval_runtime": 24.6684,
"eval_samples_per_second": 125.667,
"eval_steps_per_second": 15.729,
"step": 3000
},
{
"epoch": 6.49895178197065,
"grad_norm": 0.5658986568450928,
"learning_rate": 1.0741609732859415e-05,
"loss": 0.1065,
"step": 3100
},
{
"epoch": 6.49895178197065,
"eval_accuracy": 0.9712903225806452,
"eval_loss": 0.30965664982795715,
"eval_runtime": 25.2448,
"eval_samples_per_second": 122.797,
"eval_steps_per_second": 15.369,
"step": 3100
},
{
"epoch": 6.7085953878406706,
"grad_norm": 0.5603812336921692,
"learning_rate": 9.111346541876427e-06,
"loss": 0.1061,
"step": 3200
},
{
"epoch": 6.7085953878406706,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.30989164113998413,
"eval_runtime": 24.9967,
"eval_samples_per_second": 124.017,
"eval_steps_per_second": 15.522,
"step": 3200
},
{
"epoch": 6.918238993710692,
"grad_norm": 0.5175463557243347,
"learning_rate": 7.5928969411631525e-06,
"loss": 0.1057,
"step": 3300
},
{
"epoch": 6.918238993710692,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.3099421262741089,
"eval_runtime": 25.1564,
"eval_samples_per_second": 123.229,
"eval_steps_per_second": 15.424,
"step": 3300
},
{
"epoch": 7.127882599580713,
"grad_norm": 0.47962552309036255,
"learning_rate": 6.194388946335337e-06,
"loss": 0.1026,
"step": 3400
},
{
"epoch": 7.127882599580713,
"eval_accuracy": 0.9706451612903226,
"eval_loss": 0.3079238533973694,
"eval_runtime": 30.478,
"eval_samples_per_second": 101.713,
"eval_steps_per_second": 12.73,
"step": 3400
},
{
"epoch": 7.337526205450734,
"grad_norm": 0.5649577975273132,
"learning_rate": 4.923308544952481e-06,
"loss": 0.099,
"step": 3500
},
{
"epoch": 7.337526205450734,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.3081134557723999,
"eval_runtime": 24.6885,
"eval_samples_per_second": 125.565,
"eval_steps_per_second": 15.716,
"step": 3500
},
{
"epoch": 7.547169811320755,
"grad_norm": 0.4486183226108551,
"learning_rate": 3.786459625234828e-06,
"loss": 0.0999,
"step": 3600
},
{
"epoch": 7.547169811320755,
"eval_accuracy": 0.9703225806451613,
"eval_loss": 0.30738693475723267,
"eval_runtime": 24.9276,
"eval_samples_per_second": 124.36,
"eval_steps_per_second": 15.565,
"step": 3600
},
{
"epoch": 7.756813417190775,
"grad_norm": 0.5032072067260742,
"learning_rate": 2.7899275559478822e-06,
"loss": 0.0997,
"step": 3700
},
{
"epoch": 7.756813417190775,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.3071826696395874,
"eval_runtime": 24.6968,
"eval_samples_per_second": 125.522,
"eval_steps_per_second": 15.711,
"step": 3700
},
{
"epoch": 7.966457023060797,
"grad_norm": 0.4989459216594696,
"learning_rate": 1.9390466124057314e-06,
"loss": 0.0987,
"step": 3800
},
{
"epoch": 7.966457023060797,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.30702054500579834,
"eval_runtime": 24.6975,
"eval_samples_per_second": 125.519,
"eval_steps_per_second": 15.71,
"step": 3800
},
{
"epoch": 8.176100628930818,
"grad_norm": 0.4764493405818939,
"learning_rate": 1.238371422956248e-06,
"loss": 0.0952,
"step": 3900
},
{
"epoch": 8.176100628930818,
"eval_accuracy": 0.9706451612903226,
"eval_loss": 0.3069113492965698,
"eval_runtime": 32.1081,
"eval_samples_per_second": 96.549,
"eval_steps_per_second": 12.084,
"step": 3900
},
{
"epoch": 8.385744234800839,
"grad_norm": 0.44305703043937683,
"learning_rate": 6.916525887907699e-07,
"loss": 0.0964,
"step": 4000
},
{
"epoch": 8.385744234800839,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.3070818781852722,
"eval_runtime": 25.2525,
"eval_samples_per_second": 122.76,
"eval_steps_per_second": 15.365,
"step": 4000
},
{
"epoch": 8.59538784067086,
"grad_norm": 0.42053186893463135,
"learning_rate": 3.0181660758127563e-07,
"loss": 0.0952,
"step": 4100
},
{
"epoch": 8.59538784067086,
"eval_accuracy": 0.9709677419354839,
"eval_loss": 0.306934118270874,
"eval_runtime": 25.3837,
"eval_samples_per_second": 122.126,
"eval_steps_per_second": 15.285,
"step": 4100
},
{
"epoch": 8.80503144654088,
"grad_norm": 0.5232221484184265,
"learning_rate": 7.095020841074385e-08,
"loss": 0.0977,
"step": 4200
},
{
"epoch": 8.80503144654088,
"eval_accuracy": 0.9706451612903226,
"eval_loss": 0.3069760203361511,
"eval_runtime": 25.0708,
"eval_samples_per_second": 123.65,
"eval_steps_per_second": 15.476,
"step": 4200
}
],
"logging_steps": 100,
"max_steps": 4293,
"num_input_tokens_seen": 0,
"num_train_epochs": 9,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1878717068327988.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": {
"alpha": 0.497477897443408,
"num_train_epochs": 9,
"temperature": 7
}
}