mamba_nli_ensemble / checkpoint-6108 /trainer_state.json
w06618pm
Final trained model ready for demo
4ab0ec0
{
"best_global_step": 6108,
"best_metric": 0.6058866381645203,
"best_model_checkpoint": "mamba_nli_ensemble/checkpoint-6108",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6108,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016371971185330715,
"grad_norm": 15.273209571838379,
"learning_rate": 1.6371971185330716e-06,
"loss": 0.7553,
"step": 100
},
{
"epoch": 0.03274394237066143,
"grad_norm": 17.171979904174805,
"learning_rate": 3.2743942370661432e-06,
"loss": 0.7151,
"step": 200
},
{
"epoch": 0.04911591355599214,
"grad_norm": 5.574963569641113,
"learning_rate": 4.911591355599214e-06,
"loss": 0.7065,
"step": 300
},
{
"epoch": 0.06548788474132286,
"grad_norm": 7.3369059562683105,
"learning_rate": 6.5487884741322864e-06,
"loss": 0.6996,
"step": 400
},
{
"epoch": 0.08185985592665357,
"grad_norm": 7.506777286529541,
"learning_rate": 8.185985592665357e-06,
"loss": 0.7071,
"step": 500
},
{
"epoch": 0.09823182711198428,
"grad_norm": 6.177253723144531,
"learning_rate": 9.823182711198428e-06,
"loss": 0.6926,
"step": 600
},
{
"epoch": 0.114603798297315,
"grad_norm": 5.600251197814941,
"learning_rate": 1.14603798297315e-05,
"loss": 0.7145,
"step": 700
},
{
"epoch": 0.13097576948264572,
"grad_norm": 2.5524227619171143,
"learning_rate": 1.3097576948264573e-05,
"loss": 0.6919,
"step": 800
},
{
"epoch": 0.14734774066797643,
"grad_norm": 8.479023933410645,
"learning_rate": 1.4734774066797644e-05,
"loss": 0.6819,
"step": 900
},
{
"epoch": 0.16371971185330714,
"grad_norm": 5.893022060394287,
"learning_rate": 1.6371971185330713e-05,
"loss": 0.6944,
"step": 1000
},
{
"epoch": 0.18009168303863785,
"grad_norm": 4.31400728225708,
"learning_rate": 1.8009168303863786e-05,
"loss": 0.6787,
"step": 1100
},
{
"epoch": 0.19646365422396855,
"grad_norm": 1.9604185819625854,
"learning_rate": 1.9646365422396855e-05,
"loss": 0.6804,
"step": 1200
},
{
"epoch": 0.2128356254092993,
"grad_norm": 14.416400909423828,
"learning_rate": 2.128356254092993e-05,
"loss": 0.6979,
"step": 1300
},
{
"epoch": 0.22920759659463,
"grad_norm": 2.0943357944488525,
"learning_rate": 2.2920759659463e-05,
"loss": 0.6689,
"step": 1400
},
{
"epoch": 0.2455795677799607,
"grad_norm": 4.136998176574707,
"learning_rate": 2.4557956777996073e-05,
"loss": 0.6694,
"step": 1500
},
{
"epoch": 0.26195153896529144,
"grad_norm": 3.5071325302124023,
"learning_rate": 2.6195153896529146e-05,
"loss": 0.6168,
"step": 1600
},
{
"epoch": 0.2783235101506221,
"grad_norm": 7.638752460479736,
"learning_rate": 2.7832351015062215e-05,
"loss": 0.6237,
"step": 1700
},
{
"epoch": 0.29469548133595286,
"grad_norm": 1.9127601385116577,
"learning_rate": 2.9469548133595288e-05,
"loss": 0.6163,
"step": 1800
},
{
"epoch": 0.31106745252128354,
"grad_norm": 66.4927749633789,
"learning_rate": 3.110674525212836e-05,
"loss": 0.6006,
"step": 1900
},
{
"epoch": 0.3274394237066143,
"grad_norm": 52.92075729370117,
"learning_rate": 3.2743942370661426e-05,
"loss": 0.6598,
"step": 2000
},
{
"epoch": 0.343811394891945,
"grad_norm": 11.339043617248535,
"learning_rate": 3.43811394891945e-05,
"loss": 0.5987,
"step": 2100
},
{
"epoch": 0.3601833660772757,
"grad_norm": 28.995885848999023,
"learning_rate": 3.601833660772757e-05,
"loss": 0.6509,
"step": 2200
},
{
"epoch": 0.3765553372626064,
"grad_norm": 23.708646774291992,
"learning_rate": 3.765553372626065e-05,
"loss": 0.6729,
"step": 2300
},
{
"epoch": 0.3929273084479371,
"grad_norm": 3.438246726989746,
"learning_rate": 3.929273084479371e-05,
"loss": 0.5537,
"step": 2400
},
{
"epoch": 0.40929927963326784,
"grad_norm": 10.562445640563965,
"learning_rate": 4.0929927963326786e-05,
"loss": 0.6228,
"step": 2500
},
{
"epoch": 0.4256712508185986,
"grad_norm": 9.508832931518555,
"learning_rate": 4.256712508185986e-05,
"loss": 0.5776,
"step": 2600
},
{
"epoch": 0.44204322200392926,
"grad_norm": 12.658103942871094,
"learning_rate": 4.4204322200392925e-05,
"loss": 0.5455,
"step": 2700
},
{
"epoch": 0.45841519318926,
"grad_norm": 8.46078109741211,
"learning_rate": 4.5841519318926e-05,
"loss": 0.5583,
"step": 2800
},
{
"epoch": 0.4747871643745907,
"grad_norm": 5.642892360687256,
"learning_rate": 4.747871643745907e-05,
"loss": 0.5556,
"step": 2900
},
{
"epoch": 0.4911591355599214,
"grad_norm": 3.8212382793426514,
"learning_rate": 4.9115913555992146e-05,
"loss": 0.5552,
"step": 3000
},
{
"epoch": 0.5075311067452521,
"grad_norm": 8.145768165588379,
"learning_rate": 4.999965445760666e-05,
"loss": 0.5488,
"step": 3100
},
{
"epoch": 0.5239030779305829,
"grad_norm": 12.39121150970459,
"learning_rate": 4.999651917405523e-05,
"loss": 0.5595,
"step": 3200
},
{
"epoch": 0.5402750491159135,
"grad_norm": 6.998423099517822,
"learning_rate": 4.999011837711028e-05,
"loss": 0.5111,
"step": 3300
},
{
"epoch": 0.5566470203012442,
"grad_norm": 31.633630752563477,
"learning_rate": 4.998045290296376e-05,
"loss": 0.553,
"step": 3400
},
{
"epoch": 0.573018991486575,
"grad_norm": 56.126251220703125,
"learning_rate": 4.9967524014300896e-05,
"loss": 0.5713,
"step": 3500
},
{
"epoch": 0.5893909626719057,
"grad_norm": 6.04685640335083,
"learning_rate": 4.995133340013522e-05,
"loss": 0.526,
"step": 3600
},
{
"epoch": 0.6057629338572365,
"grad_norm": 9.806577682495117,
"learning_rate": 4.993188317558791e-05,
"loss": 0.6185,
"step": 3700
},
{
"epoch": 0.6221349050425671,
"grad_norm": 3.9068918228149414,
"learning_rate": 4.9909175881611514e-05,
"loss": 0.5086,
"step": 3800
},
{
"epoch": 0.6385068762278978,
"grad_norm": 19.12666130065918,
"learning_rate": 4.9883214484657957e-05,
"loss": 0.515,
"step": 3900
},
{
"epoch": 0.6548788474132285,
"grad_norm": 6.140756607055664,
"learning_rate": 4.9854002376291046e-05,
"loss": 0.5581,
"step": 4000
},
{
"epoch": 0.6712508185985593,
"grad_norm": 12.521078109741211,
"learning_rate": 4.9821543372743355e-05,
"loss": 0.5192,
"step": 4100
},
{
"epoch": 0.68762278978389,
"grad_norm": 18.783933639526367,
"learning_rate": 4.9785841714417734e-05,
"loss": 0.5334,
"step": 4200
},
{
"epoch": 0.7039947609692206,
"grad_norm": 7.139877796173096,
"learning_rate": 4.97469020653333e-05,
"loss": 0.5334,
"step": 4300
},
{
"epoch": 0.7203667321545514,
"grad_norm": 7.0137834548950195,
"learning_rate": 4.970472951251617e-05,
"loss": 0.5019,
"step": 4400
},
{
"epoch": 0.7367387033398821,
"grad_norm": 51.292449951171875,
"learning_rate": 4.9659329565334854e-05,
"loss": 0.4813,
"step": 4500
},
{
"epoch": 0.7531106745252129,
"grad_norm": 7.053626537322998,
"learning_rate": 4.9610708154780585e-05,
"loss": 0.6834,
"step": 4600
},
{
"epoch": 0.7694826457105436,
"grad_norm": 0.5209086537361145,
"learning_rate": 4.955887163269243e-05,
"loss": 0.4802,
"step": 4700
},
{
"epoch": 0.7858546168958742,
"grad_norm": 0.9966021776199341,
"learning_rate": 4.950382677092754e-05,
"loss": 0.5673,
"step": 4800
},
{
"epoch": 0.802226588081205,
"grad_norm": 1.4829602241516113,
"learning_rate": 4.944558076047649e-05,
"loss": 0.4976,
"step": 4900
},
{
"epoch": 0.8185985592665357,
"grad_norm": 63.20207214355469,
"learning_rate": 4.9384141210523804e-05,
"loss": 0.5398,
"step": 5000
},
{
"epoch": 0.8349705304518664,
"grad_norm": 9.008106231689453,
"learning_rate": 4.931951614745395e-05,
"loss": 0.5906,
"step": 5100
},
{
"epoch": 0.8513425016371972,
"grad_norm": 9.714171409606934,
"learning_rate": 4.925171401380278e-05,
"loss": 0.4833,
"step": 5200
},
{
"epoch": 0.8677144728225278,
"grad_norm": 1.0515024662017822,
"learning_rate": 4.918074366715457e-05,
"loss": 0.5046,
"step": 5300
},
{
"epoch": 0.8840864440078585,
"grad_norm": 0.32931941747665405,
"learning_rate": 4.910661437898493e-05,
"loss": 0.6394,
"step": 5400
},
{
"epoch": 0.9004584151931893,
"grad_norm": 19.654884338378906,
"learning_rate": 4.902933583344954e-05,
"loss": 0.5572,
"step": 5500
},
{
"epoch": 0.91683038637852,
"grad_norm": 6.547713279724121,
"learning_rate": 4.8948918126119056e-05,
"loss": 0.5898,
"step": 5600
},
{
"epoch": 0.9332023575638507,
"grad_norm": 0.6314940452575684,
"learning_rate": 4.886537176266024e-05,
"loss": 0.4681,
"step": 5700
},
{
"epoch": 0.9495743287491814,
"grad_norm": 9.13287353515625,
"learning_rate": 4.877870765746347e-05,
"loss": 0.4678,
"step": 5800
},
{
"epoch": 0.9659462999345121,
"grad_norm": 8.16297721862793,
"learning_rate": 4.8688937132216966e-05,
"loss": 0.5657,
"step": 5900
},
{
"epoch": 0.9823182711198428,
"grad_norm": 19.567949295043945,
"learning_rate": 4.859607191442768e-05,
"loss": 0.5164,
"step": 6000
},
{
"epoch": 0.9986902423051736,
"grad_norm": 20.028736114501953,
"learning_rate": 4.850012413588926e-05,
"loss": 0.4769,
"step": 6100
},
{
"epoch": 1.0,
"eval_accuracy": 0.8083209509658247,
"eval_loss": 0.6058866381645203,
"eval_mcc": 0.6161859428296993,
"eval_runtime": 5.3649,
"eval_samples_per_second": 125.446,
"eval_steps_per_second": 8.015,
"step": 6108
}
],
"logging_steps": 100,
"max_steps": 30540,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}