{
"best_global_step": 4954,
"best_metric": 1.776762843132019,
"best_model_checkpoint": "./mcqa_qwen3_letter_best/checkpoint-4954",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4954,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010092854259184497,
"grad_norm": 48.55782699584961,
"learning_rate": 8.879919273461152e-07,
"loss": 2.3985,
"step": 50
},
{
"epoch": 0.020185708518368994,
"grad_norm": 25.696617126464844,
"learning_rate": 1.8970736629667005e-06,
"loss": 2.053,
"step": 100
},
{
"epoch": 0.030278562777553492,
"grad_norm": 27.860021591186523,
"learning_rate": 2.906155398587286e-06,
"loss": 1.9305,
"step": 150
},
{
"epoch": 0.04037141703673799,
"grad_norm": 17.68500518798828,
"learning_rate": 3.915237134207871e-06,
"loss": 1.9294,
"step": 200
},
{
"epoch": 0.050464271295922486,
"grad_norm": 26.112218856811523,
"learning_rate": 4.924318869828457e-06,
"loss": 1.8834,
"step": 250
},
{
"epoch": 0.060557125555106985,
"grad_norm": 25.835376739501953,
"learning_rate": 5.933400605449042e-06,
"loss": 1.8517,
"step": 300
},
{
"epoch": 0.07064997981429148,
"grad_norm": 22.44589614868164,
"learning_rate": 6.942482341069627e-06,
"loss": 1.8978,
"step": 350
},
{
"epoch": 0.08074283407347597,
"grad_norm": 32.82951354980469,
"learning_rate": 7.951564076690212e-06,
"loss": 1.8867,
"step": 400
},
{
"epoch": 0.09083568833266048,
"grad_norm": 35.665794372558594,
"learning_rate": 8.960645812310798e-06,
"loss": 1.9055,
"step": 450
},
{
"epoch": 0.10092854259184497,
"grad_norm": 22.500865936279297,
"learning_rate": 9.969727547931384e-06,
"loss": 1.8755,
"step": 500
},
{
"epoch": 0.11102139685102948,
"grad_norm": 40.59410095214844,
"learning_rate": 1.0978809283551967e-05,
"loss": 1.8881,
"step": 550
},
{
"epoch": 0.12111425111021397,
"grad_norm": 28.769454956054688,
"learning_rate": 1.1987891019172555e-05,
"loss": 1.8713,
"step": 600
},
{
"epoch": 0.13120710536939847,
"grad_norm": 17.596820831298828,
"learning_rate": 1.299697275479314e-05,
"loss": 1.8694,
"step": 650
},
{
"epoch": 0.14129995962858297,
"grad_norm": 17.149999618530273,
"learning_rate": 1.4006054490413725e-05,
"loss": 1.8809,
"step": 700
},
{
"epoch": 0.15139281388776746,
"grad_norm": 19.181955337524414,
"learning_rate": 1.5015136226034311e-05,
"loss": 1.8697,
"step": 750
},
{
"epoch": 0.16148566814695195,
"grad_norm": 24.227073669433594,
"learning_rate": 1.6024217961654894e-05,
"loss": 1.9201,
"step": 800
},
{
"epoch": 0.17157852240613647,
"grad_norm": 18.42403221130371,
"learning_rate": 1.703329969727548e-05,
"loss": 1.8876,
"step": 850
},
{
"epoch": 0.18167137666532096,
"grad_norm": 21.015230178833008,
"learning_rate": 1.8042381432896066e-05,
"loss": 1.8697,
"step": 900
},
{
"epoch": 0.19176423092450545,
"grad_norm": 16.02488899230957,
"learning_rate": 1.905146316851665e-05,
"loss": 1.9102,
"step": 950
},
{
"epoch": 0.20185708518368994,
"grad_norm": 25.045923233032227,
"learning_rate": 1.9993271279578333e-05,
"loss": 1.9121,
"step": 1000
},
{
"epoch": 0.21194993944287444,
"grad_norm": 17.414430618286133,
"learning_rate": 1.9881125939217227e-05,
"loss": 1.9449,
"step": 1050
},
{
"epoch": 0.22204279370205895,
"grad_norm": 15.37423324584961,
"learning_rate": 1.976898059885612e-05,
"loss": 1.9139,
"step": 1100
},
{
"epoch": 0.23213564796124345,
"grad_norm": 20.543489456176758,
"learning_rate": 1.965683525849501e-05,
"loss": 1.92,
"step": 1150
},
{
"epoch": 0.24222850222042794,
"grad_norm": 12.01870346069336,
"learning_rate": 1.9544689918133902e-05,
"loss": 1.8962,
"step": 1200
},
{
"epoch": 0.25232135647961246,
"grad_norm": 15.475773811340332,
"learning_rate": 1.9432544577772796e-05,
"loss": 1.9483,
"step": 1250
},
{
"epoch": 0.26241421073879695,
"grad_norm": 11.753213882446289,
"learning_rate": 1.9320399237411686e-05,
"loss": 1.919,
"step": 1300
},
{
"epoch": 0.27250706499798144,
"grad_norm": 14.90489673614502,
"learning_rate": 1.920825389705058e-05,
"loss": 1.8742,
"step": 1350
},
{
"epoch": 0.28259991925716593,
"grad_norm": 12.925189971923828,
"learning_rate": 1.909610855668947e-05,
"loss": 1.8822,
"step": 1400
},
{
"epoch": 0.2926927735163504,
"grad_norm": 17.215579986572266,
"learning_rate": 1.898396321632836e-05,
"loss": 1.8796,
"step": 1450
},
{
"epoch": 0.3027856277755349,
"grad_norm": 16.483861923217773,
"learning_rate": 1.8871817875967255e-05,
"loss": 1.8442,
"step": 1500
},
{
"epoch": 0.3128784820347194,
"grad_norm": 18.10808753967285,
"learning_rate": 1.875967253560615e-05,
"loss": 1.9131,
"step": 1550
},
{
"epoch": 0.3229713362939039,
"grad_norm": 14.261265754699707,
"learning_rate": 1.864752719524504e-05,
"loss": 1.7602,
"step": 1600
},
{
"epoch": 0.3330641905530884,
"grad_norm": 16.223392486572266,
"learning_rate": 1.8535381854883933e-05,
"loss": 1.8392,
"step": 1650
},
{
"epoch": 0.34315704481227294,
"grad_norm": 14.012106895446777,
"learning_rate": 1.8423236514522824e-05,
"loss": 1.8335,
"step": 1700
},
{
"epoch": 0.35324989907145743,
"grad_norm": 13.234374046325684,
"learning_rate": 1.8311091174161714e-05,
"loss": 1.8501,
"step": 1750
},
{
"epoch": 0.3633427533306419,
"grad_norm": 11.787166595458984,
"learning_rate": 1.8198945833800608e-05,
"loss": 1.8704,
"step": 1800
},
{
"epoch": 0.3734356075898264,
"grad_norm": 15.64974308013916,
"learning_rate": 1.80868004934395e-05,
"loss": 1.85,
"step": 1850
},
{
"epoch": 0.3835284618490109,
"grad_norm": 13.893998146057129,
"learning_rate": 1.7974655153078392e-05,
"loss": 1.8807,
"step": 1900
},
{
"epoch": 0.3936213161081954,
"grad_norm": 15.42603588104248,
"learning_rate": 1.7862509812717283e-05,
"loss": 1.8124,
"step": 1950
},
{
"epoch": 0.4037141703673799,
"grad_norm": 12.293023109436035,
"learning_rate": 1.7750364472356173e-05,
"loss": 1.8112,
"step": 2000
},
{
"epoch": 0.4138070246265644,
"grad_norm": 17.576618194580078,
"learning_rate": 1.7638219131995067e-05,
"loss": 1.8468,
"step": 2050
},
{
"epoch": 0.42389987888574887,
"grad_norm": 36.62916946411133,
"learning_rate": 1.752607379163396e-05,
"loss": 1.8563,
"step": 2100
},
{
"epoch": 0.43399273314493336,
"grad_norm": 12.232354164123535,
"learning_rate": 1.741392845127285e-05,
"loss": 1.8643,
"step": 2150
},
{
"epoch": 0.4440855874041179,
"grad_norm": 9.772968292236328,
"learning_rate": 1.7301783110911742e-05,
"loss": 1.8686,
"step": 2200
},
{
"epoch": 0.4541784416633024,
"grad_norm": 13.78654956817627,
"learning_rate": 1.7189637770550636e-05,
"loss": 1.8477,
"step": 2250
},
{
"epoch": 0.4642712959224869,
"grad_norm": 14.448091506958008,
"learning_rate": 1.7077492430189526e-05,
"loss": 1.828,
"step": 2300
},
{
"epoch": 0.4743641501816714,
"grad_norm": 10.872529983520508,
"learning_rate": 1.696534708982842e-05,
"loss": 1.7916,
"step": 2350
},
{
"epoch": 0.4844570044408559,
"grad_norm": 14.716806411743164,
"learning_rate": 1.685320174946731e-05,
"loss": 1.7982,
"step": 2400
},
{
"epoch": 0.49454985870004037,
"grad_norm": 15.155656814575195,
"learning_rate": 1.67410564091062e-05,
"loss": 1.8422,
"step": 2450
},
{
"epoch": 0.5046427129592249,
"grad_norm": 11.369612693786621,
"learning_rate": 1.6628911068745095e-05,
"loss": 1.8217,
"step": 2500
},
{
"epoch": 0.5147355672184094,
"grad_norm": 15.491066932678223,
"learning_rate": 1.651676572838399e-05,
"loss": 1.8487,
"step": 2550
},
{
"epoch": 0.5248284214775939,
"grad_norm": 12.249984741210938,
"learning_rate": 1.640462038802288e-05,
"loss": 1.7951,
"step": 2600
},
{
"epoch": 0.5349212757367784,
"grad_norm": 14.075465202331543,
"learning_rate": 1.629247504766177e-05,
"loss": 1.8115,
"step": 2650
},
{
"epoch": 0.5450141299959629,
"grad_norm": 9.785154342651367,
"learning_rate": 1.6180329707300664e-05,
"loss": 1.8576,
"step": 2700
},
{
"epoch": 0.5551069842551474,
"grad_norm": 14.559487342834473,
"learning_rate": 1.6068184366939554e-05,
"loss": 1.8263,
"step": 2750
},
{
"epoch": 0.5651998385143319,
"grad_norm": 15.150165557861328,
"learning_rate": 1.5956039026578448e-05,
"loss": 1.8029,
"step": 2800
},
{
"epoch": 0.5752926927735164,
"grad_norm": 13.863632202148438,
"learning_rate": 1.584389368621734e-05,
"loss": 1.7863,
"step": 2850
},
{
"epoch": 0.5853855470327008,
"grad_norm": 9.358270645141602,
"learning_rate": 1.573174834585623e-05,
"loss": 1.806,
"step": 2900
},
{
"epoch": 0.5954784012918853,
"grad_norm": 12.770975112915039,
"learning_rate": 1.5619603005495123e-05,
"loss": 1.7417,
"step": 2950
},
{
"epoch": 0.6055712555510698,
"grad_norm": 12.026569366455078,
"learning_rate": 1.5507457665134017e-05,
"loss": 1.7623,
"step": 3000
},
{
"epoch": 0.6156641098102543,
"grad_norm": 9.8405122756958,
"learning_rate": 1.5395312324772907e-05,
"loss": 1.7941,
"step": 3050
},
{
"epoch": 0.6257569640694388,
"grad_norm": 13.649519920349121,
"learning_rate": 1.5283166984411798e-05,
"loss": 1.7499,
"step": 3100
},
{
"epoch": 0.6358498183286233,
"grad_norm": 13.303316116333008,
"learning_rate": 1.5171021644050692e-05,
"loss": 1.7821,
"step": 3150
},
{
"epoch": 0.6459426725878078,
"grad_norm": 14.893158912658691,
"learning_rate": 1.5058876303689582e-05,
"loss": 1.8423,
"step": 3200
},
{
"epoch": 0.6560355268469923,
"grad_norm": 14.434380531311035,
"learning_rate": 1.4946730963328474e-05,
"loss": 1.8138,
"step": 3250
},
{
"epoch": 0.6661283811061768,
"grad_norm": 9.59044075012207,
"learning_rate": 1.4834585622967368e-05,
"loss": 1.7734,
"step": 3300
},
{
"epoch": 0.6762212353653613,
"grad_norm": 12.524561882019043,
"learning_rate": 1.4722440282606259e-05,
"loss": 1.8246,
"step": 3350
},
{
"epoch": 0.6863140896245459,
"grad_norm": 13.521296501159668,
"learning_rate": 1.4610294942245151e-05,
"loss": 1.7847,
"step": 3400
},
{
"epoch": 0.6964069438837304,
"grad_norm": 10.999866485595703,
"learning_rate": 1.4498149601884043e-05,
"loss": 1.8027,
"step": 3450
},
{
"epoch": 0.7064997981429149,
"grad_norm": 15.364250183105469,
"learning_rate": 1.4386004261522934e-05,
"loss": 1.7802,
"step": 3500
},
{
"epoch": 0.7165926524020994,
"grad_norm": 13.141353607177734,
"learning_rate": 1.4273858921161828e-05,
"loss": 1.7464,
"step": 3550
},
{
"epoch": 0.7266855066612838,
"grad_norm": 9.018637657165527,
"learning_rate": 1.4161713580800718e-05,
"loss": 1.7553,
"step": 3600
},
{
"epoch": 0.7367783609204683,
"grad_norm": 11.081124305725098,
"learning_rate": 1.404956824043961e-05,
"loss": 1.7922,
"step": 3650
},
{
"epoch": 0.7468712151796528,
"grad_norm": 10.0188627243042,
"learning_rate": 1.3937422900078504e-05,
"loss": 1.7769,
"step": 3700
},
{
"epoch": 0.7569640694388373,
"grad_norm": 10.286458015441895,
"learning_rate": 1.3825277559717395e-05,
"loss": 1.7696,
"step": 3750
},
{
"epoch": 0.7670569236980218,
"grad_norm": 11.746405601501465,
"learning_rate": 1.3713132219356287e-05,
"loss": 1.7188,
"step": 3800
},
{
"epoch": 0.7771497779572063,
"grad_norm": 11.215723991394043,
"learning_rate": 1.3600986878995179e-05,
"loss": 1.6803,
"step": 3850
},
{
"epoch": 0.7872426322163908,
"grad_norm": 8.982596397399902,
"learning_rate": 1.348884153863407e-05,
"loss": 1.7696,
"step": 3900
},
{
"epoch": 0.7973354864755753,
"grad_norm": 12.450457572937012,
"learning_rate": 1.3376696198272963e-05,
"loss": 1.8021,
"step": 3950
},
{
"epoch": 0.8074283407347598,
"grad_norm": 10.87128734588623,
"learning_rate": 1.3264550857911855e-05,
"loss": 1.7492,
"step": 4000
},
{
"epoch": 0.8175211949939443,
"grad_norm": 11.78647518157959,
"learning_rate": 1.3152405517550746e-05,
"loss": 1.7883,
"step": 4050
},
{
"epoch": 0.8276140492531288,
"grad_norm": 12.425263404846191,
"learning_rate": 1.3040260177189638e-05,
"loss": 1.7546,
"step": 4100
},
{
"epoch": 0.8377069035123133,
"grad_norm": 11.663323402404785,
"learning_rate": 1.2928114836828532e-05,
"loss": 1.8018,
"step": 4150
},
{
"epoch": 0.8477997577714977,
"grad_norm": 17.913087844848633,
"learning_rate": 1.2815969496467423e-05,
"loss": 1.7827,
"step": 4200
},
{
"epoch": 0.8578926120306822,
"grad_norm": 9.219327926635742,
"learning_rate": 1.2703824156106315e-05,
"loss": 1.7245,
"step": 4250
},
{
"epoch": 0.8679854662898667,
"grad_norm": 11.107460021972656,
"learning_rate": 1.2591678815745207e-05,
"loss": 1.7264,
"step": 4300
},
{
"epoch": 0.8780783205490512,
"grad_norm": 10.487607955932617,
"learning_rate": 1.2479533475384097e-05,
"loss": 1.753,
"step": 4350
},
{
"epoch": 0.8881711748082358,
"grad_norm": 13.2865571975708,
"learning_rate": 1.2367388135022991e-05,
"loss": 1.7317,
"step": 4400
},
{
"epoch": 0.8982640290674203,
"grad_norm": 10.927115440368652,
"learning_rate": 1.2255242794661883e-05,
"loss": 1.7651,
"step": 4450
},
{
"epoch": 0.9083568833266048,
"grad_norm": 10.536073684692383,
"learning_rate": 1.2143097454300774e-05,
"loss": 1.7578,
"step": 4500
},
{
"epoch": 0.9184497375857893,
"grad_norm": 13.544109344482422,
"learning_rate": 1.2030952113939666e-05,
"loss": 1.7505,
"step": 4550
},
{
"epoch": 0.9285425918449738,
"grad_norm": 9.343710899353027,
"learning_rate": 1.1921049680385782e-05,
"loss": 1.6865,
"step": 4600
},
{
"epoch": 0.9386354461041583,
"grad_norm": 11.518623352050781,
"learning_rate": 1.1808904340024674e-05,
"loss": 1.7203,
"step": 4650
},
{
"epoch": 0.9487283003633428,
"grad_norm": 7.897172927856445,
"learning_rate": 1.1696758999663564e-05,
"loss": 1.7201,
"step": 4700
},
{
"epoch": 0.9588211546225273,
"grad_norm": 11.530837059020996,
"learning_rate": 1.1584613659302457e-05,
"loss": 1.8117,
"step": 4750
},
{
"epoch": 0.9689140088817118,
"grad_norm": 11.721019744873047,
"learning_rate": 1.147246831894135e-05,
"loss": 1.7663,
"step": 4800
},
{
"epoch": 0.9790068631408962,
"grad_norm": 11.470191955566406,
"learning_rate": 1.1360322978580241e-05,
"loss": 1.7655,
"step": 4850
},
{
"epoch": 0.9890997174000807,
"grad_norm": 12.892107009887695,
"learning_rate": 1.1248177638219133e-05,
"loss": 1.759,
"step": 4900
},
{
"epoch": 0.9991925716592652,
"grad_norm": 13.869138717651367,
"learning_rate": 1.1136032297858025e-05,
"loss": 1.7831,
"step": 4950
},
{
"epoch": 1.0,
"eval_loss": 1.776762843132019,
"eval_runtime": 226.5804,
"eval_samples_per_second": 16.396,
"eval_steps_per_second": 2.052,
"step": 4954
}
],
"logging_steps": 50,
"max_steps": 9908,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.681334714807091e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}