mamba-distill-7b / trainer_state.json
KotshinZ's picture
Model save
6d3b002 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998003992015968,
"eval_steps": 50,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01330671989354624,
"grad_norm": 38.0893895691,
"learning_rate": 2.631578947368421e-06,
"loss": 9.4719,
"mean_token_accuracy": 0.6992475613951683,
"step": 5
},
{
"epoch": 0.02661343978709248,
"grad_norm": 53.38062892110183,
"learning_rate": 5.263157894736842e-06,
"loss": 9.2617,
"mean_token_accuracy": 0.7013110458850861,
"step": 10
},
{
"epoch": 0.03992015968063872,
"grad_norm": 65.59379133263475,
"learning_rate": 7.894736842105265e-06,
"loss": 8.8052,
"mean_token_accuracy": 0.7065529704093934,
"step": 15
},
{
"epoch": 0.05322687957418496,
"grad_norm": 30.823233386013126,
"learning_rate": 1.0526315789473684e-05,
"loss": 8.1585,
"mean_token_accuracy": 0.719629879295826,
"step": 20
},
{
"epoch": 0.0665335994677312,
"grad_norm": 12.66541979902211,
"learning_rate": 1.3157894736842108e-05,
"loss": 7.3748,
"mean_token_accuracy": 0.740014499425888,
"step": 25
},
{
"epoch": 0.07984031936127745,
"grad_norm": 10.709230045785015,
"learning_rate": 1.578947368421053e-05,
"loss": 7.1481,
"mean_token_accuracy": 0.7436378166079521,
"step": 30
},
{
"epoch": 0.09314703925482369,
"grad_norm": 7.432960167563066,
"learning_rate": 1.8421052631578947e-05,
"loss": 6.9424,
"mean_token_accuracy": 0.7486263796687126,
"step": 35
},
{
"epoch": 0.10645375914836992,
"grad_norm": 10.045989605681791,
"learning_rate": 1.9998261969639324e-05,
"loss": 6.9028,
"mean_token_accuracy": 0.7475503668189049,
"step": 40
},
{
"epoch": 0.11976047904191617,
"grad_norm": 15.965640756107303,
"learning_rate": 1.9978716065702566e-05,
"loss": 7.2718,
"mean_token_accuracy": 0.7349641278386116,
"step": 45
},
{
"epoch": 0.1330671989354624,
"grad_norm": 27.77458817629002,
"learning_rate": 1.9937494319239112e-05,
"loss": 7.3623,
"mean_token_accuracy": 0.7310615047812462,
"step": 50
},
{
"epoch": 0.1330671989354624,
"eval_loss": 1.008537769317627,
"eval_mean_token_accuracy": 0.7182760106192695,
"eval_runtime": 42.3882,
"eval_samples_per_second": 3.397,
"eval_steps_per_second": 0.425,
"step": 50
},
{
"epoch": 0.14637391882900866,
"grad_norm": 30.81470758273484,
"learning_rate": 1.9874686272438467e-05,
"loss": 7.2943,
"mean_token_accuracy": 0.7337011635303498,
"step": 55
},
{
"epoch": 0.1596806387225549,
"grad_norm": 22.17371294352614,
"learning_rate": 1.979042835741503e-05,
"loss": 7.104,
"mean_token_accuracy": 0.7383959114551544,
"step": 60
},
{
"epoch": 0.17298735861610112,
"grad_norm": 27.90713548690936,
"learning_rate": 1.968490359984923e-05,
"loss": 7.1833,
"mean_token_accuracy": 0.7363018915057182,
"step": 65
},
{
"epoch": 0.18629407850964738,
"grad_norm": 9.001486401205879,
"learning_rate": 1.9558341221417744e-05,
"loss": 7.05,
"mean_token_accuracy": 0.740586844086647,
"step": 70
},
{
"epoch": 0.1996007984031936,
"grad_norm": 13.898951307519155,
"learning_rate": 1.9411016141876438e-05,
"loss": 7.0786,
"mean_token_accuracy": 0.7393299728631973,
"step": 75
},
{
"epoch": 0.21290751829673984,
"grad_norm": 44.17413162889863,
"learning_rate": 1.9243248381877605e-05,
"loss": 7.513,
"mean_token_accuracy": 0.7232646465301513,
"step": 80
},
{
"epoch": 0.2262142381902861,
"grad_norm": 43.276281867208816,
"learning_rate": 1.9055402367818673e-05,
"loss": 7.2214,
"mean_token_accuracy": 0.7344184964895248,
"step": 85
},
{
"epoch": 0.23952095808383234,
"grad_norm": 22.007621479836395,
"learning_rate": 1.8847886140232438e-05,
"loss": 7.1625,
"mean_token_accuracy": 0.735144229233265,
"step": 90
},
{
"epoch": 0.2528276779773786,
"grad_norm": 77.4740488466291,
"learning_rate": 1.862115046743831e-05,
"loss": 7.5932,
"mean_token_accuracy": 0.722845695912838,
"step": 95
},
{
"epoch": 0.2661343978709248,
"grad_norm": 1066.7215003063964,
"learning_rate": 1.8375687866379988e-05,
"loss": 7.4423,
"mean_token_accuracy": 0.7269001781940461,
"step": 100
},
{
"epoch": 0.2661343978709248,
"eval_loss": 1.0586838722229004,
"eval_mean_token_accuracy": 0.6958943770991431,
"eval_runtime": 42.4569,
"eval_samples_per_second": 3.392,
"eval_steps_per_second": 0.424,
"step": 100
},
{
"epoch": 0.27944111776447106,
"grad_norm": 923.4336165050305,
"learning_rate": 1.811203153277641e-05,
"loss": 8.4501,
"mean_token_accuracy": 0.6929014056921006,
"step": 105
},
{
"epoch": 0.2927478376580173,
"grad_norm": 10253.700005495137,
"learning_rate": 1.7830754182909985e-05,
"loss": 11.581,
"mean_token_accuracy": 0.6142643451690674,
"step": 110
},
{
"epoch": 0.3060545575515635,
"grad_norm": 11508.216656593022,
"learning_rate": 1.753246680956795e-05,
"loss": 15.7105,
"mean_token_accuracy": 0.5149824447929859,
"step": 115
},
{
"epoch": 0.3193612774451098,
"grad_norm": 6036.292381336853,
"learning_rate": 1.721781735483921e-05,
"loss": 26.2876,
"mean_token_accuracy": 0.33059981614351275,
"step": 120
},
{
"epoch": 0.33266799733865604,
"grad_norm": 26623.625140159445,
"learning_rate": 1.6887489302649657e-05,
"loss": 30.2414,
"mean_token_accuracy": 0.26836080476641655,
"step": 125
},
{
"epoch": 0.34597471723220224,
"grad_norm": 213666.71326672958,
"learning_rate": 1.654220019409317e-05,
"loss": 36.7917,
"mean_token_accuracy": 0.20126449912786484,
"step": 130
},
{
"epoch": 0.3592814371257485,
"grad_norm": 281204.55298403726,
"learning_rate": 1.6182700068783463e-05,
"loss": 53.894,
"mean_token_accuracy": 0.08266483591869474,
"step": 135
},
{
"epoch": 0.37258815701929476,
"grad_norm": 274791.48096197617,
"learning_rate": 1.580976983561235e-05,
"loss": 58.3125,
"mean_token_accuracy": 0.06163843311369419,
"step": 140
},
{
"epoch": 0.38589487691284097,
"grad_norm": 9564.75792140504,
"learning_rate": 1.5424219576453526e-05,
"loss": 45.3478,
"mean_token_accuracy": 0.12734813932329417,
"step": 145
},
{
"epoch": 0.3992015968063872,
"grad_norm": 6120.107227530501,
"learning_rate": 1.5026886786496624e-05,
"loss": 42.2261,
"mean_token_accuracy": 0.1591544572263956,
"step": 150
},
{
"epoch": 0.3992015968063872,
"eval_loss": 4.92734432220459,
"eval_mean_token_accuracy": 0.1774691359864341,
"eval_runtime": 42.2395,
"eval_samples_per_second": 3.409,
"eval_steps_per_second": 0.426,
"step": 150
},
{
"epoch": 0.4125083166999335,
"grad_norm": 13441.595281708549,
"learning_rate": 1.46186345550338e-05,
"loss": 32.8561,
"mean_token_accuracy": 0.23304792679846287,
"step": 155
},
{
"epoch": 0.4258150365934797,
"grad_norm": 4366.951799070855,
"learning_rate": 1.4200349690650654e-05,
"loss": 26.1181,
"mean_token_accuracy": 0.3177220694720745,
"step": 160
},
{
"epoch": 0.43912175648702595,
"grad_norm": 8486.995137743008,
"learning_rate": 1.3772940794893916e-05,
"loss": 28.5985,
"mean_token_accuracy": 0.28858516551554203,
"step": 165
},
{
"epoch": 0.4524284763805722,
"grad_norm": 4760.402708498517,
"learning_rate": 1.3337336288600297e-05,
"loss": 24.7618,
"mean_token_accuracy": 0.3370695985853672,
"step": 170
},
{
"epoch": 0.4657351962741184,
"grad_norm": 1237.143262994326,
"learning_rate": 1.2894482395173695e-05,
"loss": 17.015,
"mean_token_accuracy": 0.4780235022306442,
"step": 175
},
{
"epoch": 0.47904191616766467,
"grad_norm": 408.8332305447992,
"learning_rate": 1.24453410851916e-05,
"loss": 15.019,
"mean_token_accuracy": 0.516390411555767,
"step": 180
},
{
"epoch": 0.49234863606121093,
"grad_norm": 311.65890181237427,
"learning_rate": 1.1990887986805295e-05,
"loss": 13.0538,
"mean_token_accuracy": 0.5649969473481178,
"step": 185
},
{
"epoch": 0.5056553559547572,
"grad_norm": 218.78464454962847,
"learning_rate": 1.1532110266473026e-05,
"loss": 11.4017,
"mean_token_accuracy": 0.6076564386487007,
"step": 190
},
{
"epoch": 0.5189620758483033,
"grad_norm": 248.3901525774538,
"learning_rate": 1.1070004484629543e-05,
"loss": 10.3675,
"mean_token_accuracy": 0.6390743300318718,
"step": 195
},
{
"epoch": 0.5322687957418496,
"grad_norm": 68.79121302231147,
"learning_rate": 1.0605574430949983e-05,
"loss": 9.2733,
"mean_token_accuracy": 0.6695673123002053,
"step": 200
},
{
"epoch": 0.5322687957418496,
"eval_loss": 1.2745658159255981,
"eval_mean_token_accuracy": 0.6473477118545108,
"eval_runtime": 42.3799,
"eval_samples_per_second": 3.398,
"eval_steps_per_second": 0.425,
"step": 200
},
{
"epoch": 0.5455755156353959,
"grad_norm": 61.494613995295225,
"learning_rate": 1.0139828943910358e-05,
"loss": 8.6282,
"mean_token_accuracy": 0.6900610521435737,
"step": 205
},
{
"epoch": 0.5588822355289421,
"grad_norm": 223.47361816320114,
"learning_rate": 9.673779719380967e-06,
"loss": 8.8734,
"mean_token_accuracy": 0.6839690148830414,
"step": 210
},
{
"epoch": 0.5721889554224884,
"grad_norm": 311.485197048925,
"learning_rate": 9.208439113012984e-06,
"loss": 9.6346,
"mean_token_accuracy": 0.6596978038549424,
"step": 215
},
{
"epoch": 0.5854956753160346,
"grad_norm": 208.95277149419533,
"learning_rate": 8.744817941191862e-06,
"loss": 9.8742,
"mean_token_accuracy": 0.6521520212292671,
"step": 220
},
{
"epoch": 0.5988023952095808,
"grad_norm": 99.55141840914388,
"learning_rate": 8.283923285334304e-06,
"loss": 10.0645,
"mean_token_accuracy": 0.6457211509346962,
"step": 225
},
{
"epoch": 0.612109115103127,
"grad_norm": 113.01850407325877,
"learning_rate": 7.826756304298428e-06,
"loss": 9.6991,
"mean_token_accuracy": 0.6567307710647583,
"step": 230
},
{
"epoch": 0.6254158349966733,
"grad_norm": 122.00372512450222,
"learning_rate": 7.3743100596589e-06,
"loss": 9.3977,
"mean_token_accuracy": 0.6660859316587449,
"step": 235
},
{
"epoch": 0.6387225548902196,
"grad_norm": 141.28653030104314,
"learning_rate": 6.92756735857107e-06,
"loss": 9.916,
"mean_token_accuracy": 0.6507371798157692,
"step": 240
},
{
"epoch": 0.6520292747837658,
"grad_norm": 807.7008160726642,
"learning_rate": 6.487498618909845e-06,
"loss": 9.8794,
"mean_token_accuracy": 0.6521420940756798,
"step": 245
},
{
"epoch": 0.6653359946773121,
"grad_norm": 12179.703084325536,
"learning_rate": 6.0550597613206205e-06,
"loss": 10.3914,
"mean_token_accuracy": 0.6389522299170494,
"step": 250
},
{
"epoch": 0.6653359946773121,
"eval_loss": 1.7774240970611572,
"eval_mean_token_accuracy": 0.5449769298235575,
"eval_runtime": 42.2375,
"eval_samples_per_second": 3.409,
"eval_steps_per_second": 0.426,
"step": 250
},
{
"epoch": 0.6786427145708582,
"grad_norm": 1054.76927310761,
"learning_rate": 5.631190132761247e-06,
"loss": 11.7133,
"mean_token_accuracy": 0.5997588485479355,
"step": 255
},
{
"epoch": 0.6919494344644045,
"grad_norm": 1424.9333125346188,
"learning_rate": 5.216810466045448e-06,
"loss": 12.5735,
"mean_token_accuracy": 0.5747481673955918,
"step": 260
},
{
"epoch": 0.7052561543579507,
"grad_norm": 798.7973283165478,
"learning_rate": 4.812820879820034e-06,
"loss": 13.4974,
"mean_token_accuracy": 0.5521100461483002,
"step": 265
},
{
"epoch": 0.718562874251497,
"grad_norm": 1790.9039210107235,
"learning_rate": 4.420098923320378e-06,
"loss": 14.4898,
"mean_token_accuracy": 0.5296176724135876,
"step": 270
},
{
"epoch": 0.7318695941450433,
"grad_norm": 829.5583452937811,
"learning_rate": 4.0394976701513235e-06,
"loss": 14.5873,
"mean_token_accuracy": 0.5268749997019768,
"step": 275
},
{
"epoch": 0.7451763140385895,
"grad_norm": 6150.368162782374,
"learning_rate": 3.671843865234238e-06,
"loss": 14.6091,
"mean_token_accuracy": 0.5281791850924492,
"step": 280
},
{
"epoch": 0.7584830339321357,
"grad_norm": 29207.957135047905,
"learning_rate": 3.3179361289454694e-06,
"loss": 16.9682,
"mean_token_accuracy": 0.48243742287158964,
"step": 285
},
{
"epoch": 0.7717897538256819,
"grad_norm": 54201.045662727825,
"learning_rate": 2.978543222347076e-06,
"loss": 20.7529,
"mean_token_accuracy": 0.4138728640973568,
"step": 290
},
{
"epoch": 0.7850964737192282,
"grad_norm": 23857.53870983074,
"learning_rate": 2.6544023772782736e-06,
"loss": 21.3047,
"mean_token_accuracy": 0.403152472525835,
"step": 295
},
{
"epoch": 0.7984031936127745,
"grad_norm": 11423.97832396548,
"learning_rate": 2.346217694934847e-06,
"loss": 20.4021,
"mean_token_accuracy": 0.41206730976700784,
"step": 300
},
{
"epoch": 0.7984031936127745,
"eval_loss": 3.0904834270477295,
"eval_mean_token_accuracy": 0.34269434379206765,
"eval_runtime": 42.4109,
"eval_samples_per_second": 3.395,
"eval_steps_per_second": 0.424,
"step": 300
},
{
"epoch": 0.8117099135063207,
"grad_norm": 6555.658931606807,
"learning_rate": 2.0546586164151827e-06,
"loss": 19.3343,
"mean_token_accuracy": 0.42890567928552625,
"step": 305
},
{
"epoch": 0.825016633399867,
"grad_norm": 8596.180267406271,
"learning_rate": 1.7803584685552877e-06,
"loss": 19.1283,
"mean_token_accuracy": 0.4296626977622509,
"step": 310
},
{
"epoch": 0.8383233532934131,
"grad_norm": 7559.729843914057,
"learning_rate": 1.523913088211415e-06,
"loss": 19.9312,
"mean_token_accuracy": 0.4130441091954708,
"step": 315
},
{
"epoch": 0.8516300731869594,
"grad_norm": 6283.0641437588,
"learning_rate": 1.2858795279787517e-06,
"loss": 20.0128,
"mean_token_accuracy": 0.40999465957283976,
"step": 320
},
{
"epoch": 0.8649367930805056,
"grad_norm": 4627.199331551124,
"learning_rate": 1.0667748461575544e-06,
"loss": 20.2021,
"mean_token_accuracy": 0.40563797727227213,
"step": 325
},
{
"epoch": 0.8782435129740519,
"grad_norm": 3896.885861785845,
"learning_rate": 8.670749835951964e-07,
"loss": 20.3633,
"mean_token_accuracy": 0.40211157202720643,
"step": 330
},
{
"epoch": 0.8915502328675982,
"grad_norm": 3649.4556728802945,
"learning_rate": 6.872137298438653e-07,
"loss": 20.6856,
"mean_token_accuracy": 0.39633470848202706,
"step": 335
},
{
"epoch": 0.9048569527611444,
"grad_norm": 5894.774814516487,
"learning_rate": 5.275817808796013e-07,
"loss": 21.1202,
"mean_token_accuracy": 0.38578067943453787,
"step": 340
},
{
"epoch": 0.9181636726546906,
"grad_norm": 4756.426577271269,
"learning_rate": 3.885258904295575e-07,
"loss": 21.8543,
"mean_token_accuracy": 0.37522283270955087,
"step": 345
},
{
"epoch": 0.9314703925482368,
"grad_norm": 11889.44146408838,
"learning_rate": 2.703481167509281e-07,
"loss": 22.5845,
"mean_token_accuracy": 0.36378281489014624,
"step": 350
},
{
"epoch": 0.9314703925482368,
"eval_loss": 3.6812663078308105,
"eval_mean_token_accuracy": 0.2753951284620497,
"eval_runtime": 42.3376,
"eval_samples_per_second": 3.401,
"eval_steps_per_second": 0.425,
"step": 350
},
{
"epoch": 0.9447771124417831,
"grad_norm": 6803.573866074052,
"learning_rate": 1.73305166497707e-07,
"loss": 22.5139,
"mean_token_accuracy": 0.36300976797938345,
"step": 355
},
{
"epoch": 0.9580838323353293,
"grad_norm": 7457.921947626093,
"learning_rate": 9.760783710056176e-08,
"loss": 22.5036,
"mean_token_accuracy": 0.3649313189089298,
"step": 360
},
{
"epoch": 0.9713905522288756,
"grad_norm": 8489.33440106548,
"learning_rate": 4.3420558871060116e-08,
"loss": 22.7636,
"mean_token_accuracy": 0.35811431556940077,
"step": 365
},
{
"epoch": 0.9846972721224219,
"grad_norm": 10404.8253118789,
"learning_rate": 1.0861037824896337e-08,
"loss": 22.8268,
"mean_token_accuracy": 0.3570818044245243,
"step": 370
},
{
"epoch": 0.998003992015968,
"grad_norm": 7227.984198697766,
"learning_rate": 0.0,
"loss": 22.9385,
"mean_token_accuracy": 0.3560729533433914,
"step": 375
},
{
"epoch": 0.998003992015968,
"step": 375,
"total_flos": 3.616227566899167e+18,
"train_loss": 16.595592213948567,
"train_runtime": 29081.677,
"train_samples_per_second": 0.827,
"train_steps_per_second": 0.013
}
],
"logging_steps": 5,
"max_steps": 375,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.616227566899167e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}