DimMem-4B-Locomo / trainer_state.source.json
wtqiu's picture
Upload DimMem-4B Locomo checkpoint
1fa0791 verified
Raw
History Blame Contribute Delete
8.88 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200.0,
"global_step": 354,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005649717514124294,
"grad_norm": 1.8059313297271729,
"learning_rate": 4.999901553476555e-05,
"loss": 0.4717830717563629,
"step": 1,
"token_acc": 0.924627166465135
},
{
"epoch": 0.05649717514124294,
"grad_norm": 0.1542048305273056,
"learning_rate": 4.9901617425775067e-05,
"loss": 0.33008625772264266,
"step": 10,
"token_acc": 0.9219071687140808
},
{
"epoch": 0.11299435028248588,
"grad_norm": 0.11267846822738647,
"learning_rate": 4.9607244033573156e-05,
"loss": 0.25351030826568605,
"step": 20,
"token_acc": 0.9264000170247177
},
{
"epoch": 0.1694915254237288,
"grad_norm": 0.09091860800981522,
"learning_rate": 4.91191967203629e-05,
"loss": 0.22513890266418457,
"step": 30,
"token_acc": 0.9317765577689588
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.08639991283416748,
"learning_rate": 4.84413167142257e-05,
"loss": 0.20885028839111328,
"step": 40,
"token_acc": 0.9359174715592828
},
{
"epoch": 0.2824858757062147,
"grad_norm": 0.09375399351119995,
"learning_rate": 4.7578939341563095e-05,
"loss": 0.19784480333328247,
"step": 50,
"token_acc": 0.9384661788621316
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.08870115131139755,
"learning_rate": 4.653885203484515e-05,
"loss": 0.18448562622070314,
"step": 60,
"token_acc": 0.9420314698252165
},
{
"epoch": 0.3954802259887006,
"grad_norm": 0.11067840456962585,
"learning_rate": 4.532924091140417e-05,
"loss": 0.18294379711151124,
"step": 70,
"token_acc": 0.9418481147105683
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.09862152487039566,
"learning_rate": 4.395962634373097e-05,
"loss": 0.17243103981018065,
"step": 80,
"token_acc": 0.9446352200693965
},
{
"epoch": 0.5084745762711864,
"grad_norm": 0.10678666085004807,
"learning_rate": 4.2440788028374624e-05,
"loss": 0.1731430172920227,
"step": 90,
"token_acc": 0.9447363875815018
},
{
"epoch": 0.5649717514124294,
"grad_norm": 0.10532315075397491,
"learning_rate": 4.0784680143198836e-05,
"loss": 0.17281131744384765,
"step": 100,
"token_acc": 0.9447437022704439
},
{
"epoch": 0.6214689265536724,
"grad_norm": 0.11211931705474854,
"learning_rate": 3.900433726075865e-05,
"loss": 0.16182489395141603,
"step": 110,
"token_acc": 0.9481675818843257
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.10726941376924515,
"learning_rate": 3.711377175831626e-05,
"loss": 0.16202739477157593,
"step": 120,
"token_acc": 0.9479687034245692
},
{
"epoch": 0.7344632768361582,
"grad_norm": 0.10400757193565369,
"learning_rate": 3.512786353194134e-05,
"loss": 0.15924739837646484,
"step": 130,
"token_acc": 0.9486699455285843
},
{
"epoch": 0.7909604519774012,
"grad_norm": 0.10787644982337952,
"learning_rate": 3.3062242882712724e-05,
"loss": 0.15439069271087646,
"step": 140,
"token_acc": 0.9498849158473873
},
{
"epoch": 0.847457627118644,
"grad_norm": 0.10836105048656464,
"learning_rate": 3.093316749677788e-05,
"loss": 0.15822217464447022,
"step": 150,
"token_acc": 0.9487123526844173
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.10872391611337662,
"learning_rate": 2.875739448751176e-05,
"loss": 0.15569958686828614,
"step": 160,
"token_acc": 0.9491473105803542
},
{
"epoch": 0.96045197740113,
"grad_norm": 0.11218578368425369,
"learning_rate": 2.655204850688085e-05,
"loss": 0.15528473854064942,
"step": 170,
"token_acc": 0.9494661997922623
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.13003411889076233,
"learning_rate": 2.433448696405563e-05,
"loss": 0.1492830991744995,
"step": 180,
"token_acc": 0.9511860316683133
},
{
"epoch": 1.073446327683616,
"grad_norm": 0.12652547657489777,
"learning_rate": 2.2122163412082927e-05,
"loss": 0.14996984004974365,
"step": 190,
"token_acc": 0.9509753894028877
},
{
"epoch": 1.1299435028248588,
"grad_norm": 0.11710216104984283,
"learning_rate": 1.993249017784766e-05,
"loss": 0.149368953704834,
"step": 200,
"token_acc": 0.9509840746795515
},
{
"epoch": 1.1864406779661016,
"grad_norm": 0.12813611328601837,
"learning_rate": 1.778270131650948e-05,
"loss": 0.1482247829437256,
"step": 210,
"token_acc": 0.9514638991717056
},
{
"epoch": 1.2429378531073447,
"grad_norm": 0.12335359305143356,
"learning_rate": 1.5689716969045848e-05,
"loss": 0.14458421468734742,
"step": 220,
"token_acc": 0.9528204997080846
},
{
"epoch": 1.2994350282485876,
"grad_norm": 0.1252630650997162,
"learning_rate": 1.3670010190490073e-05,
"loss": 0.14932241439819335,
"step": 230,
"token_acc": 0.950983923940499
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.12602832913398743,
"learning_rate": 1.173947729700644e-05,
"loss": 0.14413282871246338,
"step": 240,
"token_acc": 0.9524894618411184
},
{
"epoch": 1.4124293785310735,
"grad_norm": 0.1404254138469696,
"learning_rate": 9.913312752249903e-06,
"loss": 0.14448442459106445,
"step": 250,
"token_acc": 0.9523784878342272
},
{
"epoch": 1.4689265536723164,
"grad_norm": 0.1297323852777481,
"learning_rate": 8.20588957773018e-06,
"loss": 0.1442911744117737,
"step": 260,
"token_acc": 0.9526485262065045
},
{
"epoch": 1.5254237288135593,
"grad_norm": 0.1164567619562149,
"learning_rate": 6.6306462284233234e-06,
"loss": 0.1476944088935852,
"step": 270,
"token_acc": 0.9516979818914234
},
{
"epoch": 1.5819209039548023,
"grad_norm": 0.12876106798648834,
"learning_rate": 5.199980823988157e-06,
"loss": 0.14429720640182495,
"step": 280,
"token_acc": 0.9527345847326476
},
{
"epoch": 1.6384180790960452,
"grad_norm": 0.12522290647029877,
"learning_rate": 3.925153568052123e-06,
"loss": 0.14247846603393555,
"step": 290,
"token_acc": 0.952577761791889
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.1215895265340805,
"learning_rate": 2.8161981235857143e-06,
"loss": 0.14371044635772706,
"step": 300,
"token_acc": 0.9530261029770724
},
{
"epoch": 1.7514124293785311,
"grad_norm": 0.13429652154445648,
"learning_rate": 1.881842641895104e-06,
"loss": 0.1435616970062256,
"step": 310,
"token_acc": 0.9526384206465796
},
{
"epoch": 1.807909604519774,
"grad_norm": 0.12633894383907318,
"learning_rate": 1.129441066782702e-06,
"loss": 0.14912809133529664,
"step": 320,
"token_acc": 0.9509201261393581
},
{
"epoch": 1.8644067796610169,
"grad_norm": 0.12346093356609344,
"learning_rate": 5.649152545533332e-07,
"loss": 0.14490561485290526,
"step": 330,
"token_acc": 0.9519967728922952
},
{
"epoch": 1.92090395480226,
"grad_norm": 0.123548224568367,
"learning_rate": 1.927083654168854e-07,
"loss": 0.14034559726715087,
"step": 340,
"token_acc": 0.9537333066731213
},
{
"epoch": 1.9774011299435028,
"grad_norm": 0.1290796846151352,
"learning_rate": 1.5749893125160954e-08,
"loss": 0.14156577587127686,
"step": 350,
"token_acc": 0.9530219643471618
}
],
"logging_steps": 10,
"max_steps": 354,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.089589125686231e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}