{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9880794701986755,
"eval_steps": 500,
"global_step": 564,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.052980132450331126,
"grad_norm": 2.3983146952234486,
"learning_rate": 5e-06,
"loss": 0.9851,
"step": 10
},
{
"epoch": 0.10596026490066225,
"grad_norm": 1.7246649174554667,
"learning_rate": 5e-06,
"loss": 0.8319,
"step": 20
},
{
"epoch": 0.15894039735099338,
"grad_norm": 3.601038796091522,
"learning_rate": 5e-06,
"loss": 0.8075,
"step": 30
},
{
"epoch": 0.2119205298013245,
"grad_norm": 1.2059052889786372,
"learning_rate": 5e-06,
"loss": 0.7915,
"step": 40
},
{
"epoch": 0.26490066225165565,
"grad_norm": 1.3608839677655582,
"learning_rate": 5e-06,
"loss": 0.7709,
"step": 50
},
{
"epoch": 0.31788079470198677,
"grad_norm": 1.2434441624370192,
"learning_rate": 5e-06,
"loss": 0.7621,
"step": 60
},
{
"epoch": 0.3708609271523179,
"grad_norm": 0.9441777855534653,
"learning_rate": 5e-06,
"loss": 0.7454,
"step": 70
},
{
"epoch": 0.423841059602649,
"grad_norm": 1.5125978662591961,
"learning_rate": 5e-06,
"loss": 0.7248,
"step": 80
},
{
"epoch": 0.4768211920529801,
"grad_norm": 0.7471461569802452,
"learning_rate": 5e-06,
"loss": 0.7275,
"step": 90
},
{
"epoch": 0.5298013245033113,
"grad_norm": 0.645516397675585,
"learning_rate": 5e-06,
"loss": 0.7123,
"step": 100
},
{
"epoch": 0.5827814569536424,
"grad_norm": 0.7251445711578004,
"learning_rate": 5e-06,
"loss": 0.7117,
"step": 110
},
{
"epoch": 0.6357615894039735,
"grad_norm": 0.651327566584479,
"learning_rate": 5e-06,
"loss": 0.7119,
"step": 120
},
{
"epoch": 0.6887417218543046,
"grad_norm": 0.6140018870668793,
"learning_rate": 5e-06,
"loss": 0.7053,
"step": 130
},
{
"epoch": 0.7417218543046358,
"grad_norm": 0.5388085038750972,
"learning_rate": 5e-06,
"loss": 0.7022,
"step": 140
},
{
"epoch": 0.7947019867549668,
"grad_norm": 1.010650981679106,
"learning_rate": 5e-06,
"loss": 0.6987,
"step": 150
},
{
"epoch": 0.847682119205298,
"grad_norm": 1.1037782052291758,
"learning_rate": 5e-06,
"loss": 0.6976,
"step": 160
},
{
"epoch": 0.9006622516556292,
"grad_norm": 0.666699690620748,
"learning_rate": 5e-06,
"loss": 0.705,
"step": 170
},
{
"epoch": 0.9536423841059603,
"grad_norm": 0.5794869194974834,
"learning_rate": 5e-06,
"loss": 0.7042,
"step": 180
},
{
"epoch": 0.9960264900662251,
"eval_loss": 0.6996881365776062,
"eval_runtime": 101.9424,
"eval_samples_per_second": 49.901,
"eval_steps_per_second": 0.392,
"step": 188
},
{
"epoch": 1.0066225165562914,
"grad_norm": 0.8762129194412334,
"learning_rate": 5e-06,
"loss": 0.6861,
"step": 190
},
{
"epoch": 1.0596026490066226,
"grad_norm": 0.7641648019922738,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 200
},
{
"epoch": 1.1125827814569536,
"grad_norm": 0.6026445432992825,
"learning_rate": 5e-06,
"loss": 0.6342,
"step": 210
},
{
"epoch": 1.1655629139072847,
"grad_norm": 0.5607709134599749,
"learning_rate": 5e-06,
"loss": 0.6285,
"step": 220
},
{
"epoch": 1.218543046357616,
"grad_norm": 0.723598523167553,
"learning_rate": 5e-06,
"loss": 0.654,
"step": 230
},
{
"epoch": 1.271523178807947,
"grad_norm": 0.6634015008522252,
"learning_rate": 5e-06,
"loss": 0.6474,
"step": 240
},
{
"epoch": 1.3245033112582782,
"grad_norm": 0.5676178378824602,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 250
},
{
"epoch": 1.3774834437086092,
"grad_norm": 0.6219906931731467,
"learning_rate": 5e-06,
"loss": 0.6396,
"step": 260
},
{
"epoch": 1.4304635761589404,
"grad_norm": 0.5539002206307158,
"learning_rate": 5e-06,
"loss": 0.6395,
"step": 270
},
{
"epoch": 1.4834437086092715,
"grad_norm": 0.6706880554061717,
"learning_rate": 5e-06,
"loss": 0.6364,
"step": 280
},
{
"epoch": 1.5364238410596025,
"grad_norm": 0.6250744115575335,
"learning_rate": 5e-06,
"loss": 0.6455,
"step": 290
},
{
"epoch": 1.589403973509934,
"grad_norm": 0.5666575820633527,
"learning_rate": 5e-06,
"loss": 0.6352,
"step": 300
},
{
"epoch": 1.6423841059602649,
"grad_norm": 0.8049891928557037,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 310
},
{
"epoch": 1.695364238410596,
"grad_norm": 0.8098028256502842,
"learning_rate": 5e-06,
"loss": 0.6379,
"step": 320
},
{
"epoch": 1.7483443708609272,
"grad_norm": 0.6314929024368203,
"learning_rate": 5e-06,
"loss": 0.6394,
"step": 330
},
{
"epoch": 1.8013245033112582,
"grad_norm": 0.824620474103318,
"learning_rate": 5e-06,
"loss": 0.6414,
"step": 340
},
{
"epoch": 1.8543046357615895,
"grad_norm": 0.5854556799760776,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 350
},
{
"epoch": 1.9072847682119205,
"grad_norm": 0.6825161397864904,
"learning_rate": 5e-06,
"loss": 0.6408,
"step": 360
},
{
"epoch": 1.9602649006622517,
"grad_norm": 0.5897191051228083,
"learning_rate": 5e-06,
"loss": 0.6362,
"step": 370
},
{
"epoch": 1.9973509933774833,
"eval_loss": 0.6881988644599915,
"eval_runtime": 101.7242,
"eval_samples_per_second": 50.008,
"eval_steps_per_second": 0.393,
"step": 377
},
{
"epoch": 2.013245033112583,
"grad_norm": 0.8098259238713678,
"learning_rate": 5e-06,
"loss": 0.6249,
"step": 380
},
{
"epoch": 2.066225165562914,
"grad_norm": 0.8384706128552907,
"learning_rate": 5e-06,
"loss": 0.5796,
"step": 390
},
{
"epoch": 2.119205298013245,
"grad_norm": 0.7877590869928718,
"learning_rate": 5e-06,
"loss": 0.5721,
"step": 400
},
{
"epoch": 2.172185430463576,
"grad_norm": 0.8214423131053483,
"learning_rate": 5e-06,
"loss": 0.5881,
"step": 410
},
{
"epoch": 2.225165562913907,
"grad_norm": 0.708950143379715,
"learning_rate": 5e-06,
"loss": 0.5788,
"step": 420
},
{
"epoch": 2.2781456953642385,
"grad_norm": 0.6491889315422662,
"learning_rate": 5e-06,
"loss": 0.5778,
"step": 430
},
{
"epoch": 2.3311258278145695,
"grad_norm": 0.6858462236619034,
"learning_rate": 5e-06,
"loss": 0.5821,
"step": 440
},
{
"epoch": 2.384105960264901,
"grad_norm": 0.780640823583864,
"learning_rate": 5e-06,
"loss": 0.5768,
"step": 450
},
{
"epoch": 2.437086092715232,
"grad_norm": 0.7173053514977337,
"learning_rate": 5e-06,
"loss": 0.5759,
"step": 460
},
{
"epoch": 2.4900662251655628,
"grad_norm": 0.7004632879605499,
"learning_rate": 5e-06,
"loss": 0.5787,
"step": 470
},
{
"epoch": 2.543046357615894,
"grad_norm": 0.7028579263335615,
"learning_rate": 5e-06,
"loss": 0.579,
"step": 480
},
{
"epoch": 2.596026490066225,
"grad_norm": 0.9012109929919548,
"learning_rate": 5e-06,
"loss": 0.5848,
"step": 490
},
{
"epoch": 2.6490066225165565,
"grad_norm": 0.6237112161014274,
"learning_rate": 5e-06,
"loss": 0.584,
"step": 500
},
{
"epoch": 2.7019867549668874,
"grad_norm": 0.6803732464125802,
"learning_rate": 5e-06,
"loss": 0.5918,
"step": 510
},
{
"epoch": 2.7549668874172184,
"grad_norm": 0.7496234836165662,
"learning_rate": 5e-06,
"loss": 0.5797,
"step": 520
},
{
"epoch": 2.80794701986755,
"grad_norm": 0.6761315878843943,
"learning_rate": 5e-06,
"loss": 0.5921,
"step": 530
},
{
"epoch": 2.8609271523178808,
"grad_norm": 0.6002390896713952,
"learning_rate": 5e-06,
"loss": 0.5887,
"step": 540
},
{
"epoch": 2.9139072847682117,
"grad_norm": 0.6064169883870584,
"learning_rate": 5e-06,
"loss": 0.5919,
"step": 550
},
{
"epoch": 2.966887417218543,
"grad_norm": 0.5890652422466117,
"learning_rate": 5e-06,
"loss": 0.5826,
"step": 560
},
{
"epoch": 2.9880794701986755,
"eval_loss": 0.6938396692276001,
"eval_runtime": 102.1596,
"eval_samples_per_second": 49.795,
"eval_steps_per_second": 0.392,
"step": 564
},
{
"epoch": 2.9880794701986755,
"step": 564,
"total_flos": 944302247116800.0,
"train_loss": 0.6565315867146702,
"train_runtime": 17042.4484,
"train_samples_per_second": 17.011,
"train_steps_per_second": 0.033
}
],
"logging_steps": 10,
"max_steps": 564,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 944302247116800.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}