{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9411764705882355,
"eval_steps": 20,
"global_step": 111,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05378151260504202,
"grad_norm": 0.012224463745951653,
"learning_rate": 0.001981818181818182,
"loss": 2.4733,
"step": 2
},
{
"epoch": 0.10756302521008404,
"grad_norm": 0.04710804298520088,
"learning_rate": 0.0019454545454545456,
"loss": 2.4443,
"step": 4
},
{
"epoch": 0.16134453781512606,
"grad_norm": 0.09238269925117493,
"learning_rate": 0.0019090909090909091,
"loss": 2.3956,
"step": 6
},
{
"epoch": 0.21512605042016808,
"grad_norm": 0.17334744334220886,
"learning_rate": 0.0018727272727272729,
"loss": 2.4043,
"step": 8
},
{
"epoch": 0.2689075630252101,
"grad_norm": 0.11766365170478821,
"learning_rate": 0.0018363636363636364,
"loss": 2.385,
"step": 10
},
{
"epoch": 0.3226890756302521,
"grad_norm": 0.1385774314403534,
"learning_rate": 0.0018000000000000002,
"loss": 2.3933,
"step": 12
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.1210022047162056,
"learning_rate": 0.0017636363636363637,
"loss": 2.3733,
"step": 14
},
{
"epoch": 0.43025210084033616,
"grad_norm": 0.11510306596755981,
"learning_rate": 0.0017272727272727272,
"loss": 2.3297,
"step": 16
},
{
"epoch": 0.48403361344537815,
"grad_norm": 0.08369912207126617,
"learning_rate": 0.001690909090909091,
"loss": 2.351,
"step": 18
},
{
"epoch": 0.5378151260504201,
"grad_norm": 0.09298688918352127,
"learning_rate": 0.0016545454545454545,
"loss": 2.3175,
"step": 20
},
{
"epoch": 0.5378151260504201,
"eval_loss": 2.3251912593841553,
"eval_runtime": 84.2914,
"eval_samples_per_second": 14.118,
"eval_steps_per_second": 1.768,
"step": 20
},
{
"epoch": 0.5915966386554622,
"grad_norm": 0.10441266000270844,
"learning_rate": 0.0016181818181818183,
"loss": 2.3643,
"step": 22
},
{
"epoch": 0.6453781512605042,
"grad_norm": 0.09343012422323227,
"learning_rate": 0.0015818181818181818,
"loss": 2.3391,
"step": 24
},
{
"epoch": 0.6991596638655462,
"grad_norm": 0.09008985757827759,
"learning_rate": 0.0015454545454545454,
"loss": 2.2984,
"step": 26
},
{
"epoch": 0.7529411764705882,
"grad_norm": 0.08069847524166107,
"learning_rate": 0.0015090909090909091,
"loss": 2.3202,
"step": 28
},
{
"epoch": 0.8067226890756303,
"grad_norm": 0.08655106276273727,
"learning_rate": 0.0014727272727272727,
"loss": 2.3438,
"step": 30
},
{
"epoch": 0.8605042016806723,
"grad_norm": 0.08203998953104019,
"learning_rate": 0.0014363636363636362,
"loss": 2.2862,
"step": 32
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.10055471211671829,
"learning_rate": 0.0014,
"loss": 2.3448,
"step": 34
},
{
"epoch": 0.9680672268907563,
"grad_norm": 0.08500978350639343,
"learning_rate": 0.0013636363636363635,
"loss": 2.3221,
"step": 36
},
{
"epoch": 1.0,
"grad_norm": 0.16502036154270172,
"learning_rate": 0.0013272727272727275,
"loss": 2.3438,
"step": 38
},
{
"epoch": 1.053781512605042,
"grad_norm": 0.08134379237890244,
"learning_rate": 0.001290909090909091,
"loss": 2.3075,
"step": 40
},
{
"epoch": 1.053781512605042,
"eval_loss": 2.299827814102173,
"eval_runtime": 84.1245,
"eval_samples_per_second": 14.146,
"eval_steps_per_second": 1.771,
"step": 40
},
{
"epoch": 1.107563025210084,
"grad_norm": 0.09189953655004501,
"learning_rate": 0.0012545454545454546,
"loss": 2.2457,
"step": 42
},
{
"epoch": 1.1613445378151261,
"grad_norm": 0.09041959792375565,
"learning_rate": 0.0012181818181818183,
"loss": 2.2977,
"step": 44
},
{
"epoch": 1.2151260504201682,
"grad_norm": 0.08456366509199142,
"learning_rate": 0.0011818181818181819,
"loss": 2.2843,
"step": 46
},
{
"epoch": 1.26890756302521,
"grad_norm": 0.08097781240940094,
"learning_rate": 0.0011454545454545454,
"loss": 2.2328,
"step": 48
},
{
"epoch": 1.322689075630252,
"grad_norm": 0.10243827849626541,
"learning_rate": 0.0011090909090909092,
"loss": 2.254,
"step": 50
},
{
"epoch": 1.3764705882352941,
"grad_norm": 0.09242815524339676,
"learning_rate": 0.0010727272727272727,
"loss": 2.3295,
"step": 52
},
{
"epoch": 1.4302521008403362,
"grad_norm": 0.09403648227453232,
"learning_rate": 0.0010363636363636365,
"loss": 2.2749,
"step": 54
},
{
"epoch": 1.4840336134453782,
"grad_norm": 0.09187959879636765,
"learning_rate": 0.001,
"loss": 2.2606,
"step": 56
},
{
"epoch": 1.53781512605042,
"grad_norm": 0.09116198122501373,
"learning_rate": 0.0009636363636363637,
"loss": 2.2676,
"step": 58
},
{
"epoch": 1.5915966386554623,
"grad_norm": 0.08270075172185898,
"learning_rate": 0.0009272727272727273,
"loss": 2.2741,
"step": 60
},
{
"epoch": 1.5915966386554623,
"eval_loss": 2.3082737922668457,
"eval_runtime": 84.2709,
"eval_samples_per_second": 14.121,
"eval_steps_per_second": 1.768,
"step": 60
},
{
"epoch": 1.6453781512605041,
"grad_norm": 0.09275200217962265,
"learning_rate": 0.0008909090909090909,
"loss": 2.2582,
"step": 62
},
{
"epoch": 1.6991596638655462,
"grad_norm": 0.09241969138383865,
"learning_rate": 0.0008545454545454545,
"loss": 2.2554,
"step": 64
},
{
"epoch": 1.7529411764705882,
"grad_norm": 0.08338718116283417,
"learning_rate": 0.0008181818181818183,
"loss": 2.2244,
"step": 66
},
{
"epoch": 1.8067226890756303,
"grad_norm": 0.09568168222904205,
"learning_rate": 0.0007818181818181819,
"loss": 2.2719,
"step": 68
},
{
"epoch": 1.8605042016806723,
"grad_norm": 0.0905410498380661,
"learning_rate": 0.0007454545454545455,
"loss": 2.2505,
"step": 70
},
{
"epoch": 1.9142857142857141,
"grad_norm": 0.08841802924871445,
"learning_rate": 0.0007090909090909091,
"loss": 2.3005,
"step": 72
},
{
"epoch": 1.9680672268907564,
"grad_norm": 0.09013470262289047,
"learning_rate": 0.0006727272727272728,
"loss": 2.2682,
"step": 74
},
{
"epoch": 2.0,
"grad_norm": 0.19737772643566132,
"learning_rate": 0.0006363636363636364,
"loss": 2.3476,
"step": 76
},
{
"epoch": 2.053781512605042,
"grad_norm": 0.0839110016822815,
"learning_rate": 0.0006,
"loss": 2.2338,
"step": 78
},
{
"epoch": 2.107563025210084,
"grad_norm": 0.10582801699638367,
"learning_rate": 0.0005636363636363636,
"loss": 2.2257,
"step": 80
},
{
"epoch": 2.107563025210084,
"eval_loss": 2.304074764251709,
"eval_runtime": 84.3347,
"eval_samples_per_second": 14.11,
"eval_steps_per_second": 1.767,
"step": 80
},
{
"epoch": 2.161344537815126,
"grad_norm": 0.09145358949899673,
"learning_rate": 0.0005272727272727272,
"loss": 2.2488,
"step": 82
},
{
"epoch": 2.215126050420168,
"grad_norm": 0.08459240943193436,
"learning_rate": 0.0004909090909090909,
"loss": 2.2518,
"step": 84
},
{
"epoch": 2.26890756302521,
"grad_norm": 0.09590018540620804,
"learning_rate": 0.00045454545454545455,
"loss": 2.2324,
"step": 86
},
{
"epoch": 2.3226890756302523,
"grad_norm": 0.10032965242862701,
"learning_rate": 0.00041818181818181814,
"loss": 2.2099,
"step": 88
},
{
"epoch": 2.376470588235294,
"grad_norm": 0.09092257171869278,
"learning_rate": 0.00038181818181818184,
"loss": 2.2077,
"step": 90
},
{
"epoch": 2.4302521008403364,
"grad_norm": 0.10066290944814682,
"learning_rate": 0.00034545454545454544,
"loss": 2.2629,
"step": 92
},
{
"epoch": 2.484033613445378,
"grad_norm": 0.0973694771528244,
"learning_rate": 0.0003090909090909091,
"loss": 2.2292,
"step": 94
},
{
"epoch": 2.53781512605042,
"grad_norm": 0.09254106879234314,
"learning_rate": 0.00027272727272727274,
"loss": 2.1923,
"step": 96
},
{
"epoch": 2.5915966386554623,
"grad_norm": 0.10056042671203613,
"learning_rate": 0.00023636363636363636,
"loss": 2.2445,
"step": 98
},
{
"epoch": 2.645378151260504,
"grad_norm": 0.09601625055074692,
"learning_rate": 0.0002,
"loss": 2.2605,
"step": 100
},
{
"epoch": 2.645378151260504,
"eval_loss": 2.3115394115448,
"eval_runtime": 84.2807,
"eval_samples_per_second": 14.119,
"eval_steps_per_second": 1.768,
"step": 100
},
{
"epoch": 2.6991596638655464,
"grad_norm": 0.09498832374811172,
"learning_rate": 0.00016363636363636363,
"loss": 2.215,
"step": 102
},
{
"epoch": 2.7529411764705882,
"grad_norm": 0.09191343188285828,
"learning_rate": 0.00012727272727272725,
"loss": 2.2116,
"step": 104
},
{
"epoch": 2.80672268907563,
"grad_norm": 0.10717286169528961,
"learning_rate": 9.090909090909092e-05,
"loss": 2.2435,
"step": 106
},
{
"epoch": 2.8605042016806723,
"grad_norm": 0.09715902805328369,
"learning_rate": 5.4545454545454546e-05,
"loss": 2.2196,
"step": 108
},
{
"epoch": 2.914285714285714,
"grad_norm": 0.10500436276197433,
"learning_rate": 1.8181818181818182e-05,
"loss": 2.2351,
"step": 110
},
{
"epoch": 2.9411764705882355,
"step": 111,
"total_flos": 8.1776874848256e+17,
"train_loss": 2.2894913076280474,
"train_runtime": 2825.7191,
"train_samples_per_second": 10.107,
"train_steps_per_second": 0.039
},
{
"epoch": 2.9411764705882355,
"eval_loss": 2.311664342880249,
"eval_runtime": 84.4111,
"eval_samples_per_second": 14.098,
"eval_steps_per_second": 1.765,
"step": 111
},
{
"epoch": 2.9411764705882355,
"eval_loss": 2.3323311805725098,
"eval_runtime": 84.1513,
"eval_samples_per_second": 14.141,
"eval_steps_per_second": 1.771,
"step": 111
}
],
"logging_steps": 2,
"max_steps": 111,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.1776874848256e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}