{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.006993006993006993,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 6.993006993006993e-05,
"grad_norm": 5.159224033355713,
"learning_rate": 1.048951048951049e-06,
"loss": 11.0749,
"step": 1
},
{
"epoch": 0.00013986013986013986,
"grad_norm": 5.171240329742432,
"learning_rate": 2.097902097902098e-06,
"loss": 11.076,
"step": 2
},
{
"epoch": 0.0002097902097902098,
"grad_norm": 5.199888229370117,
"learning_rate": 3.146853146853147e-06,
"loss": 11.0669,
"step": 3
},
{
"epoch": 0.0002797202797202797,
"grad_norm": 5.08656120300293,
"learning_rate": 4.195804195804196e-06,
"loss": 11.0237,
"step": 4
},
{
"epoch": 0.00034965034965034965,
"grad_norm": 5.065122127532959,
"learning_rate": 5.244755244755245e-06,
"loss": 10.8949,
"step": 5
},
{
"epoch": 0.0004195804195804196,
"grad_norm": 4.298279762268066,
"learning_rate": 6.293706293706294e-06,
"loss": 10.675,
"step": 6
},
{
"epoch": 0.0004895104895104895,
"grad_norm": 3.8743011951446533,
"learning_rate": 7.342657342657342e-06,
"loss": 10.4729,
"step": 7
},
{
"epoch": 0.0005594405594405594,
"grad_norm": 3.7294981479644775,
"learning_rate": 8.391608391608391e-06,
"loss": 10.3221,
"step": 8
},
{
"epoch": 0.0006293706293706294,
"grad_norm": 3.3471932411193848,
"learning_rate": 9.44055944055944e-06,
"loss": 10.1544,
"step": 9
},
{
"epoch": 0.0006993006993006993,
"grad_norm": 3.088207721710205,
"learning_rate": 1.048951048951049e-05,
"loss": 10.0823,
"step": 10
},
{
"epoch": 0.0007692307692307692,
"grad_norm": 2.713122606277466,
"learning_rate": 1.1538461538461538e-05,
"loss": 9.9324,
"step": 11
},
{
"epoch": 0.0008391608391608392,
"grad_norm": 2.5934906005859375,
"learning_rate": 1.2587412587412587e-05,
"loss": 9.8238,
"step": 12
},
{
"epoch": 0.0009090909090909091,
"grad_norm": 2.258924722671509,
"learning_rate": 1.3636363636363635e-05,
"loss": 9.7534,
"step": 13
},
{
"epoch": 0.000979020979020979,
"grad_norm": 2.0431277751922607,
"learning_rate": 1.4685314685314684e-05,
"loss": 9.683,
"step": 14
},
{
"epoch": 0.001048951048951049,
"grad_norm": 2.0174779891967773,
"learning_rate": 1.5734265734265734e-05,
"loss": 9.5835,
"step": 15
},
{
"epoch": 0.0011188811188811189,
"grad_norm": 1.9392926692962646,
"learning_rate": 1.6783216783216783e-05,
"loss": 9.566,
"step": 16
},
{
"epoch": 0.0011888111888111888,
"grad_norm": 1.841917634010315,
"learning_rate": 1.783216783216783e-05,
"loss": 9.4862,
"step": 17
},
{
"epoch": 0.0012587412587412587,
"grad_norm": 1.6751583814620972,
"learning_rate": 1.888111888111888e-05,
"loss": 9.4613,
"step": 18
},
{
"epoch": 0.0013286713286713287,
"grad_norm": 1.701292634010315,
"learning_rate": 1.9930069930069928e-05,
"loss": 9.3861,
"step": 19
},
{
"epoch": 0.0013986013986013986,
"grad_norm": 1.7422462701797485,
"learning_rate": 2.097902097902098e-05,
"loss": 9.3607,
"step": 20
},
{
"epoch": 0.0014685314685314685,
"grad_norm": 1.7331982851028442,
"learning_rate": 2.2027972027972026e-05,
"loss": 9.3106,
"step": 21
},
{
"epoch": 0.0015384615384615385,
"grad_norm": 1.8083829879760742,
"learning_rate": 2.3076923076923076e-05,
"loss": 9.252,
"step": 22
},
{
"epoch": 0.0016083916083916084,
"grad_norm": 1.7314563989639282,
"learning_rate": 2.412587412587412e-05,
"loss": 9.2325,
"step": 23
},
{
"epoch": 0.0016783216783216783,
"grad_norm": 1.5793718099594116,
"learning_rate": 2.5174825174825174e-05,
"loss": 9.2489,
"step": 24
},
{
"epoch": 0.0017482517482517483,
"grad_norm": 1.5183552503585815,
"learning_rate": 2.622377622377622e-05,
"loss": 9.1821,
"step": 25
},
{
"epoch": 0.0018181818181818182,
"grad_norm": 1.5751609802246094,
"learning_rate": 2.727272727272727e-05,
"loss": 9.1217,
"step": 26
},
{
"epoch": 0.0018881118881118881,
"grad_norm": 1.541367530822754,
"learning_rate": 2.832167832167832e-05,
"loss": 9.1186,
"step": 27
},
{
"epoch": 0.001958041958041958,
"grad_norm": 1.5820801258087158,
"learning_rate": 2.937062937062937e-05,
"loss": 9.105,
"step": 28
},
{
"epoch": 0.002027972027972028,
"grad_norm": 1.483319640159607,
"learning_rate": 3.0419580419580414e-05,
"loss": 9.0772,
"step": 29
},
{
"epoch": 0.002097902097902098,
"grad_norm": 1.5269867181777954,
"learning_rate": 3.146853146853147e-05,
"loss": 9.0185,
"step": 30
},
{
"epoch": 0.002167832167832168,
"grad_norm": 1.4273761510849,
"learning_rate": 3.251748251748251e-05,
"loss": 9.0013,
"step": 31
},
{
"epoch": 0.0022377622377622378,
"grad_norm": 1.4377866983413696,
"learning_rate": 3.3566433566433566e-05,
"loss": 8.9161,
"step": 32
},
{
"epoch": 0.002307692307692308,
"grad_norm": 1.369158148765564,
"learning_rate": 3.461538461538461e-05,
"loss": 8.917,
"step": 33
},
{
"epoch": 0.0023776223776223776,
"grad_norm": 1.328121542930603,
"learning_rate": 3.566433566433566e-05,
"loss": 8.8484,
"step": 34
},
{
"epoch": 0.0024475524475524478,
"grad_norm": 1.289856195449829,
"learning_rate": 3.671328671328671e-05,
"loss": 8.8637,
"step": 35
},
{
"epoch": 0.0025174825174825175,
"grad_norm": 1.2946962118148804,
"learning_rate": 3.776223776223776e-05,
"loss": 8.8045,
"step": 36
},
{
"epoch": 0.0025874125874125876,
"grad_norm": 1.3396406173706055,
"learning_rate": 3.881118881118881e-05,
"loss": 8.7569,
"step": 37
},
{
"epoch": 0.0026573426573426573,
"grad_norm": 1.3043384552001953,
"learning_rate": 3.9860139860139855e-05,
"loss": 8.7184,
"step": 38
},
{
"epoch": 0.0027272727272727275,
"grad_norm": 1.2441431283950806,
"learning_rate": 4.09090909090909e-05,
"loss": 8.66,
"step": 39
},
{
"epoch": 0.002797202797202797,
"grad_norm": 1.1379547119140625,
"learning_rate": 4.195804195804196e-05,
"loss": 8.6217,
"step": 40
},
{
"epoch": 0.0028671328671328673,
"grad_norm": 1.1792885065078735,
"learning_rate": 4.300699300699301e-05,
"loss": 8.5006,
"step": 41
},
{
"epoch": 0.002937062937062937,
"grad_norm": 1.183509349822998,
"learning_rate": 4.405594405594405e-05,
"loss": 8.5037,
"step": 42
},
{
"epoch": 0.003006993006993007,
"grad_norm": 1.1481878757476807,
"learning_rate": 4.51048951048951e-05,
"loss": 8.5114,
"step": 43
},
{
"epoch": 0.003076923076923077,
"grad_norm": 1.1832213401794434,
"learning_rate": 4.615384615384615e-05,
"loss": 8.4117,
"step": 44
},
{
"epoch": 0.003146853146853147,
"grad_norm": 1.0428388118743896,
"learning_rate": 4.72027972027972e-05,
"loss": 8.4378,
"step": 45
},
{
"epoch": 0.0032167832167832168,
"grad_norm": 1.1736469268798828,
"learning_rate": 4.825174825174824e-05,
"loss": 8.2981,
"step": 46
},
{
"epoch": 0.003286713286713287,
"grad_norm": 1.0533231496810913,
"learning_rate": 4.930069930069929e-05,
"loss": 8.3406,
"step": 47
},
{
"epoch": 0.0033566433566433566,
"grad_norm": 1.1464074850082397,
"learning_rate": 5.034965034965035e-05,
"loss": 8.2388,
"step": 48
},
{
"epoch": 0.003426573426573427,
"grad_norm": 1.224696159362793,
"learning_rate": 5.1398601398601395e-05,
"loss": 8.208,
"step": 49
},
{
"epoch": 0.0034965034965034965,
"grad_norm": 1.1058298349380493,
"learning_rate": 5.244755244755244e-05,
"loss": 8.192,
"step": 50
},
{
"epoch": 0.0035664335664335666,
"grad_norm": 1.0586011409759521,
"learning_rate": 5.3496503496503493e-05,
"loss": 8.1369,
"step": 51
},
{
"epoch": 0.0036363636363636364,
"grad_norm": 1.0969116687774658,
"learning_rate": 5.454545454545454e-05,
"loss": 8.0515,
"step": 52
},
{
"epoch": 0.0037062937062937065,
"grad_norm": 0.9527297616004944,
"learning_rate": 5.559440559440559e-05,
"loss": 8.0319,
"step": 53
},
{
"epoch": 0.0037762237762237762,
"grad_norm": 1.0115875005722046,
"learning_rate": 5.664335664335664e-05,
"loss": 7.9882,
"step": 54
},
{
"epoch": 0.0038461538461538464,
"grad_norm": 1.044940710067749,
"learning_rate": 5.769230769230769e-05,
"loss": 7.9776,
"step": 55
},
{
"epoch": 0.003916083916083916,
"grad_norm": 0.9222269058227539,
"learning_rate": 5.874125874125874e-05,
"loss": 7.9327,
"step": 56
},
{
"epoch": 0.003986013986013986,
"grad_norm": 0.9342182278633118,
"learning_rate": 5.979020979020978e-05,
"loss": 7.853,
"step": 57
},
{
"epoch": 0.004055944055944056,
"grad_norm": 0.9146339297294617,
"learning_rate": 6.083916083916083e-05,
"loss": 7.8091,
"step": 58
},
{
"epoch": 0.004125874125874126,
"grad_norm": 0.8632137775421143,
"learning_rate": 6.188811188811188e-05,
"loss": 7.7744,
"step": 59
},
{
"epoch": 0.004195804195804196,
"grad_norm": 1.001086711883545,
"learning_rate": 6.293706293706293e-05,
"loss": 7.7623,
"step": 60
},
{
"epoch": 0.0042657342657342655,
"grad_norm": 0.9110626578330994,
"learning_rate": 6.398601398601397e-05,
"loss": 7.7356,
"step": 61
},
{
"epoch": 0.004335664335664336,
"grad_norm": 0.7626254558563232,
"learning_rate": 6.503496503496503e-05,
"loss": 7.6801,
"step": 62
},
{
"epoch": 0.004405594405594406,
"grad_norm": 1.0254472494125366,
"learning_rate": 6.608391608391608e-05,
"loss": 7.6111,
"step": 63
},
{
"epoch": 0.0044755244755244755,
"grad_norm": 0.9034542441368103,
"learning_rate": 6.713286713286713e-05,
"loss": 7.603,
"step": 64
},
{
"epoch": 0.004545454545454545,
"grad_norm": 0.685672402381897,
"learning_rate": 6.818181818181817e-05,
"loss": 7.5614,
"step": 65
},
{
"epoch": 0.004615384615384616,
"grad_norm": 0.9960275292396545,
"learning_rate": 6.923076923076922e-05,
"loss": 7.5537,
"step": 66
},
{
"epoch": 0.0046853146853146855,
"grad_norm": 0.6702042818069458,
"learning_rate": 7.027972027972028e-05,
"loss": 7.4794,
"step": 67
},
{
"epoch": 0.004755244755244755,
"grad_norm": 0.8843338489532471,
"learning_rate": 7.132867132867132e-05,
"loss": 7.4486,
"step": 68
},
{
"epoch": 0.004825174825174825,
"grad_norm": 0.7096754908561707,
"learning_rate": 7.237762237762237e-05,
"loss": 7.4253,
"step": 69
},
{
"epoch": 0.0048951048951048955,
"grad_norm": 0.8532392382621765,
"learning_rate": 7.342657342657342e-05,
"loss": 7.383,
"step": 70
},
{
"epoch": 0.004965034965034965,
"grad_norm": 0.7407330870628357,
"learning_rate": 7.447552447552447e-05,
"loss": 7.4059,
"step": 71
},
{
"epoch": 0.005034965034965035,
"grad_norm": 0.7601925730705261,
"learning_rate": 7.552447552447553e-05,
"loss": 7.3522,
"step": 72
},
{
"epoch": 0.005104895104895105,
"grad_norm": 0.6095029711723328,
"learning_rate": 7.657342657342657e-05,
"loss": 7.3442,
"step": 73
},
{
"epoch": 0.005174825174825175,
"grad_norm": 0.8200113773345947,
"learning_rate": 7.762237762237762e-05,
"loss": 7.3272,
"step": 74
},
{
"epoch": 0.005244755244755245,
"grad_norm": 0.9570252299308777,
"learning_rate": 7.867132867132867e-05,
"loss": 7.2258,
"step": 75
},
{
"epoch": 0.005314685314685315,
"grad_norm": 1.005595088005066,
"learning_rate": 7.972027972027971e-05,
"loss": 7.2552,
"step": 76
},
{
"epoch": 0.005384615384615384,
"grad_norm": 0.7686155438423157,
"learning_rate": 8.076923076923076e-05,
"loss": 7.2615,
"step": 77
},
{
"epoch": 0.005454545454545455,
"grad_norm": 0.6844556927680969,
"learning_rate": 8.18181818181818e-05,
"loss": 7.1683,
"step": 78
},
{
"epoch": 0.005524475524475525,
"grad_norm": 0.665515661239624,
"learning_rate": 8.286713286713286e-05,
"loss": 7.2015,
"step": 79
},
{
"epoch": 0.005594405594405594,
"grad_norm": 0.7764729261398315,
"learning_rate": 8.391608391608392e-05,
"loss": 7.1474,
"step": 80
},
{
"epoch": 0.005664335664335664,
"grad_norm": 0.6877023577690125,
"learning_rate": 8.496503496503496e-05,
"loss": 7.1705,
"step": 81
},
{
"epoch": 0.005734265734265735,
"grad_norm": 0.678676426410675,
"learning_rate": 8.601398601398601e-05,
"loss": 7.1529,
"step": 82
},
{
"epoch": 0.005804195804195804,
"grad_norm": 0.8167915940284729,
"learning_rate": 8.706293706293705e-05,
"loss": 7.0738,
"step": 83
},
{
"epoch": 0.005874125874125874,
"grad_norm": 0.6965641975402832,
"learning_rate": 8.81118881118881e-05,
"loss": 7.1085,
"step": 84
},
{
"epoch": 0.005944055944055944,
"grad_norm": 0.9094026684761047,
"learning_rate": 8.916083916083914e-05,
"loss": 7.0906,
"step": 85
},
{
"epoch": 0.006013986013986014,
"grad_norm": 0.7994691133499146,
"learning_rate": 9.02097902097902e-05,
"loss": 7.0782,
"step": 86
},
{
"epoch": 0.006083916083916084,
"grad_norm": 1.007351279258728,
"learning_rate": 9.125874125874126e-05,
"loss": 7.0095,
"step": 87
},
{
"epoch": 0.006153846153846154,
"grad_norm": 0.8736310601234436,
"learning_rate": 9.23076923076923e-05,
"loss": 6.9629,
"step": 88
},
{
"epoch": 0.0062237762237762236,
"grad_norm": 0.6392287015914917,
"learning_rate": 9.335664335664336e-05,
"loss": 6.9908,
"step": 89
},
{
"epoch": 0.006293706293706294,
"grad_norm": 1.091141939163208,
"learning_rate": 9.44055944055944e-05,
"loss": 6.9523,
"step": 90
},
{
"epoch": 0.006363636363636364,
"grad_norm": 0.7908533215522766,
"learning_rate": 9.545454545454545e-05,
"loss": 6.9238,
"step": 91
},
{
"epoch": 0.0064335664335664336,
"grad_norm": 0.6706556081771851,
"learning_rate": 9.650349650349649e-05,
"loss": 6.9672,
"step": 92
},
{
"epoch": 0.006503496503496503,
"grad_norm": 0.9139024019241333,
"learning_rate": 9.755244755244754e-05,
"loss": 6.9997,
"step": 93
},
{
"epoch": 0.006573426573426574,
"grad_norm": 0.7192760109901428,
"learning_rate": 9.860139860139858e-05,
"loss": 6.936,
"step": 94
},
{
"epoch": 0.006643356643356644,
"grad_norm": 0.7734940052032471,
"learning_rate": 9.965034965034964e-05,
"loss": 7.0156,
"step": 95
},
{
"epoch": 0.006713286713286713,
"grad_norm": 0.8069332838058472,
"learning_rate": 0.0001006993006993007,
"loss": 6.8958,
"step": 96
},
{
"epoch": 0.006783216783216783,
"grad_norm": 1.0324732065200806,
"learning_rate": 0.00010174825174825174,
"loss": 6.8863,
"step": 97
},
{
"epoch": 0.006853146853146854,
"grad_norm": 0.6798093318939209,
"learning_rate": 0.00010279720279720279,
"loss": 6.8534,
"step": 98
},
{
"epoch": 0.006923076923076923,
"grad_norm": 0.7703111171722412,
"learning_rate": 0.00010384615384615383,
"loss": 6.7499,
"step": 99
},
{
"epoch": 0.006993006993006993,
"grad_norm": 0.941916823387146,
"learning_rate": 0.00010489510489510488,
"loss": 6.8574,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 14300,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.7825750933504e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}