{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.26628895184136,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0226628895184136,
"grad_norm": 5.715946698612309,
"learning_rate": 3.7037037037037036e-08,
"loss": 1.039,
"step": 1
},
{
"epoch": 0.0453257790368272,
"grad_norm": 5.918098634610158,
"learning_rate": 7.407407407407407e-08,
"loss": 1.0345,
"step": 2
},
{
"epoch": 0.0679886685552408,
"grad_norm": 5.967358491879423,
"learning_rate": 1.111111111111111e-07,
"loss": 1.0568,
"step": 3
},
{
"epoch": 0.0906515580736544,
"grad_norm": 6.076151471056227,
"learning_rate": 1.4814814814814815e-07,
"loss": 1.0407,
"step": 4
},
{
"epoch": 0.11331444759206799,
"grad_norm": 5.698276915195162,
"learning_rate": 1.8518518518518516e-07,
"loss": 1.0355,
"step": 5
},
{
"epoch": 0.1359773371104816,
"grad_norm": 5.524873495595531,
"learning_rate": 2.222222222222222e-07,
"loss": 1.0329,
"step": 6
},
{
"epoch": 0.15864022662889518,
"grad_norm": 5.663139068043792,
"learning_rate": 2.5925925925925923e-07,
"loss": 1.013,
"step": 7
},
{
"epoch": 0.1813031161473088,
"grad_norm": 5.483842003291619,
"learning_rate": 2.962962962962963e-07,
"loss": 1.0285,
"step": 8
},
{
"epoch": 0.20396600566572237,
"grad_norm": 5.501921058157795,
"learning_rate": 3.333333333333333e-07,
"loss": 1.0181,
"step": 9
},
{
"epoch": 0.22662889518413598,
"grad_norm": 5.691661611678567,
"learning_rate": 3.703703703703703e-07,
"loss": 1.0046,
"step": 10
},
{
"epoch": 0.24929178470254956,
"grad_norm": 5.490524973688248,
"learning_rate": 4.0740740740740737e-07,
"loss": 1.0291,
"step": 11
},
{
"epoch": 0.2719546742209632,
"grad_norm": 4.885236117260528,
"learning_rate": 4.444444444444444e-07,
"loss": 1.0084,
"step": 12
},
{
"epoch": 0.29461756373937675,
"grad_norm": 5.256688897749667,
"learning_rate": 4.814814814814814e-07,
"loss": 0.9945,
"step": 13
},
{
"epoch": 0.31728045325779036,
"grad_norm": 5.026023661790397,
"learning_rate": 5.185185185185185e-07,
"loss": 0.9936,
"step": 14
},
{
"epoch": 0.33994334277620397,
"grad_norm": 4.979666180740075,
"learning_rate": 5.555555555555555e-07,
"loss": 0.9997,
"step": 15
},
{
"epoch": 0.3626062322946176,
"grad_norm": 4.741351636847691,
"learning_rate": 5.925925925925926e-07,
"loss": 0.9904,
"step": 16
},
{
"epoch": 0.38526912181303113,
"grad_norm": 4.429638197959212,
"learning_rate": 6.296296296296296e-07,
"loss": 0.9779,
"step": 17
},
{
"epoch": 0.40793201133144474,
"grad_norm": 4.2702651723674006,
"learning_rate": 6.666666666666666e-07,
"loss": 0.9373,
"step": 18
},
{
"epoch": 0.43059490084985835,
"grad_norm": 4.371215055008036,
"learning_rate": 7.037037037037037e-07,
"loss": 0.9616,
"step": 19
},
{
"epoch": 0.45325779036827196,
"grad_norm": 4.300078040900759,
"learning_rate": 7.407407407407406e-07,
"loss": 0.9581,
"step": 20
},
{
"epoch": 0.47592067988668557,
"grad_norm": 4.242855799180736,
"learning_rate": 7.777777777777778e-07,
"loss": 0.9454,
"step": 21
},
{
"epoch": 0.4985835694050991,
"grad_norm": 3.4536592234259555,
"learning_rate": 8.148148148148147e-07,
"loss": 0.9274,
"step": 22
},
{
"epoch": 0.5212464589235127,
"grad_norm": 3.3525795982748203,
"learning_rate": 8.518518518518518e-07,
"loss": 0.8833,
"step": 23
},
{
"epoch": 0.5439093484419264,
"grad_norm": 3.110575381958802,
"learning_rate": 8.888888888888888e-07,
"loss": 0.9066,
"step": 24
},
{
"epoch": 0.56657223796034,
"grad_norm": 3.18785930927135,
"learning_rate": 9.259259259259259e-07,
"loss": 0.8896,
"step": 25
},
{
"epoch": 0.5892351274787535,
"grad_norm": 3.0188412291205684,
"learning_rate": 9.629629629629628e-07,
"loss": 0.9068,
"step": 26
},
{
"epoch": 0.6118980169971672,
"grad_norm": 3.0072699515749344,
"learning_rate": 1e-06,
"loss": 0.8959,
"step": 27
},
{
"epoch": 0.6345609065155807,
"grad_norm": 3.050779999599616,
"learning_rate": 9.999560724782173e-07,
"loss": 0.8648,
"step": 28
},
{
"epoch": 0.6572237960339944,
"grad_norm": 3.034749793056673,
"learning_rate": 9.998242976313776e-07,
"loss": 0.8763,
"step": 29
},
{
"epoch": 0.6798866855524079,
"grad_norm": 2.6230160618361897,
"learning_rate": 9.996046986136508e-07,
"loss": 0.8439,
"step": 30
},
{
"epoch": 0.7025495750708215,
"grad_norm": 2.619746810094255,
"learning_rate": 9.992973140107996e-07,
"loss": 0.8395,
"step": 31
},
{
"epoch": 0.7252124645892352,
"grad_norm": 2.2660982887250496,
"learning_rate": 9.989021978333994e-07,
"loss": 0.8407,
"step": 32
},
{
"epoch": 0.7478753541076487,
"grad_norm": 1.92948640709938,
"learning_rate": 9.984194195073478e-07,
"loss": 0.8175,
"step": 33
},
{
"epoch": 0.7705382436260623,
"grad_norm": 1.8673042037436878,
"learning_rate": 9.97849063861667e-07,
"loss": 0.7963,
"step": 34
},
{
"epoch": 0.7932011331444759,
"grad_norm": 1.841378707582655,
"learning_rate": 9.971912311135967e-07,
"loss": 0.8177,
"step": 35
},
{
"epoch": 0.8158640226628895,
"grad_norm": 1.6212101538356403,
"learning_rate": 9.964460368509865e-07,
"loss": 0.8036,
"step": 36
},
{
"epoch": 0.8385269121813032,
"grad_norm": 1.6148282593388759,
"learning_rate": 9.956136120119856e-07,
"loss": 0.7945,
"step": 37
},
{
"epoch": 0.8611898016997167,
"grad_norm": 1.5660870386151309,
"learning_rate": 9.946941028620347e-07,
"loss": 0.7919,
"step": 38
},
{
"epoch": 0.8838526912181303,
"grad_norm": 1.5162976532167538,
"learning_rate": 9.936876709681666e-07,
"loss": 0.7965,
"step": 39
},
{
"epoch": 0.9065155807365439,
"grad_norm": 1.4779616090178773,
"learning_rate": 9.92594493170617e-07,
"loss": 0.7872,
"step": 40
},
{
"epoch": 0.9291784702549575,
"grad_norm": 1.4588545367417372,
"learning_rate": 9.914147615517526e-07,
"loss": 0.7933,
"step": 41
},
{
"epoch": 0.9518413597733711,
"grad_norm": 1.2450088034935203,
"learning_rate": 9.901486834023181e-07,
"loss": 0.7401,
"step": 42
},
{
"epoch": 0.9745042492917847,
"grad_norm": 1.1159548060929454,
"learning_rate": 9.887964811850157e-07,
"loss": 0.7496,
"step": 43
},
{
"epoch": 0.9971671388101983,
"grad_norm": 1.0418410473138606,
"learning_rate": 9.87358392495415e-07,
"loss": 0.7568,
"step": 44
},
{
"epoch": 1.019830028328612,
"grad_norm": 2.1594760368768195,
"learning_rate": 9.858346700202048e-07,
"loss": 1.3469,
"step": 45
},
{
"epoch": 1.0424929178470255,
"grad_norm": 0.9706954495224399,
"learning_rate": 9.842255814927944e-07,
"loss": 0.7412,
"step": 46
},
{
"epoch": 1.065155807365439,
"grad_norm": 0.9479843401943371,
"learning_rate": 9.825314096462684e-07,
"loss": 0.712,
"step": 47
},
{
"epoch": 1.0878186968838528,
"grad_norm": 0.8785518016295425,
"learning_rate": 9.807524521637102e-07,
"loss": 0.721,
"step": 48
},
{
"epoch": 1.1104815864022664,
"grad_norm": 0.9083971698155864,
"learning_rate": 9.788890216258938e-07,
"loss": 0.7405,
"step": 49
},
{
"epoch": 1.13314447592068,
"grad_norm": 0.9052818651846114,
"learning_rate": 9.769414454563615e-07,
"loss": 0.7223,
"step": 50
},
{
"epoch": 1.1558073654390935,
"grad_norm": 0.8244297426454674,
"learning_rate": 9.749100658638914e-07,
"loss": 0.7113,
"step": 51
},
{
"epoch": 1.178470254957507,
"grad_norm": 0.7448472800310213,
"learning_rate": 9.72795239782369e-07,
"loss": 0.7001,
"step": 52
},
{
"epoch": 1.2011331444759206,
"grad_norm": 0.8936397991398377,
"learning_rate": 9.705973388080692e-07,
"loss": 0.6924,
"step": 53
},
{
"epoch": 1.2237960339943343,
"grad_norm": 0.7188466048624885,
"learning_rate": 9.68316749134364e-07,
"loss": 0.7005,
"step": 54
},
{
"epoch": 1.246458923512748,
"grad_norm": 0.6923178573722074,
"learning_rate": 9.659538714838633e-07,
"loss": 0.6983,
"step": 55
},
{
"epoch": 1.2691218130311614,
"grad_norm": 0.6963394168232236,
"learning_rate": 9.63509121038005e-07,
"loss": 0.6932,
"step": 56
},
{
"epoch": 1.291784702549575,
"grad_norm": 0.6743675615821408,
"learning_rate": 9.609829273641032e-07,
"loss": 0.6789,
"step": 57
},
{
"epoch": 1.3144475920679888,
"grad_norm": 0.6786035246894967,
"learning_rate": 9.583757343398684e-07,
"loss": 0.6628,
"step": 58
},
{
"epoch": 1.3371104815864023,
"grad_norm": 0.7270460673039131,
"learning_rate": 9.55688000075414e-07,
"loss": 0.6831,
"step": 59
},
{
"epoch": 1.3597733711048159,
"grad_norm": 0.6841455902480504,
"learning_rate": 9.529201968327616e-07,
"loss": 0.6951,
"step": 60
},
{
"epoch": 1.3824362606232294,
"grad_norm": 0.6153616879449294,
"learning_rate": 9.500728109428603e-07,
"loss": 0.676,
"step": 61
},
{
"epoch": 1.405099150141643,
"grad_norm": 0.6177487537567523,
"learning_rate": 9.47146342720133e-07,
"loss": 0.6842,
"step": 62
},
{
"epoch": 1.4277620396600565,
"grad_norm": 0.5753559089127149,
"learning_rate": 9.441413063745659e-07,
"loss": 0.6408,
"step": 63
},
{
"epoch": 1.4504249291784703,
"grad_norm": 0.620464077741966,
"learning_rate": 9.410582299213572e-07,
"loss": 0.6952,
"step": 64
},
{
"epoch": 1.4730878186968839,
"grad_norm": 0.587732312757755,
"learning_rate": 9.378976550881392e-07,
"loss": 0.6897,
"step": 65
},
{
"epoch": 1.4957507082152974,
"grad_norm": 0.6133303288545134,
"learning_rate": 9.346601372197913e-07,
"loss": 0.6319,
"step": 66
},
{
"epoch": 1.5184135977337112,
"grad_norm": 0.5975684805854956,
"learning_rate": 9.313462451808599e-07,
"loss": 0.7085,
"step": 67
},
{
"epoch": 1.5410764872521248,
"grad_norm": 0.5691716789827311,
"learning_rate": 9.279565612556042e-07,
"loss": 0.6799,
"step": 68
},
{
"epoch": 1.5637393767705383,
"grad_norm": 0.5623581760482004,
"learning_rate": 9.24491681045682e-07,
"loss": 0.6627,
"step": 69
},
{
"epoch": 1.5864022662889519,
"grad_norm": 0.5545018113642449,
"learning_rate": 9.209522133654968e-07,
"loss": 0.6673,
"step": 70
},
{
"epoch": 1.6090651558073654,
"grad_norm": 0.6223379664208608,
"learning_rate": 9.17338780135223e-07,
"loss": 0.6682,
"step": 71
},
{
"epoch": 1.631728045325779,
"grad_norm": 0.5484348938274137,
"learning_rate": 9.136520162715286e-07,
"loss": 0.6459,
"step": 72
},
{
"epoch": 1.6543909348441925,
"grad_norm": 0.598633459691356,
"learning_rate": 9.098925695760131e-07,
"loss": 0.6663,
"step": 73
},
{
"epoch": 1.677053824362606,
"grad_norm": 0.6063642708751795,
"learning_rate": 9.060611006213832e-07,
"loss": 0.6471,
"step": 74
},
{
"epoch": 1.6997167138810199,
"grad_norm": 0.5310843433827631,
"learning_rate": 9.021582826353824e-07,
"loss": 0.6422,
"step": 75
},
{
"epoch": 1.7223796033994334,
"grad_norm": 0.5899701772442509,
"learning_rate": 8.981848013824993e-07,
"loss": 0.6616,
"step": 76
},
{
"epoch": 1.7450424929178472,
"grad_norm": 0.6774981304086599,
"learning_rate": 8.94141355043471e-07,
"loss": 0.6442,
"step": 77
},
{
"epoch": 1.7677053824362607,
"grad_norm": 0.5555862881849043,
"learning_rate": 8.90028654092606e-07,
"loss": 0.6427,
"step": 78
},
{
"epoch": 1.7903682719546743,
"grad_norm": 0.5521769324318557,
"learning_rate": 8.858474211729469e-07,
"loss": 0.6308,
"step": 79
},
{
"epoch": 1.8130311614730878,
"grad_norm": 0.5094008328024741,
"learning_rate": 8.815983909692941e-07,
"loss": 0.6375,
"step": 80
},
{
"epoch": 1.8356940509915014,
"grad_norm": 0.47949684902186096,
"learning_rate": 8.77282310079115e-07,
"loss": 0.6124,
"step": 81
},
{
"epoch": 1.858356940509915,
"grad_norm": 0.5457213358478963,
"learning_rate": 8.72899936881359e-07,
"loss": 0.676,
"step": 82
},
{
"epoch": 1.8810198300283285,
"grad_norm": 0.5475114660934921,
"learning_rate": 8.684520414032023e-07,
"loss": 0.6462,
"step": 83
},
{
"epoch": 1.903682719546742,
"grad_norm": 0.5780771596548755,
"learning_rate": 8.639394051847471e-07,
"loss": 0.629,
"step": 84
},
{
"epoch": 1.9263456090651558,
"grad_norm": 0.5153044368837152,
"learning_rate": 8.593628211416963e-07,
"loss": 0.6607,
"step": 85
},
{
"epoch": 1.9490084985835694,
"grad_norm": 0.5078347714748787,
"learning_rate": 8.547230934260311e-07,
"loss": 0.653,
"step": 86
},
{
"epoch": 1.9716713881019832,
"grad_norm": 0.5090369208403657,
"learning_rate": 8.500210372847126e-07,
"loss": 0.6555,
"step": 87
},
{
"epoch": 1.9943342776203967,
"grad_norm": 0.521639825746896,
"learning_rate": 8.45257478916435e-07,
"loss": 0.6187,
"step": 88
},
{
"epoch": 2.0169971671388103,
"grad_norm": 1.4410319682327064,
"learning_rate": 8.404332553264546e-07,
"loss": 1.1825,
"step": 89
},
{
"epoch": 2.039660056657224,
"grad_norm": 0.5362038209234066,
"learning_rate": 8.355492141795184e-07,
"loss": 0.6046,
"step": 90
},
{
"epoch": 2.0623229461756374,
"grad_norm": 0.5295224430525873,
"learning_rate": 8.306062136509219e-07,
"loss": 0.607,
"step": 91
},
{
"epoch": 2.084985835694051,
"grad_norm": 0.5438204610049183,
"learning_rate": 8.256051222757187e-07,
"loss": 0.6425,
"step": 92
},
{
"epoch": 2.1076487252124645,
"grad_norm": 0.5637438849056178,
"learning_rate": 8.2054681879611e-07,
"loss": 0.6472,
"step": 93
},
{
"epoch": 2.130311614730878,
"grad_norm": 0.4915626737833171,
"learning_rate": 8.154321920070412e-07,
"loss": 0.6366,
"step": 94
},
{
"epoch": 2.1529745042492916,
"grad_norm": 0.5445000714826581,
"learning_rate": 8.102621406000308e-07,
"loss": 0.6302,
"step": 95
},
{
"epoch": 2.1756373937677056,
"grad_norm": 0.544017574639994,
"learning_rate": 8.050375730052621e-07,
"loss": 0.6016,
"step": 96
},
{
"epoch": 2.198300283286119,
"grad_norm": 0.7667138278664033,
"learning_rate": 7.997594072319625e-07,
"loss": 0.6476,
"step": 97
},
{
"epoch": 2.2209631728045327,
"grad_norm": 0.5723261101431134,
"learning_rate": 7.944285707070997e-07,
"loss": 0.5982,
"step": 98
},
{
"epoch": 2.2436260623229463,
"grad_norm": 0.5198427284810859,
"learning_rate": 7.890460001124241e-07,
"loss": 0.6373,
"step": 99
},
{
"epoch": 2.26628895184136,
"grad_norm": 0.5082652201383684,
"learning_rate": 7.83612641219884e-07,
"loss": 0.5894,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 264,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 208143843852288.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}