affine-forge-test-16-2epoch / trainer_state.json
ATL-Machine's picture
Upload model (bfloat16) from script
1ccd6a0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.1621621621621623,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02702702702702703,
"grad_norm": 0.042490359395742416,
"learning_rate": 0.0,
"loss": 0.1846,
"step": 1
},
{
"epoch": 0.05405405405405406,
"grad_norm": 0.04087316244840622,
"learning_rate": 1e-05,
"loss": 0.1756,
"step": 2
},
{
"epoch": 0.08108108108108109,
"grad_norm": 0.04440251365303993,
"learning_rate": 2e-05,
"loss": 0.1709,
"step": 3
},
{
"epoch": 0.10810810810810811,
"grad_norm": 0.07775039225816727,
"learning_rate": 3e-05,
"loss": 0.233,
"step": 4
},
{
"epoch": 0.13513513513513514,
"grad_norm": 0.04126206785440445,
"learning_rate": 4e-05,
"loss": 0.1952,
"step": 5
},
{
"epoch": 0.16216216216216217,
"grad_norm": 0.03525131568312645,
"learning_rate": 5e-05,
"loss": 0.1972,
"step": 6
},
{
"epoch": 0.1891891891891892,
"grad_norm": 0.028111394494771957,
"learning_rate": 4.9989020912719864e-05,
"loss": 0.1606,
"step": 7
},
{
"epoch": 0.21621621621621623,
"grad_norm": 0.026041388511657715,
"learning_rate": 4.995609329410804e-05,
"loss": 0.1998,
"step": 8
},
{
"epoch": 0.24324324324324326,
"grad_norm": 0.03152597323060036,
"learning_rate": 4.990124606538042e-05,
"loss": 0.132,
"step": 9
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.042134612798690796,
"learning_rate": 4.982452740033793e-05,
"loss": 0.1698,
"step": 10
},
{
"epoch": 0.2972972972972973,
"grad_norm": 0.030262183398008347,
"learning_rate": 4.97260046830541e-05,
"loss": 0.1368,
"step": 11
},
{
"epoch": 0.32432432432432434,
"grad_norm": 0.025897588580846786,
"learning_rate": 4.960576444868992e-05,
"loss": 0.2145,
"step": 12
},
{
"epoch": 0.35135135135135137,
"grad_norm": 0.019992820918560028,
"learning_rate": 4.94639123074876e-05,
"loss": 0.177,
"step": 13
},
{
"epoch": 0.3783783783783784,
"grad_norm": 0.0250578373670578,
"learning_rate": 4.930057285201027e-05,
"loss": 0.1203,
"step": 14
},
{
"epoch": 0.40540540540540543,
"grad_norm": 0.02095206454396248,
"learning_rate": 4.911588954770897e-05,
"loss": 0.2099,
"step": 15
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.024979041889309883,
"learning_rate": 4.891002460691306e-05,
"loss": 0.2099,
"step": 16
},
{
"epoch": 0.4594594594594595,
"grad_norm": 0.02010725811123848,
"learning_rate": 4.8683158846354786e-05,
"loss": 0.1187,
"step": 17
},
{
"epoch": 0.4864864864864865,
"grad_norm": 0.02482457086443901,
"learning_rate": 4.8435491528353026e-05,
"loss": 0.1486,
"step": 18
},
{
"epoch": 0.5135135135135135,
"grad_norm": 0.027636045590043068,
"learning_rate": 4.8167240185795835e-05,
"loss": 0.129,
"step": 19
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.02037637121975422,
"learning_rate": 4.787864043107546e-05,
"loss": 0.1781,
"step": 20
},
{
"epoch": 0.5675675675675675,
"grad_norm": 0.01964612863957882,
"learning_rate": 4.756994574914359e-05,
"loss": 0.1359,
"step": 21
},
{
"epoch": 0.5945945945945946,
"grad_norm": 0.03788142651319504,
"learning_rate": 4.724142727486869e-05,
"loss": 0.2172,
"step": 22
},
{
"epoch": 0.6216216216216216,
"grad_norm": 0.02042173221707344,
"learning_rate": 4.6893373554890925e-05,
"loss": 0.1855,
"step": 23
},
{
"epoch": 0.6486486486486487,
"grad_norm": 0.02441004291176796,
"learning_rate": 4.652609029418389e-05,
"loss": 0.1384,
"step": 24
},
{
"epoch": 0.6756756756756757,
"grad_norm": 0.02537146583199501,
"learning_rate": 4.613990008754565e-05,
"loss": 0.1964,
"step": 25
},
{
"epoch": 0.7027027027027027,
"grad_norm": 0.019411351531744003,
"learning_rate": 4.573514213625505e-05,
"loss": 0.164,
"step": 26
},
{
"epoch": 0.7297297297297297,
"grad_norm": 0.022983919829130173,
"learning_rate": 4.5312171950142034e-05,
"loss": 0.1583,
"step": 27
},
{
"epoch": 0.7567567567567568,
"grad_norm": 0.020284397527575493,
"learning_rate": 4.4871361035333836e-05,
"loss": 0.1925,
"step": 28
},
{
"epoch": 0.7837837837837838,
"grad_norm": 0.016482355073094368,
"learning_rate": 4.441309656795106e-05,
"loss": 0.1149,
"step": 29
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.02259289100766182,
"learning_rate": 4.3937781054040505e-05,
"loss": 0.1415,
"step": 30
},
{
"epoch": 0.8378378378378378,
"grad_norm": 0.02618718333542347,
"learning_rate": 4.344583197604318e-05,
"loss": 0.1105,
"step": 31
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.019114641472697258,
"learning_rate": 4.293768142610828e-05,
"loss": 0.1691,
"step": 32
},
{
"epoch": 0.8918918918918919,
"grad_norm": 0.01756560429930687,
"learning_rate": 4.241377572657493e-05,
"loss": 0.1338,
"step": 33
},
{
"epoch": 0.918918918918919,
"grad_norm": 0.017911238595843315,
"learning_rate": 4.187457503795527e-05,
"loss": 0.1116,
"step": 34
},
{
"epoch": 0.9459459459459459,
"grad_norm": 0.021960722282528877,
"learning_rate": 4.1320552954763044e-05,
"loss": 0.1448,
"step": 35
},
{
"epoch": 0.972972972972973,
"grad_norm": 0.020443174988031387,
"learning_rate": 4.075219608954278e-05,
"loss": 0.127,
"step": 36
},
{
"epoch": 1.0,
"grad_norm": 0.018107520416378975,
"learning_rate": 4.017000364546484e-05,
"loss": 0.1173,
"step": 37
},
{
"epoch": 1.027027027027027,
"grad_norm": 0.018736043944954872,
"learning_rate": 3.95744869778618e-05,
"loss": 0.1401,
"step": 38
},
{
"epoch": 1.054054054054054,
"grad_norm": 0.017554383724927902,
"learning_rate": 3.896616914509131e-05,
"loss": 0.0959,
"step": 39
},
{
"epoch": 1.0810810810810811,
"grad_norm": 0.017801837995648384,
"learning_rate": 3.8345584449119776e-05,
"loss": 0.1627,
"step": 40
},
{
"epoch": 1.1081081081081081,
"grad_norm": 0.015455273911356926,
"learning_rate": 3.7713277966230514e-05,
"loss": 0.0959,
"step": 41
},
{
"epoch": 1.135135135135135,
"grad_norm": 0.017967497929930687,
"learning_rate": 3.706980506826863e-05,
"loss": 0.1315,
"step": 42
},
{
"epoch": 1.1621621621621623,
"grad_norm": 0.020251786336302757,
"learning_rate": 3.6415730934842827e-05,
"loss": 0.1878,
"step": 43
},
{
"epoch": 1.1891891891891893,
"grad_norm": 0.0196075476706028,
"learning_rate": 3.575163005691302e-05,
"loss": 0.148,
"step": 44
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.019250644370913506,
"learning_rate": 3.507808573219931e-05,
"loss": 0.1018,
"step": 45
},
{
"epoch": 1.2432432432432432,
"grad_norm": 0.022631129249930382,
"learning_rate": 3.4395689552855955e-05,
"loss": 0.1352,
"step": 46
},
{
"epoch": 1.2702702702702702,
"grad_norm": 0.015486803837120533,
"learning_rate": 3.3705040885859975e-05,
"loss": 0.1104,
"step": 47
},
{
"epoch": 1.2972972972972974,
"grad_norm": 0.017026731744408607,
"learning_rate": 3.300674634657094e-05,
"loss": 0.1514,
"step": 48
},
{
"epoch": 1.3243243243243243,
"grad_norm": 0.019450196996331215,
"learning_rate": 3.2301419265924395e-05,
"loss": 0.0962,
"step": 49
},
{
"epoch": 1.3513513513513513,
"grad_norm": 0.020679926499724388,
"learning_rate": 3.158967915172669e-05,
"loss": 0.1086,
"step": 50
},
{
"epoch": 1.3783783783783785,
"grad_norm": 0.022294625639915466,
"learning_rate": 3.0872151144524595e-05,
"loss": 0.1686,
"step": 51
},
{
"epoch": 1.4054054054054055,
"grad_norm": 0.015719972550868988,
"learning_rate": 3.014946546852746e-05,
"loss": 0.1041,
"step": 52
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.015457438305020332,
"learning_rate": 2.9422256878064325e-05,
"loss": 0.0716,
"step": 53
},
{
"epoch": 1.4594594594594594,
"grad_norm": 0.020002242177724838,
"learning_rate": 2.8691164100062034e-05,
"loss": 0.1544,
"step": 54
},
{
"epoch": 1.4864864864864864,
"grad_norm": 0.018282748758792877,
"learning_rate": 2.7956829273034148e-05,
"loss": 0.0992,
"step": 55
},
{
"epoch": 1.5135135135135136,
"grad_norm": 0.024401405826210976,
"learning_rate": 2.7219897383073373e-05,
"loss": 0.101,
"step": 56
},
{
"epoch": 1.5405405405405406,
"grad_norm": 0.023635603487491608,
"learning_rate": 2.648101569734286e-05,
"loss": 0.0839,
"step": 57
},
{
"epoch": 1.5675675675675675,
"grad_norm": 0.017611190676689148,
"learning_rate": 2.5740833195563996e-05,
"loss": 0.13,
"step": 58
},
{
"epoch": 1.5945945945945947,
"grad_norm": 0.025297798216342926,
"learning_rate": 2.5e-05,
"loss": 0.0823,
"step": 59
},
{
"epoch": 1.6216216216216215,
"grad_norm": 0.016872813925147057,
"learning_rate": 2.4259166804436006e-05,
"loss": 0.1504,
"step": 60
},
{
"epoch": 1.6486486486486487,
"grad_norm": 0.019432147964835167,
"learning_rate": 2.3518984302657146e-05,
"loss": 0.0662,
"step": 61
},
{
"epoch": 1.6756756756756757,
"grad_norm": 0.021674757823348045,
"learning_rate": 2.2780102616926633e-05,
"loss": 0.1402,
"step": 62
},
{
"epoch": 1.7027027027027026,
"grad_norm": 0.017999498173594475,
"learning_rate": 2.2043170726965858e-05,
"loss": 0.0848,
"step": 63
},
{
"epoch": 1.7297297297297298,
"grad_norm": 0.01601288840174675,
"learning_rate": 2.1308835899937972e-05,
"loss": 0.0835,
"step": 64
},
{
"epoch": 1.7567567567567568,
"grad_norm": 0.017392445355653763,
"learning_rate": 2.0577743121935684e-05,
"loss": 0.1108,
"step": 65
},
{
"epoch": 1.7837837837837838,
"grad_norm": 0.017404066398739815,
"learning_rate": 1.9850534531472546e-05,
"loss": 0.1703,
"step": 66
},
{
"epoch": 1.810810810810811,
"grad_norm": 0.022098753601312637,
"learning_rate": 1.912784885547541e-05,
"loss": 0.0759,
"step": 67
},
{
"epoch": 1.8378378378378377,
"grad_norm": 0.022605005651712418,
"learning_rate": 1.8410320848273315e-05,
"loss": 0.0847,
"step": 68
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.01858612895011902,
"learning_rate": 1.769858073407561e-05,
"loss": 0.136,
"step": 69
},
{
"epoch": 1.8918918918918919,
"grad_norm": 0.016947893425822258,
"learning_rate": 1.6993253653429063e-05,
"loss": 0.1174,
"step": 70
},
{
"epoch": 1.9189189189189189,
"grad_norm": 0.017410583794116974,
"learning_rate": 1.6294959114140034e-05,
"loss": 0.134,
"step": 71
},
{
"epoch": 1.945945945945946,
"grad_norm": 0.022370297461748123,
"learning_rate": 1.560431044714405e-05,
"loss": 0.1031,
"step": 72
},
{
"epoch": 1.972972972972973,
"grad_norm": 0.0391063429415226,
"learning_rate": 1.49219142678007e-05,
"loss": 0.1099,
"step": 73
},
{
"epoch": 2.0,
"grad_norm": 0.02405022457242012,
"learning_rate": 1.4248369943086998e-05,
"loss": 0.1621,
"step": 74
},
{
"epoch": 2.027027027027027,
"grad_norm": 0.017795320600271225,
"learning_rate": 1.3584269065157174e-05,
"loss": 0.0567,
"step": 75
},
{
"epoch": 2.054054054054054,
"grad_norm": 0.01834317483007908,
"learning_rate": 1.2930194931731382e-05,
"loss": 0.1826,
"step": 76
},
{
"epoch": 2.081081081081081,
"grad_norm": 0.020555740222334862,
"learning_rate": 1.2286722033769493e-05,
"loss": 0.0902,
"step": 77
},
{
"epoch": 2.108108108108108,
"grad_norm": 0.01855107955634594,
"learning_rate": 1.1654415550880243e-05,
"loss": 0.0787,
"step": 78
},
{
"epoch": 2.135135135135135,
"grad_norm": 0.012949816882610321,
"learning_rate": 1.1033830854908691e-05,
"loss": 0.109,
"step": 79
},
{
"epoch": 2.1621621621621623,
"grad_norm": 0.017050473019480705,
"learning_rate": 1.0425513022138203e-05,
"loss": 0.1013,
"step": 80
}
],
"logging_steps": 1,
"max_steps": 111,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6429622867315917e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}