{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 152,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013157894736842105,
"grad_norm": 0.011193246343451556,
"learning_rate": 5e-06,
"loss": 0.1709,
"step": 1
},
{
"epoch": 0.02631578947368421,
"grad_norm": 0.012632831062784382,
"learning_rate": 1e-05,
"loss": 0.1722,
"step": 2
},
{
"epoch": 0.039473684210526314,
"grad_norm": 0.01217162428713648,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1719,
"step": 3
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.014278596408591488,
"learning_rate": 2e-05,
"loss": 0.1736,
"step": 4
},
{
"epoch": 0.06578947368421052,
"grad_norm": 0.011651798020362559,
"learning_rate": 2.5e-05,
"loss": 0.1693,
"step": 5
},
{
"epoch": 0.07894736842105263,
"grad_norm": 0.013076438611297115,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.1746,
"step": 6
},
{
"epoch": 0.09210526315789473,
"grad_norm": 0.013682964249895072,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.1704,
"step": 7
},
{
"epoch": 0.10526315789473684,
"grad_norm": 0.014686466968317395,
"learning_rate": 4e-05,
"loss": 0.1699,
"step": 8
},
{
"epoch": 0.11842105263157894,
"grad_norm": 0.016281931534044915,
"learning_rate": 4.5e-05,
"loss": 0.1689,
"step": 9
},
{
"epoch": 0.13157894736842105,
"grad_norm": 0.0171854262921491,
"learning_rate": 5e-05,
"loss": 0.1687,
"step": 10
},
{
"epoch": 0.14473684210526316,
"grad_norm": 0.018894242233212503,
"learning_rate": 5.5e-05,
"loss": 0.1646,
"step": 11
},
{
"epoch": 0.15789473684210525,
"grad_norm": 0.01779881219815579,
"learning_rate": 6.000000000000001e-05,
"loss": 0.1569,
"step": 12
},
{
"epoch": 0.17105263157894737,
"grad_norm": 0.017710450658139844,
"learning_rate": 6.500000000000001e-05,
"loss": 0.1545,
"step": 13
},
{
"epoch": 0.18421052631578946,
"grad_norm": 0.017592203426232824,
"learning_rate": 7.000000000000001e-05,
"loss": 0.1479,
"step": 14
},
{
"epoch": 0.19736842105263158,
"grad_norm": 0.017739355110624643,
"learning_rate": 7.500000000000001e-05,
"loss": 0.1412,
"step": 15
},
{
"epoch": 0.21052631578947367,
"grad_norm": 0.01830298878239801,
"learning_rate": 8e-05,
"loss": 0.1377,
"step": 16
},
{
"epoch": 0.2236842105263158,
"grad_norm": 0.019099329601174855,
"learning_rate": 7.99893283244655e-05,
"loss": 0.138,
"step": 17
},
{
"epoch": 0.23684210526315788,
"grad_norm": 0.020182334218850437,
"learning_rate": 7.995731899209491e-05,
"loss": 0.1335,
"step": 18
},
{
"epoch": 0.25,
"grad_norm": 0.021528435171673927,
"learning_rate": 7.990398908254869e-05,
"loss": 0.1262,
"step": 19
},
{
"epoch": 0.2631578947368421,
"grad_norm": 0.019272557399678692,
"learning_rate": 7.982936705180139e-05,
"loss": 0.1199,
"step": 20
},
{
"epoch": 0.27631578947368424,
"grad_norm": 0.01555919871902734,
"learning_rate": 7.9733492716958e-05,
"loss": 0.1155,
"step": 21
},
{
"epoch": 0.2894736842105263,
"grad_norm": 0.014069638973321082,
"learning_rate": 7.961641723500821e-05,
"loss": 0.1124,
"step": 22
},
{
"epoch": 0.3026315789473684,
"grad_norm": 0.01364776366814878,
"learning_rate": 7.947820307552984e-05,
"loss": 0.1095,
"step": 23
},
{
"epoch": 0.3157894736842105,
"grad_norm": 0.013427503151618732,
"learning_rate": 7.931892398735608e-05,
"loss": 0.1065,
"step": 24
},
{
"epoch": 0.32894736842105265,
"grad_norm": 0.011876878823261603,
"learning_rate": 7.913866495922436e-05,
"loss": 0.1003,
"step": 25
},
{
"epoch": 0.34210526315789475,
"grad_norm": 0.011314513909875947,
"learning_rate": 7.893752217442773e-05,
"loss": 0.096,
"step": 26
},
{
"epoch": 0.35526315789473684,
"grad_norm": 0.010900987899870946,
"learning_rate": 7.871560295949292e-05,
"loss": 0.0927,
"step": 27
},
{
"epoch": 0.3684210526315789,
"grad_norm": 0.011694060724422608,
"learning_rate": 7.847302572691277e-05,
"loss": 0.0835,
"step": 28
},
{
"epoch": 0.3815789473684211,
"grad_norm": 0.012306370489616343,
"learning_rate": 7.820991991196321e-05,
"loss": 0.0818,
"step": 29
},
{
"epoch": 0.39473684210526316,
"grad_norm": 0.013091064612739244,
"learning_rate": 7.792642590363864e-05,
"loss": 0.0784,
"step": 30
},
{
"epoch": 0.40789473684210525,
"grad_norm": 0.012940667007794919,
"learning_rate": 7.762269496974271e-05,
"loss": 0.0726,
"step": 31
},
{
"epoch": 0.42105263157894735,
"grad_norm": 0.012578766778023602,
"learning_rate": 7.729888917617424e-05,
"loss": 0.0723,
"step": 32
},
{
"epoch": 0.4342105263157895,
"grad_norm": 0.011800836575446658,
"learning_rate": 7.695518130045147e-05,
"loss": 0.0629,
"step": 33
},
{
"epoch": 0.4473684210526316,
"grad_norm": 0.01216383986530226,
"learning_rate": 7.659175473952085e-05,
"loss": 0.0583,
"step": 34
},
{
"epoch": 0.4605263157894737,
"grad_norm": 0.012221190154378425,
"learning_rate": 7.620880341189928e-05,
"loss": 0.0547,
"step": 35
},
{
"epoch": 0.47368421052631576,
"grad_norm": 0.012221416929572789,
"learning_rate": 7.58065316542025e-05,
"loss": 0.0537,
"step": 36
},
{
"epoch": 0.4868421052631579,
"grad_norm": 0.01022797450598566,
"learning_rate": 7.538515411211422e-05,
"loss": 0.0525,
"step": 37
},
{
"epoch": 0.5,
"grad_norm": 0.010196900578949973,
"learning_rate": 7.494489562585479e-05,
"loss": 0.0522,
"step": 38
},
{
"epoch": 0.5131578947368421,
"grad_norm": 0.01081474068902212,
"learning_rate": 7.448599111021003e-05,
"loss": 0.0497,
"step": 39
},
{
"epoch": 0.5263157894736842,
"grad_norm": 0.011127976549735704,
"learning_rate": 7.400868542918457e-05,
"loss": 0.0494,
"step": 40
},
{
"epoch": 0.5394736842105263,
"grad_norm": 0.009719680216921442,
"learning_rate": 7.351323326534634e-05,
"loss": 0.0474,
"step": 41
},
{
"epoch": 0.5526315789473685,
"grad_norm": 0.011121396526423858,
"learning_rate": 7.299989898393209e-05,
"loss": 0.0491,
"step": 42
},
{
"epoch": 0.5657894736842105,
"grad_norm": 0.00932521994941077,
"learning_rate": 7.246895649178646e-05,
"loss": 0.0468,
"step": 43
},
{
"epoch": 0.5789473684210527,
"grad_norm": 0.010516122343315393,
"learning_rate": 7.192068909120959e-05,
"loss": 0.046,
"step": 44
},
{
"epoch": 0.5921052631578947,
"grad_norm": 0.008426401948130227,
"learning_rate": 7.135538932879176e-05,
"loss": 0.0416,
"step": 45
},
{
"epoch": 0.6052631578947368,
"grad_norm": 0.009231664562787666,
"learning_rate": 7.077335883931516e-05,
"loss": 0.0463,
"step": 46
},
{
"epoch": 0.618421052631579,
"grad_norm": 0.007751899943539208,
"learning_rate": 7.017490818480657e-05,
"loss": 0.0439,
"step": 47
},
{
"epoch": 0.631578947368421,
"grad_norm": 0.007380790901519942,
"learning_rate": 6.956035668882637e-05,
"loss": 0.0406,
"step": 48
},
{
"epoch": 0.6447368421052632,
"grad_norm": 0.007089333975023999,
"learning_rate": 6.893003226608281e-05,
"loss": 0.0417,
"step": 49
},
{
"epoch": 0.6578947368421053,
"grad_norm": 0.0074054283948870505,
"learning_rate": 6.828427124746191e-05,
"loss": 0.0423,
"step": 50
},
{
"epoch": 0.6710526315789473,
"grad_norm": 0.006910246968440978,
"learning_rate": 6.762341820056687e-05,
"loss": 0.0449,
"step": 51
},
{
"epoch": 0.6842105263157895,
"grad_norm": 0.006530381828194509,
"learning_rate": 6.69478257458623e-05,
"loss": 0.0428,
"step": 52
},
{
"epoch": 0.6973684210526315,
"grad_norm": 0.007097174249867035,
"learning_rate": 6.625785436852172e-05,
"loss": 0.0425,
"step": 53
},
{
"epoch": 0.7105263157894737,
"grad_norm": 0.006711690699573767,
"learning_rate": 6.555387222607845e-05,
"loss": 0.0435,
"step": 54
},
{
"epoch": 0.7236842105263158,
"grad_norm": 0.00701588418316799,
"learning_rate": 6.483625495198287e-05,
"loss": 0.0439,
"step": 55
},
{
"epoch": 0.7368421052631579,
"grad_norm": 0.006833977043231902,
"learning_rate": 6.410538545517026e-05,
"loss": 0.0417,
"step": 56
},
{
"epoch": 0.75,
"grad_norm": 0.005986983591642089,
"learning_rate": 6.336165371574707e-05,
"loss": 0.0434,
"step": 57
},
{
"epoch": 0.7631578947368421,
"grad_norm": 0.0062782002366031,
"learning_rate": 6.260545657690369e-05,
"loss": 0.0403,
"step": 58
},
{
"epoch": 0.7763157894736842,
"grad_norm": 0.006945884720384615,
"learning_rate": 6.183719753316539e-05,
"loss": 0.0416,
"step": 59
},
{
"epoch": 0.7894736842105263,
"grad_norm": 0.006319586224258474,
"learning_rate": 6.105728651509424e-05,
"loss": 0.0415,
"step": 60
},
{
"epoch": 0.8026315789473685,
"grad_norm": 0.058532063238545366,
"learning_rate": 6.026613967055678e-05,
"loss": 0.0409,
"step": 61
},
{
"epoch": 0.8157894736842105,
"grad_norm": 0.006336527241640012,
"learning_rate": 5.946417914267425e-05,
"loss": 0.0382,
"step": 62
},
{
"epoch": 0.8289473684210527,
"grad_norm": 0.006470625016947808,
"learning_rate": 5.865183284457392e-05,
"loss": 0.0416,
"step": 63
},
{
"epoch": 0.8421052631578947,
"grad_norm": 0.00682983462425869,
"learning_rate": 5.782953423106154e-05,
"loss": 0.0437,
"step": 64
},
{
"epoch": 0.8552631578947368,
"grad_norm": 0.006033568829008143,
"learning_rate": 5.699772206733689e-05,
"loss": 0.0423,
"step": 65
},
{
"epoch": 0.868421052631579,
"grad_norm": 0.006263481762315256,
"learning_rate": 5.61568401948758e-05,
"loss": 0.0401,
"step": 66
},
{
"epoch": 0.881578947368421,
"grad_norm": 0.0074388468143108035,
"learning_rate": 5.5307337294603595e-05,
"loss": 0.0426,
"step": 67
},
{
"epoch": 0.8947368421052632,
"grad_norm": 0.006213659757931369,
"learning_rate": 5.444966664748613e-05,
"loss": 0.0412,
"step": 68
},
{
"epoch": 0.9078947368421053,
"grad_norm": 0.006380330465230061,
"learning_rate": 5.3584285892666454e-05,
"loss": 0.0428,
"step": 69
},
{
"epoch": 0.9210526315789473,
"grad_norm": 0.006207536923484298,
"learning_rate": 5.271165678327607e-05,
"loss": 0.04,
"step": 70
},
{
"epoch": 0.9342105263157895,
"grad_norm": 0.0066810038964758185,
"learning_rate": 5.183224494005083e-05,
"loss": 0.0435,
"step": 71
},
{
"epoch": 0.9473684210526315,
"grad_norm": 0.006226425147587245,
"learning_rate": 5.0946519602883326e-05,
"loss": 0.0401,
"step": 72
},
{
"epoch": 0.9605263157894737,
"grad_norm": 0.0059224557005355645,
"learning_rate": 5.0054953380444116e-05,
"loss": 0.0435,
"step": 73
},
{
"epoch": 0.9736842105263158,
"grad_norm": 0.007488520716144708,
"learning_rate": 4.9158021998005366e-05,
"loss": 0.0427,
"step": 74
},
{
"epoch": 0.9868421052631579,
"grad_norm": 0.006072226631282511,
"learning_rate": 4.825620404360159e-05,
"loss": 0.041,
"step": 75
},
{
"epoch": 1.0,
"grad_norm": 0.006569700378799453,
"learning_rate": 4.734998071266282e-05,
"loss": 0.0433,
"step": 76
},
{
"epoch": 1.013157894736842,
"grad_norm": 0.005838560277534237,
"learning_rate": 4.643983555125652e-05,
"loss": 0.0403,
"step": 77
},
{
"epoch": 1.0263157894736843,
"grad_norm": 0.005804466890798108,
"learning_rate": 4.552625419807529e-05,
"loss": 0.0414,
"step": 78
},
{
"epoch": 1.0394736842105263,
"grad_norm": 0.0066119101268658585,
"learning_rate": 4.460972412530791e-05,
"loss": 0.0414,
"step": 79
},
{
"epoch": 1.0526315789473684,
"grad_norm": 0.005556966301273331,
"learning_rate": 4.369073437853208e-05,
"loss": 0.0378,
"step": 80
},
{
"epoch": 1.0657894736842106,
"grad_norm": 0.006512517911844179,
"learning_rate": 4.276977531576767e-05,
"loss": 0.0388,
"step": 81
},
{
"epoch": 1.0789473684210527,
"grad_norm": 0.005833488831583047,
"learning_rate": 4.184733834582959e-05,
"loss": 0.039,
"step": 82
},
{
"epoch": 1.0921052631578947,
"grad_norm": 0.006216115200940607,
"learning_rate": 4.092391566612005e-05,
"loss": 0.0409,
"step": 83
},
{
"epoch": 1.1052631578947367,
"grad_norm": 0.00575317037203953,
"learning_rate": 4e-05,
"loss": 0.0409,
"step": 84
},
{
"epoch": 1.118421052631579,
"grad_norm": 0.006222832163571204,
"learning_rate": 3.9076084333879964e-05,
"loss": 0.0387,
"step": 85
},
{
"epoch": 1.131578947368421,
"grad_norm": 0.005647614055554481,
"learning_rate": 3.815266165417043e-05,
"loss": 0.0397,
"step": 86
},
{
"epoch": 1.1447368421052633,
"grad_norm": 0.006096737358226156,
"learning_rate": 3.723022468423234e-05,
"loss": 0.0409,
"step": 87
},
{
"epoch": 1.1578947368421053,
"grad_norm": 0.0054974626092716364,
"learning_rate": 3.630926562146792e-05,
"loss": 0.0366,
"step": 88
},
{
"epoch": 1.1710526315789473,
"grad_norm": 0.005754720995497907,
"learning_rate": 3.53902758746921e-05,
"loss": 0.038,
"step": 89
},
{
"epoch": 1.1842105263157894,
"grad_norm": 0.006362210531056341,
"learning_rate": 3.447374580192473e-05,
"loss": 0.0396,
"step": 90
},
{
"epoch": 1.1973684210526316,
"grad_norm": 0.006025762605068581,
"learning_rate": 3.356016444874348e-05,
"loss": 0.0388,
"step": 91
},
{
"epoch": 1.2105263157894737,
"grad_norm": 0.005994627251788469,
"learning_rate": 3.2650019287337184e-05,
"loss": 0.0402,
"step": 92
},
{
"epoch": 1.2236842105263157,
"grad_norm": 0.00620754951196749,
"learning_rate": 3.1743795956398425e-05,
"loss": 0.0407,
"step": 93
},
{
"epoch": 1.236842105263158,
"grad_norm": 0.006255192958964342,
"learning_rate": 3.084197800199465e-05,
"loss": 0.0372,
"step": 94
},
{
"epoch": 1.25,
"grad_norm": 0.006320719774606469,
"learning_rate": 2.9945046619555894e-05,
"loss": 0.0378,
"step": 95
},
{
"epoch": 1.263157894736842,
"grad_norm": 0.0060922447961943,
"learning_rate": 2.905348039711669e-05,
"loss": 0.0397,
"step": 96
},
{
"epoch": 1.2763157894736843,
"grad_norm": 0.006202374774637231,
"learning_rate": 2.816775505994919e-05,
"loss": 0.0375,
"step": 97
},
{
"epoch": 1.2894736842105263,
"grad_norm": 0.005867178023820865,
"learning_rate": 2.728834321672394e-05,
"loss": 0.0359,
"step": 98
},
{
"epoch": 1.3026315789473684,
"grad_norm": 0.005998890900515434,
"learning_rate": 2.6415714107333545e-05,
"loss": 0.0393,
"step": 99
},
{
"epoch": 1.3157894736842106,
"grad_norm": 0.005839373614340455,
"learning_rate": 2.5550333352513885e-05,
"loss": 0.0386,
"step": 100
},
{
"epoch": 1.3289473684210527,
"grad_norm": 0.0061143595917766535,
"learning_rate": 2.4692662705396412e-05,
"loss": 0.0376,
"step": 101
},
{
"epoch": 1.3421052631578947,
"grad_norm": 0.0065210416681655975,
"learning_rate": 2.3843159805124207e-05,
"loss": 0.0414,
"step": 102
},
{
"epoch": 1.3552631578947367,
"grad_norm": 0.006158647021865143,
"learning_rate": 2.300227793266314e-05,
"loss": 0.0394,
"step": 103
},
{
"epoch": 1.368421052631579,
"grad_norm": 0.005766499988676987,
"learning_rate": 2.2170465768938473e-05,
"loss": 0.0371,
"step": 104
},
{
"epoch": 1.381578947368421,
"grad_norm": 0.0059323690491137875,
"learning_rate": 2.1348167155426073e-05,
"loss": 0.0399,
"step": 105
},
{
"epoch": 1.3947368421052633,
"grad_norm": 0.006516041490698414,
"learning_rate": 2.0535820857325755e-05,
"loss": 0.0394,
"step": 106
},
{
"epoch": 1.4078947368421053,
"grad_norm": 0.006798971635981513,
"learning_rate": 1.973386032944323e-05,
"loss": 0.0389,
"step": 107
},
{
"epoch": 1.4210526315789473,
"grad_norm": 0.006304951283408136,
"learning_rate": 1.8942713484905762e-05,
"loss": 0.0382,
"step": 108
},
{
"epoch": 1.4342105263157894,
"grad_norm": 0.006305286850533848,
"learning_rate": 1.816280246683463e-05,
"loss": 0.0401,
"step": 109
},
{
"epoch": 1.4473684210526316,
"grad_norm": 0.0061112906748014935,
"learning_rate": 1.7394543423096327e-05,
"loss": 0.0402,
"step": 110
},
{
"epoch": 1.4605263157894737,
"grad_norm": 0.006103907371143296,
"learning_rate": 1.6638346284252946e-05,
"loss": 0.0387,
"step": 111
},
{
"epoch": 1.4736842105263157,
"grad_norm": 0.006214314821635323,
"learning_rate": 1.589461454482975e-05,
"loss": 0.0403,
"step": 112
},
{
"epoch": 1.486842105263158,
"grad_norm": 0.0059749757145963625,
"learning_rate": 1.5163745048017147e-05,
"loss": 0.0389,
"step": 113
},
{
"epoch": 1.5,
"grad_norm": 0.005237867395501871,
"learning_rate": 1.4446127773921559e-05,
"loss": 0.0369,
"step": 114
},
{
"epoch": 1.513157894736842,
"grad_norm": 0.00650681235281057,
"learning_rate": 1.37421456314783e-05,
"loss": 0.0377,
"step": 115
},
{
"epoch": 1.526315789473684,
"grad_norm": 0.006517220869170101,
"learning_rate": 1.3052174254137713e-05,
"loss": 0.0401,
"step": 116
},
{
"epoch": 1.5394736842105263,
"grad_norm": 0.006712118321082451,
"learning_rate": 1.2376581799433139e-05,
"loss": 0.0398,
"step": 117
},
{
"epoch": 1.5526315789473686,
"grad_norm": 0.006200500734559256,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.0375,
"step": 118
},
{
"epoch": 1.5657894736842106,
"grad_norm": 0.005732987430416324,
"learning_rate": 1.1069967733917215e-05,
"loss": 0.0391,
"step": 119
},
{
"epoch": 1.5789473684210527,
"grad_norm": 0.006484965464125168,
"learning_rate": 1.0439643311173642e-05,
"loss": 0.041,
"step": 120
},
{
"epoch": 1.5921052631578947,
"grad_norm": 0.006091177429475543,
"learning_rate": 9.825091815193443e-06,
"loss": 0.0396,
"step": 121
},
{
"epoch": 1.6052631578947367,
"grad_norm": 0.005897121800005114,
"learning_rate": 9.226641160684843e-06,
"loss": 0.038,
"step": 122
},
{
"epoch": 1.618421052631579,
"grad_norm": 0.006402643906597493,
"learning_rate": 8.644610671208263e-06,
"loss": 0.041,
"step": 123
},
{
"epoch": 1.631578947368421,
"grad_norm": 0.006350403783775521,
"learning_rate": 8.07931090879042e-06,
"loss": 0.0369,
"step": 124
},
{
"epoch": 1.6447368421052633,
"grad_norm": 0.006319766184613516,
"learning_rate": 7.531043508213552e-06,
"loss": 0.0407,
"step": 125
},
{
"epoch": 1.6578947368421053,
"grad_norm": 0.006377723302699078,
"learning_rate": 7.000101016067913e-06,
"loss": 0.0367,
"step": 126
},
{
"epoch": 1.6710526315789473,
"grad_norm": 0.006186351651696318,
"learning_rate": 6.4867667346536715e-06,
"loss": 0.0379,
"step": 127
},
{
"epoch": 1.6842105263157894,
"grad_norm": 0.006108718491588002,
"learning_rate": 5.991314570815441e-06,
"loss": 0.0393,
"step": 128
},
{
"epoch": 1.6973684210526314,
"grad_norm": 0.0057589729685953136,
"learning_rate": 5.514008889789977e-06,
"loss": 0.0368,
"step": 129
},
{
"epoch": 1.7105263157894737,
"grad_norm": 0.005754253166546721,
"learning_rate": 5.055104374145221e-06,
"loss": 0.0352,
"step": 130
},
{
"epoch": 1.723684210526316,
"grad_norm": 0.006859144229533852,
"learning_rate": 4.614845887885793e-06,
"loss": 0.0393,
"step": 131
},
{
"epoch": 1.736842105263158,
"grad_norm": 0.007047040409450739,
"learning_rate": 4.193468345797511e-06,
"loss": 0.0402,
"step": 132
},
{
"epoch": 1.75,
"grad_norm": 0.006382411362923948,
"learning_rate": 3.791196588100716e-06,
"loss": 0.038,
"step": 133
},
{
"epoch": 1.763157894736842,
"grad_norm": 0.006315946771327425,
"learning_rate": 3.4082452604791594e-06,
"loss": 0.0365,
"step": 134
},
{
"epoch": 1.776315789473684,
"grad_norm": 0.006017388517146248,
"learning_rate": 3.0448186995485307e-06,
"loss": 0.0368,
"step": 135
},
{
"epoch": 1.7894736842105263,
"grad_norm": 0.006327757401819577,
"learning_rate": 2.7011108238257723e-06,
"loss": 0.0363,
"step": 136
},
{
"epoch": 1.8026315789473686,
"grad_norm": 0.006203858870102881,
"learning_rate": 2.3773050302572955e-06,
"loss": 0.0393,
"step": 137
},
{
"epoch": 1.8157894736842106,
"grad_norm": 0.005922591453193032,
"learning_rate": 2.073574096361366e-06,
"loss": 0.0373,
"step": 138
},
{
"epoch": 1.8289473684210527,
"grad_norm": 0.005850429039456173,
"learning_rate": 1.7900800880368008e-06,
"loss": 0.0348,
"step": 139
},
{
"epoch": 1.8421052631578947,
"grad_norm": 0.006355905348540479,
"learning_rate": 1.5269742730872384e-06,
"loss": 0.0399,
"step": 140
},
{
"epoch": 1.8552631578947367,
"grad_norm": 0.006160454658469522,
"learning_rate": 1.2843970405070994e-06,
"loss": 0.0376,
"step": 141
},
{
"epoch": 1.868421052631579,
"grad_norm": 0.006410484683214994,
"learning_rate": 1.0624778255722857e-06,
"loss": 0.0377,
"step": 142
},
{
"epoch": 1.881578947368421,
"grad_norm": 0.009474571629909624,
"learning_rate": 8.613350407756438e-07,
"loss": 0.0395,
"step": 143
},
{
"epoch": 1.8947368421052633,
"grad_norm": 0.00641904068629697,
"learning_rate": 6.810760126439287e-07,
"loss": 0.0385,
"step": 144
},
{
"epoch": 1.9078947368421053,
"grad_norm": 0.0061467775277458895,
"learning_rate": 5.217969244701771e-07,
"loss": 0.0371,
"step": 145
},
{
"epoch": 1.9210526315789473,
"grad_norm": 0.0065777183398005095,
"learning_rate": 3.835827649917967e-07,
"loss": 0.0395,
"step": 146
},
{
"epoch": 1.9342105263157894,
"grad_norm": 0.00622464792774119,
"learning_rate": 2.6650728304200125e-07,
"loss": 0.0388,
"step": 147
},
{
"epoch": 1.9473684210526314,
"grad_norm": 0.00658284263741108,
"learning_rate": 1.706329481986213e-07,
"loss": 0.0394,
"step": 148
},
{
"epoch": 1.9605263157894737,
"grad_norm": 0.006984312886553246,
"learning_rate": 9.601091745132352e-08,
"loss": 0.041,
"step": 149
},
{
"epoch": 1.973684210526316,
"grad_norm": 0.006342838371429401,
"learning_rate": 4.268100790510321e-08,
"loss": 0.0379,
"step": 150
},
{
"epoch": 1.986842105263158,
"grad_norm": 0.006421376568210238,
"learning_rate": 1.0671675534510429e-08,
"loss": 0.0394,
"step": 151
},
{
"epoch": 2.0,
"grad_norm": 0.006358948228182317,
"learning_rate": 0.0,
"loss": 0.0371,
"step": 152
},
{
"epoch": 2.0,
"step": 152,
"total_flos": 113543336886272.0,
"train_loss": 0.060337725389552746,
"train_runtime": 1259.4637,
"train_samples_per_second": 0.965,
"train_steps_per_second": 0.121
}
],
"logging_steps": 1,
"max_steps": 152,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 113543336886272.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}