{
"best_metric": 1.0219863653182983,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 1.4205607476635513,
"eval_steps": 25,
"global_step": 190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007476635514018692,
"grad_norm": 53.585628509521484,
"learning_rate": 1.1111111111111112e-05,
"loss": 5.7506,
"step": 1
},
{
"epoch": 0.007476635514018692,
"eval_loss": 6.5497517585754395,
"eval_runtime": 1.7647,
"eval_samples_per_second": 28.333,
"eval_steps_per_second": 3.967,
"step": 1
},
{
"epoch": 0.014953271028037384,
"grad_norm": 51.50555419921875,
"learning_rate": 2.2222222222222223e-05,
"loss": 6.2904,
"step": 2
},
{
"epoch": 0.022429906542056073,
"grad_norm": 35.29224395751953,
"learning_rate": 3.3333333333333335e-05,
"loss": 6.3377,
"step": 3
},
{
"epoch": 0.029906542056074768,
"grad_norm": 28.75082778930664,
"learning_rate": 4.4444444444444447e-05,
"loss": 6.1761,
"step": 4
},
{
"epoch": 0.037383177570093455,
"grad_norm": 21.41317367553711,
"learning_rate": 5.555555555555556e-05,
"loss": 5.9294,
"step": 5
},
{
"epoch": 0.044859813084112146,
"grad_norm": 17.97064971923828,
"learning_rate": 6.666666666666667e-05,
"loss": 5.5866,
"step": 6
},
{
"epoch": 0.052336448598130844,
"grad_norm": 10.495800018310547,
"learning_rate": 7.777777777777778e-05,
"loss": 5.4007,
"step": 7
},
{
"epoch": 0.059813084112149535,
"grad_norm": 9.666411399841309,
"learning_rate": 8.888888888888889e-05,
"loss": 4.8408,
"step": 8
},
{
"epoch": 0.06728971962616823,
"grad_norm": 9.070466041564941,
"learning_rate": 0.0001,
"loss": 4.3133,
"step": 9
},
{
"epoch": 0.07476635514018691,
"grad_norm": 8.968005180358887,
"learning_rate": 9.999322180262823e-05,
"loss": 3.9532,
"step": 10
},
{
"epoch": 0.08224299065420561,
"grad_norm": 8.822087287902832,
"learning_rate": 9.997288925246668e-05,
"loss": 3.5377,
"step": 11
},
{
"epoch": 0.08971962616822429,
"grad_norm": 6.6545939445495605,
"learning_rate": 9.993900847476147e-05,
"loss": 3.1288,
"step": 12
},
{
"epoch": 0.09719626168224299,
"grad_norm": 6.504652500152588,
"learning_rate": 9.989158967620588e-05,
"loss": 2.662,
"step": 13
},
{
"epoch": 0.10467289719626169,
"grad_norm": 6.791845321655273,
"learning_rate": 9.983064714186548e-05,
"loss": 2.686,
"step": 14
},
{
"epoch": 0.11214953271028037,
"grad_norm": 6.590876579284668,
"learning_rate": 9.975619923087478e-05,
"loss": 2.4704,
"step": 15
},
{
"epoch": 0.11962616822429907,
"grad_norm": 4.255295753479004,
"learning_rate": 9.966826837090643e-05,
"loss": 2.2264,
"step": 16
},
{
"epoch": 0.12710280373831775,
"grad_norm": 4.624362468719482,
"learning_rate": 9.956688105141482e-05,
"loss": 2.1681,
"step": 17
},
{
"epoch": 0.13457943925233645,
"grad_norm": 3.850867986679077,
"learning_rate": 9.945206781565605e-05,
"loss": 2.1567,
"step": 18
},
{
"epoch": 0.14205607476635515,
"grad_norm": 5.03740119934082,
"learning_rate": 9.932386325148672e-05,
"loss": 1.9059,
"step": 19
},
{
"epoch": 0.14953271028037382,
"grad_norm": 7.313295364379883,
"learning_rate": 9.918230598094414e-05,
"loss": 1.849,
"step": 20
},
{
"epoch": 0.15700934579439252,
"grad_norm": 5.069641590118408,
"learning_rate": 9.902743864861138e-05,
"loss": 1.7289,
"step": 21
},
{
"epoch": 0.16448598130841122,
"grad_norm": 2.52390718460083,
"learning_rate": 9.885930790877044e-05,
"loss": 1.5924,
"step": 22
},
{
"epoch": 0.17196261682242991,
"grad_norm": 2.989380359649658,
"learning_rate": 9.867796441134754e-05,
"loss": 1.5416,
"step": 23
},
{
"epoch": 0.17943925233644858,
"grad_norm": 3.9914767742156982,
"learning_rate": 9.84834627866545e-05,
"loss": 1.558,
"step": 24
},
{
"epoch": 0.18691588785046728,
"grad_norm": 2.8886358737945557,
"learning_rate": 9.82758616289314e-05,
"loss": 1.526,
"step": 25
},
{
"epoch": 0.18691588785046728,
"eval_loss": 1.497947335243225,
"eval_runtime": 1.7334,
"eval_samples_per_second": 28.846,
"eval_steps_per_second": 4.038,
"step": 25
},
{
"epoch": 0.19439252336448598,
"grad_norm": 2.935326337814331,
"learning_rate": 9.805522347869479e-05,
"loss": 1.4824,
"step": 26
},
{
"epoch": 0.20186915887850468,
"grad_norm": 3.524118185043335,
"learning_rate": 9.78216148038971e-05,
"loss": 1.4853,
"step": 27
},
{
"epoch": 0.20934579439252338,
"grad_norm": 2.3418080806732178,
"learning_rate": 9.757510597990301e-05,
"loss": 1.3454,
"step": 28
},
{
"epoch": 0.21682242990654205,
"grad_norm": 2.0094268321990967,
"learning_rate": 9.731577126828865e-05,
"loss": 1.4325,
"step": 29
},
{
"epoch": 0.22429906542056074,
"grad_norm": 2.4824633598327637,
"learning_rate": 9.704368879447005e-05,
"loss": 1.4706,
"step": 30
},
{
"epoch": 0.23177570093457944,
"grad_norm": 2.506647825241089,
"learning_rate": 9.675894052416765e-05,
"loss": 1.7467,
"step": 31
},
{
"epoch": 0.23925233644859814,
"grad_norm": 2.892395257949829,
"learning_rate": 9.64616122387137e-05,
"loss": 1.6818,
"step": 32
},
{
"epoch": 0.2467289719626168,
"grad_norm": 3.7638962268829346,
"learning_rate": 9.615179350921063e-05,
"loss": 2.4594,
"step": 33
},
{
"epoch": 0.2542056074766355,
"grad_norm": 2.2599539756774902,
"learning_rate": 9.58295776695472e-05,
"loss": 1.7344,
"step": 34
},
{
"epoch": 0.2616822429906542,
"grad_norm": 2.3030145168304443,
"learning_rate": 9.549506178828152e-05,
"loss": 1.3113,
"step": 35
},
{
"epoch": 0.2691588785046729,
"grad_norm": 1.872995138168335,
"learning_rate": 9.514834663939882e-05,
"loss": 1.1947,
"step": 36
},
{
"epoch": 0.2766355140186916,
"grad_norm": 2.171834945678711,
"learning_rate": 9.478953667195292e-05,
"loss": 1.3481,
"step": 37
},
{
"epoch": 0.2841121495327103,
"grad_norm": 1.6435407400131226,
"learning_rate": 9.441873997860061e-05,
"loss": 1.1591,
"step": 38
},
{
"epoch": 0.29158878504672897,
"grad_norm": 1.8728747367858887,
"learning_rate": 9.403606826303847e-05,
"loss": 1.166,
"step": 39
},
{
"epoch": 0.29906542056074764,
"grad_norm": 1.5220937728881836,
"learning_rate": 9.364163680635166e-05,
"loss": 1.1966,
"step": 40
},
{
"epoch": 0.30654205607476637,
"grad_norm": 1.9457396268844604,
"learning_rate": 9.323556443228521e-05,
"loss": 1.115,
"step": 41
},
{
"epoch": 0.31401869158878504,
"grad_norm": 1.6101148128509521,
"learning_rate": 9.281797347144796e-05,
"loss": 1.1116,
"step": 42
},
{
"epoch": 0.32149532710280376,
"grad_norm": 1.4722403287887573,
"learning_rate": 9.238898972446005e-05,
"loss": 1.1254,
"step": 43
},
{
"epoch": 0.32897196261682243,
"grad_norm": 1.5298360586166382,
"learning_rate": 9.194874242405508e-05,
"loss": 1.0772,
"step": 44
},
{
"epoch": 0.3364485981308411,
"grad_norm": 1.4249259233474731,
"learning_rate": 9.149736419614837e-05,
"loss": 1.0679,
"step": 45
},
{
"epoch": 0.34392523364485983,
"grad_norm": 1.617456316947937,
"learning_rate": 9.103499101988296e-05,
"loss": 1.1213,
"step": 46
},
{
"epoch": 0.3514018691588785,
"grad_norm": 1.6541962623596191,
"learning_rate": 9.056176218666543e-05,
"loss": 1.1241,
"step": 47
},
{
"epoch": 0.35887850467289717,
"grad_norm": 1.584012508392334,
"learning_rate": 9.007782025820393e-05,
"loss": 1.0767,
"step": 48
},
{
"epoch": 0.3663551401869159,
"grad_norm": 1.4249681234359741,
"learning_rate": 8.958331102356102e-05,
"loss": 1.1389,
"step": 49
},
{
"epoch": 0.37383177570093457,
"grad_norm": 1.602219581604004,
"learning_rate": 8.907838345523424e-05,
"loss": 1.2053,
"step": 50
},
{
"epoch": 0.37383177570093457,
"eval_loss": 1.1656819581985474,
"eval_runtime": 1.8134,
"eval_samples_per_second": 27.573,
"eval_steps_per_second": 3.86,
"step": 50
},
{
"epoch": 0.3813084112149533,
"grad_norm": 1.888527274131775,
"learning_rate": 8.856318966427766e-05,
"loss": 1.3811,
"step": 51
},
{
"epoch": 0.38878504672897196,
"grad_norm": 1.4429353475570679,
"learning_rate": 8.803788485447791e-05,
"loss": 1.1332,
"step": 52
},
{
"epoch": 0.39626168224299063,
"grad_norm": 1.5478978157043457,
"learning_rate": 8.750262727559867e-05,
"loss": 0.932,
"step": 53
},
{
"epoch": 0.40373831775700936,
"grad_norm": 1.5630110502243042,
"learning_rate": 8.695757817570717e-05,
"loss": 1.1636,
"step": 54
},
{
"epoch": 0.411214953271028,
"grad_norm": 1.6892582178115845,
"learning_rate": 8.640290175259794e-05,
"loss": 1.0162,
"step": 55
},
{
"epoch": 0.41869158878504675,
"grad_norm": 1.607128620147705,
"learning_rate": 8.58387651043276e-05,
"loss": 1.1545,
"step": 56
},
{
"epoch": 0.4261682242990654,
"grad_norm": 1.8287376165390015,
"learning_rate": 8.526533817887597e-05,
"loss": 1.1185,
"step": 57
},
{
"epoch": 0.4336448598130841,
"grad_norm": 1.4400948286056519,
"learning_rate": 8.468279372294879e-05,
"loss": 0.8956,
"step": 58
},
{
"epoch": 0.4411214953271028,
"grad_norm": 1.4424753189086914,
"learning_rate": 8.409130722993716e-05,
"loss": 0.9624,
"step": 59
},
{
"epoch": 0.4485981308411215,
"grad_norm": 1.5252015590667725,
"learning_rate": 8.349105688704965e-05,
"loss": 1.0351,
"step": 60
},
{
"epoch": 0.45607476635514016,
"grad_norm": 2.396437406539917,
"learning_rate": 8.28822235216328e-05,
"loss": 1.3683,
"step": 61
},
{
"epoch": 0.4635514018691589,
"grad_norm": 1.8390008211135864,
"learning_rate": 8.22649905466962e-05,
"loss": 1.4461,
"step": 62
},
{
"epoch": 0.47102803738317756,
"grad_norm": 1.6097524166107178,
"learning_rate": 8.163954390565895e-05,
"loss": 1.2682,
"step": 63
},
{
"epoch": 0.4785046728971963,
"grad_norm": 1.5528298616409302,
"learning_rate": 8.100607201633341e-05,
"loss": 1.3143,
"step": 64
},
{
"epoch": 0.48598130841121495,
"grad_norm": 2.06070876121521,
"learning_rate": 8.03647657141638e-05,
"loss": 1.5265,
"step": 65
},
{
"epoch": 0.4934579439252336,
"grad_norm": 3.6317155361175537,
"learning_rate": 7.971581819473646e-05,
"loss": 2.6631,
"step": 66
},
{
"epoch": 0.5009345794392523,
"grad_norm": 1.9490833282470703,
"learning_rate": 7.905942495557893e-05,
"loss": 1.5979,
"step": 67
},
{
"epoch": 0.508411214953271,
"grad_norm": 1.641648769378662,
"learning_rate": 7.839578373726587e-05,
"loss": 1.0137,
"step": 68
},
{
"epoch": 0.5158878504672897,
"grad_norm": 1.5076510906219482,
"learning_rate": 7.772509446384883e-05,
"loss": 1.0979,
"step": 69
},
{
"epoch": 0.5233644859813084,
"grad_norm": 1.3256382942199707,
"learning_rate": 7.704755918262877e-05,
"loss": 0.974,
"step": 70
},
{
"epoch": 0.5308411214953271,
"grad_norm": 1.267953872680664,
"learning_rate": 7.636338200328847e-05,
"loss": 0.935,
"step": 71
},
{
"epoch": 0.5383177570093458,
"grad_norm": 1.5867618322372437,
"learning_rate": 7.567276903640388e-05,
"loss": 1.0266,
"step": 72
},
{
"epoch": 0.5457943925233645,
"grad_norm": 2.156276226043701,
"learning_rate": 7.49759283313526e-05,
"loss": 0.9141,
"step": 73
},
{
"epoch": 0.5532710280373832,
"grad_norm": 1.622321605682373,
"learning_rate": 7.427306981363847e-05,
"loss": 0.8034,
"step": 74
},
{
"epoch": 0.5607476635514018,
"grad_norm": 1.727579951286316,
"learning_rate": 7.356440522165072e-05,
"loss": 0.8998,
"step": 75
},
{
"epoch": 0.5607476635514018,
"eval_loss": 1.0979422330856323,
"eval_runtime": 1.7942,
"eval_samples_per_second": 27.867,
"eval_steps_per_second": 3.901,
"step": 75
},
{
"epoch": 0.5682242990654206,
"grad_norm": 1.4198564291000366,
"learning_rate": 7.28501480428771e-05,
"loss": 0.9308,
"step": 76
},
{
"epoch": 0.5757009345794393,
"grad_norm": 1.5244777202606201,
"learning_rate": 7.213051344959015e-05,
"loss": 0.8892,
"step": 77
},
{
"epoch": 0.5831775700934579,
"grad_norm": 1.2492246627807617,
"learning_rate": 7.140571823402581e-05,
"loss": 0.9154,
"step": 78
},
{
"epoch": 0.5906542056074766,
"grad_norm": 1.314004898071289,
"learning_rate": 7.06759807430741e-05,
"loss": 0.8424,
"step": 79
},
{
"epoch": 0.5981308411214953,
"grad_norm": 1.3923033475875854,
"learning_rate": 6.994152081250139e-05,
"loss": 0.9523,
"step": 80
},
{
"epoch": 0.6056074766355141,
"grad_norm": 1.9231942892074585,
"learning_rate": 6.920255970072414e-05,
"loss": 0.9735,
"step": 81
},
{
"epoch": 0.6130841121495327,
"grad_norm": 1.9869779348373413,
"learning_rate": 6.845932002215419e-05,
"loss": 0.9535,
"step": 82
},
{
"epoch": 0.6205607476635514,
"grad_norm": 1.4400379657745361,
"learning_rate": 6.771202568013538e-05,
"loss": 0.8512,
"step": 83
},
{
"epoch": 0.6280373831775701,
"grad_norm": 1.6403017044067383,
"learning_rate": 6.696090179949188e-05,
"loss": 1.0716,
"step": 84
},
{
"epoch": 0.6355140186915887,
"grad_norm": 1.586094856262207,
"learning_rate": 6.620617465870877e-05,
"loss": 0.9685,
"step": 85
},
{
"epoch": 0.6429906542056075,
"grad_norm": 1.352432131767273,
"learning_rate": 6.544807162176478e-05,
"loss": 1.0035,
"step": 86
},
{
"epoch": 0.6504672897196262,
"grad_norm": 1.3724721670150757,
"learning_rate": 6.468682106963829e-05,
"loss": 1.1021,
"step": 87
},
{
"epoch": 0.6579439252336449,
"grad_norm": 1.259173035621643,
"learning_rate": 6.39226523315067e-05,
"loss": 0.8772,
"step": 88
},
{
"epoch": 0.6654205607476635,
"grad_norm": 1.35273277759552,
"learning_rate": 6.315579561566031e-05,
"loss": 1.024,
"step": 89
},
{
"epoch": 0.6728971962616822,
"grad_norm": 1.3475384712219238,
"learning_rate": 6.238648194015137e-05,
"loss": 1.0481,
"step": 90
},
{
"epoch": 0.680373831775701,
"grad_norm": 1.4782145023345947,
"learning_rate": 6.16149430631992e-05,
"loss": 0.9756,
"step": 91
},
{
"epoch": 0.6878504672897197,
"grad_norm": 1.314041018486023,
"learning_rate": 6.084141141337213e-05,
"loss": 1.113,
"step": 92
},
{
"epoch": 0.6953271028037383,
"grad_norm": 1.2568014860153198,
"learning_rate": 6.006612001956774e-05,
"loss": 0.9907,
"step": 93
},
{
"epoch": 0.702803738317757,
"grad_norm": 1.3871257305145264,
"learning_rate": 5.928930244081214e-05,
"loss": 1.2216,
"step": 94
},
{
"epoch": 0.7102803738317757,
"grad_norm": 1.4609795808792114,
"learning_rate": 5.851119269589963e-05,
"loss": 0.9784,
"step": 95
},
{
"epoch": 0.7177570093457943,
"grad_norm": 1.5408998727798462,
"learning_rate": 5.773202519289364e-05,
"loss": 0.9439,
"step": 96
},
{
"epoch": 0.7252336448598131,
"grad_norm": 1.5144058465957642,
"learning_rate": 5.695203465851068e-05,
"loss": 1.2598,
"step": 97
},
{
"epoch": 0.7327102803738318,
"grad_norm": 1.4548237323760986,
"learning_rate": 5.617145606740804e-05,
"loss": 1.2112,
"step": 98
},
{
"epoch": 0.7401869158878505,
"grad_norm": 2.4008612632751465,
"learning_rate": 5.5390524571397106e-05,
"loss": 2.154,
"step": 99
},
{
"epoch": 0.7476635514018691,
"grad_norm": 2.0451629161834717,
"learning_rate": 5.46094754286029e-05,
"loss": 1.3008,
"step": 100
},
{
"epoch": 0.7476635514018691,
"eval_loss": 1.081913709640503,
"eval_runtime": 1.7949,
"eval_samples_per_second": 27.857,
"eval_steps_per_second": 3.9,
"step": 100
},
{
"epoch": 0.7551401869158878,
"grad_norm": 1.8308786153793335,
"learning_rate": 5.382854393259197e-05,
"loss": 0.8989,
"step": 101
},
{
"epoch": 0.7626168224299066,
"grad_norm": 1.6299549341201782,
"learning_rate": 5.3047965341489344e-05,
"loss": 1.0614,
"step": 102
},
{
"epoch": 0.7700934579439253,
"grad_norm": 1.342459797859192,
"learning_rate": 5.226797480710638e-05,
"loss": 1.0972,
"step": 103
},
{
"epoch": 0.7775700934579439,
"grad_norm": 1.3016060590744019,
"learning_rate": 5.1488807304100386e-05,
"loss": 0.9202,
"step": 104
},
{
"epoch": 0.7850467289719626,
"grad_norm": 1.2006447315216064,
"learning_rate": 5.071069755918787e-05,
"loss": 0.9442,
"step": 105
},
{
"epoch": 0.7925233644859813,
"grad_norm": 1.2381112575531006,
"learning_rate": 4.9933879980432284e-05,
"loss": 0.8637,
"step": 106
},
{
"epoch": 0.8,
"grad_norm": 1.3466088771820068,
"learning_rate": 4.9158588586627895e-05,
"loss": 0.9609,
"step": 107
},
{
"epoch": 0.8074766355140187,
"grad_norm": 1.3996555805206299,
"learning_rate": 4.8385056936800786e-05,
"loss": 0.971,
"step": 108
},
{
"epoch": 0.8149532710280374,
"grad_norm": 1.1782788038253784,
"learning_rate": 4.7613518059848614e-05,
"loss": 0.9633,
"step": 109
},
{
"epoch": 0.822429906542056,
"grad_norm": 1.092534065246582,
"learning_rate": 4.684420438433971e-05,
"loss": 0.7657,
"step": 110
},
{
"epoch": 0.8299065420560747,
"grad_norm": 1.141908049583435,
"learning_rate": 4.607734766849332e-05,
"loss": 0.9123,
"step": 111
},
{
"epoch": 0.8373831775700935,
"grad_norm": 1.3389331102371216,
"learning_rate": 4.531317893036172e-05,
"loss": 0.9046,
"step": 112
},
{
"epoch": 0.8448598130841122,
"grad_norm": 1.2002415657043457,
"learning_rate": 4.455192837823523e-05,
"loss": 0.8237,
"step": 113
},
{
"epoch": 0.8523364485981308,
"grad_norm": 1.2427459955215454,
"learning_rate": 4.379382534129125e-05,
"loss": 0.9271,
"step": 114
},
{
"epoch": 0.8598130841121495,
"grad_norm": 1.2061138153076172,
"learning_rate": 4.303909820050814e-05,
"loss": 0.9193,
"step": 115
},
{
"epoch": 0.8672897196261682,
"grad_norm": 1.336917757987976,
"learning_rate": 4.228797431986463e-05,
"loss": 0.8825,
"step": 116
},
{
"epoch": 0.874766355140187,
"grad_norm": 1.0712493658065796,
"learning_rate": 4.154067997784581e-05,
"loss": 0.8476,
"step": 117
},
{
"epoch": 0.8822429906542056,
"grad_norm": 1.3190851211547852,
"learning_rate": 4.079744029927587e-05,
"loss": 0.977,
"step": 118
},
{
"epoch": 0.8897196261682243,
"grad_norm": 1.257728934288025,
"learning_rate": 4.005847918749863e-05,
"loss": 0.8868,
"step": 119
},
{
"epoch": 0.897196261682243,
"grad_norm": 1.2771947383880615,
"learning_rate": 3.932401925692591e-05,
"loss": 0.948,
"step": 120
},
{
"epoch": 0.9046728971962616,
"grad_norm": 1.36766517162323,
"learning_rate": 3.8594281765974204e-05,
"loss": 0.9731,
"step": 121
},
{
"epoch": 0.9121495327102803,
"grad_norm": 1.3747360706329346,
"learning_rate": 3.786948655040987e-05,
"loss": 0.9178,
"step": 122
},
{
"epoch": 0.9196261682242991,
"grad_norm": 1.2876604795455933,
"learning_rate": 3.714985195712292e-05,
"loss": 1.0227,
"step": 123
},
{
"epoch": 0.9271028037383178,
"grad_norm": 1.2210800647735596,
"learning_rate": 3.643559477834928e-05,
"loss": 0.9963,
"step": 124
},
{
"epoch": 0.9345794392523364,
"grad_norm": 1.3463104963302612,
"learning_rate": 3.572693018636152e-05,
"loss": 0.9492,
"step": 125
},
{
"epoch": 0.9345794392523364,
"eval_loss": 1.0178673267364502,
"eval_runtime": 1.8163,
"eval_samples_per_second": 27.528,
"eval_steps_per_second": 3.854,
"step": 125
},
{
"epoch": 0.9420560747663551,
"grad_norm": 1.2226606607437134,
"learning_rate": 3.5024071668647405e-05,
"loss": 0.9361,
"step": 126
},
{
"epoch": 0.9495327102803738,
"grad_norm": 1.3024946451187134,
"learning_rate": 3.432723096359614e-05,
"loss": 1.0504,
"step": 127
},
{
"epoch": 0.9570093457943926,
"grad_norm": 1.3780845403671265,
"learning_rate": 3.363661799671154e-05,
"loss": 1.0951,
"step": 128
},
{
"epoch": 0.9644859813084112,
"grad_norm": 1.4091452360153198,
"learning_rate": 3.2952440817371225e-05,
"loss": 0.934,
"step": 129
},
{
"epoch": 0.9719626168224299,
"grad_norm": 1.4428718090057373,
"learning_rate": 3.2274905536151187e-05,
"loss": 1.1048,
"step": 130
},
{
"epoch": 0.9794392523364486,
"grad_norm": 1.5878864526748657,
"learning_rate": 3.160421626273415e-05,
"loss": 1.2798,
"step": 131
},
{
"epoch": 0.9869158878504672,
"grad_norm": 3.471273899078369,
"learning_rate": 3.094057504442107e-05,
"loss": 2.45,
"step": 132
},
{
"epoch": 0.994392523364486,
"grad_norm": 1.5296839475631714,
"learning_rate": 3.0284181805263556e-05,
"loss": 1.1224,
"step": 133
},
{
"epoch": 1.0018691588785047,
"grad_norm": 1.9310905933380127,
"learning_rate": 2.963523428583621e-05,
"loss": 1.7417,
"step": 134
},
{
"epoch": 1.0093457943925233,
"grad_norm": 1.5308727025985718,
"learning_rate": 2.8993927983666613e-05,
"loss": 1.0866,
"step": 135
},
{
"epoch": 1.016822429906542,
"grad_norm": 1.4650174379348755,
"learning_rate": 2.836045609434107e-05,
"loss": 0.769,
"step": 136
},
{
"epoch": 1.0242990654205608,
"grad_norm": 1.3912580013275146,
"learning_rate": 2.7735009453303806e-05,
"loss": 0.9367,
"step": 137
},
{
"epoch": 1.0317757009345794,
"grad_norm": 1.2151148319244385,
"learning_rate": 2.7117776478367228e-05,
"loss": 0.8128,
"step": 138
},
{
"epoch": 1.0392523364485982,
"grad_norm": 1.2486802339553833,
"learning_rate": 2.650894311295034e-05,
"loss": 0.8115,
"step": 139
},
{
"epoch": 1.0467289719626167,
"grad_norm": 1.2411141395568848,
"learning_rate": 2.5908692770062843e-05,
"loss": 0.9227,
"step": 140
},
{
"epoch": 1.0542056074766355,
"grad_norm": 1.0788848400115967,
"learning_rate": 2.531720627705123e-05,
"loss": 0.7906,
"step": 141
},
{
"epoch": 1.0616822429906543,
"grad_norm": 1.148887276649475,
"learning_rate": 2.4734661821124045e-05,
"loss": 0.6868,
"step": 142
},
{
"epoch": 1.0691588785046728,
"grad_norm": 1.1361629962921143,
"learning_rate": 2.4161234895672416e-05,
"loss": 0.8188,
"step": 143
},
{
"epoch": 1.0766355140186916,
"grad_norm": 1.2399373054504395,
"learning_rate": 2.359709824740207e-05,
"loss": 0.7538,
"step": 144
},
{
"epoch": 1.0841121495327102,
"grad_norm": 1.1857571601867676,
"learning_rate": 2.3042421824292836e-05,
"loss": 0.7866,
"step": 145
},
{
"epoch": 1.091588785046729,
"grad_norm": 1.3761775493621826,
"learning_rate": 2.249737272440135e-05,
"loss": 0.803,
"step": 146
},
{
"epoch": 1.0990654205607477,
"grad_norm": 1.153091549873352,
"learning_rate": 2.196211514552208e-05,
"loss": 0.7357,
"step": 147
},
{
"epoch": 1.1065420560747663,
"grad_norm": 1.0790941715240479,
"learning_rate": 2.1436810335722354e-05,
"loss": 0.766,
"step": 148
},
{
"epoch": 1.114018691588785,
"grad_norm": 1.2671778202056885,
"learning_rate": 2.092161654476577e-05,
"loss": 0.8744,
"step": 149
},
{
"epoch": 1.1214953271028036,
"grad_norm": 1.2150834798812866,
"learning_rate": 2.0416688976438993e-05,
"loss": 0.8142,
"step": 150
},
{
"epoch": 1.1214953271028036,
"eval_loss": 1.0219863653182983,
"eval_runtime": 1.7535,
"eval_samples_per_second": 28.515,
"eval_steps_per_second": 3.992,
"step": 150
},
{
"epoch": 1.1289719626168224,
"grad_norm": 1.3583935499191284,
"learning_rate": 1.9922179741796086e-05,
"loss": 0.8198,
"step": 151
},
{
"epoch": 1.1364485981308412,
"grad_norm": 1.284717321395874,
"learning_rate": 1.9438237813334586e-05,
"loss": 0.7878,
"step": 152
},
{
"epoch": 1.1439252336448598,
"grad_norm": 1.3621183633804321,
"learning_rate": 1.8965008980117037e-05,
"loss": 0.9232,
"step": 153
},
{
"epoch": 1.1514018691588785,
"grad_norm": 1.6327229738235474,
"learning_rate": 1.850263580385163e-05,
"loss": 1.0294,
"step": 154
},
{
"epoch": 1.158878504672897,
"grad_norm": 1.386734127998352,
"learning_rate": 1.8051257575944925e-05,
"loss": 0.8834,
"step": 155
},
{
"epoch": 1.1663551401869159,
"grad_norm": 1.375533938407898,
"learning_rate": 1.7611010275539962e-05,
"loss": 0.9483,
"step": 156
},
{
"epoch": 1.1738317757009347,
"grad_norm": 1.242241382598877,
"learning_rate": 1.718202652855205e-05,
"loss": 0.8194,
"step": 157
},
{
"epoch": 1.1813084112149532,
"grad_norm": 1.247066617012024,
"learning_rate": 1.6764435567714794e-05,
"loss": 0.8326,
"step": 158
},
{
"epoch": 1.188785046728972,
"grad_norm": 1.4270540475845337,
"learning_rate": 1.6358363193648352e-05,
"loss": 0.8584,
"step": 159
},
{
"epoch": 1.1962616822429906,
"grad_norm": 1.225496768951416,
"learning_rate": 1.5963931736961547e-05,
"loss": 0.8475,
"step": 160
},
{
"epoch": 1.2037383177570093,
"grad_norm": 1.2226568460464478,
"learning_rate": 1.5581260021399396e-05,
"loss": 0.828,
"step": 161
},
{
"epoch": 1.2112149532710281,
"grad_norm": 1.387080192565918,
"learning_rate": 1.5210463328047095e-05,
"loss": 0.8902,
"step": 162
},
{
"epoch": 1.2186915887850467,
"grad_norm": 1.2809566259384155,
"learning_rate": 1.4851653360601179e-05,
"loss": 0.9188,
"step": 163
},
{
"epoch": 1.2261682242990655,
"grad_norm": 1.4872632026672363,
"learning_rate": 1.4504938211718489e-05,
"loss": 1.1853,
"step": 164
},
{
"epoch": 1.233644859813084,
"grad_norm": 1.3789664506912231,
"learning_rate": 1.4170422330452816e-05,
"loss": 0.9331,
"step": 165
},
{
"epoch": 1.2411214953271028,
"grad_norm": 1.7620553970336914,
"learning_rate": 1.384820649078939e-05,
"loss": 1.3851,
"step": 166
},
{
"epoch": 1.2485981308411216,
"grad_norm": 2.51485013961792,
"learning_rate": 1.3538387761286303e-05,
"loss": 1.7585,
"step": 167
},
{
"epoch": 1.2560747663551401,
"grad_norm": 1.4379189014434814,
"learning_rate": 1.3241059475832373e-05,
"loss": 0.9246,
"step": 168
},
{
"epoch": 1.263551401869159,
"grad_norm": 1.3618923425674438,
"learning_rate": 1.2956311205529943e-05,
"loss": 0.8608,
"step": 169
},
{
"epoch": 1.2710280373831775,
"grad_norm": 1.3011233806610107,
"learning_rate": 1.268422873171136e-05,
"loss": 0.8322,
"step": 170
},
{
"epoch": 1.2785046728971963,
"grad_norm": 1.5399248600006104,
"learning_rate": 1.2424894020096997e-05,
"loss": 0.7588,
"step": 171
},
{
"epoch": 1.2859813084112148,
"grad_norm": 1.449872374534607,
"learning_rate": 1.217838519610291e-05,
"loss": 0.857,
"step": 172
},
{
"epoch": 1.2934579439252336,
"grad_norm": 1.3477046489715576,
"learning_rate": 1.1944776521305213e-05,
"loss": 0.8627,
"step": 173
},
{
"epoch": 1.3009345794392524,
"grad_norm": 1.3076852560043335,
"learning_rate": 1.1724138371068603e-05,
"loss": 0.9005,
"step": 174
},
{
"epoch": 1.308411214953271,
"grad_norm": 1.294968843460083,
"learning_rate": 1.1516537213345519e-05,
"loss": 0.7639,
"step": 175
},
{
"epoch": 1.308411214953271,
"eval_loss": 1.0100014209747314,
"eval_runtime": 1.729,
"eval_samples_per_second": 28.918,
"eval_steps_per_second": 4.049,
"step": 175
},
{
"epoch": 1.3158878504672897,
"grad_norm": 1.3219696283340454,
"learning_rate": 1.1322035588652484e-05,
"loss": 0.7752,
"step": 176
},
{
"epoch": 1.3233644859813083,
"grad_norm": 1.1848926544189453,
"learning_rate": 1.1140692091229556e-05,
"loss": 0.7759,
"step": 177
},
{
"epoch": 1.330841121495327,
"grad_norm": 1.1485064029693604,
"learning_rate": 1.0972561351388622e-05,
"loss": 0.7454,
"step": 178
},
{
"epoch": 1.3383177570093459,
"grad_norm": 1.1740100383758545,
"learning_rate": 1.0817694019055866e-05,
"loss": 0.761,
"step": 179
},
{
"epoch": 1.3457943925233644,
"grad_norm": 1.3378069400787354,
"learning_rate": 1.0676136748513286e-05,
"loss": 0.8535,
"step": 180
},
{
"epoch": 1.3532710280373832,
"grad_norm": 1.2721531391143799,
"learning_rate": 1.0547932184343948e-05,
"loss": 0.8117,
"step": 181
},
{
"epoch": 1.3607476635514018,
"grad_norm": 1.255110740661621,
"learning_rate": 1.043311894858519e-05,
"loss": 0.8114,
"step": 182
},
{
"epoch": 1.3682242990654205,
"grad_norm": 1.184085726737976,
"learning_rate": 1.033173162909358e-05,
"loss": 0.7484,
"step": 183
},
{
"epoch": 1.3757009345794393,
"grad_norm": 1.2864772081375122,
"learning_rate": 1.0243800769125222e-05,
"loss": 0.8197,
"step": 184
},
{
"epoch": 1.3831775700934579,
"grad_norm": 1.3960767984390259,
"learning_rate": 1.0169352858134525e-05,
"loss": 0.8416,
"step": 185
},
{
"epoch": 1.3906542056074767,
"grad_norm": 1.6105817556381226,
"learning_rate": 1.0108410323794131e-05,
"loss": 0.8156,
"step": 186
},
{
"epoch": 1.3981308411214952,
"grad_norm": 1.4161114692687988,
"learning_rate": 1.0060991525238538e-05,
"loss": 0.8663,
"step": 187
},
{
"epoch": 1.405607476635514,
"grad_norm": 1.3891263008117676,
"learning_rate": 1.0027110747533332e-05,
"loss": 0.9249,
"step": 188
},
{
"epoch": 1.4130841121495328,
"grad_norm": 1.4171258211135864,
"learning_rate": 1.0006778197371774e-05,
"loss": 0.837,
"step": 189
},
{
"epoch": 1.4205607476635513,
"grad_norm": 1.4086953401565552,
"learning_rate": 1e-05,
"loss": 0.9393,
"step": 190
}
],
"logging_steps": 1,
"max_steps": 190,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 80,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.916153511660749e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}