{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0863531225905936,
"eval_steps": 64,
"global_step": 352,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003108003108003108,
"grad_norm": 10.027831077575684,
"learning_rate": 0.0,
"loss": 0.6767,
"step": 1
},
{
"epoch": 0.006216006216006216,
"grad_norm": 9.679778099060059,
"learning_rate": 5.000000000000001e-07,
"loss": 0.6644,
"step": 2
},
{
"epoch": 0.009324009324009324,
"grad_norm": 10.520271301269531,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.6934,
"step": 3
},
{
"epoch": 0.012432012432012432,
"grad_norm": 8.677583694458008,
"learning_rate": 1.5e-06,
"loss": 0.6617,
"step": 4
},
{
"epoch": 0.01554001554001554,
"grad_norm": 6.502548694610596,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6509,
"step": 5
},
{
"epoch": 0.018648018648018648,
"grad_norm": 4.257171154022217,
"learning_rate": 2.5e-06,
"loss": 0.639,
"step": 6
},
{
"epoch": 0.021756021756021756,
"grad_norm": 3.460066556930542,
"learning_rate": 3e-06,
"loss": 0.6286,
"step": 7
},
{
"epoch": 0.024864024864024864,
"grad_norm": 3.0126283168792725,
"learning_rate": 3.5e-06,
"loss": 0.5948,
"step": 8
},
{
"epoch": 0.027972027972027972,
"grad_norm": 2.567995309829712,
"learning_rate": 4.000000000000001e-06,
"loss": 0.5744,
"step": 9
},
{
"epoch": 0.03108003108003108,
"grad_norm": 2.516597032546997,
"learning_rate": 4.5e-06,
"loss": 0.5496,
"step": 10
},
{
"epoch": 0.03418803418803419,
"grad_norm": 1.8187586069107056,
"learning_rate": 5e-06,
"loss": 0.5397,
"step": 11
},
{
"epoch": 0.037296037296037296,
"grad_norm": 1.7935529947280884,
"learning_rate": 5.500000000000001e-06,
"loss": 0.5229,
"step": 12
},
{
"epoch": 0.04040404040404041,
"grad_norm": 1.8665963411331177,
"learning_rate": 6e-06,
"loss": 0.5227,
"step": 13
},
{
"epoch": 0.04351204351204351,
"grad_norm": 2.0106680393218994,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.4882,
"step": 14
},
{
"epoch": 0.046620046620046623,
"grad_norm": 3.305211305618286,
"learning_rate": 7e-06,
"loss": 0.4772,
"step": 15
},
{
"epoch": 0.04972804972804973,
"grad_norm": 3.047219753265381,
"learning_rate": 7.500000000000001e-06,
"loss": 0.452,
"step": 16
},
{
"epoch": 0.05283605283605284,
"grad_norm": 2.5453591346740723,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4138,
"step": 17
},
{
"epoch": 0.055944055944055944,
"grad_norm": 5.414841175079346,
"learning_rate": 8.5e-06,
"loss": 0.4238,
"step": 18
},
{
"epoch": 0.059052059052059055,
"grad_norm": 2.979440927505493,
"learning_rate": 9e-06,
"loss": 0.3987,
"step": 19
},
{
"epoch": 0.06216006216006216,
"grad_norm": 1.981175422668457,
"learning_rate": 9.5e-06,
"loss": 0.3874,
"step": 20
},
{
"epoch": 0.06526806526806526,
"grad_norm": 1.7793089151382446,
"learning_rate": 1e-05,
"loss": 0.3631,
"step": 21
},
{
"epoch": 0.06837606837606838,
"grad_norm": 1.1854480504989624,
"learning_rate": 9.989429175475688e-06,
"loss": 0.3765,
"step": 22
},
{
"epoch": 0.07148407148407149,
"grad_norm": 0.8928348422050476,
"learning_rate": 9.978858350951375e-06,
"loss": 0.3481,
"step": 23
},
{
"epoch": 0.07459207459207459,
"grad_norm": 1.7531942129135132,
"learning_rate": 9.968287526427062e-06,
"loss": 0.3693,
"step": 24
},
{
"epoch": 0.0777000777000777,
"grad_norm": 1.0829464197158813,
"learning_rate": 9.957716701902749e-06,
"loss": 0.3644,
"step": 25
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.98089200258255,
"learning_rate": 9.947145877378436e-06,
"loss": 0.3616,
"step": 26
},
{
"epoch": 0.08391608391608392,
"grad_norm": 0.795221745967865,
"learning_rate": 9.936575052854123e-06,
"loss": 0.3679,
"step": 27
},
{
"epoch": 0.08702408702408702,
"grad_norm": 1.091843605041504,
"learning_rate": 9.92600422832981e-06,
"loss": 0.3439,
"step": 28
},
{
"epoch": 0.09013209013209013,
"grad_norm": 0.8538377285003662,
"learning_rate": 9.915433403805497e-06,
"loss": 0.3401,
"step": 29
},
{
"epoch": 0.09324009324009325,
"grad_norm": 0.9114591479301453,
"learning_rate": 9.904862579281184e-06,
"loss": 0.3515,
"step": 30
},
{
"epoch": 0.09634809634809635,
"grad_norm": 0.9083001017570496,
"learning_rate": 9.894291754756871e-06,
"loss": 0.3449,
"step": 31
},
{
"epoch": 0.09945609945609946,
"grad_norm": 0.9144365787506104,
"learning_rate": 9.883720930232558e-06,
"loss": 0.3393,
"step": 32
},
{
"epoch": 0.10256410256410256,
"grad_norm": 1.0221809148788452,
"learning_rate": 9.873150105708245e-06,
"loss": 0.353,
"step": 33
},
{
"epoch": 0.10567210567210568,
"grad_norm": 1.0219439268112183,
"learning_rate": 9.862579281183932e-06,
"loss": 0.3439,
"step": 34
},
{
"epoch": 0.10878010878010878,
"grad_norm": 1.5430618524551392,
"learning_rate": 9.852008456659621e-06,
"loss": 0.3338,
"step": 35
},
{
"epoch": 0.11188811188811189,
"grad_norm": 1.4754544496536255,
"learning_rate": 9.841437632135308e-06,
"loss": 0.3363,
"step": 36
},
{
"epoch": 0.11499611499611499,
"grad_norm": 1.1298989057540894,
"learning_rate": 9.830866807610995e-06,
"loss": 0.3423,
"step": 37
},
{
"epoch": 0.11810411810411811,
"grad_norm": 1.0130062103271484,
"learning_rate": 9.820295983086682e-06,
"loss": 0.3298,
"step": 38
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.8003513813018799,
"learning_rate": 9.80972515856237e-06,
"loss": 0.3272,
"step": 39
},
{
"epoch": 0.12432012432012432,
"grad_norm": 0.9532265067100525,
"learning_rate": 9.799154334038056e-06,
"loss": 0.3282,
"step": 40
},
{
"epoch": 0.12742812742812742,
"grad_norm": 1.5232913494110107,
"learning_rate": 9.788583509513743e-06,
"loss": 0.3469,
"step": 41
},
{
"epoch": 0.13053613053613053,
"grad_norm": 0.8918169736862183,
"learning_rate": 9.77801268498943e-06,
"loss": 0.326,
"step": 42
},
{
"epoch": 0.13364413364413363,
"grad_norm": 0.8845950365066528,
"learning_rate": 9.767441860465117e-06,
"loss": 0.3313,
"step": 43
},
{
"epoch": 0.13675213675213677,
"grad_norm": 0.8410794138908386,
"learning_rate": 9.756871035940804e-06,
"loss": 0.3318,
"step": 44
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.7157808542251587,
"learning_rate": 9.746300211416491e-06,
"loss": 0.3381,
"step": 45
},
{
"epoch": 0.14296814296814297,
"grad_norm": 1.1680670976638794,
"learning_rate": 9.735729386892178e-06,
"loss": 0.3281,
"step": 46
},
{
"epoch": 0.14607614607614608,
"grad_norm": 0.9500836133956909,
"learning_rate": 9.725158562367865e-06,
"loss": 0.336,
"step": 47
},
{
"epoch": 0.14918414918414918,
"grad_norm": 0.8565309643745422,
"learning_rate": 9.714587737843552e-06,
"loss": 0.3207,
"step": 48
},
{
"epoch": 0.1522921522921523,
"grad_norm": 1.1311777830123901,
"learning_rate": 9.70401691331924e-06,
"loss": 0.3339,
"step": 49
},
{
"epoch": 0.1554001554001554,
"grad_norm": 1.0368160009384155,
"learning_rate": 9.693446088794927e-06,
"loss": 0.3262,
"step": 50
},
{
"epoch": 0.1585081585081585,
"grad_norm": 0.9648517370223999,
"learning_rate": 9.682875264270614e-06,
"loss": 0.3376,
"step": 51
},
{
"epoch": 0.16161616161616163,
"grad_norm": 1.1039059162139893,
"learning_rate": 9.6723044397463e-06,
"loss": 0.3352,
"step": 52
},
{
"epoch": 0.16472416472416473,
"grad_norm": 1.0544918775558472,
"learning_rate": 9.661733615221988e-06,
"loss": 0.3237,
"step": 53
},
{
"epoch": 0.16783216783216784,
"grad_norm": 1.533158302307129,
"learning_rate": 9.651162790697676e-06,
"loss": 0.3287,
"step": 54
},
{
"epoch": 0.17094017094017094,
"grad_norm": 1.2342826128005981,
"learning_rate": 9.640591966173363e-06,
"loss": 0.3162,
"step": 55
},
{
"epoch": 0.17404817404817405,
"grad_norm": 1.0702942609786987,
"learning_rate": 9.63002114164905e-06,
"loss": 0.3143,
"step": 56
},
{
"epoch": 0.17715617715617715,
"grad_norm": 1.02211594581604,
"learning_rate": 9.619450317124736e-06,
"loss": 0.3318,
"step": 57
},
{
"epoch": 0.18026418026418026,
"grad_norm": 0.8379388451576233,
"learning_rate": 9.608879492600423e-06,
"loss": 0.3239,
"step": 58
},
{
"epoch": 0.18337218337218336,
"grad_norm": 0.9620960354804993,
"learning_rate": 9.59830866807611e-06,
"loss": 0.3246,
"step": 59
},
{
"epoch": 0.1864801864801865,
"grad_norm": 0.9239097833633423,
"learning_rate": 9.587737843551797e-06,
"loss": 0.3278,
"step": 60
},
{
"epoch": 0.1895881895881896,
"grad_norm": 0.7097995281219482,
"learning_rate": 9.577167019027484e-06,
"loss": 0.3152,
"step": 61
},
{
"epoch": 0.1926961926961927,
"grad_norm": 0.9077997803688049,
"learning_rate": 9.566596194503171e-06,
"loss": 0.3219,
"step": 62
},
{
"epoch": 0.1958041958041958,
"grad_norm": 0.8704112768173218,
"learning_rate": 9.55602536997886e-06,
"loss": 0.3262,
"step": 63
},
{
"epoch": 0.1989121989121989,
"grad_norm": 0.9264605641365051,
"learning_rate": 9.545454545454547e-06,
"loss": 0.3176,
"step": 64
},
{
"epoch": 0.1989121989121989,
"eval_loss": 0.3377174139022827,
"eval_runtime": 149.1316,
"eval_samples_per_second": 1.911,
"eval_steps_per_second": 0.959,
"step": 64
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.9881049394607544,
"learning_rate": 9.534883720930234e-06,
"loss": 0.3312,
"step": 65
},
{
"epoch": 0.20512820512820512,
"grad_norm": 1.1825007200241089,
"learning_rate": 9.524312896405921e-06,
"loss": 0.3189,
"step": 66
},
{
"epoch": 0.20823620823620823,
"grad_norm": 0.8272495865821838,
"learning_rate": 9.513742071881608e-06,
"loss": 0.3293,
"step": 67
},
{
"epoch": 0.21134421134421136,
"grad_norm": 1.0992769002914429,
"learning_rate": 9.503171247357295e-06,
"loss": 0.3119,
"step": 68
},
{
"epoch": 0.21445221445221446,
"grad_norm": 0.9182390570640564,
"learning_rate": 9.492600422832982e-06,
"loss": 0.331,
"step": 69
},
{
"epoch": 0.21756021756021757,
"grad_norm": 0.8677308559417725,
"learning_rate": 9.482029598308669e-06,
"loss": 0.3168,
"step": 70
},
{
"epoch": 0.22066822066822067,
"grad_norm": 1.2915256023406982,
"learning_rate": 9.471458773784356e-06,
"loss": 0.3181,
"step": 71
},
{
"epoch": 0.22377622377622378,
"grad_norm": 1.6176910400390625,
"learning_rate": 9.460887949260043e-06,
"loss": 0.3254,
"step": 72
},
{
"epoch": 0.22688422688422688,
"grad_norm": 0.6357202529907227,
"learning_rate": 9.45031712473573e-06,
"loss": 0.3298,
"step": 73
},
{
"epoch": 0.22999222999222999,
"grad_norm": 0.911662220954895,
"learning_rate": 9.439746300211417e-06,
"loss": 0.3248,
"step": 74
},
{
"epoch": 0.2331002331002331,
"grad_norm": 0.7426556944847107,
"learning_rate": 9.429175475687104e-06,
"loss": 0.3301,
"step": 75
},
{
"epoch": 0.23620823620823622,
"grad_norm": 0.7509779930114746,
"learning_rate": 9.418604651162791e-06,
"loss": 0.3209,
"step": 76
},
{
"epoch": 0.23931623931623933,
"grad_norm": 0.7699870467185974,
"learning_rate": 9.408033826638478e-06,
"loss": 0.3171,
"step": 77
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.7583193182945251,
"learning_rate": 9.397463002114165e-06,
"loss": 0.3128,
"step": 78
},
{
"epoch": 0.24553224553224554,
"grad_norm": 0.968973696231842,
"learning_rate": 9.386892177589852e-06,
"loss": 0.3293,
"step": 79
},
{
"epoch": 0.24864024864024864,
"grad_norm": 0.9967902302742004,
"learning_rate": 9.37632135306554e-06,
"loss": 0.3209,
"step": 80
},
{
"epoch": 0.2517482517482518,
"grad_norm": 0.7837809920310974,
"learning_rate": 9.365750528541226e-06,
"loss": 0.3152,
"step": 81
},
{
"epoch": 0.25485625485625485,
"grad_norm": 1.6905367374420166,
"learning_rate": 9.355179704016915e-06,
"loss": 0.3163,
"step": 82
},
{
"epoch": 0.257964257964258,
"grad_norm": 0.8734452128410339,
"learning_rate": 9.344608879492602e-06,
"loss": 0.3306,
"step": 83
},
{
"epoch": 0.26107226107226106,
"grad_norm": 3.6059653759002686,
"learning_rate": 9.33403805496829e-06,
"loss": 0.3104,
"step": 84
},
{
"epoch": 0.2641802641802642,
"grad_norm": 1.1703656911849976,
"learning_rate": 9.323467230443976e-06,
"loss": 0.3071,
"step": 85
},
{
"epoch": 0.26728826728826727,
"grad_norm": 0.8762909770011902,
"learning_rate": 9.312896405919663e-06,
"loss": 0.3022,
"step": 86
},
{
"epoch": 0.2703962703962704,
"grad_norm": 2.158876419067383,
"learning_rate": 9.30232558139535e-06,
"loss": 0.3217,
"step": 87
},
{
"epoch": 0.27350427350427353,
"grad_norm": 0.8010348081588745,
"learning_rate": 9.291754756871036e-06,
"loss": 0.322,
"step": 88
},
{
"epoch": 0.2766122766122766,
"grad_norm": 1.119739055633545,
"learning_rate": 9.281183932346723e-06,
"loss": 0.3248,
"step": 89
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.7900079488754272,
"learning_rate": 9.27061310782241e-06,
"loss": 0.3102,
"step": 90
},
{
"epoch": 0.2828282828282828,
"grad_norm": 0.8093041181564331,
"learning_rate": 9.260042283298098e-06,
"loss": 0.3259,
"step": 91
},
{
"epoch": 0.28593628593628595,
"grad_norm": 0.7240622043609619,
"learning_rate": 9.249471458773785e-06,
"loss": 0.3002,
"step": 92
},
{
"epoch": 0.289044289044289,
"grad_norm": 0.9449782371520996,
"learning_rate": 9.238900634249473e-06,
"loss": 0.3076,
"step": 93
},
{
"epoch": 0.29215229215229216,
"grad_norm": 0.9448596835136414,
"learning_rate": 9.22832980972516e-06,
"loss": 0.3012,
"step": 94
},
{
"epoch": 0.29526029526029524,
"grad_norm": 0.9209067821502686,
"learning_rate": 9.217758985200847e-06,
"loss": 0.3131,
"step": 95
},
{
"epoch": 0.29836829836829837,
"grad_norm": 0.878709614276886,
"learning_rate": 9.207188160676534e-06,
"loss": 0.3157,
"step": 96
},
{
"epoch": 0.3014763014763015,
"grad_norm": 1.1178463697433472,
"learning_rate": 9.19661733615222e-06,
"loss": 0.3166,
"step": 97
},
{
"epoch": 0.3045843045843046,
"grad_norm": 0.9717866778373718,
"learning_rate": 9.186046511627908e-06,
"loss": 0.3144,
"step": 98
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.9905857443809509,
"learning_rate": 9.175475687103595e-06,
"loss": 0.3263,
"step": 99
},
{
"epoch": 0.3108003108003108,
"grad_norm": 1.0447399616241455,
"learning_rate": 9.164904862579282e-06,
"loss": 0.3074,
"step": 100
},
{
"epoch": 0.3139083139083139,
"grad_norm": 0.9876366853713989,
"learning_rate": 9.154334038054969e-06,
"loss": 0.3221,
"step": 101
},
{
"epoch": 0.317016317016317,
"grad_norm": 1.3406106233596802,
"learning_rate": 9.143763213530656e-06,
"loss": 0.3209,
"step": 102
},
{
"epoch": 0.3201243201243201,
"grad_norm": 1.1402978897094727,
"learning_rate": 9.133192389006343e-06,
"loss": 0.3181,
"step": 103
},
{
"epoch": 0.32323232323232326,
"grad_norm": 1.0274314880371094,
"learning_rate": 9.12262156448203e-06,
"loss": 0.3179,
"step": 104
},
{
"epoch": 0.32634032634032634,
"grad_norm": 1.0853135585784912,
"learning_rate": 9.112050739957717e-06,
"loss": 0.3068,
"step": 105
},
{
"epoch": 0.32944832944832947,
"grad_norm": 0.9549627900123596,
"learning_rate": 9.101479915433404e-06,
"loss": 0.3058,
"step": 106
},
{
"epoch": 0.33255633255633255,
"grad_norm": 0.9081363081932068,
"learning_rate": 9.090909090909091e-06,
"loss": 0.305,
"step": 107
},
{
"epoch": 0.3356643356643357,
"grad_norm": 1.083267092704773,
"learning_rate": 9.080338266384778e-06,
"loss": 0.3293,
"step": 108
},
{
"epoch": 0.33877233877233875,
"grad_norm": 0.9146764278411865,
"learning_rate": 9.069767441860465e-06,
"loss": 0.3308,
"step": 109
},
{
"epoch": 0.3418803418803419,
"grad_norm": 0.8309290409088135,
"learning_rate": 9.059196617336154e-06,
"loss": 0.3219,
"step": 110
},
{
"epoch": 0.34498834498834496,
"grad_norm": 0.7540556788444519,
"learning_rate": 9.048625792811841e-06,
"loss": 0.3165,
"step": 111
},
{
"epoch": 0.3480963480963481,
"grad_norm": 0.7756165862083435,
"learning_rate": 9.038054968287528e-06,
"loss": 0.3201,
"step": 112
},
{
"epoch": 0.35120435120435123,
"grad_norm": 1.016161561012268,
"learning_rate": 9.027484143763215e-06,
"loss": 0.318,
"step": 113
},
{
"epoch": 0.3543123543123543,
"grad_norm": 1.1762275695800781,
"learning_rate": 9.016913319238902e-06,
"loss": 0.3071,
"step": 114
},
{
"epoch": 0.35742035742035744,
"grad_norm": 1.0186941623687744,
"learning_rate": 9.006342494714589e-06,
"loss": 0.3094,
"step": 115
},
{
"epoch": 0.3605283605283605,
"grad_norm": 1.3835426568984985,
"learning_rate": 8.995771670190276e-06,
"loss": 0.3203,
"step": 116
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.9151639938354492,
"learning_rate": 8.985200845665963e-06,
"loss": 0.3075,
"step": 117
},
{
"epoch": 0.3667443667443667,
"grad_norm": 0.9079708456993103,
"learning_rate": 8.974630021141648e-06,
"loss": 0.3111,
"step": 118
},
{
"epoch": 0.36985236985236986,
"grad_norm": 0.7135366201400757,
"learning_rate": 8.964059196617337e-06,
"loss": 0.3131,
"step": 119
},
{
"epoch": 0.372960372960373,
"grad_norm": 0.7310993671417236,
"learning_rate": 8.953488372093024e-06,
"loss": 0.3181,
"step": 120
},
{
"epoch": 0.37606837606837606,
"grad_norm": 0.9562262296676636,
"learning_rate": 8.942917547568711e-06,
"loss": 0.3114,
"step": 121
},
{
"epoch": 0.3791763791763792,
"grad_norm": 1.088692545890808,
"learning_rate": 8.932346723044398e-06,
"loss": 0.2985,
"step": 122
},
{
"epoch": 0.3822843822843823,
"grad_norm": 1.3334287405014038,
"learning_rate": 8.921775898520085e-06,
"loss": 0.3198,
"step": 123
},
{
"epoch": 0.3853923853923854,
"grad_norm": 1.1457082033157349,
"learning_rate": 8.911205073995772e-06,
"loss": 0.3027,
"step": 124
},
{
"epoch": 0.3885003885003885,
"grad_norm": 1.0944201946258545,
"learning_rate": 8.90063424947146e-06,
"loss": 0.3195,
"step": 125
},
{
"epoch": 0.3916083916083916,
"grad_norm": 1.679890513420105,
"learning_rate": 8.890063424947146e-06,
"loss": 0.3118,
"step": 126
},
{
"epoch": 0.3947163947163947,
"grad_norm": 1.0934737920761108,
"learning_rate": 8.879492600422833e-06,
"loss": 0.3125,
"step": 127
},
{
"epoch": 0.3978243978243978,
"grad_norm": 0.9423776865005493,
"learning_rate": 8.86892177589852e-06,
"loss": 0.3069,
"step": 128
},
{
"epoch": 0.3978243978243978,
"eval_loss": 0.33542340993881226,
"eval_runtime": 147.0915,
"eval_samples_per_second": 1.938,
"eval_steps_per_second": 0.972,
"step": 128
},
{
"epoch": 0.40093240093240096,
"grad_norm": 1.373064637184143,
"learning_rate": 8.858350951374208e-06,
"loss": 0.3113,
"step": 129
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.9782734513282776,
"learning_rate": 8.847780126849895e-06,
"loss": 0.3176,
"step": 130
},
{
"epoch": 0.40714840714840717,
"grad_norm": 1.1988129615783691,
"learning_rate": 8.837209302325582e-06,
"loss": 0.3036,
"step": 131
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.3978164196014404,
"learning_rate": 8.826638477801269e-06,
"loss": 0.3067,
"step": 132
},
{
"epoch": 0.4133644133644134,
"grad_norm": 0.8266012072563171,
"learning_rate": 8.816067653276956e-06,
"loss": 0.3105,
"step": 133
},
{
"epoch": 0.41647241647241645,
"grad_norm": 1.0358003377914429,
"learning_rate": 8.805496828752643e-06,
"loss": 0.3176,
"step": 134
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.9363102316856384,
"learning_rate": 8.79492600422833e-06,
"loss": 0.3151,
"step": 135
},
{
"epoch": 0.4226884226884227,
"grad_norm": 0.9805242419242859,
"learning_rate": 8.784355179704017e-06,
"loss": 0.3164,
"step": 136
},
{
"epoch": 0.4257964257964258,
"grad_norm": 1.4923985004425049,
"learning_rate": 8.773784355179706e-06,
"loss": 0.3059,
"step": 137
},
{
"epoch": 0.4289044289044289,
"grad_norm": 1.7009886503219604,
"learning_rate": 8.763213530655393e-06,
"loss": 0.2937,
"step": 138
},
{
"epoch": 0.432012432012432,
"grad_norm": 0.8320425748825073,
"learning_rate": 8.75264270613108e-06,
"loss": 0.288,
"step": 139
},
{
"epoch": 0.43512043512043513,
"grad_norm": 1.3431979417800903,
"learning_rate": 8.742071881606767e-06,
"loss": 0.3063,
"step": 140
},
{
"epoch": 0.4382284382284382,
"grad_norm": 1.0519447326660156,
"learning_rate": 8.731501057082454e-06,
"loss": 0.3043,
"step": 141
},
{
"epoch": 0.44133644133644134,
"grad_norm": 1.0041645765304565,
"learning_rate": 8.72093023255814e-06,
"loss": 0.3207,
"step": 142
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.176352620124817,
"learning_rate": 8.710359408033828e-06,
"loss": 0.3099,
"step": 143
},
{
"epoch": 0.44755244755244755,
"grad_norm": 0.8591434955596924,
"learning_rate": 8.699788583509515e-06,
"loss": 0.2913,
"step": 144
},
{
"epoch": 0.4471858134155744,
"grad_norm": 1.2351419925689697,
"learning_rate": 8.689217758985202e-06,
"loss": 0.3099,
"step": 145
},
{
"epoch": 0.4502698535080956,
"grad_norm": 1.8375589847564697,
"learning_rate": 8.691099476439791e-06,
"loss": 0.3092,
"step": 146
},
{
"epoch": 0.4533538936006168,
"grad_norm": 1.07125985622406,
"learning_rate": 8.680628272251308e-06,
"loss": 0.3016,
"step": 147
},
{
"epoch": 0.456437933693138,
"grad_norm": 1.1839478015899658,
"learning_rate": 8.670157068062827e-06,
"loss": 0.3003,
"step": 148
},
{
"epoch": 0.45952197378565923,
"grad_norm": 1.294833779335022,
"learning_rate": 8.659685863874346e-06,
"loss": 0.2972,
"step": 149
},
{
"epoch": 0.4626060138781804,
"grad_norm": 1.0540661811828613,
"learning_rate": 8.649214659685865e-06,
"loss": 0.2837,
"step": 150
},
{
"epoch": 0.4656900539707016,
"grad_norm": 1.1067568063735962,
"learning_rate": 8.638743455497383e-06,
"loss": 0.2966,
"step": 151
},
{
"epoch": 0.46877409406322285,
"grad_norm": 0.9972389340400696,
"learning_rate": 8.6282722513089e-06,
"loss": 0.2934,
"step": 152
},
{
"epoch": 0.471858134155744,
"grad_norm": 1.1589370965957642,
"learning_rate": 8.61780104712042e-06,
"loss": 0.3026,
"step": 153
},
{
"epoch": 0.47494217424826524,
"grad_norm": 1.1224210262298584,
"learning_rate": 8.607329842931938e-06,
"loss": 0.3042,
"step": 154
},
{
"epoch": 0.4780262143407864,
"grad_norm": 1.3200238943099976,
"learning_rate": 8.596858638743457e-06,
"loss": 0.3124,
"step": 155
},
{
"epoch": 0.4811102544333076,
"grad_norm": 1.1300067901611328,
"learning_rate": 8.586387434554974e-06,
"loss": 0.3167,
"step": 156
},
{
"epoch": 0.48419429452582885,
"grad_norm": 0.9678866863250732,
"learning_rate": 8.575916230366493e-06,
"loss": 0.3039,
"step": 157
},
{
"epoch": 0.48727833461835,
"grad_norm": 0.9656190872192383,
"learning_rate": 8.565445026178011e-06,
"loss": 0.3067,
"step": 158
},
{
"epoch": 0.49036237471087124,
"grad_norm": 0.9618685245513916,
"learning_rate": 8.55497382198953e-06,
"loss": 0.2992,
"step": 159
},
{
"epoch": 0.49344641480339246,
"grad_norm": 1.1055867671966553,
"learning_rate": 8.544502617801049e-06,
"loss": 0.2986,
"step": 160
},
{
"epoch": 0.49653045489591363,
"grad_norm": 0.8761485815048218,
"learning_rate": 8.534031413612566e-06,
"loss": 0.3071,
"step": 161
},
{
"epoch": 0.49961449498843485,
"grad_norm": 1.0709651708602905,
"learning_rate": 8.523560209424085e-06,
"loss": 0.2965,
"step": 162
},
{
"epoch": 0.5026985350809561,
"grad_norm": 1.2407382726669312,
"learning_rate": 8.513089005235604e-06,
"loss": 0.3134,
"step": 163
},
{
"epoch": 0.5057825751734772,
"grad_norm": 1.46315598487854,
"learning_rate": 8.502617801047122e-06,
"loss": 0.2886,
"step": 164
},
{
"epoch": 0.5088666152659984,
"grad_norm": 1.2314726114273071,
"learning_rate": 8.49214659685864e-06,
"loss": 0.2902,
"step": 165
},
{
"epoch": 0.5119506553585197,
"grad_norm": 1.223716378211975,
"learning_rate": 8.481675392670158e-06,
"loss": 0.3088,
"step": 166
},
{
"epoch": 0.5150346954510409,
"grad_norm": 1.1966098546981812,
"learning_rate": 8.471204188481677e-06,
"loss": 0.3139,
"step": 167
},
{
"epoch": 0.518118735543562,
"grad_norm": 1.1182276010513306,
"learning_rate": 8.460732984293194e-06,
"loss": 0.3161,
"step": 168
},
{
"epoch": 0.5212027756360833,
"grad_norm": 1.1583510637283325,
"learning_rate": 8.450261780104713e-06,
"loss": 0.3041,
"step": 169
},
{
"epoch": 0.5242868157286045,
"grad_norm": 1.1864618062973022,
"learning_rate": 8.439790575916232e-06,
"loss": 0.3008,
"step": 170
},
{
"epoch": 0.5273708558211256,
"grad_norm": 1.3757935762405396,
"learning_rate": 8.429319371727749e-06,
"loss": 0.2865,
"step": 171
},
{
"epoch": 0.5304548959136469,
"grad_norm": 1.4410743713378906,
"learning_rate": 8.418848167539267e-06,
"loss": 0.3081,
"step": 172
},
{
"epoch": 0.5335389360061681,
"grad_norm": 1.3494313955307007,
"learning_rate": 8.408376963350786e-06,
"loss": 0.2988,
"step": 173
},
{
"epoch": 0.5366229760986893,
"grad_norm": 1.3871009349822998,
"learning_rate": 8.397905759162305e-06,
"loss": 0.3045,
"step": 174
},
{
"epoch": 0.5397070161912105,
"grad_norm": 1.183766484260559,
"learning_rate": 8.387434554973822e-06,
"loss": 0.2969,
"step": 175
},
{
"epoch": 0.5427910562837317,
"grad_norm": 1.1075443029403687,
"learning_rate": 8.37696335078534e-06,
"loss": 0.2834,
"step": 176
},
{
"epoch": 0.5458750963762529,
"grad_norm": 1.3118195533752441,
"learning_rate": 8.36649214659686e-06,
"loss": 0.2945,
"step": 177
},
{
"epoch": 0.5489591364687741,
"grad_norm": 1.3226675987243652,
"learning_rate": 8.356020942408377e-06,
"loss": 0.3085,
"step": 178
},
{
"epoch": 0.5520431765612953,
"grad_norm": 1.1877515316009521,
"learning_rate": 8.345549738219895e-06,
"loss": 0.2757,
"step": 179
},
{
"epoch": 0.5551272166538165,
"grad_norm": 1.379599928855896,
"learning_rate": 8.335078534031414e-06,
"loss": 0.2968,
"step": 180
},
{
"epoch": 0.5582112567463376,
"grad_norm": 1.2975775003433228,
"learning_rate": 8.324607329842933e-06,
"loss": 0.3074,
"step": 181
},
{
"epoch": 0.5612952968388589,
"grad_norm": 1.2829333543777466,
"learning_rate": 8.31413612565445e-06,
"loss": 0.3014,
"step": 182
},
{
"epoch": 0.5643793369313801,
"grad_norm": 1.4759114980697632,
"learning_rate": 8.303664921465969e-06,
"loss": 0.3014,
"step": 183
},
{
"epoch": 0.5674633770239013,
"grad_norm": 1.3108978271484375,
"learning_rate": 8.293193717277488e-06,
"loss": 0.2914,
"step": 184
},
{
"epoch": 0.5705474171164225,
"grad_norm": 1.271666407585144,
"learning_rate": 8.282722513089005e-06,
"loss": 0.305,
"step": 185
},
{
"epoch": 0.5736314572089437,
"grad_norm": 1.1115907430648804,
"learning_rate": 8.272251308900523e-06,
"loss": 0.2963,
"step": 186
},
{
"epoch": 0.5767154973014649,
"grad_norm": 1.089092493057251,
"learning_rate": 8.261780104712042e-06,
"loss": 0.303,
"step": 187
},
{
"epoch": 0.5797995373939862,
"grad_norm": 1.1514776945114136,
"learning_rate": 8.251308900523561e-06,
"loss": 0.3073,
"step": 188
},
{
"epoch": 0.5828835774865073,
"grad_norm": 1.1654891967773438,
"learning_rate": 8.240837696335078e-06,
"loss": 0.2883,
"step": 189
},
{
"epoch": 0.5859676175790285,
"grad_norm": 1.2040210962295532,
"learning_rate": 8.230366492146597e-06,
"loss": 0.295,
"step": 190
},
{
"epoch": 0.5890516576715498,
"grad_norm": 1.203511118888855,
"learning_rate": 8.219895287958116e-06,
"loss": 0.2795,
"step": 191
},
{
"epoch": 0.5921356977640709,
"grad_norm": 1.5743706226348877,
"learning_rate": 8.209424083769634e-06,
"loss": 0.3123,
"step": 192
},
{
"epoch": 0.5921356977640709,
"eval_loss": 0.3412991166114807,
"eval_runtime": 149.387,
"eval_samples_per_second": 1.928,
"eval_steps_per_second": 0.964,
"step": 192
},
{
"epoch": 0.5952197378565921,
"grad_norm": 1.4109128713607788,
"learning_rate": 8.198952879581153e-06,
"loss": 0.2996,
"step": 193
},
{
"epoch": 0.5983037779491134,
"grad_norm": 1.3817074298858643,
"learning_rate": 8.18848167539267e-06,
"loss": 0.2964,
"step": 194
},
{
"epoch": 0.6013878180416345,
"grad_norm": 1.3587619066238403,
"learning_rate": 8.178010471204189e-06,
"loss": 0.3004,
"step": 195
},
{
"epoch": 0.6044718581341557,
"grad_norm": 1.502744197845459,
"learning_rate": 8.167539267015708e-06,
"loss": 0.2957,
"step": 196
},
{
"epoch": 0.607555898226677,
"grad_norm": 1.4416728019714355,
"learning_rate": 8.157068062827227e-06,
"loss": 0.2962,
"step": 197
},
{
"epoch": 0.6106399383191982,
"grad_norm": 2.2597157955169678,
"learning_rate": 8.146596858638745e-06,
"loss": 0.2853,
"step": 198
},
{
"epoch": 0.6137239784117193,
"grad_norm": 1.854837417602539,
"learning_rate": 8.136125654450262e-06,
"loss": 0.2918,
"step": 199
},
{
"epoch": 0.6168080185042406,
"grad_norm": 2.1409687995910645,
"learning_rate": 8.125654450261781e-06,
"loss": 0.3118,
"step": 200
},
{
"epoch": 0.6198920585967618,
"grad_norm": 1.7128517627716064,
"learning_rate": 8.1151832460733e-06,
"loss": 0.2822,
"step": 201
},
{
"epoch": 0.6229760986892829,
"grad_norm": 1.4401497840881348,
"learning_rate": 8.104712041884819e-06,
"loss": 0.2802,
"step": 202
},
{
"epoch": 0.6260601387818041,
"grad_norm": 1.7307312488555908,
"learning_rate": 8.094240837696336e-06,
"loss": 0.2973,
"step": 203
},
{
"epoch": 0.6291441788743254,
"grad_norm": 1.263535737991333,
"learning_rate": 8.083769633507855e-06,
"loss": 0.3016,
"step": 204
},
{
"epoch": 0.6322282189668466,
"grad_norm": 1.4065901041030884,
"learning_rate": 8.073298429319373e-06,
"loss": 0.284,
"step": 205
},
{
"epoch": 0.6353122590593677,
"grad_norm": 1.6004809141159058,
"learning_rate": 8.06282722513089e-06,
"loss": 0.2908,
"step": 206
},
{
"epoch": 0.638396299151889,
"grad_norm": 1.458287239074707,
"learning_rate": 8.05235602094241e-06,
"loss": 0.2832,
"step": 207
},
{
"epoch": 0.6414803392444102,
"grad_norm": 1.8239188194274902,
"learning_rate": 8.041884816753928e-06,
"loss": 0.2993,
"step": 208
},
{
"epoch": 0.6445643793369313,
"grad_norm": 1.8187966346740723,
"learning_rate": 8.031413612565445e-06,
"loss": 0.311,
"step": 209
},
{
"epoch": 0.6476484194294526,
"grad_norm": 1.5089385509490967,
"learning_rate": 8.020942408376964e-06,
"loss": 0.2835,
"step": 210
},
{
"epoch": 0.6507324595219738,
"grad_norm": 1.5591213703155518,
"learning_rate": 8.010471204188483e-06,
"loss": 0.2985,
"step": 211
},
{
"epoch": 0.653816499614495,
"grad_norm": 1.5221312046051025,
"learning_rate": 8.000000000000001e-06,
"loss": 0.2805,
"step": 212
},
{
"epoch": 0.6569005397070162,
"grad_norm": 1.8211005926132202,
"learning_rate": 7.989528795811518e-06,
"loss": 0.2728,
"step": 213
},
{
"epoch": 0.6599845797995374,
"grad_norm": 2.2500016689300537,
"learning_rate": 7.979057591623037e-06,
"loss": 0.2932,
"step": 214
},
{
"epoch": 0.6630686198920586,
"grad_norm": 1.7227460145950317,
"learning_rate": 7.968586387434556e-06,
"loss": 0.2927,
"step": 215
},
{
"epoch": 0.6661526599845798,
"grad_norm": 2.1821672916412354,
"learning_rate": 7.958115183246073e-06,
"loss": 0.2919,
"step": 216
},
{
"epoch": 0.669236700077101,
"grad_norm": 1.3368958234786987,
"learning_rate": 7.947643979057592e-06,
"loss": 0.2789,
"step": 217
},
{
"epoch": 0.6723207401696222,
"grad_norm": 1.4419403076171875,
"learning_rate": 7.93717277486911e-06,
"loss": 0.2876,
"step": 218
},
{
"epoch": 0.6754047802621435,
"grad_norm": 2.0355281829833984,
"learning_rate": 7.92670157068063e-06,
"loss": 0.3059,
"step": 219
},
{
"epoch": 0.6784888203546646,
"grad_norm": 1.7871628999710083,
"learning_rate": 7.916230366492146e-06,
"loss": 0.2804,
"step": 220
},
{
"epoch": 0.6815728604471858,
"grad_norm": 1.8160405158996582,
"learning_rate": 7.905759162303665e-06,
"loss": 0.2842,
"step": 221
},
{
"epoch": 0.6846569005397071,
"grad_norm": 2.1498160362243652,
"learning_rate": 7.895287958115184e-06,
"loss": 0.2875,
"step": 222
},
{
"epoch": 0.6877409406322282,
"grad_norm": 1.9483954906463623,
"learning_rate": 7.884816753926701e-06,
"loss": 0.2874,
"step": 223
},
{
"epoch": 0.6908249807247494,
"grad_norm": 2.0145816802978516,
"learning_rate": 7.87434554973822e-06,
"loss": 0.2879,
"step": 224
},
{
"epoch": 0.6939090208172706,
"grad_norm": 1.680413007736206,
"learning_rate": 7.863874345549739e-06,
"loss": 0.2755,
"step": 225
},
{
"epoch": 0.6969930609097919,
"grad_norm": 1.5203242301940918,
"learning_rate": 7.853403141361257e-06,
"loss": 0.284,
"step": 226
},
{
"epoch": 0.700077101002313,
"grad_norm": 1.892943263053894,
"learning_rate": 7.842931937172774e-06,
"loss": 0.2799,
"step": 227
},
{
"epoch": 0.7031611410948342,
"grad_norm": 1.5476278066635132,
"learning_rate": 7.832460732984293e-06,
"loss": 0.2767,
"step": 228
},
{
"epoch": 0.7062451811873555,
"grad_norm": 2.2650210857391357,
"learning_rate": 7.821989528795812e-06,
"loss": 0.2905,
"step": 229
},
{
"epoch": 0.7093292212798766,
"grad_norm": 2.1595096588134766,
"learning_rate": 7.81151832460733e-06,
"loss": 0.274,
"step": 230
},
{
"epoch": 0.7124132613723978,
"grad_norm": 1.587994933128357,
"learning_rate": 7.80104712041885e-06,
"loss": 0.2743,
"step": 231
},
{
"epoch": 0.7154973014649191,
"grad_norm": 1.9411978721618652,
"learning_rate": 7.790575916230367e-06,
"loss": 0.272,
"step": 232
},
{
"epoch": 0.7185813415574402,
"grad_norm": 2.1039252281188965,
"learning_rate": 7.780104712041885e-06,
"loss": 0.2884,
"step": 233
},
{
"epoch": 0.7216653816499614,
"grad_norm": 1.834591269493103,
"learning_rate": 7.769633507853404e-06,
"loss": 0.2756,
"step": 234
},
{
"epoch": 0.7247494217424827,
"grad_norm": 2.1758062839508057,
"learning_rate": 7.759162303664923e-06,
"loss": 0.287,
"step": 235
},
{
"epoch": 0.7278334618350039,
"grad_norm": 2.0601179599761963,
"learning_rate": 7.748691099476442e-06,
"loss": 0.2683,
"step": 236
},
{
"epoch": 0.730917501927525,
"grad_norm": 1.7605801820755005,
"learning_rate": 7.738219895287959e-06,
"loss": 0.2552,
"step": 237
},
{
"epoch": 0.7340015420200463,
"grad_norm": 2.0951759815216064,
"learning_rate": 7.727748691099478e-06,
"loss": 0.258,
"step": 238
},
{
"epoch": 0.7370855821125675,
"grad_norm": 2.2250118255615234,
"learning_rate": 7.717277486910996e-06,
"loss": 0.2627,
"step": 239
},
{
"epoch": 0.7401696222050886,
"grad_norm": 2.54436993598938,
"learning_rate": 7.706806282722513e-06,
"loss": 0.278,
"step": 240
},
{
"epoch": 0.7432536622976099,
"grad_norm": 1.810699701309204,
"learning_rate": 7.696335078534032e-06,
"loss": 0.2684,
"step": 241
},
{
"epoch": 0.7463377023901311,
"grad_norm": 2.161043882369995,
"learning_rate": 7.685863874345551e-06,
"loss": 0.2828,
"step": 242
},
{
"epoch": 0.7494217424826523,
"grad_norm": 1.7965888977050781,
"learning_rate": 7.67539267015707e-06,
"loss": 0.2677,
"step": 243
},
{
"epoch": 0.7525057825751735,
"grad_norm": 1.9139559268951416,
"learning_rate": 7.664921465968587e-06,
"loss": 0.2701,
"step": 244
},
{
"epoch": 0.7555898226676947,
"grad_norm": 2.0285589694976807,
"learning_rate": 7.654450261780106e-06,
"loss": 0.2726,
"step": 245
},
{
"epoch": 0.7586738627602159,
"grad_norm": 2.2968027591705322,
"learning_rate": 7.643979057591624e-06,
"loss": 0.2606,
"step": 246
},
{
"epoch": 0.761757902852737,
"grad_norm": 2.4324936866760254,
"learning_rate": 7.633507853403141e-06,
"loss": 0.2659,
"step": 247
},
{
"epoch": 0.7648419429452583,
"grad_norm": 2.66330885887146,
"learning_rate": 7.62303664921466e-06,
"loss": 0.2627,
"step": 248
},
{
"epoch": 0.7679259830377795,
"grad_norm": 2.435866355895996,
"learning_rate": 7.612565445026179e-06,
"loss": 0.2713,
"step": 249
},
{
"epoch": 0.7710100231303006,
"grad_norm": 2.2584385871887207,
"learning_rate": 7.602094240837698e-06,
"loss": 0.2754,
"step": 250
},
{
"epoch": 0.7740940632228219,
"grad_norm": 2.1898317337036133,
"learning_rate": 7.591623036649215e-06,
"loss": 0.2705,
"step": 251
},
{
"epoch": 0.7771781033153431,
"grad_norm": 2.051255464553833,
"learning_rate": 7.5811518324607335e-06,
"loss": 0.2491,
"step": 252
},
{
"epoch": 0.7802621434078643,
"grad_norm": 2.353940725326538,
"learning_rate": 7.570680628272252e-06,
"loss": 0.277,
"step": 253
},
{
"epoch": 0.7833461835003855,
"grad_norm": 2.3826687335968018,
"learning_rate": 7.560209424083769e-06,
"loss": 0.2693,
"step": 254
},
{
"epoch": 0.7864302235929067,
"grad_norm": 2.522019863128662,
"learning_rate": 7.549738219895288e-06,
"loss": 0.2706,
"step": 255
},
{
"epoch": 0.7895142636854279,
"grad_norm": 2.3525524139404297,
"learning_rate": 7.539267015706807e-06,
"loss": 0.2509,
"step": 256
},
{
"epoch": 0.7895142636854279,
"eval_loss": 0.3851300776004791,
"eval_runtime": 149.046,
"eval_samples_per_second": 1.932,
"eval_steps_per_second": 0.966,
"step": 256
},
{
"epoch": 0.7925983037779492,
"grad_norm": 2.7143642902374268,
"learning_rate": 7.528795811518326e-06,
"loss": 0.2701,
"step": 257
},
{
"epoch": 0.7956823438704703,
"grad_norm": 2.6725356578826904,
"learning_rate": 7.518324607329844e-06,
"loss": 0.2718,
"step": 258
},
{
"epoch": 0.7987663839629915,
"grad_norm": 2.4051880836486816,
"learning_rate": 7.5078534031413615e-06,
"loss": 0.2554,
"step": 259
},
{
"epoch": 0.8018504240555128,
"grad_norm": 2.472904920578003,
"learning_rate": 7.49738219895288e-06,
"loss": 0.2666,
"step": 260
},
{
"epoch": 0.8049344641480339,
"grad_norm": 2.3598804473876953,
"learning_rate": 7.486910994764398e-06,
"loss": 0.2532,
"step": 261
},
{
"epoch": 0.8080185042405551,
"grad_norm": 2.383300542831421,
"learning_rate": 7.476439790575917e-06,
"loss": 0.2568,
"step": 262
},
{
"epoch": 0.8111025443330764,
"grad_norm": 2.999469518661499,
"learning_rate": 7.465968586387436e-06,
"loss": 0.2403,
"step": 263
},
{
"epoch": 0.8141865844255975,
"grad_norm": 4.071384429931641,
"learning_rate": 7.455497382198954e-06,
"loss": 0.265,
"step": 264
},
{
"epoch": 0.8172706245181187,
"grad_norm": 3.5529489517211914,
"learning_rate": 7.445026178010472e-06,
"loss": 0.2647,
"step": 265
},
{
"epoch": 0.8203546646106399,
"grad_norm": 2.8842644691467285,
"learning_rate": 7.43455497382199e-06,
"loss": 0.2725,
"step": 266
},
{
"epoch": 0.8234387047031612,
"grad_norm": 2.1277332305908203,
"learning_rate": 7.424083769633509e-06,
"loss": 0.2657,
"step": 267
},
{
"epoch": 0.8265227447956823,
"grad_norm": 2.832111358642578,
"learning_rate": 7.413612565445026e-06,
"loss": 0.255,
"step": 268
},
{
"epoch": 0.8296067848882035,
"grad_norm": 2.7438676357269287,
"learning_rate": 7.403141361256545e-06,
"loss": 0.2596,
"step": 269
},
{
"epoch": 0.8326908249807248,
"grad_norm": 2.7950987815856934,
"learning_rate": 7.392670157068064e-06,
"loss": 0.2624,
"step": 270
},
{
"epoch": 0.8357748650732459,
"grad_norm": 3.497069835662842,
"learning_rate": 7.382198952879581e-06,
"loss": 0.2385,
"step": 271
},
{
"epoch": 0.8388589051657671,
"grad_norm": 5.024068832397461,
"learning_rate": 7.3717277486911e-06,
"loss": 0.2526,
"step": 272
},
{
"epoch": 0.8419429452582884,
"grad_norm": 3.5298011302948,
"learning_rate": 7.361256544502618e-06,
"loss": 0.2452,
"step": 273
},
{
"epoch": 0.8450269853508096,
"grad_norm": 2.701545238494873,
"learning_rate": 7.350785340314137e-06,
"loss": 0.2293,
"step": 274
},
{
"epoch": 0.8481110254433307,
"grad_norm": 2.838541030883789,
"learning_rate": 7.340314136125655e-06,
"loss": 0.2554,
"step": 275
},
{
"epoch": 0.851195065535852,
"grad_norm": 2.5854012966156006,
"learning_rate": 7.329842931937173e-06,
"loss": 0.245,
"step": 276
},
{
"epoch": 0.8542791056283732,
"grad_norm": 2.9351906776428223,
"learning_rate": 7.319371727748692e-06,
"loss": 0.2556,
"step": 277
},
{
"epoch": 0.8573631457208943,
"grad_norm": 3.0675830841064453,
"learning_rate": 7.30890052356021e-06,
"loss": 0.2501,
"step": 278
},
{
"epoch": 0.8604471858134156,
"grad_norm": 3.1958088874816895,
"learning_rate": 7.2984293193717285e-06,
"loss": 0.2347,
"step": 279
},
{
"epoch": 0.8635312259059368,
"grad_norm": 3.0006463527679443,
"learning_rate": 7.287958115183246e-06,
"loss": 0.242,
"step": 280
},
{
"epoch": 0.866615265998458,
"grad_norm": 2.862990379333496,
"learning_rate": 7.277486910994765e-06,
"loss": 0.2442,
"step": 281
},
{
"epoch": 0.8696993060909792,
"grad_norm": 3.1585986614227295,
"learning_rate": 7.267015706806283e-06,
"loss": 0.2401,
"step": 282
},
{
"epoch": 0.8727833461835004,
"grad_norm": 2.6111812591552734,
"learning_rate": 7.256544502617802e-06,
"loss": 0.2324,
"step": 283
},
{
"epoch": 0.8758673862760216,
"grad_norm": 3.1289191246032715,
"learning_rate": 7.246073298429321e-06,
"loss": 0.2426,
"step": 284
},
{
"epoch": 0.8789514263685428,
"grad_norm": 3.448789358139038,
"learning_rate": 7.235602094240838e-06,
"loss": 0.2224,
"step": 285
},
{
"epoch": 0.882035466461064,
"grad_norm": 3.018432855606079,
"learning_rate": 7.2251308900523565e-06,
"loss": 0.2238,
"step": 286
},
{
"epoch": 0.8851195065535852,
"grad_norm": 4.171509742736816,
"learning_rate": 7.214659685863875e-06,
"loss": 0.2546,
"step": 287
},
{
"epoch": 0.8882035466461063,
"grad_norm": 3.5390446186065674,
"learning_rate": 7.204188481675394e-06,
"loss": 0.2417,
"step": 288
},
{
"epoch": 0.8912875867386276,
"grad_norm": 2.8169162273406982,
"learning_rate": 7.193717277486911e-06,
"loss": 0.2348,
"step": 289
},
{
"epoch": 0.8943716268311488,
"grad_norm": 2.9175827503204346,
"learning_rate": 7.18324607329843e-06,
"loss": 0.214,
"step": 290
},
{
"epoch": 0.89745566692367,
"grad_norm": 3.939680576324463,
"learning_rate": 7.172774869109949e-06,
"loss": 0.2489,
"step": 291
},
{
"epoch": 0.9005397070161912,
"grad_norm": 2.874373435974121,
"learning_rate": 7.162303664921466e-06,
"loss": 0.2219,
"step": 292
},
{
"epoch": 0.9036237471087124,
"grad_norm": 4.381021976470947,
"learning_rate": 7.1518324607329845e-06,
"loss": 0.2419,
"step": 293
},
{
"epoch": 0.9067077872012336,
"grad_norm": 3.9895918369293213,
"learning_rate": 7.141361256544503e-06,
"loss": 0.2552,
"step": 294
},
{
"epoch": 0.9097918272937549,
"grad_norm": 2.9028842449188232,
"learning_rate": 7.130890052356022e-06,
"loss": 0.2323,
"step": 295
},
{
"epoch": 0.912875867386276,
"grad_norm": 3.5980117321014404,
"learning_rate": 7.12041884816754e-06,
"loss": 0.2404,
"step": 296
},
{
"epoch": 0.9159599074787972,
"grad_norm": 3.490727186203003,
"learning_rate": 7.109947643979058e-06,
"loss": 0.22,
"step": 297
},
{
"epoch": 0.9190439475713185,
"grad_norm": 3.256279706954956,
"learning_rate": 7.099476439790577e-06,
"loss": 0.2368,
"step": 298
},
{
"epoch": 0.9221279876638396,
"grad_norm": 3.92038893699646,
"learning_rate": 7.089005235602095e-06,
"loss": 0.2331,
"step": 299
},
{
"epoch": 0.9252120277563608,
"grad_norm": 3.6917364597320557,
"learning_rate": 7.078534031413613e-06,
"loss": 0.2139,
"step": 300
},
{
"epoch": 0.9282960678488821,
"grad_norm": 3.058729887008667,
"learning_rate": 7.068062827225132e-06,
"loss": 0.2199,
"step": 301
},
{
"epoch": 0.9313801079414032,
"grad_norm": 3.150188446044922,
"learning_rate": 7.057591623036649e-06,
"loss": 0.2137,
"step": 302
},
{
"epoch": 0.9344641480339244,
"grad_norm": 5.77610445022583,
"learning_rate": 7.047120418848168e-06,
"loss": 0.2478,
"step": 303
},
{
"epoch": 0.9375481881264457,
"grad_norm": 2.8851089477539062,
"learning_rate": 7.036649214659687e-06,
"loss": 0.227,
"step": 304
},
{
"epoch": 0.9406322282189669,
"grad_norm": 3.1656086444854736,
"learning_rate": 7.0261780104712055e-06,
"loss": 0.2335,
"step": 305
},
{
"epoch": 0.943716268311488,
"grad_norm": 3.3355696201324463,
"learning_rate": 7.015706806282723e-06,
"loss": 0.2169,
"step": 306
},
{
"epoch": 0.9468003084040093,
"grad_norm": 3.5095317363739014,
"learning_rate": 7.005235602094241e-06,
"loss": 0.2161,
"step": 307
},
{
"epoch": 0.9498843484965305,
"grad_norm": 3.5365262031555176,
"learning_rate": 6.99476439790576e-06,
"loss": 0.2097,
"step": 308
},
{
"epoch": 0.9529683885890516,
"grad_norm": 4.159248352050781,
"learning_rate": 6.984293193717277e-06,
"loss": 0.2337,
"step": 309
},
{
"epoch": 0.9560524286815728,
"grad_norm": 2.9792213439941406,
"learning_rate": 6.973821989528796e-06,
"loss": 0.2149,
"step": 310
},
{
"epoch": 0.9591364687740941,
"grad_norm": 3.2603046894073486,
"learning_rate": 6.963350785340315e-06,
"loss": 0.2218,
"step": 311
},
{
"epoch": 0.9622205088666153,
"grad_norm": 3.5064327716827393,
"learning_rate": 6.9528795811518335e-06,
"loss": 0.2128,
"step": 312
},
{
"epoch": 0.9653045489591364,
"grad_norm": 3.971139430999756,
"learning_rate": 6.942408376963351e-06,
"loss": 0.2172,
"step": 313
},
{
"epoch": 0.9683885890516577,
"grad_norm": 3.651603937149048,
"learning_rate": 6.931937172774869e-06,
"loss": 0.2036,
"step": 314
},
{
"epoch": 0.9714726291441789,
"grad_norm": 5.394900321960449,
"learning_rate": 6.921465968586388e-06,
"loss": 0.2157,
"step": 315
},
{
"epoch": 0.9745566692367,
"grad_norm": 3.7696452140808105,
"learning_rate": 6.910994764397906e-06,
"loss": 0.2168,
"step": 316
},
{
"epoch": 0.9776407093292213,
"grad_norm": 3.3137505054473877,
"learning_rate": 6.900523560209425e-06,
"loss": 0.2217,
"step": 317
},
{
"epoch": 0.9807247494217425,
"grad_norm": 3.927021026611328,
"learning_rate": 6.890052356020943e-06,
"loss": 0.2149,
"step": 318
},
{
"epoch": 0.9838087895142636,
"grad_norm": 3.598501443862915,
"learning_rate": 6.8795811518324615e-06,
"loss": 0.2007,
"step": 319
},
{
"epoch": 0.9868928296067849,
"grad_norm": 4.063229084014893,
"learning_rate": 6.8691099476439794e-06,
"loss": 0.2142,
"step": 320
},
{
"epoch": 0.9868928296067849,
"eval_loss": 0.46243318915367126,
"eval_runtime": 150.4594,
"eval_samples_per_second": 1.914,
"eval_steps_per_second": 0.957,
"step": 320
},
{
"epoch": 0.9899768696993061,
"grad_norm": 4.520982265472412,
"learning_rate": 6.858638743455498e-06,
"loss": 0.1978,
"step": 321
},
{
"epoch": 0.9930609097918273,
"grad_norm": 3.6312687397003174,
"learning_rate": 6.848167539267017e-06,
"loss": 0.1896,
"step": 322
},
{
"epoch": 0.9961449498843485,
"grad_norm": 3.1252243518829346,
"learning_rate": 6.837696335078534e-06,
"loss": 0.1817,
"step": 323
},
{
"epoch": 0.9992289899768697,
"grad_norm": 4.3829264640808105,
"learning_rate": 6.827225130890053e-06,
"loss": 0.2199,
"step": 324
},
{
"epoch": 1.0030840400925212,
"grad_norm": 9.755841255187988,
"learning_rate": 6.816753926701572e-06,
"loss": 0.4578,
"step": 325
},
{
"epoch": 1.0061680801850423,
"grad_norm": 3.9052581787109375,
"learning_rate": 6.80628272251309e-06,
"loss": 0.1959,
"step": 326
},
{
"epoch": 1.0092521202775635,
"grad_norm": 3.6258931159973145,
"learning_rate": 6.7958115183246075e-06,
"loss": 0.2062,
"step": 327
},
{
"epoch": 1.012336160370085,
"grad_norm": 4.131122589111328,
"learning_rate": 6.785340314136126e-06,
"loss": 0.1915,
"step": 328
},
{
"epoch": 1.015420200462606,
"grad_norm": 4.387429237365723,
"learning_rate": 6.774869109947645e-06,
"loss": 0.1792,
"step": 329
},
{
"epoch": 1.0185042405551272,
"grad_norm": 3.873361110687256,
"learning_rate": 6.764397905759162e-06,
"loss": 0.1895,
"step": 330
},
{
"epoch": 1.0215882806476484,
"grad_norm": 4.318599700927734,
"learning_rate": 6.753926701570681e-06,
"loss": 0.1836,
"step": 331
},
{
"epoch": 1.0246723207401696,
"grad_norm": 4.9434494972229,
"learning_rate": 6.7434554973822e-06,
"loss": 0.2199,
"step": 332
},
{
"epoch": 1.0277563608326907,
"grad_norm": 3.8584797382354736,
"learning_rate": 6.732984293193718e-06,
"loss": 0.1796,
"step": 333
},
{
"epoch": 1.0308404009252121,
"grad_norm": 4.104945659637451,
"learning_rate": 6.722513089005236e-06,
"loss": 0.1812,
"step": 334
},
{
"epoch": 1.0339244410177333,
"grad_norm": 4.125020503997803,
"learning_rate": 6.712041884816754e-06,
"loss": 0.197,
"step": 335
},
{
"epoch": 1.0370084811102545,
"grad_norm": 3.783364772796631,
"learning_rate": 6.701570680628273e-06,
"loss": 0.1798,
"step": 336
},
{
"epoch": 1.0400925212027756,
"grad_norm": 4.799828052520752,
"learning_rate": 6.691099476439791e-06,
"loss": 0.1837,
"step": 337
},
{
"epoch": 1.0431765612952968,
"grad_norm": 5.570056438446045,
"learning_rate": 6.68062827225131e-06,
"loss": 0.1987,
"step": 338
},
{
"epoch": 1.046260601387818,
"grad_norm": 3.9299843311309814,
"learning_rate": 6.670157068062828e-06,
"loss": 0.1728,
"step": 339
},
{
"epoch": 1.0493446414803393,
"grad_norm": 4.746124267578125,
"learning_rate": 6.6596858638743455e-06,
"loss": 0.2055,
"step": 340
},
{
"epoch": 1.0524286815728605,
"grad_norm": 3.6969268321990967,
"learning_rate": 6.649214659685864e-06,
"loss": 0.1919,
"step": 341
},
{
"epoch": 1.0555127216653817,
"grad_norm": 4.096460819244385,
"learning_rate": 6.638743455497383e-06,
"loss": 0.1725,
"step": 342
},
{
"epoch": 1.0585967617579028,
"grad_norm": 3.819343328475952,
"learning_rate": 6.628272251308902e-06,
"loss": 0.1727,
"step": 343
},
{
"epoch": 1.061680801850424,
"grad_norm": 4.487940788269043,
"learning_rate": 6.617801047120419e-06,
"loss": 0.176,
"step": 344
},
{
"epoch": 1.0647648419429452,
"grad_norm": 4.727810382843018,
"learning_rate": 6.607329842931938e-06,
"loss": 0.1694,
"step": 345
},
{
"epoch": 1.0678488820354666,
"grad_norm": 5.403895854949951,
"learning_rate": 6.5968586387434565e-06,
"loss": 0.1853,
"step": 346
},
{
"epoch": 1.0709329221279877,
"grad_norm": 3.548576831817627,
"learning_rate": 6.5863874345549736e-06,
"loss": 0.1711,
"step": 347
},
{
"epoch": 1.074016962220509,
"grad_norm": 3.6849658489227295,
"learning_rate": 6.575916230366492e-06,
"loss": 0.1877,
"step": 348
},
{
"epoch": 1.07710100231303,
"grad_norm": 3.7493557929992676,
"learning_rate": 6.565445026178011e-06,
"loss": 0.1858,
"step": 349
},
{
"epoch": 1.0801850424055512,
"grad_norm": 3.9486773014068604,
"learning_rate": 6.55497382198953e-06,
"loss": 0.1515,
"step": 350
},
{
"epoch": 1.0832690824980724,
"grad_norm": 4.970436096191406,
"learning_rate": 6.544502617801047e-06,
"loss": 0.172,
"step": 351
},
{
"epoch": 1.0863531225905936,
"grad_norm": 5.032225131988525,
"learning_rate": 6.534031413612566e-06,
"loss": 0.1611,
"step": 352
}
],
"logging_steps": 1,
"max_steps": 975,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 16,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.458954269238886e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}