{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.511530398322851,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033542976939203356,
"grad_norm": 0.9615421891212463,
"learning_rate": 1e-05,
"loss": 1.818,
"step": 1
},
{
"epoch": 0.06708595387840671,
"grad_norm": 1.061348557472229,
"learning_rate": 2e-05,
"loss": 1.9118,
"step": 2
},
{
"epoch": 0.10062893081761007,
"grad_norm": 0.906833827495575,
"learning_rate": 3e-05,
"loss": 1.7764,
"step": 3
},
{
"epoch": 0.13417190775681342,
"grad_norm": 0.8332676887512207,
"learning_rate": 4e-05,
"loss": 1.8419,
"step": 4
},
{
"epoch": 0.16771488469601678,
"grad_norm": 0.6788995265960693,
"learning_rate": 5e-05,
"loss": 1.8896,
"step": 5
},
{
"epoch": 0.20125786163522014,
"grad_norm": 0.5330966711044312,
"learning_rate": 4.888888888888889e-05,
"loss": 1.7301,
"step": 6
},
{
"epoch": 0.2348008385744235,
"grad_norm": 0.4760504364967346,
"learning_rate": 4.7777777777777784e-05,
"loss": 1.7193,
"step": 7
},
{
"epoch": 0.26834381551362685,
"grad_norm": 0.3972032070159912,
"learning_rate": 4.666666666666667e-05,
"loss": 1.6832,
"step": 8
},
{
"epoch": 0.3018867924528302,
"grad_norm": 0.3579612672328949,
"learning_rate": 4.555555555555556e-05,
"loss": 1.608,
"step": 9
},
{
"epoch": 0.33542976939203356,
"grad_norm": 0.3818889856338501,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.8124,
"step": 10
},
{
"epoch": 0.3689727463312369,
"grad_norm": 0.3447263538837433,
"learning_rate": 4.3333333333333334e-05,
"loss": 1.7331,
"step": 11
},
{
"epoch": 0.4025157232704403,
"grad_norm": 0.323868989944458,
"learning_rate": 4.222222222222222e-05,
"loss": 1.7071,
"step": 12
},
{
"epoch": 0.4360587002096436,
"grad_norm": 0.3421488106250763,
"learning_rate": 4.111111111111111e-05,
"loss": 1.6768,
"step": 13
},
{
"epoch": 0.469601677148847,
"grad_norm": 0.3541533052921295,
"learning_rate": 4e-05,
"loss": 1.7081,
"step": 14
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.33424726128578186,
"learning_rate": 3.888888888888889e-05,
"loss": 1.536,
"step": 15
},
{
"epoch": 0.5366876310272537,
"grad_norm": 0.36894017457962036,
"learning_rate": 3.777777777777778e-05,
"loss": 1.5753,
"step": 16
},
{
"epoch": 0.570230607966457,
"grad_norm": 0.3404862880706787,
"learning_rate": 3.6666666666666666e-05,
"loss": 1.6014,
"step": 17
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.3427893817424774,
"learning_rate": 3.555555555555556e-05,
"loss": 1.6454,
"step": 18
},
{
"epoch": 0.6373165618448637,
"grad_norm": 0.3238353431224823,
"learning_rate": 3.444444444444445e-05,
"loss": 1.5531,
"step": 19
},
{
"epoch": 0.6708595387840671,
"grad_norm": 0.318460077047348,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.6036,
"step": 20
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.3185439109802246,
"learning_rate": 3.222222222222223e-05,
"loss": 1.5285,
"step": 21
},
{
"epoch": 0.7379454926624738,
"grad_norm": 0.3249723017215729,
"learning_rate": 3.111111111111111e-05,
"loss": 1.5633,
"step": 22
},
{
"epoch": 0.7714884696016772,
"grad_norm": 0.3257281184196472,
"learning_rate": 3e-05,
"loss": 1.5892,
"step": 23
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.35130995512008667,
"learning_rate": 2.8888888888888888e-05,
"loss": 1.6089,
"step": 24
},
{
"epoch": 0.8385744234800838,
"grad_norm": 0.32942768931388855,
"learning_rate": 2.777777777777778e-05,
"loss": 1.5397,
"step": 25
},
{
"epoch": 0.8721174004192872,
"grad_norm": 0.3389425575733185,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.5448,
"step": 26
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.322301983833313,
"learning_rate": 2.5555555555555554e-05,
"loss": 1.506,
"step": 27
},
{
"epoch": 0.939203354297694,
"grad_norm": 0.3191693127155304,
"learning_rate": 2.4444444444444445e-05,
"loss": 1.5321,
"step": 28
},
{
"epoch": 0.9727463312368972,
"grad_norm": 0.330905020236969,
"learning_rate": 2.3333333333333336e-05,
"loss": 1.734,
"step": 29
},
{
"epoch": 1.0335429769392033,
"grad_norm": 0.6754521131515503,
"learning_rate": 2.2222222222222223e-05,
"loss": 3.2346,
"step": 30
},
{
"epoch": 1.0670859538784068,
"grad_norm": 0.31602275371551514,
"learning_rate": 2.111111111111111e-05,
"loss": 1.5563,
"step": 31
},
{
"epoch": 1.10062893081761,
"grad_norm": 0.3237570822238922,
"learning_rate": 2e-05,
"loss": 1.5454,
"step": 32
},
{
"epoch": 1.1341719077568135,
"grad_norm": 0.30346807837486267,
"learning_rate": 1.888888888888889e-05,
"loss": 1.5016,
"step": 33
},
{
"epoch": 1.1677148846960168,
"grad_norm": 0.3014126121997833,
"learning_rate": 1.777777777777778e-05,
"loss": 1.5729,
"step": 34
},
{
"epoch": 1.20125786163522,
"grad_norm": 0.307090699672699,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.6538,
"step": 35
},
{
"epoch": 1.2348008385744236,
"grad_norm": 0.29638656973838806,
"learning_rate": 1.5555555555555555e-05,
"loss": 1.5825,
"step": 36
},
{
"epoch": 1.2683438155136268,
"grad_norm": 0.3165188431739807,
"learning_rate": 1.4444444444444444e-05,
"loss": 1.736,
"step": 37
},
{
"epoch": 1.3018867924528301,
"grad_norm": 0.2909906804561615,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.5676,
"step": 38
},
{
"epoch": 1.3354297693920336,
"grad_norm": 0.2958202064037323,
"learning_rate": 1.2222222222222222e-05,
"loss": 1.5789,
"step": 39
},
{
"epoch": 1.368972746331237,
"grad_norm": 0.3097296953201294,
"learning_rate": 1.1111111111111112e-05,
"loss": 1.6394,
"step": 40
},
{
"epoch": 1.4025157232704402,
"grad_norm": 0.31482434272766113,
"learning_rate": 1e-05,
"loss": 1.6685,
"step": 41
},
{
"epoch": 1.4360587002096437,
"grad_norm": 0.2970486283302307,
"learning_rate": 8.88888888888889e-06,
"loss": 1.5871,
"step": 42
},
{
"epoch": 1.469601677148847,
"grad_norm": 0.29967784881591797,
"learning_rate": 7.777777777777777e-06,
"loss": 1.508,
"step": 43
},
{
"epoch": 1.5031446540880502,
"grad_norm": 0.288617342710495,
"learning_rate": 6.666666666666667e-06,
"loss": 1.5807,
"step": 44
},
{
"epoch": 1.5366876310272537,
"grad_norm": 0.298141747713089,
"learning_rate": 5.555555555555556e-06,
"loss": 1.6094,
"step": 45
},
{
"epoch": 1.570230607966457,
"grad_norm": 0.300231009721756,
"learning_rate": 4.444444444444445e-06,
"loss": 1.6047,
"step": 46
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.3105227053165436,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.6895,
"step": 47
},
{
"epoch": 1.6373165618448637,
"grad_norm": 0.29686439037323,
"learning_rate": 2.2222222222222225e-06,
"loss": 1.568,
"step": 48
},
{
"epoch": 1.6708595387840672,
"grad_norm": 0.29341980814933777,
"learning_rate": 1.1111111111111112e-06,
"loss": 1.4565,
"step": 49
},
{
"epoch": 1.7044025157232703,
"grad_norm": 0.29961156845092773,
"learning_rate": 0.0,
"loss": 1.5627,
"step": 50
},
{
"epoch": 0.4276729559748428,
"grad_norm": 0.43133091926574707,
"learning_rate": 4.220338983050848e-05,
"loss": 1.6502,
"step": 51
},
{
"epoch": 0.4360587002096436,
"grad_norm": 0.409037321805954,
"learning_rate": 4.2033898305084746e-05,
"loss": 1.7093,
"step": 52
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.3889688551425934,
"learning_rate": 4.186440677966102e-05,
"loss": 1.6019,
"step": 53
},
{
"epoch": 0.4528301886792453,
"grad_norm": 0.4061110019683838,
"learning_rate": 4.1694915254237285e-05,
"loss": 1.7135,
"step": 54
},
{
"epoch": 0.4612159329140461,
"grad_norm": 0.4317370057106018,
"learning_rate": 4.152542372881356e-05,
"loss": 1.6998,
"step": 55
},
{
"epoch": 0.469601677148847,
"grad_norm": 0.40719956159591675,
"learning_rate": 4.135593220338983e-05,
"loss": 1.6249,
"step": 56
},
{
"epoch": 0.4779874213836478,
"grad_norm": 0.3879191279411316,
"learning_rate": 4.1186440677966105e-05,
"loss": 1.5039,
"step": 57
},
{
"epoch": 0.4863731656184486,
"grad_norm": 0.4131089448928833,
"learning_rate": 4.101694915254237e-05,
"loss": 1.5849,
"step": 58
},
{
"epoch": 0.4947589098532495,
"grad_norm": 0.3905002772808075,
"learning_rate": 4.0847457627118644e-05,
"loss": 1.4827,
"step": 59
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.4169052839279175,
"learning_rate": 4.067796610169492e-05,
"loss": 1.3878,
"step": 60
},
{
"epoch": 0.5115303983228512,
"grad_norm": 0.42767494916915894,
"learning_rate": 4.050847457627119e-05,
"loss": 1.5039,
"step": 61
},
{
"epoch": 0.519916142557652,
"grad_norm": 0.42942067980766296,
"learning_rate": 4.0338983050847464e-05,
"loss": 1.5792,
"step": 62
},
{
"epoch": 0.5283018867924528,
"grad_norm": 0.45012345910072327,
"learning_rate": 4.016949152542373e-05,
"loss": 1.5006,
"step": 63
},
{
"epoch": 0.5366876310272537,
"grad_norm": 0.43249914050102234,
"learning_rate": 4e-05,
"loss": 1.572,
"step": 64
},
{
"epoch": 0.5450733752620545,
"grad_norm": 0.43645647168159485,
"learning_rate": 3.983050847457627e-05,
"loss": 1.7374,
"step": 65
},
{
"epoch": 0.5534591194968553,
"grad_norm": 0.41012486815452576,
"learning_rate": 3.966101694915255e-05,
"loss": 1.4895,
"step": 66
},
{
"epoch": 0.5618448637316562,
"grad_norm": 0.4467809796333313,
"learning_rate": 3.9491525423728816e-05,
"loss": 1.7326,
"step": 67
},
{
"epoch": 0.570230607966457,
"grad_norm": 0.4244528114795685,
"learning_rate": 3.932203389830509e-05,
"loss": 1.4259,
"step": 68
},
{
"epoch": 0.5786163522012578,
"grad_norm": 0.449232280254364,
"learning_rate": 3.9152542372881355e-05,
"loss": 1.6682,
"step": 69
},
{
"epoch": 0.5870020964360587,
"grad_norm": 0.4241749942302704,
"learning_rate": 3.898305084745763e-05,
"loss": 1.6646,
"step": 70
},
{
"epoch": 0.5953878406708596,
"grad_norm": 0.4776236116886139,
"learning_rate": 3.88135593220339e-05,
"loss": 1.6555,
"step": 71
},
{
"epoch": 0.6037735849056604,
"grad_norm": 0.4678778052330017,
"learning_rate": 3.8644067796610175e-05,
"loss": 1.4964,
"step": 72
},
{
"epoch": 0.6121593291404612,
"grad_norm": 0.4315565526485443,
"learning_rate": 3.847457627118644e-05,
"loss": 1.5025,
"step": 73
},
{
"epoch": 0.6205450733752621,
"grad_norm": 0.3997185528278351,
"learning_rate": 3.8305084745762714e-05,
"loss": 1.497,
"step": 74
},
{
"epoch": 0.6289308176100629,
"grad_norm": 0.42872926592826843,
"learning_rate": 3.813559322033898e-05,
"loss": 1.4873,
"step": 75
},
{
"epoch": 0.6373165618448637,
"grad_norm": 0.45695438981056213,
"learning_rate": 3.7966101694915254e-05,
"loss": 1.606,
"step": 76
},
{
"epoch": 0.6457023060796646,
"grad_norm": 0.4163571894168854,
"learning_rate": 3.779661016949153e-05,
"loss": 1.4837,
"step": 77
},
{
"epoch": 0.6540880503144654,
"grad_norm": 0.45837995409965515,
"learning_rate": 3.76271186440678e-05,
"loss": 1.5733,
"step": 78
},
{
"epoch": 0.6624737945492662,
"grad_norm": 0.4821924865245819,
"learning_rate": 3.745762711864407e-05,
"loss": 1.742,
"step": 79
},
{
"epoch": 0.6708595387840671,
"grad_norm": 0.4537578225135803,
"learning_rate": 3.728813559322034e-05,
"loss": 1.5152,
"step": 80
},
{
"epoch": 0.6792452830188679,
"grad_norm": 0.42695993185043335,
"learning_rate": 3.711864406779661e-05,
"loss": 1.5123,
"step": 81
},
{
"epoch": 0.6876310272536688,
"grad_norm": 0.436599463224411,
"learning_rate": 3.6949152542372886e-05,
"loss": 1.4442,
"step": 82
},
{
"epoch": 0.6960167714884696,
"grad_norm": 0.44244834780693054,
"learning_rate": 3.677966101694915e-05,
"loss": 1.3896,
"step": 83
},
{
"epoch": 0.7044025157232704,
"grad_norm": 0.4942834675312042,
"learning_rate": 3.6610169491525426e-05,
"loss": 1.5767,
"step": 84
},
{
"epoch": 0.7127882599580713,
"grad_norm": 0.4958462119102478,
"learning_rate": 3.644067796610169e-05,
"loss": 1.745,
"step": 85
},
{
"epoch": 0.7211740041928721,
"grad_norm": 0.4499577581882477,
"learning_rate": 3.6271186440677965e-05,
"loss": 1.4771,
"step": 86
},
{
"epoch": 0.7295597484276729,
"grad_norm": 0.486020565032959,
"learning_rate": 3.610169491525424e-05,
"loss": 1.5388,
"step": 87
},
{
"epoch": 0.7379454926624738,
"grad_norm": 0.44480133056640625,
"learning_rate": 3.593220338983051e-05,
"loss": 1.377,
"step": 88
},
{
"epoch": 0.7463312368972747,
"grad_norm": 0.45817309617996216,
"learning_rate": 3.5762711864406785e-05,
"loss": 1.4927,
"step": 89
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.5093894004821777,
"learning_rate": 3.559322033898305e-05,
"loss": 1.5837,
"step": 90
},
{
"epoch": 0.7631027253668763,
"grad_norm": 0.4713049829006195,
"learning_rate": 3.5423728813559324e-05,
"loss": 1.6117,
"step": 91
},
{
"epoch": 0.7714884696016772,
"grad_norm": 0.45069509744644165,
"learning_rate": 3.52542372881356e-05,
"loss": 1.4815,
"step": 92
},
{
"epoch": 0.779874213836478,
"grad_norm": 0.5270215272903442,
"learning_rate": 3.508474576271187e-05,
"loss": 1.5292,
"step": 93
},
{
"epoch": 0.7882599580712788,
"grad_norm": 0.5345816016197205,
"learning_rate": 3.491525423728814e-05,
"loss": 1.7355,
"step": 94
},
{
"epoch": 0.7966457023060797,
"grad_norm": 0.43932732939720154,
"learning_rate": 3.474576271186441e-05,
"loss": 1.3706,
"step": 95
},
{
"epoch": 0.8050314465408805,
"grad_norm": 0.5658639073371887,
"learning_rate": 3.4576271186440676e-05,
"loss": 1.6624,
"step": 96
},
{
"epoch": 0.8134171907756813,
"grad_norm": 0.4958181083202362,
"learning_rate": 3.440677966101695e-05,
"loss": 1.446,
"step": 97
},
{
"epoch": 0.8218029350104822,
"grad_norm": 0.45654749870300293,
"learning_rate": 3.423728813559322e-05,
"loss": 1.3261,
"step": 98
},
{
"epoch": 0.8301886792452831,
"grad_norm": 0.5374109745025635,
"learning_rate": 3.4067796610169496e-05,
"loss": 1.6,
"step": 99
},
{
"epoch": 0.8385744234800838,
"grad_norm": 0.5511431097984314,
"learning_rate": 3.389830508474576e-05,
"loss": 1.623,
"step": 100
},
{
"epoch": 0.8469601677148847,
"grad_norm": 0.5506657361984253,
"learning_rate": 3.3728813559322035e-05,
"loss": 1.3841,
"step": 101
},
{
"epoch": 0.8553459119496856,
"grad_norm": 0.5375157594680786,
"learning_rate": 3.35593220338983e-05,
"loss": 1.5645,
"step": 102
},
{
"epoch": 0.8637316561844863,
"grad_norm": 0.4963093101978302,
"learning_rate": 3.338983050847458e-05,
"loss": 1.4993,
"step": 103
},
{
"epoch": 0.8721174004192872,
"grad_norm": 0.5070456862449646,
"learning_rate": 3.322033898305085e-05,
"loss": 1.4661,
"step": 104
},
{
"epoch": 0.8805031446540881,
"grad_norm": 0.48827776312828064,
"learning_rate": 3.305084745762712e-05,
"loss": 1.5453,
"step": 105
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.5393761396408081,
"learning_rate": 3.288135593220339e-05,
"loss": 1.4993,
"step": 106
},
{
"epoch": 0.8972746331236897,
"grad_norm": 0.47431623935699463,
"learning_rate": 3.271186440677966e-05,
"loss": 1.3194,
"step": 107
},
{
"epoch": 0.9056603773584906,
"grad_norm": 0.5005940794944763,
"learning_rate": 3.2542372881355934e-05,
"loss": 1.4166,
"step": 108
},
{
"epoch": 0.9140461215932913,
"grad_norm": 0.5325838327407837,
"learning_rate": 3.237288135593221e-05,
"loss": 1.3792,
"step": 109
},
{
"epoch": 0.9224318658280922,
"grad_norm": 0.48578980565071106,
"learning_rate": 3.2203389830508473e-05,
"loss": 1.3679,
"step": 110
},
{
"epoch": 0.9308176100628931,
"grad_norm": 0.5063319206237793,
"learning_rate": 3.203389830508475e-05,
"loss": 1.507,
"step": 111
},
{
"epoch": 0.939203354297694,
"grad_norm": 0.5529047250747681,
"learning_rate": 3.186440677966101e-05,
"loss": 1.6027,
"step": 112
},
{
"epoch": 0.9475890985324947,
"grad_norm": 0.5580345392227173,
"learning_rate": 3.169491525423729e-05,
"loss": 1.5906,
"step": 113
},
{
"epoch": 0.9559748427672956,
"grad_norm": 0.5370936393737793,
"learning_rate": 3.1525423728813566e-05,
"loss": 1.8187,
"step": 114
},
{
"epoch": 0.9643605870020965,
"grad_norm": 0.5383415222167969,
"learning_rate": 3.135593220338983e-05,
"loss": 1.688,
"step": 115
},
{
"epoch": 0.9727463312368972,
"grad_norm": 0.550933837890625,
"learning_rate": 3.1186440677966106e-05,
"loss": 1.5902,
"step": 116
},
{
"epoch": 0.9811320754716981,
"grad_norm": 0.5485110878944397,
"learning_rate": 3.101694915254237e-05,
"loss": 1.5818,
"step": 117
},
{
"epoch": 0.989517819706499,
"grad_norm": 0.6686434149742126,
"learning_rate": 3.0847457627118645e-05,
"loss": 1.5586,
"step": 118
},
{
"epoch": 0.9979035639412998,
"grad_norm": 0.5468031167984009,
"learning_rate": 3.067796610169492e-05,
"loss": 1.7003,
"step": 119
},
{
"epoch": 1.0083857442348008,
"grad_norm": 1.3953214883804321,
"learning_rate": 3.050847457627119e-05,
"loss": 3.0796,
"step": 120
},
{
"epoch": 1.0167714884696017,
"grad_norm": 0.5408557057380676,
"learning_rate": 3.0338983050847458e-05,
"loss": 1.4953,
"step": 121
},
{
"epoch": 1.0251572327044025,
"grad_norm": 0.5604081749916077,
"learning_rate": 3.016949152542373e-05,
"loss": 1.391,
"step": 122
},
{
"epoch": 1.0335429769392033,
"grad_norm": 0.5473874807357788,
"learning_rate": 3e-05,
"loss": 1.5113,
"step": 123
},
{
"epoch": 1.0419287211740043,
"grad_norm": 0.5697469115257263,
"learning_rate": 2.9830508474576274e-05,
"loss": 1.3537,
"step": 124
},
{
"epoch": 1.050314465408805,
"grad_norm": 0.6473388075828552,
"learning_rate": 2.9661016949152544e-05,
"loss": 1.4589,
"step": 125
},
{
"epoch": 1.0587002096436058,
"grad_norm": 0.5580431222915649,
"learning_rate": 2.9491525423728817e-05,
"loss": 1.5658,
"step": 126
},
{
"epoch": 1.0670859538784068,
"grad_norm": 0.5432992577552795,
"learning_rate": 2.9322033898305083e-05,
"loss": 1.4302,
"step": 127
},
{
"epoch": 1.0754716981132075,
"grad_norm": 0.5975386500358582,
"learning_rate": 2.915254237288136e-05,
"loss": 1.5508,
"step": 128
},
{
"epoch": 1.0838574423480083,
"grad_norm": 0.565565288066864,
"learning_rate": 2.8983050847457626e-05,
"loss": 1.4234,
"step": 129
},
{
"epoch": 1.0922431865828093,
"grad_norm": 0.6301350593566895,
"learning_rate": 2.88135593220339e-05,
"loss": 1.5267,
"step": 130
},
{
"epoch": 1.10062893081761,
"grad_norm": 0.5254076719284058,
"learning_rate": 2.864406779661017e-05,
"loss": 1.3074,
"step": 131
},
{
"epoch": 1.1090146750524108,
"grad_norm": 0.6312873959541321,
"learning_rate": 2.8474576271186442e-05,
"loss": 1.4878,
"step": 132
},
{
"epoch": 1.1174004192872118,
"grad_norm": 0.4790211319923401,
"learning_rate": 2.8305084745762712e-05,
"loss": 1.3158,
"step": 133
},
{
"epoch": 1.1257861635220126,
"grad_norm": 0.5577117800712585,
"learning_rate": 2.8135593220338985e-05,
"loss": 1.4097,
"step": 134
},
{
"epoch": 1.1341719077568135,
"grad_norm": 0.5645062327384949,
"learning_rate": 2.7966101694915255e-05,
"loss": 1.4276,
"step": 135
},
{
"epoch": 1.1425576519916143,
"grad_norm": 0.5814913511276245,
"learning_rate": 2.7796610169491528e-05,
"loss": 1.401,
"step": 136
},
{
"epoch": 1.150943396226415,
"grad_norm": 0.5780409574508667,
"learning_rate": 2.7627118644067794e-05,
"loss": 1.4739,
"step": 137
},
{
"epoch": 1.159329140461216,
"grad_norm": 0.6315497756004333,
"learning_rate": 2.7457627118644068e-05,
"loss": 1.5386,
"step": 138
},
{
"epoch": 1.1677148846960168,
"grad_norm": 0.5676960945129395,
"learning_rate": 2.7288135593220337e-05,
"loss": 1.4961,
"step": 139
},
{
"epoch": 1.1761006289308176,
"grad_norm": 0.5943715572357178,
"learning_rate": 2.711864406779661e-05,
"loss": 1.6106,
"step": 140
},
{
"epoch": 1.1844863731656186,
"grad_norm": 0.5691059231758118,
"learning_rate": 2.6949152542372884e-05,
"loss": 1.5345,
"step": 141
},
{
"epoch": 1.1928721174004193,
"grad_norm": 0.6429669260978699,
"learning_rate": 2.6779661016949153e-05,
"loss": 1.6742,
"step": 142
},
{
"epoch": 1.20125786163522,
"grad_norm": 0.5904098749160767,
"learning_rate": 2.6610169491525427e-05,
"loss": 1.458,
"step": 143
},
{
"epoch": 1.209643605870021,
"grad_norm": 0.5914203524589539,
"learning_rate": 2.6440677966101696e-05,
"loss": 1.5086,
"step": 144
},
{
"epoch": 1.2180293501048218,
"grad_norm": 0.6000847816467285,
"learning_rate": 2.627118644067797e-05,
"loss": 1.4316,
"step": 145
},
{
"epoch": 1.2264150943396226,
"grad_norm": 0.6070534586906433,
"learning_rate": 2.610169491525424e-05,
"loss": 1.4388,
"step": 146
},
{
"epoch": 1.2348008385744236,
"grad_norm": 0.5641275644302368,
"learning_rate": 2.5932203389830512e-05,
"loss": 1.5318,
"step": 147
},
{
"epoch": 1.2431865828092243,
"grad_norm": 0.5671488642692566,
"learning_rate": 2.576271186440678e-05,
"loss": 1.6092,
"step": 148
},
{
"epoch": 1.251572327044025,
"grad_norm": 0.6899793744087219,
"learning_rate": 2.5593220338983055e-05,
"loss": 1.536,
"step": 149
},
{
"epoch": 1.259958071278826,
"grad_norm": 0.6142588257789612,
"learning_rate": 2.5423728813559322e-05,
"loss": 1.6281,
"step": 150
},
{
"epoch": 1.2683438155136268,
"grad_norm": 0.6308810114860535,
"learning_rate": 2.5254237288135595e-05,
"loss": 1.6989,
"step": 151
},
{
"epoch": 1.2767295597484276,
"grad_norm": 0.699433445930481,
"learning_rate": 2.5084745762711865e-05,
"loss": 1.4067,
"step": 152
},
{
"epoch": 1.2851153039832286,
"grad_norm": 0.6100484132766724,
"learning_rate": 2.4915254237288138e-05,
"loss": 1.569,
"step": 153
},
{
"epoch": 1.2935010482180294,
"grad_norm": 0.5674847364425659,
"learning_rate": 2.4745762711864408e-05,
"loss": 1.4536,
"step": 154
},
{
"epoch": 1.3018867924528301,
"grad_norm": 0.6240501999855042,
"learning_rate": 2.457627118644068e-05,
"loss": 1.3428,
"step": 155
},
{
"epoch": 1.310272536687631,
"grad_norm": 0.6679978370666504,
"learning_rate": 2.440677966101695e-05,
"loss": 1.5999,
"step": 156
},
{
"epoch": 1.3186582809224319,
"grad_norm": 0.5994001626968384,
"learning_rate": 2.4237288135593224e-05,
"loss": 1.5542,
"step": 157
},
{
"epoch": 1.3270440251572326,
"grad_norm": 0.6358633041381836,
"learning_rate": 2.4067796610169493e-05,
"loss": 1.3593,
"step": 158
},
{
"epoch": 1.3354297693920336,
"grad_norm": 0.5659995079040527,
"learning_rate": 2.3898305084745763e-05,
"loss": 1.3259,
"step": 159
},
{
"epoch": 1.3438155136268344,
"grad_norm": 0.7298100590705872,
"learning_rate": 2.3728813559322036e-05,
"loss": 1.5724,
"step": 160
},
{
"epoch": 1.3522012578616351,
"grad_norm": 0.6506521701812744,
"learning_rate": 2.3559322033898306e-05,
"loss": 1.5445,
"step": 161
},
{
"epoch": 1.3605870020964361,
"grad_norm": 0.6763033866882324,
"learning_rate": 2.338983050847458e-05,
"loss": 1.5003,
"step": 162
},
{
"epoch": 1.368972746331237,
"grad_norm": 0.5723408460617065,
"learning_rate": 2.322033898305085e-05,
"loss": 1.5313,
"step": 163
},
{
"epoch": 1.3773584905660377,
"grad_norm": 0.6918197870254517,
"learning_rate": 2.305084745762712e-05,
"loss": 1.5711,
"step": 164
},
{
"epoch": 1.3857442348008386,
"grad_norm": 0.6125330924987793,
"learning_rate": 2.2881355932203392e-05,
"loss": 1.395,
"step": 165
},
{
"epoch": 1.3941299790356394,
"grad_norm": 0.6379712820053101,
"learning_rate": 2.271186440677966e-05,
"loss": 1.684,
"step": 166
},
{
"epoch": 1.4025157232704402,
"grad_norm": 0.6271690726280212,
"learning_rate": 2.2542372881355935e-05,
"loss": 1.4623,
"step": 167
},
{
"epoch": 1.4109014675052411,
"grad_norm": 0.6018547415733337,
"learning_rate": 2.2372881355932205e-05,
"loss": 1.3288,
"step": 168
},
{
"epoch": 1.419287211740042,
"grad_norm": 0.6406589150428772,
"learning_rate": 2.2203389830508474e-05,
"loss": 1.4531,
"step": 169
},
{
"epoch": 1.4276729559748427,
"grad_norm": 0.6161438822746277,
"learning_rate": 2.2033898305084748e-05,
"loss": 1.5999,
"step": 170
},
{
"epoch": 1.4360587002096437,
"grad_norm": 0.614861249923706,
"learning_rate": 2.1864406779661017e-05,
"loss": 1.3684,
"step": 171
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.6136622428894043,
"learning_rate": 2.1694915254237287e-05,
"loss": 1.399,
"step": 172
},
{
"epoch": 1.4528301886792452,
"grad_norm": 0.5674051642417908,
"learning_rate": 2.152542372881356e-05,
"loss": 1.274,
"step": 173
},
{
"epoch": 1.4612159329140462,
"grad_norm": 0.6396893858909607,
"learning_rate": 2.135593220338983e-05,
"loss": 1.5185,
"step": 174
},
{
"epoch": 1.469601677148847,
"grad_norm": 0.6016610264778137,
"learning_rate": 2.1186440677966103e-05,
"loss": 1.2965,
"step": 175
},
{
"epoch": 1.4779874213836477,
"grad_norm": 0.6875283122062683,
"learning_rate": 2.1016949152542373e-05,
"loss": 1.6257,
"step": 176
},
{
"epoch": 1.4863731656184487,
"grad_norm": 0.5814647078514099,
"learning_rate": 2.0847457627118643e-05,
"loss": 1.3111,
"step": 177
},
{
"epoch": 1.4947589098532494,
"grad_norm": 0.6307722926139832,
"learning_rate": 2.0677966101694916e-05,
"loss": 1.547,
"step": 178
},
{
"epoch": 1.5031446540880502,
"grad_norm": 0.588858962059021,
"learning_rate": 2.0508474576271186e-05,
"loss": 1.3264,
"step": 179
},
{
"epoch": 1.5115303983228512,
"grad_norm": 0.669362485408783,
"learning_rate": 2.033898305084746e-05,
"loss": 1.5909,
"step": 180
},
{
"epoch": 1.519916142557652,
"grad_norm": 0.7193084359169006,
"learning_rate": 2.0169491525423732e-05,
"loss": 1.5327,
"step": 181
},
{
"epoch": 1.5283018867924527,
"grad_norm": 0.635857105255127,
"learning_rate": 2e-05,
"loss": 1.4276,
"step": 182
},
{
"epoch": 1.5366876310272537,
"grad_norm": 0.636381208896637,
"learning_rate": 1.9830508474576275e-05,
"loss": 1.3813,
"step": 183
},
{
"epoch": 1.5450733752620545,
"grad_norm": 0.6892669796943665,
"learning_rate": 1.9661016949152545e-05,
"loss": 1.3461,
"step": 184
},
{
"epoch": 1.5534591194968552,
"grad_norm": 0.671186089515686,
"learning_rate": 1.9491525423728814e-05,
"loss": 1.3873,
"step": 185
},
{
"epoch": 1.5618448637316562,
"grad_norm": 0.5775100588798523,
"learning_rate": 1.9322033898305087e-05,
"loss": 1.396,
"step": 186
},
{
"epoch": 1.570230607966457,
"grad_norm": 0.634170651435852,
"learning_rate": 1.9152542372881357e-05,
"loss": 1.6692,
"step": 187
},
{
"epoch": 1.5786163522012577,
"grad_norm": 0.6621935963630676,
"learning_rate": 1.8983050847457627e-05,
"loss": 1.5905,
"step": 188
},
{
"epoch": 1.5870020964360587,
"grad_norm": 0.6979579329490662,
"learning_rate": 1.88135593220339e-05,
"loss": 1.5344,
"step": 189
},
{
"epoch": 1.5953878406708597,
"grad_norm": 0.6624859571456909,
"learning_rate": 1.864406779661017e-05,
"loss": 1.3544,
"step": 190
},
{
"epoch": 1.6037735849056602,
"grad_norm": 0.6619541645050049,
"learning_rate": 1.8474576271186443e-05,
"loss": 1.6527,
"step": 191
},
{
"epoch": 1.6121593291404612,
"grad_norm": 0.646507978439331,
"learning_rate": 1.8305084745762713e-05,
"loss": 1.4865,
"step": 192
},
{
"epoch": 1.6205450733752622,
"grad_norm": 0.6617197394371033,
"learning_rate": 1.8135593220338983e-05,
"loss": 1.5115,
"step": 193
},
{
"epoch": 1.6289308176100628,
"grad_norm": 0.5884259939193726,
"learning_rate": 1.7966101694915256e-05,
"loss": 1.2601,
"step": 194
},
{
"epoch": 1.6373165618448637,
"grad_norm": 0.6010658144950867,
"learning_rate": 1.7796610169491526e-05,
"loss": 1.408,
"step": 195
},
{
"epoch": 1.6457023060796647,
"grad_norm": 0.7407470941543579,
"learning_rate": 1.76271186440678e-05,
"loss": 1.346,
"step": 196
},
{
"epoch": 1.6540880503144653,
"grad_norm": 0.7493016719818115,
"learning_rate": 1.745762711864407e-05,
"loss": 1.421,
"step": 197
},
{
"epoch": 1.6624737945492662,
"grad_norm": 0.5945444107055664,
"learning_rate": 1.7288135593220338e-05,
"loss": 1.3774,
"step": 198
},
{
"epoch": 1.6708595387840672,
"grad_norm": 0.5583181977272034,
"learning_rate": 1.711864406779661e-05,
"loss": 1.1518,
"step": 199
},
{
"epoch": 1.6792452830188678,
"grad_norm": 0.6571647524833679,
"learning_rate": 1.694915254237288e-05,
"loss": 1.401,
"step": 200
},
{
"epoch": 1.6876310272536688,
"grad_norm": 0.6961767673492432,
"learning_rate": 1.677966101694915e-05,
"loss": 1.4618,
"step": 201
},
{
"epoch": 1.6960167714884697,
"grad_norm": 0.6763336062431335,
"learning_rate": 1.6610169491525424e-05,
"loss": 1.3297,
"step": 202
},
{
"epoch": 1.7044025157232703,
"grad_norm": 0.7434819340705872,
"learning_rate": 1.6440677966101694e-05,
"loss": 1.3833,
"step": 203
},
{
"epoch": 1.7127882599580713,
"grad_norm": 0.6780304908752441,
"learning_rate": 1.6271186440677967e-05,
"loss": 1.39,
"step": 204
},
{
"epoch": 1.7211740041928723,
"grad_norm": 0.6340621113777161,
"learning_rate": 1.6101694915254237e-05,
"loss": 1.2273,
"step": 205
},
{
"epoch": 1.7295597484276728,
"grad_norm": 0.6686990261077881,
"learning_rate": 1.5932203389830507e-05,
"loss": 1.4499,
"step": 206
},
{
"epoch": 1.7379454926624738,
"grad_norm": 0.7210912108421326,
"learning_rate": 1.5762711864406783e-05,
"loss": 1.4879,
"step": 207
},
{
"epoch": 1.7463312368972748,
"grad_norm": 0.7638130784034729,
"learning_rate": 1.5593220338983053e-05,
"loss": 1.5784,
"step": 208
},
{
"epoch": 1.7547169811320755,
"grad_norm": 0.7345211505889893,
"learning_rate": 1.5423728813559323e-05,
"loss": 1.3925,
"step": 209
},
{
"epoch": 1.7631027253668763,
"grad_norm": 0.5969035625457764,
"learning_rate": 1.5254237288135596e-05,
"loss": 1.2387,
"step": 210
},
{
"epoch": 1.7714884696016773,
"grad_norm": 0.6565172076225281,
"learning_rate": 1.5084745762711865e-05,
"loss": 1.2855,
"step": 211
},
{
"epoch": 1.779874213836478,
"grad_norm": 0.6907662153244019,
"learning_rate": 1.4915254237288137e-05,
"loss": 1.3413,
"step": 212
},
{
"epoch": 1.7882599580712788,
"grad_norm": 0.6184176206588745,
"learning_rate": 1.4745762711864408e-05,
"loss": 1.445,
"step": 213
},
{
"epoch": 1.7966457023060798,
"grad_norm": 0.6009007096290588,
"learning_rate": 1.457627118644068e-05,
"loss": 1.1403,
"step": 214
},
{
"epoch": 1.8050314465408805,
"grad_norm": 0.7282977104187012,
"learning_rate": 1.440677966101695e-05,
"loss": 1.4733,
"step": 215
},
{
"epoch": 1.8134171907756813,
"grad_norm": 0.6807677745819092,
"learning_rate": 1.4237288135593221e-05,
"loss": 1.6621,
"step": 216
},
{
"epoch": 1.8218029350104823,
"grad_norm": 0.5497955083847046,
"learning_rate": 1.4067796610169493e-05,
"loss": 1.2456,
"step": 217
},
{
"epoch": 1.830188679245283,
"grad_norm": 0.65602046251297,
"learning_rate": 1.3898305084745764e-05,
"loss": 1.3571,
"step": 218
},
{
"epoch": 1.8385744234800838,
"grad_norm": 0.8637228012084961,
"learning_rate": 1.3728813559322034e-05,
"loss": 1.5819,
"step": 219
},
{
"epoch": 1.8469601677148848,
"grad_norm": 0.671103298664093,
"learning_rate": 1.3559322033898305e-05,
"loss": 1.4916,
"step": 220
},
{
"epoch": 1.8553459119496856,
"grad_norm": 0.652407705783844,
"learning_rate": 1.3389830508474577e-05,
"loss": 1.4693,
"step": 221
},
{
"epoch": 1.8637316561844863,
"grad_norm": 0.6491547226905823,
"learning_rate": 1.3220338983050848e-05,
"loss": 1.4074,
"step": 222
},
{
"epoch": 1.8721174004192873,
"grad_norm": 0.6175271272659302,
"learning_rate": 1.305084745762712e-05,
"loss": 1.3958,
"step": 223
},
{
"epoch": 1.880503144654088,
"grad_norm": 0.6546741127967834,
"learning_rate": 1.288135593220339e-05,
"loss": 1.2658,
"step": 224
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.7430203557014465,
"learning_rate": 1.2711864406779661e-05,
"loss": 1.5967,
"step": 225
},
{
"epoch": 1.8972746331236898,
"grad_norm": 0.7903656959533691,
"learning_rate": 1.2542372881355932e-05,
"loss": 1.3528,
"step": 226
},
{
"epoch": 1.9056603773584906,
"grad_norm": 0.712054967880249,
"learning_rate": 1.2372881355932204e-05,
"loss": 1.4618,
"step": 227
},
{
"epoch": 1.9140461215932913,
"grad_norm": 0.6519030332565308,
"learning_rate": 1.2203389830508475e-05,
"loss": 1.3854,
"step": 228
},
{
"epoch": 1.9224318658280923,
"grad_norm": 0.6560716032981873,
"learning_rate": 1.2033898305084747e-05,
"loss": 1.4835,
"step": 229
},
{
"epoch": 1.930817610062893,
"grad_norm": 0.61641526222229,
"learning_rate": 1.1864406779661018e-05,
"loss": 1.4169,
"step": 230
},
{
"epoch": 1.9392033542976939,
"grad_norm": 0.8207079172134399,
"learning_rate": 1.169491525423729e-05,
"loss": 1.5823,
"step": 231
},
{
"epoch": 1.9475890985324948,
"grad_norm": 0.6479889154434204,
"learning_rate": 1.152542372881356e-05,
"loss": 1.4131,
"step": 232
},
{
"epoch": 1.9559748427672956,
"grad_norm": 0.746671199798584,
"learning_rate": 1.135593220338983e-05,
"loss": 1.391,
"step": 233
},
{
"epoch": 1.9643605870020964,
"grad_norm": 0.7543257474899292,
"learning_rate": 1.1186440677966102e-05,
"loss": 1.5147,
"step": 234
},
{
"epoch": 1.9727463312368974,
"grad_norm": 0.6632611751556396,
"learning_rate": 1.1016949152542374e-05,
"loss": 1.5113,
"step": 235
},
{
"epoch": 1.9811320754716981,
"grad_norm": 0.6857608556747437,
"learning_rate": 1.0847457627118644e-05,
"loss": 1.3092,
"step": 236
},
{
"epoch": 1.9895178197064989,
"grad_norm": 0.6538596153259277,
"learning_rate": 1.0677966101694915e-05,
"loss": 1.2827,
"step": 237
},
{
"epoch": 1.9979035639412999,
"grad_norm": 0.6871718764305115,
"learning_rate": 1.0508474576271186e-05,
"loss": 1.4524,
"step": 238
},
{
"epoch": 2.0,
"grad_norm": 1.580600380897522,
"learning_rate": 1.0338983050847458e-05,
"loss": 1.3756,
"step": 239
},
{
"epoch": 2.008385744234801,
"grad_norm": 0.8496657609939575,
"learning_rate": 1.016949152542373e-05,
"loss": 1.5361,
"step": 240
},
{
"epoch": 2.0167714884696015,
"grad_norm": 0.6252016425132751,
"learning_rate": 1e-05,
"loss": 1.2379,
"step": 241
},
{
"epoch": 2.0251572327044025,
"grad_norm": 0.5890762209892273,
"learning_rate": 9.830508474576272e-06,
"loss": 1.1365,
"step": 242
},
{
"epoch": 2.0335429769392035,
"grad_norm": 0.6594178080558777,
"learning_rate": 9.661016949152544e-06,
"loss": 1.4264,
"step": 243
},
{
"epoch": 2.041928721174004,
"grad_norm": 0.6303755640983582,
"learning_rate": 9.491525423728814e-06,
"loss": 1.5162,
"step": 244
},
{
"epoch": 2.050314465408805,
"grad_norm": 0.7321446537971497,
"learning_rate": 9.322033898305085e-06,
"loss": 1.5888,
"step": 245
},
{
"epoch": 2.058700209643606,
"grad_norm": 0.6928514838218689,
"learning_rate": 9.152542372881356e-06,
"loss": 1.5963,
"step": 246
},
{
"epoch": 2.0670859538784065,
"grad_norm": 0.7305393815040588,
"learning_rate": 8.983050847457628e-06,
"loss": 1.4996,
"step": 247
},
{
"epoch": 2.0754716981132075,
"grad_norm": 0.7094164490699768,
"learning_rate": 8.8135593220339e-06,
"loss": 1.3651,
"step": 248
},
{
"epoch": 2.0838574423480085,
"grad_norm": 0.6932939291000366,
"learning_rate": 8.644067796610169e-06,
"loss": 1.4837,
"step": 249
},
{
"epoch": 2.092243186582809,
"grad_norm": 0.846845269203186,
"learning_rate": 8.47457627118644e-06,
"loss": 1.5293,
"step": 250
},
{
"epoch": 2.10062893081761,
"grad_norm": 0.7128404974937439,
"learning_rate": 8.305084745762712e-06,
"loss": 1.3928,
"step": 251
},
{
"epoch": 2.109014675052411,
"grad_norm": 0.7099897861480713,
"learning_rate": 8.135593220338983e-06,
"loss": 1.396,
"step": 252
},
{
"epoch": 2.1174004192872116,
"grad_norm": 0.6866568922996521,
"learning_rate": 7.966101694915253e-06,
"loss": 1.5034,
"step": 253
},
{
"epoch": 2.1257861635220126,
"grad_norm": 0.583806574344635,
"learning_rate": 7.796610169491526e-06,
"loss": 1.2417,
"step": 254
},
{
"epoch": 2.1341719077568135,
"grad_norm": 0.6535069942474365,
"learning_rate": 7.627118644067798e-06,
"loss": 1.2934,
"step": 255
},
{
"epoch": 2.142557651991614,
"grad_norm": 0.6619601845741272,
"learning_rate": 7.4576271186440685e-06,
"loss": 1.2856,
"step": 256
},
{
"epoch": 2.150943396226415,
"grad_norm": 0.7087454795837402,
"learning_rate": 7.28813559322034e-06,
"loss": 1.3244,
"step": 257
},
{
"epoch": 2.159329140461216,
"grad_norm": 0.7019234895706177,
"learning_rate": 7.1186440677966106e-06,
"loss": 1.3269,
"step": 258
},
{
"epoch": 2.1677148846960166,
"grad_norm": 0.6695578694343567,
"learning_rate": 6.949152542372882e-06,
"loss": 1.2839,
"step": 259
},
{
"epoch": 2.1761006289308176,
"grad_norm": 0.6900045275688171,
"learning_rate": 6.779661016949153e-06,
"loss": 1.5439,
"step": 260
},
{
"epoch": 2.1844863731656186,
"grad_norm": 0.7736982107162476,
"learning_rate": 6.610169491525424e-06,
"loss": 1.5258,
"step": 261
},
{
"epoch": 2.192872117400419,
"grad_norm": 0.5855519771575928,
"learning_rate": 6.440677966101695e-06,
"loss": 1.1754,
"step": 262
},
{
"epoch": 2.20125786163522,
"grad_norm": 0.6449745893478394,
"learning_rate": 6.271186440677966e-06,
"loss": 1.4888,
"step": 263
},
{
"epoch": 2.209643605870021,
"grad_norm": 0.7780332565307617,
"learning_rate": 6.101694915254238e-06,
"loss": 1.5469,
"step": 264
},
{
"epoch": 2.2180293501048216,
"grad_norm": 0.6325747966766357,
"learning_rate": 5.932203389830509e-06,
"loss": 1.3144,
"step": 265
},
{
"epoch": 2.2264150943396226,
"grad_norm": 0.6543543338775635,
"learning_rate": 5.76271186440678e-06,
"loss": 1.1714,
"step": 266
},
{
"epoch": 2.2348008385744236,
"grad_norm": 0.8630987405776978,
"learning_rate": 5.593220338983051e-06,
"loss": 1.4851,
"step": 267
},
{
"epoch": 2.243186582809224,
"grad_norm": 0.7857372164726257,
"learning_rate": 5.423728813559322e-06,
"loss": 1.3268,
"step": 268
},
{
"epoch": 2.251572327044025,
"grad_norm": 0.7938205599784851,
"learning_rate": 5.254237288135593e-06,
"loss": 1.5033,
"step": 269
},
{
"epoch": 2.259958071278826,
"grad_norm": 0.6283496022224426,
"learning_rate": 5.084745762711865e-06,
"loss": 1.2449,
"step": 270
},
{
"epoch": 2.268343815513627,
"grad_norm": 0.7021183967590332,
"learning_rate": 4.915254237288136e-06,
"loss": 1.3424,
"step": 271
},
{
"epoch": 2.2767295597484276,
"grad_norm": 0.730631411075592,
"learning_rate": 4.745762711864407e-06,
"loss": 1.3327,
"step": 272
},
{
"epoch": 2.2851153039832286,
"grad_norm": 0.6509723663330078,
"learning_rate": 4.576271186440678e-06,
"loss": 1.1817,
"step": 273
},
{
"epoch": 2.2935010482180296,
"grad_norm": 0.6313263177871704,
"learning_rate": 4.40677966101695e-06,
"loss": 1.3395,
"step": 274
},
{
"epoch": 2.30188679245283,
"grad_norm": 0.7210220694541931,
"learning_rate": 4.23728813559322e-06,
"loss": 1.5118,
"step": 275
},
{
"epoch": 2.310272536687631,
"grad_norm": 0.698341429233551,
"learning_rate": 4.067796610169492e-06,
"loss": 1.4284,
"step": 276
},
{
"epoch": 2.318658280922432,
"grad_norm": 0.6756731271743774,
"learning_rate": 3.898305084745763e-06,
"loss": 1.5066,
"step": 277
},
{
"epoch": 2.3270440251572326,
"grad_norm": 0.6834630370140076,
"learning_rate": 3.7288135593220342e-06,
"loss": 1.476,
"step": 278
},
{
"epoch": 2.3354297693920336,
"grad_norm": 0.715414822101593,
"learning_rate": 3.5593220338983053e-06,
"loss": 1.4063,
"step": 279
},
{
"epoch": 2.3438155136268346,
"grad_norm": 0.6956151723861694,
"learning_rate": 3.3898305084745763e-06,
"loss": 1.412,
"step": 280
},
{
"epoch": 2.352201257861635,
"grad_norm": 0.6213716268539429,
"learning_rate": 3.2203389830508473e-06,
"loss": 1.2284,
"step": 281
},
{
"epoch": 2.360587002096436,
"grad_norm": 0.7275508642196655,
"learning_rate": 3.050847457627119e-06,
"loss": 1.4907,
"step": 282
},
{
"epoch": 2.368972746331237,
"grad_norm": 0.672480046749115,
"learning_rate": 2.88135593220339e-06,
"loss": 1.3039,
"step": 283
},
{
"epoch": 2.3773584905660377,
"grad_norm": 0.7390619516372681,
"learning_rate": 2.711864406779661e-06,
"loss": 1.452,
"step": 284
},
{
"epoch": 2.3857442348008386,
"grad_norm": 0.6363676190376282,
"learning_rate": 2.5423728813559323e-06,
"loss": 1.264,
"step": 285
},
{
"epoch": 2.3941299790356396,
"grad_norm": 0.7060114145278931,
"learning_rate": 2.3728813559322034e-06,
"loss": 1.438,
"step": 286
},
{
"epoch": 2.40251572327044,
"grad_norm": 0.7109473347663879,
"learning_rate": 2.203389830508475e-06,
"loss": 1.4074,
"step": 287
},
{
"epoch": 2.410901467505241,
"grad_norm": 0.7845531105995178,
"learning_rate": 2.033898305084746e-06,
"loss": 1.513,
"step": 288
},
{
"epoch": 2.419287211740042,
"grad_norm": 0.7582221627235413,
"learning_rate": 1.8644067796610171e-06,
"loss": 1.475,
"step": 289
},
{
"epoch": 2.4276729559748427,
"grad_norm": 0.7518870234489441,
"learning_rate": 1.6949152542372882e-06,
"loss": 1.5362,
"step": 290
},
{
"epoch": 2.4360587002096437,
"grad_norm": 0.7295182347297668,
"learning_rate": 1.5254237288135594e-06,
"loss": 1.3987,
"step": 291
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.7670787572860718,
"learning_rate": 1.3559322033898304e-06,
"loss": 1.4496,
"step": 292
},
{
"epoch": 2.452830188679245,
"grad_norm": 0.7006129026412964,
"learning_rate": 1.1864406779661017e-06,
"loss": 1.366,
"step": 293
},
{
"epoch": 2.461215932914046,
"grad_norm": 0.6317689418792725,
"learning_rate": 1.016949152542373e-06,
"loss": 1.3638,
"step": 294
},
{
"epoch": 2.469601677148847,
"grad_norm": 0.6305463910102844,
"learning_rate": 8.474576271186441e-07,
"loss": 1.166,
"step": 295
},
{
"epoch": 2.4779874213836477,
"grad_norm": 0.7784201502799988,
"learning_rate": 6.779661016949152e-07,
"loss": 1.4706,
"step": 296
},
{
"epoch": 2.4863731656184487,
"grad_norm": 0.7264308333396912,
"learning_rate": 5.084745762711865e-07,
"loss": 1.4284,
"step": 297
},
{
"epoch": 2.4947589098532497,
"grad_norm": 0.7306190133094788,
"learning_rate": 3.389830508474576e-07,
"loss": 1.4214,
"step": 298
},
{
"epoch": 2.50314465408805,
"grad_norm": 0.743761420249939,
"learning_rate": 1.694915254237288e-07,
"loss": 1.5255,
"step": 299
},
{
"epoch": 2.511530398322851,
"grad_norm": 0.699112594127655,
"learning_rate": 0.0,
"loss": 1.4776,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 300,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.114333412565627e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}