{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.989993328885924,
"eval_steps": 500,
"global_step": 935,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00533689126084056,
"grad_norm": 6.148000037568765,
"learning_rate": 4.2553191489361704e-07,
"loss": 0.9543,
"step": 1
},
{
"epoch": 0.01067378252168112,
"grad_norm": 5.76207120136237,
"learning_rate": 8.510638297872341e-07,
"loss": 0.9275,
"step": 2
},
{
"epoch": 0.016010673782521682,
"grad_norm": 5.846133487180721,
"learning_rate": 1.276595744680851e-06,
"loss": 0.9352,
"step": 3
},
{
"epoch": 0.02134756504336224,
"grad_norm": 5.948943250861422,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.9911,
"step": 4
},
{
"epoch": 0.0266844563042028,
"grad_norm": 5.471501096137681,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.9284,
"step": 5
},
{
"epoch": 0.032021347565043365,
"grad_norm": 4.598154721919055,
"learning_rate": 2.553191489361702e-06,
"loss": 0.9151,
"step": 6
},
{
"epoch": 0.037358238825883926,
"grad_norm": 4.191582543667081,
"learning_rate": 2.978723404255319e-06,
"loss": 0.9174,
"step": 7
},
{
"epoch": 0.04269513008672448,
"grad_norm": 2.2852207165276175,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.8733,
"step": 8
},
{
"epoch": 0.04803202134756504,
"grad_norm": 1.9926907110993184,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.8863,
"step": 9
},
{
"epoch": 0.0533689126084056,
"grad_norm": 1.6897648811838724,
"learning_rate": 4.255319148936171e-06,
"loss": 0.8197,
"step": 10
},
{
"epoch": 0.05870580386924616,
"grad_norm": 4.197834380211571,
"learning_rate": 4.680851063829788e-06,
"loss": 0.8614,
"step": 11
},
{
"epoch": 0.06404269513008673,
"grad_norm": 4.349335585259934,
"learning_rate": 5.106382978723404e-06,
"loss": 0.8723,
"step": 12
},
{
"epoch": 0.06937958639092728,
"grad_norm": 3.969299235666275,
"learning_rate": 5.531914893617022e-06,
"loss": 0.8121,
"step": 13
},
{
"epoch": 0.07471647765176785,
"grad_norm": 3.1468331398249854,
"learning_rate": 5.957446808510638e-06,
"loss": 0.8111,
"step": 14
},
{
"epoch": 0.0800533689126084,
"grad_norm": 3.060340690213249,
"learning_rate": 6.382978723404256e-06,
"loss": 0.7968,
"step": 15
},
{
"epoch": 0.08539026017344896,
"grad_norm": 2.5011639930656364,
"learning_rate": 6.808510638297873e-06,
"loss": 0.7474,
"step": 16
},
{
"epoch": 0.09072715143428953,
"grad_norm": 2.0423245778644086,
"learning_rate": 7.234042553191491e-06,
"loss": 0.7869,
"step": 17
},
{
"epoch": 0.09606404269513008,
"grad_norm": 1.7376707160976537,
"learning_rate": 7.659574468085107e-06,
"loss": 0.7671,
"step": 18
},
{
"epoch": 0.10140093395597065,
"grad_norm": 1.615732209627739,
"learning_rate": 8.085106382978723e-06,
"loss": 0.7463,
"step": 19
},
{
"epoch": 0.1067378252168112,
"grad_norm": 1.728049657874694,
"learning_rate": 8.510638297872341e-06,
"loss": 0.7339,
"step": 20
},
{
"epoch": 0.11207471647765177,
"grad_norm": 1.7850972209703333,
"learning_rate": 8.936170212765958e-06,
"loss": 0.7487,
"step": 21
},
{
"epoch": 0.11741160773849232,
"grad_norm": 1.427350518416662,
"learning_rate": 9.361702127659576e-06,
"loss": 0.7327,
"step": 22
},
{
"epoch": 0.12274849899933289,
"grad_norm": 1.259778470170131,
"learning_rate": 9.787234042553192e-06,
"loss": 0.7032,
"step": 23
},
{
"epoch": 0.12808539026017346,
"grad_norm": 1.2541341499268899,
"learning_rate": 1.0212765957446808e-05,
"loss": 0.7157,
"step": 24
},
{
"epoch": 0.133422281521014,
"grad_norm": 1.209399238284116,
"learning_rate": 1.0638297872340426e-05,
"loss": 0.696,
"step": 25
},
{
"epoch": 0.13875917278185457,
"grad_norm": 1.1148869413377325,
"learning_rate": 1.1063829787234044e-05,
"loss": 0.7342,
"step": 26
},
{
"epoch": 0.14409606404269512,
"grad_norm": 1.0267231678585527,
"learning_rate": 1.1489361702127662e-05,
"loss": 0.7014,
"step": 27
},
{
"epoch": 0.1494329553035357,
"grad_norm": 1.053000012577453,
"learning_rate": 1.1914893617021277e-05,
"loss": 0.7193,
"step": 28
},
{
"epoch": 0.15476984656437626,
"grad_norm": 0.9236120588909748,
"learning_rate": 1.2340425531914895e-05,
"loss": 0.6848,
"step": 29
},
{
"epoch": 0.1601067378252168,
"grad_norm": 0.707299824861324,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.6967,
"step": 30
},
{
"epoch": 0.16544362908605736,
"grad_norm": 0.7954547209115258,
"learning_rate": 1.3191489361702127e-05,
"loss": 0.7101,
"step": 31
},
{
"epoch": 0.17078052034689792,
"grad_norm": 0.7740855587703414,
"learning_rate": 1.3617021276595745e-05,
"loss": 0.6959,
"step": 32
},
{
"epoch": 0.1761174116077385,
"grad_norm": 0.6911657011135235,
"learning_rate": 1.4042553191489363e-05,
"loss": 0.6649,
"step": 33
},
{
"epoch": 0.18145430286857905,
"grad_norm": 0.6352745397687202,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.7012,
"step": 34
},
{
"epoch": 0.1867911941294196,
"grad_norm": 0.5651606409699009,
"learning_rate": 1.4893617021276596e-05,
"loss": 0.6294,
"step": 35
},
{
"epoch": 0.19212808539026016,
"grad_norm": 0.7512985605813014,
"learning_rate": 1.5319148936170214e-05,
"loss": 0.6641,
"step": 36
},
{
"epoch": 0.19746497665110074,
"grad_norm": 0.6367644839194788,
"learning_rate": 1.5744680851063832e-05,
"loss": 0.6414,
"step": 37
},
{
"epoch": 0.2028018679119413,
"grad_norm": 0.6727260374168498,
"learning_rate": 1.6170212765957446e-05,
"loss": 0.6606,
"step": 38
},
{
"epoch": 0.20813875917278185,
"grad_norm": 0.5863518339156979,
"learning_rate": 1.6595744680851064e-05,
"loss": 0.629,
"step": 39
},
{
"epoch": 0.2134756504336224,
"grad_norm": 0.5957413274178028,
"learning_rate": 1.7021276595744682e-05,
"loss": 0.6433,
"step": 40
},
{
"epoch": 0.218812541694463,
"grad_norm": 0.6599440739948166,
"learning_rate": 1.74468085106383e-05,
"loss": 0.6458,
"step": 41
},
{
"epoch": 0.22414943295530354,
"grad_norm": 0.6721754940213285,
"learning_rate": 1.7872340425531915e-05,
"loss": 0.6492,
"step": 42
},
{
"epoch": 0.2294863242161441,
"grad_norm": 0.7632199119447193,
"learning_rate": 1.8297872340425533e-05,
"loss": 0.6746,
"step": 43
},
{
"epoch": 0.23482321547698465,
"grad_norm": 0.5166294877933729,
"learning_rate": 1.872340425531915e-05,
"loss": 0.6295,
"step": 44
},
{
"epoch": 0.24016010673782523,
"grad_norm": 0.829534817664272,
"learning_rate": 1.914893617021277e-05,
"loss": 0.6375,
"step": 45
},
{
"epoch": 0.24549699799866578,
"grad_norm": 0.5857977032260275,
"learning_rate": 1.9574468085106384e-05,
"loss": 0.6572,
"step": 46
},
{
"epoch": 0.25083388925950634,
"grad_norm": 0.7823636354279289,
"learning_rate": 2e-05,
"loss": 0.6504,
"step": 47
},
{
"epoch": 0.2561707805203469,
"grad_norm": 0.7047081341160781,
"learning_rate": 2.0425531914893616e-05,
"loss": 0.6228,
"step": 48
},
{
"epoch": 0.26150767178118745,
"grad_norm": 0.6863987126187889,
"learning_rate": 2.0851063829787238e-05,
"loss": 0.6277,
"step": 49
},
{
"epoch": 0.266844563042028,
"grad_norm": 0.6190735802489663,
"learning_rate": 2.1276595744680852e-05,
"loss": 0.6425,
"step": 50
},
{
"epoch": 0.27218145430286855,
"grad_norm": 0.6101823544223539,
"learning_rate": 2.1702127659574467e-05,
"loss": 0.6437,
"step": 51
},
{
"epoch": 0.27751834556370913,
"grad_norm": 0.6644263069643335,
"learning_rate": 2.2127659574468088e-05,
"loss": 0.6387,
"step": 52
},
{
"epoch": 0.2828552368245497,
"grad_norm": 0.5986962358886303,
"learning_rate": 2.2553191489361703e-05,
"loss": 0.6385,
"step": 53
},
{
"epoch": 0.28819212808539024,
"grad_norm": 0.6094843736073496,
"learning_rate": 2.2978723404255324e-05,
"loss": 0.6117,
"step": 54
},
{
"epoch": 0.2935290193462308,
"grad_norm": 0.6020778721421863,
"learning_rate": 2.340425531914894e-05,
"loss": 0.6254,
"step": 55
},
{
"epoch": 0.2988659106070714,
"grad_norm": 0.811050821660795,
"learning_rate": 2.3829787234042553e-05,
"loss": 0.6128,
"step": 56
},
{
"epoch": 0.30420280186791193,
"grad_norm": 0.5754298827220308,
"learning_rate": 2.4255319148936175e-05,
"loss": 0.6311,
"step": 57
},
{
"epoch": 0.3095396931287525,
"grad_norm": 0.7520600773223384,
"learning_rate": 2.468085106382979e-05,
"loss": 0.6179,
"step": 58
},
{
"epoch": 0.31487658438959304,
"grad_norm": 0.6029568660086547,
"learning_rate": 2.5106382978723404e-05,
"loss": 0.6297,
"step": 59
},
{
"epoch": 0.3202134756504336,
"grad_norm": 0.5648234220070932,
"learning_rate": 2.5531914893617025e-05,
"loss": 0.6281,
"step": 60
},
{
"epoch": 0.3255503669112742,
"grad_norm": 0.6476246719033275,
"learning_rate": 2.595744680851064e-05,
"loss": 0.627,
"step": 61
},
{
"epoch": 0.33088725817211473,
"grad_norm": 0.7385881909771188,
"learning_rate": 2.6382978723404255e-05,
"loss": 0.6173,
"step": 62
},
{
"epoch": 0.3362241494329553,
"grad_norm": 0.7821703232556236,
"learning_rate": 2.6808510638297876e-05,
"loss": 0.6044,
"step": 63
},
{
"epoch": 0.34156104069379584,
"grad_norm": 0.6836861306090984,
"learning_rate": 2.723404255319149e-05,
"loss": 0.588,
"step": 64
},
{
"epoch": 0.3468979319546364,
"grad_norm": 0.6580791298040481,
"learning_rate": 2.7659574468085112e-05,
"loss": 0.6286,
"step": 65
},
{
"epoch": 0.352234823215477,
"grad_norm": 0.864506493478045,
"learning_rate": 2.8085106382978727e-05,
"loss": 0.5923,
"step": 66
},
{
"epoch": 0.3575717144763175,
"grad_norm": 0.7017167406064279,
"learning_rate": 2.851063829787234e-05,
"loss": 0.5999,
"step": 67
},
{
"epoch": 0.3629086057371581,
"grad_norm": 0.9052162942115397,
"learning_rate": 2.8936170212765963e-05,
"loss": 0.6348,
"step": 68
},
{
"epoch": 0.3682454969979987,
"grad_norm": 0.7977337352191972,
"learning_rate": 2.9361702127659577e-05,
"loss": 0.5823,
"step": 69
},
{
"epoch": 0.3735823882588392,
"grad_norm": 0.741861801885776,
"learning_rate": 2.9787234042553192e-05,
"loss": 0.6078,
"step": 70
},
{
"epoch": 0.3789192795196798,
"grad_norm": 0.8436055916627975,
"learning_rate": 3.0212765957446813e-05,
"loss": 0.65,
"step": 71
},
{
"epoch": 0.3842561707805203,
"grad_norm": 0.9156977886774698,
"learning_rate": 3.063829787234043e-05,
"loss": 0.6131,
"step": 72
},
{
"epoch": 0.3895930620413609,
"grad_norm": 0.7176731849912837,
"learning_rate": 3.1063829787234046e-05,
"loss": 0.6065,
"step": 73
},
{
"epoch": 0.3949299533022015,
"grad_norm": 1.0096379849584094,
"learning_rate": 3.1489361702127664e-05,
"loss": 0.6437,
"step": 74
},
{
"epoch": 0.400266844563042,
"grad_norm": 0.9015077136133018,
"learning_rate": 3.191489361702128e-05,
"loss": 0.5947,
"step": 75
},
{
"epoch": 0.4056037358238826,
"grad_norm": 0.9057483735249481,
"learning_rate": 3.234042553191489e-05,
"loss": 0.6039,
"step": 76
},
{
"epoch": 0.4109406270847232,
"grad_norm": 1.036920332839965,
"learning_rate": 3.276595744680851e-05,
"loss": 0.6058,
"step": 77
},
{
"epoch": 0.4162775183455637,
"grad_norm": 0.6532671217107379,
"learning_rate": 3.319148936170213e-05,
"loss": 0.6039,
"step": 78
},
{
"epoch": 0.4216144096064043,
"grad_norm": 0.861543032559001,
"learning_rate": 3.361702127659575e-05,
"loss": 0.6169,
"step": 79
},
{
"epoch": 0.4269513008672448,
"grad_norm": 0.9427328391918289,
"learning_rate": 3.4042553191489365e-05,
"loss": 0.5972,
"step": 80
},
{
"epoch": 0.4322881921280854,
"grad_norm": 0.8268349751981375,
"learning_rate": 3.446808510638298e-05,
"loss": 0.6028,
"step": 81
},
{
"epoch": 0.437625083388926,
"grad_norm": 0.8010280657011346,
"learning_rate": 3.48936170212766e-05,
"loss": 0.5937,
"step": 82
},
{
"epoch": 0.4429619746497665,
"grad_norm": 1.4163426245382202,
"learning_rate": 3.531914893617022e-05,
"loss": 0.5801,
"step": 83
},
{
"epoch": 0.4482988659106071,
"grad_norm": 0.8866001712478151,
"learning_rate": 3.574468085106383e-05,
"loss": 0.6121,
"step": 84
},
{
"epoch": 0.4536357571714476,
"grad_norm": 1.2990003965290253,
"learning_rate": 3.617021276595745e-05,
"loss": 0.6509,
"step": 85
},
{
"epoch": 0.4589726484322882,
"grad_norm": 0.9828125705811529,
"learning_rate": 3.6595744680851066e-05,
"loss": 0.6122,
"step": 86
},
{
"epoch": 0.46430953969312877,
"grad_norm": 1.1075341839055335,
"learning_rate": 3.7021276595744684e-05,
"loss": 0.6258,
"step": 87
},
{
"epoch": 0.4696464309539693,
"grad_norm": 1.0144524046084673,
"learning_rate": 3.74468085106383e-05,
"loss": 0.6086,
"step": 88
},
{
"epoch": 0.4749833222148099,
"grad_norm": 1.0772540630578744,
"learning_rate": 3.787234042553192e-05,
"loss": 0.5999,
"step": 89
},
{
"epoch": 0.48032021347565046,
"grad_norm": 0.881806469299722,
"learning_rate": 3.829787234042554e-05,
"loss": 0.6498,
"step": 90
},
{
"epoch": 0.485657104736491,
"grad_norm": 0.9769728043077871,
"learning_rate": 3.872340425531915e-05,
"loss": 0.632,
"step": 91
},
{
"epoch": 0.49099399599733157,
"grad_norm": 0.9506924647295948,
"learning_rate": 3.914893617021277e-05,
"loss": 0.5646,
"step": 92
},
{
"epoch": 0.4963308872581721,
"grad_norm": 0.697157023956671,
"learning_rate": 3.9574468085106385e-05,
"loss": 0.5814,
"step": 93
},
{
"epoch": 0.5016677785190127,
"grad_norm": 0.8702722934298305,
"learning_rate": 4e-05,
"loss": 0.6086,
"step": 94
},
{
"epoch": 0.5070046697798533,
"grad_norm": 0.8781146273756057,
"learning_rate": 3.9999860457259224e-05,
"loss": 0.579,
"step": 95
},
{
"epoch": 0.5123415610406938,
"grad_norm": 0.6961405489963192,
"learning_rate": 3.99994418309841e-05,
"loss": 0.6133,
"step": 96
},
{
"epoch": 0.5176784523015343,
"grad_norm": 0.7087239184079108,
"learning_rate": 3.9998744127016264e-05,
"loss": 0.5902,
"step": 97
},
{
"epoch": 0.5230153435623749,
"grad_norm": 0.5997862763213605,
"learning_rate": 3.999776735509166e-05,
"loss": 0.5875,
"step": 98
},
{
"epoch": 0.5283522348232155,
"grad_norm": 0.660612211013302,
"learning_rate": 3.999651152884044e-05,
"loss": 0.6003,
"step": 99
},
{
"epoch": 0.533689126084056,
"grad_norm": 0.6214742049455745,
"learning_rate": 3.999497666578674e-05,
"loss": 0.6351,
"step": 100
},
{
"epoch": 0.5390260173448966,
"grad_norm": 0.764941637291837,
"learning_rate": 3.999316278734846e-05,
"loss": 0.5883,
"step": 101
},
{
"epoch": 0.5443629086057371,
"grad_norm": 0.5428320010347989,
"learning_rate": 3.9991069918836966e-05,
"loss": 0.607,
"step": 102
},
{
"epoch": 0.5496997998665777,
"grad_norm": 0.6973175643455113,
"learning_rate": 3.998869808945671e-05,
"loss": 0.5755,
"step": 103
},
{
"epoch": 0.5550366911274183,
"grad_norm": 0.5367648197100708,
"learning_rate": 3.998604733230485e-05,
"loss": 0.577,
"step": 104
},
{
"epoch": 0.5603735823882589,
"grad_norm": 0.6880065518848788,
"learning_rate": 3.998311768437078e-05,
"loss": 0.606,
"step": 105
},
{
"epoch": 0.5657104736490994,
"grad_norm": 0.6838905198209162,
"learning_rate": 3.9979909186535606e-05,
"loss": 0.5624,
"step": 106
},
{
"epoch": 0.57104736490994,
"grad_norm": 0.6426687525240344,
"learning_rate": 3.9976421883571594e-05,
"loss": 0.6195,
"step": 107
},
{
"epoch": 0.5763842561707805,
"grad_norm": 0.7965064222410392,
"learning_rate": 3.9972655824141524e-05,
"loss": 0.6357,
"step": 108
},
{
"epoch": 0.5817211474316211,
"grad_norm": 0.5885934942607214,
"learning_rate": 3.996861106079801e-05,
"loss": 0.558,
"step": 109
},
{
"epoch": 0.5870580386924616,
"grad_norm": 0.7774147945831619,
"learning_rate": 3.9964287649982805e-05,
"loss": 0.5971,
"step": 110
},
{
"epoch": 0.5923949299533022,
"grad_norm": 0.5353107229236527,
"learning_rate": 3.9959685652025954e-05,
"loss": 0.5731,
"step": 111
},
{
"epoch": 0.5977318212141428,
"grad_norm": 0.8291069965223162,
"learning_rate": 3.995480513114501e-05,
"loss": 0.6127,
"step": 112
},
{
"epoch": 0.6030687124749833,
"grad_norm": 0.7325810415707809,
"learning_rate": 3.994964615544409e-05,
"loss": 0.6041,
"step": 113
},
{
"epoch": 0.6084056037358239,
"grad_norm": 0.6428699302157482,
"learning_rate": 3.994420879691296e-05,
"loss": 0.5808,
"step": 114
},
{
"epoch": 0.6137424949966644,
"grad_norm": 0.5760017273903351,
"learning_rate": 3.993849313142601e-05,
"loss": 0.5625,
"step": 115
},
{
"epoch": 0.619079386257505,
"grad_norm": 0.6798380160844573,
"learning_rate": 3.9932499238741205e-05,
"loss": 0.5739,
"step": 116
},
{
"epoch": 0.6244162775183456,
"grad_norm": 0.6045010645880844,
"learning_rate": 3.992622720249896e-05,
"loss": 0.5535,
"step": 117
},
{
"epoch": 0.6297531687791861,
"grad_norm": 0.6499452053823308,
"learning_rate": 3.991967711022099e-05,
"loss": 0.5774,
"step": 118
},
{
"epoch": 0.6350900600400267,
"grad_norm": 0.852341222777626,
"learning_rate": 3.991284905330908e-05,
"loss": 0.6156,
"step": 119
},
{
"epoch": 0.6404269513008672,
"grad_norm": 0.7942631863588528,
"learning_rate": 3.99057431270438e-05,
"loss": 0.6146,
"step": 120
},
{
"epoch": 0.6457638425617078,
"grad_norm": 0.7346243276112419,
"learning_rate": 3.989835943058321e-05,
"loss": 0.6245,
"step": 121
},
{
"epoch": 0.6511007338225484,
"grad_norm": 0.5552728410457584,
"learning_rate": 3.989069806696141e-05,
"loss": 0.5767,
"step": 122
},
{
"epoch": 0.6564376250833889,
"grad_norm": 0.7057507722370929,
"learning_rate": 3.9882759143087194e-05,
"loss": 0.588,
"step": 123
},
{
"epoch": 0.6617745163442295,
"grad_norm": 0.6039953078185621,
"learning_rate": 3.9874542769742465e-05,
"loss": 0.5562,
"step": 124
},
{
"epoch": 0.66711140760507,
"grad_norm": 0.6614177548639164,
"learning_rate": 3.9866049061580754e-05,
"loss": 0.6246,
"step": 125
},
{
"epoch": 0.6724482988659106,
"grad_norm": 0.4994146494205324,
"learning_rate": 3.985727813712559e-05,
"loss": 0.5781,
"step": 126
},
{
"epoch": 0.6777851901267512,
"grad_norm": 0.6316687122403261,
"learning_rate": 3.984823011876885e-05,
"loss": 0.5849,
"step": 127
},
{
"epoch": 0.6831220813875917,
"grad_norm": 0.6451958765097028,
"learning_rate": 3.983890513276908e-05,
"loss": 0.5729,
"step": 128
},
{
"epoch": 0.6884589726484323,
"grad_norm": 0.6080146197253071,
"learning_rate": 3.982930330924968e-05,
"loss": 0.581,
"step": 129
},
{
"epoch": 0.6937958639092728,
"grad_norm": 0.6403798321958414,
"learning_rate": 3.981942478219712e-05,
"loss": 0.58,
"step": 130
},
{
"epoch": 0.6991327551701134,
"grad_norm": 0.5940580400091781,
"learning_rate": 3.980926968945909e-05,
"loss": 0.5851,
"step": 131
},
{
"epoch": 0.704469646430954,
"grad_norm": 0.6114900367863954,
"learning_rate": 3.9798838172742523e-05,
"loss": 0.5861,
"step": 132
},
{
"epoch": 0.7098065376917946,
"grad_norm": 0.578905757005195,
"learning_rate": 3.978813037761167e-05,
"loss": 0.5686,
"step": 133
},
{
"epoch": 0.715143428952635,
"grad_norm": 0.4992188099121227,
"learning_rate": 3.977714645348603e-05,
"loss": 0.5839,
"step": 134
},
{
"epoch": 0.7204803202134756,
"grad_norm": 0.6287390942847498,
"learning_rate": 3.9765886553638305e-05,
"loss": 0.5935,
"step": 135
},
{
"epoch": 0.7258172114743162,
"grad_norm": 0.5377587251455253,
"learning_rate": 3.975435083519221e-05,
"loss": 0.5908,
"step": 136
},
{
"epoch": 0.7311541027351568,
"grad_norm": 0.5666232050777832,
"learning_rate": 3.974253945912033e-05,
"loss": 0.5761,
"step": 137
},
{
"epoch": 0.7364909939959974,
"grad_norm": 0.5157376399011453,
"learning_rate": 3.9730452590241855e-05,
"loss": 0.5535,
"step": 138
},
{
"epoch": 0.7418278852568378,
"grad_norm": 0.5906288982124854,
"learning_rate": 3.9718090397220235e-05,
"loss": 0.5669,
"step": 139
},
{
"epoch": 0.7471647765176784,
"grad_norm": 0.4830392788438412,
"learning_rate": 3.9705453052560935e-05,
"loss": 0.5834,
"step": 140
},
{
"epoch": 0.752501667778519,
"grad_norm": 0.49576189569537626,
"learning_rate": 3.9692540732608895e-05,
"loss": 0.5695,
"step": 141
},
{
"epoch": 0.7578385590393596,
"grad_norm": 0.5736086879059951,
"learning_rate": 3.9679353617546185e-05,
"loss": 0.5677,
"step": 142
},
{
"epoch": 0.7631754503002002,
"grad_norm": 0.6081982434820203,
"learning_rate": 3.966589189138941e-05,
"loss": 0.5737,
"step": 143
},
{
"epoch": 0.7685123415610406,
"grad_norm": 0.4513895583255699,
"learning_rate": 3.9652155741987204e-05,
"loss": 0.5747,
"step": 144
},
{
"epoch": 0.7738492328218812,
"grad_norm": 0.5662887213755944,
"learning_rate": 3.963814536101756e-05,
"loss": 0.5642,
"step": 145
},
{
"epoch": 0.7791861240827218,
"grad_norm": 0.5224566159232965,
"learning_rate": 3.962386094398515e-05,
"loss": 0.557,
"step": 146
},
{
"epoch": 0.7845230153435624,
"grad_norm": 0.630041060013869,
"learning_rate": 3.960930269021866e-05,
"loss": 0.5868,
"step": 147
},
{
"epoch": 0.789859906604403,
"grad_norm": 0.4804049278016382,
"learning_rate": 3.959447080286795e-05,
"loss": 0.562,
"step": 148
},
{
"epoch": 0.7951967978652434,
"grad_norm": 0.6899935340653105,
"learning_rate": 3.957936548890126e-05,
"loss": 0.5842,
"step": 149
},
{
"epoch": 0.800533689126084,
"grad_norm": 0.5121084821707802,
"learning_rate": 3.956398695910225e-05,
"loss": 0.5835,
"step": 150
},
{
"epoch": 0.8058705803869246,
"grad_norm": 0.668616893098578,
"learning_rate": 3.954833542806716e-05,
"loss": 0.5861,
"step": 151
},
{
"epoch": 0.8112074716477652,
"grad_norm": 0.45695972645097455,
"learning_rate": 3.953241111420174e-05,
"loss": 0.5684,
"step": 152
},
{
"epoch": 0.8165443629086058,
"grad_norm": 0.5756971939096189,
"learning_rate": 3.951621423971822e-05,
"loss": 0.5609,
"step": 153
},
{
"epoch": 0.8218812541694464,
"grad_norm": 0.6608106731864758,
"learning_rate": 3.949974503063224e-05,
"loss": 0.5832,
"step": 154
},
{
"epoch": 0.8272181454302868,
"grad_norm": 0.45567367046023793,
"learning_rate": 3.9483003716759656e-05,
"loss": 0.5733,
"step": 155
},
{
"epoch": 0.8325550366911274,
"grad_norm": 0.5216817660800224,
"learning_rate": 3.946599053171334e-05,
"loss": 0.5863,
"step": 156
},
{
"epoch": 0.837891927951968,
"grad_norm": 0.5621476528316465,
"learning_rate": 3.944870571289995e-05,
"loss": 0.6054,
"step": 157
},
{
"epoch": 0.8432288192128086,
"grad_norm": 0.5106888550302184,
"learning_rate": 3.943114950151658e-05,
"loss": 0.5567,
"step": 158
},
{
"epoch": 0.8485657104736491,
"grad_norm": 0.48507154178393364,
"learning_rate": 3.94133221425474e-05,
"loss": 0.5759,
"step": 159
},
{
"epoch": 0.8539026017344896,
"grad_norm": 0.5726162474627128,
"learning_rate": 3.93952238847603e-05,
"loss": 0.5689,
"step": 160
},
{
"epoch": 0.8592394929953302,
"grad_norm": 0.5476504710325754,
"learning_rate": 3.9376854980703305e-05,
"loss": 0.5509,
"step": 161
},
{
"epoch": 0.8645763842561708,
"grad_norm": 0.5841063194600445,
"learning_rate": 3.935821568670113e-05,
"loss": 0.5787,
"step": 162
},
{
"epoch": 0.8699132755170114,
"grad_norm": 0.5131288550994832,
"learning_rate": 3.9339306262851604e-05,
"loss": 0.5543,
"step": 163
},
{
"epoch": 0.875250166777852,
"grad_norm": 0.46011085571103316,
"learning_rate": 3.932012697302202e-05,
"loss": 0.5432,
"step": 164
},
{
"epoch": 0.8805870580386924,
"grad_norm": 1.3675570339416427,
"learning_rate": 3.9300678084845414e-05,
"loss": 0.5746,
"step": 165
},
{
"epoch": 0.885923949299533,
"grad_norm": 0.5279362760899026,
"learning_rate": 3.928095986971693e-05,
"loss": 0.5498,
"step": 166
},
{
"epoch": 0.8912608405603736,
"grad_norm": 0.5332378885535611,
"learning_rate": 3.926097260278994e-05,
"loss": 0.5896,
"step": 167
},
{
"epoch": 0.8965977318212142,
"grad_norm": 0.45490314214957983,
"learning_rate": 3.924071656297224e-05,
"loss": 0.5788,
"step": 168
},
{
"epoch": 0.9019346230820547,
"grad_norm": 0.57394261542064,
"learning_rate": 3.922019203292217e-05,
"loss": 0.572,
"step": 169
},
{
"epoch": 0.9072715143428952,
"grad_norm": 0.5549810760866458,
"learning_rate": 3.9199399299044636e-05,
"loss": 0.604,
"step": 170
},
{
"epoch": 0.9126084056037358,
"grad_norm": 0.47674471227279036,
"learning_rate": 3.9178338651487146e-05,
"loss": 0.5958,
"step": 171
},
{
"epoch": 0.9179452968645764,
"grad_norm": 0.5092134083211615,
"learning_rate": 3.915701038413575e-05,
"loss": 0.5463,
"step": 172
},
{
"epoch": 0.923282188125417,
"grad_norm": 0.5631301439234547,
"learning_rate": 3.913541479461095e-05,
"loss": 0.5829,
"step": 173
},
{
"epoch": 0.9286190793862575,
"grad_norm": 0.46094438063306387,
"learning_rate": 3.9113552184263506e-05,
"loss": 0.5647,
"step": 174
},
{
"epoch": 0.933955970647098,
"grad_norm": 0.5137591466008057,
"learning_rate": 3.9091422858170275e-05,
"loss": 0.571,
"step": 175
},
{
"epoch": 0.9392928619079386,
"grad_norm": 0.5226838432471194,
"learning_rate": 3.906902712512994e-05,
"loss": 0.5626,
"step": 176
},
{
"epoch": 0.9446297531687792,
"grad_norm": 0.5466147422388645,
"learning_rate": 3.904636529765872e-05,
"loss": 0.5726,
"step": 177
},
{
"epoch": 0.9499666444296198,
"grad_norm": 0.5487628898316014,
"learning_rate": 3.902343769198592e-05,
"loss": 0.5629,
"step": 178
},
{
"epoch": 0.9553035356904603,
"grad_norm": 0.5645880019209112,
"learning_rate": 3.900024462804968e-05,
"loss": 0.5379,
"step": 179
},
{
"epoch": 0.9606404269513009,
"grad_norm": 0.5714192698614569,
"learning_rate": 3.897678642949234e-05,
"loss": 0.559,
"step": 180
},
{
"epoch": 0.9659773182121414,
"grad_norm": 0.8028140925883746,
"learning_rate": 3.8953063423656055e-05,
"loss": 0.5528,
"step": 181
},
{
"epoch": 0.971314209472982,
"grad_norm": 0.5192388665650057,
"learning_rate": 3.892907594157813e-05,
"loss": 0.6081,
"step": 182
},
{
"epoch": 0.9766511007338226,
"grad_norm": 0.6005085196506503,
"learning_rate": 3.8904824317986475e-05,
"loss": 0.597,
"step": 183
},
{
"epoch": 0.9819879919946631,
"grad_norm": 0.5300178874667031,
"learning_rate": 3.8880308891294894e-05,
"loss": 0.5569,
"step": 184
},
{
"epoch": 0.9873248832555037,
"grad_norm": 0.507750454533361,
"learning_rate": 3.885553000359836e-05,
"loss": 0.5902,
"step": 185
},
{
"epoch": 0.9926617745163442,
"grad_norm": 0.5220015768920386,
"learning_rate": 3.8830488000668276e-05,
"loss": 0.5907,
"step": 186
},
{
"epoch": 0.9979986657771848,
"grad_norm": 0.4864525986978283,
"learning_rate": 3.8805183231947605e-05,
"loss": 0.5545,
"step": 187
},
{
"epoch": 1.0033355570380253,
"grad_norm": 0.7889929121631584,
"learning_rate": 3.8779616050546035e-05,
"loss": 0.8624,
"step": 188
},
{
"epoch": 1.0086724482988658,
"grad_norm": 0.6971636670787393,
"learning_rate": 3.875378681323501e-05,
"loss": 0.4685,
"step": 189
},
{
"epoch": 1.0140093395597065,
"grad_norm": 0.8621305134857299,
"learning_rate": 3.872769588044279e-05,
"loss": 0.5079,
"step": 190
},
{
"epoch": 1.019346230820547,
"grad_norm": 0.868037119002654,
"learning_rate": 3.8701343616249415e-05,
"loss": 0.4286,
"step": 191
},
{
"epoch": 1.0246831220813877,
"grad_norm": 0.8244134271269803,
"learning_rate": 3.867473038838158e-05,
"loss": 0.5353,
"step": 192
},
{
"epoch": 1.0300200133422281,
"grad_norm": 0.750497350373635,
"learning_rate": 3.864785656820758e-05,
"loss": 0.4478,
"step": 193
},
{
"epoch": 1.0353569046030686,
"grad_norm": 0.7802579665937702,
"learning_rate": 3.862072253073207e-05,
"loss": 0.4809,
"step": 194
},
{
"epoch": 1.0406937958639093,
"grad_norm": 0.6332504376170986,
"learning_rate": 3.859332865459082e-05,
"loss": 0.4659,
"step": 195
},
{
"epoch": 1.0460306871247498,
"grad_norm": 0.5212841037738816,
"learning_rate": 3.856567532204551e-05,
"loss": 0.4419,
"step": 196
},
{
"epoch": 1.0513675783855905,
"grad_norm": 0.5429591096952704,
"learning_rate": 3.853776291897831e-05,
"loss": 0.471,
"step": 197
},
{
"epoch": 1.056704469646431,
"grad_norm": 0.6105400333523633,
"learning_rate": 3.850959183488655e-05,
"loss": 0.4869,
"step": 198
},
{
"epoch": 1.0620413609072714,
"grad_norm": 1.389605521972471,
"learning_rate": 3.848116246287725e-05,
"loss": 0.4194,
"step": 199
},
{
"epoch": 1.067378252168112,
"grad_norm": 0.6500172584353179,
"learning_rate": 3.845247519966167e-05,
"loss": 0.4742,
"step": 200
},
{
"epoch": 1.0727151434289526,
"grad_norm": 0.6231565335876623,
"learning_rate": 3.842353044554973e-05,
"loss": 0.4883,
"step": 201
},
{
"epoch": 1.0780520346897933,
"grad_norm": 0.5326635797858091,
"learning_rate": 3.839432860444447e-05,
"loss": 0.4684,
"step": 202
},
{
"epoch": 1.0833889259506337,
"grad_norm": 0.5630660263478705,
"learning_rate": 3.836487008383638e-05,
"loss": 0.4974,
"step": 203
},
{
"epoch": 1.0887258172114742,
"grad_norm": 0.5426950088640516,
"learning_rate": 3.8335155294797744e-05,
"loss": 0.4966,
"step": 204
},
{
"epoch": 1.094062708472315,
"grad_norm": 0.46720899935844223,
"learning_rate": 3.8305184651976855e-05,
"loss": 0.4518,
"step": 205
},
{
"epoch": 1.0993995997331554,
"grad_norm": 0.6590650012988468,
"learning_rate": 3.827495857359228e-05,
"loss": 0.5123,
"step": 206
},
{
"epoch": 1.104736490993996,
"grad_norm": 0.4688566921090766,
"learning_rate": 3.824447748142701e-05,
"loss": 0.4665,
"step": 207
},
{
"epoch": 1.1100733822548365,
"grad_norm": 0.6539956626751293,
"learning_rate": 3.821374180082256e-05,
"loss": 0.4836,
"step": 208
},
{
"epoch": 1.115410273515677,
"grad_norm": 0.4735771906265032,
"learning_rate": 3.8182751960673024e-05,
"loss": 0.4854,
"step": 209
},
{
"epoch": 1.1207471647765177,
"grad_norm": 0.5752246665082068,
"learning_rate": 3.815150839341915e-05,
"loss": 0.4598,
"step": 210
},
{
"epoch": 1.1260840560373582,
"grad_norm": 0.5222212002375315,
"learning_rate": 3.812001153504221e-05,
"loss": 0.4436,
"step": 211
},
{
"epoch": 1.1314209472981989,
"grad_norm": 0.4870834555429964,
"learning_rate": 3.8088261825058025e-05,
"loss": 0.468,
"step": 212
},
{
"epoch": 1.1367578385590393,
"grad_norm": 0.5849873353511134,
"learning_rate": 3.8056259706510735e-05,
"loss": 0.4751,
"step": 213
},
{
"epoch": 1.1420947298198798,
"grad_norm": 0.6135788225861464,
"learning_rate": 3.802400562596668e-05,
"loss": 0.4372,
"step": 214
},
{
"epoch": 1.1474316210807205,
"grad_norm": 0.4949403107444529,
"learning_rate": 3.799150003350813e-05,
"loss": 0.4886,
"step": 215
},
{
"epoch": 1.152768512341561,
"grad_norm": 0.4908480402002135,
"learning_rate": 3.795874338272705e-05,
"loss": 0.4244,
"step": 216
},
{
"epoch": 1.1581054036024017,
"grad_norm": 0.5302662495518609,
"learning_rate": 3.79257361307187e-05,
"loss": 0.5117,
"step": 217
},
{
"epoch": 1.1634422948632421,
"grad_norm": 0.47953450158842736,
"learning_rate": 3.7892478738075335e-05,
"loss": 0.4814,
"step": 218
},
{
"epoch": 1.1687791861240826,
"grad_norm": 0.4762686857338102,
"learning_rate": 3.785897166887973e-05,
"loss": 0.4668,
"step": 219
},
{
"epoch": 1.1741160773849233,
"grad_norm": 0.4687082531990241,
"learning_rate": 3.7825215390698696e-05,
"loss": 0.4596,
"step": 220
},
{
"epoch": 1.1794529686457638,
"grad_norm": 0.4882003652888224,
"learning_rate": 3.779121037457661e-05,
"loss": 0.4827,
"step": 221
},
{
"epoch": 1.1847898599066045,
"grad_norm": 0.4673722770115682,
"learning_rate": 3.7756957095028776e-05,
"loss": 0.4739,
"step": 222
},
{
"epoch": 1.190126751167445,
"grad_norm": 0.46193789537235297,
"learning_rate": 3.772245603003485e-05,
"loss": 0.4785,
"step": 223
},
{
"epoch": 1.1954636424282854,
"grad_norm": 0.41695599713095927,
"learning_rate": 3.768770766103214e-05,
"loss": 0.4574,
"step": 224
},
{
"epoch": 1.200800533689126,
"grad_norm": 0.48103374543287214,
"learning_rate": 3.765271247290892e-05,
"loss": 0.4968,
"step": 225
},
{
"epoch": 1.2061374249499666,
"grad_norm": 0.4107744420495834,
"learning_rate": 3.761747095399764e-05,
"loss": 0.4691,
"step": 226
},
{
"epoch": 1.2114743162108073,
"grad_norm": 0.5216753656288083,
"learning_rate": 3.75819835960681e-05,
"loss": 0.4655,
"step": 227
},
{
"epoch": 1.2168112074716477,
"grad_norm": 0.47480701486406535,
"learning_rate": 3.754625089432062e-05,
"loss": 0.4659,
"step": 228
},
{
"epoch": 1.2221480987324884,
"grad_norm": 0.5167936671852781,
"learning_rate": 3.751027334737913e-05,
"loss": 0.4789,
"step": 229
},
{
"epoch": 1.227484989993329,
"grad_norm": 0.5894608057326902,
"learning_rate": 3.747405145728416e-05,
"loss": 0.4857,
"step": 230
},
{
"epoch": 1.2328218812541694,
"grad_norm": 0.4441843183602042,
"learning_rate": 3.743758572948591e-05,
"loss": 0.4711,
"step": 231
},
{
"epoch": 1.23815877251501,
"grad_norm": 0.44680579976734436,
"learning_rate": 3.740087667283712e-05,
"loss": 0.4913,
"step": 232
},
{
"epoch": 1.2434956637758505,
"grad_norm": 0.4767148142121902,
"learning_rate": 3.736392479958606e-05,
"loss": 0.4583,
"step": 233
},
{
"epoch": 1.2488325550366912,
"grad_norm": 0.4360066844228599,
"learning_rate": 3.732673062536926e-05,
"loss": 0.461,
"step": 234
},
{
"epoch": 1.2541694462975317,
"grad_norm": 0.49211626438882655,
"learning_rate": 3.728929466920445e-05,
"loss": 0.4771,
"step": 235
},
{
"epoch": 1.2595063375583724,
"grad_norm": 0.4462865895003735,
"learning_rate": 3.72516174534832e-05,
"loss": 0.4735,
"step": 236
},
{
"epoch": 1.2648432288192129,
"grad_norm": 0.3962897023044732,
"learning_rate": 3.721369950396373e-05,
"loss": 0.44,
"step": 237
},
{
"epoch": 1.2701801200800533,
"grad_norm": 0.47458867881560224,
"learning_rate": 3.7175541349763474e-05,
"loss": 0.4798,
"step": 238
},
{
"epoch": 1.2755170113408938,
"grad_norm": 0.47364034729607424,
"learning_rate": 3.7137143523351787e-05,
"loss": 0.4918,
"step": 239
},
{
"epoch": 1.2808539026017345,
"grad_norm": 0.5140571591821321,
"learning_rate": 3.7098506560542464e-05,
"loss": 0.4755,
"step": 240
},
{
"epoch": 1.2861907938625752,
"grad_norm": 0.4575077757654535,
"learning_rate": 3.705963100048627e-05,
"loss": 0.4618,
"step": 241
},
{
"epoch": 1.2915276851234156,
"grad_norm": 0.4752155163431786,
"learning_rate": 3.702051738566343e-05,
"loss": 0.4805,
"step": 242
},
{
"epoch": 1.2968645763842561,
"grad_norm": 0.44739905544161696,
"learning_rate": 3.698116626187603e-05,
"loss": 0.4553,
"step": 243
},
{
"epoch": 1.3022014676450968,
"grad_norm": 0.4313224074744495,
"learning_rate": 3.694157817824046e-05,
"loss": 0.4586,
"step": 244
},
{
"epoch": 1.3075383589059373,
"grad_norm": 0.4540941645048732,
"learning_rate": 3.6901753687179674e-05,
"loss": 0.4484,
"step": 245
},
{
"epoch": 1.312875250166778,
"grad_norm": 0.4544044695491594,
"learning_rate": 3.686169334441554e-05,
"loss": 0.4662,
"step": 246
},
{
"epoch": 1.3182121414276184,
"grad_norm": 0.4324267581370308,
"learning_rate": 3.6821397708961045e-05,
"loss": 0.4973,
"step": 247
},
{
"epoch": 1.323549032688459,
"grad_norm": 0.5525969912230031,
"learning_rate": 3.678086734311256e-05,
"loss": 0.4824,
"step": 248
},
{
"epoch": 1.3288859239492996,
"grad_norm": 0.48134801141058897,
"learning_rate": 3.67401028124419e-05,
"loss": 0.4953,
"step": 249
},
{
"epoch": 1.33422281521014,
"grad_norm": 0.5606903378778597,
"learning_rate": 3.66991046857885e-05,
"loss": 0.4763,
"step": 250
},
{
"epoch": 1.3395597064709808,
"grad_norm": 0.4778001351815925,
"learning_rate": 3.6657873535251456e-05,
"loss": 0.4427,
"step": 251
},
{
"epoch": 1.3448965977318212,
"grad_norm": 0.5198221637041993,
"learning_rate": 3.661640993618155e-05,
"loss": 0.4962,
"step": 252
},
{
"epoch": 1.3502334889926617,
"grad_norm": 0.614188855305235,
"learning_rate": 3.6574714467173194e-05,
"loss": 0.4621,
"step": 253
},
{
"epoch": 1.3555703802535024,
"grad_norm": 0.47715507181630035,
"learning_rate": 3.6532787710056405e-05,
"loss": 0.4506,
"step": 254
},
{
"epoch": 1.3609072715143429,
"grad_norm": 0.4823822750756398,
"learning_rate": 3.649063024988864e-05,
"loss": 0.4813,
"step": 255
},
{
"epoch": 1.3662441627751836,
"grad_norm": 0.5035590732756817,
"learning_rate": 3.644824267494664e-05,
"loss": 0.4732,
"step": 256
},
{
"epoch": 1.371581054036024,
"grad_norm": 0.4775276009416485,
"learning_rate": 3.6405625576718256e-05,
"loss": 0.502,
"step": 257
},
{
"epoch": 1.3769179452968645,
"grad_norm": 0.4616087736602264,
"learning_rate": 3.6362779549894155e-05,
"loss": 0.4687,
"step": 258
},
{
"epoch": 1.3822548365577052,
"grad_norm": 0.5051090242818488,
"learning_rate": 3.631970519235954e-05,
"loss": 0.4527,
"step": 259
},
{
"epoch": 1.3875917278185457,
"grad_norm": 0.46170933564301747,
"learning_rate": 3.62764031051858e-05,
"loss": 0.4531,
"step": 260
},
{
"epoch": 1.3929286190793864,
"grad_norm": 0.4597839229598545,
"learning_rate": 3.623287389262211e-05,
"loss": 0.4428,
"step": 261
},
{
"epoch": 1.3982655103402268,
"grad_norm": 0.47396042868358945,
"learning_rate": 3.618911816208707e-05,
"loss": 0.4748,
"step": 262
},
{
"epoch": 1.4036024016010673,
"grad_norm": 0.46865196681556354,
"learning_rate": 3.614513652416011e-05,
"loss": 0.4555,
"step": 263
},
{
"epoch": 1.408939292861908,
"grad_norm": 0.4941893702835204,
"learning_rate": 3.610092959257306e-05,
"loss": 0.4475,
"step": 264
},
{
"epoch": 1.4142761841227485,
"grad_norm": 0.45053813237543944,
"learning_rate": 3.6056497984201566e-05,
"loss": 0.5037,
"step": 265
},
{
"epoch": 1.4196130753835892,
"grad_norm": 0.4602967972562359,
"learning_rate": 3.601184231905647e-05,
"loss": 0.4625,
"step": 266
},
{
"epoch": 1.4249499666444296,
"grad_norm": 0.40312077116267125,
"learning_rate": 3.5966963220275155e-05,
"loss": 0.4322,
"step": 267
},
{
"epoch": 1.43028685790527,
"grad_norm": 0.43605945616048775,
"learning_rate": 3.592186131411288e-05,
"loss": 0.4758,
"step": 268
},
{
"epoch": 1.4356237491661108,
"grad_norm": 0.4490964338856413,
"learning_rate": 3.5876537229933994e-05,
"loss": 0.4606,
"step": 269
},
{
"epoch": 1.4409606404269513,
"grad_norm": 0.4488693733839842,
"learning_rate": 3.583099160020319e-05,
"loss": 0.5358,
"step": 270
},
{
"epoch": 1.446297531687792,
"grad_norm": 0.4399709509429179,
"learning_rate": 3.578522506047667e-05,
"loss": 0.4585,
"step": 271
},
{
"epoch": 1.4516344229486324,
"grad_norm": 0.48797858146525214,
"learning_rate": 3.573923824939327e-05,
"loss": 0.4934,
"step": 272
},
{
"epoch": 1.456971314209473,
"grad_norm": 0.43113845680671775,
"learning_rate": 3.5693031808665563e-05,
"loss": 0.4624,
"step": 273
},
{
"epoch": 1.4623082054703136,
"grad_norm": 0.4622009600777627,
"learning_rate": 3.564660638307088e-05,
"loss": 0.4418,
"step": 274
},
{
"epoch": 1.467645096731154,
"grad_norm": 0.5575580830867934,
"learning_rate": 3.5599962620442344e-05,
"loss": 0.507,
"step": 275
},
{
"epoch": 1.4729819879919948,
"grad_norm": 0.4878453026449952,
"learning_rate": 3.555310117165979e-05,
"loss": 0.4176,
"step": 276
},
{
"epoch": 1.4783188792528352,
"grad_norm": 0.5728589696523062,
"learning_rate": 3.550602269064073e-05,
"loss": 0.5278,
"step": 277
},
{
"epoch": 1.4836557705136757,
"grad_norm": 0.48279056904217577,
"learning_rate": 3.545872783433118e-05,
"loss": 0.4131,
"step": 278
},
{
"epoch": 1.4889926617745164,
"grad_norm": 0.5888361010983522,
"learning_rate": 3.541121726269654e-05,
"loss": 0.4494,
"step": 279
},
{
"epoch": 1.4943295530353569,
"grad_norm": 0.5243711854925368,
"learning_rate": 3.5363491638712326e-05,
"loss": 0.4546,
"step": 280
},
{
"epoch": 1.4996664442961976,
"grad_norm": 0.565934472490769,
"learning_rate": 3.531555162835501e-05,
"loss": 0.4774,
"step": 281
},
{
"epoch": 1.505003335557038,
"grad_norm": 0.46628567085954914,
"learning_rate": 3.52673979005926e-05,
"loss": 0.4572,
"step": 282
},
{
"epoch": 1.5103402268178785,
"grad_norm": 0.5254588931292046,
"learning_rate": 3.521903112737544e-05,
"loss": 0.5014,
"step": 283
},
{
"epoch": 1.5156771180787192,
"grad_norm": 0.4149936994490417,
"learning_rate": 3.517045198362672e-05,
"loss": 0.4611,
"step": 284
},
{
"epoch": 1.5210140093395597,
"grad_norm": 0.4142527610245841,
"learning_rate": 3.512166114723314e-05,
"loss": 0.4378,
"step": 285
},
{
"epoch": 1.5263509006004004,
"grad_norm": 0.4952367594365313,
"learning_rate": 3.507265929903539e-05,
"loss": 0.5056,
"step": 286
},
{
"epoch": 1.5316877918612408,
"grad_norm": 0.49022347663893007,
"learning_rate": 3.5023447122818696e-05,
"loss": 0.4144,
"step": 287
},
{
"epoch": 1.5370246831220813,
"grad_norm": 0.5284452745454441,
"learning_rate": 3.497402530530326e-05,
"loss": 0.4864,
"step": 288
},
{
"epoch": 1.542361574382922,
"grad_norm": 0.6563737965515248,
"learning_rate": 3.492439453613466e-05,
"loss": 0.4772,
"step": 289
},
{
"epoch": 1.5476984656437625,
"grad_norm": 0.4605849913602888,
"learning_rate": 3.487455550787426e-05,
"loss": 0.4519,
"step": 290
},
{
"epoch": 1.5530353569046031,
"grad_norm": 0.5170611719530955,
"learning_rate": 3.482450891598951e-05,
"loss": 0.4967,
"step": 291
},
{
"epoch": 1.5583722481654436,
"grad_norm": 0.46268930563862615,
"learning_rate": 3.4774255458844273e-05,
"loss": 0.4515,
"step": 292
},
{
"epoch": 1.563709139426284,
"grad_norm": 0.4177933995256324,
"learning_rate": 3.472379583768906e-05,
"loss": 0.4557,
"step": 293
},
{
"epoch": 1.5690460306871248,
"grad_norm": 0.4790527598758891,
"learning_rate": 3.4673130756651266e-05,
"loss": 0.4557,
"step": 294
},
{
"epoch": 1.5743829219479655,
"grad_norm": 0.38274726550797006,
"learning_rate": 3.4622260922725315e-05,
"loss": 0.4655,
"step": 295
},
{
"epoch": 1.579719813208806,
"grad_norm": 0.5046386994690183,
"learning_rate": 3.457118704576281e-05,
"loss": 0.5072,
"step": 296
},
{
"epoch": 1.5850567044696464,
"grad_norm": 0.3438922118300985,
"learning_rate": 3.451990983846262e-05,
"loss": 0.4092,
"step": 297
},
{
"epoch": 1.5903935957304869,
"grad_norm": 0.47843066059231854,
"learning_rate": 3.4468430016360955e-05,
"loss": 0.4719,
"step": 298
},
{
"epoch": 1.5957304869913276,
"grad_norm": 0.41900316347012834,
"learning_rate": 3.4416748297821375e-05,
"loss": 0.4697,
"step": 299
},
{
"epoch": 1.6010673782521683,
"grad_norm": 0.4504191244471495,
"learning_rate": 3.4364865404024725e-05,
"loss": 0.4716,
"step": 300
},
{
"epoch": 1.6064042695130087,
"grad_norm": 0.40206430223864725,
"learning_rate": 3.4312782058959136e-05,
"loss": 0.4693,
"step": 301
},
{
"epoch": 1.6117411607738492,
"grad_norm": 0.4131717908261127,
"learning_rate": 3.426049898940988e-05,
"loss": 0.4326,
"step": 302
},
{
"epoch": 1.6170780520346897,
"grad_norm": 0.4192801860140271,
"learning_rate": 3.420801692494923e-05,
"loss": 0.4816,
"step": 303
},
{
"epoch": 1.6224149432955304,
"grad_norm": 0.45720379651134796,
"learning_rate": 3.415533659792631e-05,
"loss": 0.4762,
"step": 304
},
{
"epoch": 1.627751834556371,
"grad_norm": 0.4322890724213698,
"learning_rate": 3.4102458743456836e-05,
"loss": 0.4956,
"step": 305
},
{
"epoch": 1.6330887258172115,
"grad_norm": 0.3701032350873174,
"learning_rate": 3.404938409941288e-05,
"loss": 0.4258,
"step": 306
},
{
"epoch": 1.638425617078052,
"grad_norm": 0.4418583797952319,
"learning_rate": 3.3996113406412575e-05,
"loss": 0.4635,
"step": 307
},
{
"epoch": 1.6437625083388925,
"grad_norm": 0.5119651096884537,
"learning_rate": 3.394264740780977e-05,
"loss": 0.4565,
"step": 308
},
{
"epoch": 1.6490993995997332,
"grad_norm": 0.439255187163439,
"learning_rate": 3.388898684968367e-05,
"loss": 0.4244,
"step": 309
},
{
"epoch": 1.6544362908605739,
"grad_norm": 0.6314072128471104,
"learning_rate": 3.3835132480828395e-05,
"loss": 0.4979,
"step": 310
},
{
"epoch": 1.6597731821214143,
"grad_norm": 0.45356149569718207,
"learning_rate": 3.3781085052742587e-05,
"loss": 0.4659,
"step": 311
},
{
"epoch": 1.6651100733822548,
"grad_norm": 0.6595936619453585,
"learning_rate": 3.372684531961885e-05,
"loss": 0.4715,
"step": 312
},
{
"epoch": 1.6704469646430953,
"grad_norm": 0.40146590046819736,
"learning_rate": 3.3672414038333294e-05,
"loss": 0.442,
"step": 313
},
{
"epoch": 1.675783855903936,
"grad_norm": 0.5439115291324319,
"learning_rate": 3.361779196843495e-05,
"loss": 0.4642,
"step": 314
},
{
"epoch": 1.6811207471647767,
"grad_norm": 0.4925416574124429,
"learning_rate": 3.356297987213514e-05,
"loss": 0.4799,
"step": 315
},
{
"epoch": 1.6864576384256171,
"grad_norm": 0.41997671654731134,
"learning_rate": 3.350797851429688e-05,
"loss": 0.4485,
"step": 316
},
{
"epoch": 1.6917945296864576,
"grad_norm": 0.5655372175649206,
"learning_rate": 3.345278866242419e-05,
"loss": 0.4933,
"step": 317
},
{
"epoch": 1.697131420947298,
"grad_norm": 0.45516429030940203,
"learning_rate": 3.339741108665139e-05,
"loss": 0.4693,
"step": 318
},
{
"epoch": 1.7024683122081388,
"grad_norm": 0.5182373828396348,
"learning_rate": 3.334184655973236e-05,
"loss": 0.4318,
"step": 319
},
{
"epoch": 1.7078052034689795,
"grad_norm": 0.5043642496605346,
"learning_rate": 3.3286095857029724e-05,
"loss": 0.5043,
"step": 320
},
{
"epoch": 1.71314209472982,
"grad_norm": 0.5143484476959619,
"learning_rate": 3.3230159756504065e-05,
"loss": 0.4523,
"step": 321
},
{
"epoch": 1.7184789859906604,
"grad_norm": 0.4004155764578505,
"learning_rate": 3.317403903870308e-05,
"loss": 0.4542,
"step": 322
},
{
"epoch": 1.7238158772515009,
"grad_norm": 0.43510446375283907,
"learning_rate": 3.311773448675063e-05,
"loss": 0.4591,
"step": 323
},
{
"epoch": 1.7291527685123416,
"grad_norm": 0.40735475539646576,
"learning_rate": 3.3061246886335866e-05,
"loss": 0.4767,
"step": 324
},
{
"epoch": 1.7344896597731823,
"grad_norm": 0.5027103748999582,
"learning_rate": 3.300457702570225e-05,
"loss": 0.42,
"step": 325
},
{
"epoch": 1.7398265510340227,
"grad_norm": 0.39562071018506284,
"learning_rate": 3.294772569563656e-05,
"loss": 0.5089,
"step": 326
},
{
"epoch": 1.7451634422948632,
"grad_norm": 0.45829391562540117,
"learning_rate": 3.2890693689457817e-05,
"loss": 0.4785,
"step": 327
},
{
"epoch": 1.7505003335557037,
"grad_norm": 0.3733919681858633,
"learning_rate": 3.283348180300627e-05,
"loss": 0.4503,
"step": 328
},
{
"epoch": 1.7558372248165444,
"grad_norm": 0.4599706292196285,
"learning_rate": 3.277609083463228e-05,
"loss": 0.4637,
"step": 329
},
{
"epoch": 1.761174116077385,
"grad_norm": 0.4095854985895356,
"learning_rate": 3.271852158518514e-05,
"loss": 0.4707,
"step": 330
},
{
"epoch": 1.7665110073382255,
"grad_norm": 0.4230424744489382,
"learning_rate": 3.266077485800192e-05,
"loss": 0.4611,
"step": 331
},
{
"epoch": 1.771847898599066,
"grad_norm": 0.40727774451181376,
"learning_rate": 3.26028514588963e-05,
"loss": 0.4899,
"step": 332
},
{
"epoch": 1.7771847898599065,
"grad_norm": 0.4257028861000557,
"learning_rate": 3.2544752196147266e-05,
"loss": 0.4759,
"step": 333
},
{
"epoch": 1.7825216811207472,
"grad_norm": 0.4044619820594187,
"learning_rate": 3.248647788048784e-05,
"loss": 0.4589,
"step": 334
},
{
"epoch": 1.7878585723815879,
"grad_norm": 0.41896755044660233,
"learning_rate": 3.2428029325093794e-05,
"loss": 0.457,
"step": 335
},
{
"epoch": 1.7931954636424283,
"grad_norm": 0.4226741156494024,
"learning_rate": 3.23694073455723e-05,
"loss": 0.468,
"step": 336
},
{
"epoch": 1.7985323549032688,
"grad_norm": 0.3803247543345477,
"learning_rate": 3.2310612759950535e-05,
"loss": 0.4548,
"step": 337
},
{
"epoch": 1.8038692461641093,
"grad_norm": 0.4456607835139093,
"learning_rate": 3.225164638866424e-05,
"loss": 0.4808,
"step": 338
},
{
"epoch": 1.80920613742495,
"grad_norm": 0.43894216697827915,
"learning_rate": 3.219250905454633e-05,
"loss": 0.4538,
"step": 339
},
{
"epoch": 1.8145430286857906,
"grad_norm": 0.3622459739187082,
"learning_rate": 3.213320158281538e-05,
"loss": 0.429,
"step": 340
},
{
"epoch": 1.8198799199466311,
"grad_norm": 0.45236032311104785,
"learning_rate": 3.207372480106409e-05,
"loss": 0.4955,
"step": 341
},
{
"epoch": 1.8252168112074716,
"grad_norm": 0.40834732197197826,
"learning_rate": 3.201407953924779e-05,
"loss": 0.4419,
"step": 342
},
{
"epoch": 1.830553702468312,
"grad_norm": 0.4934429140575739,
"learning_rate": 3.195426662967281e-05,
"loss": 0.5097,
"step": 343
},
{
"epoch": 1.8358905937291528,
"grad_norm": 0.40870574603025267,
"learning_rate": 3.189428690698487e-05,
"loss": 0.4398,
"step": 344
},
{
"epoch": 1.8412274849899934,
"grad_norm": 0.459285280694887,
"learning_rate": 3.183414120815747e-05,
"loss": 0.4808,
"step": 345
},
{
"epoch": 1.846564376250834,
"grad_norm": 0.3783731629311537,
"learning_rate": 3.177383037248018e-05,
"loss": 0.4393,
"step": 346
},
{
"epoch": 1.8519012675116744,
"grad_norm": 0.43203892549748285,
"learning_rate": 3.171335524154691e-05,
"loss": 0.4496,
"step": 347
},
{
"epoch": 1.8572381587725149,
"grad_norm": 0.5083672035817517,
"learning_rate": 3.165271665924424e-05,
"loss": 0.4537,
"step": 348
},
{
"epoch": 1.8625750500333556,
"grad_norm": 0.470971515949764,
"learning_rate": 3.159191547173955e-05,
"loss": 0.4534,
"step": 349
},
{
"epoch": 1.8679119412941962,
"grad_norm": 0.5043411580642327,
"learning_rate": 3.153095252746928e-05,
"loss": 0.455,
"step": 350
},
{
"epoch": 1.8732488325550367,
"grad_norm": 0.5422278049069252,
"learning_rate": 3.146982867712706e-05,
"loss": 0.4976,
"step": 351
},
{
"epoch": 1.8785857238158772,
"grad_norm": 0.4604934576130623,
"learning_rate": 3.140854477365185e-05,
"loss": 0.4338,
"step": 352
},
{
"epoch": 1.8839226150767177,
"grad_norm": 0.5227311072550253,
"learning_rate": 3.134710167221604e-05,
"loss": 0.4867,
"step": 353
},
{
"epoch": 1.8892595063375583,
"grad_norm": 0.5016387520943354,
"learning_rate": 3.12855002302135e-05,
"loss": 0.4463,
"step": 354
},
{
"epoch": 1.894596397598399,
"grad_norm": 0.48681527381444745,
"learning_rate": 3.122374130724765e-05,
"loss": 0.4878,
"step": 355
},
{
"epoch": 1.8999332888592395,
"grad_norm": 0.4753924890233577,
"learning_rate": 3.116182576511941e-05,
"loss": 0.4614,
"step": 356
},
{
"epoch": 1.90527018012008,
"grad_norm": 0.40839824511015216,
"learning_rate": 3.1099754467815244e-05,
"loss": 0.4551,
"step": 357
},
{
"epoch": 1.9106070713809205,
"grad_norm": 0.5233984189857435,
"learning_rate": 3.103752828149502e-05,
"loss": 0.4852,
"step": 358
},
{
"epoch": 1.9159439626417611,
"grad_norm": 0.44864050338135375,
"learning_rate": 3.0975148074480026e-05,
"loss": 0.4786,
"step": 359
},
{
"epoch": 1.9212808539026018,
"grad_norm": 0.43437456170806327,
"learning_rate": 3.0912614717240745e-05,
"loss": 0.464,
"step": 360
},
{
"epoch": 1.9266177451634423,
"grad_norm": 0.399850990382915,
"learning_rate": 3.08499290823848e-05,
"loss": 0.4438,
"step": 361
},
{
"epoch": 1.9319546364242828,
"grad_norm": 0.43768868935382793,
"learning_rate": 3.07870920446447e-05,
"loss": 0.4603,
"step": 362
},
{
"epoch": 1.9372915276851232,
"grad_norm": 0.5552601097943353,
"learning_rate": 3.072410448086572e-05,
"loss": 0.4762,
"step": 363
},
{
"epoch": 1.942628418945964,
"grad_norm": 0.40557866679728677,
"learning_rate": 3.066096726999357e-05,
"loss": 0.4607,
"step": 364
},
{
"epoch": 1.9479653102068046,
"grad_norm": 0.5268701709396402,
"learning_rate": 3.0597681293062187e-05,
"loss": 0.4389,
"step": 365
},
{
"epoch": 1.953302201467645,
"grad_norm": 0.4027710666772095,
"learning_rate": 3.053424743318146e-05,
"loss": 0.4517,
"step": 366
},
{
"epoch": 1.9586390927284856,
"grad_norm": 0.44216756582628347,
"learning_rate": 3.047066657552484e-05,
"loss": 0.4616,
"step": 367
},
{
"epoch": 1.9639759839893263,
"grad_norm": 0.4117419874422281,
"learning_rate": 3.040693960731704e-05,
"loss": 0.4517,
"step": 368
},
{
"epoch": 1.9693128752501667,
"grad_norm": 0.4044754379711903,
"learning_rate": 3.034306741782166e-05,
"loss": 0.4719,
"step": 369
},
{
"epoch": 1.9746497665110074,
"grad_norm": 0.3728722340841331,
"learning_rate": 3.0279050898328716e-05,
"loss": 0.4168,
"step": 370
},
{
"epoch": 1.979986657771848,
"grad_norm": 0.4162488452438543,
"learning_rate": 3.021489094214228e-05,
"loss": 0.4688,
"step": 371
},
{
"epoch": 1.9853235490326884,
"grad_norm": 0.43351004658705333,
"learning_rate": 3.0150588444567962e-05,
"loss": 0.4519,
"step": 372
},
{
"epoch": 1.990660440293529,
"grad_norm": 0.3852293267610788,
"learning_rate": 3.0086144302900425e-05,
"loss": 0.4819,
"step": 373
},
{
"epoch": 1.9959973315543695,
"grad_norm": 0.36569663935905455,
"learning_rate": 3.002155941641091e-05,
"loss": 0.4535,
"step": 374
},
{
"epoch": 2.0013342228152102,
"grad_norm": 0.608016827751094,
"learning_rate": 2.99568346863346e-05,
"loss": 0.7085,
"step": 375
},
{
"epoch": 2.0066711140760507,
"grad_norm": 0.6591571174677568,
"learning_rate": 2.989197101585813e-05,
"loss": 0.3487,
"step": 376
},
{
"epoch": 2.012008005336891,
"grad_norm": 0.6684266661006703,
"learning_rate": 2.9826969310106927e-05,
"loss": 0.3407,
"step": 377
},
{
"epoch": 2.0173448965977316,
"grad_norm": 0.6399522965663798,
"learning_rate": 2.976183047613262e-05,
"loss": 0.3667,
"step": 378
},
{
"epoch": 2.0226817878585726,
"grad_norm": 0.5520199671519408,
"learning_rate": 2.9696555422900352e-05,
"loss": 0.3039,
"step": 379
},
{
"epoch": 2.028018679119413,
"grad_norm": 0.5699997898314517,
"learning_rate": 2.9631145061276093e-05,
"loss": 0.3339,
"step": 380
},
{
"epoch": 2.0333555703802535,
"grad_norm": 0.6020462454607703,
"learning_rate": 2.956560030401397e-05,
"loss": 0.3557,
"step": 381
},
{
"epoch": 2.038692461641094,
"grad_norm": 0.5721714900378834,
"learning_rate": 2.949992206574348e-05,
"loss": 0.3475,
"step": 382
},
{
"epoch": 2.0440293529019344,
"grad_norm": 0.49943187780464754,
"learning_rate": 2.9434111262956767e-05,
"loss": 0.3062,
"step": 383
},
{
"epoch": 2.0493662441627754,
"grad_norm": 0.6557048215847663,
"learning_rate": 2.9368168813995806e-05,
"loss": 0.3715,
"step": 384
},
{
"epoch": 2.054703135423616,
"grad_norm": 0.5041985226572085,
"learning_rate": 2.9302095639039607e-05,
"loss": 0.3342,
"step": 385
},
{
"epoch": 2.0600400266844563,
"grad_norm": 0.5594849349537587,
"learning_rate": 2.923589266009136e-05,
"loss": 0.3116,
"step": 386
},
{
"epoch": 2.0653769179452968,
"grad_norm": 0.4290186723298561,
"learning_rate": 2.9169560800965583e-05,
"loss": 0.3593,
"step": 387
},
{
"epoch": 2.0707138092061372,
"grad_norm": 0.526856097744535,
"learning_rate": 2.910310098727521e-05,
"loss": 0.3534,
"step": 388
},
{
"epoch": 2.076050700466978,
"grad_norm": 0.4372849060681286,
"learning_rate": 2.9036514146418705e-05,
"loss": 0.3224,
"step": 389
},
{
"epoch": 2.0813875917278186,
"grad_norm": 0.4365535359116469,
"learning_rate": 2.896980120756709e-05,
"loss": 0.3381,
"step": 390
},
{
"epoch": 2.086724482988659,
"grad_norm": 0.4371195260504785,
"learning_rate": 2.8902963101651004e-05,
"loss": 0.3337,
"step": 391
},
{
"epoch": 2.0920613742494996,
"grad_norm": 0.4119069627388638,
"learning_rate": 2.883600076134768e-05,
"loss": 0.3396,
"step": 392
},
{
"epoch": 2.09739826551034,
"grad_norm": 0.4389727835717101,
"learning_rate": 2.8768915121067987e-05,
"loss": 0.3544,
"step": 393
},
{
"epoch": 2.102735156771181,
"grad_norm": 0.39839899451373395,
"learning_rate": 2.870170711694333e-05,
"loss": 0.3258,
"step": 394
},
{
"epoch": 2.1080720480320214,
"grad_norm": 0.42893629704061353,
"learning_rate": 2.8634377686812608e-05,
"loss": 0.3257,
"step": 395
},
{
"epoch": 2.113408939292862,
"grad_norm": 0.41063083754083984,
"learning_rate": 2.8566927770209153e-05,
"loss": 0.335,
"step": 396
},
{
"epoch": 2.1187458305537024,
"grad_norm": 0.48368270464573715,
"learning_rate": 2.8499358308347595e-05,
"loss": 0.3707,
"step": 397
},
{
"epoch": 2.124082721814543,
"grad_norm": 0.3961965622304182,
"learning_rate": 2.843167024411071e-05,
"loss": 0.3395,
"step": 398
},
{
"epoch": 2.1294196130753837,
"grad_norm": 0.4082398648072262,
"learning_rate": 2.8363864522036298e-05,
"loss": 0.3116,
"step": 399
},
{
"epoch": 2.134756504336224,
"grad_norm": 0.39241622775035234,
"learning_rate": 2.8295942088304004e-05,
"loss": 0.3331,
"step": 400
},
{
"epoch": 2.1400933955970647,
"grad_norm": 0.43575155152898126,
"learning_rate": 2.822790389072207e-05,
"loss": 0.3516,
"step": 401
},
{
"epoch": 2.145430286857905,
"grad_norm": 0.34607848217775505,
"learning_rate": 2.815975087871416e-05,
"loss": 0.2954,
"step": 402
},
{
"epoch": 2.1507671781187456,
"grad_norm": 0.40910730417092234,
"learning_rate": 2.8091484003306074e-05,
"loss": 0.3335,
"step": 403
},
{
"epoch": 2.1561040693795865,
"grad_norm": 0.3834996888062665,
"learning_rate": 2.802310421711252e-05,
"loss": 0.3293,
"step": 404
},
{
"epoch": 2.161440960640427,
"grad_norm": 0.39804942929539155,
"learning_rate": 2.7954612474323754e-05,
"loss": 0.35,
"step": 405
},
{
"epoch": 2.1667778519012675,
"grad_norm": 0.40500969458813774,
"learning_rate": 2.788600973069234e-05,
"loss": 0.3375,
"step": 406
},
{
"epoch": 2.172114743162108,
"grad_norm": 0.38166833304508063,
"learning_rate": 2.781729694351976e-05,
"loss": 0.3614,
"step": 407
},
{
"epoch": 2.1774516344229484,
"grad_norm": 0.39321975173629375,
"learning_rate": 2.7748475071643085e-05,
"loss": 0.3361,
"step": 408
},
{
"epoch": 2.1827885256837893,
"grad_norm": 0.4264354527637513,
"learning_rate": 2.7679545075421573e-05,
"loss": 0.3403,
"step": 409
},
{
"epoch": 2.18812541694463,
"grad_norm": 0.4009398961332454,
"learning_rate": 2.7610507916723283e-05,
"loss": 0.3494,
"step": 410
},
{
"epoch": 2.1934623082054703,
"grad_norm": 0.44735512944431943,
"learning_rate": 2.754136455891165e-05,
"loss": 0.3381,
"step": 411
},
{
"epoch": 2.1987991994663107,
"grad_norm": 0.40873569246310604,
"learning_rate": 2.7472115966832044e-05,
"loss": 0.32,
"step": 412
},
{
"epoch": 2.204136090727151,
"grad_norm": 0.451038278060695,
"learning_rate": 2.7402763106798295e-05,
"loss": 0.3537,
"step": 413
},
{
"epoch": 2.209472981987992,
"grad_norm": 0.43597576256513326,
"learning_rate": 2.733330694657921e-05,
"loss": 0.3341,
"step": 414
},
{
"epoch": 2.2148098732488326,
"grad_norm": 0.4216053478782035,
"learning_rate": 2.7263748455385098e-05,
"loss": 0.3496,
"step": 415
},
{
"epoch": 2.220146764509673,
"grad_norm": 0.4741703909383378,
"learning_rate": 2.719408860385421e-05,
"loss": 0.3387,
"step": 416
},
{
"epoch": 2.2254836557705135,
"grad_norm": 0.396307019176774,
"learning_rate": 2.7124328364039203e-05,
"loss": 0.3447,
"step": 417
},
{
"epoch": 2.230820547031354,
"grad_norm": 0.44800881081287386,
"learning_rate": 2.7054468709393575e-05,
"loss": 0.3513,
"step": 418
},
{
"epoch": 2.236157438292195,
"grad_norm": 0.3929341967270722,
"learning_rate": 2.6984510614758112e-05,
"loss": 0.3298,
"step": 419
},
{
"epoch": 2.2414943295530354,
"grad_norm": 0.4067007662384614,
"learning_rate": 2.6914455056347225e-05,
"loss": 0.3325,
"step": 420
},
{
"epoch": 2.246831220813876,
"grad_norm": 0.43892260225513113,
"learning_rate": 2.6844303011735385e-05,
"loss": 0.3449,
"step": 421
},
{
"epoch": 2.2521681120747163,
"grad_norm": 0.41729138409748806,
"learning_rate": 2.677405545984344e-05,
"loss": 0.3414,
"step": 422
},
{
"epoch": 2.257505003335557,
"grad_norm": 0.38213817747852563,
"learning_rate": 2.6703713380924993e-05,
"loss": 0.343,
"step": 423
},
{
"epoch": 2.2628418945963977,
"grad_norm": 0.38560789077218444,
"learning_rate": 2.6633277756552683e-05,
"loss": 0.3193,
"step": 424
},
{
"epoch": 2.268178785857238,
"grad_norm": 0.40286245157177797,
"learning_rate": 2.6562749569604527e-05,
"loss": 0.3133,
"step": 425
},
{
"epoch": 2.2735156771180787,
"grad_norm": 0.41596719722020303,
"learning_rate": 2.6492129804250173e-05,
"loss": 0.3523,
"step": 426
},
{
"epoch": 2.278852568378919,
"grad_norm": 0.4153442559638167,
"learning_rate": 2.642141944593718e-05,
"loss": 0.3541,
"step": 427
},
{
"epoch": 2.2841894596397596,
"grad_norm": 0.40946075262767867,
"learning_rate": 2.635061948137727e-05,
"loss": 0.3321,
"step": 428
},
{
"epoch": 2.2895263509006005,
"grad_norm": 0.3585516295174825,
"learning_rate": 2.6279730898532548e-05,
"loss": 0.3568,
"step": 429
},
{
"epoch": 2.294863242161441,
"grad_norm": 0.4126272108188889,
"learning_rate": 2.6208754686601735e-05,
"loss": 0.3522,
"step": 430
},
{
"epoch": 2.3002001334222815,
"grad_norm": 0.339353916561967,
"learning_rate": 2.613769183600634e-05,
"loss": 0.2944,
"step": 431
},
{
"epoch": 2.305537024683122,
"grad_norm": 0.4282861919791616,
"learning_rate": 2.6066543338376865e-05,
"loss": 0.3318,
"step": 432
},
{
"epoch": 2.3108739159439624,
"grad_norm": 0.35897497473677303,
"learning_rate": 2.599531018653893e-05,
"loss": 0.3378,
"step": 433
},
{
"epoch": 2.3162108072048033,
"grad_norm": 0.47411097531132296,
"learning_rate": 2.5923993374499475e-05,
"loss": 0.3662,
"step": 434
},
{
"epoch": 2.321547698465644,
"grad_norm": 0.3568620059406491,
"learning_rate": 2.585259389743284e-05,
"loss": 0.3143,
"step": 435
},
{
"epoch": 2.3268845897264843,
"grad_norm": 0.4473037089763921,
"learning_rate": 2.5781112751666886e-05,
"loss": 0.3753,
"step": 436
},
{
"epoch": 2.3322214809873247,
"grad_norm": 0.3799705626691493,
"learning_rate": 2.5709550934669123e-05,
"loss": 0.3361,
"step": 437
},
{
"epoch": 2.337558372248165,
"grad_norm": 0.39700940455368583,
"learning_rate": 2.5637909445032752e-05,
"loss": 0.3174,
"step": 438
},
{
"epoch": 2.342895263509006,
"grad_norm": 0.4519495883374496,
"learning_rate": 2.5566189282462766e-05,
"loss": 0.356,
"step": 439
},
{
"epoch": 2.3482321547698466,
"grad_norm": 0.3870700641532921,
"learning_rate": 2.549439144776195e-05,
"loss": 0.3332,
"step": 440
},
{
"epoch": 2.353569046030687,
"grad_norm": 0.4068180001517662,
"learning_rate": 2.542251694281699e-05,
"loss": 0.3342,
"step": 441
},
{
"epoch": 2.3589059372915275,
"grad_norm": 0.3931784325165116,
"learning_rate": 2.5350566770584423e-05,
"loss": 0.3316,
"step": 442
},
{
"epoch": 2.364242828552368,
"grad_norm": 0.44083065881755534,
"learning_rate": 2.5278541935076656e-05,
"loss": 0.3493,
"step": 443
},
{
"epoch": 2.369579719813209,
"grad_norm": 0.41781820434477857,
"learning_rate": 2.5206443441347995e-05,
"loss": 0.3334,
"step": 444
},
{
"epoch": 2.3749166110740494,
"grad_norm": 0.4211075137727358,
"learning_rate": 2.5134272295480587e-05,
"loss": 0.353,
"step": 445
},
{
"epoch": 2.38025350233489,
"grad_norm": 0.4276719699292506,
"learning_rate": 2.506202950457038e-05,
"loss": 0.3194,
"step": 446
},
{
"epoch": 2.3855903935957303,
"grad_norm": 0.3891436905777496,
"learning_rate": 2.4989716076713063e-05,
"loss": 0.3298,
"step": 447
},
{
"epoch": 2.390927284856571,
"grad_norm": 0.4136388035611674,
"learning_rate": 2.4917333020990045e-05,
"loss": 0.3316,
"step": 448
},
{
"epoch": 2.3962641761174117,
"grad_norm": 0.46318938516966707,
"learning_rate": 2.4844881347454326e-05,
"loss": 0.3561,
"step": 449
},
{
"epoch": 2.401601067378252,
"grad_norm": 0.4361718813430768,
"learning_rate": 2.477236206711641e-05,
"loss": 0.3353,
"step": 450
},
{
"epoch": 2.4069379586390927,
"grad_norm": 0.4231633561528134,
"learning_rate": 2.46997761919302e-05,
"loss": 0.3528,
"step": 451
},
{
"epoch": 2.412274849899933,
"grad_norm": 0.44841625308699595,
"learning_rate": 2.4627124734778905e-05,
"loss": 0.3407,
"step": 452
},
{
"epoch": 2.417611741160774,
"grad_norm": 0.368806895133852,
"learning_rate": 2.4554408709460873e-05,
"loss": 0.3171,
"step": 453
},
{
"epoch": 2.4229486324216145,
"grad_norm": 0.46212577157467083,
"learning_rate": 2.4481629130675444e-05,
"loss": 0.3593,
"step": 454
},
{
"epoch": 2.428285523682455,
"grad_norm": 0.4396967709166417,
"learning_rate": 2.4408787014008807e-05,
"loss": 0.3441,
"step": 455
},
{
"epoch": 2.4336224149432955,
"grad_norm": 0.47046726193724747,
"learning_rate": 2.4335883375919828e-05,
"loss": 0.354,
"step": 456
},
{
"epoch": 2.438959306204136,
"grad_norm": 0.4453837386684812,
"learning_rate": 2.4262919233725853e-05,
"loss": 0.3545,
"step": 457
},
{
"epoch": 2.444296197464977,
"grad_norm": 0.4077861792439572,
"learning_rate": 2.418989560558852e-05,
"loss": 0.3307,
"step": 458
},
{
"epoch": 2.4496330887258173,
"grad_norm": 0.4120092551712366,
"learning_rate": 2.411681351049954e-05,
"loss": 0.3594,
"step": 459
},
{
"epoch": 2.454969979986658,
"grad_norm": 0.3903774406478128,
"learning_rate": 2.404367396826651e-05,
"loss": 0.3511,
"step": 460
},
{
"epoch": 2.4603068712474983,
"grad_norm": 0.3537187224774452,
"learning_rate": 2.3970477999498648e-05,
"loss": 0.3413,
"step": 461
},
{
"epoch": 2.4656437625083387,
"grad_norm": 0.37223078801129084,
"learning_rate": 2.3897226625592555e-05,
"loss": 0.3289,
"step": 462
},
{
"epoch": 2.4709806537691796,
"grad_norm": 0.3724310146397289,
"learning_rate": 2.3823920868717982e-05,
"loss": 0.333,
"step": 463
},
{
"epoch": 2.47631754503002,
"grad_norm": 0.37821163484897535,
"learning_rate": 2.3750561751803563e-05,
"loss": 0.3397,
"step": 464
},
{
"epoch": 2.4816544362908606,
"grad_norm": 0.3963343238925445,
"learning_rate": 2.3677150298522513e-05,
"loss": 0.3728,
"step": 465
},
{
"epoch": 2.486991327551701,
"grad_norm": 0.36709987148577683,
"learning_rate": 2.3603687533278364e-05,
"loss": 0.3222,
"step": 466
},
{
"epoch": 2.4923282188125415,
"grad_norm": 0.37561130929516573,
"learning_rate": 2.3530174481190692e-05,
"loss": 0.3266,
"step": 467
},
{
"epoch": 2.4976651100733824,
"grad_norm": 0.3478504823013864,
"learning_rate": 2.3456612168080764e-05,
"loss": 0.3445,
"step": 468
},
{
"epoch": 2.503002001334223,
"grad_norm": 0.43376543680900886,
"learning_rate": 2.338300162045726e-05,
"loss": 0.3578,
"step": 469
},
{
"epoch": 2.5083388925950634,
"grad_norm": 0.3467248206638372,
"learning_rate": 2.330934386550194e-05,
"loss": 0.3527,
"step": 470
},
{
"epoch": 2.513675783855904,
"grad_norm": 0.3703030631269327,
"learning_rate": 2.32356399310553e-05,
"loss": 0.3372,
"step": 471
},
{
"epoch": 2.5190126751167448,
"grad_norm": 0.36617067977815226,
"learning_rate": 2.316189084560224e-05,
"loss": 0.3212,
"step": 472
},
{
"epoch": 2.524349566377585,
"grad_norm": 0.4109293782714893,
"learning_rate": 2.3088097638257722e-05,
"loss": 0.3669,
"step": 473
},
{
"epoch": 2.5296864576384257,
"grad_norm": 0.40880336771156084,
"learning_rate": 2.3014261338752376e-05,
"loss": 0.3373,
"step": 474
},
{
"epoch": 2.535023348899266,
"grad_norm": 0.41531532752412315,
"learning_rate": 2.294038297741817e-05,
"loss": 0.3412,
"step": 475
},
{
"epoch": 2.5403602401601066,
"grad_norm": 0.4088747291326209,
"learning_rate": 2.2866463585174007e-05,
"loss": 0.3429,
"step": 476
},
{
"epoch": 2.5456971314209476,
"grad_norm": 0.3575193305915961,
"learning_rate": 2.2792504193511338e-05,
"loss": 0.3681,
"step": 477
},
{
"epoch": 2.5510340226817876,
"grad_norm": 0.4035090318465378,
"learning_rate": 2.2718505834479787e-05,
"loss": 0.335,
"step": 478
},
{
"epoch": 2.5563709139426285,
"grad_norm": 0.34216276945535873,
"learning_rate": 2.2644469540672736e-05,
"loss": 0.3387,
"step": 479
},
{
"epoch": 2.561707805203469,
"grad_norm": 0.37240018289403937,
"learning_rate": 2.2570396345212932e-05,
"loss": 0.3565,
"step": 480
},
{
"epoch": 2.5670446964643094,
"grad_norm": 0.33808417077793806,
"learning_rate": 2.2496287281738033e-05,
"loss": 0.3349,
"step": 481
},
{
"epoch": 2.5723815877251504,
"grad_norm": 0.3746743257239068,
"learning_rate": 2.2422143384386222e-05,
"loss": 0.3495,
"step": 482
},
{
"epoch": 2.577718478985991,
"grad_norm": 0.3494328211369956,
"learning_rate": 2.234796568778178e-05,
"loss": 0.3434,
"step": 483
},
{
"epoch": 2.5830553702468313,
"grad_norm": 0.38271781562914736,
"learning_rate": 2.22737552270206e-05,
"loss": 0.3177,
"step": 484
},
{
"epoch": 2.5883922615076718,
"grad_norm": 0.37885165545443145,
"learning_rate": 2.219951303765579e-05,
"loss": 0.345,
"step": 485
},
{
"epoch": 2.5937291527685122,
"grad_norm": 0.417558398659155,
"learning_rate": 2.212524015568322e-05,
"loss": 0.3384,
"step": 486
},
{
"epoch": 2.599066044029353,
"grad_norm": 0.33797987587981193,
"learning_rate": 2.205093761752704e-05,
"loss": 0.3424,
"step": 487
},
{
"epoch": 2.6044029352901936,
"grad_norm": 0.3436020891272047,
"learning_rate": 2.197660646002523e-05,
"loss": 0.3098,
"step": 488
},
{
"epoch": 2.609739826551034,
"grad_norm": 0.36972858197550945,
"learning_rate": 2.190224772041512e-05,
"loss": 0.3573,
"step": 489
},
{
"epoch": 2.6150767178118746,
"grad_norm": 0.3473912972172743,
"learning_rate": 2.1827862436318964e-05,
"loss": 0.3435,
"step": 490
},
{
"epoch": 2.620413609072715,
"grad_norm": 0.355276880649726,
"learning_rate": 2.175345164572939e-05,
"loss": 0.3507,
"step": 491
},
{
"epoch": 2.625750500333556,
"grad_norm": 0.36822265084446165,
"learning_rate": 2.1679016386994972e-05,
"loss": 0.3484,
"step": 492
},
{
"epoch": 2.6310873915943964,
"grad_norm": 0.34120947707028404,
"learning_rate": 2.1604557698805707e-05,
"loss": 0.3581,
"step": 493
},
{
"epoch": 2.636424282855237,
"grad_norm": 0.3625974151488874,
"learning_rate": 2.153007662017854e-05,
"loss": 0.3486,
"step": 494
},
{
"epoch": 2.6417611741160774,
"grad_norm": 0.3810454285560133,
"learning_rate": 2.145557419044286e-05,
"loss": 0.3262,
"step": 495
},
{
"epoch": 2.647098065376918,
"grad_norm": 0.3567767494033327,
"learning_rate": 2.1381051449225977e-05,
"loss": 0.3523,
"step": 496
},
{
"epoch": 2.6524349566377587,
"grad_norm": 0.33557909084194115,
"learning_rate": 2.130650943643866e-05,
"loss": 0.3318,
"step": 497
},
{
"epoch": 2.657771847898599,
"grad_norm": 0.3835013976581898,
"learning_rate": 2.123194919226058e-05,
"loss": 0.3419,
"step": 498
},
{
"epoch": 2.6631087391594397,
"grad_norm": 0.3425374794771174,
"learning_rate": 2.1157371757125827e-05,
"loss": 0.3314,
"step": 499
},
{
"epoch": 2.66844563042028,
"grad_norm": 0.3613222705329777,
"learning_rate": 2.1082778171708355e-05,
"loss": 0.3367,
"step": 500
},
{
"epoch": 2.6737825216811206,
"grad_norm": 0.39030975504768284,
"learning_rate": 2.100816947690751e-05,
"loss": 0.3633,
"step": 501
},
{
"epoch": 2.6791194129419615,
"grad_norm": 0.34775441424791625,
"learning_rate": 2.0933546713833474e-05,
"loss": 0.3261,
"step": 502
},
{
"epoch": 2.684456304202802,
"grad_norm": 0.35619640082486737,
"learning_rate": 2.0858910923792725e-05,
"loss": 0.3468,
"step": 503
},
{
"epoch": 2.6897931954636425,
"grad_norm": 0.36749874498504337,
"learning_rate": 2.0784263148273537e-05,
"loss": 0.3222,
"step": 504
},
{
"epoch": 2.695130086724483,
"grad_norm": 0.32371066166225,
"learning_rate": 2.070960442893143e-05,
"loss": 0.3295,
"step": 505
},
{
"epoch": 2.7004669779853234,
"grad_norm": 0.35511531727159107,
"learning_rate": 2.0634935807574633e-05,
"loss": 0.3412,
"step": 506
},
{
"epoch": 2.7058038692461643,
"grad_norm": 0.36830566559856426,
"learning_rate": 2.0560258326149557e-05,
"loss": 0.3419,
"step": 507
},
{
"epoch": 2.711140760507005,
"grad_norm": 0.34429904324010957,
"learning_rate": 2.0485573026726243e-05,
"loss": 0.3339,
"step": 508
},
{
"epoch": 2.7164776517678453,
"grad_norm": 0.3237344051103606,
"learning_rate": 2.041088095148383e-05,
"loss": 0.3129,
"step": 509
},
{
"epoch": 2.7218145430286858,
"grad_norm": 0.34535776524886275,
"learning_rate": 2.0336183142696006e-05,
"loss": 0.3493,
"step": 510
},
{
"epoch": 2.727151434289526,
"grad_norm": 0.37790756994617064,
"learning_rate": 2.0261480642716462e-05,
"loss": 0.3668,
"step": 511
},
{
"epoch": 2.732488325550367,
"grad_norm": 0.31766118054327325,
"learning_rate": 2.018677449396437e-05,
"loss": 0.3307,
"step": 512
},
{
"epoch": 2.7378252168112076,
"grad_norm": 0.36231855507183786,
"learning_rate": 2.01120657389098e-05,
"loss": 0.3419,
"step": 513
},
{
"epoch": 2.743162108072048,
"grad_norm": 0.3366225474904379,
"learning_rate": 2.0037355420059193e-05,
"loss": 0.3281,
"step": 514
},
{
"epoch": 2.7484989993328885,
"grad_norm": 0.3288307627791964,
"learning_rate": 1.9962644579940814e-05,
"loss": 0.3446,
"step": 515
},
{
"epoch": 2.753835890593729,
"grad_norm": 0.4002909383646017,
"learning_rate": 1.988793426109021e-05,
"loss": 0.3407,
"step": 516
},
{
"epoch": 2.75917278185457,
"grad_norm": 0.361890263436206,
"learning_rate": 1.9813225506035637e-05,
"loss": 0.3573,
"step": 517
},
{
"epoch": 2.7645096731154104,
"grad_norm": 0.36972409412445767,
"learning_rate": 1.973851935728354e-05,
"loss": 0.3386,
"step": 518
},
{
"epoch": 2.769846564376251,
"grad_norm": 0.3890677812342347,
"learning_rate": 1.9663816857304005e-05,
"loss": 0.3526,
"step": 519
},
{
"epoch": 2.7751834556370913,
"grad_norm": 0.3515870789484941,
"learning_rate": 1.9589119048516177e-05,
"loss": 0.327,
"step": 520
},
{
"epoch": 2.780520346897932,
"grad_norm": 0.33110583319058595,
"learning_rate": 1.951442697327376e-05,
"loss": 0.3114,
"step": 521
},
{
"epoch": 2.7858572381587727,
"grad_norm": 0.3644018882302513,
"learning_rate": 1.943974167385045e-05,
"loss": 0.3222,
"step": 522
},
{
"epoch": 2.791194129419613,
"grad_norm": 0.3435318930149615,
"learning_rate": 1.936506419242537e-05,
"loss": 0.3289,
"step": 523
},
{
"epoch": 2.7965310206804537,
"grad_norm": 0.3927187524254616,
"learning_rate": 1.9290395571068573e-05,
"loss": 0.3519,
"step": 524
},
{
"epoch": 2.801867911941294,
"grad_norm": 0.3388044424262462,
"learning_rate": 1.921573685172647e-05,
"loss": 0.3167,
"step": 525
},
{
"epoch": 2.8072048032021346,
"grad_norm": 0.3587054307128672,
"learning_rate": 1.914108907620728e-05,
"loss": 0.3389,
"step": 526
},
{
"epoch": 2.8125416944629755,
"grad_norm": 0.3485342273967904,
"learning_rate": 1.9066453286166536e-05,
"loss": 0.3412,
"step": 527
},
{
"epoch": 2.817878585723816,
"grad_norm": 0.3341586078617526,
"learning_rate": 1.8991830523092497e-05,
"loss": 0.3216,
"step": 528
},
{
"epoch": 2.8232154769846565,
"grad_norm": 0.37102695451617046,
"learning_rate": 1.8917221828291652e-05,
"loss": 0.356,
"step": 529
},
{
"epoch": 2.828552368245497,
"grad_norm": 0.3137005027651722,
"learning_rate": 1.8842628242874187e-05,
"loss": 0.3256,
"step": 530
},
{
"epoch": 2.8338892595063374,
"grad_norm": 0.39850109158745595,
"learning_rate": 1.8768050807739425e-05,
"loss": 0.3387,
"step": 531
},
{
"epoch": 2.8392261507671783,
"grad_norm": 0.36284113759820136,
"learning_rate": 1.8693490563561343e-05,
"loss": 0.3234,
"step": 532
},
{
"epoch": 2.844563042028019,
"grad_norm": 0.3420275126150477,
"learning_rate": 1.8618948550774033e-05,
"loss": 0.3297,
"step": 533
},
{
"epoch": 2.8498999332888593,
"grad_norm": 0.39257170087476245,
"learning_rate": 1.854442580955715e-05,
"loss": 0.3321,
"step": 534
},
{
"epoch": 2.8552368245496997,
"grad_norm": 0.3523635968791173,
"learning_rate": 1.846992337982147e-05,
"loss": 0.3585,
"step": 535
},
{
"epoch": 2.86057371581054,
"grad_norm": 0.337680075516661,
"learning_rate": 1.83954423011943e-05,
"loss": 0.3269,
"step": 536
},
{
"epoch": 2.865910607071381,
"grad_norm": 0.3968837701031737,
"learning_rate": 1.832098361300503e-05,
"loss": 0.3644,
"step": 537
},
{
"epoch": 2.8712474983322216,
"grad_norm": 0.35722398722368337,
"learning_rate": 1.8246548354270616e-05,
"loss": 0.335,
"step": 538
},
{
"epoch": 2.876584389593062,
"grad_norm": 0.3572447721264452,
"learning_rate": 1.8172137563681042e-05,
"loss": 0.3513,
"step": 539
},
{
"epoch": 2.8819212808539025,
"grad_norm": 0.3689075039010707,
"learning_rate": 1.809775227958488e-05,
"loss": 0.3479,
"step": 540
},
{
"epoch": 2.887258172114743,
"grad_norm": 0.4157671175592236,
"learning_rate": 1.802339353997478e-05,
"loss": 0.3629,
"step": 541
},
{
"epoch": 2.892595063375584,
"grad_norm": 0.3447307730764763,
"learning_rate": 1.7949062382472967e-05,
"loss": 0.3172,
"step": 542
},
{
"epoch": 2.8979319546364244,
"grad_norm": 0.3990421667679026,
"learning_rate": 1.787475984431678e-05,
"loss": 0.3568,
"step": 543
},
{
"epoch": 2.903268845897265,
"grad_norm": 0.3284492948137503,
"learning_rate": 1.7800486962344213e-05,
"loss": 0.3232,
"step": 544
},
{
"epoch": 2.9086057371581053,
"grad_norm": 0.3564409700384773,
"learning_rate": 1.7726244772979408e-05,
"loss": 0.3262,
"step": 545
},
{
"epoch": 2.913942628418946,
"grad_norm": 0.38056209814773034,
"learning_rate": 1.7652034312218234e-05,
"loss": 0.3504,
"step": 546
},
{
"epoch": 2.9192795196797867,
"grad_norm": 0.33330941463187824,
"learning_rate": 1.757785661561378e-05,
"loss": 0.3503,
"step": 547
},
{
"epoch": 2.924616410940627,
"grad_norm": 0.32533655359519337,
"learning_rate": 1.7503712718261977e-05,
"loss": 0.3306,
"step": 548
},
{
"epoch": 2.9299533022014677,
"grad_norm": 0.34762836109457435,
"learning_rate": 1.7429603654787078e-05,
"loss": 0.3464,
"step": 549
},
{
"epoch": 2.935290193462308,
"grad_norm": 0.355505370708806,
"learning_rate": 1.7355530459327267e-05,
"loss": 0.3355,
"step": 550
},
{
"epoch": 2.9406270847231486,
"grad_norm": 0.34476430444950434,
"learning_rate": 1.7281494165520217e-05,
"loss": 0.3215,
"step": 551
},
{
"epoch": 2.9459639759839895,
"grad_norm": 0.34463395158247007,
"learning_rate": 1.7207495806488672e-05,
"loss": 0.3286,
"step": 552
},
{
"epoch": 2.95130086724483,
"grad_norm": 0.39694696141098895,
"learning_rate": 1.7133536414826e-05,
"loss": 0.348,
"step": 553
},
{
"epoch": 2.9566377585056705,
"grad_norm": 0.3414356307623428,
"learning_rate": 1.705961702258183e-05,
"loss": 0.3266,
"step": 554
},
{
"epoch": 2.961974649766511,
"grad_norm": 0.34062712306187115,
"learning_rate": 1.6985738661247627e-05,
"loss": 0.3245,
"step": 555
},
{
"epoch": 2.9673115410273514,
"grad_norm": 0.35420373579360254,
"learning_rate": 1.691190236174228e-05,
"loss": 0.3094,
"step": 556
},
{
"epoch": 2.9726484322881923,
"grad_norm": 0.35393965464453037,
"learning_rate": 1.6838109154397764e-05,
"loss": 0.3636,
"step": 557
},
{
"epoch": 2.977985323549033,
"grad_norm": 0.3398430528475795,
"learning_rate": 1.6764360068944706e-05,
"loss": 0.3489,
"step": 558
},
{
"epoch": 2.9833222148098733,
"grad_norm": 0.36959208816642103,
"learning_rate": 1.6690656134498063e-05,
"loss": 0.3545,
"step": 559
},
{
"epoch": 2.9886591060707137,
"grad_norm": 0.3244411298894109,
"learning_rate": 1.661699837954275e-05,
"loss": 0.3024,
"step": 560
},
{
"epoch": 2.993995997331554,
"grad_norm": 0.31714532814318064,
"learning_rate": 1.6543387831919243e-05,
"loss": 0.3196,
"step": 561
},
{
"epoch": 2.999332888592395,
"grad_norm": 0.49216959263692955,
"learning_rate": 1.646982551880931e-05,
"loss": 0.4825,
"step": 562
},
{
"epoch": 3.0046697798532356,
"grad_norm": 0.6375873638749311,
"learning_rate": 1.639631246672164e-05,
"loss": 0.3264,
"step": 563
},
{
"epoch": 3.010006671114076,
"grad_norm": 0.4465753271154956,
"learning_rate": 1.632284970147749e-05,
"loss": 0.2328,
"step": 564
},
{
"epoch": 3.0153435623749165,
"grad_norm": 0.4868378113812574,
"learning_rate": 1.6249438248196437e-05,
"loss": 0.2209,
"step": 565
},
{
"epoch": 3.020680453635757,
"grad_norm": 0.7954769693611728,
"learning_rate": 1.617607913128202e-05,
"loss": 0.2305,
"step": 566
},
{
"epoch": 3.026017344896598,
"grad_norm": 0.4733054296578894,
"learning_rate": 1.610277337440745e-05,
"loss": 0.231,
"step": 567
},
{
"epoch": 3.0313542361574384,
"grad_norm": 0.44905546773789157,
"learning_rate": 1.6029522000501362e-05,
"loss": 0.2129,
"step": 568
},
{
"epoch": 3.036691127418279,
"grad_norm": 0.46572310673573625,
"learning_rate": 1.5956326031733496e-05,
"loss": 0.2092,
"step": 569
},
{
"epoch": 3.0420280186791193,
"grad_norm": 0.46263530375828704,
"learning_rate": 1.5883186489500465e-05,
"loss": 0.2214,
"step": 570
},
{
"epoch": 3.04736490993996,
"grad_norm": 0.4153475909168177,
"learning_rate": 1.5810104394411494e-05,
"loss": 0.2374,
"step": 571
},
{
"epoch": 3.0527018012008007,
"grad_norm": 0.3926765936070903,
"learning_rate": 1.5737080766274154e-05,
"loss": 0.2181,
"step": 572
},
{
"epoch": 3.058038692461641,
"grad_norm": 0.4423780999916496,
"learning_rate": 1.5664116624080176e-05,
"loss": 0.2263,
"step": 573
},
{
"epoch": 3.0633755837224816,
"grad_norm": 0.48104075830395865,
"learning_rate": 1.55912129859912e-05,
"loss": 0.2299,
"step": 574
},
{
"epoch": 3.068712474983322,
"grad_norm": 0.34802553512115314,
"learning_rate": 1.5518370869324562e-05,
"loss": 0.2038,
"step": 575
},
{
"epoch": 3.0740493662441626,
"grad_norm": 0.4024921353804397,
"learning_rate": 1.5445591290539133e-05,
"loss": 0.2306,
"step": 576
},
{
"epoch": 3.0793862575050035,
"grad_norm": 0.39223703505556246,
"learning_rate": 1.5372875265221098e-05,
"loss": 0.2146,
"step": 577
},
{
"epoch": 3.084723148765844,
"grad_norm": 0.37616914852450856,
"learning_rate": 1.53002238080698e-05,
"loss": 0.2215,
"step": 578
},
{
"epoch": 3.0900600400266844,
"grad_norm": 0.38195507124837264,
"learning_rate": 1.5227637932883603e-05,
"loss": 0.2008,
"step": 579
},
{
"epoch": 3.095396931287525,
"grad_norm": 0.37714457548841984,
"learning_rate": 1.515511865254568e-05,
"loss": 0.2257,
"step": 580
},
{
"epoch": 3.1007338225483654,
"grad_norm": 0.36734917441929993,
"learning_rate": 1.5082666979009953e-05,
"loss": 0.2081,
"step": 581
},
{
"epoch": 3.1060707138092063,
"grad_norm": 0.38769476025178157,
"learning_rate": 1.5010283923286944e-05,
"loss": 0.2271,
"step": 582
},
{
"epoch": 3.1114076050700468,
"grad_norm": 0.3586607196800494,
"learning_rate": 1.493797049542963e-05,
"loss": 0.2021,
"step": 583
},
{
"epoch": 3.1167444963308872,
"grad_norm": 0.37496022310615634,
"learning_rate": 1.4865727704519416e-05,
"loss": 0.2443,
"step": 584
},
{
"epoch": 3.1220813875917277,
"grad_norm": 0.34669037639542094,
"learning_rate": 1.4793556558652012e-05,
"loss": 0.2221,
"step": 585
},
{
"epoch": 3.127418278852568,
"grad_norm": 0.33087385453332585,
"learning_rate": 1.472145806492335e-05,
"loss": 0.2068,
"step": 586
},
{
"epoch": 3.132755170113409,
"grad_norm": 0.3375224847458593,
"learning_rate": 1.4649433229415588e-05,
"loss": 0.2167,
"step": 587
},
{
"epoch": 3.1380920613742496,
"grad_norm": 0.34607209711469494,
"learning_rate": 1.457748305718301e-05,
"loss": 0.2175,
"step": 588
},
{
"epoch": 3.14342895263509,
"grad_norm": 0.330025155691137,
"learning_rate": 1.4505608552238047e-05,
"loss": 0.2341,
"step": 589
},
{
"epoch": 3.1487658438959305,
"grad_norm": 0.34147253971194746,
"learning_rate": 1.4433810717537244e-05,
"loss": 0.2146,
"step": 590
},
{
"epoch": 3.154102735156771,
"grad_norm": 0.3453098452736006,
"learning_rate": 1.436209055496725e-05,
"loss": 0.2024,
"step": 591
},
{
"epoch": 3.159439626417612,
"grad_norm": 0.3466365634852416,
"learning_rate": 1.429044906533088e-05,
"loss": 0.242,
"step": 592
},
{
"epoch": 3.1647765176784524,
"grad_norm": 0.3290618234863436,
"learning_rate": 1.4218887248333123e-05,
"loss": 0.2111,
"step": 593
},
{
"epoch": 3.170113408939293,
"grad_norm": 0.351656000947437,
"learning_rate": 1.414740610256717e-05,
"loss": 0.225,
"step": 594
},
{
"epoch": 3.1754503002001333,
"grad_norm": 0.3176619140403144,
"learning_rate": 1.4076006625500526e-05,
"loss": 0.2234,
"step": 595
},
{
"epoch": 3.1807871914609738,
"grad_norm": 0.3355617376830507,
"learning_rate": 1.4004689813461072e-05,
"loss": 0.2105,
"step": 596
},
{
"epoch": 3.1861240827218147,
"grad_norm": 0.3361984337758312,
"learning_rate": 1.3933456661623142e-05,
"loss": 0.2243,
"step": 597
},
{
"epoch": 3.191460973982655,
"grad_norm": 0.3271843509477646,
"learning_rate": 1.3862308163993667e-05,
"loss": 0.2094,
"step": 598
},
{
"epoch": 3.1967978652434956,
"grad_norm": 0.3571117855172697,
"learning_rate": 1.379124531339827e-05,
"loss": 0.2162,
"step": 599
},
{
"epoch": 3.202134756504336,
"grad_norm": 0.3273165917601703,
"learning_rate": 1.3720269101467454e-05,
"loss": 0.2061,
"step": 600
},
{
"epoch": 3.2074716477651766,
"grad_norm": 0.3284542388193666,
"learning_rate": 1.364938051862274e-05,
"loss": 0.2066,
"step": 601
},
{
"epoch": 3.2128085390260175,
"grad_norm": 0.36447675615077985,
"learning_rate": 1.3578580554062826e-05,
"loss": 0.2281,
"step": 602
},
{
"epoch": 3.218145430286858,
"grad_norm": 0.3262380092074158,
"learning_rate": 1.3507870195749829e-05,
"loss": 0.203,
"step": 603
},
{
"epoch": 3.2234823215476984,
"grad_norm": 0.3457313320643053,
"learning_rate": 1.3437250430395478e-05,
"loss": 0.2379,
"step": 604
},
{
"epoch": 3.228819212808539,
"grad_norm": 0.3441380900113131,
"learning_rate": 1.336672224344732e-05,
"loss": 0.2245,
"step": 605
},
{
"epoch": 3.2341561040693794,
"grad_norm": 0.32996002379601147,
"learning_rate": 1.3296286619075016e-05,
"loss": 0.2227,
"step": 606
},
{
"epoch": 3.2394929953302203,
"grad_norm": 0.3282894261484981,
"learning_rate": 1.3225944540156565e-05,
"loss": 0.2046,
"step": 607
},
{
"epoch": 3.2448298865910608,
"grad_norm": 0.32710033563774793,
"learning_rate": 1.3155696988264621e-05,
"loss": 0.1997,
"step": 608
},
{
"epoch": 3.2501667778519012,
"grad_norm": 0.3762040570145167,
"learning_rate": 1.3085544943652783e-05,
"loss": 0.235,
"step": 609
},
{
"epoch": 3.2555036691127417,
"grad_norm": 0.3440879224425546,
"learning_rate": 1.3015489385241895e-05,
"loss": 0.2028,
"step": 610
},
{
"epoch": 3.260840560373582,
"grad_norm": 0.38071227441207245,
"learning_rate": 1.2945531290606423e-05,
"loss": 0.2346,
"step": 611
},
{
"epoch": 3.266177451634423,
"grad_norm": 0.31980915182543196,
"learning_rate": 1.2875671635960807e-05,
"loss": 0.214,
"step": 612
},
{
"epoch": 3.2715143428952635,
"grad_norm": 0.35716629327114985,
"learning_rate": 1.2805911396145794e-05,
"loss": 0.2117,
"step": 613
},
{
"epoch": 3.276851234156104,
"grad_norm": 0.34571819244635243,
"learning_rate": 1.2736251544614903e-05,
"loss": 0.2304,
"step": 614
},
{
"epoch": 3.2821881254169445,
"grad_norm": 0.3208986289612815,
"learning_rate": 1.2666693053420795e-05,
"loss": 0.218,
"step": 615
},
{
"epoch": 3.287525016677785,
"grad_norm": 0.3567717531275436,
"learning_rate": 1.2597236893201712e-05,
"loss": 0.2036,
"step": 616
},
{
"epoch": 3.292861907938626,
"grad_norm": 0.32882485699097,
"learning_rate": 1.2527884033167966e-05,
"loss": 0.2248,
"step": 617
},
{
"epoch": 3.2981987991994663,
"grad_norm": 0.33037504825082553,
"learning_rate": 1.2458635441088354e-05,
"loss": 0.2136,
"step": 618
},
{
"epoch": 3.303535690460307,
"grad_norm": 0.3715971284418149,
"learning_rate": 1.2389492083276719e-05,
"loss": 0.2321,
"step": 619
},
{
"epoch": 3.3088725817211473,
"grad_norm": 0.33125816705438366,
"learning_rate": 1.2320454924578435e-05,
"loss": 0.2364,
"step": 620
},
{
"epoch": 3.3142094729819878,
"grad_norm": 0.33161675743461216,
"learning_rate": 1.225152492835692e-05,
"loss": 0.2092,
"step": 621
},
{
"epoch": 3.3195463642428287,
"grad_norm": 0.34978398552096673,
"learning_rate": 1.2182703056480243e-05,
"loss": 0.2293,
"step": 622
},
{
"epoch": 3.324883255503669,
"grad_norm": 0.32364794551815557,
"learning_rate": 1.211399026930767e-05,
"loss": 0.2204,
"step": 623
},
{
"epoch": 3.3302201467645096,
"grad_norm": 0.3499702901876252,
"learning_rate": 1.2045387525676253e-05,
"loss": 0.2159,
"step": 624
},
{
"epoch": 3.33555703802535,
"grad_norm": 0.29554206232950536,
"learning_rate": 1.1976895782887488e-05,
"loss": 0.199,
"step": 625
},
{
"epoch": 3.3408939292861906,
"grad_norm": 0.3556138967561671,
"learning_rate": 1.1908515996693927e-05,
"loss": 0.2231,
"step": 626
},
{
"epoch": 3.3462308205470315,
"grad_norm": 0.3363266594913331,
"learning_rate": 1.1840249121285843e-05,
"loss": 0.2252,
"step": 627
},
{
"epoch": 3.351567711807872,
"grad_norm": 0.29662341791312835,
"learning_rate": 1.1772096109277937e-05,
"loss": 0.2,
"step": 628
},
{
"epoch": 3.3569046030687124,
"grad_norm": 0.32468090780131365,
"learning_rate": 1.1704057911696003e-05,
"loss": 0.2124,
"step": 629
},
{
"epoch": 3.362241494329553,
"grad_norm": 0.33364365893577863,
"learning_rate": 1.1636135477963702e-05,
"loss": 0.2418,
"step": 630
},
{
"epoch": 3.3675783855903934,
"grad_norm": 0.32925700089942483,
"learning_rate": 1.15683297558893e-05,
"loss": 0.2139,
"step": 631
},
{
"epoch": 3.3729152768512343,
"grad_norm": 0.3328549235968737,
"learning_rate": 1.1500641691652412e-05,
"loss": 0.2165,
"step": 632
},
{
"epoch": 3.3782521681120747,
"grad_norm": 0.30882570277397,
"learning_rate": 1.1433072229790847e-05,
"loss": 0.2128,
"step": 633
},
{
"epoch": 3.383589059372915,
"grad_norm": 0.3370088932960149,
"learning_rate": 1.1365622313187402e-05,
"loss": 0.2289,
"step": 634
},
{
"epoch": 3.3889259506337557,
"grad_norm": 0.3419578297439491,
"learning_rate": 1.1298292883056682e-05,
"loss": 0.2295,
"step": 635
},
{
"epoch": 3.394262841894596,
"grad_norm": 0.310821271131648,
"learning_rate": 1.1231084878932018e-05,
"loss": 0.1937,
"step": 636
},
{
"epoch": 3.399599733155437,
"grad_norm": 0.32752165763324353,
"learning_rate": 1.1163999238652328e-05,
"loss": 0.2342,
"step": 637
},
{
"epoch": 3.4049366244162775,
"grad_norm": 0.33007003787949457,
"learning_rate": 1.109703689834901e-05,
"loss": 0.2068,
"step": 638
},
{
"epoch": 3.410273515677118,
"grad_norm": 0.34735756204554885,
"learning_rate": 1.1030198792432915e-05,
"loss": 0.2414,
"step": 639
},
{
"epoch": 3.4156104069379585,
"grad_norm": 0.305560108048619,
"learning_rate": 1.09634858535813e-05,
"loss": 0.2097,
"step": 640
},
{
"epoch": 3.4209472981987994,
"grad_norm": 0.3389861898800808,
"learning_rate": 1.089689901272479e-05,
"loss": 0.2127,
"step": 641
},
{
"epoch": 3.42628418945964,
"grad_norm": 0.3207807998603898,
"learning_rate": 1.0830439199034424e-05,
"loss": 0.2226,
"step": 642
},
{
"epoch": 3.4316210807204803,
"grad_norm": 0.3264054440024789,
"learning_rate": 1.0764107339908643e-05,
"loss": 0.222,
"step": 643
},
{
"epoch": 3.436957971981321,
"grad_norm": 0.32600813384811467,
"learning_rate": 1.0697904360960392e-05,
"loss": 0.209,
"step": 644
},
{
"epoch": 3.4422948632421613,
"grad_norm": 0.3177047337904919,
"learning_rate": 1.06318311860042e-05,
"loss": 0.2089,
"step": 645
},
{
"epoch": 3.447631754503002,
"grad_norm": 0.3210423679681483,
"learning_rate": 1.0565888737043238e-05,
"loss": 0.2274,
"step": 646
},
{
"epoch": 3.4529686457638427,
"grad_norm": 0.3142756745691945,
"learning_rate": 1.050007793425653e-05,
"loss": 0.208,
"step": 647
},
{
"epoch": 3.458305537024683,
"grad_norm": 0.3189823143454847,
"learning_rate": 1.0434399695986038e-05,
"loss": 0.237,
"step": 648
},
{
"epoch": 3.4636424282855236,
"grad_norm": 0.35446119159135153,
"learning_rate": 1.0368854938723909e-05,
"loss": 0.2257,
"step": 649
},
{
"epoch": 3.468979319546364,
"grad_norm": 0.30379218539151415,
"learning_rate": 1.0303444577099657e-05,
"loss": 0.2055,
"step": 650
},
{
"epoch": 3.474316210807205,
"grad_norm": 0.3334793611412552,
"learning_rate": 1.023816952386738e-05,
"loss": 0.2259,
"step": 651
},
{
"epoch": 3.4796531020680455,
"grad_norm": 0.31245098736700866,
"learning_rate": 1.0173030689893073e-05,
"loss": 0.217,
"step": 652
},
{
"epoch": 3.484989993328886,
"grad_norm": 0.298270066885541,
"learning_rate": 1.010802898414188e-05,
"loss": 0.1971,
"step": 653
},
{
"epoch": 3.4903268845897264,
"grad_norm": 0.3270591833255489,
"learning_rate": 1.0043165313665408e-05,
"loss": 0.2278,
"step": 654
},
{
"epoch": 3.495663775850567,
"grad_norm": 0.305730303527883,
"learning_rate": 9.978440583589097e-06,
"loss": 0.2177,
"step": 655
},
{
"epoch": 3.5010006671114073,
"grad_norm": 0.3144303533888383,
"learning_rate": 9.913855697099581e-06,
"loss": 0.2244,
"step": 656
},
{
"epoch": 3.5063375583722483,
"grad_norm": 0.3186316247887544,
"learning_rate": 9.84941155543205e-06,
"loss": 0.2117,
"step": 657
},
{
"epoch": 3.5116744496330887,
"grad_norm": 0.3338362484473223,
"learning_rate": 9.785109057857724e-06,
"loss": 0.2175,
"step": 658
},
{
"epoch": 3.517011340893929,
"grad_norm": 0.33877190549699376,
"learning_rate": 9.720949101671283e-06,
"loss": 0.2072,
"step": 659
},
{
"epoch": 3.52234823215477,
"grad_norm": 0.2878815210471794,
"learning_rate": 9.65693258217834e-06,
"loss": 0.2024,
"step": 660
},
{
"epoch": 3.52768512341561,
"grad_norm": 0.3229015014259361,
"learning_rate": 9.59306039268296e-06,
"loss": 0.2309,
"step": 661
},
{
"epoch": 3.533022014676451,
"grad_norm": 0.329774980311716,
"learning_rate": 9.529333424475165e-06,
"loss": 0.21,
"step": 662
},
{
"epoch": 3.5383589059372915,
"grad_norm": 0.3254821489847073,
"learning_rate": 9.465752566818545e-06,
"loss": 0.2255,
"step": 663
},
{
"epoch": 3.543695797198132,
"grad_norm": 0.3318586960366466,
"learning_rate": 9.402318706937818e-06,
"loss": 0.2298,
"step": 664
},
{
"epoch": 3.549032688458973,
"grad_norm": 0.3116201867034196,
"learning_rate": 9.33903273000644e-06,
"loss": 0.2054,
"step": 665
},
{
"epoch": 3.554369579719813,
"grad_norm": 0.31694579049650856,
"learning_rate": 9.275895519134284e-06,
"loss": 0.222,
"step": 666
},
{
"epoch": 3.559706470980654,
"grad_norm": 0.32312031003526426,
"learning_rate": 9.212907955355302e-06,
"loss": 0.2173,
"step": 667
},
{
"epoch": 3.5650433622414943,
"grad_norm": 0.32535627286627056,
"learning_rate": 9.150070917615209e-06,
"loss": 0.224,
"step": 668
},
{
"epoch": 3.570380253502335,
"grad_norm": 0.32754134685676567,
"learning_rate": 9.087385282759262e-06,
"loss": 0.2056,
"step": 669
},
{
"epoch": 3.5757171447631757,
"grad_norm": 0.32097631798875437,
"learning_rate": 9.024851925519984e-06,
"loss": 0.2167,
"step": 670
},
{
"epoch": 3.581054036024016,
"grad_norm": 0.3224914181755297,
"learning_rate": 8.962471718504981e-06,
"loss": 0.2318,
"step": 671
},
{
"epoch": 3.5863909272848566,
"grad_norm": 0.3069463307574391,
"learning_rate": 8.90024553218477e-06,
"loss": 0.2147,
"step": 672
},
{
"epoch": 3.591727818545697,
"grad_norm": 0.34503320829284617,
"learning_rate": 8.838174234880595e-06,
"loss": 0.2115,
"step": 673
},
{
"epoch": 3.5970647098065376,
"grad_norm": 0.33721115686062114,
"learning_rate": 8.776258692752355e-06,
"loss": 0.2125,
"step": 674
},
{
"epoch": 3.6024016010673785,
"grad_norm": 0.2988266857938936,
"learning_rate": 8.714499769786504e-06,
"loss": 0.2068,
"step": 675
},
{
"epoch": 3.607738492328219,
"grad_norm": 0.33052375619924823,
"learning_rate": 8.652898327783966e-06,
"loss": 0.2081,
"step": 676
},
{
"epoch": 3.6130753835890594,
"grad_norm": 0.2972205217005894,
"learning_rate": 8.591455226348153e-06,
"loss": 0.2165,
"step": 677
},
{
"epoch": 3.6184122748499,
"grad_norm": 0.29969511000575444,
"learning_rate": 8.530171322872943e-06,
"loss": 0.2013,
"step": 678
},
{
"epoch": 3.6237491661107404,
"grad_norm": 0.3304499281322153,
"learning_rate": 8.469047472530721e-06,
"loss": 0.2355,
"step": 679
},
{
"epoch": 3.6290860573715813,
"grad_norm": 0.290277414481082,
"learning_rate": 8.408084528260454e-06,
"loss": 0.2072,
"step": 680
},
{
"epoch": 3.6344229486324218,
"grad_norm": 0.33406571248792316,
"learning_rate": 8.347283340755762e-06,
"loss": 0.2103,
"step": 681
},
{
"epoch": 3.6397598398932622,
"grad_norm": 0.33772777521240266,
"learning_rate": 8.286644758453084e-06,
"loss": 0.2277,
"step": 682
},
{
"epoch": 3.6450967311541027,
"grad_norm": 0.32580165533637756,
"learning_rate": 8.226169627519829e-06,
"loss": 0.1972,
"step": 683
},
{
"epoch": 3.650433622414943,
"grad_norm": 0.327604705269643,
"learning_rate": 8.165858791842531e-06,
"loss": 0.2301,
"step": 684
},
{
"epoch": 3.655770513675784,
"grad_norm": 0.30673340194726156,
"learning_rate": 8.10571309301513e-06,
"loss": 0.2214,
"step": 685
},
{
"epoch": 3.6611074049366246,
"grad_norm": 0.3082993998431756,
"learning_rate": 8.045733370327197e-06,
"loss": 0.2142,
"step": 686
},
{
"epoch": 3.666444296197465,
"grad_norm": 0.3264898640096142,
"learning_rate": 7.98592046075221e-06,
"loss": 0.2239,
"step": 687
},
{
"epoch": 3.6717811874583055,
"grad_norm": 0.3267139343805825,
"learning_rate": 7.926275198935915e-06,
"loss": 0.2174,
"step": 688
},
{
"epoch": 3.677118078719146,
"grad_norm": 0.3038869171361146,
"learning_rate": 7.866798417184631e-06,
"loss": 0.2167,
"step": 689
},
{
"epoch": 3.682454969979987,
"grad_norm": 0.3153766070982165,
"learning_rate": 7.807490945453675e-06,
"loss": 0.2214,
"step": 690
},
{
"epoch": 3.6877918612408274,
"grad_norm": 0.3359093682690147,
"learning_rate": 7.748353611335772e-06,
"loss": 0.2282,
"step": 691
},
{
"epoch": 3.693128752501668,
"grad_norm": 0.3235870658057771,
"learning_rate": 7.689387240049475e-06,
"loss": 0.215,
"step": 692
},
{
"epoch": 3.6984656437625083,
"grad_norm": 0.32596212481215736,
"learning_rate": 7.6305926544277e-06,
"loss": 0.2204,
"step": 693
},
{
"epoch": 3.7038025350233488,
"grad_norm": 0.3089795878726115,
"learning_rate": 7.571970674906212e-06,
"loss": 0.2013,
"step": 694
},
{
"epoch": 3.7091394262841897,
"grad_norm": 0.3036096075117797,
"learning_rate": 7.513522119512171e-06,
"loss": 0.2128,
"step": 695
},
{
"epoch": 3.71447631754503,
"grad_norm": 0.31370142673701046,
"learning_rate": 7.455247803852741e-06,
"loss": 0.2153,
"step": 696
},
{
"epoch": 3.7198132088058706,
"grad_norm": 0.3164732161874329,
"learning_rate": 7.397148541103698e-06,
"loss": 0.215,
"step": 697
},
{
"epoch": 3.725150100066711,
"grad_norm": 0.3143756963158667,
"learning_rate": 7.339225141998076e-06,
"loss": 0.2274,
"step": 698
},
{
"epoch": 3.7304869913275516,
"grad_norm": 0.3259467908831874,
"learning_rate": 7.281478414814869e-06,
"loss": 0.2248,
"step": 699
},
{
"epoch": 3.7358238825883925,
"grad_norm": 0.3243759701523797,
"learning_rate": 7.223909165367722e-06,
"loss": 0.2267,
"step": 700
},
{
"epoch": 3.741160773849233,
"grad_norm": 0.29306669061610235,
"learning_rate": 7.166518196993726e-06,
"loss": 0.2039,
"step": 701
},
{
"epoch": 3.7464976651100734,
"grad_norm": 0.3122606127392118,
"learning_rate": 7.109306310542193e-06,
"loss": 0.2247,
"step": 702
},
{
"epoch": 3.751834556370914,
"grad_norm": 0.3084720774983147,
"learning_rate": 7.052274304363449e-06,
"loss": 0.2263,
"step": 703
},
{
"epoch": 3.7571714476317544,
"grad_norm": 0.32807197455446596,
"learning_rate": 6.995422974297748e-06,
"loss": 0.2182,
"step": 704
},
{
"epoch": 3.7625083388925953,
"grad_norm": 0.29627151938725177,
"learning_rate": 6.938753113664138e-06,
"loss": 0.223,
"step": 705
},
{
"epoch": 3.7678452301534358,
"grad_norm": 0.30481388666511067,
"learning_rate": 6.882265513249376e-06,
"loss": 0.2153,
"step": 706
},
{
"epoch": 3.7731821214142762,
"grad_norm": 0.30367279516217394,
"learning_rate": 6.8259609612969245e-06,
"loss": 0.1961,
"step": 707
},
{
"epoch": 3.7785190126751167,
"grad_norm": 0.3311032509813633,
"learning_rate": 6.769840243495937e-06,
"loss": 0.2451,
"step": 708
},
{
"epoch": 3.783855903935957,
"grad_norm": 0.3124678571976269,
"learning_rate": 6.713904142970282e-06,
"loss": 0.215,
"step": 709
},
{
"epoch": 3.789192795196798,
"grad_norm": 0.31743166594564953,
"learning_rate": 6.658153440267649e-06,
"loss": 0.2176,
"step": 710
},
{
"epoch": 3.7945296864576386,
"grad_norm": 0.28782289424591967,
"learning_rate": 6.602588913348611e-06,
"loss": 0.2094,
"step": 711
},
{
"epoch": 3.799866577718479,
"grad_norm": 0.3124172224100049,
"learning_rate": 6.547211337575812e-06,
"loss": 0.2083,
"step": 712
},
{
"epoch": 3.8052034689793195,
"grad_norm": 0.3173760507861447,
"learning_rate": 6.4920214857031286e-06,
"loss": 0.2181,
"step": 713
},
{
"epoch": 3.81054036024016,
"grad_norm": 0.3191329518696444,
"learning_rate": 6.437020127864863e-06,
"loss": 0.2134,
"step": 714
},
{
"epoch": 3.815877251501001,
"grad_norm": 0.3051391344384358,
"learning_rate": 6.382208031565051e-06,
"loss": 0.2204,
"step": 715
},
{
"epoch": 3.8212141427618413,
"grad_norm": 0.31850920002112115,
"learning_rate": 6.327585961666703e-06,
"loss": 0.2253,
"step": 716
},
{
"epoch": 3.826551034022682,
"grad_norm": 0.29793389352549826,
"learning_rate": 6.273154680381152e-06,
"loss": 0.2078,
"step": 717
},
{
"epoch": 3.8318879252835223,
"grad_norm": 0.3003506055760915,
"learning_rate": 6.218914947257424e-06,
"loss": 0.231,
"step": 718
},
{
"epoch": 3.8372248165443628,
"grad_norm": 0.320501388188613,
"learning_rate": 6.164867519171609e-06,
"loss": 0.2244,
"step": 719
},
{
"epoch": 3.8425617078052037,
"grad_norm": 0.3141510603372196,
"learning_rate": 6.111013150316336e-06,
"loss": 0.2145,
"step": 720
},
{
"epoch": 3.847898599066044,
"grad_norm": 0.28759406096829065,
"learning_rate": 6.057352592190233e-06,
"loss": 0.2059,
"step": 721
},
{
"epoch": 3.8532354903268846,
"grad_norm": 0.2965011224320686,
"learning_rate": 6.003886593587429e-06,
"loss": 0.1994,
"step": 722
},
{
"epoch": 3.858572381587725,
"grad_norm": 0.32309929809226157,
"learning_rate": 5.9506159005871225e-06,
"loss": 0.2223,
"step": 723
},
{
"epoch": 3.8639092728485656,
"grad_norm": 0.3064344176543871,
"learning_rate": 5.897541256543171e-06,
"loss": 0.2165,
"step": 724
},
{
"epoch": 3.8692461641094065,
"grad_norm": 0.30830702220241546,
"learning_rate": 5.844663402073696e-06,
"loss": 0.222,
"step": 725
},
{
"epoch": 3.874583055370247,
"grad_norm": 0.29076332143502975,
"learning_rate": 5.791983075050773e-06,
"loss": 0.2048,
"step": 726
},
{
"epoch": 3.8799199466310874,
"grad_norm": 0.300114275064821,
"learning_rate": 5.739501010590132e-06,
"loss": 0.2084,
"step": 727
},
{
"epoch": 3.885256837891928,
"grad_norm": 0.3045488309986525,
"learning_rate": 5.68721794104087e-06,
"loss": 0.2197,
"step": 728
},
{
"epoch": 3.8905937291527684,
"grad_norm": 0.327552950910515,
"learning_rate": 5.635134595975285e-06,
"loss": 0.2175,
"step": 729
},
{
"epoch": 3.8959306204136093,
"grad_norm": 0.30484919275231337,
"learning_rate": 5.583251702178634e-06,
"loss": 0.2093,
"step": 730
},
{
"epoch": 3.9012675116744497,
"grad_norm": 0.2921364622612547,
"learning_rate": 5.531569983639045e-06,
"loss": 0.2058,
"step": 731
},
{
"epoch": 3.90660440293529,
"grad_norm": 0.30049818877171114,
"learning_rate": 5.480090161537388e-06,
"loss": 0.2259,
"step": 732
},
{
"epoch": 3.9119412941961307,
"grad_norm": 0.3139671791644443,
"learning_rate": 5.4288129542371995e-06,
"loss": 0.2197,
"step": 733
},
{
"epoch": 3.917278185456971,
"grad_norm": 0.31085024950173556,
"learning_rate": 5.377739077274688e-06,
"loss": 0.2223,
"step": 734
},
{
"epoch": 3.922615076717812,
"grad_norm": 0.27795634676595493,
"learning_rate": 5.326869243348734e-06,
"loss": 0.2087,
"step": 735
},
{
"epoch": 3.9279519679786525,
"grad_norm": 0.292718813818914,
"learning_rate": 5.276204162310938e-06,
"loss": 0.2138,
"step": 736
},
{
"epoch": 3.933288859239493,
"grad_norm": 0.3124029055450099,
"learning_rate": 5.225744541155731e-06,
"loss": 0.2202,
"step": 737
},
{
"epoch": 3.9386257505003335,
"grad_norm": 0.3056001516045311,
"learning_rate": 5.1754910840105e-06,
"loss": 0.2203,
"step": 738
},
{
"epoch": 3.943962641761174,
"grad_norm": 0.28861734955419716,
"learning_rate": 5.125444492125748e-06,
"loss": 0.2118,
"step": 739
},
{
"epoch": 3.949299533022015,
"grad_norm": 0.2944862914827709,
"learning_rate": 5.075605463865348e-06,
"loss": 0.215,
"step": 740
},
{
"epoch": 3.9546364242828553,
"grad_norm": 0.3018187959209579,
"learning_rate": 5.025974694696747e-06,
"loss": 0.2121,
"step": 741
},
{
"epoch": 3.959973315543696,
"grad_norm": 0.32968530294516596,
"learning_rate": 4.9765528771813065e-06,
"loss": 0.2194,
"step": 742
},
{
"epoch": 3.9653102068045363,
"grad_norm": 0.2895487447372504,
"learning_rate": 4.92734070096462e-06,
"loss": 0.1945,
"step": 743
},
{
"epoch": 3.9706470980653767,
"grad_norm": 0.30516795101430666,
"learning_rate": 4.878338852766871e-06,
"loss": 0.2218,
"step": 744
},
{
"epoch": 3.9759839893262177,
"grad_norm": 0.30604355308841524,
"learning_rate": 4.829548016373285e-06,
"loss": 0.2166,
"step": 745
},
{
"epoch": 3.981320880587058,
"grad_norm": 0.2907648161711581,
"learning_rate": 4.780968872624569e-06,
"loss": 0.2155,
"step": 746
},
{
"epoch": 3.9866577718478986,
"grad_norm": 0.2981568120720774,
"learning_rate": 4.732602099407402e-06,
"loss": 0.2271,
"step": 747
},
{
"epoch": 3.991994663108739,
"grad_norm": 0.30885152124963694,
"learning_rate": 4.684448371645003e-06,
"loss": 0.2183,
"step": 748
},
{
"epoch": 3.9973315543695795,
"grad_norm": 0.29884575186957585,
"learning_rate": 4.636508361287675e-06,
"loss": 0.2158,
"step": 749
},
{
"epoch": 4.0026684456304205,
"grad_norm": 0.601890871189883,
"learning_rate": 4.58878273730347e-06,
"loss": 0.3212,
"step": 750
},
{
"epoch": 4.0080053368912605,
"grad_norm": 0.5298412615002074,
"learning_rate": 4.541272165668829e-06,
"loss": 0.1575,
"step": 751
},
{
"epoch": 4.013342228152101,
"grad_norm": 0.42841188304774436,
"learning_rate": 4.493977309359279e-06,
"loss": 0.1504,
"step": 752
},
{
"epoch": 4.018679119412942,
"grad_norm": 0.3120373077214368,
"learning_rate": 4.4468988283402135e-06,
"loss": 0.1287,
"step": 753
},
{
"epoch": 4.024016010673782,
"grad_norm": 0.33027912692579064,
"learning_rate": 4.40003737955766e-06,
"loss": 0.161,
"step": 754
},
{
"epoch": 4.029352901934623,
"grad_norm": 0.39662465830617283,
"learning_rate": 4.353393616929118e-06,
"loss": 0.15,
"step": 755
},
{
"epoch": 4.034689793195463,
"grad_norm": 0.4875405375103197,
"learning_rate": 4.306968191334437e-06,
"loss": 0.1542,
"step": 756
},
{
"epoch": 4.040026684456304,
"grad_norm": 0.4647132436994288,
"learning_rate": 4.260761750606734e-06,
"loss": 0.1372,
"step": 757
},
{
"epoch": 4.045363575717145,
"grad_norm": 0.4316859008197165,
"learning_rate": 4.2147749395233365e-06,
"loss": 0.1507,
"step": 758
},
{
"epoch": 4.050700466977985,
"grad_norm": 0.36991769149322723,
"learning_rate": 4.1690083997968216e-06,
"loss": 0.1399,
"step": 759
},
{
"epoch": 4.056037358238826,
"grad_norm": 0.3081151075975586,
"learning_rate": 4.123462770066013e-06,
"loss": 0.1358,
"step": 760
},
{
"epoch": 4.061374249499666,
"grad_norm": 0.33035087289352366,
"learning_rate": 4.078138685887125e-06,
"loss": 0.137,
"step": 761
},
{
"epoch": 4.066711140760507,
"grad_norm": 0.35282903465510224,
"learning_rate": 4.033036779724848e-06,
"loss": 0.1423,
"step": 762
},
{
"epoch": 4.072048032021348,
"grad_norm": 0.35728767536135847,
"learning_rate": 3.988157680943536e-06,
"loss": 0.1471,
"step": 763
},
{
"epoch": 4.077384923282188,
"grad_norm": 0.35082302549758365,
"learning_rate": 3.943502015798437e-06,
"loss": 0.1385,
"step": 764
},
{
"epoch": 4.082721814543029,
"grad_norm": 0.30763129510502485,
"learning_rate": 3.899070407426948e-06,
"loss": 0.1308,
"step": 765
},
{
"epoch": 4.088058705803869,
"grad_norm": 0.31052345237501106,
"learning_rate": 3.854863475839898e-06,
"loss": 0.1508,
"step": 766
},
{
"epoch": 4.09339559706471,
"grad_norm": 0.29117416131590373,
"learning_rate": 3.810881837912934e-06,
"loss": 0.1359,
"step": 767
},
{
"epoch": 4.098732488325551,
"grad_norm": 0.32738573833211637,
"learning_rate": 3.7671261073778875e-06,
"loss": 0.1551,
"step": 768
},
{
"epoch": 4.104069379586391,
"grad_norm": 0.2879978040339806,
"learning_rate": 3.7235968948142098e-06,
"loss": 0.1285,
"step": 769
},
{
"epoch": 4.109406270847232,
"grad_norm": 0.3297634034251823,
"learning_rate": 3.6802948076404675e-06,
"loss": 0.1476,
"step": 770
},
{
"epoch": 4.114743162108072,
"grad_norm": 0.30016892407880663,
"learning_rate": 3.6372204501058494e-06,
"loss": 0.1295,
"step": 771
},
{
"epoch": 4.120080053368913,
"grad_norm": 0.3145248211219936,
"learning_rate": 3.5943744232817455e-06,
"loss": 0.1296,
"step": 772
},
{
"epoch": 4.1254169446297535,
"grad_norm": 0.31318694134932973,
"learning_rate": 3.551757325053362e-06,
"loss": 0.1482,
"step": 773
},
{
"epoch": 4.1307538358905935,
"grad_norm": 0.2948784037632883,
"learning_rate": 3.5093697501113645e-06,
"loss": 0.1422,
"step": 774
},
{
"epoch": 4.136090727151434,
"grad_norm": 0.2801444303260538,
"learning_rate": 3.4672122899435935e-06,
"loss": 0.1352,
"step": 775
},
{
"epoch": 4.1414276184122745,
"grad_norm": 0.2824012703736762,
"learning_rate": 3.4252855328268055e-06,
"loss": 0.1505,
"step": 776
},
{
"epoch": 4.146764509673115,
"grad_norm": 0.2877569800912583,
"learning_rate": 3.3835900638184538e-06,
"loss": 0.1336,
"step": 777
},
{
"epoch": 4.152101400933956,
"grad_norm": 0.2922681146418443,
"learning_rate": 3.3421264647485476e-06,
"loss": 0.1403,
"step": 778
},
{
"epoch": 4.157438292194796,
"grad_norm": 0.29571418661558607,
"learning_rate": 3.300895314211503e-06,
"loss": 0.1417,
"step": 779
},
{
"epoch": 4.162775183455637,
"grad_norm": 0.2807197693531769,
"learning_rate": 3.259897187558101e-06,
"loss": 0.1321,
"step": 780
},
{
"epoch": 4.168112074716477,
"grad_norm": 0.26947880551875214,
"learning_rate": 3.219132656887445e-06,
"loss": 0.1393,
"step": 781
},
{
"epoch": 4.173448965977318,
"grad_norm": 0.30694317413028555,
"learning_rate": 3.1786022910389524e-06,
"loss": 0.1541,
"step": 782
},
{
"epoch": 4.178785857238159,
"grad_norm": 0.2702053773322514,
"learning_rate": 3.1383066555844686e-06,
"loss": 0.1245,
"step": 783
},
{
"epoch": 4.184122748498999,
"grad_norm": 0.3036431279445982,
"learning_rate": 3.0982463128203346e-06,
"loss": 0.1378,
"step": 784
},
{
"epoch": 4.18945963975984,
"grad_norm": 0.2903132944155927,
"learning_rate": 3.058421821759545e-06,
"loss": 0.1446,
"step": 785
},
{
"epoch": 4.19479653102068,
"grad_norm": 0.2693686823614546,
"learning_rate": 3.0188337381239696e-06,
"loss": 0.1388,
"step": 786
},
{
"epoch": 4.200133422281521,
"grad_norm": 0.2547143693355037,
"learning_rate": 2.9794826143365794e-06,
"loss": 0.1284,
"step": 787
},
{
"epoch": 4.205470313542362,
"grad_norm": 0.2751535551881633,
"learning_rate": 2.940368999513734e-06,
"loss": 0.1375,
"step": 788
},
{
"epoch": 4.210807204803202,
"grad_norm": 0.2778666347500376,
"learning_rate": 2.901493439457543e-06,
"loss": 0.1303,
"step": 789
},
{
"epoch": 4.216144096064043,
"grad_norm": 0.2846619856811341,
"learning_rate": 2.8628564766482193e-06,
"loss": 0.1492,
"step": 790
},
{
"epoch": 4.221480987324883,
"grad_norm": 0.281092186868078,
"learning_rate": 2.824458650236532e-06,
"loss": 0.1414,
"step": 791
},
{
"epoch": 4.226817878585724,
"grad_norm": 0.2832353643098496,
"learning_rate": 2.7863004960362784e-06,
"loss": 0.141,
"step": 792
},
{
"epoch": 4.232154769846565,
"grad_norm": 0.26863830308187436,
"learning_rate": 2.748382546516799e-06,
"loss": 0.1276,
"step": 793
},
{
"epoch": 4.237491661107405,
"grad_norm": 0.2837290905016847,
"learning_rate": 2.7107053307955535e-06,
"loss": 0.1464,
"step": 794
},
{
"epoch": 4.242828552368246,
"grad_norm": 0.30003856403259976,
"learning_rate": 2.6732693746307405e-06,
"loss": 0.1467,
"step": 795
},
{
"epoch": 4.248165443629086,
"grad_norm": 0.3031551246837749,
"learning_rate": 2.6360752004139457e-06,
"loss": 0.141,
"step": 796
},
{
"epoch": 4.253502334889927,
"grad_norm": 0.2839203101647867,
"learning_rate": 2.599123327162876e-06,
"loss": 0.128,
"step": 797
},
{
"epoch": 4.2588392261507675,
"grad_norm": 0.28309288050123305,
"learning_rate": 2.5624142705140974e-06,
"loss": 0.1424,
"step": 798
},
{
"epoch": 4.2641761174116075,
"grad_norm": 0.3055488535217384,
"learning_rate": 2.5259485427158436e-06,
"loss": 0.1489,
"step": 799
},
{
"epoch": 4.269513008672448,
"grad_norm": 0.300540177885969,
"learning_rate": 2.489726652620883e-06,
"loss": 0.1362,
"step": 800
},
{
"epoch": 4.2748498999332885,
"grad_norm": 0.29347736303425087,
"learning_rate": 2.453749105679386e-06,
"loss": 0.1465,
"step": 801
},
{
"epoch": 4.280186791194129,
"grad_norm": 0.27707210287753226,
"learning_rate": 2.418016403931909e-06,
"loss": 0.1329,
"step": 802
},
{
"epoch": 4.28552368245497,
"grad_norm": 0.2920650228372569,
"learning_rate": 2.382529046002371e-06,
"loss": 0.1527,
"step": 803
},
{
"epoch": 4.29086057371581,
"grad_norm": 0.287631363925361,
"learning_rate": 2.347287527091082e-06,
"loss": 0.1422,
"step": 804
},
{
"epoch": 4.296197464976651,
"grad_norm": 0.28709688346306034,
"learning_rate": 2.3122923389678607e-06,
"loss": 0.1285,
"step": 805
},
{
"epoch": 4.301534356237491,
"grad_norm": 0.27342773854893104,
"learning_rate": 2.2775439699651567e-06,
"loss": 0.1388,
"step": 806
},
{
"epoch": 4.306871247498332,
"grad_norm": 0.2666822866123337,
"learning_rate": 2.2430429049712268e-06,
"loss": 0.1336,
"step": 807
},
{
"epoch": 4.312208138759173,
"grad_norm": 0.2983890987441143,
"learning_rate": 2.208789625423391e-06,
"loss": 0.1508,
"step": 808
},
{
"epoch": 4.317545030020013,
"grad_norm": 0.2800353859335473,
"learning_rate": 2.174784609301306e-06,
"loss": 0.1319,
"step": 809
},
{
"epoch": 4.322881921280854,
"grad_norm": 0.29623766050646816,
"learning_rate": 2.141028331120276e-06,
"loss": 0.1513,
"step": 810
},
{
"epoch": 4.328218812541694,
"grad_norm": 0.27571067910906955,
"learning_rate": 2.107521261924668e-06,
"loss": 0.1358,
"step": 811
},
{
"epoch": 4.333555703802535,
"grad_norm": 0.29103395140128885,
"learning_rate": 2.0742638692813033e-06,
"loss": 0.1309,
"step": 812
},
{
"epoch": 4.338892595063376,
"grad_norm": 0.2933371021781572,
"learning_rate": 2.0412566172729554e-06,
"loss": 0.1351,
"step": 813
},
{
"epoch": 4.344229486324216,
"grad_norm": 0.27215608874364955,
"learning_rate": 2.0084999664918725e-06,
"loss": 0.1475,
"step": 814
},
{
"epoch": 4.349566377585057,
"grad_norm": 0.2883150380634738,
"learning_rate": 1.9759943740333256e-06,
"loss": 0.1475,
"step": 815
},
{
"epoch": 4.354903268845897,
"grad_norm": 0.2777208010579726,
"learning_rate": 1.943740293489267e-06,
"loss": 0.1479,
"step": 816
},
{
"epoch": 4.360240160106738,
"grad_norm": 0.2824439004885457,
"learning_rate": 1.9117381749419794e-06,
"loss": 0.1471,
"step": 817
},
{
"epoch": 4.365577051367579,
"grad_norm": 0.28150269206830153,
"learning_rate": 1.8799884649577915e-06,
"loss": 0.132,
"step": 818
},
{
"epoch": 4.370913942628419,
"grad_norm": 0.29174288272730886,
"learning_rate": 1.8484916065808622e-06,
"loss": 0.1482,
"step": 819
},
{
"epoch": 4.37625083388926,
"grad_norm": 0.27656705304849727,
"learning_rate": 1.8172480393269797e-06,
"loss": 0.137,
"step": 820
},
{
"epoch": 4.3815877251501,
"grad_norm": 0.2757754977908488,
"learning_rate": 1.7862581991774486e-06,
"loss": 0.1377,
"step": 821
},
{
"epoch": 4.386924616410941,
"grad_norm": 0.2750527107134215,
"learning_rate": 1.755522518572994e-06,
"loss": 0.1207,
"step": 822
},
{
"epoch": 4.3922615076717815,
"grad_norm": 0.29276192591791234,
"learning_rate": 1.725041426407723e-06,
"loss": 0.1401,
"step": 823
},
{
"epoch": 4.3975983989326215,
"grad_norm": 0.29627925099149893,
"learning_rate": 1.6948153480231511e-06,
"loss": 0.1403,
"step": 824
},
{
"epoch": 4.402935290193462,
"grad_norm": 0.2781883617488326,
"learning_rate": 1.6648447052022643e-06,
"loss": 0.1266,
"step": 825
},
{
"epoch": 4.408272181454302,
"grad_norm": 0.2996634586484779,
"learning_rate": 1.6351299161636202e-06,
"loss": 0.1599,
"step": 826
},
{
"epoch": 4.413609072715143,
"grad_norm": 0.26162944731844284,
"learning_rate": 1.6056713955555349e-06,
"loss": 0.1365,
"step": 827
},
{
"epoch": 4.418945963975984,
"grad_norm": 0.30091733604376597,
"learning_rate": 1.5764695544502774e-06,
"loss": 0.1536,
"step": 828
},
{
"epoch": 4.424282855236824,
"grad_norm": 0.28660862503859624,
"learning_rate": 1.5475248003383382e-06,
"loss": 0.1343,
"step": 829
},
{
"epoch": 4.429619746497665,
"grad_norm": 0.2913443376429063,
"learning_rate": 1.5188375371227525e-06,
"loss": 0.1519,
"step": 830
},
{
"epoch": 4.434956637758505,
"grad_norm": 0.2701467713064408,
"learning_rate": 1.4904081651134527e-06,
"loss": 0.1262,
"step": 831
},
{
"epoch": 4.440293529019346,
"grad_norm": 0.2852211039949671,
"learning_rate": 1.462237081021689e-06,
"loss": 0.1501,
"step": 832
},
{
"epoch": 4.445630420280187,
"grad_norm": 0.28582152725735244,
"learning_rate": 1.4343246779544929e-06,
"loss": 0.1482,
"step": 833
},
{
"epoch": 4.450967311541027,
"grad_norm": 0.2763605840804222,
"learning_rate": 1.4066713454091808e-06,
"loss": 0.1375,
"step": 834
},
{
"epoch": 4.456304202801868,
"grad_norm": 0.2918688271426615,
"learning_rate": 1.3792774692679366e-06,
"loss": 0.1432,
"step": 835
},
{
"epoch": 4.461641094062708,
"grad_norm": 0.3054006010342176,
"learning_rate": 1.3521434317924186e-06,
"loss": 0.1491,
"step": 836
},
{
"epoch": 4.466977985323549,
"grad_norm": 0.2824524390265842,
"learning_rate": 1.3252696116184184e-06,
"loss": 0.13,
"step": 837
},
{
"epoch": 4.47231487658439,
"grad_norm": 0.2892331408612295,
"learning_rate": 1.2986563837505894e-06,
"loss": 0.1433,
"step": 838
},
{
"epoch": 4.47765176784523,
"grad_norm": 0.27746574091084786,
"learning_rate": 1.2723041195572106e-06,
"loss": 0.1375,
"step": 839
},
{
"epoch": 4.482988659106071,
"grad_norm": 0.2907412475565416,
"learning_rate": 1.246213186764995e-06,
"loss": 0.1578,
"step": 840
},
{
"epoch": 4.488325550366911,
"grad_norm": 0.27648095040345927,
"learning_rate": 1.2203839494539738e-06,
"loss": 0.1415,
"step": 841
},
{
"epoch": 4.493662441627752,
"grad_norm": 0.2821554850140892,
"learning_rate": 1.1948167680523981e-06,
"loss": 0.1416,
"step": 842
},
{
"epoch": 4.498999332888593,
"grad_norm": 0.2659555704630584,
"learning_rate": 1.1695119993317271e-06,
"loss": 0.1262,
"step": 843
},
{
"epoch": 4.504336224149433,
"grad_norm": 0.27101074307755924,
"learning_rate": 1.1444699964016448e-06,
"loss": 0.1339,
"step": 844
},
{
"epoch": 4.509673115410274,
"grad_norm": 0.27920563868852216,
"learning_rate": 1.1196911087051143e-06,
"loss": 0.144,
"step": 845
},
{
"epoch": 4.515010006671114,
"grad_norm": 0.2917517252311828,
"learning_rate": 1.0951756820135294e-06,
"loss": 0.1562,
"step": 846
},
{
"epoch": 4.5203468979319545,
"grad_norm": 0.2719082360193771,
"learning_rate": 1.070924058421876e-06,
"loss": 0.1393,
"step": 847
},
{
"epoch": 4.5256837891927955,
"grad_norm": 0.255901572414045,
"learning_rate": 1.0469365763439532e-06,
"loss": 0.1277,
"step": 848
},
{
"epoch": 4.5310206804536355,
"grad_norm": 0.2763364181685816,
"learning_rate": 1.0232135705076596e-06,
"loss": 0.1551,
"step": 849
},
{
"epoch": 4.536357571714476,
"grad_norm": 0.2962740172540047,
"learning_rate": 9.997553719503239e-07,
"loss": 0.1561,
"step": 850
},
{
"epoch": 4.541694462975316,
"grad_norm": 0.268054629742994,
"learning_rate": 9.765623080140774e-07,
"loss": 0.1234,
"step": 851
},
{
"epoch": 4.547031354236157,
"grad_norm": 0.3035670976166526,
"learning_rate": 9.536347023412928e-07,
"loss": 0.1636,
"step": 852
},
{
"epoch": 4.552368245496998,
"grad_norm": 0.2793525064851917,
"learning_rate": 9.309728748700574e-07,
"loss": 0.1398,
"step": 853
},
{
"epoch": 4.557705136757838,
"grad_norm": 0.28016641299104156,
"learning_rate": 9.085771418297274e-07,
"loss": 0.1292,
"step": 854
},
{
"epoch": 4.563042028018679,
"grad_norm": 0.27699022892058145,
"learning_rate": 8.864478157364997e-07,
"loss": 0.1435,
"step": 855
},
{
"epoch": 4.568378919279519,
"grad_norm": 0.2720294929447123,
"learning_rate": 8.645852053890547e-07,
"loss": 0.1324,
"step": 856
},
{
"epoch": 4.57371581054036,
"grad_norm": 0.28532423639550997,
"learning_rate": 8.429896158642492e-07,
"loss": 0.1472,
"step": 857
},
{
"epoch": 4.579052701801201,
"grad_norm": 0.26985015917853045,
"learning_rate": 8.216613485128611e-07,
"loss": 0.141,
"step": 858
},
{
"epoch": 4.584389593062041,
"grad_norm": 0.2704957723845787,
"learning_rate": 8.00600700955374e-07,
"loss": 0.1361,
"step": 859
},
{
"epoch": 4.589726484322882,
"grad_norm": 0.28012140457224527,
"learning_rate": 7.798079670778391e-07,
"loss": 0.1282,
"step": 860
},
{
"epoch": 4.595063375583722,
"grad_norm": 0.2782972818576599,
"learning_rate": 7.592834370277624e-07,
"loss": 0.1243,
"step": 861
},
{
"epoch": 4.600400266844563,
"grad_norm": 0.29307525545180607,
"learning_rate": 7.390273972100614e-07,
"loss": 0.1442,
"step": 862
},
{
"epoch": 4.605737158105404,
"grad_norm": 0.2702797863957165,
"learning_rate": 7.190401302830729e-07,
"loss": 0.1313,
"step": 863
},
{
"epoch": 4.611074049366244,
"grad_norm": 0.2884103769485427,
"learning_rate": 6.993219151545871e-07,
"loss": 0.1507,
"step": 864
},
{
"epoch": 4.616410940627085,
"grad_norm": 0.3042376630394671,
"learning_rate": 6.798730269779907e-07,
"loss": 0.1413,
"step": 865
},
{
"epoch": 4.621747831887925,
"grad_norm": 0.30142009513304885,
"learning_rate": 6.60693737148399e-07,
"loss": 0.1411,
"step": 866
},
{
"epoch": 4.627084723148766,
"grad_norm": 0.28563889877580295,
"learning_rate": 6.417843132988744e-07,
"loss": 0.138,
"step": 867
},
{
"epoch": 4.632421614409607,
"grad_norm": 0.2656629603197653,
"learning_rate": 6.231450192967048e-07,
"loss": 0.1421,
"step": 868
},
{
"epoch": 4.637758505670447,
"grad_norm": 0.26399752051623476,
"learning_rate": 6.047761152397025e-07,
"loss": 0.1325,
"step": 869
},
{
"epoch": 4.643095396931288,
"grad_norm": 0.27939357858366015,
"learning_rate": 5.866778574525933e-07,
"loss": 0.1346,
"step": 870
},
{
"epoch": 4.648432288192128,
"grad_norm": 0.268759877392077,
"learning_rate": 5.688504984834287e-07,
"loss": 0.1367,
"step": 871
},
{
"epoch": 4.6537691794529685,
"grad_norm": 0.2898319032741956,
"learning_rate": 5.512942871000549e-07,
"loss": 0.1516,
"step": 872
},
{
"epoch": 4.6591060707138094,
"grad_norm": 0.28063437477691194,
"learning_rate": 5.340094682866603e-07,
"loss": 0.1278,
"step": 873
},
{
"epoch": 4.6644429619746495,
"grad_norm": 0.2872370132188477,
"learning_rate": 5.169962832403475e-07,
"loss": 0.1502,
"step": 874
},
{
"epoch": 4.66977985323549,
"grad_norm": 0.2712192364166597,
"learning_rate": 5.002549693677594e-07,
"loss": 0.1337,
"step": 875
},
{
"epoch": 4.67511674449633,
"grad_norm": 0.29491089886365257,
"learning_rate": 4.837857602817808e-07,
"loss": 0.1503,
"step": 876
},
{
"epoch": 4.680453635757171,
"grad_norm": 0.27587942631382745,
"learning_rate": 4.675888857982669e-07,
"loss": 0.1327,
"step": 877
},
{
"epoch": 4.685790527018012,
"grad_norm": 0.29332732966215386,
"learning_rate": 4.5166457193284386e-07,
"loss": 0.1458,
"step": 878
},
{
"epoch": 4.691127418278852,
"grad_norm": 0.28714021402000245,
"learning_rate": 4.3601304089775366e-07,
"loss": 0.1466,
"step": 879
},
{
"epoch": 4.696464309539693,
"grad_norm": 0.26784688798952216,
"learning_rate": 4.2063451109874756e-07,
"loss": 0.1274,
"step": 880
},
{
"epoch": 4.701801200800533,
"grad_norm": 0.2797049205351308,
"learning_rate": 4.055291971320485e-07,
"loss": 0.142,
"step": 881
},
{
"epoch": 4.707138092061374,
"grad_norm": 0.26674171880102343,
"learning_rate": 3.906973097813449e-07,
"loss": 0.1293,
"step": 882
},
{
"epoch": 4.712474983322215,
"grad_norm": 0.25308546573318846,
"learning_rate": 3.76139056014857e-07,
"loss": 0.1315,
"step": 883
},
{
"epoch": 4.717811874583055,
"grad_norm": 0.2710104168106361,
"learning_rate": 3.6185463898245066e-07,
"loss": 0.1489,
"step": 884
},
{
"epoch": 4.723148765843896,
"grad_norm": 0.28048882254908447,
"learning_rate": 3.478442580127972e-07,
"loss": 0.144,
"step": 885
},
{
"epoch": 4.728485657104736,
"grad_norm": 0.26485229244166536,
"learning_rate": 3.341081086105891e-07,
"loss": 0.1255,
"step": 886
},
{
"epoch": 4.733822548365577,
"grad_norm": 0.2759331791232634,
"learning_rate": 3.2064638245382194e-07,
"loss": 0.1568,
"step": 887
},
{
"epoch": 4.739159439626418,
"grad_norm": 0.26413539339616676,
"learning_rate": 3.0745926739111033e-07,
"loss": 0.1342,
"step": 888
},
{
"epoch": 4.744496330887258,
"grad_norm": 0.28475596898962846,
"learning_rate": 2.9454694743907386e-07,
"loss": 0.1398,
"step": 889
},
{
"epoch": 4.749833222148099,
"grad_norm": 0.277687235094201,
"learning_rate": 2.819096027797641e-07,
"loss": 0.1342,
"step": 890
},
{
"epoch": 4.755170113408939,
"grad_norm": 0.2957077369519711,
"learning_rate": 2.6954740975815076e-07,
"loss": 0.1464,
"step": 891
},
{
"epoch": 4.76050700466978,
"grad_norm": 0.2907912947380036,
"learning_rate": 2.57460540879666e-07,
"loss": 0.154,
"step": 892
},
{
"epoch": 4.765843895930621,
"grad_norm": 0.2801914453007158,
"learning_rate": 2.4564916480778855e-07,
"loss": 0.1468,
"step": 893
},
{
"epoch": 4.771180787191461,
"grad_norm": 0.2711625198266653,
"learning_rate": 2.3411344636169898e-07,
"loss": 0.1301,
"step": 894
},
{
"epoch": 4.776517678452302,
"grad_norm": 0.2839350340429628,
"learning_rate": 2.228535465139703e-07,
"loss": 0.1495,
"step": 895
},
{
"epoch": 4.781854569713142,
"grad_norm": 0.27595704547443245,
"learning_rate": 2.1186962238833653e-07,
"loss": 0.1238,
"step": 896
},
{
"epoch": 4.7871914609739825,
"grad_norm": 0.28199603087443903,
"learning_rate": 2.0116182725748334e-07,
"loss": 0.1334,
"step": 897
},
{
"epoch": 4.792528352234823,
"grad_norm": 0.2743312329373981,
"learning_rate": 1.907303105409164e-07,
"loss": 0.146,
"step": 898
},
{
"epoch": 4.7978652434956635,
"grad_norm": 0.2780483555703688,
"learning_rate": 1.80575217802883e-07,
"loss": 0.1459,
"step": 899
},
{
"epoch": 4.803202134756504,
"grad_norm": 0.2805564993027487,
"learning_rate": 1.7069669075032492e-07,
"loss": 0.1393,
"step": 900
},
{
"epoch": 4.808539026017344,
"grad_norm": 0.2703447762812398,
"learning_rate": 1.6109486723092426e-07,
"loss": 0.1315,
"step": 901
},
{
"epoch": 4.813875917278185,
"grad_norm": 0.2847007738527403,
"learning_rate": 1.5176988123114966e-07,
"loss": 0.1495,
"step": 902
},
{
"epoch": 4.819212808539026,
"grad_norm": 0.2653003442730712,
"learning_rate": 1.4272186287441535e-07,
"loss": 0.1355,
"step": 903
},
{
"epoch": 4.824549699799866,
"grad_norm": 0.2822152085841712,
"learning_rate": 1.3395093841925166e-07,
"loss": 0.1576,
"step": 904
},
{
"epoch": 4.829886591060707,
"grad_norm": 0.27304693619702214,
"learning_rate": 1.2545723025753743e-07,
"loss": 0.1316,
"step": 905
},
{
"epoch": 4.835223482321548,
"grad_norm": 0.27468533143866714,
"learning_rate": 1.1724085691280806e-07,
"loss": 0.132,
"step": 906
},
{
"epoch": 4.840560373582388,
"grad_norm": 0.27963914069565776,
"learning_rate": 1.0930193303858805e-07,
"loss": 0.1461,
"step": 907
},
{
"epoch": 4.845897264843229,
"grad_norm": 0.2565432498525225,
"learning_rate": 1.0164056941679657e-07,
"loss": 0.1246,
"step": 908
},
{
"epoch": 4.851234156104069,
"grad_norm": 0.2811061477351414,
"learning_rate": 9.42568729561999e-08,
"loss": 0.1443,
"step": 909
},
{
"epoch": 4.85657104736491,
"grad_norm": 0.2857439707607634,
"learning_rate": 8.715094669092816e-08,
"loss": 0.1338,
"step": 910
},
{
"epoch": 4.861907938625751,
"grad_norm": 0.2767918923747537,
"learning_rate": 8.032288977901647e-08,
"loss": 0.1317,
"step": 911
},
{
"epoch": 4.867244829886591,
"grad_norm": 0.27736070767280885,
"learning_rate": 7.377279750104605e-08,
"loss": 0.1332,
"step": 912
},
{
"epoch": 4.872581721147432,
"grad_norm": 0.27702046457250123,
"learning_rate": 6.750076125880079e-08,
"loss": 0.1403,
"step": 913
},
{
"epoch": 4.877918612408272,
"grad_norm": 0.2726767818543548,
"learning_rate": 6.150686857399057e-08,
"loss": 0.1375,
"step": 914
},
{
"epoch": 4.883255503669113,
"grad_norm": 0.2770778486020138,
"learning_rate": 5.5791203087041114e-08,
"loss": 0.1448,
"step": 915
},
{
"epoch": 4.888592394929954,
"grad_norm": 0.2673740108848184,
"learning_rate": 5.0353844555910415e-08,
"loss": 0.1401,
"step": 916
},
{
"epoch": 4.893929286190794,
"grad_norm": 0.2773633602325955,
"learning_rate": 4.5194868854991913e-08,
"loss": 0.1408,
"step": 917
},
{
"epoch": 4.899266177451635,
"grad_norm": 0.29556441642236714,
"learning_rate": 4.031434797404421e-08,
"loss": 0.1381,
"step": 918
},
{
"epoch": 4.904603068712475,
"grad_norm": 0.2597939381066577,
"learning_rate": 3.571235001719853e-08,
"loss": 0.1283,
"step": 919
},
{
"epoch": 4.909939959973316,
"grad_norm": 0.26747252132965876,
"learning_rate": 3.13889392019906e-08,
"loss": 0.1424,
"step": 920
},
{
"epoch": 4.9152768512341565,
"grad_norm": 0.28253875872873746,
"learning_rate": 2.734417585848137e-08,
"loss": 0.1428,
"step": 921
},
{
"epoch": 4.9206137424949965,
"grad_norm": 0.28810571816428815,
"learning_rate": 2.3578116428408792e-08,
"loss": 0.1478,
"step": 922
},
{
"epoch": 4.925950633755837,
"grad_norm": 0.2585384085401356,
"learning_rate": 2.0090813464395122e-08,
"loss": 0.1388,
"step": 923
},
{
"epoch": 4.931287525016677,
"grad_norm": 0.2820464252629273,
"learning_rate": 1.6882315629225267e-08,
"loss": 0.1495,
"step": 924
},
{
"epoch": 4.936624416277518,
"grad_norm": 0.283069152205124,
"learning_rate": 1.3952667695156241e-08,
"loss": 0.1303,
"step": 925
},
{
"epoch": 4.941961307538359,
"grad_norm": 0.27731020890016556,
"learning_rate": 1.1301910543295436e-08,
"loss": 0.1329,
"step": 926
},
{
"epoch": 4.947298198799199,
"grad_norm": 0.2906776181838218,
"learning_rate": 8.93008116303884e-09,
"loss": 0.1623,
"step": 927
},
{
"epoch": 4.95263509006004,
"grad_norm": 0.27016994542838946,
"learning_rate": 6.8372126515403594e-09,
"loss": 0.1419,
"step": 928
},
{
"epoch": 4.95797198132088,
"grad_norm": 0.268976173834872,
"learning_rate": 5.0233342132632865e-09,
"loss": 0.1335,
"step": 929
},
{
"epoch": 4.963308872581721,
"grad_norm": 0.2831312656649367,
"learning_rate": 3.4884711595650765e-09,
"loss": 0.1481,
"step": 930
},
{
"epoch": 4.968645763842562,
"grad_norm": 0.2723703468394432,
"learning_rate": 2.2326449083420745e-09,
"loss": 0.1325,
"step": 931
},
{
"epoch": 4.973982655103402,
"grad_norm": 0.2875352034119992,
"learning_rate": 1.255872983740858e-09,
"loss": 0.1477,
"step": 932
},
{
"epoch": 4.979319546364243,
"grad_norm": 0.2737232836278831,
"learning_rate": 5.581690159006669e-10,
"loss": 0.1447,
"step": 933
},
{
"epoch": 4.984656437625083,
"grad_norm": 0.2956259263292112,
"learning_rate": 1.3954274078020748e-10,
"loss": 0.1562,
"step": 934
},
{
"epoch": 4.989993328885924,
"grad_norm": 0.285955394980644,
"learning_rate": 0.0,
"loss": 0.1344,
"step": 935
},
{
"epoch": 4.989993328885924,
"step": 935,
"total_flos": 1.946622601061204e+18,
"train_loss": 0.3608587793966028,
"train_runtime": 57464.5588,
"train_samples_per_second": 2.087,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 935,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.946622601061204e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}