{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 395,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.012658227848101266,
"grad_norm": 0.5064402371961658,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7968,
"step": 1
},
{
"epoch": 0.02531645569620253,
"grad_norm": 0.4347700160422649,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7971,
"step": 2
},
{
"epoch": 0.0379746835443038,
"grad_norm": 0.4600069866451295,
"learning_rate": 3e-06,
"loss": 0.7682,
"step": 3
},
{
"epoch": 0.05063291139240506,
"grad_norm": 0.38856164898778117,
"learning_rate": 4.000000000000001e-06,
"loss": 0.804,
"step": 4
},
{
"epoch": 0.06329113924050633,
"grad_norm": 0.2550662772218312,
"learning_rate": 5e-06,
"loss": 0.8014,
"step": 5
},
{
"epoch": 0.0759493670886076,
"grad_norm": 0.9890291578960435,
"learning_rate": 6e-06,
"loss": 0.8146,
"step": 6
},
{
"epoch": 0.08860759493670886,
"grad_norm": 0.8553074952653977,
"learning_rate": 7e-06,
"loss": 0.7909,
"step": 7
},
{
"epoch": 0.10126582278481013,
"grad_norm": 0.5481080562458982,
"learning_rate": 8.000000000000001e-06,
"loss": 0.806,
"step": 8
},
{
"epoch": 0.11392405063291139,
"grad_norm": 0.37793043326257825,
"learning_rate": 9e-06,
"loss": 0.7858,
"step": 9
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.5081915085652062,
"learning_rate": 1e-05,
"loss": 0.749,
"step": 10
},
{
"epoch": 0.13924050632911392,
"grad_norm": 0.6521886644076518,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.8069,
"step": 11
},
{
"epoch": 0.1518987341772152,
"grad_norm": 0.5026663766470839,
"learning_rate": 1.2e-05,
"loss": 0.7862,
"step": 12
},
{
"epoch": 0.16455696202531644,
"grad_norm": 0.43919041970979394,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.7846,
"step": 13
},
{
"epoch": 0.17721518987341772,
"grad_norm": 0.49349482728877997,
"learning_rate": 1.4e-05,
"loss": 0.7633,
"step": 14
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.44284955692741484,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.7951,
"step": 15
},
{
"epoch": 0.20253164556962025,
"grad_norm": 0.46628061714192676,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.8081,
"step": 16
},
{
"epoch": 0.21518987341772153,
"grad_norm": 0.41362438662118695,
"learning_rate": 1.7e-05,
"loss": 0.767,
"step": 17
},
{
"epoch": 0.22784810126582278,
"grad_norm": 0.43028098922164054,
"learning_rate": 1.8e-05,
"loss": 0.8082,
"step": 18
},
{
"epoch": 0.24050632911392406,
"grad_norm": 0.4088201650759829,
"learning_rate": 1.9e-05,
"loss": 0.7876,
"step": 19
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.4236896815328937,
"learning_rate": 2e-05,
"loss": 0.7934,
"step": 20
},
{
"epoch": 0.26582278481012656,
"grad_norm": 0.3801519316427054,
"learning_rate": 2.1000000000000002e-05,
"loss": 0.8217,
"step": 21
},
{
"epoch": 0.27848101265822783,
"grad_norm": 0.4353103208992625,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.802,
"step": 22
},
{
"epoch": 0.2911392405063291,
"grad_norm": 0.4056304765778736,
"learning_rate": 2.3e-05,
"loss": 0.8004,
"step": 23
},
{
"epoch": 0.3037974683544304,
"grad_norm": 0.42870239352523093,
"learning_rate": 2.4e-05,
"loss": 0.8005,
"step": 24
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.4388221915073862,
"learning_rate": 2.5e-05,
"loss": 0.7949,
"step": 25
},
{
"epoch": 0.3291139240506329,
"grad_norm": 0.47344060385708536,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.7977,
"step": 26
},
{
"epoch": 0.34177215189873417,
"grad_norm": 0.4871886869801061,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.8074,
"step": 27
},
{
"epoch": 0.35443037974683544,
"grad_norm": 0.5731199334104394,
"learning_rate": 2.8e-05,
"loss": 0.8203,
"step": 28
},
{
"epoch": 0.3670886075949367,
"grad_norm": 0.9389275389340311,
"learning_rate": 2.9e-05,
"loss": 0.8262,
"step": 29
},
{
"epoch": 0.379746835443038,
"grad_norm": 1.2115180715535663,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.8104,
"step": 30
},
{
"epoch": 0.3924050632911392,
"grad_norm": 0.8412491217200665,
"learning_rate": 3.1e-05,
"loss": 0.7697,
"step": 31
},
{
"epoch": 0.4050632911392405,
"grad_norm": 1.1810315994151994,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.8183,
"step": 32
},
{
"epoch": 0.4177215189873418,
"grad_norm": 0.7336114416401421,
"learning_rate": 3.3e-05,
"loss": 0.7915,
"step": 33
},
{
"epoch": 0.43037974683544306,
"grad_norm": 1.0322176098181708,
"learning_rate": 3.4e-05,
"loss": 0.8054,
"step": 34
},
{
"epoch": 0.4430379746835443,
"grad_norm": 1.0870886821908625,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.7942,
"step": 35
},
{
"epoch": 0.45569620253164556,
"grad_norm": 1.1509564204460234,
"learning_rate": 3.6e-05,
"loss": 0.8058,
"step": 36
},
{
"epoch": 0.46835443037974683,
"grad_norm": 0.8072450413210277,
"learning_rate": 3.7000000000000005e-05,
"loss": 0.8012,
"step": 37
},
{
"epoch": 0.4810126582278481,
"grad_norm": 1.1375575820376653,
"learning_rate": 3.8e-05,
"loss": 0.7727,
"step": 38
},
{
"epoch": 0.4936708860759494,
"grad_norm": 0.6984855676314381,
"learning_rate": 3.9e-05,
"loss": 0.8279,
"step": 39
},
{
"epoch": 0.5063291139240507,
"grad_norm": 0.6890356108321417,
"learning_rate": 4e-05,
"loss": 0.8,
"step": 40
},
{
"epoch": 0.5189873417721519,
"grad_norm": 0.7836814209745863,
"learning_rate": 3.9999216858560603e-05,
"loss": 0.849,
"step": 41
},
{
"epoch": 0.5316455696202531,
"grad_norm": 0.770626888486783,
"learning_rate": 3.999686749557346e-05,
"loss": 0.7986,
"step": 42
},
{
"epoch": 0.5443037974683544,
"grad_norm": 0.5889698309751599,
"learning_rate": 3.999295209502691e-05,
"loss": 0.7991,
"step": 43
},
{
"epoch": 0.5569620253164557,
"grad_norm": 0.7643640268290252,
"learning_rate": 3.998747096355221e-05,
"loss": 0.8069,
"step": 44
},
{
"epoch": 0.569620253164557,
"grad_norm": 0.5973666051464454,
"learning_rate": 3.9980424530399475e-05,
"loss": 0.818,
"step": 45
},
{
"epoch": 0.5822784810126582,
"grad_norm": 0.6018566974452557,
"learning_rate": 3.997181334740408e-05,
"loss": 0.8202,
"step": 46
},
{
"epoch": 0.5949367088607594,
"grad_norm": 0.500026806418615,
"learning_rate": 3.9961638088943465e-05,
"loss": 0.7914,
"step": 47
},
{
"epoch": 0.6075949367088608,
"grad_norm": 0.5536025945061174,
"learning_rate": 3.994989955188427e-05,
"loss": 0.7837,
"step": 48
},
{
"epoch": 0.620253164556962,
"grad_norm": 0.4765009895810983,
"learning_rate": 3.993659865551998e-05,
"loss": 0.814,
"step": 49
},
{
"epoch": 0.6329113924050633,
"grad_norm": 0.47847721633882073,
"learning_rate": 3.99217364414989e-05,
"loss": 0.8074,
"step": 50
},
{
"epoch": 0.6455696202531646,
"grad_norm": 0.47280549738166827,
"learning_rate": 3.990531407374262e-05,
"loss": 0.8049,
"step": 51
},
{
"epoch": 0.6582278481012658,
"grad_norm": 0.42005120027664167,
"learning_rate": 3.9887332838354784e-05,
"loss": 0.8341,
"step": 52
},
{
"epoch": 0.6708860759493671,
"grad_norm": 0.423268612947329,
"learning_rate": 3.986779414352047e-05,
"loss": 0.7464,
"step": 53
},
{
"epoch": 0.6835443037974683,
"grad_norm": 0.42906392610316346,
"learning_rate": 3.984669951939583e-05,
"loss": 0.7935,
"step": 54
},
{
"epoch": 0.6962025316455697,
"grad_norm": 0.4670274491545538,
"learning_rate": 3.982405061798829e-05,
"loss": 0.7739,
"step": 55
},
{
"epoch": 0.7088607594936709,
"grad_norm": 0.5205590281674367,
"learning_rate": 3.9799849213027186e-05,
"loss": 0.8025,
"step": 56
},
{
"epoch": 0.7215189873417721,
"grad_norm": 0.47278324921832826,
"learning_rate": 3.9774097199824824e-05,
"loss": 0.7848,
"step": 57
},
{
"epoch": 0.7341772151898734,
"grad_norm": 0.45158006725544253,
"learning_rate": 3.974679659512807e-05,
"loss": 0.8095,
"step": 58
},
{
"epoch": 0.7468354430379747,
"grad_norm": 0.4391143464763561,
"learning_rate": 3.971794953696041e-05,
"loss": 0.783,
"step": 59
},
{
"epoch": 0.759493670886076,
"grad_norm": 0.45441260412279694,
"learning_rate": 3.9687558284454515e-05,
"loss": 0.7963,
"step": 60
},
{
"epoch": 0.7721518987341772,
"grad_norm": 0.4550922518852085,
"learning_rate": 3.96556252176753e-05,
"loss": 0.8334,
"step": 61
},
{
"epoch": 0.7848101265822784,
"grad_norm": 0.3712309670648417,
"learning_rate": 3.962215283743356e-05,
"loss": 0.8051,
"step": 62
},
{
"epoch": 0.7974683544303798,
"grad_norm": 0.3585063618209627,
"learning_rate": 3.9587143765090096e-05,
"loss": 0.8459,
"step": 63
},
{
"epoch": 0.810126582278481,
"grad_norm": 0.37138144105274296,
"learning_rate": 3.955060074235045e-05,
"loss": 0.853,
"step": 64
},
{
"epoch": 0.8227848101265823,
"grad_norm": 0.4363375524824417,
"learning_rate": 3.951252663105014e-05,
"loss": 0.8164,
"step": 65
},
{
"epoch": 0.8354430379746836,
"grad_norm": 0.47787123482225796,
"learning_rate": 3.947292441293062e-05,
"loss": 0.8222,
"step": 66
},
{
"epoch": 0.8481012658227848,
"grad_norm": 0.5017223706338163,
"learning_rate": 3.943179718940569e-05,
"loss": 0.7934,
"step": 67
},
{
"epoch": 0.8607594936708861,
"grad_norm": 0.4949845610321875,
"learning_rate": 3.938914818131866e-05,
"loss": 0.7981,
"step": 68
},
{
"epoch": 0.8734177215189873,
"grad_norm": 0.49920491454568733,
"learning_rate": 3.934498072869008e-05,
"loss": 0.8281,
"step": 69
},
{
"epoch": 0.8860759493670886,
"grad_norm": 0.45034846062729655,
"learning_rate": 3.92992982904562e-05,
"loss": 0.7719,
"step": 70
},
{
"epoch": 0.8987341772151899,
"grad_norm": 0.44394745297725413,
"learning_rate": 3.925210444419806e-05,
"loss": 0.7916,
"step": 71
},
{
"epoch": 0.9113924050632911,
"grad_norm": 0.38715911801365394,
"learning_rate": 3.9203402885861334e-05,
"loss": 0.8019,
"step": 72
},
{
"epoch": 0.9240506329113924,
"grad_norm": 0.48760649952731494,
"learning_rate": 3.915319742946687e-05,
"loss": 0.8223,
"step": 73
},
{
"epoch": 0.9367088607594937,
"grad_norm": 0.5616971172642145,
"learning_rate": 3.910149200681199e-05,
"loss": 0.8079,
"step": 74
},
{
"epoch": 0.9493670886075949,
"grad_norm": 0.5425560304150727,
"learning_rate": 3.904829066716263e-05,
"loss": 0.8329,
"step": 75
},
{
"epoch": 0.9620253164556962,
"grad_norm": 0.4498377825200447,
"learning_rate": 3.8993597576936154e-05,
"loss": 0.8053,
"step": 76
},
{
"epoch": 0.9746835443037974,
"grad_norm": 0.34642979074048774,
"learning_rate": 3.893741701937509e-05,
"loss": 0.8226,
"step": 77
},
{
"epoch": 0.9873417721518988,
"grad_norm": 0.4402127889952929,
"learning_rate": 3.887975339421172e-05,
"loss": 0.8552,
"step": 78
},
{
"epoch": 1.0,
"grad_norm": 0.44688397413240505,
"learning_rate": 3.882061121732349e-05,
"loss": 0.7747,
"step": 79
},
{
"epoch": 1.0126582278481013,
"grad_norm": 0.4768098067060385,
"learning_rate": 3.8759995120379355e-05,
"loss": 0.7735,
"step": 80
},
{
"epoch": 1.0253164556962024,
"grad_norm": 0.5623984337396924,
"learning_rate": 3.869790985047704e-05,
"loss": 0.7581,
"step": 81
},
{
"epoch": 1.0379746835443038,
"grad_norm": 0.576604325996257,
"learning_rate": 3.863436026977132e-05,
"loss": 0.7318,
"step": 82
},
{
"epoch": 1.0506329113924051,
"grad_norm": 0.5059595351594764,
"learning_rate": 3.85693513550932e-05,
"loss": 0.7125,
"step": 83
},
{
"epoch": 1.0632911392405062,
"grad_norm": 0.6040561142397214,
"learning_rate": 3.850288819756019e-05,
"loss": 0.7377,
"step": 84
},
{
"epoch": 1.0759493670886076,
"grad_norm": 0.5450877100328588,
"learning_rate": 3.843497600217757e-05,
"loss": 0.7555,
"step": 85
},
{
"epoch": 1.0886075949367089,
"grad_norm": 0.6556623197542489,
"learning_rate": 3.836562008743079e-05,
"loss": 0.7659,
"step": 86
},
{
"epoch": 1.1012658227848102,
"grad_norm": 0.6204913635746594,
"learning_rate": 3.8294825884868926e-05,
"loss": 0.6745,
"step": 87
},
{
"epoch": 1.1139240506329113,
"grad_norm": 0.6187620108671996,
"learning_rate": 3.822259893867936e-05,
"loss": 0.7572,
"step": 88
},
{
"epoch": 1.1265822784810127,
"grad_norm": 0.63340304982857,
"learning_rate": 3.814894490525356e-05,
"loss": 0.735,
"step": 89
},
{
"epoch": 1.139240506329114,
"grad_norm": 0.6506066390683658,
"learning_rate": 3.807386955274408e-05,
"loss": 0.7628,
"step": 90
},
{
"epoch": 1.1518987341772151,
"grad_norm": 0.7725263361365678,
"learning_rate": 3.79973787606129e-05,
"loss": 0.7295,
"step": 91
},
{
"epoch": 1.1645569620253164,
"grad_norm": 0.6852645943983061,
"learning_rate": 3.7919478519170917e-05,
"loss": 0.7416,
"step": 92
},
{
"epoch": 1.1772151898734178,
"grad_norm": 0.47014030202252644,
"learning_rate": 3.7840174929108854e-05,
"loss": 0.7671,
"step": 93
},
{
"epoch": 1.189873417721519,
"grad_norm": 0.7194557838652339,
"learning_rate": 3.775947420101948e-05,
"loss": 0.7862,
"step": 94
},
{
"epoch": 1.2025316455696202,
"grad_norm": 0.6351379876630061,
"learning_rate": 3.767738265491122e-05,
"loss": 0.7313,
"step": 95
},
{
"epoch": 1.2151898734177216,
"grad_norm": 0.5068026128077132,
"learning_rate": 3.759390671971325e-05,
"loss": 0.753,
"step": 96
},
{
"epoch": 1.2278481012658227,
"grad_norm": 0.5153165957337801,
"learning_rate": 3.750905293277197e-05,
"loss": 0.7108,
"step": 97
},
{
"epoch": 1.240506329113924,
"grad_norm": 0.4463914078700893,
"learning_rate": 3.742282793933906e-05,
"loss": 0.7415,
"step": 98
},
{
"epoch": 1.2531645569620253,
"grad_norm": 0.43884131852560887,
"learning_rate": 3.733523849205105e-05,
"loss": 0.7492,
"step": 99
},
{
"epoch": 1.2658227848101267,
"grad_norm": 0.47035841712987453,
"learning_rate": 3.724629145040056e-05,
"loss": 0.7238,
"step": 100
},
{
"epoch": 1.2784810126582278,
"grad_norm": 0.3948950399020623,
"learning_rate": 3.715599378019899e-05,
"loss": 0.7715,
"step": 101
},
{
"epoch": 1.2911392405063291,
"grad_norm": 0.4199178359283047,
"learning_rate": 3.7064352553031077e-05,
"loss": 0.7384,
"step": 102
},
{
"epoch": 1.3037974683544304,
"grad_norm": 0.3810474521618989,
"learning_rate": 3.6971374945701076e-05,
"loss": 0.7311,
"step": 103
},
{
"epoch": 1.3164556962025316,
"grad_norm": 0.43493588286043355,
"learning_rate": 3.687706823967073e-05,
"loss": 0.7197,
"step": 104
},
{
"epoch": 1.3291139240506329,
"grad_norm": 0.41688680545253287,
"learning_rate": 3.6781439820488974e-05,
"loss": 0.7375,
"step": 105
},
{
"epoch": 1.3417721518987342,
"grad_norm": 0.4278932668997099,
"learning_rate": 3.66844971772136e-05,
"loss": 0.7262,
"step": 106
},
{
"epoch": 1.3544303797468356,
"grad_norm": 0.4174836859191787,
"learning_rate": 3.6586247901824724e-05,
"loss": 0.7262,
"step": 107
},
{
"epoch": 1.3670886075949367,
"grad_norm": 0.38016614127893766,
"learning_rate": 3.648669968863023e-05,
"loss": 0.758,
"step": 108
},
{
"epoch": 1.379746835443038,
"grad_norm": 0.3858746148822291,
"learning_rate": 3.6385860333663236e-05,
"loss": 0.7175,
"step": 109
},
{
"epoch": 1.3924050632911391,
"grad_norm": 0.4231365876121473,
"learning_rate": 3.628373773407149e-05,
"loss": 0.7308,
"step": 110
},
{
"epoch": 1.4050632911392404,
"grad_norm": 0.3877572843485212,
"learning_rate": 3.6180339887498953e-05,
"loss": 0.7385,
"step": 111
},
{
"epoch": 1.4177215189873418,
"grad_norm": 0.4824392041857397,
"learning_rate": 3.6075674891459466e-05,
"loss": 0.7468,
"step": 112
},
{
"epoch": 1.4303797468354431,
"grad_norm": 0.3857299814846945,
"learning_rate": 3.59697509427026e-05,
"loss": 0.7357,
"step": 113
},
{
"epoch": 1.4430379746835442,
"grad_norm": 0.3846553226693622,
"learning_rate": 3.5862576336571725e-05,
"loss": 0.7079,
"step": 114
},
{
"epoch": 1.4556962025316456,
"grad_norm": 0.3824538614301241,
"learning_rate": 3.575415946635437e-05,
"loss": 0.7305,
"step": 115
},
{
"epoch": 1.4683544303797469,
"grad_norm": 0.48456126880227873,
"learning_rate": 3.564450882262491e-05,
"loss": 0.7712,
"step": 116
},
{
"epoch": 1.481012658227848,
"grad_norm": 0.43078583341853544,
"learning_rate": 3.5533632992579644e-05,
"loss": 0.7397,
"step": 117
},
{
"epoch": 1.4936708860759493,
"grad_norm": 0.3228096958908832,
"learning_rate": 3.542154065936429e-05,
"loss": 0.7628,
"step": 118
},
{
"epoch": 1.5063291139240507,
"grad_norm": 0.36882818550163615,
"learning_rate": 3.530824060139396e-05,
"loss": 0.7512,
"step": 119
},
{
"epoch": 1.518987341772152,
"grad_norm": 0.4152208277284043,
"learning_rate": 3.51937416916657e-05,
"loss": 0.7277,
"step": 120
},
{
"epoch": 1.5316455696202531,
"grad_norm": 0.45042265983600566,
"learning_rate": 3.507805289706362e-05,
"loss": 0.7774,
"step": 121
},
{
"epoch": 1.5443037974683544,
"grad_norm": 0.4093269543947324,
"learning_rate": 3.496118327765662e-05,
"loss": 0.7377,
"step": 122
},
{
"epoch": 1.5569620253164556,
"grad_norm": 0.34666215542123036,
"learning_rate": 3.48431419859889e-05,
"loss": 0.6989,
"step": 123
},
{
"epoch": 1.5696202531645569,
"grad_norm": 0.41171436492196756,
"learning_rate": 3.472393826636317e-05,
"loss": 0.7268,
"step": 124
},
{
"epoch": 1.5822784810126582,
"grad_norm": 0.39148886668508454,
"learning_rate": 3.460358145411669e-05,
"loss": 0.7261,
"step": 125
},
{
"epoch": 1.5949367088607596,
"grad_norm": 0.35889572408473175,
"learning_rate": 3.4482080974890163e-05,
"loss": 0.7589,
"step": 126
},
{
"epoch": 1.6075949367088609,
"grad_norm": 0.31803696533319203,
"learning_rate": 3.4359446343889624e-05,
"loss": 0.7575,
"step": 127
},
{
"epoch": 1.620253164556962,
"grad_norm": 0.3442315711735276,
"learning_rate": 3.4235687165141215e-05,
"loss": 0.7554,
"step": 128
},
{
"epoch": 1.6329113924050633,
"grad_norm": 0.3499026072634049,
"learning_rate": 3.411081313073906e-05,
"loss": 0.7267,
"step": 129
},
{
"epoch": 1.6455696202531644,
"grad_norm": 0.3260690025254289,
"learning_rate": 3.398483402008629e-05,
"loss": 0.7418,
"step": 130
},
{
"epoch": 1.6582278481012658,
"grad_norm": 0.34543980322658424,
"learning_rate": 3.385775969912908e-05,
"loss": 0.7427,
"step": 131
},
{
"epoch": 1.6708860759493671,
"grad_norm": 0.2988847374657102,
"learning_rate": 3.3729600119584115e-05,
"loss": 0.7638,
"step": 132
},
{
"epoch": 1.6835443037974684,
"grad_norm": 0.35687120319995136,
"learning_rate": 3.3600365318159136e-05,
"loss": 0.757,
"step": 133
},
{
"epoch": 1.6962025316455698,
"grad_norm": 0.38899552043256846,
"learning_rate": 3.3470065415767004e-05,
"loss": 0.7182,
"step": 134
},
{
"epoch": 1.7088607594936709,
"grad_norm": 0.3737695829412944,
"learning_rate": 3.333871061673302e-05,
"loss": 0.7499,
"step": 135
},
{
"epoch": 1.721518987341772,
"grad_norm": 0.34946652396486505,
"learning_rate": 3.3206311207995816e-05,
"loss": 0.751,
"step": 136
},
{
"epoch": 1.7341772151898733,
"grad_norm": 0.3905323790588904,
"learning_rate": 3.307287755830176e-05,
"loss": 0.7622,
"step": 137
},
{
"epoch": 1.7468354430379747,
"grad_norm": 0.4436210357200594,
"learning_rate": 3.2938420117392896e-05,
"loss": 0.7522,
"step": 138
},
{
"epoch": 1.759493670886076,
"grad_norm": 0.38693991633101565,
"learning_rate": 3.28029494151886e-05,
"loss": 0.7213,
"step": 139
},
{
"epoch": 1.7721518987341773,
"grad_norm": 0.39844510406756306,
"learning_rate": 3.2666476060960945e-05,
"loss": 0.7457,
"step": 140
},
{
"epoch": 1.7848101265822784,
"grad_norm": 0.3438391926323602,
"learning_rate": 3.252901074250384e-05,
"loss": 0.7379,
"step": 141
},
{
"epoch": 1.7974683544303798,
"grad_norm": 0.3706564483674349,
"learning_rate": 3.2390564225296015e-05,
"loss": 0.7182,
"step": 142
},
{
"epoch": 1.810126582278481,
"grad_norm": 0.3372688197355965,
"learning_rate": 3.225114735165797e-05,
"loss": 0.7246,
"step": 143
},
{
"epoch": 1.8227848101265822,
"grad_norm": 0.3490816845008961,
"learning_rate": 3.211077103990278e-05,
"loss": 0.7545,
"step": 144
},
{
"epoch": 1.8354430379746836,
"grad_norm": 0.3243213100104822,
"learning_rate": 3.1969446283481157e-05,
"loss": 0.7326,
"step": 145
},
{
"epoch": 1.8481012658227849,
"grad_norm": 0.33681274516080584,
"learning_rate": 3.18271841501204e-05,
"loss": 0.7286,
"step": 146
},
{
"epoch": 1.8607594936708862,
"grad_norm": 0.3263960804788961,
"learning_rate": 3.1683995780957706e-05,
"loss": 0.7408,
"step": 147
},
{
"epoch": 1.8734177215189873,
"grad_norm": 0.4287588920382392,
"learning_rate": 3.153989238966763e-05,
"loss": 0.7244,
"step": 148
},
{
"epoch": 1.8860759493670884,
"grad_norm": 0.38141083746028315,
"learning_rate": 3.13948852615839e-05,
"loss": 0.784,
"step": 149
},
{
"epoch": 1.8987341772151898,
"grad_norm": 0.34623678740897884,
"learning_rate": 3.124898575281562e-05,
"loss": 0.7436,
"step": 150
},
{
"epoch": 1.9113924050632911,
"grad_norm": 0.31200271097016913,
"learning_rate": 3.11022052893579e-05,
"loss": 0.7289,
"step": 151
},
{
"epoch": 1.9240506329113924,
"grad_norm": 0.345655007271936,
"learning_rate": 3.095455536619711e-05,
"loss": 0.7175,
"step": 152
},
{
"epoch": 1.9367088607594938,
"grad_norm": 0.2985815658075589,
"learning_rate": 3.080604754641057e-05,
"loss": 0.7395,
"step": 153
},
{
"epoch": 1.9493670886075949,
"grad_norm": 0.29992477661085093,
"learning_rate": 3.065669346026106e-05,
"loss": 0.7121,
"step": 154
},
{
"epoch": 1.9620253164556962,
"grad_norm": 0.3091766644852409,
"learning_rate": 3.0506504804285977e-05,
"loss": 0.7742,
"step": 155
},
{
"epoch": 1.9746835443037973,
"grad_norm": 0.3004387796291261,
"learning_rate": 3.0355493340381347e-05,
"loss": 0.7566,
"step": 156
},
{
"epoch": 1.9873417721518987,
"grad_norm": 0.2818258498162388,
"learning_rate": 3.020367089488069e-05,
"loss": 0.7343,
"step": 157
},
{
"epoch": 2.0,
"grad_norm": 0.46120255431120377,
"learning_rate": 3.0051049357628855e-05,
"loss": 0.7143,
"step": 158
},
{
"epoch": 2.0126582278481013,
"grad_norm": 0.7007547558193586,
"learning_rate": 2.9897640681050877e-05,
"loss": 0.6753,
"step": 159
},
{
"epoch": 2.0253164556962027,
"grad_norm": 0.9125428937129891,
"learning_rate": 2.9743456879215934e-05,
"loss": 0.6672,
"step": 160
},
{
"epoch": 2.037974683544304,
"grad_norm": 0.6576444790472206,
"learning_rate": 2.9588510026896485e-05,
"loss": 0.6357,
"step": 161
},
{
"epoch": 2.050632911392405,
"grad_norm": 0.6240263238612856,
"learning_rate": 2.9432812258622615e-05,
"loss": 0.654,
"step": 162
},
{
"epoch": 2.0632911392405062,
"grad_norm": 0.636252929878224,
"learning_rate": 2.9276375767731762e-05,
"loss": 0.6887,
"step": 163
},
{
"epoch": 2.0759493670886076,
"grad_norm": 0.5494250514074286,
"learning_rate": 2.91192128054138e-05,
"loss": 0.6473,
"step": 164
},
{
"epoch": 2.088607594936709,
"grad_norm": 0.5045894432855015,
"learning_rate": 2.8961335679751573e-05,
"loss": 0.6344,
"step": 165
},
{
"epoch": 2.1012658227848102,
"grad_norm": 0.5005394468401111,
"learning_rate": 2.8802756754757023e-05,
"loss": 0.6585,
"step": 166
},
{
"epoch": 2.1139240506329116,
"grad_norm": 0.49433013853859775,
"learning_rate": 2.864348844940292e-05,
"loss": 0.6407,
"step": 167
},
{
"epoch": 2.1265822784810124,
"grad_norm": 0.4621959574809036,
"learning_rate": 2.8483543236650245e-05,
"loss": 0.6551,
"step": 168
},
{
"epoch": 2.1392405063291138,
"grad_norm": 0.4815573164281136,
"learning_rate": 2.832293364247141e-05,
"loss": 0.646,
"step": 169
},
{
"epoch": 2.151898734177215,
"grad_norm": 0.41298258433472074,
"learning_rate": 2.81616722448693e-05,
"loss": 0.6421,
"step": 170
},
{
"epoch": 2.1645569620253164,
"grad_norm": 0.482755731008833,
"learning_rate": 2.7999771672892213e-05,
"loss": 0.6706,
"step": 171
},
{
"epoch": 2.1772151898734178,
"grad_norm": 0.4103357196002942,
"learning_rate": 2.783724460564485e-05,
"loss": 0.6301,
"step": 172
},
{
"epoch": 2.189873417721519,
"grad_norm": 0.4305743048719905,
"learning_rate": 2.7674103771295345e-05,
"loss": 0.645,
"step": 173
},
{
"epoch": 2.2025316455696204,
"grad_norm": 0.3767776887976166,
"learning_rate": 2.7510361946078482e-05,
"loss": 0.6574,
"step": 174
},
{
"epoch": 2.2151898734177213,
"grad_norm": 0.4176066280959923,
"learning_rate": 2.734603195329514e-05,
"loss": 0.6892,
"step": 175
},
{
"epoch": 2.2278481012658227,
"grad_norm": 0.43231771488851645,
"learning_rate": 2.7181126662308015e-05,
"loss": 0.6604,
"step": 176
},
{
"epoch": 2.240506329113924,
"grad_norm": 0.31970836274455994,
"learning_rate": 2.7015658987533807e-05,
"loss": 0.6631,
"step": 177
},
{
"epoch": 2.2531645569620253,
"grad_norm": 0.40412759499171763,
"learning_rate": 2.684964188743182e-05,
"loss": 0.6245,
"step": 178
},
{
"epoch": 2.2658227848101267,
"grad_norm": 0.33056157659688,
"learning_rate": 2.6683088363489118e-05,
"loss": 0.6591,
"step": 179
},
{
"epoch": 2.278481012658228,
"grad_norm": 0.3825570425350689,
"learning_rate": 2.6516011459202356e-05,
"loss": 0.6735,
"step": 180
},
{
"epoch": 2.291139240506329,
"grad_norm": 0.44429816345893774,
"learning_rate": 2.634842425905626e-05,
"loss": 0.6572,
"step": 181
},
{
"epoch": 2.3037974683544302,
"grad_norm": 0.36759937742377186,
"learning_rate": 2.618033988749895e-05,
"loss": 0.6773,
"step": 182
},
{
"epoch": 2.3164556962025316,
"grad_norm": 0.7066277320761736,
"learning_rate": 2.6011771507914096e-05,
"loss": 0.6712,
"step": 183
},
{
"epoch": 2.329113924050633,
"grad_norm": 0.3328197240799398,
"learning_rate": 2.5842732321590034e-05,
"loss": 0.6364,
"step": 184
},
{
"epoch": 2.3417721518987342,
"grad_norm": 0.32651931336905726,
"learning_rate": 2.5673235566685935e-05,
"loss": 0.6351,
"step": 185
},
{
"epoch": 2.3544303797468356,
"grad_norm": 0.30693242039281354,
"learning_rate": 2.5503294517195062e-05,
"loss": 0.6547,
"step": 186
},
{
"epoch": 2.367088607594937,
"grad_norm": 0.3203146955735119,
"learning_rate": 2.533292248190523e-05,
"loss": 0.6399,
"step": 187
},
{
"epoch": 2.379746835443038,
"grad_norm": 0.34990420700832414,
"learning_rate": 2.5162132803356522e-05,
"loss": 0.647,
"step": 188
},
{
"epoch": 2.392405063291139,
"grad_norm": 0.3521480721118919,
"learning_rate": 2.499093885679642e-05,
"loss": 0.6339,
"step": 189
},
{
"epoch": 2.4050632911392404,
"grad_norm": 0.30986847706468584,
"learning_rate": 2.481935404913229e-05,
"loss": 0.6454,
"step": 190
},
{
"epoch": 2.4177215189873418,
"grad_norm": 0.323632412960347,
"learning_rate": 2.4647391817881457e-05,
"loss": 0.6729,
"step": 191
},
{
"epoch": 2.430379746835443,
"grad_norm": 0.30405952286836885,
"learning_rate": 2.4475065630118855e-05,
"loss": 0.6392,
"step": 192
},
{
"epoch": 2.4430379746835444,
"grad_norm": 0.3154660761081758,
"learning_rate": 2.4302388981422355e-05,
"loss": 0.6673,
"step": 193
},
{
"epoch": 2.4556962025316453,
"grad_norm": 0.2941674571513733,
"learning_rate": 2.4129375394815878e-05,
"loss": 0.6517,
"step": 194
},
{
"epoch": 2.4683544303797467,
"grad_norm": 0.30895015825392347,
"learning_rate": 2.3956038419710348e-05,
"loss": 0.6633,
"step": 195
},
{
"epoch": 2.481012658227848,
"grad_norm": 0.27566382378720106,
"learning_rate": 2.3782391630842587e-05,
"loss": 0.6045,
"step": 196
},
{
"epoch": 2.4936708860759493,
"grad_norm": 0.3384225646038617,
"learning_rate": 2.3608448627212207e-05,
"loss": 0.6405,
"step": 197
},
{
"epoch": 2.5063291139240507,
"grad_norm": 0.2832906947274659,
"learning_rate": 2.343422303101664e-05,
"loss": 0.6848,
"step": 198
},
{
"epoch": 2.518987341772152,
"grad_norm": 0.3299659781184637,
"learning_rate": 2.3259728486584297e-05,
"loss": 0.6615,
"step": 199
},
{
"epoch": 2.5316455696202533,
"grad_norm": 0.3200656687512706,
"learning_rate": 2.3084978659306048e-05,
"loss": 0.6912,
"step": 200
},
{
"epoch": 2.5443037974683547,
"grad_norm": 0.308174008124388,
"learning_rate": 2.2909987234565026e-05,
"loss": 0.6785,
"step": 201
},
{
"epoch": 2.5569620253164556,
"grad_norm": 0.3544663621063538,
"learning_rate": 2.2734767916664854e-05,
"loss": 0.6323,
"step": 202
},
{
"epoch": 2.569620253164557,
"grad_norm": 0.3354628232787944,
"learning_rate": 2.255933442775641e-05,
"loss": 0.6524,
"step": 203
},
{
"epoch": 2.5822784810126582,
"grad_norm": 0.32689453372708943,
"learning_rate": 2.2383700506763204e-05,
"loss": 0.6748,
"step": 204
},
{
"epoch": 2.5949367088607596,
"grad_norm": 0.31737671727647254,
"learning_rate": 2.22078799083054e-05,
"loss": 0.6363,
"step": 205
},
{
"epoch": 2.607594936708861,
"grad_norm": 0.2825787669688371,
"learning_rate": 2.203188640162265e-05,
"loss": 0.6218,
"step": 206
},
{
"epoch": 2.620253164556962,
"grad_norm": 0.336620459599055,
"learning_rate": 2.185573376949578e-05,
"loss": 0.6731,
"step": 207
},
{
"epoch": 2.632911392405063,
"grad_norm": 0.3156677816260327,
"learning_rate": 2.167943580716737e-05,
"loss": 0.621,
"step": 208
},
{
"epoch": 2.6455696202531644,
"grad_norm": 0.34535943079382136,
"learning_rate": 2.150300632126142e-05,
"loss": 0.6288,
"step": 209
},
{
"epoch": 2.6582278481012658,
"grad_norm": 0.3124546569938381,
"learning_rate": 2.132645912870208e-05,
"loss": 0.6761,
"step": 210
},
{
"epoch": 2.670886075949367,
"grad_norm": 0.34600235147462305,
"learning_rate": 2.1149808055631603e-05,
"loss": 0.6679,
"step": 211
},
{
"epoch": 2.6835443037974684,
"grad_norm": 0.27065178228808984,
"learning_rate": 2.0973066936327565e-05,
"loss": 0.6321,
"step": 212
},
{
"epoch": 2.6962025316455698,
"grad_norm": 0.3362060155593516,
"learning_rate": 2.0796249612119405e-05,
"loss": 0.6677,
"step": 213
},
{
"epoch": 2.708860759493671,
"grad_norm": 0.2973638329827599,
"learning_rate": 2.061936993030451e-05,
"loss": 0.6412,
"step": 214
},
{
"epoch": 2.721518987341772,
"grad_norm": 0.3220694196130532,
"learning_rate": 2.044244174306374e-05,
"loss": 0.6541,
"step": 215
},
{
"epoch": 2.7341772151898733,
"grad_norm": 0.3027641705895448,
"learning_rate": 2.026547890637662e-05,
"loss": 0.6019,
"step": 216
},
{
"epoch": 2.7468354430379747,
"grad_norm": 0.3207247320311963,
"learning_rate": 2.0088495278936212e-05,
"loss": 0.6417,
"step": 217
},
{
"epoch": 2.759493670886076,
"grad_norm": 0.36163473633397253,
"learning_rate": 1.991150472106379e-05,
"loss": 0.6766,
"step": 218
},
{
"epoch": 2.7721518987341773,
"grad_norm": 0.2691926923083107,
"learning_rate": 1.9734521093623388e-05,
"loss": 0.6562,
"step": 219
},
{
"epoch": 2.7848101265822782,
"grad_norm": 0.34359844342765505,
"learning_rate": 1.9557558256936266e-05,
"loss": 0.6446,
"step": 220
},
{
"epoch": 2.7974683544303796,
"grad_norm": 0.2645899195098649,
"learning_rate": 1.9380630069695498e-05,
"loss": 0.6554,
"step": 221
},
{
"epoch": 2.810126582278481,
"grad_norm": 0.29583460748029244,
"learning_rate": 1.9203750387880602e-05,
"loss": 0.6227,
"step": 222
},
{
"epoch": 2.8227848101265822,
"grad_norm": 0.264820920107283,
"learning_rate": 1.9026933063672445e-05,
"loss": 0.6447,
"step": 223
},
{
"epoch": 2.8354430379746836,
"grad_norm": 0.30771232323975206,
"learning_rate": 1.88501919443684e-05,
"loss": 0.6315,
"step": 224
},
{
"epoch": 2.848101265822785,
"grad_norm": 0.2553957352820967,
"learning_rate": 1.8673540871297927e-05,
"loss": 0.6478,
"step": 225
},
{
"epoch": 2.8607594936708862,
"grad_norm": 0.32629305580702406,
"learning_rate": 1.8496993678738587e-05,
"loss": 0.6201,
"step": 226
},
{
"epoch": 2.8734177215189876,
"grad_norm": 0.26966933565658424,
"learning_rate": 1.8320564192832634e-05,
"loss": 0.6299,
"step": 227
},
{
"epoch": 2.8860759493670884,
"grad_norm": 0.31554171568490863,
"learning_rate": 1.8144266230504227e-05,
"loss": 0.6681,
"step": 228
},
{
"epoch": 2.8987341772151898,
"grad_norm": 0.24839739411092723,
"learning_rate": 1.7968113598377356e-05,
"loss": 0.6602,
"step": 229
},
{
"epoch": 2.911392405063291,
"grad_norm": 0.3091165612385795,
"learning_rate": 1.779212009169461e-05,
"loss": 0.6233,
"step": 230
},
{
"epoch": 2.9240506329113924,
"grad_norm": 0.2952522337046769,
"learning_rate": 1.7616299493236806e-05,
"loss": 0.5962,
"step": 231
},
{
"epoch": 2.9367088607594938,
"grad_norm": 0.25845370633874015,
"learning_rate": 1.74406655722436e-05,
"loss": 0.6636,
"step": 232
},
{
"epoch": 2.9493670886075947,
"grad_norm": 0.2648453369227927,
"learning_rate": 1.7265232083335153e-05,
"loss": 0.6138,
"step": 233
},
{
"epoch": 2.962025316455696,
"grad_norm": 0.273906606291419,
"learning_rate": 1.7090012765434974e-05,
"loss": 0.6455,
"step": 234
},
{
"epoch": 2.9746835443037973,
"grad_norm": 0.27228477538008333,
"learning_rate": 1.6915021340693952e-05,
"loss": 0.6657,
"step": 235
},
{
"epoch": 2.9873417721518987,
"grad_norm": 0.2522393915225999,
"learning_rate": 1.6740271513415706e-05,
"loss": 0.6516,
"step": 236
},
{
"epoch": 3.0,
"grad_norm": 0.8108189735549292,
"learning_rate": 1.6565776968983365e-05,
"loss": 0.5983,
"step": 237
},
{
"epoch": 3.0126582278481013,
"grad_norm": 1.232958795490363,
"learning_rate": 1.6391551372787796e-05,
"loss": 0.5634,
"step": 238
},
{
"epoch": 3.0253164556962027,
"grad_norm": 0.7844357772988507,
"learning_rate": 1.6217608369157417e-05,
"loss": 0.5224,
"step": 239
},
{
"epoch": 3.037974683544304,
"grad_norm": 0.7401617998186302,
"learning_rate": 1.6043961580289656e-05,
"loss": 0.5593,
"step": 240
},
{
"epoch": 3.050632911392405,
"grad_norm": 0.7914854111998476,
"learning_rate": 1.5870624605184125e-05,
"loss": 0.5482,
"step": 241
},
{
"epoch": 3.0632911392405062,
"grad_norm": 0.41812530802608444,
"learning_rate": 1.569761101857765e-05,
"loss": 0.5137,
"step": 242
},
{
"epoch": 3.0759493670886076,
"grad_norm": 0.7451435988290669,
"learning_rate": 1.552493436988115e-05,
"loss": 0.563,
"step": 243
},
{
"epoch": 3.088607594936709,
"grad_norm": 0.49313219014296494,
"learning_rate": 1.5352608182118546e-05,
"loss": 0.5424,
"step": 244
},
{
"epoch": 3.1012658227848102,
"grad_norm": 0.49266839267171114,
"learning_rate": 1.5180645950867714e-05,
"loss": 0.5249,
"step": 245
},
{
"epoch": 3.1139240506329116,
"grad_norm": 0.4851788656855937,
"learning_rate": 1.5009061143203585e-05,
"loss": 0.578,
"step": 246
},
{
"epoch": 3.1265822784810124,
"grad_norm": 0.4031242612434171,
"learning_rate": 1.4837867196643481e-05,
"loss": 0.5394,
"step": 247
},
{
"epoch": 3.1392405063291138,
"grad_norm": 0.4659828084891507,
"learning_rate": 1.466707751809478e-05,
"loss": 0.5356,
"step": 248
},
{
"epoch": 3.151898734177215,
"grad_norm": 0.4410019177616585,
"learning_rate": 1.4496705482804943e-05,
"loss": 0.5746,
"step": 249
},
{
"epoch": 3.1645569620253164,
"grad_norm": 0.4151476929851989,
"learning_rate": 1.4326764433314066e-05,
"loss": 0.5787,
"step": 250
},
{
"epoch": 3.1772151898734178,
"grad_norm": 0.383715772905628,
"learning_rate": 1.4157267678409969e-05,
"loss": 0.527,
"step": 251
},
{
"epoch": 3.189873417721519,
"grad_norm": 0.3626013559607639,
"learning_rate": 1.3988228492085911e-05,
"loss": 0.5433,
"step": 252
},
{
"epoch": 3.2025316455696204,
"grad_norm": 0.4357243664660663,
"learning_rate": 1.3819660112501054e-05,
"loss": 0.5408,
"step": 253
},
{
"epoch": 3.2151898734177213,
"grad_norm": 0.3559742228799713,
"learning_rate": 1.3651575740943746e-05,
"loss": 0.5162,
"step": 254
},
{
"epoch": 3.2278481012658227,
"grad_norm": 0.37225929452921475,
"learning_rate": 1.3483988540797652e-05,
"loss": 0.558,
"step": 255
},
{
"epoch": 3.240506329113924,
"grad_norm": 0.37206907021086916,
"learning_rate": 1.331691163651089e-05,
"loss": 0.533,
"step": 256
},
{
"epoch": 3.2531645569620253,
"grad_norm": 0.33216380945004786,
"learning_rate": 1.315035811256819e-05,
"loss": 0.5075,
"step": 257
},
{
"epoch": 3.2658227848101267,
"grad_norm": 0.3559597126097541,
"learning_rate": 1.29843410124662e-05,
"loss": 0.537,
"step": 258
},
{
"epoch": 3.278481012658228,
"grad_norm": 0.320506398509115,
"learning_rate": 1.2818873337691993e-05,
"loss": 0.5483,
"step": 259
},
{
"epoch": 3.291139240506329,
"grad_norm": 0.42262029967412273,
"learning_rate": 1.265396804670487e-05,
"loss": 0.5238,
"step": 260
},
{
"epoch": 3.3037974683544302,
"grad_norm": 0.30332749065508885,
"learning_rate": 1.2489638053921525e-05,
"loss": 0.5401,
"step": 261
},
{
"epoch": 3.3164556962025316,
"grad_norm": 0.29727127727930525,
"learning_rate": 1.2325896228704656e-05,
"loss": 0.5448,
"step": 262
},
{
"epoch": 3.329113924050633,
"grad_norm": 0.30191023413246443,
"learning_rate": 1.2162755394355152e-05,
"loss": 0.525,
"step": 263
},
{
"epoch": 3.3417721518987342,
"grad_norm": 0.26659773269162895,
"learning_rate": 1.2000228327107787e-05,
"loss": 0.5156,
"step": 264
},
{
"epoch": 3.3544303797468356,
"grad_norm": 0.2914720363368646,
"learning_rate": 1.1838327755130701e-05,
"loss": 0.5414,
"step": 265
},
{
"epoch": 3.367088607594937,
"grad_norm": 0.27291872478588397,
"learning_rate": 1.1677066357528591e-05,
"loss": 0.5423,
"step": 266
},
{
"epoch": 3.379746835443038,
"grad_norm": 0.26389957420834564,
"learning_rate": 1.151645676334976e-05,
"loss": 0.5566,
"step": 267
},
{
"epoch": 3.392405063291139,
"grad_norm": 0.2860279883558859,
"learning_rate": 1.1356511550597085e-05,
"loss": 0.5107,
"step": 268
},
{
"epoch": 3.4050632911392404,
"grad_norm": 0.25422431621057706,
"learning_rate": 1.1197243245242978e-05,
"loss": 0.5218,
"step": 269
},
{
"epoch": 3.4177215189873418,
"grad_norm": 0.25158942663841277,
"learning_rate": 1.103866432024843e-05,
"loss": 0.5262,
"step": 270
},
{
"epoch": 3.430379746835443,
"grad_norm": 0.2589285114479854,
"learning_rate": 1.0880787194586206e-05,
"loss": 0.5134,
"step": 271
},
{
"epoch": 3.4430379746835444,
"grad_norm": 0.2430388328861912,
"learning_rate": 1.0723624232268244e-05,
"loss": 0.5422,
"step": 272
},
{
"epoch": 3.4556962025316453,
"grad_norm": 0.25667840874038994,
"learning_rate": 1.0567187741377394e-05,
"loss": 0.5442,
"step": 273
},
{
"epoch": 3.4683544303797467,
"grad_norm": 0.2444208638869945,
"learning_rate": 1.0411489973103525e-05,
"loss": 0.5236,
"step": 274
},
{
"epoch": 3.481012658227848,
"grad_norm": 0.25520542385502243,
"learning_rate": 1.0256543120784074e-05,
"loss": 0.5307,
"step": 275
},
{
"epoch": 3.4936708860759493,
"grad_norm": 0.22684740767572276,
"learning_rate": 1.0102359318949136e-05,
"loss": 0.5146,
"step": 276
},
{
"epoch": 3.5063291139240507,
"grad_norm": 0.25241690666255856,
"learning_rate": 9.948950642371157e-06,
"loss": 0.5518,
"step": 277
},
{
"epoch": 3.518987341772152,
"grad_norm": 0.23534671335842364,
"learning_rate": 9.79632910511932e-06,
"loss": 0.5216,
"step": 278
},
{
"epoch": 3.5316455696202533,
"grad_norm": 0.24981549650103071,
"learning_rate": 9.64450665961866e-06,
"loss": 0.4958,
"step": 279
},
{
"epoch": 3.5443037974683547,
"grad_norm": 0.22674198828661088,
"learning_rate": 9.493495195714028e-06,
"loss": 0.5204,
"step": 280
},
{
"epoch": 3.5569620253164556,
"grad_norm": 0.22144196882208078,
"learning_rate": 9.343306539738947e-06,
"loss": 0.5227,
"step": 281
},
{
"epoch": 3.569620253164557,
"grad_norm": 0.2448947065204887,
"learning_rate": 9.193952453589436e-06,
"loss": 0.5384,
"step": 282
},
{
"epoch": 3.5822784810126582,
"grad_norm": 0.23465042331194777,
"learning_rate": 9.045444633802891e-06,
"loss": 0.535,
"step": 283
},
{
"epoch": 3.5949367088607596,
"grad_norm": 0.23044954454232866,
"learning_rate": 8.897794710642098e-06,
"loss": 0.543,
"step": 284
},
{
"epoch": 3.607594936708861,
"grad_norm": 0.23172133896309619,
"learning_rate": 8.751014247184388e-06,
"loss": 0.5676,
"step": 285
},
{
"epoch": 3.620253164556962,
"grad_norm": 0.2239517484243891,
"learning_rate": 8.605114738416103e-06,
"loss": 0.5513,
"step": 286
},
{
"epoch": 3.632911392405063,
"grad_norm": 0.22102907836542232,
"learning_rate": 8.460107610332371e-06,
"loss": 0.5388,
"step": 287
},
{
"epoch": 3.6455696202531644,
"grad_norm": 0.20401099424371477,
"learning_rate": 8.316004219042297e-06,
"loss": 0.5481,
"step": 288
},
{
"epoch": 3.6582278481012658,
"grad_norm": 0.228631540338843,
"learning_rate": 8.172815849879607e-06,
"loss": 0.528,
"step": 289
},
{
"epoch": 3.670886075949367,
"grad_norm": 0.22456084983117358,
"learning_rate": 8.030553716518853e-06,
"loss": 0.5367,
"step": 290
},
{
"epoch": 3.6835443037974684,
"grad_norm": 0.2144948006077472,
"learning_rate": 7.889228960097228e-06,
"loss": 0.5495,
"step": 291
},
{
"epoch": 3.6962025316455698,
"grad_norm": 0.20727905170409366,
"learning_rate": 7.748852648342038e-06,
"loss": 0.5353,
"step": 292
},
{
"epoch": 3.708860759493671,
"grad_norm": 0.22303267730870863,
"learning_rate": 7.609435774703982e-06,
"loss": 0.5529,
"step": 293
},
{
"epoch": 3.721518987341772,
"grad_norm": 0.22134421892787995,
"learning_rate": 7.470989257496164e-06,
"loss": 0.5393,
"step": 294
},
{
"epoch": 3.7341772151898733,
"grad_norm": 0.21417557348033345,
"learning_rate": 7.333523939039057e-06,
"loss": 0.5545,
"step": 295
},
{
"epoch": 3.7468354430379747,
"grad_norm": 0.21894766689359085,
"learning_rate": 7.197050584811405e-06,
"loss": 0.5298,
"step": 296
},
{
"epoch": 3.759493670886076,
"grad_norm": 0.2168491406607123,
"learning_rate": 7.061579882607108e-06,
"loss": 0.5169,
"step": 297
},
{
"epoch": 3.7721518987341773,
"grad_norm": 0.21650711756284644,
"learning_rate": 6.9271224416982394e-06,
"loss": 0.5638,
"step": 298
},
{
"epoch": 3.7848101265822782,
"grad_norm": 0.21621186272192375,
"learning_rate": 6.7936887920041825e-06,
"loss": 0.5747,
"step": 299
},
{
"epoch": 3.7974683544303796,
"grad_norm": 0.22551644100804194,
"learning_rate": 6.661289383266984e-06,
"loss": 0.526,
"step": 300
},
{
"epoch": 3.810126582278481,
"grad_norm": 0.21154644204339484,
"learning_rate": 6.529934584233e-06,
"loss": 0.5246,
"step": 301
},
{
"epoch": 3.8227848101265822,
"grad_norm": 0.21191179945056313,
"learning_rate": 6.399634681840865e-06,
"loss": 0.5202,
"step": 302
},
{
"epoch": 3.8354430379746836,
"grad_norm": 0.21949330053946264,
"learning_rate": 6.270399880415894e-06,
"loss": 0.5362,
"step": 303
},
{
"epoch": 3.848101265822785,
"grad_norm": 0.21496707551459943,
"learning_rate": 6.1422403008709255e-06,
"loss": 0.5578,
"step": 304
},
{
"epoch": 3.8607594936708862,
"grad_norm": 0.21879240379023276,
"learning_rate": 6.01516597991372e-06,
"loss": 0.5491,
"step": 305
},
{
"epoch": 3.8734177215189876,
"grad_norm": 0.20979663130601378,
"learning_rate": 5.889186869260941e-06,
"loss": 0.5531,
"step": 306
},
{
"epoch": 3.8860759493670884,
"grad_norm": 0.21587617052561336,
"learning_rate": 5.764312834858792e-06,
"loss": 0.55,
"step": 307
},
{
"epoch": 3.8987341772151898,
"grad_norm": 0.20020932496573068,
"learning_rate": 5.640553656110379e-06,
"loss": 0.5522,
"step": 308
},
{
"epoch": 3.911392405063291,
"grad_norm": 0.20804979119508457,
"learning_rate": 5.517919025109839e-06,
"loss": 0.5716,
"step": 309
},
{
"epoch": 3.9240506329113924,
"grad_norm": 0.20614216741584623,
"learning_rate": 5.396418545883318e-06,
"loss": 0.4969,
"step": 310
},
{
"epoch": 3.9367088607594938,
"grad_norm": 0.20211375359560393,
"learning_rate": 5.276061733636833e-06,
"loss": 0.5284,
"step": 311
},
{
"epoch": 3.9493670886075947,
"grad_norm": 0.19804610287780805,
"learning_rate": 5.156858014011104e-06,
"loss": 0.5728,
"step": 312
},
{
"epoch": 3.962025316455696,
"grad_norm": 0.20530850942815684,
"learning_rate": 5.038816722343387e-06,
"loss": 0.5184,
"step": 313
},
{
"epoch": 3.9746835443037973,
"grad_norm": 0.21108690974572725,
"learning_rate": 4.921947102936388e-06,
"loss": 0.5706,
"step": 314
},
{
"epoch": 3.9873417721518987,
"grad_norm": 0.19727815232903248,
"learning_rate": 4.806258308334306e-06,
"loss": 0.5692,
"step": 315
},
{
"epoch": 4.0,
"grad_norm": 0.9831501621136417,
"learning_rate": 4.69175939860605e-06,
"loss": 0.4787,
"step": 316
},
{
"epoch": 4.012658227848101,
"grad_norm": 0.4870908032169019,
"learning_rate": 4.578459340635719e-06,
"loss": 0.4425,
"step": 317
},
{
"epoch": 4.025316455696203,
"grad_norm": 0.4998371951593972,
"learning_rate": 4.466367007420365e-06,
"loss": 0.4669,
"step": 318
},
{
"epoch": 4.037974683544304,
"grad_norm": 0.7307305347874821,
"learning_rate": 4.3554911773751e-06,
"loss": 0.4637,
"step": 319
},
{
"epoch": 4.050632911392405,
"grad_norm": 0.5837987958939078,
"learning_rate": 4.2458405336456395e-06,
"loss": 0.5016,
"step": 320
},
{
"epoch": 4.063291139240507,
"grad_norm": 0.4363099151425374,
"learning_rate": 4.137423663428281e-06,
"loss": 0.4438,
"step": 321
},
{
"epoch": 4.075949367088608,
"grad_norm": 0.38653794232932476,
"learning_rate": 4.0302490572973996e-06,
"loss": 0.4521,
"step": 322
},
{
"epoch": 4.0886075949367084,
"grad_norm": 0.4955367791540508,
"learning_rate": 3.924325108540534e-06,
"loss": 0.4322,
"step": 323
},
{
"epoch": 4.10126582278481,
"grad_norm": 0.4852002953831795,
"learning_rate": 3.819660112501053e-06,
"loss": 0.4106,
"step": 324
},
{
"epoch": 4.113924050632911,
"grad_norm": 0.4095691127815476,
"learning_rate": 3.7162622659285185e-06,
"loss": 0.4734,
"step": 325
},
{
"epoch": 4.1265822784810124,
"grad_norm": 0.3007661855202624,
"learning_rate": 3.614139666336769e-06,
"loss": 0.4694,
"step": 326
},
{
"epoch": 4.139240506329114,
"grad_norm": 0.318953245064844,
"learning_rate": 3.5133003113697717e-06,
"loss": 0.4766,
"step": 327
},
{
"epoch": 4.151898734177215,
"grad_norm": 0.4934240339772568,
"learning_rate": 3.413752098175285e-06,
"loss": 0.4708,
"step": 328
},
{
"epoch": 4.1645569620253164,
"grad_norm": 0.4367189363583746,
"learning_rate": 3.315502822786407e-06,
"loss": 0.4662,
"step": 329
},
{
"epoch": 4.177215189873418,
"grad_norm": 0.29691965725456393,
"learning_rate": 3.21856017951103e-06,
"loss": 0.4829,
"step": 330
},
{
"epoch": 4.189873417721519,
"grad_norm": 0.21949706369317049,
"learning_rate": 3.1229317603292707e-06,
"loss": 0.4508,
"step": 331
},
{
"epoch": 4.2025316455696204,
"grad_norm": 0.30563669994626963,
"learning_rate": 3.0286250542989215e-06,
"loss": 0.4324,
"step": 332
},
{
"epoch": 4.215189873417722,
"grad_norm": 0.342864683828975,
"learning_rate": 2.93564744696893e-06,
"loss": 0.4352,
"step": 333
},
{
"epoch": 4.227848101265823,
"grad_norm": 0.31679071734680664,
"learning_rate": 2.8440062198010187e-06,
"loss": 0.4622,
"step": 334
},
{
"epoch": 4.2405063291139244,
"grad_norm": 0.24543561211413686,
"learning_rate": 2.753708549599443e-06,
"loss": 0.4803,
"step": 335
},
{
"epoch": 4.253164556962025,
"grad_norm": 0.21593306695142758,
"learning_rate": 2.664761507948945e-06,
"loss": 0.4645,
"step": 336
},
{
"epoch": 4.265822784810126,
"grad_norm": 0.23125129304548442,
"learning_rate": 2.5771720606609486e-06,
"loss": 0.4732,
"step": 337
},
{
"epoch": 4.2784810126582276,
"grad_norm": 0.3013286878261682,
"learning_rate": 2.4909470672280334e-06,
"loss": 0.4543,
"step": 338
},
{
"epoch": 4.291139240506329,
"grad_norm": 0.24233569882928516,
"learning_rate": 2.4060932802867498e-06,
"loss": 0.4459,
"step": 339
},
{
"epoch": 4.30379746835443,
"grad_norm": 0.23681738509390698,
"learning_rate": 2.322617345088778e-06,
"loss": 0.4625,
"step": 340
},
{
"epoch": 4.3164556962025316,
"grad_norm": 0.20138071955548387,
"learning_rate": 2.2405257989805264e-06,
"loss": 0.4611,
"step": 341
},
{
"epoch": 4.329113924050633,
"grad_norm": 0.21014293227174918,
"learning_rate": 2.1598250708911504e-06,
"loss": 0.5027,
"step": 342
},
{
"epoch": 4.341772151898734,
"grad_norm": 0.2289994155198736,
"learning_rate": 2.0805214808290896e-06,
"loss": 0.4529,
"step": 343
},
{
"epoch": 4.3544303797468356,
"grad_norm": 0.22414927849521216,
"learning_rate": 2.0026212393871057e-06,
"loss": 0.4592,
"step": 344
},
{
"epoch": 4.367088607594937,
"grad_norm": 0.22167034678197836,
"learning_rate": 1.926130447255925e-06,
"loss": 0.4539,
"step": 345
},
{
"epoch": 4.379746835443038,
"grad_norm": 0.18660957212383408,
"learning_rate": 1.8510550947464479e-06,
"loss": 0.4544,
"step": 346
},
{
"epoch": 4.3924050632911396,
"grad_norm": 0.18945678506406408,
"learning_rate": 1.7774010613206406e-06,
"loss": 0.4953,
"step": 347
},
{
"epoch": 4.405063291139241,
"grad_norm": 0.19337415817857806,
"learning_rate": 1.7051741151310786e-06,
"loss": 0.4648,
"step": 348
},
{
"epoch": 4.417721518987342,
"grad_norm": 0.19973937848994527,
"learning_rate": 1.6343799125692194e-06,
"loss": 0.4364,
"step": 349
},
{
"epoch": 4.430379746835443,
"grad_norm": 0.1999439373837463,
"learning_rate": 1.5650239978224346e-06,
"loss": 0.4487,
"step": 350
},
{
"epoch": 4.443037974683544,
"grad_norm": 0.19944223834974942,
"learning_rate": 1.4971118024398124e-06,
"loss": 0.5032,
"step": 351
},
{
"epoch": 4.455696202531645,
"grad_norm": 0.18772542243736196,
"learning_rate": 1.4306486449068002e-06,
"loss": 0.4732,
"step": 352
},
{
"epoch": 4.468354430379747,
"grad_norm": 0.1851071919628138,
"learning_rate": 1.365639730228685e-06,
"loss": 0.4691,
"step": 353
},
{
"epoch": 4.481012658227848,
"grad_norm": 0.19283845554051482,
"learning_rate": 1.3020901495229632e-06,
"loss": 0.4812,
"step": 354
},
{
"epoch": 4.493670886075949,
"grad_norm": 0.17316072830820867,
"learning_rate": 1.240004879620651e-06,
"loss": 0.4464,
"step": 355
},
{
"epoch": 4.506329113924051,
"grad_norm": 0.26638042061870965,
"learning_rate": 1.1793887826765094e-06,
"loss": 0.4568,
"step": 356
},
{
"epoch": 4.518987341772152,
"grad_norm": 0.1769364498679372,
"learning_rate": 1.1202466057882777e-06,
"loss": 0.4351,
"step": 357
},
{
"epoch": 4.531645569620253,
"grad_norm": 0.21054583188705406,
"learning_rate": 1.0625829806249133e-06,
"loss": 0.4736,
"step": 358
},
{
"epoch": 4.544303797468355,
"grad_norm": 0.18281527921298735,
"learning_rate": 1.0064024230638547e-06,
"loss": 0.464,
"step": 359
},
{
"epoch": 4.556962025316456,
"grad_norm": 0.18749354572565427,
"learning_rate": 9.517093328373739e-07,
"loss": 0.4347,
"step": 360
},
{
"epoch": 4.569620253164557,
"grad_norm": 0.19801669246463768,
"learning_rate": 8.985079931880114e-07,
"loss": 0.4556,
"step": 361
},
{
"epoch": 4.582278481012658,
"grad_norm": 0.1707845437181247,
"learning_rate": 8.468025705331406e-07,
"loss": 0.4678,
"step": 362
},
{
"epoch": 4.594936708860759,
"grad_norm": 0.17927167077263217,
"learning_rate": 7.965971141386708e-07,
"loss": 0.4805,
"step": 363
},
{
"epoch": 4.6075949367088604,
"grad_norm": 0.18123408677924785,
"learning_rate": 7.478955558019408e-07,
"loss": 0.509,
"step": 364
},
{
"epoch": 4.620253164556962,
"grad_norm": 0.17934939572634542,
"learning_rate": 7.007017095438029e-07,
"loss": 0.4385,
"step": 365
},
{
"epoch": 4.632911392405063,
"grad_norm": 0.18707997046238803,
"learning_rate": 6.550192713099224e-07,
"loss": 0.4341,
"step": 366
},
{
"epoch": 4.6455696202531644,
"grad_norm": 0.18084289990559332,
"learning_rate": 6.108518186813462e-07,
"loss": 0.4702,
"step": 367
},
{
"epoch": 4.658227848101266,
"grad_norm": 0.17471257906003163,
"learning_rate": 5.682028105943161e-07,
"loss": 0.4348,
"step": 368
},
{
"epoch": 4.670886075949367,
"grad_norm": 0.17312815925285896,
"learning_rate": 5.270755870693877e-07,
"loss": 0.4369,
"step": 369
},
{
"epoch": 4.6835443037974684,
"grad_norm": 0.17442658053131146,
"learning_rate": 4.874733689498645e-07,
"loss": 0.4593,
"step": 370
},
{
"epoch": 4.69620253164557,
"grad_norm": 0.17553291293796242,
"learning_rate": 4.493992576495609e-07,
"loss": 0.4607,
"step": 371
},
{
"epoch": 4.708860759493671,
"grad_norm": 0.17573326550033613,
"learning_rate": 4.1285623490990413e-07,
"loss": 0.4422,
"step": 372
},
{
"epoch": 4.7215189873417724,
"grad_norm": 0.17271762501593244,
"learning_rate": 3.778471625664404e-07,
"loss": 0.4476,
"step": 373
},
{
"epoch": 4.734177215189874,
"grad_norm": 0.16852411866432154,
"learning_rate": 3.4437478232470123e-07,
"loss": 0.4392,
"step": 374
},
{
"epoch": 4.746835443037975,
"grad_norm": 0.17227172970015317,
"learning_rate": 3.124417155454884e-07,
"loss": 0.433,
"step": 375
},
{
"epoch": 4.759493670886076,
"grad_norm": 0.16744296803228972,
"learning_rate": 2.820504630395915e-07,
"loss": 0.4389,
"step": 376
},
{
"epoch": 4.772151898734177,
"grad_norm": 0.16903573661066207,
"learning_rate": 2.532034048719312e-07,
"loss": 0.4574,
"step": 377
},
{
"epoch": 4.784810126582278,
"grad_norm": 0.17807232065432885,
"learning_rate": 2.259028001751773e-07,
"loss": 0.4046,
"step": 378
},
{
"epoch": 4.7974683544303796,
"grad_norm": 0.18476464618425043,
"learning_rate": 2.0015078697281477e-07,
"loss": 0.4509,
"step": 379
},
{
"epoch": 4.810126582278481,
"grad_norm": 0.16737994039781423,
"learning_rate": 1.7594938201170863e-07,
"loss": 0.4227,
"step": 380
},
{
"epoch": 4.822784810126582,
"grad_norm": 0.2648344981339244,
"learning_rate": 1.533004806041727e-07,
"loss": 0.4483,
"step": 381
},
{
"epoch": 4.8354430379746836,
"grad_norm": 0.16978562036616543,
"learning_rate": 1.3220585647953256e-07,
"loss": 0.4477,
"step": 382
},
{
"epoch": 4.848101265822785,
"grad_norm": 0.1642498077680966,
"learning_rate": 1.1266716164521906e-07,
"loss": 0.4584,
"step": 383
},
{
"epoch": 4.860759493670886,
"grad_norm": 0.1690920633180304,
"learning_rate": 9.46859262573896e-08,
"loss": 0.4531,
"step": 384
},
{
"epoch": 4.8734177215189876,
"grad_norm": 0.17582359187836866,
"learning_rate": 7.826355850110378e-08,
"loss": 0.4448,
"step": 385
},
{
"epoch": 4.886075949367089,
"grad_norm": 0.17280369527452052,
"learning_rate": 6.340134448002966e-08,
"loss": 0.4594,
"step": 386
},
{
"epoch": 4.89873417721519,
"grad_norm": 0.16986660465712172,
"learning_rate": 5.0100448115737446e-08,
"loss": 0.4524,
"step": 387
},
{
"epoch": 4.911392405063291,
"grad_norm": 0.1791349245983108,
"learning_rate": 3.836191105654141e-08,
"loss": 0.4791,
"step": 388
},
{
"epoch": 4.924050632911392,
"grad_norm": 0.1651267565065545,
"learning_rate": 2.8186652595918464e-08,
"loss": 0.4937,
"step": 389
},
{
"epoch": 4.936708860759493,
"grad_norm": 0.17381577748034596,
"learning_rate": 1.957546960052792e-08,
"loss": 0.457,
"step": 390
},
{
"epoch": 4.949367088607595,
"grad_norm": 0.16169549660015017,
"learning_rate": 1.2529036447792576e-08,
"loss": 0.4568,
"step": 391
},
{
"epoch": 4.962025316455696,
"grad_norm": 0.1649898709443881,
"learning_rate": 7.0479049730920454e-09,
"loss": 0.4363,
"step": 392
},
{
"epoch": 4.974683544303797,
"grad_norm": 0.171903361970066,
"learning_rate": 3.132504426548444e-09,
"loss": 0.4822,
"step": 393
},
{
"epoch": 4.987341772151899,
"grad_norm": 0.1729850210189732,
"learning_rate": 7.831414393999481e-10,
"loss": 0.4518,
"step": 394
},
{
"epoch": 5.0,
"grad_norm": 0.38031759897505635,
"learning_rate": 0.0,
"loss": 0.4016,
"step": 395
},
{
"epoch": 5.0,
"step": 395,
"total_flos": 5028535166042112.0,
"train_loss": 0.637588576350031,
"train_runtime": 11226.821,
"train_samples_per_second": 4.454,
"train_steps_per_second": 0.035
}
],
"logging_steps": 1,
"max_steps": 395,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5028535166042112.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}