{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 978,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003067484662576687,
"grad_norm": 0.7997869645290624,
"learning_rate": 1.0204081632653061e-07,
"loss": 0.0303,
"step": 1
},
{
"epoch": 0.006134969325153374,
"grad_norm": 1.3916303874119316,
"learning_rate": 2.0408163265306121e-07,
"loss": 0.0855,
"step": 2
},
{
"epoch": 0.009202453987730062,
"grad_norm": 0.9776667246072426,
"learning_rate": 3.0612244897959183e-07,
"loss": 0.0244,
"step": 3
},
{
"epoch": 0.012269938650306749,
"grad_norm": 0.9337405630277921,
"learning_rate": 4.0816326530612243e-07,
"loss": 0.0266,
"step": 4
},
{
"epoch": 0.015337423312883436,
"grad_norm": 0.6813063564646261,
"learning_rate": 5.102040816326531e-07,
"loss": 0.0138,
"step": 5
},
{
"epoch": 0.018404907975460124,
"grad_norm": 0.7694825948696452,
"learning_rate": 6.122448979591837e-07,
"loss": 0.0277,
"step": 6
},
{
"epoch": 0.02147239263803681,
"grad_norm": 1.019918445629934,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0431,
"step": 7
},
{
"epoch": 0.024539877300613498,
"grad_norm": 0.8356666628019843,
"learning_rate": 8.163265306122449e-07,
"loss": 0.0376,
"step": 8
},
{
"epoch": 0.027607361963190184,
"grad_norm": 0.9661685002153698,
"learning_rate": 9.183673469387756e-07,
"loss": 0.0252,
"step": 9
},
{
"epoch": 0.03067484662576687,
"grad_norm": 1.0057781682964735,
"learning_rate": 1.0204081632653063e-06,
"loss": 0.0551,
"step": 10
},
{
"epoch": 0.03374233128834356,
"grad_norm": 1.27237502100699,
"learning_rate": 1.122448979591837e-06,
"loss": 0.0616,
"step": 11
},
{
"epoch": 0.03680981595092025,
"grad_norm": 0.8635209118844634,
"learning_rate": 1.2244897959183673e-06,
"loss": 0.0356,
"step": 12
},
{
"epoch": 0.03987730061349693,
"grad_norm": 1.0917475029682462,
"learning_rate": 1.3265306122448982e-06,
"loss": 0.0517,
"step": 13
},
{
"epoch": 0.04294478527607362,
"grad_norm": 0.9971259570435305,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0388,
"step": 14
},
{
"epoch": 0.046012269938650305,
"grad_norm": 1.18589108762414,
"learning_rate": 1.5306122448979593e-06,
"loss": 0.0411,
"step": 15
},
{
"epoch": 0.049079754601226995,
"grad_norm": 0.7687072641757458,
"learning_rate": 1.6326530612244897e-06,
"loss": 0.0298,
"step": 16
},
{
"epoch": 0.05214723926380368,
"grad_norm": 0.7583676893025098,
"learning_rate": 1.7346938775510206e-06,
"loss": 0.0341,
"step": 17
},
{
"epoch": 0.05521472392638037,
"grad_norm": 1.1470980781176612,
"learning_rate": 1.8367346938775512e-06,
"loss": 0.0382,
"step": 18
},
{
"epoch": 0.05828220858895705,
"grad_norm": 1.0121900682885199,
"learning_rate": 1.938775510204082e-06,
"loss": 0.038,
"step": 19
},
{
"epoch": 0.06134969325153374,
"grad_norm": 1.0122959695964604,
"learning_rate": 2.0408163265306125e-06,
"loss": 0.0312,
"step": 20
},
{
"epoch": 0.06441717791411043,
"grad_norm": 0.8672537023810776,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.0343,
"step": 21
},
{
"epoch": 0.06748466257668712,
"grad_norm": 0.8464502422934749,
"learning_rate": 2.244897959183674e-06,
"loss": 0.0322,
"step": 22
},
{
"epoch": 0.0705521472392638,
"grad_norm": 0.9014580063188392,
"learning_rate": 2.3469387755102044e-06,
"loss": 0.0517,
"step": 23
},
{
"epoch": 0.0736196319018405,
"grad_norm": 0.644281280221218,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.0196,
"step": 24
},
{
"epoch": 0.07668711656441718,
"grad_norm": 1.153639389526131,
"learning_rate": 2.5510204081632657e-06,
"loss": 0.0575,
"step": 25
},
{
"epoch": 0.07975460122699386,
"grad_norm": 0.5976101616752824,
"learning_rate": 2.6530612244897964e-06,
"loss": 0.0229,
"step": 26
},
{
"epoch": 0.08282208588957055,
"grad_norm": 0.6774405227259789,
"learning_rate": 2.7551020408163266e-06,
"loss": 0.0269,
"step": 27
},
{
"epoch": 0.08588957055214724,
"grad_norm": 0.8858054392922697,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0346,
"step": 28
},
{
"epoch": 0.08895705521472393,
"grad_norm": 0.6961058532345435,
"learning_rate": 2.959183673469388e-06,
"loss": 0.0439,
"step": 29
},
{
"epoch": 0.09202453987730061,
"grad_norm": 0.5514138971509586,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.0142,
"step": 30
},
{
"epoch": 0.0950920245398773,
"grad_norm": 0.9140038253698272,
"learning_rate": 3.1632653061224496e-06,
"loss": 0.0314,
"step": 31
},
{
"epoch": 0.09815950920245399,
"grad_norm": 0.7760922802051613,
"learning_rate": 3.2653061224489794e-06,
"loss": 0.0268,
"step": 32
},
{
"epoch": 0.10122699386503067,
"grad_norm": 0.6792779586916694,
"learning_rate": 3.3673469387755105e-06,
"loss": 0.0271,
"step": 33
},
{
"epoch": 0.10429447852760736,
"grad_norm": 0.6969367770150492,
"learning_rate": 3.469387755102041e-06,
"loss": 0.028,
"step": 34
},
{
"epoch": 0.10736196319018405,
"grad_norm": 0.86611814489453,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.0388,
"step": 35
},
{
"epoch": 0.11042944785276074,
"grad_norm": 0.6484808223290501,
"learning_rate": 3.6734693877551024e-06,
"loss": 0.0335,
"step": 36
},
{
"epoch": 0.11349693251533742,
"grad_norm": 0.5965376825578463,
"learning_rate": 3.7755102040816327e-06,
"loss": 0.0252,
"step": 37
},
{
"epoch": 0.1165644171779141,
"grad_norm": 0.6268545591800431,
"learning_rate": 3.877551020408164e-06,
"loss": 0.0142,
"step": 38
},
{
"epoch": 0.1196319018404908,
"grad_norm": 0.6668195357079019,
"learning_rate": 3.979591836734694e-06,
"loss": 0.0217,
"step": 39
},
{
"epoch": 0.12269938650306748,
"grad_norm": 0.5086913557218375,
"learning_rate": 4.081632653061225e-06,
"loss": 0.0253,
"step": 40
},
{
"epoch": 0.12576687116564417,
"grad_norm": 0.730907605009217,
"learning_rate": 4.183673469387755e-06,
"loss": 0.0228,
"step": 41
},
{
"epoch": 0.12883435582822086,
"grad_norm": 0.7102463889462425,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.0293,
"step": 42
},
{
"epoch": 0.13190184049079753,
"grad_norm": 0.4909211588758295,
"learning_rate": 4.3877551020408165e-06,
"loss": 0.017,
"step": 43
},
{
"epoch": 0.13496932515337423,
"grad_norm": 0.32680440879911227,
"learning_rate": 4.489795918367348e-06,
"loss": 0.0161,
"step": 44
},
{
"epoch": 0.13803680981595093,
"grad_norm": 0.4833196352907147,
"learning_rate": 4.591836734693878e-06,
"loss": 0.0353,
"step": 45
},
{
"epoch": 0.1411042944785276,
"grad_norm": 0.36928803194700305,
"learning_rate": 4.693877551020409e-06,
"loss": 0.02,
"step": 46
},
{
"epoch": 0.1441717791411043,
"grad_norm": 0.4472383224420318,
"learning_rate": 4.795918367346939e-06,
"loss": 0.0131,
"step": 47
},
{
"epoch": 0.147239263803681,
"grad_norm": 0.36087285993792934,
"learning_rate": 4.897959183673469e-06,
"loss": 0.0159,
"step": 48
},
{
"epoch": 0.15030674846625766,
"grad_norm": 0.5556656554696969,
"learning_rate": 5e-06,
"loss": 0.0268,
"step": 49
},
{
"epoch": 0.15337423312883436,
"grad_norm": 0.8497844272956676,
"learning_rate": 5.1020408163265315e-06,
"loss": 0.0454,
"step": 50
},
{
"epoch": 0.15644171779141106,
"grad_norm": 0.4536793707964894,
"learning_rate": 5.204081632653062e-06,
"loss": 0.0201,
"step": 51
},
{
"epoch": 0.15950920245398773,
"grad_norm": 0.7743465371861137,
"learning_rate": 5.306122448979593e-06,
"loss": 0.0386,
"step": 52
},
{
"epoch": 0.16257668711656442,
"grad_norm": 0.49482528494719774,
"learning_rate": 5.408163265306123e-06,
"loss": 0.0209,
"step": 53
},
{
"epoch": 0.1656441717791411,
"grad_norm": 0.43390985077433325,
"learning_rate": 5.510204081632653e-06,
"loss": 0.0154,
"step": 54
},
{
"epoch": 0.1687116564417178,
"grad_norm": 0.19867010422132667,
"learning_rate": 5.6122448979591834e-06,
"loss": 0.0063,
"step": 55
},
{
"epoch": 0.17177914110429449,
"grad_norm": 0.42254705168474815,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0156,
"step": 56
},
{
"epoch": 0.17484662576687116,
"grad_norm": 0.483259025167169,
"learning_rate": 5.816326530612246e-06,
"loss": 0.0234,
"step": 57
},
{
"epoch": 0.17791411042944785,
"grad_norm": 0.48205583406482455,
"learning_rate": 5.918367346938776e-06,
"loss": 0.0181,
"step": 58
},
{
"epoch": 0.18098159509202455,
"grad_norm": 0.5452087947152122,
"learning_rate": 6.020408163265307e-06,
"loss": 0.0239,
"step": 59
},
{
"epoch": 0.18404907975460122,
"grad_norm": 0.47352939498338353,
"learning_rate": 6.122448979591837e-06,
"loss": 0.0207,
"step": 60
},
{
"epoch": 0.18711656441717792,
"grad_norm": 0.5722083241556664,
"learning_rate": 6.224489795918368e-06,
"loss": 0.0346,
"step": 61
},
{
"epoch": 0.1901840490797546,
"grad_norm": 0.35217938893247586,
"learning_rate": 6.326530612244899e-06,
"loss": 0.0145,
"step": 62
},
{
"epoch": 0.19325153374233128,
"grad_norm": 0.4140072480123712,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.0152,
"step": 63
},
{
"epoch": 0.19631901840490798,
"grad_norm": 0.49720539052583107,
"learning_rate": 6.530612244897959e-06,
"loss": 0.0296,
"step": 64
},
{
"epoch": 0.19938650306748465,
"grad_norm": 0.469616800045718,
"learning_rate": 6.63265306122449e-06,
"loss": 0.0174,
"step": 65
},
{
"epoch": 0.20245398773006135,
"grad_norm": 0.618218528026767,
"learning_rate": 6.734693877551021e-06,
"loss": 0.0344,
"step": 66
},
{
"epoch": 0.20552147239263804,
"grad_norm": 0.7394292500503313,
"learning_rate": 6.836734693877551e-06,
"loss": 0.0324,
"step": 67
},
{
"epoch": 0.2085889570552147,
"grad_norm": 0.5210994325814557,
"learning_rate": 6.938775510204082e-06,
"loss": 0.0228,
"step": 68
},
{
"epoch": 0.2116564417177914,
"grad_norm": 0.4487340952497001,
"learning_rate": 7.0408163265306125e-06,
"loss": 0.0227,
"step": 69
},
{
"epoch": 0.2147239263803681,
"grad_norm": 0.49919865989262474,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.0149,
"step": 70
},
{
"epoch": 0.21779141104294478,
"grad_norm": 0.42206279707783706,
"learning_rate": 7.244897959183675e-06,
"loss": 0.0151,
"step": 71
},
{
"epoch": 0.22085889570552147,
"grad_norm": 0.31366279555645177,
"learning_rate": 7.346938775510205e-06,
"loss": 0.0094,
"step": 72
},
{
"epoch": 0.22392638036809817,
"grad_norm": 0.5571009715110524,
"learning_rate": 7.448979591836736e-06,
"loss": 0.0259,
"step": 73
},
{
"epoch": 0.22699386503067484,
"grad_norm": 0.541523890264921,
"learning_rate": 7.551020408163265e-06,
"loss": 0.0157,
"step": 74
},
{
"epoch": 0.23006134969325154,
"grad_norm": 0.7290444815058128,
"learning_rate": 7.653061224489796e-06,
"loss": 0.0417,
"step": 75
},
{
"epoch": 0.2331288343558282,
"grad_norm": 0.36222020238590813,
"learning_rate": 7.755102040816327e-06,
"loss": 0.0162,
"step": 76
},
{
"epoch": 0.2361963190184049,
"grad_norm": 0.6725897974035238,
"learning_rate": 7.857142857142858e-06,
"loss": 0.0367,
"step": 77
},
{
"epoch": 0.2392638036809816,
"grad_norm": 0.47377840842133173,
"learning_rate": 7.959183673469388e-06,
"loss": 0.0194,
"step": 78
},
{
"epoch": 0.24233128834355827,
"grad_norm": 0.6367586297561662,
"learning_rate": 8.06122448979592e-06,
"loss": 0.0367,
"step": 79
},
{
"epoch": 0.24539877300613497,
"grad_norm": 0.5733239067596057,
"learning_rate": 8.16326530612245e-06,
"loss": 0.0266,
"step": 80
},
{
"epoch": 0.24846625766871167,
"grad_norm": 0.3752422775055063,
"learning_rate": 8.26530612244898e-06,
"loss": 0.0165,
"step": 81
},
{
"epoch": 0.25153374233128833,
"grad_norm": 0.8041178386540389,
"learning_rate": 8.36734693877551e-06,
"loss": 0.0482,
"step": 82
},
{
"epoch": 0.254601226993865,
"grad_norm": 0.48929300164772,
"learning_rate": 8.469387755102042e-06,
"loss": 0.0242,
"step": 83
},
{
"epoch": 0.25766871165644173,
"grad_norm": 0.38718573399562517,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0213,
"step": 84
},
{
"epoch": 0.2607361963190184,
"grad_norm": 0.4376639244558999,
"learning_rate": 8.673469387755103e-06,
"loss": 0.019,
"step": 85
},
{
"epoch": 0.26380368098159507,
"grad_norm": 0.5268233065102302,
"learning_rate": 8.775510204081633e-06,
"loss": 0.0247,
"step": 86
},
{
"epoch": 0.2668711656441718,
"grad_norm": 0.7284585489566475,
"learning_rate": 8.877551020408163e-06,
"loss": 0.0498,
"step": 87
},
{
"epoch": 0.26993865030674846,
"grad_norm": 0.548627684829585,
"learning_rate": 8.979591836734695e-06,
"loss": 0.0399,
"step": 88
},
{
"epoch": 0.27300613496932513,
"grad_norm": 0.49417006334168573,
"learning_rate": 9.081632653061225e-06,
"loss": 0.0241,
"step": 89
},
{
"epoch": 0.27607361963190186,
"grad_norm": 0.35327219241848984,
"learning_rate": 9.183673469387756e-06,
"loss": 0.012,
"step": 90
},
{
"epoch": 0.2791411042944785,
"grad_norm": 0.5651567661019409,
"learning_rate": 9.285714285714288e-06,
"loss": 0.0306,
"step": 91
},
{
"epoch": 0.2822085889570552,
"grad_norm": 0.4977883329448499,
"learning_rate": 9.387755102040818e-06,
"loss": 0.0331,
"step": 92
},
{
"epoch": 0.2852760736196319,
"grad_norm": 0.4742291191076062,
"learning_rate": 9.489795918367348e-06,
"loss": 0.0264,
"step": 93
},
{
"epoch": 0.2883435582822086,
"grad_norm": 0.4979060862617067,
"learning_rate": 9.591836734693878e-06,
"loss": 0.0196,
"step": 94
},
{
"epoch": 0.29141104294478526,
"grad_norm": 0.4618772300579882,
"learning_rate": 9.693877551020408e-06,
"loss": 0.0312,
"step": 95
},
{
"epoch": 0.294478527607362,
"grad_norm": 0.5244902559593818,
"learning_rate": 9.795918367346939e-06,
"loss": 0.0135,
"step": 96
},
{
"epoch": 0.29754601226993865,
"grad_norm": 0.42514924793603937,
"learning_rate": 9.89795918367347e-06,
"loss": 0.0134,
"step": 97
},
{
"epoch": 0.3006134969325153,
"grad_norm": 0.6303178022610803,
"learning_rate": 1e-05,
"loss": 0.0431,
"step": 98
},
{
"epoch": 0.30368098159509205,
"grad_norm": 0.43154235410273956,
"learning_rate": 9.999968137932856e-06,
"loss": 0.0256,
"step": 99
},
{
"epoch": 0.3067484662576687,
"grad_norm": 0.3913466056843226,
"learning_rate": 9.999872552137497e-06,
"loss": 0.0209,
"step": 100
},
{
"epoch": 0.3098159509202454,
"grad_norm": 0.3803054399869341,
"learning_rate": 9.999713243832148e-06,
"loss": 0.0176,
"step": 101
},
{
"epoch": 0.3128834355828221,
"grad_norm": 0.560817485298493,
"learning_rate": 9.999490215047167e-06,
"loss": 0.034,
"step": 102
},
{
"epoch": 0.3159509202453988,
"grad_norm": 0.5606212067152239,
"learning_rate": 9.999203468625017e-06,
"loss": 0.0223,
"step": 103
},
{
"epoch": 0.31901840490797545,
"grad_norm": 0.3078230944385983,
"learning_rate": 9.998853008220231e-06,
"loss": 0.0102,
"step": 104
},
{
"epoch": 0.3220858895705521,
"grad_norm": 0.2936441456513863,
"learning_rate": 9.998438838299366e-06,
"loss": 0.0131,
"step": 105
},
{
"epoch": 0.32515337423312884,
"grad_norm": 0.545125785404182,
"learning_rate": 9.997960964140946e-06,
"loss": 0.0225,
"step": 106
},
{
"epoch": 0.3282208588957055,
"grad_norm": 0.443159450605464,
"learning_rate": 9.997419391835397e-06,
"loss": 0.014,
"step": 107
},
{
"epoch": 0.3312883435582822,
"grad_norm": 0.5171297711383881,
"learning_rate": 9.99681412828496e-06,
"loss": 0.0336,
"step": 108
},
{
"epoch": 0.3343558282208589,
"grad_norm": 0.3541766364039508,
"learning_rate": 9.996145181203616e-06,
"loss": 0.0155,
"step": 109
},
{
"epoch": 0.3374233128834356,
"grad_norm": 0.5566728911251606,
"learning_rate": 9.995412559116979e-06,
"loss": 0.0184,
"step": 110
},
{
"epoch": 0.34049079754601225,
"grad_norm": 0.43277308181190083,
"learning_rate": 9.994616271362193e-06,
"loss": 0.0217,
"step": 111
},
{
"epoch": 0.34355828220858897,
"grad_norm": 1.0421460915519953,
"learning_rate": 9.993756328087806e-06,
"loss": 0.0997,
"step": 112
},
{
"epoch": 0.34662576687116564,
"grad_norm": 0.5075832442140273,
"learning_rate": 9.992832740253646e-06,
"loss": 0.0168,
"step": 113
},
{
"epoch": 0.3496932515337423,
"grad_norm": 0.3720333544524573,
"learning_rate": 9.991845519630679e-06,
"loss": 0.0145,
"step": 114
},
{
"epoch": 0.35276073619631904,
"grad_norm": 0.3602385407355057,
"learning_rate": 9.990794678800863e-06,
"loss": 0.0154,
"step": 115
},
{
"epoch": 0.3558282208588957,
"grad_norm": 0.4542869951307087,
"learning_rate": 9.989680231156983e-06,
"loss": 0.026,
"step": 116
},
{
"epoch": 0.3588957055214724,
"grad_norm": 0.4351733531466369,
"learning_rate": 9.988502190902478e-06,
"loss": 0.0279,
"step": 117
},
{
"epoch": 0.3619631901840491,
"grad_norm": 0.46867639008666345,
"learning_rate": 9.987260573051268e-06,
"loss": 0.0216,
"step": 118
},
{
"epoch": 0.36503067484662577,
"grad_norm": 0.4387734488332606,
"learning_rate": 9.98595539342756e-06,
"loss": 0.0233,
"step": 119
},
{
"epoch": 0.36809815950920244,
"grad_norm": 0.539494122809603,
"learning_rate": 9.984586668665641e-06,
"loss": 0.02,
"step": 120
},
{
"epoch": 0.37116564417177916,
"grad_norm": 0.5140548435409961,
"learning_rate": 9.98315441620967e-06,
"loss": 0.0315,
"step": 121
},
{
"epoch": 0.37423312883435583,
"grad_norm": 0.5589763772928042,
"learning_rate": 9.981658654313458e-06,
"loss": 0.0303,
"step": 122
},
{
"epoch": 0.3773006134969325,
"grad_norm": 0.5761892222444737,
"learning_rate": 9.980099402040231e-06,
"loss": 0.0378,
"step": 123
},
{
"epoch": 0.3803680981595092,
"grad_norm": 0.6156623359330066,
"learning_rate": 9.978476679262387e-06,
"loss": 0.0384,
"step": 124
},
{
"epoch": 0.3834355828220859,
"grad_norm": 0.551665717386314,
"learning_rate": 9.976790506661252e-06,
"loss": 0.0274,
"step": 125
},
{
"epoch": 0.38650306748466257,
"grad_norm": 0.5955360132547092,
"learning_rate": 9.975040905726799e-06,
"loss": 0.0231,
"step": 126
},
{
"epoch": 0.3895705521472393,
"grad_norm": 0.44370422954795896,
"learning_rate": 9.97322789875739e-06,
"loss": 0.0223,
"step": 127
},
{
"epoch": 0.39263803680981596,
"grad_norm": 0.5206926401483173,
"learning_rate": 9.971351508859488e-06,
"loss": 0.0254,
"step": 128
},
{
"epoch": 0.39570552147239263,
"grad_norm": 0.6756035296339351,
"learning_rate": 9.969411759947354e-06,
"loss": 0.1297,
"step": 129
},
{
"epoch": 0.3987730061349693,
"grad_norm": 0.42001192691622796,
"learning_rate": 9.96740867674275e-06,
"loss": 0.0198,
"step": 130
},
{
"epoch": 0.401840490797546,
"grad_norm": 0.5471786768275381,
"learning_rate": 9.965342284774633e-06,
"loss": 0.0407,
"step": 131
},
{
"epoch": 0.4049079754601227,
"grad_norm": 0.46505959754331605,
"learning_rate": 9.963212610378804e-06,
"loss": 0.0203,
"step": 132
},
{
"epoch": 0.40797546012269936,
"grad_norm": 0.5716077696919392,
"learning_rate": 9.961019680697593e-06,
"loss": 0.0241,
"step": 133
},
{
"epoch": 0.4110429447852761,
"grad_norm": 0.49711227219120874,
"learning_rate": 9.958763523679515e-06,
"loss": 0.0173,
"step": 134
},
{
"epoch": 0.41411042944785276,
"grad_norm": 0.47418803366323253,
"learning_rate": 9.956444168078896e-06,
"loss": 0.0191,
"step": 135
},
{
"epoch": 0.4171779141104294,
"grad_norm": 0.4847709417484094,
"learning_rate": 9.954061643455524e-06,
"loss": 0.0262,
"step": 136
},
{
"epoch": 0.42024539877300615,
"grad_norm": 0.49885884477945064,
"learning_rate": 9.951615980174262e-06,
"loss": 0.0354,
"step": 137
},
{
"epoch": 0.4233128834355828,
"grad_norm": 0.5163880173448544,
"learning_rate": 9.949107209404664e-06,
"loss": 0.026,
"step": 138
},
{
"epoch": 0.4263803680981595,
"grad_norm": 0.6209064580342998,
"learning_rate": 9.946535363120581e-06,
"loss": 0.0373,
"step": 139
},
{
"epoch": 0.4294478527607362,
"grad_norm": 0.5706790350480078,
"learning_rate": 9.943900474099748e-06,
"loss": 0.0193,
"step": 140
},
{
"epoch": 0.4325153374233129,
"grad_norm": 0.6595953727064474,
"learning_rate": 9.94120257592337e-06,
"loss": 0.0299,
"step": 141
},
{
"epoch": 0.43558282208588955,
"grad_norm": 0.5292023247761327,
"learning_rate": 9.938441702975689e-06,
"loss": 0.0514,
"step": 142
},
{
"epoch": 0.4386503067484663,
"grad_norm": 0.460766980739092,
"learning_rate": 9.935617890443557e-06,
"loss": 0.0152,
"step": 143
},
{
"epoch": 0.44171779141104295,
"grad_norm": 0.4778170956102627,
"learning_rate": 9.932731174315973e-06,
"loss": 0.0219,
"step": 144
},
{
"epoch": 0.4447852760736196,
"grad_norm": 0.468431790732579,
"learning_rate": 9.929781591383634e-06,
"loss": 0.0199,
"step": 145
},
{
"epoch": 0.44785276073619634,
"grad_norm": 0.4136064992794228,
"learning_rate": 9.926769179238467e-06,
"loss": 0.0287,
"step": 146
},
{
"epoch": 0.450920245398773,
"grad_norm": 0.48063176334483343,
"learning_rate": 9.923693976273138e-06,
"loss": 0.0253,
"step": 147
},
{
"epoch": 0.4539877300613497,
"grad_norm": 0.4390033441811796,
"learning_rate": 9.92055602168058e-06,
"loss": 0.0228,
"step": 148
},
{
"epoch": 0.4570552147239264,
"grad_norm": 0.6077680223170788,
"learning_rate": 9.917355355453481e-06,
"loss": 0.044,
"step": 149
},
{
"epoch": 0.4601226993865031,
"grad_norm": 0.7649894761219165,
"learning_rate": 9.914092018383778e-06,
"loss": 0.0527,
"step": 150
},
{
"epoch": 0.46319018404907975,
"grad_norm": 0.5196433338574888,
"learning_rate": 9.910766052062137e-06,
"loss": 0.0389,
"step": 151
},
{
"epoch": 0.4662576687116564,
"grad_norm": 0.5592360599484748,
"learning_rate": 9.90737749887742e-06,
"loss": 0.0379,
"step": 152
},
{
"epoch": 0.46932515337423314,
"grad_norm": 0.6311825130544745,
"learning_rate": 9.903926402016153e-06,
"loss": 0.0328,
"step": 153
},
{
"epoch": 0.4723926380368098,
"grad_norm": 0.6765533980399042,
"learning_rate": 9.900412805461968e-06,
"loss": 0.0274,
"step": 154
},
{
"epoch": 0.4754601226993865,
"grad_norm": 0.4182044283444094,
"learning_rate": 9.896836753995043e-06,
"loss": 0.0174,
"step": 155
},
{
"epoch": 0.4785276073619632,
"grad_norm": 0.4494240910276348,
"learning_rate": 9.893198293191539e-06,
"loss": 0.0203,
"step": 156
},
{
"epoch": 0.4815950920245399,
"grad_norm": 0.7211159726447208,
"learning_rate": 9.889497469423004e-06,
"loss": 0.0543,
"step": 157
},
{
"epoch": 0.48466257668711654,
"grad_norm": 0.47130058401261765,
"learning_rate": 9.885734329855798e-06,
"loss": 0.016,
"step": 158
},
{
"epoch": 0.48773006134969327,
"grad_norm": 0.523537473814604,
"learning_rate": 9.881908922450485e-06,
"loss": 0.0421,
"step": 159
},
{
"epoch": 0.49079754601226994,
"grad_norm": 0.5459938744286107,
"learning_rate": 9.878021295961218e-06,
"loss": 0.032,
"step": 160
},
{
"epoch": 0.4938650306748466,
"grad_norm": 0.565898967516583,
"learning_rate": 9.874071499935124e-06,
"loss": 0.0414,
"step": 161
},
{
"epoch": 0.49693251533742333,
"grad_norm": 0.48527683019849255,
"learning_rate": 9.870059584711668e-06,
"loss": 0.0352,
"step": 162
},
{
"epoch": 0.5,
"grad_norm": 0.49346103186322066,
"learning_rate": 9.865985601422018e-06,
"loss": 0.0189,
"step": 163
},
{
"epoch": 0.5030674846625767,
"grad_norm": 0.4121625641879102,
"learning_rate": 9.861849601988384e-06,
"loss": 0.0213,
"step": 164
},
{
"epoch": 0.5061349693251533,
"grad_norm": 0.4158207874245154,
"learning_rate": 9.857651639123362e-06,
"loss": 0.0186,
"step": 165
},
{
"epoch": 0.50920245398773,
"grad_norm": 0.3274693773098448,
"learning_rate": 9.853391766329264e-06,
"loss": 0.0151,
"step": 166
},
{
"epoch": 0.5122699386503068,
"grad_norm": 0.4958803903962215,
"learning_rate": 9.849070037897428e-06,
"loss": 0.0284,
"step": 167
},
{
"epoch": 0.5153374233128835,
"grad_norm": 0.6475651680085904,
"learning_rate": 9.844686508907538e-06,
"loss": 0.0425,
"step": 168
},
{
"epoch": 0.5184049079754601,
"grad_norm": 0.5682638967858453,
"learning_rate": 9.840241235226909e-06,
"loss": 0.0266,
"step": 169
},
{
"epoch": 0.5214723926380368,
"grad_norm": 0.5740486936349286,
"learning_rate": 9.835734273509787e-06,
"loss": 0.0482,
"step": 170
},
{
"epoch": 0.5245398773006135,
"grad_norm": 0.43454823069684023,
"learning_rate": 9.831165681196618e-06,
"loss": 0.02,
"step": 171
},
{
"epoch": 0.5276073619631901,
"grad_norm": 0.5008900995831749,
"learning_rate": 9.826535516513318e-06,
"loss": 0.0345,
"step": 172
},
{
"epoch": 0.5306748466257669,
"grad_norm": 0.6963240546508572,
"learning_rate": 9.821843838470536e-06,
"loss": 0.025,
"step": 173
},
{
"epoch": 0.5337423312883436,
"grad_norm": 0.4675708697803615,
"learning_rate": 9.817090706862895e-06,
"loss": 0.0223,
"step": 174
},
{
"epoch": 0.5368098159509203,
"grad_norm": 0.3991135320601108,
"learning_rate": 9.812276182268236e-06,
"loss": 0.0192,
"step": 175
},
{
"epoch": 0.5398773006134969,
"grad_norm": 0.6264372022418258,
"learning_rate": 9.807400326046843e-06,
"loss": 0.0407,
"step": 176
},
{
"epoch": 0.5429447852760736,
"grad_norm": 0.5713577007790391,
"learning_rate": 9.802463200340655e-06,
"loss": 0.0263,
"step": 177
},
{
"epoch": 0.5460122699386503,
"grad_norm": 0.5221974991626641,
"learning_rate": 9.797464868072489e-06,
"loss": 0.0257,
"step": 178
},
{
"epoch": 0.549079754601227,
"grad_norm": 0.6380655188875212,
"learning_rate": 9.792405392945218e-06,
"loss": 0.0446,
"step": 179
},
{
"epoch": 0.5521472392638037,
"grad_norm": 0.37992384347324565,
"learning_rate": 9.787284839440982e-06,
"loss": 0.0167,
"step": 180
},
{
"epoch": 0.5552147239263804,
"grad_norm": 0.43784118394068366,
"learning_rate": 9.78210327282035e-06,
"loss": 0.024,
"step": 181
},
{
"epoch": 0.558282208588957,
"grad_norm": 0.6888326299119037,
"learning_rate": 9.776860759121485e-06,
"loss": 0.0382,
"step": 182
},
{
"epoch": 0.5613496932515337,
"grad_norm": 0.713129424822473,
"learning_rate": 9.77155736515932e-06,
"loss": 0.0551,
"step": 183
},
{
"epoch": 0.5644171779141104,
"grad_norm": 0.5242216640303207,
"learning_rate": 9.766193158524693e-06,
"loss": 0.0279,
"step": 184
},
{
"epoch": 0.5674846625766872,
"grad_norm": 0.5021676250184067,
"learning_rate": 9.76076820758349e-06,
"loss": 0.0199,
"step": 185
},
{
"epoch": 0.5705521472392638,
"grad_norm": 0.53833739733463,
"learning_rate": 9.755282581475769e-06,
"loss": 0.0306,
"step": 186
},
{
"epoch": 0.5736196319018405,
"grad_norm": 0.36310921117767025,
"learning_rate": 9.749736350114886e-06,
"loss": 0.013,
"step": 187
},
{
"epoch": 0.5766871165644172,
"grad_norm": 0.612018294089201,
"learning_rate": 9.744129584186599e-06,
"loss": 0.0302,
"step": 188
},
{
"epoch": 0.5797546012269938,
"grad_norm": 0.6183883573878961,
"learning_rate": 9.73846235514817e-06,
"loss": 0.0373,
"step": 189
},
{
"epoch": 0.5828220858895705,
"grad_norm": 0.5335161630334854,
"learning_rate": 9.73273473522745e-06,
"loss": 0.0406,
"step": 190
},
{
"epoch": 0.5858895705521472,
"grad_norm": 0.43028825452090924,
"learning_rate": 9.726946797421966e-06,
"loss": 0.0185,
"step": 191
},
{
"epoch": 0.588957055214724,
"grad_norm": 0.3607144910335978,
"learning_rate": 9.72109861549798e-06,
"loss": 0.019,
"step": 192
},
{
"epoch": 0.5920245398773006,
"grad_norm": 0.6082644030068691,
"learning_rate": 9.715190263989562e-06,
"loss": 0.0679,
"step": 193
},
{
"epoch": 0.5950920245398773,
"grad_norm": 0.6346813811659229,
"learning_rate": 9.709221818197626e-06,
"loss": 0.0352,
"step": 194
},
{
"epoch": 0.598159509202454,
"grad_norm": 0.7228677414909738,
"learning_rate": 9.703193354188978e-06,
"loss": 0.0489,
"step": 195
},
{
"epoch": 0.6012269938650306,
"grad_norm": 0.6070162862574225,
"learning_rate": 9.697104948795353e-06,
"loss": 0.0292,
"step": 196
},
{
"epoch": 0.6042944785276073,
"grad_norm": 0.6333886635907163,
"learning_rate": 9.690956679612422e-06,
"loss": 0.0347,
"step": 197
},
{
"epoch": 0.6073619631901841,
"grad_norm": 0.5604386523878644,
"learning_rate": 9.68474862499881e-06,
"loss": 0.1063,
"step": 198
},
{
"epoch": 0.6104294478527608,
"grad_norm": 0.4877317760711139,
"learning_rate": 9.678480864075099e-06,
"loss": 0.0351,
"step": 199
},
{
"epoch": 0.6134969325153374,
"grad_norm": 0.73365826110845,
"learning_rate": 9.672153476722816e-06,
"loss": 0.0509,
"step": 200
},
{
"epoch": 0.6165644171779141,
"grad_norm": 0.4167148427493197,
"learning_rate": 9.665766543583418e-06,
"loss": 0.0301,
"step": 201
},
{
"epoch": 0.6196319018404908,
"grad_norm": 0.37456090728363056,
"learning_rate": 9.659320146057263e-06,
"loss": 0.0137,
"step": 202
},
{
"epoch": 0.6226993865030674,
"grad_norm": 0.6047651481834828,
"learning_rate": 9.65281436630257e-06,
"loss": 0.044,
"step": 203
},
{
"epoch": 0.6257668711656442,
"grad_norm": 0.6874182637171276,
"learning_rate": 9.646249287234375e-06,
"loss": 0.0327,
"step": 204
},
{
"epoch": 0.6288343558282209,
"grad_norm": 0.460131137033222,
"learning_rate": 9.639624992523474e-06,
"loss": 0.0294,
"step": 205
},
{
"epoch": 0.6319018404907976,
"grad_norm": 0.5745062676923193,
"learning_rate": 9.632941566595357e-06,
"loss": 0.0431,
"step": 206
},
{
"epoch": 0.6349693251533742,
"grad_norm": 0.40736187331439816,
"learning_rate": 9.626199094629132e-06,
"loss": 0.0213,
"step": 207
},
{
"epoch": 0.6380368098159509,
"grad_norm": 0.4237280861879652,
"learning_rate": 9.619397662556434e-06,
"loss": 0.0209,
"step": 208
},
{
"epoch": 0.6411042944785276,
"grad_norm": 0.4262103667083273,
"learning_rate": 9.612537357060338e-06,
"loss": 0.0227,
"step": 209
},
{
"epoch": 0.6441717791411042,
"grad_norm": 0.4164271188520456,
"learning_rate": 9.60561826557425e-06,
"loss": 0.0179,
"step": 210
},
{
"epoch": 0.647239263803681,
"grad_norm": 0.4610603132848594,
"learning_rate": 9.598640476280795e-06,
"loss": 0.0239,
"step": 211
},
{
"epoch": 0.6503067484662577,
"grad_norm": 0.4431083869802891,
"learning_rate": 9.591604078110686e-06,
"loss": 0.0174,
"step": 212
},
{
"epoch": 0.6533742331288344,
"grad_norm": 0.34132375743490156,
"learning_rate": 9.584509160741599e-06,
"loss": 0.0134,
"step": 213
},
{
"epoch": 0.656441717791411,
"grad_norm": 0.5748219630943767,
"learning_rate": 9.577355814597031e-06,
"loss": 0.0386,
"step": 214
},
{
"epoch": 0.6595092024539877,
"grad_norm": 0.5478545394367712,
"learning_rate": 9.570144130845139e-06,
"loss": 0.0299,
"step": 215
},
{
"epoch": 0.6625766871165644,
"grad_norm": 0.6412397075388798,
"learning_rate": 9.56287420139758e-06,
"loss": 0.0353,
"step": 216
},
{
"epoch": 0.6656441717791411,
"grad_norm": 0.3984095445625212,
"learning_rate": 9.555546118908351e-06,
"loss": 0.0249,
"step": 217
},
{
"epoch": 0.6687116564417178,
"grad_norm": 0.46212114811871957,
"learning_rate": 9.548159976772593e-06,
"loss": 0.0194,
"step": 218
},
{
"epoch": 0.6717791411042945,
"grad_norm": 0.712044108676838,
"learning_rate": 9.540715869125407e-06,
"loss": 0.0512,
"step": 219
},
{
"epoch": 0.6748466257668712,
"grad_norm": 0.6344419761933424,
"learning_rate": 9.533213890840657e-06,
"loss": 0.0443,
"step": 220
},
{
"epoch": 0.6779141104294478,
"grad_norm": 0.5387333699270996,
"learning_rate": 9.52565413752976e-06,
"loss": 0.0221,
"step": 221
},
{
"epoch": 0.6809815950920245,
"grad_norm": 0.4509061477504429,
"learning_rate": 9.518036705540459e-06,
"loss": 0.0195,
"step": 222
},
{
"epoch": 0.6840490797546013,
"grad_norm": 0.49381501765327435,
"learning_rate": 9.510361691955607e-06,
"loss": 0.0226,
"step": 223
},
{
"epoch": 0.6871165644171779,
"grad_norm": 0.7919318387287834,
"learning_rate": 9.502629194591925e-06,
"loss": 0.0734,
"step": 224
},
{
"epoch": 0.6901840490797546,
"grad_norm": 0.45378402430883924,
"learning_rate": 9.494839311998753e-06,
"loss": 0.0258,
"step": 225
},
{
"epoch": 0.6932515337423313,
"grad_norm": 0.4537878089824592,
"learning_rate": 9.486992143456792e-06,
"loss": 0.0242,
"step": 226
},
{
"epoch": 0.696319018404908,
"grad_norm": 0.5028749745217802,
"learning_rate": 9.47908778897685e-06,
"loss": 0.023,
"step": 227
},
{
"epoch": 0.6993865030674846,
"grad_norm": 0.6478779043026598,
"learning_rate": 9.471126349298557e-06,
"loss": 0.0438,
"step": 228
},
{
"epoch": 0.7024539877300614,
"grad_norm": 0.5588980543929863,
"learning_rate": 9.46310792588908e-06,
"loss": 0.0293,
"step": 229
},
{
"epoch": 0.7055214723926381,
"grad_norm": 0.3667360725898723,
"learning_rate": 9.45503262094184e-06,
"loss": 0.0168,
"step": 230
},
{
"epoch": 0.7085889570552147,
"grad_norm": 0.48407267064607673,
"learning_rate": 9.446900537375199e-06,
"loss": 0.025,
"step": 231
},
{
"epoch": 0.7116564417177914,
"grad_norm": 0.5343532675542083,
"learning_rate": 9.438711778831154e-06,
"loss": 0.0879,
"step": 232
},
{
"epoch": 0.7147239263803681,
"grad_norm": 0.5280162128110422,
"learning_rate": 9.430466449674014e-06,
"loss": 0.0296,
"step": 233
},
{
"epoch": 0.7177914110429447,
"grad_norm": 0.3758038349070859,
"learning_rate": 9.422164654989073e-06,
"loss": 0.0136,
"step": 234
},
{
"epoch": 0.7208588957055214,
"grad_norm": 0.5117526267420697,
"learning_rate": 9.413806500581267e-06,
"loss": 0.0261,
"step": 235
},
{
"epoch": 0.7239263803680982,
"grad_norm": 0.3981771203856743,
"learning_rate": 9.405392092973824e-06,
"loss": 0.0154,
"step": 236
},
{
"epoch": 0.7269938650306749,
"grad_norm": 0.7223827510240209,
"learning_rate": 9.396921539406916e-06,
"loss": 0.0353,
"step": 237
},
{
"epoch": 0.7300613496932515,
"grad_norm": 0.5167528334171827,
"learning_rate": 9.388394947836278e-06,
"loss": 0.0227,
"step": 238
},
{
"epoch": 0.7331288343558282,
"grad_norm": 0.45907180962386196,
"learning_rate": 9.379812426931847e-06,
"loss": 0.0243,
"step": 239
},
{
"epoch": 0.7361963190184049,
"grad_norm": 0.47959151249699833,
"learning_rate": 9.371174086076364e-06,
"loss": 0.0277,
"step": 240
},
{
"epoch": 0.7392638036809815,
"grad_norm": 0.6484335074593419,
"learning_rate": 9.362480035363987e-06,
"loss": 0.035,
"step": 241
},
{
"epoch": 0.7423312883435583,
"grad_norm": 0.788011620141102,
"learning_rate": 9.353730385598887e-06,
"loss": 0.046,
"step": 242
},
{
"epoch": 0.745398773006135,
"grad_norm": 0.49585961794822014,
"learning_rate": 9.344925248293837e-06,
"loss": 0.0368,
"step": 243
},
{
"epoch": 0.7484662576687117,
"grad_norm": 0.5556890213184674,
"learning_rate": 9.336064735668784e-06,
"loss": 0.0316,
"step": 244
},
{
"epoch": 0.7515337423312883,
"grad_norm": 0.5654657395691955,
"learning_rate": 9.327148960649431e-06,
"loss": 0.0379,
"step": 245
},
{
"epoch": 0.754601226993865,
"grad_norm": 0.6404905959487527,
"learning_rate": 9.318178036865786e-06,
"loss": 0.0458,
"step": 246
},
{
"epoch": 0.7576687116564417,
"grad_norm": 0.47153130888171024,
"learning_rate": 9.309152078650718e-06,
"loss": 0.0236,
"step": 247
},
{
"epoch": 0.7607361963190185,
"grad_norm": 0.6946471966054896,
"learning_rate": 9.300071201038503e-06,
"loss": 0.0351,
"step": 248
},
{
"epoch": 0.7638036809815951,
"grad_norm": 0.7452826487701208,
"learning_rate": 9.290935519763354e-06,
"loss": 0.036,
"step": 249
},
{
"epoch": 0.7668711656441718,
"grad_norm": 0.39427567747880354,
"learning_rate": 9.281745151257946e-06,
"loss": 0.0218,
"step": 250
},
{
"epoch": 0.7699386503067485,
"grad_norm": 0.27722944056563503,
"learning_rate": 9.272500212651934e-06,
"loss": 0.0144,
"step": 251
},
{
"epoch": 0.7730061349693251,
"grad_norm": 0.5591852306028331,
"learning_rate": 9.263200821770462e-06,
"loss": 0.0275,
"step": 252
},
{
"epoch": 0.7760736196319018,
"grad_norm": 0.4621394066240241,
"learning_rate": 9.253847097132656e-06,
"loss": 0.0233,
"step": 253
},
{
"epoch": 0.7791411042944786,
"grad_norm": 0.5194878891553444,
"learning_rate": 9.244439157950114e-06,
"loss": 0.0344,
"step": 254
},
{
"epoch": 0.7822085889570553,
"grad_norm": 0.49348339338246816,
"learning_rate": 9.234977124125395e-06,
"loss": 0.0285,
"step": 255
},
{
"epoch": 0.7852760736196319,
"grad_norm": 0.4850343759514216,
"learning_rate": 9.225461116250483e-06,
"loss": 0.0218,
"step": 256
},
{
"epoch": 0.7883435582822086,
"grad_norm": 0.5770601668132987,
"learning_rate": 9.215891255605248e-06,
"loss": 0.0359,
"step": 257
},
{
"epoch": 0.7914110429447853,
"grad_norm": 0.42625771720843614,
"learning_rate": 9.206267664155906e-06,
"loss": 0.0156,
"step": 258
},
{
"epoch": 0.7944785276073619,
"grad_norm": 0.3426717684737494,
"learning_rate": 9.196590464553467e-06,
"loss": 0.011,
"step": 259
},
{
"epoch": 0.7975460122699386,
"grad_norm": 0.5865425048649809,
"learning_rate": 9.186859780132165e-06,
"loss": 0.0376,
"step": 260
},
{
"epoch": 0.8006134969325154,
"grad_norm": 0.37273506776238424,
"learning_rate": 9.177075734907883e-06,
"loss": 0.0234,
"step": 261
},
{
"epoch": 0.803680981595092,
"grad_norm": 0.5585965373567147,
"learning_rate": 9.167238453576589e-06,
"loss": 0.0426,
"step": 262
},
{
"epoch": 0.8067484662576687,
"grad_norm": 0.3651968044742629,
"learning_rate": 9.157348061512728e-06,
"loss": 0.0248,
"step": 263
},
{
"epoch": 0.8098159509202454,
"grad_norm": 0.5081337037835217,
"learning_rate": 9.147404684767632e-06,
"loss": 0.0266,
"step": 264
},
{
"epoch": 0.8128834355828221,
"grad_norm": 0.5073465299835719,
"learning_rate": 9.137408450067921e-06,
"loss": 0.0272,
"step": 265
},
{
"epoch": 0.8159509202453987,
"grad_norm": 0.5174001449539224,
"learning_rate": 9.12735948481387e-06,
"loss": 0.0266,
"step": 266
},
{
"epoch": 0.8190184049079755,
"grad_norm": 0.6040688463652676,
"learning_rate": 9.117257917077806e-06,
"loss": 0.0314,
"step": 267
},
{
"epoch": 0.8220858895705522,
"grad_norm": 0.6431020172095245,
"learning_rate": 9.107103875602458e-06,
"loss": 0.0404,
"step": 268
},
{
"epoch": 0.8251533742331288,
"grad_norm": 0.4570395520776521,
"learning_rate": 9.096897489799329e-06,
"loss": 0.0245,
"step": 269
},
{
"epoch": 0.8282208588957055,
"grad_norm": 0.5176485694689812,
"learning_rate": 9.086638889747034e-06,
"loss": 0.0318,
"step": 270
},
{
"epoch": 0.8312883435582822,
"grad_norm": 0.4030417312195923,
"learning_rate": 9.076328206189662e-06,
"loss": 0.0186,
"step": 271
},
{
"epoch": 0.8343558282208589,
"grad_norm": 0.7912092430093546,
"learning_rate": 9.065965570535083e-06,
"loss": 0.0211,
"step": 272
},
{
"epoch": 0.8374233128834356,
"grad_norm": 0.3882727172008151,
"learning_rate": 9.055551114853296e-06,
"loss": 0.0201,
"step": 273
},
{
"epoch": 0.8404907975460123,
"grad_norm": 0.6169335765533825,
"learning_rate": 9.045084971874738e-06,
"loss": 0.0457,
"step": 274
},
{
"epoch": 0.843558282208589,
"grad_norm": 0.484075907018457,
"learning_rate": 9.034567274988585e-06,
"loss": 0.0266,
"step": 275
},
{
"epoch": 0.8466257668711656,
"grad_norm": 0.7402879740038205,
"learning_rate": 9.023998158241067e-06,
"loss": 0.0412,
"step": 276
},
{
"epoch": 0.8496932515337423,
"grad_norm": 0.8200807227847277,
"learning_rate": 9.013377756333745e-06,
"loss": 0.024,
"step": 277
},
{
"epoch": 0.852760736196319,
"grad_norm": 0.47917185258179273,
"learning_rate": 9.002706204621802e-06,
"loss": 0.0354,
"step": 278
},
{
"epoch": 0.8558282208588958,
"grad_norm": 0.5686281464689263,
"learning_rate": 8.991983639112318e-06,
"loss": 0.0463,
"step": 279
},
{
"epoch": 0.8588957055214724,
"grad_norm": 0.5035593732198547,
"learning_rate": 8.981210196462533e-06,
"loss": 0.024,
"step": 280
},
{
"epoch": 0.8619631901840491,
"grad_norm": 0.40871215414302914,
"learning_rate": 8.970386013978108e-06,
"loss": 0.026,
"step": 281
},
{
"epoch": 0.8650306748466258,
"grad_norm": 0.5136824340178904,
"learning_rate": 8.959511229611377e-06,
"loss": 0.0248,
"step": 282
},
{
"epoch": 0.8680981595092024,
"grad_norm": 0.530309289976694,
"learning_rate": 8.94858598195958e-06,
"loss": 0.0249,
"step": 283
},
{
"epoch": 0.8711656441717791,
"grad_norm": 0.6333632955147461,
"learning_rate": 8.93761041026311e-06,
"loss": 0.0417,
"step": 284
},
{
"epoch": 0.8742331288343558,
"grad_norm": 0.3971198296026937,
"learning_rate": 8.926584654403725e-06,
"loss": 0.027,
"step": 285
},
{
"epoch": 0.8773006134969326,
"grad_norm": 0.40732995997319704,
"learning_rate": 8.915508854902778e-06,
"loss": 0.0225,
"step": 286
},
{
"epoch": 0.8803680981595092,
"grad_norm": 0.6090138795408967,
"learning_rate": 8.904383152919414e-06,
"loss": 0.018,
"step": 287
},
{
"epoch": 0.8834355828220859,
"grad_norm": 0.7007910043919408,
"learning_rate": 8.893207690248776e-06,
"loss": 0.0367,
"step": 288
},
{
"epoch": 0.8865030674846626,
"grad_norm": 0.5110317128121923,
"learning_rate": 8.881982609320206e-06,
"loss": 0.0242,
"step": 289
},
{
"epoch": 0.8895705521472392,
"grad_norm": 0.6970377701125607,
"learning_rate": 8.870708053195414e-06,
"loss": 0.0483,
"step": 290
},
{
"epoch": 0.8926380368098159,
"grad_norm": 0.5436387432201235,
"learning_rate": 8.859384165566665e-06,
"loss": 0.031,
"step": 291
},
{
"epoch": 0.8957055214723927,
"grad_norm": 0.48318114304921483,
"learning_rate": 8.848011090754946e-06,
"loss": 0.0208,
"step": 292
},
{
"epoch": 0.8987730061349694,
"grad_norm": 0.5078098207586927,
"learning_rate": 8.836588973708129e-06,
"loss": 0.0242,
"step": 293
},
{
"epoch": 0.901840490797546,
"grad_norm": 0.43132547768511786,
"learning_rate": 8.825117959999117e-06,
"loss": 0.0209,
"step": 294
},
{
"epoch": 0.9049079754601227,
"grad_norm": 0.4592204692663985,
"learning_rate": 8.813598195823991e-06,
"loss": 0.0141,
"step": 295
},
{
"epoch": 0.9079754601226994,
"grad_norm": 0.8293697180348152,
"learning_rate": 8.802029828000157e-06,
"loss": 0.0341,
"step": 296
},
{
"epoch": 0.911042944785276,
"grad_norm": 0.6353969688740292,
"learning_rate": 8.790413003964452e-06,
"loss": 0.0531,
"step": 297
},
{
"epoch": 0.9141104294478528,
"grad_norm": 0.5135592242814184,
"learning_rate": 8.778747871771293e-06,
"loss": 0.0299,
"step": 298
},
{
"epoch": 0.9171779141104295,
"grad_norm": 0.6244937792417771,
"learning_rate": 8.767034580090767e-06,
"loss": 0.0251,
"step": 299
},
{
"epoch": 0.9202453987730062,
"grad_norm": 0.4900368286386423,
"learning_rate": 8.75527327820675e-06,
"loss": 0.0303,
"step": 300
},
{
"epoch": 0.9233128834355828,
"grad_norm": 0.7676655526039488,
"learning_rate": 8.743464116014998e-06,
"loss": 0.055,
"step": 301
},
{
"epoch": 0.9263803680981595,
"grad_norm": 0.48627457126119966,
"learning_rate": 8.731607244021236e-06,
"loss": 0.0264,
"step": 302
},
{
"epoch": 0.9294478527607362,
"grad_norm": 0.433812749109046,
"learning_rate": 8.719702813339248e-06,
"loss": 0.0259,
"step": 303
},
{
"epoch": 0.9325153374233128,
"grad_norm": 0.8758544568172416,
"learning_rate": 8.70775097568894e-06,
"loss": 0.045,
"step": 304
},
{
"epoch": 0.9355828220858896,
"grad_norm": 0.45179382392506523,
"learning_rate": 8.695751883394415e-06,
"loss": 0.0243,
"step": 305
},
{
"epoch": 0.9386503067484663,
"grad_norm": 0.4958191093808722,
"learning_rate": 8.683705689382025e-06,
"loss": 0.04,
"step": 306
},
{
"epoch": 0.941717791411043,
"grad_norm": 0.44166098382760377,
"learning_rate": 8.671612547178428e-06,
"loss": 0.027,
"step": 307
},
{
"epoch": 0.9447852760736196,
"grad_norm": 0.5911491565988687,
"learning_rate": 8.659472610908628e-06,
"loss": 0.0191,
"step": 308
},
{
"epoch": 0.9478527607361963,
"grad_norm": 0.4765718802258442,
"learning_rate": 8.647286035294009e-06,
"loss": 0.0264,
"step": 309
},
{
"epoch": 0.950920245398773,
"grad_norm": 0.5852473574381952,
"learning_rate": 8.63505297565037e-06,
"loss": 0.0294,
"step": 310
},
{
"epoch": 0.9539877300613497,
"grad_norm": 0.484982934872736,
"learning_rate": 8.622773587885935e-06,
"loss": 0.0311,
"step": 311
},
{
"epoch": 0.9570552147239264,
"grad_norm": 0.4847238416638495,
"learning_rate": 8.610448028499377e-06,
"loss": 0.0295,
"step": 312
},
{
"epoch": 0.9601226993865031,
"grad_norm": 0.47605607743918293,
"learning_rate": 8.598076454577815e-06,
"loss": 0.0192,
"step": 313
},
{
"epoch": 0.9631901840490797,
"grad_norm": 0.42846762697517876,
"learning_rate": 8.585659023794818e-06,
"loss": 0.016,
"step": 314
},
{
"epoch": 0.9662576687116564,
"grad_norm": 0.6802283196669912,
"learning_rate": 8.57319589440839e-06,
"loss": 0.0617,
"step": 315
},
{
"epoch": 0.9693251533742331,
"grad_norm": 0.6454169454682009,
"learning_rate": 8.56068722525896e-06,
"loss": 0.038,
"step": 316
},
{
"epoch": 0.9723926380368099,
"grad_norm": 0.428982591104215,
"learning_rate": 8.548133175767345e-06,
"loss": 0.0236,
"step": 317
},
{
"epoch": 0.9754601226993865,
"grad_norm": 0.5625492848129335,
"learning_rate": 8.535533905932739e-06,
"loss": 0.0271,
"step": 318
},
{
"epoch": 0.9785276073619632,
"grad_norm": 0.4561896126978212,
"learning_rate": 8.522889576330649e-06,
"loss": 0.0256,
"step": 319
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.49887714884464673,
"learning_rate": 8.510200348110868e-06,
"loss": 0.0244,
"step": 320
},
{
"epoch": 0.9846625766871165,
"grad_norm": 0.3420730257876837,
"learning_rate": 8.497466382995413e-06,
"loss": 0.0164,
"step": 321
},
{
"epoch": 0.9877300613496932,
"grad_norm": 0.5755461353996324,
"learning_rate": 8.48468784327647e-06,
"loss": 0.0326,
"step": 322
},
{
"epoch": 0.99079754601227,
"grad_norm": 0.800649841232001,
"learning_rate": 8.471864891814304e-06,
"loss": 0.0431,
"step": 323
},
{
"epoch": 0.9938650306748467,
"grad_norm": 0.7338066801521459,
"learning_rate": 8.45899769203522e-06,
"loss": 0.046,
"step": 324
},
{
"epoch": 0.9969325153374233,
"grad_norm": 0.6907151900519473,
"learning_rate": 8.446086407929448e-06,
"loss": 0.0382,
"step": 325
},
{
"epoch": 1.0,
"grad_norm": 0.6407879693661741,
"learning_rate": 8.433131204049067e-06,
"loss": 0.0356,
"step": 326
},
{
"epoch": 1.0030674846625767,
"grad_norm": 0.3297580524046458,
"learning_rate": 8.420132245505912e-06,
"loss": 0.0111,
"step": 327
},
{
"epoch": 1.0061349693251533,
"grad_norm": 0.9160618795214253,
"learning_rate": 8.407089697969458e-06,
"loss": 0.0068,
"step": 328
},
{
"epoch": 1.00920245398773,
"grad_norm": 0.4355224927337849,
"learning_rate": 8.39400372766471e-06,
"loss": 0.0129,
"step": 329
},
{
"epoch": 1.0122699386503067,
"grad_norm": 0.25041904255355624,
"learning_rate": 8.380874501370098e-06,
"loss": 0.0051,
"step": 330
},
{
"epoch": 1.0153374233128833,
"grad_norm": 0.3627103002389334,
"learning_rate": 8.367702186415338e-06,
"loss": 0.019,
"step": 331
},
{
"epoch": 1.01840490797546,
"grad_norm": 0.38724283256482883,
"learning_rate": 8.354486950679301e-06,
"loss": 0.0139,
"step": 332
},
{
"epoch": 1.021472392638037,
"grad_norm": 0.2420700649978279,
"learning_rate": 8.341228962587881e-06,
"loss": 0.0102,
"step": 333
},
{
"epoch": 1.0245398773006136,
"grad_norm": 0.4873618568078472,
"learning_rate": 8.327928391111841e-06,
"loss": 0.0182,
"step": 334
},
{
"epoch": 1.0276073619631902,
"grad_norm": 0.4856636301987984,
"learning_rate": 8.31458540576466e-06,
"loss": 0.0188,
"step": 335
},
{
"epoch": 1.030674846625767,
"grad_norm": 0.27927576098631274,
"learning_rate": 8.301200176600375e-06,
"loss": 0.0086,
"step": 336
},
{
"epoch": 1.0337423312883436,
"grad_norm": 0.4700940197211149,
"learning_rate": 8.287772874211414e-06,
"loss": 0.0109,
"step": 337
},
{
"epoch": 1.0368098159509203,
"grad_norm": 0.2741293793378236,
"learning_rate": 8.274303669726427e-06,
"loss": 0.0086,
"step": 338
},
{
"epoch": 1.039877300613497,
"grad_norm": 0.47118341110657486,
"learning_rate": 8.260792734808085e-06,
"loss": 0.0115,
"step": 339
},
{
"epoch": 1.0429447852760736,
"grad_norm": 0.19194865035728823,
"learning_rate": 8.247240241650918e-06,
"loss": 0.0053,
"step": 340
},
{
"epoch": 1.0460122699386503,
"grad_norm": 0.37141621100709526,
"learning_rate": 8.233646362979106e-06,
"loss": 0.0178,
"step": 341
},
{
"epoch": 1.049079754601227,
"grad_norm": 0.3513576345329715,
"learning_rate": 8.220011272044276e-06,
"loss": 0.0091,
"step": 342
},
{
"epoch": 1.0521472392638036,
"grad_norm": 0.4198658600158762,
"learning_rate": 8.206335142623305e-06,
"loss": 0.0122,
"step": 343
},
{
"epoch": 1.0552147239263803,
"grad_norm": 0.4461226377098048,
"learning_rate": 8.192618149016092e-06,
"loss": 0.0142,
"step": 344
},
{
"epoch": 1.058282208588957,
"grad_norm": 0.2883795931974432,
"learning_rate": 8.178860466043344e-06,
"loss": 0.0084,
"step": 345
},
{
"epoch": 1.0613496932515338,
"grad_norm": 0.2659747508038468,
"learning_rate": 8.165062269044353e-06,
"loss": 0.0059,
"step": 346
},
{
"epoch": 1.0644171779141105,
"grad_norm": 0.4052660141940983,
"learning_rate": 8.151223733874747e-06,
"loss": 0.0077,
"step": 347
},
{
"epoch": 1.0674846625766872,
"grad_norm": 0.43345686528418303,
"learning_rate": 8.13734503690426e-06,
"loss": 0.0047,
"step": 348
},
{
"epoch": 1.0705521472392638,
"grad_norm": 0.5189604081089867,
"learning_rate": 8.123426355014485e-06,
"loss": 0.0059,
"step": 349
},
{
"epoch": 1.0736196319018405,
"grad_norm": 0.5168669787962032,
"learning_rate": 8.109467865596612e-06,
"loss": 0.0102,
"step": 350
},
{
"epoch": 1.0766871165644172,
"grad_norm": 0.6184876506284773,
"learning_rate": 8.095469746549172e-06,
"loss": 0.0155,
"step": 351
},
{
"epoch": 1.0797546012269938,
"grad_norm": 0.4523533733127926,
"learning_rate": 8.081432176275765e-06,
"loss": 0.012,
"step": 352
},
{
"epoch": 1.0828220858895705,
"grad_norm": 0.6042328026836615,
"learning_rate": 8.067355333682799e-06,
"loss": 0.0099,
"step": 353
},
{
"epoch": 1.0858895705521472,
"grad_norm": 0.3712026513508595,
"learning_rate": 8.053239398177191e-06,
"loss": 0.0064,
"step": 354
},
{
"epoch": 1.0889570552147239,
"grad_norm": 0.20180786491281724,
"learning_rate": 8.039084549664098e-06,
"loss": 0.0027,
"step": 355
},
{
"epoch": 1.0920245398773005,
"grad_norm": 0.3787474646578074,
"learning_rate": 8.024890968544614e-06,
"loss": 0.0059,
"step": 356
},
{
"epoch": 1.0950920245398774,
"grad_norm": 0.4859890722014772,
"learning_rate": 8.01065883571347e-06,
"loss": 0.0196,
"step": 357
},
{
"epoch": 1.098159509202454,
"grad_norm": 0.8325992603907834,
"learning_rate": 7.996388332556735e-06,
"loss": 0.0213,
"step": 358
},
{
"epoch": 1.1012269938650308,
"grad_norm": 0.7990116238977559,
"learning_rate": 7.982079640949504e-06,
"loss": 0.0167,
"step": 359
},
{
"epoch": 1.1042944785276074,
"grad_norm": 0.3820950122221644,
"learning_rate": 7.967732943253572e-06,
"loss": 0.007,
"step": 360
},
{
"epoch": 1.107361963190184,
"grad_norm": 0.43830665202404445,
"learning_rate": 7.953348422315115e-06,
"loss": 0.0123,
"step": 361
},
{
"epoch": 1.1104294478527608,
"grad_norm": 0.26105895388858136,
"learning_rate": 7.938926261462366e-06,
"loss": 0.0065,
"step": 362
},
{
"epoch": 1.1134969325153374,
"grad_norm": 0.395098975702821,
"learning_rate": 7.924466644503265e-06,
"loss": 0.0063,
"step": 363
},
{
"epoch": 1.116564417177914,
"grad_norm": 0.49236578269029757,
"learning_rate": 7.90996975572313e-06,
"loss": 0.0102,
"step": 364
},
{
"epoch": 1.1196319018404908,
"grad_norm": 0.2920551819139753,
"learning_rate": 7.895435779882294e-06,
"loss": 0.0047,
"step": 365
},
{
"epoch": 1.1226993865030674,
"grad_norm": 0.652075843621035,
"learning_rate": 7.880864902213765e-06,
"loss": 0.0121,
"step": 366
},
{
"epoch": 1.1257668711656441,
"grad_norm": 0.35616324665939764,
"learning_rate": 7.866257308420858e-06,
"loss": 0.0054,
"step": 367
},
{
"epoch": 1.1288343558282208,
"grad_norm": 0.4669198553674882,
"learning_rate": 7.851613184674821e-06,
"loss": 0.0115,
"step": 368
},
{
"epoch": 1.1319018404907975,
"grad_norm": 0.26000909056545185,
"learning_rate": 7.836932717612481e-06,
"loss": 0.006,
"step": 369
},
{
"epoch": 1.1349693251533743,
"grad_norm": 0.4669430776716613,
"learning_rate": 7.822216094333847e-06,
"loss": 0.0099,
"step": 370
},
{
"epoch": 1.138036809815951,
"grad_norm": 0.6134594985097804,
"learning_rate": 7.807463502399735e-06,
"loss": 0.0152,
"step": 371
},
{
"epoch": 1.1411042944785277,
"grad_norm": 0.3822828619584183,
"learning_rate": 7.792675129829374e-06,
"loss": 0.0111,
"step": 372
},
{
"epoch": 1.1441717791411044,
"grad_norm": 0.41327016906829583,
"learning_rate": 7.777851165098012e-06,
"loss": 0.0173,
"step": 373
},
{
"epoch": 1.147239263803681,
"grad_norm": 0.2714658959849174,
"learning_rate": 7.762991797134513e-06,
"loss": 0.0107,
"step": 374
},
{
"epoch": 1.1503067484662577,
"grad_norm": 0.5544800061588814,
"learning_rate": 7.748097215318951e-06,
"loss": 0.0118,
"step": 375
},
{
"epoch": 1.1533742331288344,
"grad_norm": 0.5040932416460946,
"learning_rate": 7.73316760948019e-06,
"loss": 0.0099,
"step": 376
},
{
"epoch": 1.156441717791411,
"grad_norm": 0.3685884655615084,
"learning_rate": 7.718203169893472e-06,
"loss": 0.0086,
"step": 377
},
{
"epoch": 1.1595092024539877,
"grad_norm": 0.6030394172186191,
"learning_rate": 7.703204087277989e-06,
"loss": 0.0141,
"step": 378
},
{
"epoch": 1.1625766871165644,
"grad_norm": 0.5895133158335721,
"learning_rate": 7.68817055279445e-06,
"loss": 0.013,
"step": 379
},
{
"epoch": 1.165644171779141,
"grad_norm": 0.41300680548579227,
"learning_rate": 7.673102758042653e-06,
"loss": 0.0192,
"step": 380
},
{
"epoch": 1.1687116564417177,
"grad_norm": 0.3374801828046708,
"learning_rate": 7.65800089505903e-06,
"loss": 0.0088,
"step": 381
},
{
"epoch": 1.1717791411042944,
"grad_norm": 0.3454554679129908,
"learning_rate": 7.64286515631421e-06,
"loss": 0.0063,
"step": 382
},
{
"epoch": 1.1748466257668713,
"grad_norm": 0.41181297072167894,
"learning_rate": 7.627695734710565e-06,
"loss": 0.0071,
"step": 383
},
{
"epoch": 1.177914110429448,
"grad_norm": 0.2719651060060069,
"learning_rate": 7.612492823579744e-06,
"loss": 0.0069,
"step": 384
},
{
"epoch": 1.1809815950920246,
"grad_norm": 0.7134985643754311,
"learning_rate": 7.597256616680219e-06,
"loss": 0.0128,
"step": 385
},
{
"epoch": 1.1840490797546013,
"grad_norm": 0.41654763808421014,
"learning_rate": 7.5819873081948105e-06,
"loss": 0.0123,
"step": 386
},
{
"epoch": 1.187116564417178,
"grad_norm": 0.387643208763756,
"learning_rate": 7.566685092728207e-06,
"loss": 0.0079,
"step": 387
},
{
"epoch": 1.1901840490797546,
"grad_norm": 0.7361696877729214,
"learning_rate": 7.5513501653045e-06,
"loss": 0.0248,
"step": 388
},
{
"epoch": 1.1932515337423313,
"grad_norm": 0.4404703959236721,
"learning_rate": 7.535982721364679e-06,
"loss": 0.0122,
"step": 389
},
{
"epoch": 1.196319018404908,
"grad_norm": 0.4151059096070281,
"learning_rate": 7.52058295676416e-06,
"loss": 0.0109,
"step": 390
},
{
"epoch": 1.1993865030674846,
"grad_norm": 0.34795956449739657,
"learning_rate": 7.505151067770277e-06,
"loss": 0.0093,
"step": 391
},
{
"epoch": 1.2024539877300613,
"grad_norm": 0.33091849762828235,
"learning_rate": 7.48968725105978e-06,
"loss": 0.0066,
"step": 392
},
{
"epoch": 1.205521472392638,
"grad_norm": 0.5219288124176215,
"learning_rate": 7.474191703716339e-06,
"loss": 0.0129,
"step": 393
},
{
"epoch": 1.2085889570552146,
"grad_norm": 0.5725267558019617,
"learning_rate": 7.45866462322802e-06,
"loss": 0.0144,
"step": 394
},
{
"epoch": 1.2116564417177913,
"grad_norm": 0.47084724943390577,
"learning_rate": 7.443106207484776e-06,
"loss": 0.0078,
"step": 395
},
{
"epoch": 1.2147239263803682,
"grad_norm": 0.563293178774316,
"learning_rate": 7.427516654775921e-06,
"loss": 0.0097,
"step": 396
},
{
"epoch": 1.2177914110429449,
"grad_norm": 0.45467942711747367,
"learning_rate": 7.411896163787607e-06,
"loss": 0.0234,
"step": 397
},
{
"epoch": 1.2208588957055215,
"grad_norm": 0.607816846697092,
"learning_rate": 7.396244933600285e-06,
"loss": 0.0249,
"step": 398
},
{
"epoch": 1.2239263803680982,
"grad_norm": 0.4194689019423566,
"learning_rate": 7.380563163686174e-06,
"loss": 0.0125,
"step": 399
},
{
"epoch": 1.2269938650306749,
"grad_norm": 0.36586881186412595,
"learning_rate": 7.364851053906719e-06,
"loss": 0.013,
"step": 400
},
{
"epoch": 1.2300613496932515,
"grad_norm": 0.5550225458831591,
"learning_rate": 7.3491088045100365e-06,
"loss": 0.0743,
"step": 401
},
{
"epoch": 1.2331288343558282,
"grad_norm": 0.4466491951675168,
"learning_rate": 7.333336616128369e-06,
"loss": 0.0111,
"step": 402
},
{
"epoch": 1.2361963190184049,
"grad_norm": 0.19051013453254942,
"learning_rate": 7.317534689775527e-06,
"loss": 0.003,
"step": 403
},
{
"epoch": 1.2392638036809815,
"grad_norm": 0.2592362774390213,
"learning_rate": 7.301703226844328e-06,
"loss": 0.0044,
"step": 404
},
{
"epoch": 1.2423312883435582,
"grad_norm": 0.4406593596675372,
"learning_rate": 7.285842429104023e-06,
"loss": 0.011,
"step": 405
},
{
"epoch": 1.2453987730061349,
"grad_norm": 0.36125048050407316,
"learning_rate": 7.269952498697734e-06,
"loss": 0.0063,
"step": 406
},
{
"epoch": 1.2484662576687118,
"grad_norm": 0.7328229395413601,
"learning_rate": 7.2540336381398745e-06,
"loss": 0.0181,
"step": 407
},
{
"epoch": 1.2515337423312882,
"grad_norm": 0.46059726711050014,
"learning_rate": 7.238086050313563e-06,
"loss": 0.0138,
"step": 408
},
{
"epoch": 1.2546012269938651,
"grad_norm": 0.32550746217650556,
"learning_rate": 7.2221099384680485e-06,
"loss": 0.0075,
"step": 409
},
{
"epoch": 1.2576687116564418,
"grad_norm": 0.5498065211034909,
"learning_rate": 7.206105506216107e-06,
"loss": 0.01,
"step": 410
},
{
"epoch": 1.2607361963190185,
"grad_norm": 0.49119461834253564,
"learning_rate": 7.19007295753146e-06,
"loss": 0.0093,
"step": 411
},
{
"epoch": 1.2638036809815951,
"grad_norm": 0.5480841951866141,
"learning_rate": 7.174012496746161e-06,
"loss": 0.0117,
"step": 412
},
{
"epoch": 1.2668711656441718,
"grad_norm": 0.46105615708600756,
"learning_rate": 7.157924328548003e-06,
"loss": 0.0116,
"step": 413
},
{
"epoch": 1.2699386503067485,
"grad_norm": 0.3678598691625631,
"learning_rate": 7.1418086579779075e-06,
"loss": 0.006,
"step": 414
},
{
"epoch": 1.2730061349693251,
"grad_norm": 0.3960979904562928,
"learning_rate": 7.125665690427301e-06,
"loss": 0.0087,
"step": 415
},
{
"epoch": 1.2760736196319018,
"grad_norm": 0.43312438511224055,
"learning_rate": 7.109495631635512e-06,
"loss": 0.0071,
"step": 416
},
{
"epoch": 1.2791411042944785,
"grad_norm": 0.4306674783353921,
"learning_rate": 7.093298687687141e-06,
"loss": 0.0104,
"step": 417
},
{
"epoch": 1.2822085889570551,
"grad_norm": 0.42081355809821863,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.0122,
"step": 418
},
{
"epoch": 1.2852760736196318,
"grad_norm": 0.2781000974050112,
"learning_rate": 7.060824970369651e-06,
"loss": 0.0029,
"step": 419
},
{
"epoch": 1.2883435582822087,
"grad_norm": 0.442937301778717,
"learning_rate": 7.044548610872435e-06,
"loss": 0.0086,
"step": 420
},
{
"epoch": 1.2914110429447851,
"grad_norm": 0.37495174577866447,
"learning_rate": 7.028246193957171e-06,
"loss": 0.0104,
"step": 421
},
{
"epoch": 1.294478527607362,
"grad_norm": 0.5023190671289056,
"learning_rate": 7.011917927395341e-06,
"loss": 0.0108,
"step": 422
},
{
"epoch": 1.2975460122699387,
"grad_norm": 0.23654582360722132,
"learning_rate": 6.9955640192878705e-06,
"loss": 0.0042,
"step": 423
},
{
"epoch": 1.3006134969325154,
"grad_norm": 0.3855334572363615,
"learning_rate": 6.979184678062493e-06,
"loss": 0.0095,
"step": 424
},
{
"epoch": 1.303680981595092,
"grad_norm": 0.4576634484879509,
"learning_rate": 6.962780112471073e-06,
"loss": 0.0086,
"step": 425
},
{
"epoch": 1.3067484662576687,
"grad_norm": 0.42556417794111534,
"learning_rate": 6.946350531586959e-06,
"loss": 0.0087,
"step": 426
},
{
"epoch": 1.3098159509202454,
"grad_norm": 0.3237784652171552,
"learning_rate": 6.929896144802314e-06,
"loss": 0.0083,
"step": 427
},
{
"epoch": 1.312883435582822,
"grad_norm": 0.3524888807864575,
"learning_rate": 6.913417161825449e-06,
"loss": 0.0086,
"step": 428
},
{
"epoch": 1.3159509202453987,
"grad_norm": 0.4553142705420359,
"learning_rate": 6.896913792678151e-06,
"loss": 0.0116,
"step": 429
},
{
"epoch": 1.3190184049079754,
"grad_norm": 0.40859472374578365,
"learning_rate": 6.880386247692999e-06,
"loss": 0.0111,
"step": 430
},
{
"epoch": 1.322085889570552,
"grad_norm": 0.4777916998093413,
"learning_rate": 6.863834737510695e-06,
"loss": 0.0145,
"step": 431
},
{
"epoch": 1.3251533742331287,
"grad_norm": 0.5288188669572824,
"learning_rate": 6.84725947307737e-06,
"loss": 0.0098,
"step": 432
},
{
"epoch": 1.3282208588957056,
"grad_norm": 0.33681749772954367,
"learning_rate": 6.830660665641898e-06,
"loss": 0.0074,
"step": 433
},
{
"epoch": 1.331288343558282,
"grad_norm": 0.380221021862379,
"learning_rate": 6.814038526753205e-06,
"loss": 0.0074,
"step": 434
},
{
"epoch": 1.334355828220859,
"grad_norm": 0.47839600855959413,
"learning_rate": 6.797393268257577e-06,
"loss": 0.0111,
"step": 435
},
{
"epoch": 1.3374233128834356,
"grad_norm": 0.4962504965337215,
"learning_rate": 6.780725102295949e-06,
"loss": 0.008,
"step": 436
},
{
"epoch": 1.3404907975460123,
"grad_norm": 0.4665720038905597,
"learning_rate": 6.764034241301208e-06,
"loss": 0.01,
"step": 437
},
{
"epoch": 1.343558282208589,
"grad_norm": 0.5222835885924834,
"learning_rate": 6.747320897995493e-06,
"loss": 0.0171,
"step": 438
},
{
"epoch": 1.3466257668711656,
"grad_norm": 0.38877477124771365,
"learning_rate": 6.730585285387465e-06,
"loss": 0.0073,
"step": 439
},
{
"epoch": 1.3496932515337423,
"grad_norm": 0.4599037526704758,
"learning_rate": 6.7138276167696135e-06,
"loss": 0.0082,
"step": 440
},
{
"epoch": 1.352760736196319,
"grad_norm": 0.3746438265537782,
"learning_rate": 6.697048105715521e-06,
"loss": 0.0092,
"step": 441
},
{
"epoch": 1.3558282208588956,
"grad_norm": 0.44845911397633703,
"learning_rate": 6.680246966077151e-06,
"loss": 0.0162,
"step": 442
},
{
"epoch": 1.3588957055214723,
"grad_norm": 0.6221158471942455,
"learning_rate": 6.663424411982121e-06,
"loss": 0.0121,
"step": 443
},
{
"epoch": 1.3619631901840492,
"grad_norm": 0.4239501149738595,
"learning_rate": 6.646580657830967e-06,
"loss": 0.0096,
"step": 444
},
{
"epoch": 1.3650306748466257,
"grad_norm": 0.5037673671165694,
"learning_rate": 6.629715918294422e-06,
"loss": 0.012,
"step": 445
},
{
"epoch": 1.3680981595092025,
"grad_norm": 0.5349443172718803,
"learning_rate": 6.612830408310671e-06,
"loss": 0.0137,
"step": 446
},
{
"epoch": 1.3711656441717792,
"grad_norm": 0.5684159361020573,
"learning_rate": 6.595924343082613e-06,
"loss": 0.0096,
"step": 447
},
{
"epoch": 1.3742331288343559,
"grad_norm": 0.47959961882205193,
"learning_rate": 6.578997938075126e-06,
"loss": 0.0126,
"step": 448
},
{
"epoch": 1.3773006134969326,
"grad_norm": 0.3894016180198823,
"learning_rate": 6.562051409012308e-06,
"loss": 0.0086,
"step": 449
},
{
"epoch": 1.3803680981595092,
"grad_norm": 0.5650260235219143,
"learning_rate": 6.545084971874738e-06,
"loss": 0.0065,
"step": 450
},
{
"epoch": 1.383435582822086,
"grad_norm": 0.5097608717889676,
"learning_rate": 6.528098842896721e-06,
"loss": 0.0081,
"step": 451
},
{
"epoch": 1.3865030674846626,
"grad_norm": 0.39970265760625523,
"learning_rate": 6.51109323856353e-06,
"loss": 0.0178,
"step": 452
},
{
"epoch": 1.3895705521472392,
"grad_norm": 0.31936637132172235,
"learning_rate": 6.494068375608646e-06,
"loss": 0.006,
"step": 453
},
{
"epoch": 1.392638036809816,
"grad_norm": 0.48256810572064407,
"learning_rate": 6.477024471011001e-06,
"loss": 0.0137,
"step": 454
},
{
"epoch": 1.3957055214723926,
"grad_norm": 0.3268496682428815,
"learning_rate": 6.459961741992208e-06,
"loss": 0.0101,
"step": 455
},
{
"epoch": 1.3987730061349692,
"grad_norm": 0.27522272221212113,
"learning_rate": 6.442880406013795e-06,
"loss": 0.0061,
"step": 456
},
{
"epoch": 1.4018404907975461,
"grad_norm": 0.3967353171926939,
"learning_rate": 6.425780680774429e-06,
"loss": 0.0084,
"step": 457
},
{
"epoch": 1.4049079754601226,
"grad_norm": 0.4825910261512796,
"learning_rate": 6.408662784207149e-06,
"loss": 0.0186,
"step": 458
},
{
"epoch": 1.4079754601226995,
"grad_norm": 0.3756408674022022,
"learning_rate": 6.391526934476584e-06,
"loss": 0.0099,
"step": 459
},
{
"epoch": 1.4110429447852761,
"grad_norm": 0.3003980819124125,
"learning_rate": 6.374373349976169e-06,
"loss": 0.0031,
"step": 460
},
{
"epoch": 1.4141104294478528,
"grad_norm": 0.32353782969275396,
"learning_rate": 6.3572022493253715e-06,
"loss": 0.0073,
"step": 461
},
{
"epoch": 1.4171779141104295,
"grad_norm": 0.4094722519435924,
"learning_rate": 6.3400138513668955e-06,
"loss": 0.0073,
"step": 462
},
{
"epoch": 1.4202453987730062,
"grad_norm": 0.3620868544363379,
"learning_rate": 6.322808375163896e-06,
"loss": 0.0087,
"step": 463
},
{
"epoch": 1.4233128834355828,
"grad_norm": 0.33066038137309844,
"learning_rate": 6.305586039997188e-06,
"loss": 0.0105,
"step": 464
},
{
"epoch": 1.4263803680981595,
"grad_norm": 0.46544639219707007,
"learning_rate": 6.2883470653624525e-06,
"loss": 0.006,
"step": 465
},
{
"epoch": 1.4294478527607362,
"grad_norm": 0.42293823165805505,
"learning_rate": 6.271091670967437e-06,
"loss": 0.0133,
"step": 466
},
{
"epoch": 1.4325153374233128,
"grad_norm": 0.631453029409068,
"learning_rate": 6.253820076729151e-06,
"loss": 0.0165,
"step": 467
},
{
"epoch": 1.4355828220858895,
"grad_norm": 0.40562333528166167,
"learning_rate": 6.236532502771078e-06,
"loss": 0.0115,
"step": 468
},
{
"epoch": 1.4386503067484662,
"grad_norm": 0.6375756159004377,
"learning_rate": 6.219229169420354e-06,
"loss": 0.0183,
"step": 469
},
{
"epoch": 1.441717791411043,
"grad_norm": 0.23763952949099515,
"learning_rate": 6.201910297204963e-06,
"loss": 0.0064,
"step": 470
},
{
"epoch": 1.4447852760736195,
"grad_norm": 0.34910826558617847,
"learning_rate": 6.1845761068509365e-06,
"loss": 0.0063,
"step": 471
},
{
"epoch": 1.4478527607361964,
"grad_norm": 0.3946649165234159,
"learning_rate": 6.1672268192795285e-06,
"loss": 0.0067,
"step": 472
},
{
"epoch": 1.450920245398773,
"grad_norm": 0.3762470480826563,
"learning_rate": 6.149862655604404e-06,
"loss": 0.0056,
"step": 473
},
{
"epoch": 1.4539877300613497,
"grad_norm": 0.32139974596711196,
"learning_rate": 6.132483837128823e-06,
"loss": 0.0081,
"step": 474
},
{
"epoch": 1.4570552147239264,
"grad_norm": 0.49808493719028013,
"learning_rate": 6.115090585342818e-06,
"loss": 0.0135,
"step": 475
},
{
"epoch": 1.460122699386503,
"grad_norm": 0.2557083632490389,
"learning_rate": 6.097683121920373e-06,
"loss": 0.0064,
"step": 476
},
{
"epoch": 1.4631901840490797,
"grad_norm": 0.27134111617865514,
"learning_rate": 6.080261668716594e-06,
"loss": 0.0059,
"step": 477
},
{
"epoch": 1.4662576687116564,
"grad_norm": 0.33147639324012906,
"learning_rate": 6.062826447764883e-06,
"loss": 0.0057,
"step": 478
},
{
"epoch": 1.469325153374233,
"grad_norm": 0.6381027833440677,
"learning_rate": 6.045377681274118e-06,
"loss": 0.0171,
"step": 479
},
{
"epoch": 1.4723926380368098,
"grad_norm": 0.4862891083399746,
"learning_rate": 6.027915591625804e-06,
"loss": 0.0088,
"step": 480
},
{
"epoch": 1.4754601226993864,
"grad_norm": 0.4199767355964214,
"learning_rate": 6.010440401371249e-06,
"loss": 0.0137,
"step": 481
},
{
"epoch": 1.478527607361963,
"grad_norm": 0.36036542279401557,
"learning_rate": 5.9929523332287275e-06,
"loss": 0.009,
"step": 482
},
{
"epoch": 1.48159509202454,
"grad_norm": 0.48796334911431527,
"learning_rate": 5.975451610080643e-06,
"loss": 0.0119,
"step": 483
},
{
"epoch": 1.4846625766871164,
"grad_norm": 0.5634036649823058,
"learning_rate": 5.957938454970678e-06,
"loss": 0.0252,
"step": 484
},
{
"epoch": 1.4877300613496933,
"grad_norm": 0.40399169151476966,
"learning_rate": 5.940413091100963e-06,
"loss": 0.0095,
"step": 485
},
{
"epoch": 1.49079754601227,
"grad_norm": 0.4399277205497462,
"learning_rate": 5.922875741829227e-06,
"loss": 0.0112,
"step": 486
},
{
"epoch": 1.4938650306748467,
"grad_norm": 0.5288214595932401,
"learning_rate": 5.905326630665952e-06,
"loss": 0.0081,
"step": 487
},
{
"epoch": 1.4969325153374233,
"grad_norm": 0.49760115052816156,
"learning_rate": 5.887765981271518e-06,
"loss": 0.0102,
"step": 488
},
{
"epoch": 1.5,
"grad_norm": 0.41652107889647155,
"learning_rate": 5.87019401745336e-06,
"loss": 0.0049,
"step": 489
},
{
"epoch": 1.5030674846625767,
"grad_norm": 0.2597678001433525,
"learning_rate": 5.85261096316312e-06,
"loss": 0.0073,
"step": 490
},
{
"epoch": 1.5061349693251533,
"grad_norm": 0.38080455678840186,
"learning_rate": 5.835017042493776e-06,
"loss": 0.0096,
"step": 491
},
{
"epoch": 1.50920245398773,
"grad_norm": 0.43224653405380203,
"learning_rate": 5.817412479676801e-06,
"loss": 0.0057,
"step": 492
},
{
"epoch": 1.5122699386503067,
"grad_norm": 0.3262531946722451,
"learning_rate": 5.799797499079301e-06,
"loss": 0.0087,
"step": 493
},
{
"epoch": 1.5153374233128836,
"grad_norm": 0.3016558145931953,
"learning_rate": 5.782172325201155e-06,
"loss": 0.0056,
"step": 494
},
{
"epoch": 1.51840490797546,
"grad_norm": 0.37233224370927953,
"learning_rate": 5.764537182672151e-06,
"loss": 0.0083,
"step": 495
},
{
"epoch": 1.521472392638037,
"grad_norm": 0.37132800727522514,
"learning_rate": 5.746892296249126e-06,
"loss": 0.011,
"step": 496
},
{
"epoch": 1.5245398773006134,
"grad_norm": 0.40201041032133183,
"learning_rate": 5.729237890813104e-06,
"loss": 0.009,
"step": 497
},
{
"epoch": 1.5276073619631902,
"grad_norm": 0.40733654757757093,
"learning_rate": 5.711574191366427e-06,
"loss": 0.0131,
"step": 498
},
{
"epoch": 1.530674846625767,
"grad_norm": 0.6727885144806748,
"learning_rate": 5.693901423029882e-06,
"loss": 0.0152,
"step": 499
},
{
"epoch": 1.5337423312883436,
"grad_norm": 1.000739650786129,
"learning_rate": 5.676219811039844e-06,
"loss": 0.0241,
"step": 500
},
{
"epoch": 1.5368098159509203,
"grad_norm": 0.5235874249664593,
"learning_rate": 5.658529580745399e-06,
"loss": 0.0132,
"step": 501
},
{
"epoch": 1.539877300613497,
"grad_norm": 0.5350650961027892,
"learning_rate": 5.640830957605466e-06,
"loss": 0.0127,
"step": 502
},
{
"epoch": 1.5429447852760736,
"grad_norm": 0.3817664984754992,
"learning_rate": 5.62312416718593e-06,
"loss": 0.0114,
"step": 503
},
{
"epoch": 1.5460122699386503,
"grad_norm": 0.3825385427474814,
"learning_rate": 5.605409435156774e-06,
"loss": 0.0138,
"step": 504
},
{
"epoch": 1.5490797546012272,
"grad_norm": 0.21138154531244147,
"learning_rate": 5.587686987289189e-06,
"loss": 0.0035,
"step": 505
},
{
"epoch": 1.5521472392638036,
"grad_norm": 0.38123503529777064,
"learning_rate": 5.569957049452703e-06,
"loss": 0.0089,
"step": 506
},
{
"epoch": 1.5552147239263805,
"grad_norm": 0.600172423644381,
"learning_rate": 5.552219847612307e-06,
"loss": 0.0152,
"step": 507
},
{
"epoch": 1.558282208588957,
"grad_norm": 0.4589414710140213,
"learning_rate": 5.534475607825566e-06,
"loss": 0.013,
"step": 508
},
{
"epoch": 1.5613496932515338,
"grad_norm": 0.3995615128331652,
"learning_rate": 5.516724556239741e-06,
"loss": 0.0106,
"step": 509
},
{
"epoch": 1.5644171779141103,
"grad_norm": 0.5902773538723448,
"learning_rate": 5.498966919088914e-06,
"loss": 0.0121,
"step": 510
},
{
"epoch": 1.5674846625766872,
"grad_norm": 0.33586208745792945,
"learning_rate": 5.481202922691097e-06,
"loss": 0.0059,
"step": 511
},
{
"epoch": 1.5705521472392638,
"grad_norm": 0.29125387626531496,
"learning_rate": 5.4634327934453444e-06,
"loss": 0.0055,
"step": 512
},
{
"epoch": 1.5736196319018405,
"grad_norm": 0.4066095991071624,
"learning_rate": 5.44565675782888e-06,
"loss": 0.011,
"step": 513
},
{
"epoch": 1.5766871165644172,
"grad_norm": 0.32808822216637573,
"learning_rate": 5.4278750423942e-06,
"loss": 0.0078,
"step": 514
},
{
"epoch": 1.5797546012269938,
"grad_norm": 0.3314484003720461,
"learning_rate": 5.410087873766187e-06,
"loss": 0.0083,
"step": 515
},
{
"epoch": 1.5828220858895705,
"grad_norm": 0.3336629272045388,
"learning_rate": 5.392295478639226e-06,
"loss": 0.0088,
"step": 516
},
{
"epoch": 1.5858895705521472,
"grad_norm": 0.3628271904977443,
"learning_rate": 5.3744980837743125e-06,
"loss": 0.0105,
"step": 517
},
{
"epoch": 1.588957055214724,
"grad_norm": 0.31955150321101644,
"learning_rate": 5.356695915996162e-06,
"loss": 0.0084,
"step": 518
},
{
"epoch": 1.5920245398773005,
"grad_norm": 0.5210028446504106,
"learning_rate": 5.338889202190322e-06,
"loss": 0.0058,
"step": 519
},
{
"epoch": 1.5950920245398774,
"grad_norm": 0.759371015661337,
"learning_rate": 5.321078169300275e-06,
"loss": 0.0613,
"step": 520
},
{
"epoch": 1.5981595092024539,
"grad_norm": 0.3113390592301875,
"learning_rate": 5.303263044324556e-06,
"loss": 0.0041,
"step": 521
},
{
"epoch": 1.6012269938650308,
"grad_norm": 0.4287829629260837,
"learning_rate": 5.285444054313841e-06,
"loss": 0.0087,
"step": 522
},
{
"epoch": 1.6042944785276072,
"grad_norm": 0.46055387526192015,
"learning_rate": 5.267621426368076e-06,
"loss": 0.0082,
"step": 523
},
{
"epoch": 1.607361963190184,
"grad_norm": 0.3797840805599973,
"learning_rate": 5.249795387633571e-06,
"loss": 0.0065,
"step": 524
},
{
"epoch": 1.6104294478527608,
"grad_norm": 0.3930675190109252,
"learning_rate": 5.231966165300099e-06,
"loss": 0.0103,
"step": 525
},
{
"epoch": 1.6134969325153374,
"grad_norm": 0.3442820596582778,
"learning_rate": 5.214133986598014e-06,
"loss": 0.0082,
"step": 526
},
{
"epoch": 1.616564417177914,
"grad_norm": 0.3050671116162007,
"learning_rate": 5.1962990787953436e-06,
"loss": 0.006,
"step": 527
},
{
"epoch": 1.6196319018404908,
"grad_norm": 0.3536096571221858,
"learning_rate": 5.178461669194903e-06,
"loss": 0.0065,
"step": 528
},
{
"epoch": 1.6226993865030674,
"grad_norm": 0.4060284135099058,
"learning_rate": 5.160621985131389e-06,
"loss": 0.0257,
"step": 529
},
{
"epoch": 1.6257668711656441,
"grad_norm": 0.25927903674544184,
"learning_rate": 5.142780253968481e-06,
"loss": 0.0066,
"step": 530
},
{
"epoch": 1.628834355828221,
"grad_norm": 0.2838865350019555,
"learning_rate": 5.124936703095961e-06,
"loss": 0.0099,
"step": 531
},
{
"epoch": 1.6319018404907975,
"grad_norm": 0.3504714577452807,
"learning_rate": 5.1070915599267914e-06,
"loss": 0.0069,
"step": 532
},
{
"epoch": 1.6349693251533743,
"grad_norm": 0.3550050266957588,
"learning_rate": 5.089245051894231e-06,
"loss": 0.0111,
"step": 533
},
{
"epoch": 1.6380368098159508,
"grad_norm": 0.35728707924870967,
"learning_rate": 5.071397406448937e-06,
"loss": 0.0099,
"step": 534
},
{
"epoch": 1.6411042944785277,
"grad_norm": 0.3773540116151394,
"learning_rate": 5.05354885105606e-06,
"loss": 0.0058,
"step": 535
},
{
"epoch": 1.6441717791411041,
"grad_norm": 0.3949338659980868,
"learning_rate": 5.035699613192348e-06,
"loss": 0.0082,
"step": 536
},
{
"epoch": 1.647239263803681,
"grad_norm": 0.4646573686558157,
"learning_rate": 5.017849920343246e-06,
"loss": 0.01,
"step": 537
},
{
"epoch": 1.6503067484662577,
"grad_norm": 0.5642128335140358,
"learning_rate": 5e-06,
"loss": 0.0143,
"step": 538
},
{
"epoch": 1.6533742331288344,
"grad_norm": 0.2479092759047113,
"learning_rate": 4.9821500796567565e-06,
"loss": 0.0032,
"step": 539
},
{
"epoch": 1.656441717791411,
"grad_norm": 0.42550957150659624,
"learning_rate": 4.964300386807653e-06,
"loss": 0.0089,
"step": 540
},
{
"epoch": 1.6595092024539877,
"grad_norm": 0.36622590607369476,
"learning_rate": 4.946451148943941e-06,
"loss": 0.0064,
"step": 541
},
{
"epoch": 1.6625766871165644,
"grad_norm": 0.4546077208760806,
"learning_rate": 4.928602593551065e-06,
"loss": 0.0113,
"step": 542
},
{
"epoch": 1.665644171779141,
"grad_norm": 0.3196662160157007,
"learning_rate": 4.91075494810577e-06,
"loss": 0.0038,
"step": 543
},
{
"epoch": 1.668711656441718,
"grad_norm": 0.1781642502108262,
"learning_rate": 4.89290844007321e-06,
"loss": 0.0026,
"step": 544
},
{
"epoch": 1.6717791411042944,
"grad_norm": 0.44145521346579086,
"learning_rate": 4.875063296904042e-06,
"loss": 0.0147,
"step": 545
},
{
"epoch": 1.6748466257668713,
"grad_norm": 0.2982215293082941,
"learning_rate": 4.85721974603152e-06,
"loss": 0.0034,
"step": 546
},
{
"epoch": 1.6779141104294477,
"grad_norm": 0.5088058908733735,
"learning_rate": 4.8393780148686135e-06,
"loss": 0.0106,
"step": 547
},
{
"epoch": 1.6809815950920246,
"grad_norm": 0.2439455722582877,
"learning_rate": 4.821538330805098e-06,
"loss": 0.0047,
"step": 548
},
{
"epoch": 1.6840490797546013,
"grad_norm": 0.48092459587570097,
"learning_rate": 4.803700921204659e-06,
"loss": 0.0071,
"step": 549
},
{
"epoch": 1.687116564417178,
"grad_norm": 0.43594614747810895,
"learning_rate": 4.785866013401989e-06,
"loss": 0.0097,
"step": 550
},
{
"epoch": 1.6901840490797546,
"grad_norm": 0.2719151732829852,
"learning_rate": 4.768033834699903e-06,
"loss": 0.0051,
"step": 551
},
{
"epoch": 1.6932515337423313,
"grad_norm": 0.2671115953727491,
"learning_rate": 4.750204612366432e-06,
"loss": 0.0059,
"step": 552
},
{
"epoch": 1.696319018404908,
"grad_norm": 0.5253601733548036,
"learning_rate": 4.7323785736319246e-06,
"loss": 0.0081,
"step": 553
},
{
"epoch": 1.6993865030674846,
"grad_norm": 0.3830793685344685,
"learning_rate": 4.71455594568616e-06,
"loss": 0.0083,
"step": 554
},
{
"epoch": 1.7024539877300615,
"grad_norm": 0.31034852969587023,
"learning_rate": 4.6967369556754476e-06,
"loss": 0.0054,
"step": 555
},
{
"epoch": 1.705521472392638,
"grad_norm": 0.4396568468476409,
"learning_rate": 4.678921830699724e-06,
"loss": 0.0085,
"step": 556
},
{
"epoch": 1.7085889570552149,
"grad_norm": 0.40764526944989854,
"learning_rate": 4.661110797809679e-06,
"loss": 0.0084,
"step": 557
},
{
"epoch": 1.7116564417177913,
"grad_norm": 0.4813930042673,
"learning_rate": 4.643304084003839e-06,
"loss": 0.0078,
"step": 558
},
{
"epoch": 1.7147239263803682,
"grad_norm": 0.38796762918199773,
"learning_rate": 4.62550191622569e-06,
"loss": 0.0093,
"step": 559
},
{
"epoch": 1.7177914110429446,
"grad_norm": 0.26689306148678504,
"learning_rate": 4.6077045213607765e-06,
"loss": 0.0077,
"step": 560
},
{
"epoch": 1.7208588957055215,
"grad_norm": 0.40170358200898043,
"learning_rate": 4.589912126233815e-06,
"loss": 0.0087,
"step": 561
},
{
"epoch": 1.7239263803680982,
"grad_norm": 0.2382315472763123,
"learning_rate": 4.572124957605803e-06,
"loss": 0.0044,
"step": 562
},
{
"epoch": 1.7269938650306749,
"grad_norm": 0.7026811275146098,
"learning_rate": 4.554343242171121e-06,
"loss": 0.0095,
"step": 563
},
{
"epoch": 1.7300613496932515,
"grad_norm": 0.35813866462271615,
"learning_rate": 4.536567206554656e-06,
"loss": 0.0082,
"step": 564
},
{
"epoch": 1.7331288343558282,
"grad_norm": 0.461673612521973,
"learning_rate": 4.5187970773089054e-06,
"loss": 0.0173,
"step": 565
},
{
"epoch": 1.7361963190184049,
"grad_norm": 0.2649325350539358,
"learning_rate": 4.501033080911087e-06,
"loss": 0.0067,
"step": 566
},
{
"epoch": 1.7392638036809815,
"grad_norm": 0.36931292694664386,
"learning_rate": 4.483275443760261e-06,
"loss": 0.0071,
"step": 567
},
{
"epoch": 1.7423312883435584,
"grad_norm": 0.46353780643994597,
"learning_rate": 4.465524392174437e-06,
"loss": 0.013,
"step": 568
},
{
"epoch": 1.7453987730061349,
"grad_norm": 0.14701431312044777,
"learning_rate": 4.447780152387694e-06,
"loss": 0.0018,
"step": 569
},
{
"epoch": 1.7484662576687118,
"grad_norm": 0.24576972288599763,
"learning_rate": 4.430042950547298e-06,
"loss": 0.0044,
"step": 570
},
{
"epoch": 1.7515337423312882,
"grad_norm": 0.5453432857666575,
"learning_rate": 4.4123130127108125e-06,
"loss": 0.0196,
"step": 571
},
{
"epoch": 1.7546012269938651,
"grad_norm": 0.3874868229083472,
"learning_rate": 4.394590564843226e-06,
"loss": 0.0054,
"step": 572
},
{
"epoch": 1.7576687116564416,
"grad_norm": 0.3535835403255216,
"learning_rate": 4.376875832814071e-06,
"loss": 0.0053,
"step": 573
},
{
"epoch": 1.7607361963190185,
"grad_norm": 0.2749489766791634,
"learning_rate": 4.359169042394537e-06,
"loss": 0.0058,
"step": 574
},
{
"epoch": 1.7638036809815951,
"grad_norm": 0.27719491743372315,
"learning_rate": 4.341470419254603e-06,
"loss": 0.0056,
"step": 575
},
{
"epoch": 1.7668711656441718,
"grad_norm": 0.42628419434925463,
"learning_rate": 4.323780188960156e-06,
"loss": 0.0082,
"step": 576
},
{
"epoch": 1.7699386503067485,
"grad_norm": 0.37149048561678855,
"learning_rate": 4.306098576970119e-06,
"loss": 0.0073,
"step": 577
},
{
"epoch": 1.7730061349693251,
"grad_norm": 0.4347325087122037,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.0081,
"step": 578
},
{
"epoch": 1.7760736196319018,
"grad_norm": 0.4011537380642771,
"learning_rate": 4.270762109186896e-06,
"loss": 0.007,
"step": 579
},
{
"epoch": 1.7791411042944785,
"grad_norm": 0.3626109586279271,
"learning_rate": 4.253107703750875e-06,
"loss": 0.0069,
"step": 580
},
{
"epoch": 1.7822085889570554,
"grad_norm": 0.27490585927193734,
"learning_rate": 4.235462817327851e-06,
"loss": 0.0053,
"step": 581
},
{
"epoch": 1.7852760736196318,
"grad_norm": 0.4947399469938696,
"learning_rate": 4.217827674798845e-06,
"loss": 0.0058,
"step": 582
},
{
"epoch": 1.7883435582822087,
"grad_norm": 0.4136276570729244,
"learning_rate": 4.2002025009207e-06,
"loss": 0.0117,
"step": 583
},
{
"epoch": 1.7914110429447851,
"grad_norm": 0.38559944723198136,
"learning_rate": 4.1825875203232015e-06,
"loss": 0.0088,
"step": 584
},
{
"epoch": 1.794478527607362,
"grad_norm": 0.4626653370701894,
"learning_rate": 4.164982957506226e-06,
"loss": 0.0078,
"step": 585
},
{
"epoch": 1.7975460122699385,
"grad_norm": 0.24447282240138413,
"learning_rate": 4.147389036836881e-06,
"loss": 0.0026,
"step": 586
},
{
"epoch": 1.8006134969325154,
"grad_norm": 0.3519436504460508,
"learning_rate": 4.129805982546641e-06,
"loss": 0.0085,
"step": 587
},
{
"epoch": 1.803680981595092,
"grad_norm": 0.28482524916262597,
"learning_rate": 4.1122340187284845e-06,
"loss": 0.0033,
"step": 588
},
{
"epoch": 1.8067484662576687,
"grad_norm": 0.29892032636140786,
"learning_rate": 4.09467336933405e-06,
"loss": 0.0042,
"step": 589
},
{
"epoch": 1.8098159509202454,
"grad_norm": 0.4407796725334618,
"learning_rate": 4.077124258170774e-06,
"loss": 0.009,
"step": 590
},
{
"epoch": 1.812883435582822,
"grad_norm": 0.4583133431318533,
"learning_rate": 4.059586908899039e-06,
"loss": 0.0076,
"step": 591
},
{
"epoch": 1.8159509202453987,
"grad_norm": 0.8039166331283459,
"learning_rate": 4.042061545029323e-06,
"loss": 0.0089,
"step": 592
},
{
"epoch": 1.8190184049079754,
"grad_norm": 0.3667345238233017,
"learning_rate": 4.02454838991936e-06,
"loss": 0.011,
"step": 593
},
{
"epoch": 1.8220858895705523,
"grad_norm": 0.208178307389126,
"learning_rate": 4.007047666771274e-06,
"loss": 0.0022,
"step": 594
},
{
"epoch": 1.8251533742331287,
"grad_norm": 0.5198316863113215,
"learning_rate": 3.989559598628753e-06,
"loss": 0.0117,
"step": 595
},
{
"epoch": 1.8282208588957056,
"grad_norm": 0.4442650765755917,
"learning_rate": 3.972084408374198e-06,
"loss": 0.0101,
"step": 596
},
{
"epoch": 1.831288343558282,
"grad_norm": 0.48936961648999217,
"learning_rate": 3.954622318725884e-06,
"loss": 0.0068,
"step": 597
},
{
"epoch": 1.834355828220859,
"grad_norm": 0.20528461066058848,
"learning_rate": 3.937173552235117e-06,
"loss": 0.0045,
"step": 598
},
{
"epoch": 1.8374233128834356,
"grad_norm": 0.29341413585659293,
"learning_rate": 3.919738331283408e-06,
"loss": 0.0057,
"step": 599
},
{
"epoch": 1.8404907975460123,
"grad_norm": 0.2557977944149396,
"learning_rate": 3.902316878079629e-06,
"loss": 0.0041,
"step": 600
},
{
"epoch": 1.843558282208589,
"grad_norm": 0.4227990072533373,
"learning_rate": 3.8849094146571836e-06,
"loss": 0.0119,
"step": 601
},
{
"epoch": 1.8466257668711656,
"grad_norm": 0.5214661055541566,
"learning_rate": 3.867516162871177e-06,
"loss": 0.0141,
"step": 602
},
{
"epoch": 1.8496932515337423,
"grad_norm": 0.37635698001587115,
"learning_rate": 3.850137344395598e-06,
"loss": 0.0082,
"step": 603
},
{
"epoch": 1.852760736196319,
"grad_norm": 0.23128959245280112,
"learning_rate": 3.832773180720475e-06,
"loss": 0.0056,
"step": 604
},
{
"epoch": 1.8558282208588959,
"grad_norm": 0.38717364607066673,
"learning_rate": 3.815423893149065e-06,
"loss": 0.0135,
"step": 605
},
{
"epoch": 1.8588957055214723,
"grad_norm": 0.27375167119044574,
"learning_rate": 3.798089702795038e-06,
"loss": 0.0066,
"step": 606
},
{
"epoch": 1.8619631901840492,
"grad_norm": 0.3375505038267758,
"learning_rate": 3.7807708305796494e-06,
"loss": 0.0076,
"step": 607
},
{
"epoch": 1.8650306748466257,
"grad_norm": 0.5530104418841859,
"learning_rate": 3.7634674972289227e-06,
"loss": 0.0146,
"step": 608
},
{
"epoch": 1.8680981595092025,
"grad_norm": 0.41041915581422406,
"learning_rate": 3.7461799232708497e-06,
"loss": 0.0051,
"step": 609
},
{
"epoch": 1.871165644171779,
"grad_norm": 0.3713305823986346,
"learning_rate": 3.7289083290325668e-06,
"loss": 0.0102,
"step": 610
},
{
"epoch": 1.8742331288343559,
"grad_norm": 0.46230186432718484,
"learning_rate": 3.7116529346375487e-06,
"loss": 0.0098,
"step": 611
},
{
"epoch": 1.8773006134969326,
"grad_norm": 0.16535019806482476,
"learning_rate": 3.694413960002814e-06,
"loss": 0.003,
"step": 612
},
{
"epoch": 1.8803680981595092,
"grad_norm": 0.2930581875533518,
"learning_rate": 3.677191624836106e-06,
"loss": 0.0032,
"step": 613
},
{
"epoch": 1.883435582822086,
"grad_norm": 0.373888513580928,
"learning_rate": 3.6599861486331074e-06,
"loss": 0.0059,
"step": 614
},
{
"epoch": 1.8865030674846626,
"grad_norm": 0.6381390350325629,
"learning_rate": 3.6427977506746293e-06,
"loss": 0.0122,
"step": 615
},
{
"epoch": 1.8895705521472392,
"grad_norm": 0.41045091382297616,
"learning_rate": 3.6256266500238312e-06,
"loss": 0.0129,
"step": 616
},
{
"epoch": 1.892638036809816,
"grad_norm": 0.49828625275146515,
"learning_rate": 3.608473065523419e-06,
"loss": 0.01,
"step": 617
},
{
"epoch": 1.8957055214723928,
"grad_norm": 0.3956832540966562,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.0062,
"step": 618
},
{
"epoch": 1.8987730061349692,
"grad_norm": 0.3672630309971258,
"learning_rate": 3.5742193192255724e-06,
"loss": 0.0061,
"step": 619
},
{
"epoch": 1.9018404907975461,
"grad_norm": 0.3145272087003987,
"learning_rate": 3.557119593986208e-06,
"loss": 0.0059,
"step": 620
},
{
"epoch": 1.9049079754601226,
"grad_norm": 0.30910831663414223,
"learning_rate": 3.5400382580077923e-06,
"loss": 0.0033,
"step": 621
},
{
"epoch": 1.9079754601226995,
"grad_norm": 0.1959441278445393,
"learning_rate": 3.5229755289890005e-06,
"loss": 0.0043,
"step": 622
},
{
"epoch": 1.911042944785276,
"grad_norm": 0.5277004855106132,
"learning_rate": 3.5059316243913556e-06,
"loss": 0.0106,
"step": 623
},
{
"epoch": 1.9141104294478528,
"grad_norm": 0.21771914977302229,
"learning_rate": 3.4889067614364714e-06,
"loss": 0.0041,
"step": 624
},
{
"epoch": 1.9171779141104295,
"grad_norm": 0.39360915150568093,
"learning_rate": 3.47190115710328e-06,
"loss": 0.0073,
"step": 625
},
{
"epoch": 1.9202453987730062,
"grad_norm": 0.3485481083987293,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.0136,
"step": 626
},
{
"epoch": 1.9233128834355828,
"grad_norm": 0.24255517433539572,
"learning_rate": 3.437948590987693e-06,
"loss": 0.0051,
"step": 627
},
{
"epoch": 1.9263803680981595,
"grad_norm": 0.29586738644042815,
"learning_rate": 3.4210020619248762e-06,
"loss": 0.0089,
"step": 628
},
{
"epoch": 1.9294478527607362,
"grad_norm": 0.4186524656839354,
"learning_rate": 3.404075656917388e-06,
"loss": 0.0074,
"step": 629
},
{
"epoch": 1.9325153374233128,
"grad_norm": 0.30619441436870193,
"learning_rate": 3.3871695916893314e-06,
"loss": 0.0096,
"step": 630
},
{
"epoch": 1.9355828220858897,
"grad_norm": 0.578307433564674,
"learning_rate": 3.370284081705579e-06,
"loss": 0.0075,
"step": 631
},
{
"epoch": 1.9386503067484662,
"grad_norm": 0.44719452467917326,
"learning_rate": 3.3534193421690348e-06,
"loss": 0.0108,
"step": 632
},
{
"epoch": 1.941717791411043,
"grad_norm": 0.26118371726805234,
"learning_rate": 3.336575588017881e-06,
"loss": 0.0052,
"step": 633
},
{
"epoch": 1.9447852760736195,
"grad_norm": 0.48800768739543726,
"learning_rate": 3.319753033922849e-06,
"loss": 0.0086,
"step": 634
},
{
"epoch": 1.9478527607361964,
"grad_norm": 0.4006588666794408,
"learning_rate": 3.302951894284481e-06,
"loss": 0.0114,
"step": 635
},
{
"epoch": 1.9509202453987728,
"grad_norm": 0.2289092092840238,
"learning_rate": 3.286172383230388e-06,
"loss": 0.0024,
"step": 636
},
{
"epoch": 1.9539877300613497,
"grad_norm": 0.4127045861094555,
"learning_rate": 3.269414714612534e-06,
"loss": 0.0087,
"step": 637
},
{
"epoch": 1.9570552147239264,
"grad_norm": 0.3163153860623139,
"learning_rate": 3.252679102004509e-06,
"loss": 0.0065,
"step": 638
},
{
"epoch": 1.960122699386503,
"grad_norm": 0.2603842753498582,
"learning_rate": 3.2359657586987934e-06,
"loss": 0.0067,
"step": 639
},
{
"epoch": 1.9631901840490797,
"grad_norm": 0.557283191260559,
"learning_rate": 3.2192748977040535e-06,
"loss": 0.0097,
"step": 640
},
{
"epoch": 1.9662576687116564,
"grad_norm": 0.34901479007130604,
"learning_rate": 3.2026067317424244e-06,
"loss": 0.0096,
"step": 641
},
{
"epoch": 1.969325153374233,
"grad_norm": 0.45136631101338426,
"learning_rate": 3.1859614732467957e-06,
"loss": 0.0135,
"step": 642
},
{
"epoch": 1.9723926380368098,
"grad_norm": 0.23384558728197918,
"learning_rate": 3.169339334358105e-06,
"loss": 0.0054,
"step": 643
},
{
"epoch": 1.9754601226993866,
"grad_norm": 0.14084876331113608,
"learning_rate": 3.152740526922631e-06,
"loss": 0.0017,
"step": 644
},
{
"epoch": 1.978527607361963,
"grad_norm": 0.4962557161989476,
"learning_rate": 3.1361652624893065e-06,
"loss": 0.0534,
"step": 645
},
{
"epoch": 1.98159509202454,
"grad_norm": 0.34403517138193335,
"learning_rate": 3.119613752307002e-06,
"loss": 0.0041,
"step": 646
},
{
"epoch": 1.9846625766871164,
"grad_norm": 0.15306880270008735,
"learning_rate": 3.1030862073218493e-06,
"loss": 0.0023,
"step": 647
},
{
"epoch": 1.9877300613496933,
"grad_norm": 0.33113489538489377,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.0071,
"step": 648
},
{
"epoch": 1.99079754601227,
"grad_norm": 0.5136944684210311,
"learning_rate": 3.0701038551976888e-06,
"loss": 0.0127,
"step": 649
},
{
"epoch": 1.9938650306748467,
"grad_norm": 0.636270989600524,
"learning_rate": 3.053649468413043e-06,
"loss": 0.0163,
"step": 650
},
{
"epoch": 1.9969325153374233,
"grad_norm": 0.4511697907324155,
"learning_rate": 3.037219887528928e-06,
"loss": 0.0075,
"step": 651
},
{
"epoch": 2.0,
"grad_norm": 0.3185377520844948,
"learning_rate": 3.020815321937509e-06,
"loss": 0.0052,
"step": 652
},
{
"epoch": 2.003067484662577,
"grad_norm": 0.1568715672001099,
"learning_rate": 3.0044359807121295e-06,
"loss": 0.0029,
"step": 653
},
{
"epoch": 2.0061349693251533,
"grad_norm": 0.08384719456802235,
"learning_rate": 2.9880820726046613e-06,
"loss": 0.0009,
"step": 654
},
{
"epoch": 2.0092024539877302,
"grad_norm": 0.2688822844440017,
"learning_rate": 2.9717538060428307e-06,
"loss": 0.0057,
"step": 655
},
{
"epoch": 2.0122699386503067,
"grad_norm": 0.156419801199432,
"learning_rate": 2.955451389127567e-06,
"loss": 0.0027,
"step": 656
},
{
"epoch": 2.0153374233128836,
"grad_norm": 0.13118375650446254,
"learning_rate": 2.9391750296303506e-06,
"loss": 0.0009,
"step": 657
},
{
"epoch": 2.01840490797546,
"grad_norm": 0.2776210187070672,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.003,
"step": 658
},
{
"epoch": 2.021472392638037,
"grad_norm": 0.23522515094248445,
"learning_rate": 2.906701312312861e-06,
"loss": 0.0042,
"step": 659
},
{
"epoch": 2.0245398773006134,
"grad_norm": 0.2197503271341387,
"learning_rate": 2.8905043683644875e-06,
"loss": 0.0022,
"step": 660
},
{
"epoch": 2.0276073619631902,
"grad_norm": 0.45337971579413167,
"learning_rate": 2.8743343095727006e-06,
"loss": 0.0059,
"step": 661
},
{
"epoch": 2.0306748466257667,
"grad_norm": 0.18232608740756975,
"learning_rate": 2.8581913420220954e-06,
"loss": 0.0017,
"step": 662
},
{
"epoch": 2.0337423312883436,
"grad_norm": 0.10602348834285584,
"learning_rate": 2.842075671451996e-06,
"loss": 0.0027,
"step": 663
},
{
"epoch": 2.03680981595092,
"grad_norm": 0.24863800686061804,
"learning_rate": 2.8259875032538407e-06,
"loss": 0.0053,
"step": 664
},
{
"epoch": 2.039877300613497,
"grad_norm": 0.24148419846802188,
"learning_rate": 2.8099270424685427e-06,
"loss": 0.0036,
"step": 665
},
{
"epoch": 2.042944785276074,
"grad_norm": 0.18602273100673639,
"learning_rate": 2.7938944937838924e-06,
"loss": 0.0031,
"step": 666
},
{
"epoch": 2.0460122699386503,
"grad_norm": 0.22353235398611618,
"learning_rate": 2.7778900615319536e-06,
"loss": 0.0018,
"step": 667
},
{
"epoch": 2.049079754601227,
"grad_norm": 0.22727585190425154,
"learning_rate": 2.761913949686438e-06,
"loss": 0.0023,
"step": 668
},
{
"epoch": 2.0521472392638036,
"grad_norm": 0.18334423927687288,
"learning_rate": 2.745966361860127e-06,
"loss": 0.0023,
"step": 669
},
{
"epoch": 2.0552147239263805,
"grad_norm": 0.18018696754304367,
"learning_rate": 2.7300475013022666e-06,
"loss": 0.0033,
"step": 670
},
{
"epoch": 2.058282208588957,
"grad_norm": 0.2365128870735293,
"learning_rate": 2.7141575708959777e-06,
"loss": 0.0025,
"step": 671
},
{
"epoch": 2.061349693251534,
"grad_norm": 0.24627552484555662,
"learning_rate": 2.698296773155673e-06,
"loss": 0.0024,
"step": 672
},
{
"epoch": 2.0644171779141103,
"grad_norm": 0.3428729630059143,
"learning_rate": 2.682465310224473e-06,
"loss": 0.0096,
"step": 673
},
{
"epoch": 2.067484662576687,
"grad_norm": 0.21165758715609964,
"learning_rate": 2.6666633838716317e-06,
"loss": 0.0043,
"step": 674
},
{
"epoch": 2.0705521472392636,
"grad_norm": 0.12204006380648764,
"learning_rate": 2.6508911954899648e-06,
"loss": 0.0014,
"step": 675
},
{
"epoch": 2.0736196319018405,
"grad_norm": 0.1545359743955643,
"learning_rate": 2.6351489460932815e-06,
"loss": 0.0038,
"step": 676
},
{
"epoch": 2.0766871165644174,
"grad_norm": 0.27334965743962947,
"learning_rate": 2.6194368363138258e-06,
"loss": 0.0039,
"step": 677
},
{
"epoch": 2.079754601226994,
"grad_norm": 0.14500449963412065,
"learning_rate": 2.603755066399718e-06,
"loss": 0.0026,
"step": 678
},
{
"epoch": 2.0828220858895707,
"grad_norm": 0.25173969209235253,
"learning_rate": 2.5881038362123937e-06,
"loss": 0.004,
"step": 679
},
{
"epoch": 2.085889570552147,
"grad_norm": 0.23000782863325714,
"learning_rate": 2.5724833452240796e-06,
"loss": 0.0045,
"step": 680
},
{
"epoch": 2.088957055214724,
"grad_norm": 0.3760891599612169,
"learning_rate": 2.5568937925152272e-06,
"loss": 0.0071,
"step": 681
},
{
"epoch": 2.0920245398773005,
"grad_norm": 0.4376412312537201,
"learning_rate": 2.5413353767719805e-06,
"loss": 0.0043,
"step": 682
},
{
"epoch": 2.0950920245398774,
"grad_norm": 0.05798098025158203,
"learning_rate": 2.5258082962836616e-06,
"loss": 0.0007,
"step": 683
},
{
"epoch": 2.098159509202454,
"grad_norm": 0.2054750633268483,
"learning_rate": 2.510312748940222e-06,
"loss": 0.0027,
"step": 684
},
{
"epoch": 2.1012269938650308,
"grad_norm": 0.11726586713716912,
"learning_rate": 2.4948489322297258e-06,
"loss": 0.0016,
"step": 685
},
{
"epoch": 2.104294478527607,
"grad_norm": 0.12698443724257102,
"learning_rate": 2.4794170432358415e-06,
"loss": 0.0031,
"step": 686
},
{
"epoch": 2.107361963190184,
"grad_norm": 0.22795055728578995,
"learning_rate": 2.464017278635323e-06,
"loss": 0.0032,
"step": 687
},
{
"epoch": 2.1104294478527605,
"grad_norm": 0.1957131964875298,
"learning_rate": 2.448649834695503e-06,
"loss": 0.0033,
"step": 688
},
{
"epoch": 2.1134969325153374,
"grad_norm": 0.19210852074104182,
"learning_rate": 2.433314907271794e-06,
"loss": 0.0027,
"step": 689
},
{
"epoch": 2.116564417177914,
"grad_norm": 0.04233819240534726,
"learning_rate": 2.418012691805191e-06,
"loss": 0.0005,
"step": 690
},
{
"epoch": 2.1196319018404908,
"grad_norm": 0.26464285040782537,
"learning_rate": 2.402743383319781e-06,
"loss": 0.0039,
"step": 691
},
{
"epoch": 2.1226993865030677,
"grad_norm": 0.11980748517310268,
"learning_rate": 2.387507176420256e-06,
"loss": 0.0021,
"step": 692
},
{
"epoch": 2.125766871165644,
"grad_norm": 0.3334739663015842,
"learning_rate": 2.372304265289436e-06,
"loss": 0.0037,
"step": 693
},
{
"epoch": 2.128834355828221,
"grad_norm": 0.6850456912727723,
"learning_rate": 2.3571348436857906e-06,
"loss": 0.0028,
"step": 694
},
{
"epoch": 2.1319018404907975,
"grad_norm": 0.28144676205745234,
"learning_rate": 2.3419991049409713e-06,
"loss": 0.0036,
"step": 695
},
{
"epoch": 2.1349693251533743,
"grad_norm": 0.108927883546715,
"learning_rate": 2.326897241957348e-06,
"loss": 0.0018,
"step": 696
},
{
"epoch": 2.138036809815951,
"grad_norm": 0.38148279786228145,
"learning_rate": 2.3118294472055497e-06,
"loss": 0.0019,
"step": 697
},
{
"epoch": 2.1411042944785277,
"grad_norm": 0.1970691641521866,
"learning_rate": 2.296795912722014e-06,
"loss": 0.0041,
"step": 698
},
{
"epoch": 2.144171779141104,
"grad_norm": 0.15517303717415623,
"learning_rate": 2.2817968301065284e-06,
"loss": 0.002,
"step": 699
},
{
"epoch": 2.147239263803681,
"grad_norm": 0.14207658565717568,
"learning_rate": 2.266832390519811e-06,
"loss": 0.0027,
"step": 700
},
{
"epoch": 2.1503067484662575,
"grad_norm": 0.15125238624427667,
"learning_rate": 2.2519027846810517e-06,
"loss": 0.0019,
"step": 701
},
{
"epoch": 2.1533742331288344,
"grad_norm": 0.7896376793637964,
"learning_rate": 2.2370082028654867e-06,
"loss": 0.0032,
"step": 702
},
{
"epoch": 2.1564417177914113,
"grad_norm": 0.10686878822853349,
"learning_rate": 2.2221488349019903e-06,
"loss": 0.0009,
"step": 703
},
{
"epoch": 2.1595092024539877,
"grad_norm": 0.22123439350597152,
"learning_rate": 2.207324870170629e-06,
"loss": 0.0052,
"step": 704
},
{
"epoch": 2.1625766871165646,
"grad_norm": 0.2728382879459407,
"learning_rate": 2.192536497600266e-06,
"loss": 0.0032,
"step": 705
},
{
"epoch": 2.165644171779141,
"grad_norm": 0.10959838407771286,
"learning_rate": 2.1777839056661555e-06,
"loss": 0.0026,
"step": 706
},
{
"epoch": 2.168711656441718,
"grad_norm": 0.09379423416997926,
"learning_rate": 2.1630672823875207e-06,
"loss": 0.0013,
"step": 707
},
{
"epoch": 2.1717791411042944,
"grad_norm": 0.24830346437651182,
"learning_rate": 2.148386815325179e-06,
"loss": 0.0027,
"step": 708
},
{
"epoch": 2.1748466257668713,
"grad_norm": 0.29344159057905606,
"learning_rate": 2.133742691579145e-06,
"loss": 0.0009,
"step": 709
},
{
"epoch": 2.1779141104294477,
"grad_norm": 0.30344248949976294,
"learning_rate": 2.1191350977862362e-06,
"loss": 0.0009,
"step": 710
},
{
"epoch": 2.1809815950920246,
"grad_norm": 0.2217780773980183,
"learning_rate": 2.1045642201177075e-06,
"loss": 0.0033,
"step": 711
},
{
"epoch": 2.184049079754601,
"grad_norm": 0.07121103642716385,
"learning_rate": 2.0900302442768717e-06,
"loss": 0.0009,
"step": 712
},
{
"epoch": 2.187116564417178,
"grad_norm": 0.3321155284383894,
"learning_rate": 2.075533355496735e-06,
"loss": 0.0045,
"step": 713
},
{
"epoch": 2.190184049079755,
"grad_norm": 0.40941768481145857,
"learning_rate": 2.061073738537635e-06,
"loss": 0.0073,
"step": 714
},
{
"epoch": 2.1932515337423313,
"grad_norm": 0.21360420814522896,
"learning_rate": 2.0466515776848857e-06,
"loss": 0.004,
"step": 715
},
{
"epoch": 2.196319018404908,
"grad_norm": 0.22144562584721872,
"learning_rate": 2.0322670567464304e-06,
"loss": 0.0034,
"step": 716
},
{
"epoch": 2.1993865030674846,
"grad_norm": 0.09113523991255253,
"learning_rate": 2.0179203590504976e-06,
"loss": 0.0008,
"step": 717
},
{
"epoch": 2.2024539877300615,
"grad_norm": 0.21551251871892765,
"learning_rate": 2.0036116674432653e-06,
"loss": 0.0029,
"step": 718
},
{
"epoch": 2.205521472392638,
"grad_norm": 0.1251050643902028,
"learning_rate": 1.9893411642865317e-06,
"loss": 0.0008,
"step": 719
},
{
"epoch": 2.208588957055215,
"grad_norm": 0.12424583137608142,
"learning_rate": 1.975109031455388e-06,
"loss": 0.002,
"step": 720
},
{
"epoch": 2.2116564417177913,
"grad_norm": 0.21560055643150958,
"learning_rate": 1.9609154503359024e-06,
"loss": 0.0031,
"step": 721
},
{
"epoch": 2.214723926380368,
"grad_norm": 0.2651282013461321,
"learning_rate": 1.946760601822809e-06,
"loss": 0.007,
"step": 722
},
{
"epoch": 2.2177914110429446,
"grad_norm": 0.20960322018223657,
"learning_rate": 1.9326446663172036e-06,
"loss": 0.0046,
"step": 723
},
{
"epoch": 2.2208588957055215,
"grad_norm": 0.1515234364008256,
"learning_rate": 1.9185678237242373e-06,
"loss": 0.001,
"step": 724
},
{
"epoch": 2.223926380368098,
"grad_norm": 0.027001817306581417,
"learning_rate": 1.9045302534508298e-06,
"loss": 0.0003,
"step": 725
},
{
"epoch": 2.226993865030675,
"grad_norm": 0.3772275262465891,
"learning_rate": 1.89053213440339e-06,
"loss": 0.0027,
"step": 726
},
{
"epoch": 2.2300613496932513,
"grad_norm": 0.16416633898381086,
"learning_rate": 1.8765736449855165e-06,
"loss": 0.003,
"step": 727
},
{
"epoch": 2.233128834355828,
"grad_norm": 0.24149851337878356,
"learning_rate": 1.8626549630957397e-06,
"loss": 0.0043,
"step": 728
},
{
"epoch": 2.236196319018405,
"grad_norm": 0.15235662451937504,
"learning_rate": 1.8487762661252557e-06,
"loss": 0.0021,
"step": 729
},
{
"epoch": 2.2392638036809815,
"grad_norm": 0.1793857007355656,
"learning_rate": 1.8349377309556487e-06,
"loss": 0.0032,
"step": 730
},
{
"epoch": 2.2423312883435584,
"grad_norm": 0.1250791648939321,
"learning_rate": 1.8211395339566561e-06,
"loss": 0.0016,
"step": 731
},
{
"epoch": 2.245398773006135,
"grad_norm": 0.17768178437092705,
"learning_rate": 1.8073818509839097e-06,
"loss": 0.0022,
"step": 732
},
{
"epoch": 2.2484662576687118,
"grad_norm": 0.36063973591837933,
"learning_rate": 1.7936648573766956e-06,
"loss": 0.0027,
"step": 733
},
{
"epoch": 2.2515337423312882,
"grad_norm": 0.23032103099080775,
"learning_rate": 1.7799887279557238e-06,
"loss": 0.0041,
"step": 734
},
{
"epoch": 2.254601226993865,
"grad_norm": 0.44527941027974083,
"learning_rate": 1.766353637020895e-06,
"loss": 0.0051,
"step": 735
},
{
"epoch": 2.2576687116564416,
"grad_norm": 0.24421891863384448,
"learning_rate": 1.7527597583490825e-06,
"loss": 0.0019,
"step": 736
},
{
"epoch": 2.2607361963190185,
"grad_norm": 0.17699844104396198,
"learning_rate": 1.7392072651919163e-06,
"loss": 0.0022,
"step": 737
},
{
"epoch": 2.263803680981595,
"grad_norm": 0.09239369823066165,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.0011,
"step": 738
},
{
"epoch": 2.266871165644172,
"grad_norm": 0.2896983984065453,
"learning_rate": 1.7122271257885853e-06,
"loss": 0.0045,
"step": 739
},
{
"epoch": 2.2699386503067487,
"grad_norm": 0.48822736310943793,
"learning_rate": 1.6987998233996279e-06,
"loss": 0.0029,
"step": 740
},
{
"epoch": 2.273006134969325,
"grad_norm": 0.09163689641484868,
"learning_rate": 1.6854145942353417e-06,
"loss": 0.0007,
"step": 741
},
{
"epoch": 2.276073619631902,
"grad_norm": 0.08173136485172053,
"learning_rate": 1.6720716088881595e-06,
"loss": 0.0014,
"step": 742
},
{
"epoch": 2.2791411042944785,
"grad_norm": 0.2545572168918258,
"learning_rate": 1.6587710374121203e-06,
"loss": 0.0037,
"step": 743
},
{
"epoch": 2.2822085889570554,
"grad_norm": 0.1336031235680743,
"learning_rate": 1.645513049320699e-06,
"loss": 0.0016,
"step": 744
},
{
"epoch": 2.285276073619632,
"grad_norm": 0.12057113363078809,
"learning_rate": 1.6322978135846628e-06,
"loss": 0.0015,
"step": 745
},
{
"epoch": 2.2883435582822087,
"grad_norm": 0.4840801469818166,
"learning_rate": 1.6191254986299044e-06,
"loss": 0.004,
"step": 746
},
{
"epoch": 2.291411042944785,
"grad_norm": 0.07732626484388591,
"learning_rate": 1.6059962723352912e-06,
"loss": 0.0009,
"step": 747
},
{
"epoch": 2.294478527607362,
"grad_norm": 0.8607829885785011,
"learning_rate": 1.5929103020305441e-06,
"loss": 0.0088,
"step": 748
},
{
"epoch": 2.2975460122699385,
"grad_norm": 0.12352491257367763,
"learning_rate": 1.5798677544940894e-06,
"loss": 0.0015,
"step": 749
},
{
"epoch": 2.3006134969325154,
"grad_norm": 0.11613033648370168,
"learning_rate": 1.566868795950932e-06,
"loss": 0.0018,
"step": 750
},
{
"epoch": 2.3036809815950923,
"grad_norm": 0.12377281598855457,
"learning_rate": 1.5539135920705545e-06,
"loss": 0.0018,
"step": 751
},
{
"epoch": 2.3067484662576687,
"grad_norm": 0.1545665427345137,
"learning_rate": 1.5410023079647823e-06,
"loss": 0.0017,
"step": 752
},
{
"epoch": 2.309815950920245,
"grad_norm": 0.17077252050239153,
"learning_rate": 1.5281351081856977e-06,
"loss": 0.0027,
"step": 753
},
{
"epoch": 2.312883435582822,
"grad_norm": 0.158419257234868,
"learning_rate": 1.5153121567235334e-06,
"loss": 0.0033,
"step": 754
},
{
"epoch": 2.315950920245399,
"grad_norm": 0.0742824026432101,
"learning_rate": 1.5025336170045861e-06,
"loss": 0.0005,
"step": 755
},
{
"epoch": 2.3190184049079754,
"grad_norm": 0.34483635411603586,
"learning_rate": 1.4897996518891328e-06,
"loss": 0.0022,
"step": 756
},
{
"epoch": 2.3220858895705523,
"grad_norm": 0.06803963933434524,
"learning_rate": 1.4771104236693524e-06,
"loss": 0.0008,
"step": 757
},
{
"epoch": 2.3251533742331287,
"grad_norm": 0.24863338473945598,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0026,
"step": 758
},
{
"epoch": 2.3282208588957056,
"grad_norm": 0.11453250446971239,
"learning_rate": 1.451866824232655e-06,
"loss": 0.0015,
"step": 759
},
{
"epoch": 2.331288343558282,
"grad_norm": 0.14650541046670912,
"learning_rate": 1.439312774741042e-06,
"loss": 0.003,
"step": 760
},
{
"epoch": 2.334355828220859,
"grad_norm": 0.1648695133277369,
"learning_rate": 1.4268041055916104e-06,
"loss": 0.0033,
"step": 761
},
{
"epoch": 2.3374233128834354,
"grad_norm": 0.13479759235284597,
"learning_rate": 1.4143409762051829e-06,
"loss": 0.0024,
"step": 762
},
{
"epoch": 2.3404907975460123,
"grad_norm": 0.2810401943509781,
"learning_rate": 1.4019235454221858e-06,
"loss": 0.0055,
"step": 763
},
{
"epoch": 2.3435582822085887,
"grad_norm": 0.16871442265684417,
"learning_rate": 1.389551971500624e-06,
"loss": 0.0015,
"step": 764
},
{
"epoch": 2.3466257668711656,
"grad_norm": 0.05570036580769489,
"learning_rate": 1.3772264121140661e-06,
"loss": 0.0009,
"step": 765
},
{
"epoch": 2.3496932515337425,
"grad_norm": 0.1683059834869823,
"learning_rate": 1.3649470243496327e-06,
"loss": 0.0037,
"step": 766
},
{
"epoch": 2.352760736196319,
"grad_norm": 0.17630906022011067,
"learning_rate": 1.3527139647059912e-06,
"loss": 0.0016,
"step": 767
},
{
"epoch": 2.355828220858896,
"grad_norm": 0.19859929329947976,
"learning_rate": 1.340527389091374e-06,
"loss": 0.0048,
"step": 768
},
{
"epoch": 2.3588957055214723,
"grad_norm": 0.13967179062236237,
"learning_rate": 1.3283874528215735e-06,
"loss": 0.0045,
"step": 769
},
{
"epoch": 2.361963190184049,
"grad_norm": 0.0984015071068125,
"learning_rate": 1.3162943106179748e-06,
"loss": 0.0011,
"step": 770
},
{
"epoch": 2.3650306748466257,
"grad_norm": 0.2649561326989602,
"learning_rate": 1.304248116605587e-06,
"loss": 0.0034,
"step": 771
},
{
"epoch": 2.3680981595092025,
"grad_norm": 0.20608647696978882,
"learning_rate": 1.2922490243110613e-06,
"loss": 0.0021,
"step": 772
},
{
"epoch": 2.371165644171779,
"grad_norm": 0.14261322619364292,
"learning_rate": 1.2802971866607522e-06,
"loss": 0.001,
"step": 773
},
{
"epoch": 2.374233128834356,
"grad_norm": 0.2809342227158972,
"learning_rate": 1.2683927559787657e-06,
"loss": 0.006,
"step": 774
},
{
"epoch": 2.3773006134969323,
"grad_norm": 0.14128256049100985,
"learning_rate": 1.256535883985005e-06,
"loss": 0.0022,
"step": 775
},
{
"epoch": 2.3803680981595092,
"grad_norm": 0.17658685558545897,
"learning_rate": 1.2447267217932508e-06,
"loss": 0.002,
"step": 776
},
{
"epoch": 2.383435582822086,
"grad_norm": 0.16558249370135808,
"learning_rate": 1.232965419909235e-06,
"loss": 0.0038,
"step": 777
},
{
"epoch": 2.3865030674846626,
"grad_norm": 0.3563889958841285,
"learning_rate": 1.2212521282287093e-06,
"loss": 0.005,
"step": 778
},
{
"epoch": 2.3895705521472395,
"grad_norm": 0.17260337058148664,
"learning_rate": 1.2095869960355494e-06,
"loss": 0.003,
"step": 779
},
{
"epoch": 2.392638036809816,
"grad_norm": 0.1256214537945841,
"learning_rate": 1.1979701719998454e-06,
"loss": 0.0019,
"step": 780
},
{
"epoch": 2.395705521472393,
"grad_norm": 0.20072911308799066,
"learning_rate": 1.186401804176009e-06,
"loss": 0.0026,
"step": 781
},
{
"epoch": 2.3987730061349692,
"grad_norm": 0.12122821617966971,
"learning_rate": 1.1748820400008843e-06,
"loss": 0.0014,
"step": 782
},
{
"epoch": 2.401840490797546,
"grad_norm": 0.12587338917377983,
"learning_rate": 1.1634110262918718e-06,
"loss": 0.0041,
"step": 783
},
{
"epoch": 2.4049079754601226,
"grad_norm": 0.3656333842091313,
"learning_rate": 1.1519889092450543e-06,
"loss": 0.0018,
"step": 784
},
{
"epoch": 2.4079754601226995,
"grad_norm": 0.17872994295712075,
"learning_rate": 1.1406158344333367e-06,
"loss": 0.002,
"step": 785
},
{
"epoch": 2.411042944785276,
"grad_norm": 0.1619892096996537,
"learning_rate": 1.1292919468045876e-06,
"loss": 0.0023,
"step": 786
},
{
"epoch": 2.414110429447853,
"grad_norm": 0.19439110501138088,
"learning_rate": 1.1180173906797947e-06,
"loss": 0.0025,
"step": 787
},
{
"epoch": 2.4171779141104293,
"grad_norm": 0.10302567991744616,
"learning_rate": 1.1067923097512256e-06,
"loss": 0.0016,
"step": 788
},
{
"epoch": 2.420245398773006,
"grad_norm": 0.2911971113981604,
"learning_rate": 1.0956168470805877e-06,
"loss": 0.0069,
"step": 789
},
{
"epoch": 2.4233128834355826,
"grad_norm": 0.06255863079734855,
"learning_rate": 1.0844911450972228e-06,
"loss": 0.0009,
"step": 790
},
{
"epoch": 2.4263803680981595,
"grad_norm": 0.08073513643263174,
"learning_rate": 1.0734153455962765e-06,
"loss": 0.0011,
"step": 791
},
{
"epoch": 2.4294478527607364,
"grad_norm": 0.10514179663335815,
"learning_rate": 1.0623895897368914e-06,
"loss": 0.0015,
"step": 792
},
{
"epoch": 2.432515337423313,
"grad_norm": 0.1666940116725245,
"learning_rate": 1.0514140180404202e-06,
"loss": 0.003,
"step": 793
},
{
"epoch": 2.4355828220858897,
"grad_norm": 0.3297566953046385,
"learning_rate": 1.0404887703886252e-06,
"loss": 0.0044,
"step": 794
},
{
"epoch": 2.438650306748466,
"grad_norm": 0.17531363640577552,
"learning_rate": 1.0296139860218928e-06,
"loss": 0.0024,
"step": 795
},
{
"epoch": 2.441717791411043,
"grad_norm": 0.16868562336567175,
"learning_rate": 1.0187898035374683e-06,
"loss": 0.0016,
"step": 796
},
{
"epoch": 2.4447852760736195,
"grad_norm": 0.04584091385348708,
"learning_rate": 1.008016360887683e-06,
"loss": 0.0004,
"step": 797
},
{
"epoch": 2.4478527607361964,
"grad_norm": 0.18449622203743876,
"learning_rate": 9.972937953781985e-07,
"loss": 0.0042,
"step": 798
},
{
"epoch": 2.450920245398773,
"grad_norm": 0.13526404943688417,
"learning_rate": 9.86622243666256e-07,
"loss": 0.0016,
"step": 799
},
{
"epoch": 2.4539877300613497,
"grad_norm": 0.25252415406049705,
"learning_rate": 9.760018417589333e-07,
"loss": 0.0034,
"step": 800
},
{
"epoch": 2.457055214723926,
"grad_norm": 0.215518358226254,
"learning_rate": 9.654327250114147e-07,
"loss": 0.0037,
"step": 801
},
{
"epoch": 2.460122699386503,
"grad_norm": 0.24277757923937995,
"learning_rate": 9.549150281252633e-07,
"loss": 0.0023,
"step": 802
},
{
"epoch": 2.46319018404908,
"grad_norm": 0.35989818534447426,
"learning_rate": 9.444488851467043e-07,
"loss": 0.003,
"step": 803
},
{
"epoch": 2.4662576687116564,
"grad_norm": 0.21107732092261233,
"learning_rate": 9.340344294649184e-07,
"loss": 0.0015,
"step": 804
},
{
"epoch": 2.4693251533742333,
"grad_norm": 0.16593048874828475,
"learning_rate": 9.236717938103396e-07,
"loss": 0.0021,
"step": 805
},
{
"epoch": 2.4723926380368098,
"grad_norm": 0.07842110293160545,
"learning_rate": 9.133611102529655e-07,
"loss": 0.001,
"step": 806
},
{
"epoch": 2.4754601226993866,
"grad_norm": 0.0736410449630101,
"learning_rate": 9.031025102006725e-07,
"loss": 0.0007,
"step": 807
},
{
"epoch": 2.478527607361963,
"grad_norm": 0.15374285119898826,
"learning_rate": 8.928961243975437e-07,
"loss": 0.0009,
"step": 808
},
{
"epoch": 2.48159509202454,
"grad_norm": 0.09763968706400328,
"learning_rate": 8.827420829221944e-07,
"loss": 0.0009,
"step": 809
},
{
"epoch": 2.4846625766871164,
"grad_norm": 0.5363611758679326,
"learning_rate": 8.7264051518613e-07,
"loss": 0.0054,
"step": 810
},
{
"epoch": 2.4877300613496933,
"grad_norm": 0.09587721531791853,
"learning_rate": 8.625915499320819e-07,
"loss": 0.001,
"step": 811
},
{
"epoch": 2.4907975460122698,
"grad_norm": 0.07047828098328363,
"learning_rate": 8.525953152323685e-07,
"loss": 0.0009,
"step": 812
},
{
"epoch": 2.4938650306748467,
"grad_norm": 0.18694536936682596,
"learning_rate": 8.426519384872733e-07,
"loss": 0.0024,
"step": 813
},
{
"epoch": 2.4969325153374236,
"grad_norm": 0.0805331719571376,
"learning_rate": 8.327615464234129e-07,
"loss": 0.0006,
"step": 814
},
{
"epoch": 2.5,
"grad_norm": 0.07186649864490034,
"learning_rate": 8.229242650921166e-07,
"loss": 0.0008,
"step": 815
},
{
"epoch": 2.5030674846625764,
"grad_norm": 0.12548189563671605,
"learning_rate": 8.131402198678373e-07,
"loss": 0.0027,
"step": 816
},
{
"epoch": 2.5061349693251533,
"grad_norm": 0.2623745485957233,
"learning_rate": 8.034095354465337e-07,
"loss": 0.0028,
"step": 817
},
{
"epoch": 2.5092024539877302,
"grad_norm": 0.11779343034917025,
"learning_rate": 7.937323358440935e-07,
"loss": 0.001,
"step": 818
},
{
"epoch": 2.5122699386503067,
"grad_norm": 0.09129107157387073,
"learning_rate": 7.841087443947543e-07,
"loss": 0.0015,
"step": 819
},
{
"epoch": 2.5153374233128836,
"grad_norm": 0.0603800063286766,
"learning_rate": 7.745388837495188e-07,
"loss": 0.0006,
"step": 820
},
{
"epoch": 2.51840490797546,
"grad_norm": 0.1546472643676182,
"learning_rate": 7.650228758746059e-07,
"loss": 0.0024,
"step": 821
},
{
"epoch": 2.521472392638037,
"grad_norm": 0.3094740167503855,
"learning_rate": 7.555608420498872e-07,
"loss": 0.0051,
"step": 822
},
{
"epoch": 2.5245398773006134,
"grad_norm": 0.28126279193166526,
"learning_rate": 7.461529028673465e-07,
"loss": 0.0197,
"step": 823
},
{
"epoch": 2.5276073619631902,
"grad_norm": 0.0954026314786444,
"learning_rate": 7.367991782295392e-07,
"loss": 0.001,
"step": 824
},
{
"epoch": 2.530674846625767,
"grad_norm": 0.08452856435235079,
"learning_rate": 7.274997873480666e-07,
"loss": 0.0011,
"step": 825
},
{
"epoch": 2.5337423312883436,
"grad_norm": 0.12493971123469698,
"learning_rate": 7.182548487420555e-07,
"loss": 0.0019,
"step": 826
},
{
"epoch": 2.53680981595092,
"grad_norm": 0.21679833329538098,
"learning_rate": 7.090644802366469e-07,
"loss": 0.0027,
"step": 827
},
{
"epoch": 2.539877300613497,
"grad_norm": 0.1801329143229757,
"learning_rate": 6.999287989614972e-07,
"loss": 0.0014,
"step": 828
},
{
"epoch": 2.542944785276074,
"grad_norm": 0.22834022050034936,
"learning_rate": 6.908479213492825e-07,
"loss": 0.0011,
"step": 829
},
{
"epoch": 2.5460122699386503,
"grad_norm": 0.2595629467564202,
"learning_rate": 6.818219631342149e-07,
"loss": 0.0039,
"step": 830
},
{
"epoch": 2.549079754601227,
"grad_norm": 0.2836220482178321,
"learning_rate": 6.728510393505694e-07,
"loss": 0.0026,
"step": 831
},
{
"epoch": 2.5521472392638036,
"grad_norm": 0.06453238892514226,
"learning_rate": 6.639352643312164e-07,
"loss": 0.0004,
"step": 832
},
{
"epoch": 2.5552147239263805,
"grad_norm": 0.20852970651359395,
"learning_rate": 6.550747517061656e-07,
"loss": 0.001,
"step": 833
},
{
"epoch": 2.558282208588957,
"grad_norm": 0.2841174821202171,
"learning_rate": 6.462696144011149e-07,
"loss": 0.0014,
"step": 834
},
{
"epoch": 2.561349693251534,
"grad_norm": 0.2581856484121087,
"learning_rate": 6.375199646360142e-07,
"loss": 0.0022,
"step": 835
},
{
"epoch": 2.5644171779141103,
"grad_norm": 0.2834383185441278,
"learning_rate": 6.28825913923638e-07,
"loss": 0.0047,
"step": 836
},
{
"epoch": 2.567484662576687,
"grad_norm": 0.5107731515234254,
"learning_rate": 6.201875730681544e-07,
"loss": 0.0093,
"step": 837
},
{
"epoch": 2.5705521472392636,
"grad_norm": 0.27824373108650474,
"learning_rate": 6.116050521637218e-07,
"loss": 0.0049,
"step": 838
},
{
"epoch": 2.5736196319018405,
"grad_norm": 0.2579706167604278,
"learning_rate": 6.030784605930856e-07,
"loss": 0.0039,
"step": 839
},
{
"epoch": 2.5766871165644174,
"grad_norm": 0.12452116042145692,
"learning_rate": 5.946079070261773e-07,
"loss": 0.0024,
"step": 840
},
{
"epoch": 2.579754601226994,
"grad_norm": 0.2858948864429028,
"learning_rate": 5.861934994187357e-07,
"loss": 0.008,
"step": 841
},
{
"epoch": 2.5828220858895703,
"grad_norm": 0.237981754274074,
"learning_rate": 5.778353450109286e-07,
"loss": 0.0029,
"step": 842
},
{
"epoch": 2.585889570552147,
"grad_norm": 0.02606381443790744,
"learning_rate": 5.69533550325988e-07,
"loss": 0.0004,
"step": 843
},
{
"epoch": 2.588957055214724,
"grad_norm": 0.14042042098956603,
"learning_rate": 5.61288221168848e-07,
"loss": 0.0031,
"step": 844
},
{
"epoch": 2.5920245398773005,
"grad_norm": 0.1407516693295058,
"learning_rate": 5.530994626248026e-07,
"loss": 0.001,
"step": 845
},
{
"epoch": 2.5950920245398774,
"grad_norm": 0.10839729984992401,
"learning_rate": 5.449673790581611e-07,
"loss": 0.0022,
"step": 846
},
{
"epoch": 2.598159509202454,
"grad_norm": 0.39504221516607024,
"learning_rate": 5.368920741109202e-07,
"loss": 0.005,
"step": 847
},
{
"epoch": 2.6012269938650308,
"grad_norm": 0.16569517436714892,
"learning_rate": 5.288736507014436e-07,
"loss": 0.0023,
"step": 848
},
{
"epoch": 2.604294478527607,
"grad_norm": 0.10691366249706033,
"learning_rate": 5.209122110231496e-07,
"loss": 0.0015,
"step": 849
},
{
"epoch": 2.607361963190184,
"grad_norm": 0.08708747519441029,
"learning_rate": 5.130078565432089e-07,
"loss": 0.0015,
"step": 850
},
{
"epoch": 2.610429447852761,
"grad_norm": 0.16248278133516086,
"learning_rate": 5.051606880012483e-07,
"loss": 0.0014,
"step": 851
},
{
"epoch": 2.6134969325153374,
"grad_norm": 0.15115304316641817,
"learning_rate": 4.97370805408075e-07,
"loss": 0.0014,
"step": 852
},
{
"epoch": 2.616564417177914,
"grad_norm": 0.17910642709470473,
"learning_rate": 4.896383080443934e-07,
"loss": 0.0019,
"step": 853
},
{
"epoch": 2.6196319018404908,
"grad_norm": 0.10629488236344448,
"learning_rate": 4.819632944595415e-07,
"loss": 0.0018,
"step": 854
},
{
"epoch": 2.6226993865030677,
"grad_norm": 0.3160945524607155,
"learning_rate": 4.7434586247024075e-07,
"loss": 0.003,
"step": 855
},
{
"epoch": 2.625766871165644,
"grad_norm": 0.1796109375846772,
"learning_rate": 4.667861091593434e-07,
"loss": 0.002,
"step": 856
},
{
"epoch": 2.628834355828221,
"grad_norm": 0.09805120856646073,
"learning_rate": 4.5928413087459325e-07,
"loss": 0.0015,
"step": 857
},
{
"epoch": 2.6319018404907975,
"grad_norm": 0.06512120496498192,
"learning_rate": 4.5184002322740784e-07,
"loss": 0.0009,
"step": 858
},
{
"epoch": 2.6349693251533743,
"grad_norm": 0.08736046400576912,
"learning_rate": 4.444538810916499e-07,
"loss": 0.0005,
"step": 859
},
{
"epoch": 2.638036809815951,
"grad_norm": 0.11903062986503751,
"learning_rate": 4.371257986024202e-07,
"loss": 0.0014,
"step": 860
},
{
"epoch": 2.6411042944785277,
"grad_norm": 0.1738579960227546,
"learning_rate": 4.298558691548638e-07,
"loss": 0.0039,
"step": 861
},
{
"epoch": 2.644171779141104,
"grad_norm": 0.23024694590361497,
"learning_rate": 4.2264418540297e-07,
"loss": 0.0042,
"step": 862
},
{
"epoch": 2.647239263803681,
"grad_norm": 0.09629140238564009,
"learning_rate": 4.1549083925840174e-07,
"loss": 0.0007,
"step": 863
},
{
"epoch": 2.6503067484662575,
"grad_norm": 0.17847211521709486,
"learning_rate": 4.083959218893158e-07,
"loss": 0.0021,
"step": 864
},
{
"epoch": 2.6533742331288344,
"grad_norm": 0.14244846012513432,
"learning_rate": 4.0135952371920636e-07,
"loss": 0.0018,
"step": 865
},
{
"epoch": 2.6564417177914113,
"grad_norm": 1.1152167570510674,
"learning_rate": 3.9438173442575e-07,
"loss": 0.0059,
"step": 866
},
{
"epoch": 2.6595092024539877,
"grad_norm": 0.07020182370714435,
"learning_rate": 3.874626429396622e-07,
"loss": 0.0009,
"step": 867
},
{
"epoch": 2.662576687116564,
"grad_norm": 0.18577423108935284,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.0062,
"step": 868
},
{
"epoch": 2.665644171779141,
"grad_norm": 0.4435684511619857,
"learning_rate": 3.7380090537086853e-07,
"loss": 0.0022,
"step": 869
},
{
"epoch": 2.668711656441718,
"grad_norm": 0.2716721122126474,
"learning_rate": 3.6705843340464287e-07,
"loss": 0.0211,
"step": 870
},
{
"epoch": 2.6717791411042944,
"grad_norm": 0.1978025404729548,
"learning_rate": 3.6037500747652665e-07,
"loss": 0.0011,
"step": 871
},
{
"epoch": 2.6748466257668713,
"grad_norm": 0.2832440687817866,
"learning_rate": 3.53750712765627e-07,
"loss": 0.0025,
"step": 872
},
{
"epoch": 2.6779141104294477,
"grad_norm": 0.24595271118155285,
"learning_rate": 3.4718563369743217e-07,
"loss": 0.0058,
"step": 873
},
{
"epoch": 2.6809815950920246,
"grad_norm": 0.09084236194044377,
"learning_rate": 3.406798539427386e-07,
"loss": 0.0013,
"step": 874
},
{
"epoch": 2.684049079754601,
"grad_norm": 0.06503950468432222,
"learning_rate": 3.3423345641658275e-07,
"loss": 0.0013,
"step": 875
},
{
"epoch": 2.687116564417178,
"grad_norm": 0.3314300890265219,
"learning_rate": 3.2784652327718544e-07,
"loss": 0.0031,
"step": 876
},
{
"epoch": 2.690184049079755,
"grad_norm": 0.1396336301171356,
"learning_rate": 3.2151913592490256e-07,
"loss": 0.0034,
"step": 877
},
{
"epoch": 2.6932515337423313,
"grad_norm": 0.5653766171978473,
"learning_rate": 3.1525137500119207e-07,
"loss": 0.0045,
"step": 878
},
{
"epoch": 2.6963190184049077,
"grad_norm": 0.10599295362951303,
"learning_rate": 3.0904332038757977e-07,
"loss": 0.0013,
"step": 879
},
{
"epoch": 2.6993865030674846,
"grad_norm": 0.23406302898617593,
"learning_rate": 3.0289505120464743e-07,
"loss": 0.0021,
"step": 880
},
{
"epoch": 2.7024539877300615,
"grad_norm": 0.06314907207577944,
"learning_rate": 2.96806645811023e-07,
"loss": 0.0012,
"step": 881
},
{
"epoch": 2.705521472392638,
"grad_norm": 0.2275280025600531,
"learning_rate": 2.9077818180237693e-07,
"loss": 0.0062,
"step": 882
},
{
"epoch": 2.708588957055215,
"grad_norm": 0.21241533924947312,
"learning_rate": 2.848097360104396e-07,
"loss": 0.0044,
"step": 883
},
{
"epoch": 2.7116564417177913,
"grad_norm": 0.31301496464316575,
"learning_rate": 2.7890138450202053e-07,
"loss": 0.0059,
"step": 884
},
{
"epoch": 2.714723926380368,
"grad_norm": 0.12241935449577525,
"learning_rate": 2.7305320257803524e-07,
"loss": 0.0014,
"step": 885
},
{
"epoch": 2.7177914110429446,
"grad_norm": 0.3673718302258716,
"learning_rate": 2.6726526477254986e-07,
"loss": 0.0039,
"step": 886
},
{
"epoch": 2.7208588957055215,
"grad_norm": 0.2702232084402326,
"learning_rate": 2.615376448518314e-07,
"loss": 0.0037,
"step": 887
},
{
"epoch": 2.7239263803680984,
"grad_norm": 0.4714333984588122,
"learning_rate": 2.5587041581340235e-07,
"loss": 0.004,
"step": 888
},
{
"epoch": 2.726993865030675,
"grad_norm": 0.07117338469739014,
"learning_rate": 2.502636498851152e-07,
"loss": 0.0009,
"step": 889
},
{
"epoch": 2.7300613496932513,
"grad_norm": 0.13772813511574597,
"learning_rate": 2.447174185242324e-07,
"loss": 0.0007,
"step": 890
},
{
"epoch": 2.733128834355828,
"grad_norm": 0.14647291454327963,
"learning_rate": 2.392317924165116e-07,
"loss": 0.0029,
"step": 891
},
{
"epoch": 2.736196319018405,
"grad_norm": 0.1033756801336396,
"learning_rate": 2.338068414753075e-07,
"loss": 0.0034,
"step": 892
},
{
"epoch": 2.7392638036809815,
"grad_norm": 0.13310767902357046,
"learning_rate": 2.2844263484068097e-07,
"loss": 0.0017,
"step": 893
},
{
"epoch": 2.7423312883435584,
"grad_norm": 0.11225484608545211,
"learning_rate": 2.2313924087851657e-07,
"loss": 0.0016,
"step": 894
},
{
"epoch": 2.745398773006135,
"grad_norm": 0.1581141376006445,
"learning_rate": 2.1789672717965227e-07,
"loss": 0.0036,
"step": 895
},
{
"epoch": 2.7484662576687118,
"grad_norm": 0.13380182051018658,
"learning_rate": 2.1271516055901774e-07,
"loss": 0.0015,
"step": 896
},
{
"epoch": 2.7515337423312882,
"grad_norm": 0.30152265206371165,
"learning_rate": 2.0759460705478186e-07,
"loss": 0.0046,
"step": 897
},
{
"epoch": 2.754601226993865,
"grad_norm": 0.2248407912430735,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.0055,
"step": 898
},
{
"epoch": 2.7576687116564416,
"grad_norm": 0.2815675146560431,
"learning_rate": 1.9753679965934526e-07,
"loss": 0.004,
"step": 899
},
{
"epoch": 2.7607361963190185,
"grad_norm": 0.32432774333312003,
"learning_rate": 1.925996739531577e-07,
"loss": 0.0016,
"step": 900
},
{
"epoch": 2.763803680981595,
"grad_norm": 0.12189177547164826,
"learning_rate": 1.8772381773176417e-07,
"loss": 0.0019,
"step": 901
},
{
"epoch": 2.766871165644172,
"grad_norm": 0.16085814495182016,
"learning_rate": 1.8290929313710514e-07,
"loss": 0.0018,
"step": 902
},
{
"epoch": 2.7699386503067487,
"grad_norm": 0.23030017813530398,
"learning_rate": 1.7815616152946523e-07,
"loss": 0.0033,
"step": 903
},
{
"epoch": 2.773006134969325,
"grad_norm": 0.2560418398994958,
"learning_rate": 1.7346448348668443e-07,
"loss": 0.0031,
"step": 904
},
{
"epoch": 2.7760736196319016,
"grad_norm": 0.17074392845672137,
"learning_rate": 1.6883431880338473e-07,
"loss": 0.001,
"step": 905
},
{
"epoch": 2.7791411042944785,
"grad_norm": 0.2075421386457064,
"learning_rate": 1.6426572649021477e-07,
"loss": 0.003,
"step": 906
},
{
"epoch": 2.7822085889570554,
"grad_norm": 0.2904212772947595,
"learning_rate": 1.5975876477309193e-07,
"loss": 0.0031,
"step": 907
},
{
"epoch": 2.785276073619632,
"grad_norm": 0.1008864192169771,
"learning_rate": 1.5531349109246364e-07,
"loss": 0.0015,
"step": 908
},
{
"epoch": 2.7883435582822087,
"grad_norm": 0.10209292364608329,
"learning_rate": 1.5092996210257281e-07,
"loss": 0.0004,
"step": 909
},
{
"epoch": 2.791411042944785,
"grad_norm": 0.12376916447693681,
"learning_rate": 1.466082336707375e-07,
"loss": 0.0022,
"step": 910
},
{
"epoch": 2.794478527607362,
"grad_norm": 0.25821128489268347,
"learning_rate": 1.423483608766385e-07,
"loss": 0.0029,
"step": 911
},
{
"epoch": 2.7975460122699385,
"grad_norm": 0.1579943616682322,
"learning_rate": 1.3815039801161723e-07,
"loss": 0.0036,
"step": 912
},
{
"epoch": 2.8006134969325154,
"grad_norm": 0.3539618179713858,
"learning_rate": 1.3401439857798292e-07,
"loss": 0.005,
"step": 913
},
{
"epoch": 2.8036809815950923,
"grad_norm": 0.11487729898070202,
"learning_rate": 1.2994041528833267e-07,
"loss": 0.001,
"step": 914
},
{
"epoch": 2.8067484662576687,
"grad_norm": 0.20363041791420816,
"learning_rate": 1.2592850006487801e-07,
"loss": 0.0046,
"step": 915
},
{
"epoch": 2.809815950920245,
"grad_norm": 0.18459789759276166,
"learning_rate": 1.2197870403878375e-07,
"loss": 0.0017,
"step": 916
},
{
"epoch": 2.812883435582822,
"grad_norm": 0.2409014981906113,
"learning_rate": 1.1809107754951643e-07,
"loss": 0.0019,
"step": 917
},
{
"epoch": 2.815950920245399,
"grad_norm": 0.13585872733436774,
"learning_rate": 1.1426567014420297e-07,
"loss": 0.0018,
"step": 918
},
{
"epoch": 2.8190184049079754,
"grad_norm": 0.11734644264645676,
"learning_rate": 1.105025305769969e-07,
"loss": 0.0021,
"step": 919
},
{
"epoch": 2.8220858895705523,
"grad_norm": 0.15393567968740215,
"learning_rate": 1.0680170680846259e-07,
"loss": 0.0035,
"step": 920
},
{
"epoch": 2.8251533742331287,
"grad_norm": 0.5560090616250628,
"learning_rate": 1.0316324600495697e-07,
"loss": 0.0476,
"step": 921
},
{
"epoch": 2.8282208588957056,
"grad_norm": 0.09066848667102294,
"learning_rate": 9.958719453803278e-08,
"loss": 0.0008,
"step": 922
},
{
"epoch": 2.831288343558282,
"grad_norm": 0.10536567544481347,
"learning_rate": 9.607359798384785e-08,
"loss": 0.0015,
"step": 923
},
{
"epoch": 2.834355828220859,
"grad_norm": 0.12467104732377465,
"learning_rate": 9.26225011225812e-08,
"loss": 0.001,
"step": 924
},
{
"epoch": 2.837423312883436,
"grad_norm": 0.2808585290483472,
"learning_rate": 8.923394793786455e-08,
"loss": 0.0018,
"step": 925
},
{
"epoch": 2.8404907975460123,
"grad_norm": 0.27131651565465975,
"learning_rate": 8.590798161622227e-08,
"loss": 0.0032,
"step": 926
},
{
"epoch": 2.8435582822085887,
"grad_norm": 0.23536713230244996,
"learning_rate": 8.264464454651843e-08,
"loss": 0.0023,
"step": 927
},
{
"epoch": 2.8466257668711656,
"grad_norm": 0.3065758572901688,
"learning_rate": 7.944397831941952e-08,
"loss": 0.0056,
"step": 928
},
{
"epoch": 2.8496932515337425,
"grad_norm": 0.0762524485953778,
"learning_rate": 7.630602372686258e-08,
"loss": 0.0017,
"step": 929
},
{
"epoch": 2.852760736196319,
"grad_norm": 0.23912997282686557,
"learning_rate": 7.32308207615351e-08,
"loss": 0.003,
"step": 930
},
{
"epoch": 2.855828220858896,
"grad_norm": 0.17423640557655332,
"learning_rate": 7.021840861636652e-08,
"loss": 0.0017,
"step": 931
},
{
"epoch": 2.8588957055214723,
"grad_norm": 0.1868208824915922,
"learning_rate": 6.72688256840287e-08,
"loss": 0.0027,
"step": 932
},
{
"epoch": 2.861963190184049,
"grad_norm": 0.1894809084876684,
"learning_rate": 6.438210955644453e-08,
"loss": 0.0009,
"step": 933
},
{
"epoch": 2.8650306748466257,
"grad_norm": 0.46134304784124597,
"learning_rate": 6.15582970243117e-08,
"loss": 0.0029,
"step": 934
},
{
"epoch": 2.8680981595092025,
"grad_norm": 0.4497782763650271,
"learning_rate": 5.879742407663147e-08,
"loss": 0.0028,
"step": 935
},
{
"epoch": 2.871165644171779,
"grad_norm": 0.047072709464050136,
"learning_rate": 5.609952590025225e-08,
"loss": 0.0005,
"step": 936
},
{
"epoch": 2.874233128834356,
"grad_norm": 0.07165859203606187,
"learning_rate": 5.3464636879419496e-08,
"loss": 0.001,
"step": 937
},
{
"epoch": 2.8773006134969323,
"grad_norm": 0.09661223755215212,
"learning_rate": 5.089279059533658e-08,
"loss": 0.0023,
"step": 938
},
{
"epoch": 2.8803680981595092,
"grad_norm": 0.09171222486373179,
"learning_rate": 4.8384019825739036e-08,
"loss": 0.0009,
"step": 939
},
{
"epoch": 2.883435582822086,
"grad_norm": 0.2299910781565878,
"learning_rate": 4.593835654447709e-08,
"loss": 0.0049,
"step": 940
},
{
"epoch": 2.8865030674846626,
"grad_norm": 0.09951233304726008,
"learning_rate": 4.3555831921104905e-08,
"loss": 0.0004,
"step": 941
},
{
"epoch": 2.889570552147239,
"grad_norm": 0.13293849716173955,
"learning_rate": 4.123647632048644e-08,
"loss": 0.0019,
"step": 942
},
{
"epoch": 2.892638036809816,
"grad_norm": 0.17984529573061198,
"learning_rate": 3.8980319302407976e-08,
"loss": 0.0022,
"step": 943
},
{
"epoch": 2.895705521472393,
"grad_norm": 0.23956821016046442,
"learning_rate": 3.678738962119899e-08,
"loss": 0.0032,
"step": 944
},
{
"epoch": 2.8987730061349692,
"grad_norm": 0.1195679361937312,
"learning_rate": 3.465771522536854e-08,
"loss": 0.0014,
"step": 945
},
{
"epoch": 2.901840490797546,
"grad_norm": 0.06300492295803174,
"learning_rate": 3.25913232572489e-08,
"loss": 0.0003,
"step": 946
},
{
"epoch": 2.9049079754601226,
"grad_norm": 0.1186283025014797,
"learning_rate": 3.058824005264805e-08,
"loss": 0.0008,
"step": 947
},
{
"epoch": 2.9079754601226995,
"grad_norm": 0.2913471755472139,
"learning_rate": 2.8648491140513267e-08,
"loss": 0.0027,
"step": 948
},
{
"epoch": 2.911042944785276,
"grad_norm": 0.37708500248147947,
"learning_rate": 2.6772101242610316e-08,
"loss": 0.0033,
"step": 949
},
{
"epoch": 2.914110429447853,
"grad_norm": 0.10203749592807297,
"learning_rate": 2.495909427320198e-08,
"loss": 0.0011,
"step": 950
},
{
"epoch": 2.9171779141104297,
"grad_norm": 0.04844387099370729,
"learning_rate": 2.3209493338749444e-08,
"loss": 0.0008,
"step": 951
},
{
"epoch": 2.920245398773006,
"grad_norm": 0.14844116201176166,
"learning_rate": 2.1523320737613095e-08,
"loss": 0.0023,
"step": 952
},
{
"epoch": 2.9233128834355826,
"grad_norm": 0.06648586707215681,
"learning_rate": 1.9900597959770506e-08,
"loss": 0.001,
"step": 953
},
{
"epoch": 2.9263803680981595,
"grad_norm": 0.06323384833319899,
"learning_rate": 1.834134568654333e-08,
"loss": 0.0009,
"step": 954
},
{
"epoch": 2.9294478527607364,
"grad_norm": 0.21617483164150297,
"learning_rate": 1.6845583790330855e-08,
"loss": 0.0053,
"step": 955
},
{
"epoch": 2.932515337423313,
"grad_norm": 0.46122917062522734,
"learning_rate": 1.541333133436018e-08,
"loss": 0.0061,
"step": 956
},
{
"epoch": 2.9355828220858897,
"grad_norm": 0.03574910339123186,
"learning_rate": 1.404460657244089e-08,
"loss": 0.0002,
"step": 957
},
{
"epoch": 2.938650306748466,
"grad_norm": 0.39754534061164976,
"learning_rate": 1.2739426948732426e-08,
"loss": 0.0057,
"step": 958
},
{
"epoch": 2.941717791411043,
"grad_norm": 0.22679275982144256,
"learning_rate": 1.1497809097523737e-08,
"loss": 0.0018,
"step": 959
},
{
"epoch": 2.9447852760736195,
"grad_norm": 0.320664640430231,
"learning_rate": 1.0319768843018995e-08,
"loss": 0.0038,
"step": 960
},
{
"epoch": 2.9478527607361964,
"grad_norm": 0.2519872312593028,
"learning_rate": 9.20532119913775e-09,
"loss": 0.0065,
"step": 961
},
{
"epoch": 2.950920245398773,
"grad_norm": 0.11284270513200542,
"learning_rate": 8.15448036932176e-09,
"loss": 0.0006,
"step": 962
},
{
"epoch": 2.9539877300613497,
"grad_norm": 0.10208352343863136,
"learning_rate": 7.167259746355681e-09,
"loss": 0.0011,
"step": 963
},
{
"epoch": 2.957055214723926,
"grad_norm": 0.25701217063452914,
"learning_rate": 6.243671912194993e-09,
"loss": 0.002,
"step": 964
},
{
"epoch": 2.960122699386503,
"grad_norm": 0.0764572908354291,
"learning_rate": 5.383728637807784e-09,
"loss": 0.0015,
"step": 965
},
{
"epoch": 2.96319018404908,
"grad_norm": 0.07471945840486285,
"learning_rate": 4.587440883021543e-09,
"loss": 0.0008,
"step": 966
},
{
"epoch": 2.9662576687116564,
"grad_norm": 0.2099586583717868,
"learning_rate": 3.854818796385495e-09,
"loss": 0.0045,
"step": 967
},
{
"epoch": 2.969325153374233,
"grad_norm": 0.24637085011322238,
"learning_rate": 3.1858717150412554e-09,
"loss": 0.0019,
"step": 968
},
{
"epoch": 2.9723926380368098,
"grad_norm": 0.05193551709164074,
"learning_rate": 2.580608164604592e-09,
"loss": 0.0003,
"step": 969
},
{
"epoch": 2.9754601226993866,
"grad_norm": 0.04547774488809871,
"learning_rate": 2.0390358590538507e-09,
"loss": 0.0004,
"step": 970
},
{
"epoch": 2.978527607361963,
"grad_norm": 0.7184876930985127,
"learning_rate": 1.5611617006344725e-09,
"loss": 0.006,
"step": 971
},
{
"epoch": 2.98159509202454,
"grad_norm": 0.14243992488733678,
"learning_rate": 1.1469917797696238e-09,
"loss": 0.0017,
"step": 972
},
{
"epoch": 2.9846625766871164,
"grad_norm": 0.16006164134574483,
"learning_rate": 7.96531374983589e-10,
"loss": 0.0043,
"step": 973
},
{
"epoch": 2.9877300613496933,
"grad_norm": 0.09809638018672716,
"learning_rate": 5.09784952833492e-10,
"loss": 0.0016,
"step": 974
},
{
"epoch": 2.9907975460122698,
"grad_norm": 0.2412737011340249,
"learning_rate": 2.8675616785212023e-10,
"loss": 0.0062,
"step": 975
},
{
"epoch": 2.9938650306748467,
"grad_norm": 0.17763972988409896,
"learning_rate": 1.2744786250407092e-10,
"loss": 0.0017,
"step": 976
},
{
"epoch": 2.9969325153374236,
"grad_norm": 0.21922904263222162,
"learning_rate": 3.186206714522744e-11,
"loss": 0.0036,
"step": 977
},
{
"epoch": 3.0,
"grad_norm": 0.04875957060132452,
"learning_rate": 0.0,
"loss": 0.0005,
"step": 978
},
{
"epoch": 3.0,
"step": 978,
"total_flos": 34026430169088.0,
"train_loss": 0.014223355380586132,
"train_runtime": 5099.3334,
"train_samples_per_second": 6.131,
"train_steps_per_second": 0.192
}
],
"logging_steps": 1,
"max_steps": 978,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 783,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 34026430169088.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}