0124_oneshot_v25_t25 / trainer_state.json
KerwinJob's picture
Upload folder using huggingface_hub
6e44edb verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3125,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 7.186999998384225,
"learning_rate": 1.0638297872340426e-07,
"loss": 0.8419,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 15.195501156241782,
"learning_rate": 2.1276595744680852e-07,
"loss": 0.8607,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 6.471859790144152,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.8707,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 8.663960370878844,
"learning_rate": 4.2553191489361704e-07,
"loss": 1.0322,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 3.846806967544908,
"learning_rate": 5.319148936170213e-07,
"loss": 0.293,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 3.649076371491653,
"learning_rate": 6.382978723404255e-07,
"loss": 0.3746,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 3.7565091564146607,
"learning_rate": 7.446808510638298e-07,
"loss": 0.3011,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 3.7390758749183277,
"learning_rate": 8.510638297872341e-07,
"loss": 0.2792,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 2.9837584928681395,
"learning_rate": 9.574468085106384e-07,
"loss": 0.3738,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 5.947697119780937,
"learning_rate": 1.0638297872340427e-06,
"loss": 0.7324,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 8.260074298895535,
"learning_rate": 1.170212765957447e-06,
"loss": 0.7568,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 5.570877454874293,
"learning_rate": 1.276595744680851e-06,
"loss": 0.9866,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 6.179108386871164,
"learning_rate": 1.3829787234042555e-06,
"loss": 0.7508,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 6.33706086250621,
"learning_rate": 1.4893617021276596e-06,
"loss": 0.6779,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 5.348047232724415,
"learning_rate": 1.595744680851064e-06,
"loss": 0.7782,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 2.3373731647637666,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.2998,
"step": 16
},
{
"epoch": 0.01,
"grad_norm": 6.074878326998715,
"learning_rate": 1.8085106382978727e-06,
"loss": 0.817,
"step": 17
},
{
"epoch": 0.01,
"grad_norm": 5.919283016168947,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.8892,
"step": 18
},
{
"epoch": 0.01,
"grad_norm": 4.5315206387377005,
"learning_rate": 2.021276595744681e-06,
"loss": 0.5379,
"step": 19
},
{
"epoch": 0.01,
"grad_norm": 2.4658963420960665,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.3507,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 2.5168553994661753,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.2616,
"step": 21
},
{
"epoch": 0.01,
"grad_norm": 6.926878667097406,
"learning_rate": 2.340425531914894e-06,
"loss": 0.8075,
"step": 22
},
{
"epoch": 0.01,
"grad_norm": 2.190515637234529,
"learning_rate": 2.446808510638298e-06,
"loss": 0.2471,
"step": 23
},
{
"epoch": 0.01,
"grad_norm": 2.1011555987879365,
"learning_rate": 2.553191489361702e-06,
"loss": 0.2433,
"step": 24
},
{
"epoch": 0.01,
"grad_norm": 5.466686958576261,
"learning_rate": 2.6595744680851065e-06,
"loss": 0.778,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 2.1202865982496992,
"learning_rate": 2.765957446808511e-06,
"loss": 0.2767,
"step": 26
},
{
"epoch": 0.01,
"grad_norm": 2.1714962642460858,
"learning_rate": 2.8723404255319155e-06,
"loss": 0.2637,
"step": 27
},
{
"epoch": 0.01,
"grad_norm": 6.713848871868974,
"learning_rate": 2.978723404255319e-06,
"loss": 0.8652,
"step": 28
},
{
"epoch": 0.01,
"grad_norm": 2.115806755756005,
"learning_rate": 3.0851063829787237e-06,
"loss": 0.2327,
"step": 29
},
{
"epoch": 0.01,
"grad_norm": 7.6527791089253006,
"learning_rate": 3.191489361702128e-06,
"loss": 0.8428,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 5.952188134702268,
"learning_rate": 3.297872340425532e-06,
"loss": 0.8583,
"step": 31
},
{
"epoch": 0.01,
"grad_norm": 19.692620114754746,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.7858,
"step": 32
},
{
"epoch": 0.01,
"grad_norm": 2.0759321339191934,
"learning_rate": 3.510638297872341e-06,
"loss": 0.2241,
"step": 33
},
{
"epoch": 0.01,
"grad_norm": 5.5301026342716595,
"learning_rate": 3.6170212765957453e-06,
"loss": 0.663,
"step": 34
},
{
"epoch": 0.01,
"grad_norm": 5.585744509955249,
"learning_rate": 3.723404255319149e-06,
"loss": 0.6562,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 6.265222587262519,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.5355,
"step": 36
},
{
"epoch": 0.01,
"grad_norm": 2.4701842123897486,
"learning_rate": 3.936170212765958e-06,
"loss": 0.2582,
"step": 37
},
{
"epoch": 0.01,
"grad_norm": 2.122181502870864,
"learning_rate": 4.042553191489362e-06,
"loss": 0.2856,
"step": 38
},
{
"epoch": 0.01,
"grad_norm": 6.9282484979982195,
"learning_rate": 4.148936170212766e-06,
"loss": 0.6174,
"step": 39
},
{
"epoch": 0.01,
"grad_norm": 5.975838840358755,
"learning_rate": 4.255319148936171e-06,
"loss": 0.7906,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 2.0345349032020126,
"learning_rate": 4.361702127659575e-06,
"loss": 0.2424,
"step": 41
},
{
"epoch": 0.01,
"grad_norm": 2.0615934625099346,
"learning_rate": 4.468085106382979e-06,
"loss": 0.2116,
"step": 42
},
{
"epoch": 0.01,
"grad_norm": 5.705896830279454,
"learning_rate": 4.574468085106383e-06,
"loss": 0.6891,
"step": 43
},
{
"epoch": 0.01,
"grad_norm": 13.158667169865367,
"learning_rate": 4.680851063829788e-06,
"loss": 0.5684,
"step": 44
},
{
"epoch": 0.01,
"grad_norm": 8.208102208041616,
"learning_rate": 4.787234042553192e-06,
"loss": 0.7109,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 13.41232565070186,
"learning_rate": 4.893617021276596e-06,
"loss": 0.5345,
"step": 46
},
{
"epoch": 0.02,
"grad_norm": 4.382535756710318,
"learning_rate": 5e-06,
"loss": 0.5668,
"step": 47
},
{
"epoch": 0.02,
"grad_norm": 1.9642261227014204,
"learning_rate": 5.106382978723404e-06,
"loss": 0.2235,
"step": 48
},
{
"epoch": 0.02,
"grad_norm": 2.355165416214269,
"learning_rate": 5.212765957446809e-06,
"loss": 0.2517,
"step": 49
},
{
"epoch": 0.02,
"grad_norm": 2.251290417809313,
"learning_rate": 5.319148936170213e-06,
"loss": 0.2282,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 2.1719053672114463,
"learning_rate": 5.425531914893617e-06,
"loss": 0.2927,
"step": 51
},
{
"epoch": 0.02,
"grad_norm": 1.9890876845802554,
"learning_rate": 5.531914893617022e-06,
"loss": 0.2073,
"step": 52
},
{
"epoch": 0.02,
"grad_norm": 2.3250034678509506,
"learning_rate": 5.638297872340426e-06,
"loss": 0.2757,
"step": 53
},
{
"epoch": 0.02,
"grad_norm": 6.374850230264314,
"learning_rate": 5.744680851063831e-06,
"loss": 0.6418,
"step": 54
},
{
"epoch": 0.02,
"grad_norm": 2.0771023570501375,
"learning_rate": 5.851063829787235e-06,
"loss": 0.2194,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 2.110659928743081,
"learning_rate": 5.957446808510638e-06,
"loss": 0.2423,
"step": 56
},
{
"epoch": 0.02,
"grad_norm": 2.0592390004569676,
"learning_rate": 6.063829787234044e-06,
"loss": 0.2304,
"step": 57
},
{
"epoch": 0.02,
"grad_norm": 2.057093727460016,
"learning_rate": 6.170212765957447e-06,
"loss": 0.2304,
"step": 58
},
{
"epoch": 0.02,
"grad_norm": 5.045560870554051,
"learning_rate": 6.276595744680851e-06,
"loss": 0.553,
"step": 59
},
{
"epoch": 0.02,
"grad_norm": 8.28752932319891,
"learning_rate": 6.382978723404256e-06,
"loss": 0.5713,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 1.7885632031927219,
"learning_rate": 6.48936170212766e-06,
"loss": 0.224,
"step": 61
},
{
"epoch": 0.02,
"grad_norm": 5.192017917650936,
"learning_rate": 6.595744680851064e-06,
"loss": 0.521,
"step": 62
},
{
"epoch": 0.02,
"grad_norm": 2.3351268569331904,
"learning_rate": 6.702127659574469e-06,
"loss": 0.2633,
"step": 63
},
{
"epoch": 0.02,
"grad_norm": 2.1532968301258983,
"learning_rate": 6.808510638297873e-06,
"loss": 0.2165,
"step": 64
},
{
"epoch": 0.02,
"grad_norm": 1.9511934221645288,
"learning_rate": 6.914893617021278e-06,
"loss": 0.2079,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 1.949857025522768,
"learning_rate": 7.021276595744682e-06,
"loss": 0.247,
"step": 66
},
{
"epoch": 0.02,
"grad_norm": 1.8713256319822902,
"learning_rate": 7.127659574468085e-06,
"loss": 0.1753,
"step": 67
},
{
"epoch": 0.02,
"grad_norm": 2.2299096915752994,
"learning_rate": 7.234042553191491e-06,
"loss": 0.2232,
"step": 68
},
{
"epoch": 0.02,
"grad_norm": 1.9291440073894126,
"learning_rate": 7.340425531914894e-06,
"loss": 0.2033,
"step": 69
},
{
"epoch": 0.02,
"grad_norm": 5.717848884350137,
"learning_rate": 7.446808510638298e-06,
"loss": 0.5015,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 2.3133611736855175,
"learning_rate": 7.553191489361703e-06,
"loss": 0.2118,
"step": 71
},
{
"epoch": 0.02,
"grad_norm": 2.0854358236952564,
"learning_rate": 7.659574468085107e-06,
"loss": 0.2394,
"step": 72
},
{
"epoch": 0.02,
"grad_norm": 2.224655649542742,
"learning_rate": 7.765957446808511e-06,
"loss": 0.265,
"step": 73
},
{
"epoch": 0.02,
"grad_norm": 5.929271235180811,
"learning_rate": 7.872340425531916e-06,
"loss": 0.769,
"step": 74
},
{
"epoch": 0.02,
"grad_norm": 6.3708936816392265,
"learning_rate": 7.97872340425532e-06,
"loss": 0.7683,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 2.2604552953768673,
"learning_rate": 8.085106382978723e-06,
"loss": 0.2831,
"step": 76
},
{
"epoch": 0.02,
"grad_norm": 6.126149805744268,
"learning_rate": 8.191489361702128e-06,
"loss": 0.6417,
"step": 77
},
{
"epoch": 0.02,
"grad_norm": 5.373342833190464,
"learning_rate": 8.297872340425532e-06,
"loss": 0.6714,
"step": 78
},
{
"epoch": 0.03,
"grad_norm": 2.4780087686567374,
"learning_rate": 8.404255319148937e-06,
"loss": 0.274,
"step": 79
},
{
"epoch": 0.03,
"grad_norm": 6.0560530873284275,
"learning_rate": 8.510638297872341e-06,
"loss": 0.6787,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 5.52228143292844,
"learning_rate": 8.617021276595746e-06,
"loss": 0.5285,
"step": 81
},
{
"epoch": 0.03,
"grad_norm": 7.424573867850644,
"learning_rate": 8.72340425531915e-06,
"loss": 0.8279,
"step": 82
},
{
"epoch": 0.03,
"grad_norm": 2.0566553331232797,
"learning_rate": 8.829787234042555e-06,
"loss": 0.1796,
"step": 83
},
{
"epoch": 0.03,
"grad_norm": 1.7736431844288432,
"learning_rate": 8.936170212765958e-06,
"loss": 0.1994,
"step": 84
},
{
"epoch": 0.03,
"grad_norm": 4.983461904065465,
"learning_rate": 9.042553191489362e-06,
"loss": 0.5259,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 8.163488446770579,
"learning_rate": 9.148936170212767e-06,
"loss": 0.886,
"step": 86
},
{
"epoch": 0.03,
"grad_norm": 5.988575327788609,
"learning_rate": 9.255319148936171e-06,
"loss": 0.7793,
"step": 87
},
{
"epoch": 0.03,
"grad_norm": 1.8929495581316096,
"learning_rate": 9.361702127659576e-06,
"loss": 0.2037,
"step": 88
},
{
"epoch": 0.03,
"grad_norm": 2.2085110026193715,
"learning_rate": 9.46808510638298e-06,
"loss": 0.2481,
"step": 89
},
{
"epoch": 0.03,
"grad_norm": 9.762190689935485,
"learning_rate": 9.574468085106385e-06,
"loss": 0.6555,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 4.402889827649181,
"learning_rate": 9.680851063829787e-06,
"loss": 0.5985,
"step": 91
},
{
"epoch": 0.03,
"grad_norm": 2.0189402746940495,
"learning_rate": 9.787234042553192e-06,
"loss": 0.2526,
"step": 92
},
{
"epoch": 0.03,
"grad_norm": 8.113680435659742,
"learning_rate": 9.893617021276596e-06,
"loss": 0.725,
"step": 93
},
{
"epoch": 0.03,
"grad_norm": 4.8822328335458245,
"learning_rate": 1e-05,
"loss": 0.482,
"step": 94
},
{
"epoch": 0.03,
"grad_norm": 7.462243307627707,
"learning_rate": 9.999997314236036e-06,
"loss": 0.5765,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 2.368939909600803,
"learning_rate": 9.999989256947029e-06,
"loss": 0.2621,
"step": 96
},
{
"epoch": 0.03,
"grad_norm": 6.4184710762944235,
"learning_rate": 9.999975828141635e-06,
"loss": 0.5185,
"step": 97
},
{
"epoch": 0.03,
"grad_norm": 1.9004648689011145,
"learning_rate": 9.999957027834282e-06,
"loss": 0.244,
"step": 98
},
{
"epoch": 0.03,
"grad_norm": 2.01583191932197,
"learning_rate": 9.999932856045164e-06,
"loss": 0.2724,
"step": 99
},
{
"epoch": 0.03,
"grad_norm": 2.0809206317338558,
"learning_rate": 9.99990331280025e-06,
"loss": 0.2221,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 2.0466622912745494,
"learning_rate": 9.999868398131282e-06,
"loss": 0.2625,
"step": 101
},
{
"epoch": 0.03,
"grad_norm": 7.446390750033542,
"learning_rate": 9.999828112075764e-06,
"loss": 0.6449,
"step": 102
},
{
"epoch": 0.03,
"grad_norm": 6.7602889610343215,
"learning_rate": 9.99978245467698e-06,
"loss": 0.7039,
"step": 103
},
{
"epoch": 0.03,
"grad_norm": 1.9500980506545675,
"learning_rate": 9.999731425983975e-06,
"loss": 0.2355,
"step": 104
},
{
"epoch": 0.03,
"grad_norm": 7.2513016036019335,
"learning_rate": 9.999675026051576e-06,
"loss": 0.6968,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 43.33082783120513,
"learning_rate": 9.999613254940368e-06,
"loss": 0.4654,
"step": 106
},
{
"epoch": 0.03,
"grad_norm": 1.9004552677187696,
"learning_rate": 9.999546112716715e-06,
"loss": 0.2029,
"step": 107
},
{
"epoch": 0.03,
"grad_norm": 2.194236317843529,
"learning_rate": 9.999473599452746e-06,
"loss": 0.2813,
"step": 108
},
{
"epoch": 0.03,
"grad_norm": 8.035356250059664,
"learning_rate": 9.999395715226365e-06,
"loss": 0.667,
"step": 109
},
{
"epoch": 0.04,
"grad_norm": 1.9145140818009003,
"learning_rate": 9.999312460121242e-06,
"loss": 0.2297,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 1.7982756322993592,
"learning_rate": 9.999223834226817e-06,
"loss": 0.2475,
"step": 111
},
{
"epoch": 0.04,
"grad_norm": 1.7314876163084512,
"learning_rate": 9.999129837638303e-06,
"loss": 0.1724,
"step": 112
},
{
"epoch": 0.04,
"grad_norm": 7.323150351960898,
"learning_rate": 9.999030470456684e-06,
"loss": 0.6735,
"step": 113
},
{
"epoch": 0.04,
"grad_norm": 2.1080102277359214,
"learning_rate": 9.998925732788706e-06,
"loss": 0.2153,
"step": 114
},
{
"epoch": 0.04,
"grad_norm": 5.903214475636129,
"learning_rate": 9.99881562474689e-06,
"loss": 0.6093,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": 1.9446574201365474,
"learning_rate": 9.998700146449528e-06,
"loss": 0.2293,
"step": 116
},
{
"epoch": 0.04,
"grad_norm": 9.402474227627586,
"learning_rate": 9.998579298020676e-06,
"loss": 0.4865,
"step": 117
},
{
"epoch": 0.04,
"grad_norm": 5.997676640949038,
"learning_rate": 9.998453079590167e-06,
"loss": 0.4849,
"step": 118
},
{
"epoch": 0.04,
"grad_norm": 2.05414612783157,
"learning_rate": 9.998321491293592e-06,
"loss": 0.3184,
"step": 119
},
{
"epoch": 0.04,
"grad_norm": 6.6314295483116545,
"learning_rate": 9.998184533272321e-06,
"loss": 0.7459,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 1.9723909371748105,
"learning_rate": 9.998042205673489e-06,
"loss": 0.2924,
"step": 121
},
{
"epoch": 0.04,
"grad_norm": 2.1580999203943905,
"learning_rate": 9.997894508649995e-06,
"loss": 0.272,
"step": 122
},
{
"epoch": 0.04,
"grad_norm": 1.8013373131993664,
"learning_rate": 9.997741442360515e-06,
"loss": 0.2739,
"step": 123
},
{
"epoch": 0.04,
"grad_norm": 8.06345215974651,
"learning_rate": 9.99758300696949e-06,
"loss": 0.6538,
"step": 124
},
{
"epoch": 0.04,
"grad_norm": 1.9726083487811696,
"learning_rate": 9.997419202647124e-06,
"loss": 0.245,
"step": 125
},
{
"epoch": 0.04,
"grad_norm": 1.9678765556915165,
"learning_rate": 9.997250029569395e-06,
"loss": 0.2684,
"step": 126
},
{
"epoch": 0.04,
"grad_norm": 10.237118233392872,
"learning_rate": 9.997075487918047e-06,
"loss": 0.619,
"step": 127
},
{
"epoch": 0.04,
"grad_norm": 1.9105611501823063,
"learning_rate": 9.99689557788059e-06,
"loss": 0.2739,
"step": 128
},
{
"epoch": 0.04,
"grad_norm": 2.0096267154815597,
"learning_rate": 9.996710299650302e-06,
"loss": 0.3259,
"step": 129
},
{
"epoch": 0.04,
"grad_norm": 4.445495901455712,
"learning_rate": 9.996519653426229e-06,
"loss": 0.5219,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 5.605850026827202,
"learning_rate": 9.996323639413185e-06,
"loss": 0.6143,
"step": 131
},
{
"epoch": 0.04,
"grad_norm": 1.8916002019145026,
"learning_rate": 9.996122257821746e-06,
"loss": 0.224,
"step": 132
},
{
"epoch": 0.04,
"grad_norm": 15.579924374242733,
"learning_rate": 9.99591550886826e-06,
"loss": 0.7796,
"step": 133
},
{
"epoch": 0.04,
"grad_norm": 7.221135134011073,
"learning_rate": 9.995703392774836e-06,
"loss": 0.4544,
"step": 134
},
{
"epoch": 0.04,
"grad_norm": 16.49546367183523,
"learning_rate": 9.995485909769354e-06,
"loss": 0.6682,
"step": 135
},
{
"epoch": 0.04,
"grad_norm": 13.874669468561029,
"learning_rate": 9.995263060085456e-06,
"loss": 0.7609,
"step": 136
},
{
"epoch": 0.04,
"grad_norm": 6.303393045385302,
"learning_rate": 9.99503484396255e-06,
"loss": 0.4596,
"step": 137
},
{
"epoch": 0.04,
"grad_norm": 9.058577656777699,
"learning_rate": 9.99480126164581e-06,
"loss": 0.6164,
"step": 138
},
{
"epoch": 0.04,
"grad_norm": 6.637101781956161,
"learning_rate": 9.994562313386177e-06,
"loss": 0.7215,
"step": 139
},
{
"epoch": 0.04,
"grad_norm": 1.765877060254261,
"learning_rate": 9.994317999440351e-06,
"loss": 0.2007,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 12.923758398693709,
"learning_rate": 9.994068320070805e-06,
"loss": 0.5322,
"step": 141
},
{
"epoch": 0.05,
"grad_norm": 2.00990811412376,
"learning_rate": 9.993813275545764e-06,
"loss": 0.2872,
"step": 142
},
{
"epoch": 0.05,
"grad_norm": 8.332512786467698,
"learning_rate": 9.99355286613923e-06,
"loss": 0.5918,
"step": 143
},
{
"epoch": 0.05,
"grad_norm": 11.049783636211211,
"learning_rate": 9.993287092130956e-06,
"loss": 0.6115,
"step": 144
},
{
"epoch": 0.05,
"grad_norm": 2.0191315641950207,
"learning_rate": 9.993015953806472e-06,
"loss": 0.2369,
"step": 145
},
{
"epoch": 0.05,
"grad_norm": 7.267256545843167,
"learning_rate": 9.992739451457058e-06,
"loss": 0.6323,
"step": 146
},
{
"epoch": 0.05,
"grad_norm": 2.0704901493165035,
"learning_rate": 9.992457585379764e-06,
"loss": 0.2656,
"step": 147
},
{
"epoch": 0.05,
"grad_norm": 7.429213454553269,
"learning_rate": 9.992170355877398e-06,
"loss": 0.6515,
"step": 148
},
{
"epoch": 0.05,
"grad_norm": 5.525353846641136,
"learning_rate": 9.991877763258538e-06,
"loss": 0.6956,
"step": 149
},
{
"epoch": 0.05,
"grad_norm": 6.99048709636458,
"learning_rate": 9.991579807837511e-06,
"loss": 0.6963,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 1.8319076937894148,
"learning_rate": 9.991276489934416e-06,
"loss": 0.278,
"step": 151
},
{
"epoch": 0.05,
"grad_norm": 8.30888476109113,
"learning_rate": 9.990967809875107e-06,
"loss": 0.5689,
"step": 152
},
{
"epoch": 0.05,
"grad_norm": 10.121949999243235,
"learning_rate": 9.990653767991203e-06,
"loss": 0.6782,
"step": 153
},
{
"epoch": 0.05,
"grad_norm": 6.949448664548724,
"learning_rate": 9.99033436462008e-06,
"loss": 0.8458,
"step": 154
},
{
"epoch": 0.05,
"grad_norm": 1.7631365862407675,
"learning_rate": 9.990009600104875e-06,
"loss": 0.2387,
"step": 155
},
{
"epoch": 0.05,
"grad_norm": 1.95646929645247,
"learning_rate": 9.989679474794484e-06,
"loss": 0.2584,
"step": 156
},
{
"epoch": 0.05,
"grad_norm": 1.5968211493704039,
"learning_rate": 9.989343989043563e-06,
"loss": 0.2326,
"step": 157
},
{
"epoch": 0.05,
"grad_norm": 5.43077862961733,
"learning_rate": 9.989003143212526e-06,
"loss": 0.5847,
"step": 158
},
{
"epoch": 0.05,
"grad_norm": 1.7014016514942623,
"learning_rate": 9.988656937667544e-06,
"loss": 0.2557,
"step": 159
},
{
"epoch": 0.05,
"grad_norm": 5.4041752599024,
"learning_rate": 9.98830537278055e-06,
"loss": 0.5055,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 2.112904610234277,
"learning_rate": 9.987948448929232e-06,
"loss": 0.2591,
"step": 161
},
{
"epoch": 0.05,
"grad_norm": 1.8032816303999848,
"learning_rate": 9.987586166497032e-06,
"loss": 0.2757,
"step": 162
},
{
"epoch": 0.05,
"grad_norm": 1.772808892651813,
"learning_rate": 9.987218525873155e-06,
"loss": 0.2461,
"step": 163
},
{
"epoch": 0.05,
"grad_norm": 1.911641867802362,
"learning_rate": 9.98684552745256e-06,
"loss": 0.2403,
"step": 164
},
{
"epoch": 0.05,
"grad_norm": 1.7493725607696422,
"learning_rate": 9.98646717163596e-06,
"loss": 0.2433,
"step": 165
},
{
"epoch": 0.05,
"grad_norm": 7.7961677705380605,
"learning_rate": 9.986083458829824e-06,
"loss": 0.5065,
"step": 166
},
{
"epoch": 0.05,
"grad_norm": 2.177300666490729,
"learning_rate": 9.985694389446378e-06,
"loss": 0.2436,
"step": 167
},
{
"epoch": 0.05,
"grad_norm": 4.530179647043451,
"learning_rate": 9.9852999639036e-06,
"loss": 0.5275,
"step": 168
},
{
"epoch": 0.05,
"grad_norm": 2.090913867645795,
"learning_rate": 9.984900182625226e-06,
"loss": 0.3134,
"step": 169
},
{
"epoch": 0.05,
"grad_norm": 6.731588330774811,
"learning_rate": 9.98449504604074e-06,
"loss": 0.7066,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 14.448000839451272,
"learning_rate": 9.984084554585387e-06,
"loss": 0.7049,
"step": 171
},
{
"epoch": 0.06,
"grad_norm": 5.742724641735612,
"learning_rate": 9.983668708700156e-06,
"loss": 0.5357,
"step": 172
},
{
"epoch": 0.06,
"grad_norm": 1.8344366677326172,
"learning_rate": 9.983247508831795e-06,
"loss": 0.2459,
"step": 173
},
{
"epoch": 0.06,
"grad_norm": 1.9099309329965592,
"learning_rate": 9.9828209554328e-06,
"loss": 0.2658,
"step": 174
},
{
"epoch": 0.06,
"grad_norm": 1.6321601388074647,
"learning_rate": 9.982389048961421e-06,
"loss": 0.2604,
"step": 175
},
{
"epoch": 0.06,
"grad_norm": 7.549083750784864,
"learning_rate": 9.981951789881657e-06,
"loss": 0.7438,
"step": 176
},
{
"epoch": 0.06,
"grad_norm": 4.950187800141307,
"learning_rate": 9.981509178663256e-06,
"loss": 0.5696,
"step": 177
},
{
"epoch": 0.06,
"grad_norm": 7.226166260751547,
"learning_rate": 9.98106121578172e-06,
"loss": 0.7509,
"step": 178
},
{
"epoch": 0.06,
"grad_norm": 9.549776784209499,
"learning_rate": 9.980607901718297e-06,
"loss": 0.7863,
"step": 179
},
{
"epoch": 0.06,
"grad_norm": 2.2974781122942716,
"learning_rate": 9.980149236959986e-06,
"loss": 0.2952,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 6.5401223182948245,
"learning_rate": 9.979685221999532e-06,
"loss": 0.5763,
"step": 181
},
{
"epoch": 0.06,
"grad_norm": 5.752458813451822,
"learning_rate": 9.97921585733543e-06,
"loss": 0.618,
"step": 182
},
{
"epoch": 0.06,
"grad_norm": 5.5480082188645525,
"learning_rate": 9.97874114347192e-06,
"loss": 0.622,
"step": 183
},
{
"epoch": 0.06,
"grad_norm": 1.808710962103662,
"learning_rate": 9.978261080918988e-06,
"loss": 0.2828,
"step": 184
},
{
"epoch": 0.06,
"grad_norm": 1.799105004739947,
"learning_rate": 9.977775670192373e-06,
"loss": 0.2528,
"step": 185
},
{
"epoch": 0.06,
"grad_norm": 6.243875053588291,
"learning_rate": 9.977284911813549e-06,
"loss": 0.5032,
"step": 186
},
{
"epoch": 0.06,
"grad_norm": 4.880295190945132,
"learning_rate": 9.976788806309742e-06,
"loss": 0.547,
"step": 187
},
{
"epoch": 0.06,
"grad_norm": 1.6908031523481455,
"learning_rate": 9.976287354213924e-06,
"loss": 0.1809,
"step": 188
},
{
"epoch": 0.06,
"grad_norm": 11.874030701403033,
"learning_rate": 9.975780556064806e-06,
"loss": 0.6395,
"step": 189
},
{
"epoch": 0.06,
"grad_norm": 9.639996712082842,
"learning_rate": 9.975268412406842e-06,
"loss": 0.6557,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 2.1448436503091655,
"learning_rate": 9.974750923790234e-06,
"loss": 0.2842,
"step": 191
},
{
"epoch": 0.06,
"grad_norm": 4.108059139362723,
"learning_rate": 9.97422809077092e-06,
"loss": 0.6994,
"step": 192
},
{
"epoch": 0.06,
"grad_norm": 5.188802830624759,
"learning_rate": 9.973699913910584e-06,
"loss": 0.5398,
"step": 193
},
{
"epoch": 0.06,
"grad_norm": 1.7631244750636252,
"learning_rate": 9.97316639377665e-06,
"loss": 0.2767,
"step": 194
},
{
"epoch": 0.06,
"grad_norm": 5.776454382682188,
"learning_rate": 9.97262753094228e-06,
"loss": 0.5958,
"step": 195
},
{
"epoch": 0.06,
"grad_norm": 2.0539095988416336,
"learning_rate": 9.972083325986377e-06,
"loss": 0.2507,
"step": 196
},
{
"epoch": 0.06,
"grad_norm": 7.71647708129,
"learning_rate": 9.971533779493586e-06,
"loss": 0.6313,
"step": 197
},
{
"epoch": 0.06,
"grad_norm": 9.885032841099774,
"learning_rate": 9.970978892054286e-06,
"loss": 0.5581,
"step": 198
},
{
"epoch": 0.06,
"grad_norm": 1.78503609944141,
"learning_rate": 9.970418664264596e-06,
"loss": 0.2364,
"step": 199
},
{
"epoch": 0.06,
"grad_norm": 5.183970320492835,
"learning_rate": 9.969853096726372e-06,
"loss": 0.7529,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 4.590290754655386,
"learning_rate": 9.969282190047207e-06,
"loss": 0.523,
"step": 201
},
{
"epoch": 0.06,
"grad_norm": 4.57222467490719,
"learning_rate": 9.968705944840428e-06,
"loss": 0.4612,
"step": 202
},
{
"epoch": 0.06,
"grad_norm": 6.707984670342173,
"learning_rate": 9.968124361725098e-06,
"loss": 0.6136,
"step": 203
},
{
"epoch": 0.07,
"grad_norm": 10.496256043657745,
"learning_rate": 9.967537441326018e-06,
"loss": 0.6126,
"step": 204
},
{
"epoch": 0.07,
"grad_norm": 9.868715415306239,
"learning_rate": 9.966945184273716e-06,
"loss": 0.6428,
"step": 205
},
{
"epoch": 0.07,
"grad_norm": 9.668861159469753,
"learning_rate": 9.966347591204459e-06,
"loss": 0.7014,
"step": 206
},
{
"epoch": 0.07,
"grad_norm": 1.7954523281162194,
"learning_rate": 9.965744662760246e-06,
"loss": 0.2307,
"step": 207
},
{
"epoch": 0.07,
"grad_norm": 1.9575524083025957,
"learning_rate": 9.965136399588803e-06,
"loss": 0.2551,
"step": 208
},
{
"epoch": 0.07,
"grad_norm": 2.0487271777285883,
"learning_rate": 9.964522802343593e-06,
"loss": 0.3312,
"step": 209
},
{
"epoch": 0.07,
"grad_norm": 5.406807318098485,
"learning_rate": 9.963903871683806e-06,
"loss": 0.6646,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 5.369438968167279,
"learning_rate": 9.963279608274364e-06,
"loss": 0.5249,
"step": 211
},
{
"epoch": 0.07,
"grad_norm": 2.0976265634507163,
"learning_rate": 9.962650012785917e-06,
"loss": 0.3104,
"step": 212
},
{
"epoch": 0.07,
"grad_norm": 2.0455693449228236,
"learning_rate": 9.962015085894838e-06,
"loss": 0.2619,
"step": 213
},
{
"epoch": 0.07,
"grad_norm": 5.460373040624954,
"learning_rate": 9.961374828283239e-06,
"loss": 0.6343,
"step": 214
},
{
"epoch": 0.07,
"grad_norm": 1.6036581876392395,
"learning_rate": 9.960729240638947e-06,
"loss": 0.2809,
"step": 215
},
{
"epoch": 0.07,
"grad_norm": 1.6539767213635388,
"learning_rate": 9.960078323655524e-06,
"loss": 0.2592,
"step": 216
},
{
"epoch": 0.07,
"grad_norm": 5.2639118853712885,
"learning_rate": 9.959422078032253e-06,
"loss": 0.6271,
"step": 217
},
{
"epoch": 0.07,
"grad_norm": 6.183046858090365,
"learning_rate": 9.958760504474144e-06,
"loss": 0.6513,
"step": 218
},
{
"epoch": 0.07,
"grad_norm": 1.8640131168963845,
"learning_rate": 9.958093603691923e-06,
"loss": 0.2528,
"step": 219
},
{
"epoch": 0.07,
"grad_norm": 1.8894317015528737,
"learning_rate": 9.957421376402053e-06,
"loss": 0.2938,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 1.8225244919346641,
"learning_rate": 9.956743823326704e-06,
"loss": 0.2652,
"step": 221
},
{
"epoch": 0.07,
"grad_norm": 2.016628969006215,
"learning_rate": 9.956060945193781e-06,
"loss": 0.2655,
"step": 222
},
{
"epoch": 0.07,
"grad_norm": 1.871720299399146,
"learning_rate": 9.955372742736903e-06,
"loss": 0.2282,
"step": 223
},
{
"epoch": 0.07,
"grad_norm": 5.251804505199205,
"learning_rate": 9.954679216695406e-06,
"loss": 0.7544,
"step": 224
},
{
"epoch": 0.07,
"grad_norm": 1.6841352163321124,
"learning_rate": 9.953980367814354e-06,
"loss": 0.2324,
"step": 225
},
{
"epoch": 0.07,
"grad_norm": 1.9551569313943153,
"learning_rate": 9.953276196844519e-06,
"loss": 0.2607,
"step": 226
},
{
"epoch": 0.07,
"grad_norm": 2.279701863291198,
"learning_rate": 9.9525667045424e-06,
"loss": 0.2336,
"step": 227
},
{
"epoch": 0.07,
"grad_norm": 2.0942832584637263,
"learning_rate": 9.951851891670206e-06,
"loss": 0.2428,
"step": 228
},
{
"epoch": 0.07,
"grad_norm": 2.03070287347475,
"learning_rate": 9.951131758995866e-06,
"loss": 0.2874,
"step": 229
},
{
"epoch": 0.07,
"grad_norm": 1.6708016558581609,
"learning_rate": 9.950406307293023e-06,
"loss": 0.2417,
"step": 230
},
{
"epoch": 0.07,
"grad_norm": 6.63843141297905,
"learning_rate": 9.949675537341031e-06,
"loss": 0.6903,
"step": 231
},
{
"epoch": 0.07,
"grad_norm": 11.998445255287676,
"learning_rate": 9.948939449924964e-06,
"loss": 0.594,
"step": 232
},
{
"epoch": 0.07,
"grad_norm": 1.9657571867354962,
"learning_rate": 9.948198045835601e-06,
"loss": 0.2757,
"step": 233
},
{
"epoch": 0.07,
"grad_norm": 5.238820252838469,
"learning_rate": 9.94745132586944e-06,
"loss": 0.591,
"step": 234
},
{
"epoch": 0.08,
"grad_norm": 4.199219289945727,
"learning_rate": 9.946699290828683e-06,
"loss": 0.6224,
"step": 235
},
{
"epoch": 0.08,
"grad_norm": 2.286185413567256,
"learning_rate": 9.94594194152125e-06,
"loss": 0.2635,
"step": 236
},
{
"epoch": 0.08,
"grad_norm": 5.155721769180376,
"learning_rate": 9.945179278760759e-06,
"loss": 0.652,
"step": 237
},
{
"epoch": 0.08,
"grad_norm": 1.7670381153982928,
"learning_rate": 9.94441130336655e-06,
"loss": 0.2414,
"step": 238
},
{
"epoch": 0.08,
"grad_norm": 1.7930068540247865,
"learning_rate": 9.943638016163658e-06,
"loss": 0.2304,
"step": 239
},
{
"epoch": 0.08,
"grad_norm": 5.175473506021528,
"learning_rate": 9.942859417982833e-06,
"loss": 0.6327,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 6.483158980004239,
"learning_rate": 9.942075509660527e-06,
"loss": 0.7988,
"step": 241
},
{
"epoch": 0.08,
"grad_norm": 1.890761915446787,
"learning_rate": 9.941286292038894e-06,
"loss": 0.2275,
"step": 242
},
{
"epoch": 0.08,
"grad_norm": 2.2858130028155097,
"learning_rate": 9.940491765965798e-06,
"loss": 0.3103,
"step": 243
},
{
"epoch": 0.08,
"grad_norm": 1.677858006478938,
"learning_rate": 9.939691932294804e-06,
"loss": 0.195,
"step": 244
},
{
"epoch": 0.08,
"grad_norm": 9.548415559641747,
"learning_rate": 9.938886791885172e-06,
"loss": 0.8219,
"step": 245
},
{
"epoch": 0.08,
"grad_norm": 1.7070562876457582,
"learning_rate": 9.938076345601875e-06,
"loss": 0.2794,
"step": 246
},
{
"epoch": 0.08,
"grad_norm": 5.186033973337935,
"learning_rate": 9.937260594315578e-06,
"loss": 0.5882,
"step": 247
},
{
"epoch": 0.08,
"grad_norm": 7.0285474198747036,
"learning_rate": 9.936439538902644e-06,
"loss": 0.8776,
"step": 248
},
{
"epoch": 0.08,
"grad_norm": 6.120945551228446,
"learning_rate": 9.935613180245143e-06,
"loss": 0.5905,
"step": 249
},
{
"epoch": 0.08,
"grad_norm": 2.197689977984544,
"learning_rate": 9.934781519230832e-06,
"loss": 0.2693,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 2.2876710297000424,
"learning_rate": 9.933944556753173e-06,
"loss": 0.2444,
"step": 251
},
{
"epoch": 0.08,
"grad_norm": 1.839881700013773,
"learning_rate": 9.933102293711314e-06,
"loss": 0.2687,
"step": 252
},
{
"epoch": 0.08,
"grad_norm": 6.663100438730311,
"learning_rate": 9.932254731010108e-06,
"loss": 0.5255,
"step": 253
},
{
"epoch": 0.08,
"grad_norm": 6.266933677264044,
"learning_rate": 9.931401869560096e-06,
"loss": 0.5345,
"step": 254
},
{
"epoch": 0.08,
"grad_norm": 1.8303981871835013,
"learning_rate": 9.93054371027751e-06,
"loss": 0.2347,
"step": 255
},
{
"epoch": 0.08,
"grad_norm": 6.26129546605341,
"learning_rate": 9.929680254084273e-06,
"loss": 0.6773,
"step": 256
},
{
"epoch": 0.08,
"grad_norm": 7.267941666701096,
"learning_rate": 9.928811501908006e-06,
"loss": 0.704,
"step": 257
},
{
"epoch": 0.08,
"grad_norm": 1.6836439908992122,
"learning_rate": 9.92793745468201e-06,
"loss": 0.2223,
"step": 258
},
{
"epoch": 0.08,
"grad_norm": 1.9495656535116324,
"learning_rate": 9.927058113345282e-06,
"loss": 0.2466,
"step": 259
},
{
"epoch": 0.08,
"grad_norm": 1.8209131439186006,
"learning_rate": 9.926173478842502e-06,
"loss": 0.2687,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 1.931823503029787,
"learning_rate": 9.925283552124039e-06,
"loss": 0.2572,
"step": 261
},
{
"epoch": 0.08,
"grad_norm": 2.2229504002443874,
"learning_rate": 9.924388334145943e-06,
"loss": 0.2779,
"step": 262
},
{
"epoch": 0.08,
"grad_norm": 7.114886827530793,
"learning_rate": 9.923487825869955e-06,
"loss": 0.7203,
"step": 263
},
{
"epoch": 0.08,
"grad_norm": 1.6552448432832247,
"learning_rate": 9.922582028263495e-06,
"loss": 0.2394,
"step": 264
},
{
"epoch": 0.08,
"grad_norm": 6.512736012940505,
"learning_rate": 9.921670942299664e-06,
"loss": 0.7148,
"step": 265
},
{
"epoch": 0.09,
"grad_norm": 1.619176270047411,
"learning_rate": 9.92075456895725e-06,
"loss": 0.2278,
"step": 266
},
{
"epoch": 0.09,
"grad_norm": 1.7636787308085922,
"learning_rate": 9.919832909220717e-06,
"loss": 0.2654,
"step": 267
},
{
"epoch": 0.09,
"grad_norm": 6.366453218825804,
"learning_rate": 9.91890596408021e-06,
"loss": 0.5214,
"step": 268
},
{
"epoch": 0.09,
"grad_norm": 1.7926927092351945,
"learning_rate": 9.917973734531549e-06,
"loss": 0.3013,
"step": 269
},
{
"epoch": 0.09,
"grad_norm": 9.27917675911004,
"learning_rate": 9.917036221576235e-06,
"loss": 0.4644,
"step": 270
},
{
"epoch": 0.09,
"grad_norm": 6.5166207880778675,
"learning_rate": 9.916093426221445e-06,
"loss": 0.7791,
"step": 271
},
{
"epoch": 0.09,
"grad_norm": 1.7803593703574099,
"learning_rate": 9.915145349480027e-06,
"loss": 0.2063,
"step": 272
},
{
"epoch": 0.09,
"grad_norm": 1.7164340087067527,
"learning_rate": 9.914191992370504e-06,
"loss": 0.2404,
"step": 273
},
{
"epoch": 0.09,
"grad_norm": 1.7053855130147761,
"learning_rate": 9.913233355917075e-06,
"loss": 0.2496,
"step": 274
},
{
"epoch": 0.09,
"grad_norm": 8.170343927786698,
"learning_rate": 9.91226944114961e-06,
"loss": 0.4809,
"step": 275
},
{
"epoch": 0.09,
"grad_norm": 11.373476584361782,
"learning_rate": 9.911300249103646e-06,
"loss": 0.4187,
"step": 276
},
{
"epoch": 0.09,
"grad_norm": 1.956910542155799,
"learning_rate": 9.910325780820391e-06,
"loss": 0.279,
"step": 277
},
{
"epoch": 0.09,
"grad_norm": 5.541510463993858,
"learning_rate": 9.90934603734672e-06,
"loss": 0.5838,
"step": 278
},
{
"epoch": 0.09,
"grad_norm": 7.9733412880119925,
"learning_rate": 9.908361019735181e-06,
"loss": 0.5904,
"step": 279
},
{
"epoch": 0.09,
"grad_norm": 1.4831004582918137,
"learning_rate": 9.907370729043984e-06,
"loss": 0.2193,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 1.984365249719393,
"learning_rate": 9.906375166336998e-06,
"loss": 0.2606,
"step": 281
},
{
"epoch": 0.09,
"grad_norm": 5.067254711450738,
"learning_rate": 9.905374332683768e-06,
"loss": 0.6655,
"step": 282
},
{
"epoch": 0.09,
"grad_norm": 13.994611511088726,
"learning_rate": 9.904368229159494e-06,
"loss": 0.7717,
"step": 283
},
{
"epoch": 0.09,
"grad_norm": 1.925005418994528,
"learning_rate": 9.903356856845035e-06,
"loss": 0.2474,
"step": 284
},
{
"epoch": 0.09,
"grad_norm": 5.749146638259409,
"learning_rate": 9.902340216826915e-06,
"loss": 0.559,
"step": 285
},
{
"epoch": 0.09,
"grad_norm": 5.16502544359464,
"learning_rate": 9.90131831019732e-06,
"loss": 0.6786,
"step": 286
},
{
"epoch": 0.09,
"grad_norm": 1.871063685934545,
"learning_rate": 9.900291138054086e-06,
"loss": 0.2044,
"step": 287
},
{
"epoch": 0.09,
"grad_norm": 1.8115443702306113,
"learning_rate": 9.899258701500712e-06,
"loss": 0.2443,
"step": 288
},
{
"epoch": 0.09,
"grad_norm": 1.762357615478928,
"learning_rate": 9.89822100164635e-06,
"loss": 0.2604,
"step": 289
},
{
"epoch": 0.09,
"grad_norm": 1.581180552288332,
"learning_rate": 9.897178039605803e-06,
"loss": 0.2554,
"step": 290
},
{
"epoch": 0.09,
"grad_norm": 1.7046379154630582,
"learning_rate": 9.896129816499535e-06,
"loss": 0.2242,
"step": 291
},
{
"epoch": 0.09,
"grad_norm": 1.6587541886565693,
"learning_rate": 9.89507633345366e-06,
"loss": 0.2934,
"step": 292
},
{
"epoch": 0.09,
"grad_norm": 1.767186610249366,
"learning_rate": 9.894017591599934e-06,
"loss": 0.2572,
"step": 293
},
{
"epoch": 0.09,
"grad_norm": 8.622410807088071,
"learning_rate": 9.892953592075776e-06,
"loss": 0.8176,
"step": 294
},
{
"epoch": 0.09,
"grad_norm": 1.463559656044879,
"learning_rate": 9.891884336024242e-06,
"loss": 0.2255,
"step": 295
},
{
"epoch": 0.09,
"grad_norm": 6.224974673190329,
"learning_rate": 9.890809824594041e-06,
"loss": 0.6921,
"step": 296
},
{
"epoch": 0.1,
"grad_norm": 6.275651838369934,
"learning_rate": 9.889730058939529e-06,
"loss": 0.4799,
"step": 297
},
{
"epoch": 0.1,
"grad_norm": 1.6281851231133666,
"learning_rate": 9.8886450402207e-06,
"loss": 0.2179,
"step": 298
},
{
"epoch": 0.1,
"grad_norm": 1.619527212818428,
"learning_rate": 9.8875547696032e-06,
"loss": 0.2594,
"step": 299
},
{
"epoch": 0.1,
"grad_norm": 5.934279358354166,
"learning_rate": 9.88645924825831e-06,
"loss": 0.6144,
"step": 300
},
{
"epoch": 0.1,
"grad_norm": 1.815838597766129,
"learning_rate": 9.885358477362956e-06,
"loss": 0.2479,
"step": 301
},
{
"epoch": 0.1,
"grad_norm": 1.7555660967992508,
"learning_rate": 9.8842524580997e-06,
"loss": 0.2647,
"step": 302
},
{
"epoch": 0.1,
"grad_norm": 2.094040601120562,
"learning_rate": 9.883141191656748e-06,
"loss": 0.2699,
"step": 303
},
{
"epoch": 0.1,
"grad_norm": 1.7027153956456136,
"learning_rate": 9.88202467922794e-06,
"loss": 0.2925,
"step": 304
},
{
"epoch": 0.1,
"grad_norm": 9.715297506105168,
"learning_rate": 9.880902922012747e-06,
"loss": 0.7013,
"step": 305
},
{
"epoch": 0.1,
"grad_norm": 1.8421367831514053,
"learning_rate": 9.879775921216284e-06,
"loss": 0.2537,
"step": 306
},
{
"epoch": 0.1,
"grad_norm": 1.6479818584304384,
"learning_rate": 9.87864367804929e-06,
"loss": 0.2513,
"step": 307
},
{
"epoch": 0.1,
"grad_norm": 1.8001522637106266,
"learning_rate": 9.877506193728144e-06,
"loss": 0.2984,
"step": 308
},
{
"epoch": 0.1,
"grad_norm": 1.4737787840102685,
"learning_rate": 9.876363469474848e-06,
"loss": 0.2117,
"step": 309
},
{
"epoch": 0.1,
"grad_norm": 1.606172703146015,
"learning_rate": 9.87521550651704e-06,
"loss": 0.2092,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 8.202056619133858,
"learning_rate": 9.874062306087983e-06,
"loss": 0.636,
"step": 311
},
{
"epoch": 0.1,
"grad_norm": 7.108320148950683,
"learning_rate": 9.872903869426564e-06,
"loss": 0.7243,
"step": 312
},
{
"epoch": 0.1,
"grad_norm": 5.506551610092641,
"learning_rate": 9.8717401977773e-06,
"loss": 0.5324,
"step": 313
},
{
"epoch": 0.1,
"grad_norm": 7.394711036441917,
"learning_rate": 9.870571292390331e-06,
"loss": 0.6106,
"step": 314
},
{
"epoch": 0.1,
"grad_norm": 1.7168080888601716,
"learning_rate": 9.869397154521418e-06,
"loss": 0.243,
"step": 315
},
{
"epoch": 0.1,
"grad_norm": 1.630605514144201,
"learning_rate": 9.868217785431942e-06,
"loss": 0.2204,
"step": 316
},
{
"epoch": 0.1,
"grad_norm": 2.0363393202892546,
"learning_rate": 9.867033186388906e-06,
"loss": 0.2953,
"step": 317
},
{
"epoch": 0.1,
"grad_norm": 5.788957640221821,
"learning_rate": 9.865843358664933e-06,
"loss": 0.5283,
"step": 318
},
{
"epoch": 0.1,
"grad_norm": 9.925878268253182,
"learning_rate": 9.86464830353826e-06,
"loss": 0.5873,
"step": 319
},
{
"epoch": 0.1,
"grad_norm": 6.488725346331065,
"learning_rate": 9.863448022292742e-06,
"loss": 0.6456,
"step": 320
},
{
"epoch": 0.1,
"grad_norm": 1.787596878035153,
"learning_rate": 9.86224251621785e-06,
"loss": 0.2382,
"step": 321
},
{
"epoch": 0.1,
"grad_norm": 1.6260525664288765,
"learning_rate": 9.861031786608663e-06,
"loss": 0.2009,
"step": 322
},
{
"epoch": 0.1,
"grad_norm": 1.785192574131053,
"learning_rate": 9.859815834765875e-06,
"loss": 0.2502,
"step": 323
},
{
"epoch": 0.1,
"grad_norm": 1.6494386800023098,
"learning_rate": 9.858594661995792e-06,
"loss": 0.2729,
"step": 324
},
{
"epoch": 0.1,
"grad_norm": 1.5171719328800632,
"learning_rate": 9.857368269610325e-06,
"loss": 0.1935,
"step": 325
},
{
"epoch": 0.1,
"grad_norm": 1.6019978694116501,
"learning_rate": 9.856136658926993e-06,
"loss": 0.2303,
"step": 326
},
{
"epoch": 0.1,
"grad_norm": 6.077455820696771,
"learning_rate": 9.854899831268926e-06,
"loss": 0.6023,
"step": 327
},
{
"epoch": 0.1,
"grad_norm": 1.7670319271592085,
"learning_rate": 9.85365778796485e-06,
"loss": 0.2914,
"step": 328
},
{
"epoch": 0.11,
"grad_norm": 9.537528884281995,
"learning_rate": 9.852410530349102e-06,
"loss": 0.643,
"step": 329
},
{
"epoch": 0.11,
"grad_norm": 1.862552947917582,
"learning_rate": 9.851158059761617e-06,
"loss": 0.245,
"step": 330
},
{
"epoch": 0.11,
"grad_norm": 6.338295160744264,
"learning_rate": 9.849900377547933e-06,
"loss": 0.6725,
"step": 331
},
{
"epoch": 0.11,
"grad_norm": 1.5749237201985713,
"learning_rate": 9.848637485059183e-06,
"loss": 0.2203,
"step": 332
},
{
"epoch": 0.11,
"grad_norm": 1.7798119281452047,
"learning_rate": 9.8473693836521e-06,
"loss": 0.2694,
"step": 333
},
{
"epoch": 0.11,
"grad_norm": 5.306498575393919,
"learning_rate": 9.846096074689012e-06,
"loss": 0.6167,
"step": 334
},
{
"epoch": 0.11,
"grad_norm": 1.9986387257054445,
"learning_rate": 9.844817559537841e-06,
"loss": 0.3073,
"step": 335
},
{
"epoch": 0.11,
"grad_norm": 1.7917036615218112,
"learning_rate": 9.843533839572105e-06,
"loss": 0.1986,
"step": 336
},
{
"epoch": 0.11,
"grad_norm": 1.9444855554710954,
"learning_rate": 9.842244916170913e-06,
"loss": 0.2507,
"step": 337
},
{
"epoch": 0.11,
"grad_norm": 5.515741606750657,
"learning_rate": 9.840950790718959e-06,
"loss": 0.7681,
"step": 338
},
{
"epoch": 0.11,
"grad_norm": 1.739345255762653,
"learning_rate": 9.83965146460653e-06,
"loss": 0.2939,
"step": 339
},
{
"epoch": 0.11,
"grad_norm": 15.818366926020826,
"learning_rate": 9.838346939229501e-06,
"loss": 0.566,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 9.475580321325761,
"learning_rate": 9.83703721598933e-06,
"loss": 0.6702,
"step": 341
},
{
"epoch": 0.11,
"grad_norm": 1.5561445661668403,
"learning_rate": 9.835722296293058e-06,
"loss": 0.2575,
"step": 342
},
{
"epoch": 0.11,
"grad_norm": 1.6374928593645883,
"learning_rate": 9.834402181553314e-06,
"loss": 0.2574,
"step": 343
},
{
"epoch": 0.11,
"grad_norm": 1.7720785739929117,
"learning_rate": 9.833076873188303e-06,
"loss": 0.2415,
"step": 344
},
{
"epoch": 0.11,
"grad_norm": 20.445412723262997,
"learning_rate": 9.831746372621811e-06,
"loss": 0.5882,
"step": 345
},
{
"epoch": 0.11,
"grad_norm": 1.6000166556546866,
"learning_rate": 9.830410681283203e-06,
"loss": 0.2209,
"step": 346
},
{
"epoch": 0.11,
"grad_norm": 1.6465319703747732,
"learning_rate": 9.829069800607418e-06,
"loss": 0.2412,
"step": 347
},
{
"epoch": 0.11,
"grad_norm": 4.8046461402805605,
"learning_rate": 9.827723732034972e-06,
"loss": 0.6327,
"step": 348
},
{
"epoch": 0.11,
"grad_norm": 1.870520515355423,
"learning_rate": 9.826372477011956e-06,
"loss": 0.2609,
"step": 349
},
{
"epoch": 0.11,
"grad_norm": 1.7548998761142005,
"learning_rate": 9.825016036990029e-06,
"loss": 0.2656,
"step": 350
},
{
"epoch": 0.11,
"grad_norm": 1.7931254599659536,
"learning_rate": 9.823654413426424e-06,
"loss": 0.3034,
"step": 351
},
{
"epoch": 0.11,
"grad_norm": 1.5247697719663527,
"learning_rate": 9.822287607783938e-06,
"loss": 0.2237,
"step": 352
},
{
"epoch": 0.11,
"grad_norm": 1.798493899979202,
"learning_rate": 9.820915621530939e-06,
"loss": 0.2746,
"step": 353
},
{
"epoch": 0.11,
"grad_norm": 5.203687613711387,
"learning_rate": 9.81953845614136e-06,
"loss": 0.7089,
"step": 354
},
{
"epoch": 0.11,
"grad_norm": 6.1928739384010685,
"learning_rate": 9.818156113094699e-06,
"loss": 0.6123,
"step": 355
},
{
"epoch": 0.11,
"grad_norm": 1.616422274885141,
"learning_rate": 9.816768593876012e-06,
"loss": 0.2769,
"step": 356
},
{
"epoch": 0.11,
"grad_norm": 6.601813272912302,
"learning_rate": 9.81537589997592e-06,
"loss": 0.6817,
"step": 357
},
{
"epoch": 0.11,
"grad_norm": 1.8732789501841371,
"learning_rate": 9.8139780328906e-06,
"loss": 0.24,
"step": 358
},
{
"epoch": 0.11,
"grad_norm": 5.817718745140282,
"learning_rate": 9.812574994121791e-06,
"loss": 0.5641,
"step": 359
},
{
"epoch": 0.12,
"grad_norm": 1.7035610866186042,
"learning_rate": 9.811166785176785e-06,
"loss": 0.2141,
"step": 360
},
{
"epoch": 0.12,
"grad_norm": 6.059233018531428,
"learning_rate": 9.809753407568427e-06,
"loss": 0.5628,
"step": 361
},
{
"epoch": 0.12,
"grad_norm": 1.9798596247591356,
"learning_rate": 9.80833486281512e-06,
"loss": 0.2539,
"step": 362
},
{
"epoch": 0.12,
"grad_norm": 11.649935725007392,
"learning_rate": 9.80691115244081e-06,
"loss": 0.7371,
"step": 363
},
{
"epoch": 0.12,
"grad_norm": 1.6194679707686632,
"learning_rate": 9.805482277974999e-06,
"loss": 0.2111,
"step": 364
},
{
"epoch": 0.12,
"grad_norm": 2.312164201949697,
"learning_rate": 9.804048240952736e-06,
"loss": 0.2856,
"step": 365
},
{
"epoch": 0.12,
"grad_norm": 1.8007353724474517,
"learning_rate": 9.802609042914614e-06,
"loss": 0.2582,
"step": 366
},
{
"epoch": 0.12,
"grad_norm": 24.776689688816656,
"learning_rate": 9.80116468540677e-06,
"loss": 0.5147,
"step": 367
},
{
"epoch": 0.12,
"grad_norm": 7.163689734496585,
"learning_rate": 9.79971516998089e-06,
"loss": 0.6445,
"step": 368
},
{
"epoch": 0.12,
"grad_norm": 7.864571925331906,
"learning_rate": 9.79826049819419e-06,
"loss": 0.6159,
"step": 369
},
{
"epoch": 0.12,
"grad_norm": 1.7450807359970308,
"learning_rate": 9.796800671609436e-06,
"loss": 0.2518,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 8.727857811594829,
"learning_rate": 9.795335691794929e-06,
"loss": 0.5013,
"step": 371
},
{
"epoch": 0.12,
"grad_norm": 8.919913818750866,
"learning_rate": 9.793865560324503e-06,
"loss": 0.6285,
"step": 372
},
{
"epoch": 0.12,
"grad_norm": 8.429917857657687,
"learning_rate": 9.792390278777527e-06,
"loss": 0.6394,
"step": 373
},
{
"epoch": 0.12,
"grad_norm": 12.476180607787782,
"learning_rate": 9.790909848738907e-06,
"loss": 0.6466,
"step": 374
},
{
"epoch": 0.12,
"grad_norm": 7.098836577863875,
"learning_rate": 9.789424271799075e-06,
"loss": 0.709,
"step": 375
},
{
"epoch": 0.12,
"grad_norm": 5.801376908749421,
"learning_rate": 9.787933549553996e-06,
"loss": 0.6269,
"step": 376
},
{
"epoch": 0.12,
"grad_norm": 9.496884685048222,
"learning_rate": 9.786437683605161e-06,
"loss": 0.6664,
"step": 377
},
{
"epoch": 0.12,
"grad_norm": 1.6319948879086812,
"learning_rate": 9.78493667555959e-06,
"loss": 0.2707,
"step": 378
},
{
"epoch": 0.12,
"grad_norm": 1.5708516466390696,
"learning_rate": 9.783430527029818e-06,
"loss": 0.1913,
"step": 379
},
{
"epoch": 0.12,
"grad_norm": 8.023793238995243,
"learning_rate": 9.781919239633912e-06,
"loss": 0.5994,
"step": 380
},
{
"epoch": 0.12,
"grad_norm": 1.8126674527164652,
"learning_rate": 9.780402814995458e-06,
"loss": 0.2318,
"step": 381
},
{
"epoch": 0.12,
"grad_norm": 14.426470287475814,
"learning_rate": 9.77888125474356e-06,
"loss": 0.6649,
"step": 382
},
{
"epoch": 0.12,
"grad_norm": 10.71094575036116,
"learning_rate": 9.777354560512835e-06,
"loss": 0.494,
"step": 383
},
{
"epoch": 0.12,
"grad_norm": 7.713425379700125,
"learning_rate": 9.77582273394342e-06,
"loss": 0.7045,
"step": 384
},
{
"epoch": 0.12,
"grad_norm": 8.943228477771676,
"learning_rate": 9.774285776680967e-06,
"loss": 0.5535,
"step": 385
},
{
"epoch": 0.12,
"grad_norm": 1.7267654150299514,
"learning_rate": 9.772743690376636e-06,
"loss": 0.2491,
"step": 386
},
{
"epoch": 0.12,
"grad_norm": 1.8191540559886012,
"learning_rate": 9.7711964766871e-06,
"loss": 0.2703,
"step": 387
},
{
"epoch": 0.12,
"grad_norm": 6.544574630134056,
"learning_rate": 9.76964413727454e-06,
"loss": 0.5892,
"step": 388
},
{
"epoch": 0.12,
"grad_norm": 5.71619739119168,
"learning_rate": 9.768086673806638e-06,
"loss": 0.5279,
"step": 389
},
{
"epoch": 0.12,
"grad_norm": 1.7386028680970294,
"learning_rate": 9.766524087956592e-06,
"loss": 0.2672,
"step": 390
},
{
"epoch": 0.13,
"grad_norm": 10.798882919692216,
"learning_rate": 9.764956381403095e-06,
"loss": 0.6545,
"step": 391
},
{
"epoch": 0.13,
"grad_norm": 12.465881210964136,
"learning_rate": 9.76338355583034e-06,
"loss": 0.6039,
"step": 392
},
{
"epoch": 0.13,
"grad_norm": 5.494982234088186,
"learning_rate": 9.761805612928025e-06,
"loss": 0.519,
"step": 393
},
{
"epoch": 0.13,
"grad_norm": 5.053272331920926,
"learning_rate": 9.760222554391343e-06,
"loss": 0.5176,
"step": 394
},
{
"epoch": 0.13,
"grad_norm": 9.891972560729801,
"learning_rate": 9.758634381920982e-06,
"loss": 0.5177,
"step": 395
},
{
"epoch": 0.13,
"grad_norm": 4.2656155278201675,
"learning_rate": 9.757041097223123e-06,
"loss": 0.6046,
"step": 396
},
{
"epoch": 0.13,
"grad_norm": 1.6395442627324692,
"learning_rate": 9.755442702009443e-06,
"loss": 0.2523,
"step": 397
},
{
"epoch": 0.13,
"grad_norm": 5.571416657542466,
"learning_rate": 9.753839197997105e-06,
"loss": 0.5219,
"step": 398
},
{
"epoch": 0.13,
"grad_norm": 4.3700690088783025,
"learning_rate": 9.752230586908767e-06,
"loss": 0.2403,
"step": 399
},
{
"epoch": 0.13,
"grad_norm": 7.3442548066250275,
"learning_rate": 9.75061687047256e-06,
"loss": 0.4425,
"step": 400
},
{
"epoch": 0.13,
"grad_norm": 5.102192276870105,
"learning_rate": 9.748998050422117e-06,
"loss": 0.6099,
"step": 401
},
{
"epoch": 0.13,
"grad_norm": 3.596595451608656,
"learning_rate": 9.747374128496541e-06,
"loss": 0.2457,
"step": 402
},
{
"epoch": 0.13,
"grad_norm": 4.638267201907921,
"learning_rate": 9.745745106440422e-06,
"loss": 0.4629,
"step": 403
},
{
"epoch": 0.13,
"grad_norm": 5.457131519006896,
"learning_rate": 9.744110986003826e-06,
"loss": 0.5778,
"step": 404
},
{
"epoch": 0.13,
"grad_norm": 3.6000720224378533,
"learning_rate": 9.742471768942299e-06,
"loss": 0.2596,
"step": 405
},
{
"epoch": 0.13,
"grad_norm": 1.7493100911873254,
"learning_rate": 9.740827457016863e-06,
"loss": 0.2691,
"step": 406
},
{
"epoch": 0.13,
"grad_norm": 6.068632023860864,
"learning_rate": 9.739178051994008e-06,
"loss": 0.6243,
"step": 407
},
{
"epoch": 0.13,
"grad_norm": 2.034333215512261,
"learning_rate": 9.7375235556457e-06,
"loss": 0.2646,
"step": 408
},
{
"epoch": 0.13,
"grad_norm": 108.67978047726994,
"learning_rate": 9.735863969749373e-06,
"loss": 0.5287,
"step": 409
},
{
"epoch": 0.13,
"grad_norm": 18.032116388026967,
"learning_rate": 9.734199296087932e-06,
"loss": 0.277,
"step": 410
},
{
"epoch": 0.13,
"grad_norm": 6.247397291623058,
"learning_rate": 9.732529536449741e-06,
"loss": 0.8102,
"step": 411
},
{
"epoch": 0.13,
"grad_norm": 1.8587537608039923,
"learning_rate": 9.730854692628637e-06,
"loss": 0.2621,
"step": 412
},
{
"epoch": 0.13,
"grad_norm": 1.719599060155659,
"learning_rate": 9.729174766423912e-06,
"loss": 0.2645,
"step": 413
},
{
"epoch": 0.13,
"grad_norm": 1.5511281705551219,
"learning_rate": 9.72748975964032e-06,
"loss": 0.2389,
"step": 414
},
{
"epoch": 0.13,
"grad_norm": 1.657070469633183,
"learning_rate": 9.725799674088072e-06,
"loss": 0.2275,
"step": 415
},
{
"epoch": 0.13,
"grad_norm": 10.362288274296773,
"learning_rate": 9.724104511582838e-06,
"loss": 0.5781,
"step": 416
},
{
"epoch": 0.13,
"grad_norm": 5.574234794631768,
"learning_rate": 9.72240427394574e-06,
"loss": 0.5624,
"step": 417
},
{
"epoch": 0.13,
"grad_norm": 7.4448490908876135,
"learning_rate": 9.720698963003351e-06,
"loss": 0.6852,
"step": 418
},
{
"epoch": 0.13,
"grad_norm": 1.5433086783424876,
"learning_rate": 9.7189885805877e-06,
"loss": 0.2052,
"step": 419
},
{
"epoch": 0.13,
"grad_norm": 5.282788060395431,
"learning_rate": 9.717273128536259e-06,
"loss": 0.753,
"step": 420
},
{
"epoch": 0.13,
"grad_norm": 1.9816401253282105,
"learning_rate": 9.715552608691944e-06,
"loss": 0.2751,
"step": 421
},
{
"epoch": 0.14,
"grad_norm": 1.861973572972449,
"learning_rate": 9.713827022903124e-06,
"loss": 0.2489,
"step": 422
},
{
"epoch": 0.14,
"grad_norm": 1.8784674952919553,
"learning_rate": 9.712096373023603e-06,
"loss": 0.2807,
"step": 423
},
{
"epoch": 0.14,
"grad_norm": 2.0645563466318584,
"learning_rate": 9.710360660912629e-06,
"loss": 0.3016,
"step": 424
},
{
"epoch": 0.14,
"grad_norm": 6.682479363719368,
"learning_rate": 9.708619888434887e-06,
"loss": 0.5129,
"step": 425
},
{
"epoch": 0.14,
"grad_norm": 1.7400890155991255,
"learning_rate": 9.706874057460497e-06,
"loss": 0.2594,
"step": 426
},
{
"epoch": 0.14,
"grad_norm": 8.86234579968045,
"learning_rate": 9.705123169865016e-06,
"loss": 0.6763,
"step": 427
},
{
"epoch": 0.14,
"grad_norm": 14.882292145776598,
"learning_rate": 9.703367227529432e-06,
"loss": 0.488,
"step": 428
},
{
"epoch": 0.14,
"grad_norm": 7.657954403197554,
"learning_rate": 9.701606232340165e-06,
"loss": 0.4734,
"step": 429
},
{
"epoch": 0.14,
"grad_norm": 1.8928143901726409,
"learning_rate": 9.699840186189061e-06,
"loss": 0.2786,
"step": 430
},
{
"epoch": 0.14,
"grad_norm": 6.017547091352742,
"learning_rate": 9.698069090973391e-06,
"loss": 0.6868,
"step": 431
},
{
"epoch": 0.14,
"grad_norm": 1.5864654805452103,
"learning_rate": 9.696292948595857e-06,
"loss": 0.2131,
"step": 432
},
{
"epoch": 0.14,
"grad_norm": 1.761612831154044,
"learning_rate": 9.694511760964578e-06,
"loss": 0.2532,
"step": 433
},
{
"epoch": 0.14,
"grad_norm": 1.7403966486679432,
"learning_rate": 9.69272552999309e-06,
"loss": 0.2561,
"step": 434
},
{
"epoch": 0.14,
"grad_norm": 1.7323797944976795,
"learning_rate": 9.690934257600353e-06,
"loss": 0.2465,
"step": 435
},
{
"epoch": 0.14,
"grad_norm": 1.516914235462595,
"learning_rate": 9.689137945710742e-06,
"loss": 0.263,
"step": 436
},
{
"epoch": 0.14,
"grad_norm": 7.682260069657834,
"learning_rate": 9.687336596254045e-06,
"loss": 0.513,
"step": 437
},
{
"epoch": 0.14,
"grad_norm": 5.417370136409828,
"learning_rate": 9.685530211165459e-06,
"loss": 0.5077,
"step": 438
},
{
"epoch": 0.14,
"grad_norm": 1.7532214718484167,
"learning_rate": 9.683718792385595e-06,
"loss": 0.2618,
"step": 439
},
{
"epoch": 0.14,
"grad_norm": 7.559056297443667,
"learning_rate": 9.681902341860471e-06,
"loss": 0.7226,
"step": 440
},
{
"epoch": 0.14,
"grad_norm": 1.783196373519197,
"learning_rate": 9.680080861541511e-06,
"loss": 0.2088,
"step": 441
},
{
"epoch": 0.14,
"grad_norm": 4.952472408705992,
"learning_rate": 9.678254353385538e-06,
"loss": 0.5452,
"step": 442
},
{
"epoch": 0.14,
"grad_norm": 5.609645206584396,
"learning_rate": 9.676422819354785e-06,
"loss": 0.5814,
"step": 443
},
{
"epoch": 0.14,
"grad_norm": 1.8253918239886093,
"learning_rate": 9.674586261416874e-06,
"loss": 0.2326,
"step": 444
},
{
"epoch": 0.14,
"grad_norm": 1.9423878166580468,
"learning_rate": 9.672744681544834e-06,
"loss": 0.3192,
"step": 445
},
{
"epoch": 0.14,
"grad_norm": 1.7183487922754226,
"learning_rate": 9.670898081717079e-06,
"loss": 0.261,
"step": 446
},
{
"epoch": 0.14,
"grad_norm": 1.583231612184501,
"learning_rate": 9.669046463917427e-06,
"loss": 0.1921,
"step": 447
},
{
"epoch": 0.14,
"grad_norm": 1.699976665331866,
"learning_rate": 9.667189830135078e-06,
"loss": 0.2459,
"step": 448
},
{
"epoch": 0.14,
"grad_norm": 4.830208654876044,
"learning_rate": 9.665328182364627e-06,
"loss": 0.5133,
"step": 449
},
{
"epoch": 0.14,
"grad_norm": 8.704400793715795,
"learning_rate": 9.663461522606049e-06,
"loss": 0.6686,
"step": 450
},
{
"epoch": 0.14,
"grad_norm": 6.937812975673912,
"learning_rate": 9.66158985286471e-06,
"loss": 0.388,
"step": 451
},
{
"epoch": 0.14,
"grad_norm": 9.162898390854838,
"learning_rate": 9.659713175151352e-06,
"loss": 0.5718,
"step": 452
},
{
"epoch": 0.14,
"grad_norm": 1.5464286975229098,
"learning_rate": 9.657831491482103e-06,
"loss": 0.2102,
"step": 453
},
{
"epoch": 0.15,
"grad_norm": 9.004355033796678,
"learning_rate": 9.655944803878467e-06,
"loss": 0.5886,
"step": 454
},
{
"epoch": 0.15,
"grad_norm": 1.7914194566193655,
"learning_rate": 9.654053114367321e-06,
"loss": 0.2858,
"step": 455
},
{
"epoch": 0.15,
"grad_norm": 1.7496161140767366,
"learning_rate": 9.65215642498092e-06,
"loss": 0.2231,
"step": 456
},
{
"epoch": 0.15,
"grad_norm": 5.660911158180701,
"learning_rate": 9.650254737756883e-06,
"loss": 0.4718,
"step": 457
},
{
"epoch": 0.15,
"grad_norm": 1.602415790855819,
"learning_rate": 9.648348054738208e-06,
"loss": 0.2206,
"step": 458
},
{
"epoch": 0.15,
"grad_norm": 8.297344269887379,
"learning_rate": 9.646436377973253e-06,
"loss": 0.695,
"step": 459
},
{
"epoch": 0.15,
"grad_norm": 4.562823331457611,
"learning_rate": 9.644519709515746e-06,
"loss": 0.3869,
"step": 460
},
{
"epoch": 0.15,
"grad_norm": 6.510693383446655,
"learning_rate": 9.642598051424772e-06,
"loss": 0.6238,
"step": 461
},
{
"epoch": 0.15,
"grad_norm": 5.338732275568637,
"learning_rate": 9.640671405764777e-06,
"loss": 0.463,
"step": 462
},
{
"epoch": 0.15,
"grad_norm": 1.726729233128839,
"learning_rate": 9.638739774605572e-06,
"loss": 0.2686,
"step": 463
},
{
"epoch": 0.15,
"grad_norm": 7.508232814292006,
"learning_rate": 9.636803160022314e-06,
"loss": 0.7177,
"step": 464
},
{
"epoch": 0.15,
"grad_norm": 1.7462604260074626,
"learning_rate": 9.634861564095525e-06,
"loss": 0.2734,
"step": 465
},
{
"epoch": 0.15,
"grad_norm": 1.547103753887767,
"learning_rate": 9.632914988911066e-06,
"loss": 0.2215,
"step": 466
},
{
"epoch": 0.15,
"grad_norm": 1.65680886983581,
"learning_rate": 9.63096343656016e-06,
"loss": 0.2663,
"step": 467
},
{
"epoch": 0.15,
"grad_norm": 6.9542028644509815,
"learning_rate": 9.629006909139363e-06,
"loss": 0.7216,
"step": 468
},
{
"epoch": 0.15,
"grad_norm": 16.065217703971534,
"learning_rate": 9.62704540875059e-06,
"loss": 0.5666,
"step": 469
},
{
"epoch": 0.15,
"grad_norm": 1.8217908058149368,
"learning_rate": 9.625078937501089e-06,
"loss": 0.2915,
"step": 470
},
{
"epoch": 0.15,
"grad_norm": 5.675952292457328,
"learning_rate": 9.62310749750345e-06,
"loss": 0.54,
"step": 471
},
{
"epoch": 0.15,
"grad_norm": 7.354932909323429,
"learning_rate": 9.621131090875603e-06,
"loss": 0.4529,
"step": 472
},
{
"epoch": 0.15,
"grad_norm": 7.552225342234603,
"learning_rate": 9.619149719740817e-06,
"loss": 0.6706,
"step": 473
},
{
"epoch": 0.15,
"grad_norm": 7.003636609102059,
"learning_rate": 9.617163386227683e-06,
"loss": 0.5179,
"step": 474
},
{
"epoch": 0.15,
"grad_norm": 7.87176422191421,
"learning_rate": 9.615172092470134e-06,
"loss": 0.5432,
"step": 475
},
{
"epoch": 0.15,
"grad_norm": 8.722305589354004,
"learning_rate": 9.613175840607428e-06,
"loss": 0.6106,
"step": 476
},
{
"epoch": 0.15,
"grad_norm": 1.6766957441259023,
"learning_rate": 9.611174632784147e-06,
"loss": 0.222,
"step": 477
},
{
"epoch": 0.15,
"grad_norm": 7.88698944356783,
"learning_rate": 9.609168471150202e-06,
"loss": 0.5604,
"step": 478
},
{
"epoch": 0.15,
"grad_norm": 4.901710771804992,
"learning_rate": 9.607157357860823e-06,
"loss": 0.5331,
"step": 479
},
{
"epoch": 0.15,
"grad_norm": 1.541628104886714,
"learning_rate": 9.605141295076561e-06,
"loss": 0.2418,
"step": 480
},
{
"epoch": 0.15,
"grad_norm": 1.646102352925008,
"learning_rate": 9.603120284963284e-06,
"loss": 0.2368,
"step": 481
},
{
"epoch": 0.15,
"grad_norm": 22.17130760384479,
"learning_rate": 9.601094329692173e-06,
"loss": 0.5453,
"step": 482
},
{
"epoch": 0.15,
"grad_norm": 1.532667603750822,
"learning_rate": 9.599063431439721e-06,
"loss": 0.2178,
"step": 483
},
{
"epoch": 0.15,
"grad_norm": 1.4035418727441649,
"learning_rate": 9.597027592387739e-06,
"loss": 0.2121,
"step": 484
},
{
"epoch": 0.16,
"grad_norm": 4.478680004347673,
"learning_rate": 9.594986814723335e-06,
"loss": 0.4924,
"step": 485
},
{
"epoch": 0.16,
"grad_norm": 7.468955163342549,
"learning_rate": 9.59294110063893e-06,
"loss": 0.8559,
"step": 486
},
{
"epoch": 0.16,
"grad_norm": 7.517207014437163,
"learning_rate": 9.590890452332249e-06,
"loss": 0.7547,
"step": 487
},
{
"epoch": 0.16,
"grad_norm": 1.7386384148451286,
"learning_rate": 9.588834872006308e-06,
"loss": 0.2873,
"step": 488
},
{
"epoch": 0.16,
"grad_norm": 6.111369512347133,
"learning_rate": 9.586774361869436e-06,
"loss": 0.7991,
"step": 489
},
{
"epoch": 0.16,
"grad_norm": 5.818342367685323,
"learning_rate": 9.584708924135245e-06,
"loss": 0.6113,
"step": 490
},
{
"epoch": 0.16,
"grad_norm": 1.8684943149292454,
"learning_rate": 9.582638561022646e-06,
"loss": 0.2406,
"step": 491
},
{
"epoch": 0.16,
"grad_norm": 1.6927560874280256,
"learning_rate": 9.580563274755848e-06,
"loss": 0.2438,
"step": 492
},
{
"epoch": 0.16,
"grad_norm": 7.383883334475161,
"learning_rate": 9.578483067564335e-06,
"loss": 0.5863,
"step": 493
},
{
"epoch": 0.16,
"grad_norm": 4.77457350402351,
"learning_rate": 9.576397941682891e-06,
"loss": 0.6171,
"step": 494
},
{
"epoch": 0.16,
"grad_norm": 1.5213260321597142,
"learning_rate": 9.574307899351574e-06,
"loss": 0.2399,
"step": 495
},
{
"epoch": 0.16,
"grad_norm": 1.6725404783281153,
"learning_rate": 9.572212942815734e-06,
"loss": 0.2136,
"step": 496
},
{
"epoch": 0.16,
"grad_norm": 1.8319233281937581,
"learning_rate": 9.570113074325986e-06,
"loss": 0.2536,
"step": 497
},
{
"epoch": 0.16,
"grad_norm": 14.083688310977637,
"learning_rate": 9.568008296138238e-06,
"loss": 0.7214,
"step": 498
},
{
"epoch": 0.16,
"grad_norm": 1.5351720079812152,
"learning_rate": 9.565898610513661e-06,
"loss": 0.2357,
"step": 499
},
{
"epoch": 0.16,
"grad_norm": 1.846796286369406,
"learning_rate": 9.563784019718704e-06,
"loss": 0.2659,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 9.109412341380644,
"learning_rate": 9.561664526025082e-06,
"loss": 0.6468,
"step": 501
},
{
"epoch": 0.16,
"grad_norm": 4.665032230332076,
"learning_rate": 9.55954013170978e-06,
"loss": 0.5738,
"step": 502
},
{
"epoch": 0.16,
"grad_norm": 11.970552835497465,
"learning_rate": 9.557410839055047e-06,
"loss": 0.4872,
"step": 503
},
{
"epoch": 0.16,
"grad_norm": 11.403070918811682,
"learning_rate": 9.555276650348393e-06,
"loss": 0.7133,
"step": 504
},
{
"epoch": 0.16,
"grad_norm": 1.8223233705669901,
"learning_rate": 9.55313756788259e-06,
"loss": 0.2925,
"step": 505
},
{
"epoch": 0.16,
"grad_norm": 1.852613958622955,
"learning_rate": 9.550993593955665e-06,
"loss": 0.2913,
"step": 506
},
{
"epoch": 0.16,
"grad_norm": 4.498398929916578,
"learning_rate": 9.548844730870903e-06,
"loss": 0.4451,
"step": 507
},
{
"epoch": 0.16,
"grad_norm": 1.648219625284076,
"learning_rate": 9.546690980936836e-06,
"loss": 0.2615,
"step": 508
},
{
"epoch": 0.16,
"grad_norm": 1.5284412497408182,
"learning_rate": 9.544532346467254e-06,
"loss": 0.226,
"step": 509
},
{
"epoch": 0.16,
"grad_norm": 8.983584090394281,
"learning_rate": 9.542368829781186e-06,
"loss": 0.5188,
"step": 510
},
{
"epoch": 0.16,
"grad_norm": 1.625012319223229,
"learning_rate": 9.540200433202913e-06,
"loss": 0.2223,
"step": 511
},
{
"epoch": 0.16,
"grad_norm": 7.848716453870051,
"learning_rate": 9.538027159061955e-06,
"loss": 0.6142,
"step": 512
},
{
"epoch": 0.16,
"grad_norm": 5.619009301696381,
"learning_rate": 9.535849009693072e-06,
"loss": 0.4499,
"step": 513
},
{
"epoch": 0.16,
"grad_norm": 1.5799572961064898,
"learning_rate": 9.533665987436262e-06,
"loss": 0.2796,
"step": 514
},
{
"epoch": 0.16,
"grad_norm": 1.8129121062203717,
"learning_rate": 9.531478094636758e-06,
"loss": 0.2311,
"step": 515
},
{
"epoch": 0.17,
"grad_norm": 5.939978021754256,
"learning_rate": 9.529285333645027e-06,
"loss": 0.4902,
"step": 516
},
{
"epoch": 0.17,
"grad_norm": 4.379150393398199,
"learning_rate": 9.527087706816762e-06,
"loss": 0.4729,
"step": 517
},
{
"epoch": 0.17,
"grad_norm": 1.885728998582742,
"learning_rate": 9.524885216512887e-06,
"loss": 0.2484,
"step": 518
},
{
"epoch": 0.17,
"grad_norm": 4.846423738543627,
"learning_rate": 9.522677865099548e-06,
"loss": 0.4624,
"step": 519
},
{
"epoch": 0.17,
"grad_norm": 1.7713682514140527,
"learning_rate": 9.520465654948119e-06,
"loss": 0.2428,
"step": 520
},
{
"epoch": 0.17,
"grad_norm": 1.6484256990751187,
"learning_rate": 9.518248588435185e-06,
"loss": 0.2728,
"step": 521
},
{
"epoch": 0.17,
"grad_norm": 1.85025912629073,
"learning_rate": 9.516026667942557e-06,
"loss": 0.2968,
"step": 522
},
{
"epoch": 0.17,
"grad_norm": 1.472425467009718,
"learning_rate": 9.513799895857252e-06,
"loss": 0.2189,
"step": 523
},
{
"epoch": 0.17,
"grad_norm": 1.8324029561679585,
"learning_rate": 9.511568274571508e-06,
"loss": 0.2788,
"step": 524
},
{
"epoch": 0.17,
"grad_norm": 1.853724342533399,
"learning_rate": 9.509331806482767e-06,
"loss": 0.3316,
"step": 525
},
{
"epoch": 0.17,
"grad_norm": 1.7662162335150542,
"learning_rate": 9.507090493993677e-06,
"loss": 0.2845,
"step": 526
},
{
"epoch": 0.17,
"grad_norm": 6.857778864737697,
"learning_rate": 9.504844339512096e-06,
"loss": 0.8126,
"step": 527
},
{
"epoch": 0.17,
"grad_norm": 1.4154894498443327,
"learning_rate": 9.502593345451078e-06,
"loss": 0.234,
"step": 528
},
{
"epoch": 0.17,
"grad_norm": 1.559160109741637,
"learning_rate": 9.500337514228878e-06,
"loss": 0.2134,
"step": 529
},
{
"epoch": 0.17,
"grad_norm": 1.7392915493940049,
"learning_rate": 9.49807684826895e-06,
"loss": 0.2505,
"step": 530
},
{
"epoch": 0.17,
"grad_norm": 5.7068179181213345,
"learning_rate": 9.495811349999941e-06,
"loss": 0.6602,
"step": 531
},
{
"epoch": 0.17,
"grad_norm": 8.078581062271683,
"learning_rate": 9.493541021855685e-06,
"loss": 0.5826,
"step": 532
},
{
"epoch": 0.17,
"grad_norm": 1.5377398073960513,
"learning_rate": 9.49126586627521e-06,
"loss": 0.2403,
"step": 533
},
{
"epoch": 0.17,
"grad_norm": 2.1239080125298515,
"learning_rate": 9.488985885702728e-06,
"loss": 0.2966,
"step": 534
},
{
"epoch": 0.17,
"grad_norm": 1.6240974393234549,
"learning_rate": 9.486701082587635e-06,
"loss": 0.2331,
"step": 535
},
{
"epoch": 0.17,
"grad_norm": 1.6747138927073455,
"learning_rate": 9.484411459384508e-06,
"loss": 0.2135,
"step": 536
},
{
"epoch": 0.17,
"grad_norm": 7.124509975698555,
"learning_rate": 9.482117018553101e-06,
"loss": 0.4871,
"step": 537
},
{
"epoch": 0.17,
"grad_norm": 5.410259795106361,
"learning_rate": 9.479817762558345e-06,
"loss": 0.7595,
"step": 538
},
{
"epoch": 0.17,
"grad_norm": 1.9126348752324163,
"learning_rate": 9.477513693870347e-06,
"loss": 0.2562,
"step": 539
},
{
"epoch": 0.17,
"grad_norm": 5.776277118670486,
"learning_rate": 9.475204814964374e-06,
"loss": 0.4539,
"step": 540
},
{
"epoch": 0.17,
"grad_norm": 8.454367566571307,
"learning_rate": 9.472891128320874e-06,
"loss": 0.5973,
"step": 541
},
{
"epoch": 0.17,
"grad_norm": 5.913077225031681,
"learning_rate": 9.470572636425451e-06,
"loss": 0.6188,
"step": 542
},
{
"epoch": 0.17,
"grad_norm": 6.090466598908505,
"learning_rate": 9.46824934176887e-06,
"loss": 0.6631,
"step": 543
},
{
"epoch": 0.17,
"grad_norm": 6.198244063689312,
"learning_rate": 9.465921246847067e-06,
"loss": 0.5703,
"step": 544
},
{
"epoch": 0.17,
"grad_norm": 1.660589900649375,
"learning_rate": 9.463588354161122e-06,
"loss": 0.2352,
"step": 545
},
{
"epoch": 0.17,
"grad_norm": 8.707932228113194,
"learning_rate": 9.461250666217277e-06,
"loss": 0.6988,
"step": 546
},
{
"epoch": 0.18,
"grad_norm": 1.6480449260174401,
"learning_rate": 9.458908185526921e-06,
"loss": 0.2167,
"step": 547
},
{
"epoch": 0.18,
"grad_norm": 1.746732564216852,
"learning_rate": 9.456560914606594e-06,
"loss": 0.2634,
"step": 548
},
{
"epoch": 0.18,
"grad_norm": 9.349519321728842,
"learning_rate": 9.454208855977986e-06,
"loss": 0.5316,
"step": 549
},
{
"epoch": 0.18,
"grad_norm": 5.516413135371106,
"learning_rate": 9.451852012167924e-06,
"loss": 0.7709,
"step": 550
},
{
"epoch": 0.18,
"grad_norm": 17.400695990877335,
"learning_rate": 9.449490385708378e-06,
"loss": 0.5955,
"step": 551
},
{
"epoch": 0.18,
"grad_norm": 1.6392762842053246,
"learning_rate": 9.447123979136457e-06,
"loss": 0.2294,
"step": 552
},
{
"epoch": 0.18,
"grad_norm": 5.600355142882258,
"learning_rate": 9.444752794994408e-06,
"loss": 0.6541,
"step": 553
},
{
"epoch": 0.18,
"grad_norm": 1.5746993685116102,
"learning_rate": 9.4423768358296e-06,
"loss": 0.2396,
"step": 554
},
{
"epoch": 0.18,
"grad_norm": 1.7105327612570742,
"learning_rate": 9.439996104194546e-06,
"loss": 0.2193,
"step": 555
},
{
"epoch": 0.18,
"grad_norm": 7.441655802757599,
"learning_rate": 9.437610602646878e-06,
"loss": 0.6482,
"step": 556
},
{
"epoch": 0.18,
"grad_norm": 6.95945207101213,
"learning_rate": 9.43522033374935e-06,
"loss": 0.4506,
"step": 557
},
{
"epoch": 0.18,
"grad_norm": 5.44751626018781,
"learning_rate": 9.432825300069848e-06,
"loss": 0.5949,
"step": 558
},
{
"epoch": 0.18,
"grad_norm": 5.133345036398373,
"learning_rate": 9.430425504181361e-06,
"loss": 0.5625,
"step": 559
},
{
"epoch": 0.18,
"grad_norm": 1.793145741950075,
"learning_rate": 9.428020948662012e-06,
"loss": 0.251,
"step": 560
},
{
"epoch": 0.18,
"grad_norm": 5.737725174283238,
"learning_rate": 9.425611636095023e-06,
"loss": 0.5844,
"step": 561
},
{
"epoch": 0.18,
"grad_norm": 1.5754937662547965,
"learning_rate": 9.423197569068733e-06,
"loss": 0.2238,
"step": 562
},
{
"epoch": 0.18,
"grad_norm": 4.114959234043904,
"learning_rate": 9.420778750176588e-06,
"loss": 0.3734,
"step": 563
},
{
"epoch": 0.18,
"grad_norm": 1.6458700109490814,
"learning_rate": 9.418355182017138e-06,
"loss": 0.2593,
"step": 564
},
{
"epoch": 0.18,
"grad_norm": 7.9018071590278565,
"learning_rate": 9.41592686719404e-06,
"loss": 0.7318,
"step": 565
},
{
"epoch": 0.18,
"grad_norm": 1.5511525841874374,
"learning_rate": 9.413493808316038e-06,
"loss": 0.2632,
"step": 566
},
{
"epoch": 0.18,
"grad_norm": 4.173891491715113,
"learning_rate": 9.411056007996989e-06,
"loss": 0.6797,
"step": 567
},
{
"epoch": 0.18,
"grad_norm": 1.5845364864420737,
"learning_rate": 9.408613468855829e-06,
"loss": 0.241,
"step": 568
},
{
"epoch": 0.18,
"grad_norm": 1.686902076792496,
"learning_rate": 9.406166193516596e-06,
"loss": 0.2577,
"step": 569
},
{
"epoch": 0.18,
"grad_norm": 4.920333098863435,
"learning_rate": 9.403714184608411e-06,
"loss": 0.4498,
"step": 570
},
{
"epoch": 0.18,
"grad_norm": 5.364993266959027,
"learning_rate": 9.40125744476548e-06,
"loss": 0.5533,
"step": 571
},
{
"epoch": 0.18,
"grad_norm": 1.6173874199022953,
"learning_rate": 9.398795976627091e-06,
"loss": 0.276,
"step": 572
},
{
"epoch": 0.18,
"grad_norm": 1.6041241221350935,
"learning_rate": 9.396329782837614e-06,
"loss": 0.2537,
"step": 573
},
{
"epoch": 0.18,
"grad_norm": 5.644765129803302,
"learning_rate": 9.393858866046494e-06,
"loss": 0.6233,
"step": 574
},
{
"epoch": 0.18,
"grad_norm": 6.940660129614199,
"learning_rate": 9.391383228908253e-06,
"loss": 0.6914,
"step": 575
},
{
"epoch": 0.18,
"grad_norm": 1.624608021417376,
"learning_rate": 9.388902874082482e-06,
"loss": 0.2075,
"step": 576
},
{
"epoch": 0.18,
"grad_norm": 1.6578572047598017,
"learning_rate": 9.386417804233836e-06,
"loss": 0.2461,
"step": 577
},
{
"epoch": 0.18,
"grad_norm": 1.6521043358538268,
"learning_rate": 9.383928022032044e-06,
"loss": 0.2652,
"step": 578
},
{
"epoch": 0.19,
"grad_norm": 6.970230658708915,
"learning_rate": 9.381433530151887e-06,
"loss": 0.5884,
"step": 579
},
{
"epoch": 0.19,
"grad_norm": 1.7553736548624894,
"learning_rate": 9.37893433127322e-06,
"loss": 0.2887,
"step": 580
},
{
"epoch": 0.19,
"grad_norm": 1.9350946570850633,
"learning_rate": 9.376430428080939e-06,
"loss": 0.2824,
"step": 581
},
{
"epoch": 0.19,
"grad_norm": 1.7506675874139501,
"learning_rate": 9.373921823265004e-06,
"loss": 0.2994,
"step": 582
},
{
"epoch": 0.19,
"grad_norm": 1.7958765064076347,
"learning_rate": 9.371408519520421e-06,
"loss": 0.2839,
"step": 583
},
{
"epoch": 0.19,
"grad_norm": 1.589297384908829,
"learning_rate": 9.36889051954725e-06,
"loss": 0.2233,
"step": 584
},
{
"epoch": 0.19,
"grad_norm": 1.5861450409894153,
"learning_rate": 9.366367826050593e-06,
"loss": 0.2131,
"step": 585
},
{
"epoch": 0.19,
"grad_norm": 1.6207442015067852,
"learning_rate": 9.36384044174059e-06,
"loss": 0.2613,
"step": 586
},
{
"epoch": 0.19,
"grad_norm": 1.6179364940734717,
"learning_rate": 9.361308369332426e-06,
"loss": 0.2564,
"step": 587
},
{
"epoch": 0.19,
"grad_norm": 1.4492002384966378,
"learning_rate": 9.358771611546319e-06,
"loss": 0.2223,
"step": 588
},
{
"epoch": 0.19,
"grad_norm": 6.026513940478561,
"learning_rate": 9.356230171107524e-06,
"loss": 0.625,
"step": 589
},
{
"epoch": 0.19,
"grad_norm": 4.252908040021938,
"learning_rate": 9.353684050746323e-06,
"loss": 0.5486,
"step": 590
},
{
"epoch": 0.19,
"grad_norm": 4.585690047708393,
"learning_rate": 9.351133253198027e-06,
"loss": 0.508,
"step": 591
},
{
"epoch": 0.19,
"grad_norm": 1.8093757029592847,
"learning_rate": 9.348577781202976e-06,
"loss": 0.3174,
"step": 592
},
{
"epoch": 0.19,
"grad_norm": 4.8059623125324125,
"learning_rate": 9.346017637506523e-06,
"loss": 0.6029,
"step": 593
},
{
"epoch": 0.19,
"grad_norm": 4.820157002934122,
"learning_rate": 9.343452824859048e-06,
"loss": 0.6118,
"step": 594
},
{
"epoch": 0.19,
"grad_norm": 4.792292745536942,
"learning_rate": 9.340883346015941e-06,
"loss": 0.4967,
"step": 595
},
{
"epoch": 0.19,
"grad_norm": 5.778832171325122,
"learning_rate": 9.338309203737609e-06,
"loss": 0.6744,
"step": 596
},
{
"epoch": 0.19,
"grad_norm": 4.150228699187317,
"learning_rate": 9.335730400789466e-06,
"loss": 0.5699,
"step": 597
},
{
"epoch": 0.19,
"grad_norm": 7.722756882503639,
"learning_rate": 9.333146939941938e-06,
"loss": 0.6951,
"step": 598
},
{
"epoch": 0.19,
"grad_norm": 5.296551030338628,
"learning_rate": 9.330558823970448e-06,
"loss": 0.6075,
"step": 599
},
{
"epoch": 0.19,
"grad_norm": 1.6248021042250564,
"learning_rate": 9.327966055655424e-06,
"loss": 0.2108,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 5.8872539663544226,
"learning_rate": 9.325368637782292e-06,
"loss": 0.427,
"step": 601
},
{
"epoch": 0.19,
"grad_norm": 1.5516053060269306,
"learning_rate": 9.322766573141473e-06,
"loss": 0.2381,
"step": 602
},
{
"epoch": 0.19,
"grad_norm": 8.414158151979361,
"learning_rate": 9.320159864528378e-06,
"loss": 0.61,
"step": 603
},
{
"epoch": 0.19,
"grad_norm": 1.4150529791591346,
"learning_rate": 9.31754851474341e-06,
"loss": 0.2366,
"step": 604
},
{
"epoch": 0.19,
"grad_norm": 6.312607193376232,
"learning_rate": 9.314932526591956e-06,
"loss": 0.6776,
"step": 605
},
{
"epoch": 0.19,
"grad_norm": 9.194739133513545,
"learning_rate": 9.312311902884388e-06,
"loss": 0.4316,
"step": 606
},
{
"epoch": 0.19,
"grad_norm": 5.343246517463489,
"learning_rate": 9.309686646436053e-06,
"loss": 0.5531,
"step": 607
},
{
"epoch": 0.19,
"grad_norm": 5.920143525678418,
"learning_rate": 9.307056760067284e-06,
"loss": 0.7927,
"step": 608
},
{
"epoch": 0.19,
"grad_norm": 5.312603829868963,
"learning_rate": 9.30442224660338e-06,
"loss": 0.6748,
"step": 609
},
{
"epoch": 0.2,
"grad_norm": 7.410183273720859,
"learning_rate": 9.301783108874611e-06,
"loss": 0.4833,
"step": 610
},
{
"epoch": 0.2,
"grad_norm": 6.241546780821695,
"learning_rate": 9.299139349716221e-06,
"loss": 0.6342,
"step": 611
},
{
"epoch": 0.2,
"grad_norm": 6.614197398378457,
"learning_rate": 9.296490971968416e-06,
"loss": 0.7108,
"step": 612
},
{
"epoch": 0.2,
"grad_norm": 11.12842756885836,
"learning_rate": 9.293837978476359e-06,
"loss": 0.7183,
"step": 613
},
{
"epoch": 0.2,
"grad_norm": 1.5690747712727964,
"learning_rate": 9.291180372090178e-06,
"loss": 0.2025,
"step": 614
},
{
"epoch": 0.2,
"grad_norm": 7.567264761751759,
"learning_rate": 9.288518155664956e-06,
"loss": 0.5909,
"step": 615
},
{
"epoch": 0.2,
"grad_norm": 1.7632136067242392,
"learning_rate": 9.285851332060722e-06,
"loss": 0.2026,
"step": 616
},
{
"epoch": 0.2,
"grad_norm": 6.789591111745626,
"learning_rate": 9.283179904142465e-06,
"loss": 0.7386,
"step": 617
},
{
"epoch": 0.2,
"grad_norm": 5.91600337942705,
"learning_rate": 9.280503874780112e-06,
"loss": 0.628,
"step": 618
},
{
"epoch": 0.2,
"grad_norm": 6.981500952396759,
"learning_rate": 9.277823246848537e-06,
"loss": 0.5649,
"step": 619
},
{
"epoch": 0.2,
"grad_norm": 1.8397071025068248,
"learning_rate": 9.275138023227555e-06,
"loss": 0.286,
"step": 620
},
{
"epoch": 0.2,
"grad_norm": 1.607878832996544,
"learning_rate": 9.272448206801912e-06,
"loss": 0.2084,
"step": 621
},
{
"epoch": 0.2,
"grad_norm": 7.555829633070186,
"learning_rate": 9.269753800461299e-06,
"loss": 0.5523,
"step": 622
},
{
"epoch": 0.2,
"grad_norm": 9.27324587935331,
"learning_rate": 9.267054807100327e-06,
"loss": 0.5827,
"step": 623
},
{
"epoch": 0.2,
"grad_norm": 1.7081135272874568,
"learning_rate": 9.264351229618541e-06,
"loss": 0.2807,
"step": 624
},
{
"epoch": 0.2,
"grad_norm": 1.62276817476442,
"learning_rate": 9.261643070920409e-06,
"loss": 0.2197,
"step": 625
},
{
"epoch": 0.2,
"grad_norm": 9.382840642334658,
"learning_rate": 9.258930333915325e-06,
"loss": 0.5187,
"step": 626
},
{
"epoch": 0.2,
"grad_norm": 6.020399116685949,
"learning_rate": 9.256213021517593e-06,
"loss": 0.6994,
"step": 627
},
{
"epoch": 0.2,
"grad_norm": 1.551810172772469,
"learning_rate": 9.253491136646437e-06,
"loss": 0.1855,
"step": 628
},
{
"epoch": 0.2,
"grad_norm": 12.477242115295764,
"learning_rate": 9.250764682225997e-06,
"loss": 0.5945,
"step": 629
},
{
"epoch": 0.2,
"grad_norm": 2.023072932974826,
"learning_rate": 9.248033661185313e-06,
"loss": 0.2755,
"step": 630
},
{
"epoch": 0.2,
"grad_norm": 1.5946135073812442,
"learning_rate": 9.24529807645834e-06,
"loss": 0.2641,
"step": 631
},
{
"epoch": 0.2,
"grad_norm": 1.8137331869914235,
"learning_rate": 9.24255793098393e-06,
"loss": 0.2332,
"step": 632
},
{
"epoch": 0.2,
"grad_norm": 6.142954377653386,
"learning_rate": 9.23981322770584e-06,
"loss": 0.4164,
"step": 633
},
{
"epoch": 0.2,
"grad_norm": 1.6930437395931752,
"learning_rate": 9.237063969572713e-06,
"loss": 0.2722,
"step": 634
},
{
"epoch": 0.2,
"grad_norm": 1.6418304944887714,
"learning_rate": 9.2343101595381e-06,
"loss": 0.2373,
"step": 635
},
{
"epoch": 0.2,
"grad_norm": 1.4910986023588277,
"learning_rate": 9.23155180056043e-06,
"loss": 0.198,
"step": 636
},
{
"epoch": 0.2,
"grad_norm": 23.14030810895617,
"learning_rate": 9.228788895603024e-06,
"loss": 0.5749,
"step": 637
},
{
"epoch": 0.2,
"grad_norm": 5.76419133673438,
"learning_rate": 9.226021447634085e-06,
"loss": 0.5483,
"step": 638
},
{
"epoch": 0.2,
"grad_norm": 7.3039110909254825,
"learning_rate": 9.223249459626704e-06,
"loss": 0.6503,
"step": 639
},
{
"epoch": 0.2,
"grad_norm": 1.5353953125637876,
"learning_rate": 9.220472934558838e-06,
"loss": 0.2744,
"step": 640
},
{
"epoch": 0.21,
"grad_norm": 1.562708677756112,
"learning_rate": 9.217691875413323e-06,
"loss": 0.2552,
"step": 641
},
{
"epoch": 0.21,
"grad_norm": 1.7750167517256847,
"learning_rate": 9.214906285177867e-06,
"loss": 0.2382,
"step": 642
},
{
"epoch": 0.21,
"grad_norm": 1.4632002541152762,
"learning_rate": 9.212116166845048e-06,
"loss": 0.2458,
"step": 643
},
{
"epoch": 0.21,
"grad_norm": 1.629934994703476,
"learning_rate": 9.209321523412303e-06,
"loss": 0.2484,
"step": 644
},
{
"epoch": 0.21,
"grad_norm": 1.5393945631387778,
"learning_rate": 9.206522357881931e-06,
"loss": 0.2483,
"step": 645
},
{
"epoch": 0.21,
"grad_norm": 5.128491776174643,
"learning_rate": 9.203718673261098e-06,
"loss": 0.4915,
"step": 646
},
{
"epoch": 0.21,
"grad_norm": 1.7357749156562752,
"learning_rate": 9.20091047256181e-06,
"loss": 0.2484,
"step": 647
},
{
"epoch": 0.21,
"grad_norm": 1.386563520463881,
"learning_rate": 9.198097758800938e-06,
"loss": 0.2054,
"step": 648
},
{
"epoch": 0.21,
"grad_norm": 1.586299737267633,
"learning_rate": 9.195280535000196e-06,
"loss": 0.2761,
"step": 649
},
{
"epoch": 0.21,
"grad_norm": 1.607760059641086,
"learning_rate": 9.19245880418614e-06,
"loss": 0.227,
"step": 650
},
{
"epoch": 0.21,
"grad_norm": 8.145264164527298,
"learning_rate": 9.189632569390172e-06,
"loss": 0.6212,
"step": 651
},
{
"epoch": 0.21,
"grad_norm": 1.5998220170775523,
"learning_rate": 9.186801833648535e-06,
"loss": 0.2534,
"step": 652
},
{
"epoch": 0.21,
"grad_norm": 27.003441048008447,
"learning_rate": 9.183966600002301e-06,
"loss": 0.4521,
"step": 653
},
{
"epoch": 0.21,
"grad_norm": 18.871874110217032,
"learning_rate": 9.181126871497378e-06,
"loss": 0.7232,
"step": 654
},
{
"epoch": 0.21,
"grad_norm": 6.596082007569886,
"learning_rate": 9.178282651184506e-06,
"loss": 0.5631,
"step": 655
},
{
"epoch": 0.21,
"grad_norm": 7.388392936405897,
"learning_rate": 9.175433942119238e-06,
"loss": 0.6402,
"step": 656
},
{
"epoch": 0.21,
"grad_norm": 1.768985853028781,
"learning_rate": 9.172580747361968e-06,
"loss": 0.3014,
"step": 657
},
{
"epoch": 0.21,
"grad_norm": 1.7419466909192685,
"learning_rate": 9.169723069977892e-06,
"loss": 0.2612,
"step": 658
},
{
"epoch": 0.21,
"grad_norm": 1.8395910363602748,
"learning_rate": 9.166860913037032e-06,
"loss": 0.246,
"step": 659
},
{
"epoch": 0.21,
"grad_norm": 1.5135649476546447,
"learning_rate": 9.163994279614218e-06,
"loss": 0.2318,
"step": 660
},
{
"epoch": 0.21,
"grad_norm": 12.152057367824401,
"learning_rate": 9.161123172789091e-06,
"loss": 0.4432,
"step": 661
},
{
"epoch": 0.21,
"grad_norm": 1.4904021731698984,
"learning_rate": 9.158247595646098e-06,
"loss": 0.2012,
"step": 662
},
{
"epoch": 0.21,
"grad_norm": 5.699349685689834,
"learning_rate": 9.155367551274485e-06,
"loss": 0.4084,
"step": 663
},
{
"epoch": 0.21,
"grad_norm": 7.3980368624923925,
"learning_rate": 9.152483042768302e-06,
"loss": 0.5378,
"step": 664
},
{
"epoch": 0.21,
"grad_norm": 1.6587560737993832,
"learning_rate": 9.149594073226391e-06,
"loss": 0.2606,
"step": 665
},
{
"epoch": 0.21,
"grad_norm": 1.599962130861489,
"learning_rate": 9.14670064575239e-06,
"loss": 0.2274,
"step": 666
},
{
"epoch": 0.21,
"grad_norm": 1.7445914210601148,
"learning_rate": 9.143802763454723e-06,
"loss": 0.266,
"step": 667
},
{
"epoch": 0.21,
"grad_norm": 4.49859542583798,
"learning_rate": 9.140900429446601e-06,
"loss": 0.7648,
"step": 668
},
{
"epoch": 0.21,
"grad_norm": 1.5330251303876583,
"learning_rate": 9.137993646846018e-06,
"loss": 0.2794,
"step": 669
},
{
"epoch": 0.21,
"grad_norm": 1.5084724425102092,
"learning_rate": 9.135082418775746e-06,
"loss": 0.2433,
"step": 670
},
{
"epoch": 0.21,
"grad_norm": 1.6364880389220589,
"learning_rate": 9.132166748363335e-06,
"loss": 0.2534,
"step": 671
},
{
"epoch": 0.22,
"grad_norm": 8.579718620310658,
"learning_rate": 9.129246638741108e-06,
"loss": 0.6495,
"step": 672
},
{
"epoch": 0.22,
"grad_norm": 1.6621812549035762,
"learning_rate": 9.126322093046149e-06,
"loss": 0.2948,
"step": 673
},
{
"epoch": 0.22,
"grad_norm": 13.310540789425271,
"learning_rate": 9.123393114420318e-06,
"loss": 0.5038,
"step": 674
},
{
"epoch": 0.22,
"grad_norm": 6.525291196096151,
"learning_rate": 9.120459706010233e-06,
"loss": 0.6439,
"step": 675
},
{
"epoch": 0.22,
"grad_norm": 5.700086882795741,
"learning_rate": 9.11752187096727e-06,
"loss": 0.3714,
"step": 676
},
{
"epoch": 0.22,
"grad_norm": 9.559097162394567,
"learning_rate": 9.114579612447562e-06,
"loss": 0.7145,
"step": 677
},
{
"epoch": 0.22,
"grad_norm": 7.429212102267612,
"learning_rate": 9.111632933611993e-06,
"loss": 0.4803,
"step": 678
},
{
"epoch": 0.22,
"grad_norm": 1.7216462064235085,
"learning_rate": 9.108681837626199e-06,
"loss": 0.2794,
"step": 679
},
{
"epoch": 0.22,
"grad_norm": 12.791478411516074,
"learning_rate": 9.105726327660556e-06,
"loss": 0.7364,
"step": 680
},
{
"epoch": 0.22,
"grad_norm": 6.133174155539079,
"learning_rate": 9.102766406890185e-06,
"loss": 0.7558,
"step": 681
},
{
"epoch": 0.22,
"grad_norm": 1.6929638635543172,
"learning_rate": 9.099802078494947e-06,
"loss": 0.2172,
"step": 682
},
{
"epoch": 0.22,
"grad_norm": 1.648205135013034,
"learning_rate": 9.096833345659437e-06,
"loss": 0.2948,
"step": 683
},
{
"epoch": 0.22,
"grad_norm": 1.5931274816148164,
"learning_rate": 9.09386021157298e-06,
"loss": 0.2752,
"step": 684
},
{
"epoch": 0.22,
"grad_norm": 19.09722886984461,
"learning_rate": 9.09088267942963e-06,
"loss": 0.5787,
"step": 685
},
{
"epoch": 0.22,
"grad_norm": 7.094242815810287,
"learning_rate": 9.087900752428168e-06,
"loss": 0.6867,
"step": 686
},
{
"epoch": 0.22,
"grad_norm": 8.952583171505099,
"learning_rate": 9.084914433772094e-06,
"loss": 0.4357,
"step": 687
},
{
"epoch": 0.22,
"grad_norm": 10.097596781830811,
"learning_rate": 9.081923726669626e-06,
"loss": 0.432,
"step": 688
},
{
"epoch": 0.22,
"grad_norm": 1.6240538724758538,
"learning_rate": 9.0789286343337e-06,
"loss": 0.2482,
"step": 689
},
{
"epoch": 0.22,
"grad_norm": 1.7394940860333676,
"learning_rate": 9.075929159981957e-06,
"loss": 0.2349,
"step": 690
},
{
"epoch": 0.22,
"grad_norm": 6.247495284748309,
"learning_rate": 9.072925306836751e-06,
"loss": 0.6756,
"step": 691
},
{
"epoch": 0.22,
"grad_norm": 2.644825838147394,
"learning_rate": 9.06991707812514e-06,
"loss": 0.2562,
"step": 692
},
{
"epoch": 0.22,
"grad_norm": 4.9425686233157125,
"learning_rate": 9.066904477078875e-06,
"loss": 0.5812,
"step": 693
},
{
"epoch": 0.22,
"grad_norm": 8.611710175765456,
"learning_rate": 9.063887506934417e-06,
"loss": 0.541,
"step": 694
},
{
"epoch": 0.22,
"grad_norm": 9.798715952262672,
"learning_rate": 9.06086617093291e-06,
"loss": 0.6091,
"step": 695
},
{
"epoch": 0.22,
"grad_norm": 5.090455849781971,
"learning_rate": 9.057840472320192e-06,
"loss": 0.369,
"step": 696
},
{
"epoch": 0.22,
"grad_norm": 4.724373661013194,
"learning_rate": 9.054810414346789e-06,
"loss": 0.525,
"step": 697
},
{
"epoch": 0.22,
"grad_norm": 4.634085714902342,
"learning_rate": 9.05177600026791e-06,
"loss": 0.5811,
"step": 698
},
{
"epoch": 0.22,
"grad_norm": 1.6644096217496105,
"learning_rate": 9.048737233343442e-06,
"loss": 0.2294,
"step": 699
},
{
"epoch": 0.22,
"grad_norm": 1.5592779619156767,
"learning_rate": 9.045694116837948e-06,
"loss": 0.2123,
"step": 700
},
{
"epoch": 0.22,
"grad_norm": 1.587281520082312,
"learning_rate": 9.042646654020667e-06,
"loss": 0.2175,
"step": 701
},
{
"epoch": 0.22,
"grad_norm": 1.67902318285347,
"learning_rate": 9.039594848165507e-06,
"loss": 0.2544,
"step": 702
},
{
"epoch": 0.22,
"grad_norm": 6.287298171598728,
"learning_rate": 9.036538702551037e-06,
"loss": 0.5301,
"step": 703
},
{
"epoch": 0.23,
"grad_norm": 8.160902954056166,
"learning_rate": 9.03347822046049e-06,
"loss": 0.7306,
"step": 704
},
{
"epoch": 0.23,
"grad_norm": 24.09115381418598,
"learning_rate": 9.03041340518176e-06,
"loss": 0.6126,
"step": 705
},
{
"epoch": 0.23,
"grad_norm": 9.806039321186942,
"learning_rate": 9.027344260007401e-06,
"loss": 0.5585,
"step": 706
},
{
"epoch": 0.23,
"grad_norm": 7.069193941856005,
"learning_rate": 9.024270788234606e-06,
"loss": 0.499,
"step": 707
},
{
"epoch": 0.23,
"grad_norm": 1.5194477737410057,
"learning_rate": 9.021192993165224e-06,
"loss": 0.2322,
"step": 708
},
{
"epoch": 0.23,
"grad_norm": 6.0641664147259835,
"learning_rate": 9.01811087810575e-06,
"loss": 0.6656,
"step": 709
},
{
"epoch": 0.23,
"grad_norm": 2.5206543795362655,
"learning_rate": 9.015024446367315e-06,
"loss": 0.2521,
"step": 710
},
{
"epoch": 0.23,
"grad_norm": 2.0917356658790958,
"learning_rate": 9.01193370126569e-06,
"loss": 0.2604,
"step": 711
},
{
"epoch": 0.23,
"grad_norm": 9.499645522937357,
"learning_rate": 9.008838646121282e-06,
"loss": 0.5874,
"step": 712
},
{
"epoch": 0.23,
"grad_norm": 1.8038572094239258,
"learning_rate": 9.005739284259123e-06,
"loss": 0.2524,
"step": 713
},
{
"epoch": 0.23,
"grad_norm": 7.157493390955907,
"learning_rate": 9.002635619008877e-06,
"loss": 0.5076,
"step": 714
},
{
"epoch": 0.23,
"grad_norm": 6.397558022513768,
"learning_rate": 8.999527653704829e-06,
"loss": 0.547,
"step": 715
},
{
"epoch": 0.23,
"grad_norm": 4.619919620826792,
"learning_rate": 8.996415391685882e-06,
"loss": 0.4299,
"step": 716
},
{
"epoch": 0.23,
"grad_norm": 6.529113341573041,
"learning_rate": 8.993298836295556e-06,
"loss": 0.6004,
"step": 717
},
{
"epoch": 0.23,
"grad_norm": 8.48129945705957,
"learning_rate": 8.990177990881986e-06,
"loss": 0.6099,
"step": 718
},
{
"epoch": 0.23,
"grad_norm": 1.7333448005910954,
"learning_rate": 8.987052858797914e-06,
"loss": 0.2503,
"step": 719
},
{
"epoch": 0.23,
"grad_norm": 1.6374799656813561,
"learning_rate": 8.983923443400682e-06,
"loss": 0.2465,
"step": 720
},
{
"epoch": 0.23,
"grad_norm": 1.5156187701860444,
"learning_rate": 8.980789748052245e-06,
"loss": 0.2298,
"step": 721
},
{
"epoch": 0.23,
"grad_norm": 1.5643473209138472,
"learning_rate": 8.977651776119145e-06,
"loss": 0.2259,
"step": 722
},
{
"epoch": 0.23,
"grad_norm": 1.376360105350978,
"learning_rate": 8.974509530972523e-06,
"loss": 0.183,
"step": 723
},
{
"epoch": 0.23,
"grad_norm": 1.64982442464039,
"learning_rate": 8.971363015988115e-06,
"loss": 0.2379,
"step": 724
},
{
"epoch": 0.23,
"grad_norm": 7.018141561272961,
"learning_rate": 8.968212234546235e-06,
"loss": 0.544,
"step": 725
},
{
"epoch": 0.23,
"grad_norm": 6.31348366155464,
"learning_rate": 8.965057190031785e-06,
"loss": 0.502,
"step": 726
},
{
"epoch": 0.23,
"grad_norm": 1.5730767002449284,
"learning_rate": 8.961897885834247e-06,
"loss": 0.217,
"step": 727
},
{
"epoch": 0.23,
"grad_norm": 1.4574113349478706,
"learning_rate": 8.958734325347684e-06,
"loss": 0.1932,
"step": 728
},
{
"epoch": 0.23,
"grad_norm": 1.5853274210824488,
"learning_rate": 8.955566511970721e-06,
"loss": 0.2622,
"step": 729
},
{
"epoch": 0.23,
"grad_norm": 7.869412759007845,
"learning_rate": 8.95239444910656e-06,
"loss": 0.6187,
"step": 730
},
{
"epoch": 0.23,
"grad_norm": 1.493489624359773,
"learning_rate": 8.949218140162965e-06,
"loss": 0.1976,
"step": 731
},
{
"epoch": 0.23,
"grad_norm": 9.868875837133936,
"learning_rate": 8.946037588552266e-06,
"loss": 0.7038,
"step": 732
},
{
"epoch": 0.23,
"grad_norm": 1.5670087425538508,
"learning_rate": 8.94285279769134e-06,
"loss": 0.2122,
"step": 733
},
{
"epoch": 0.23,
"grad_norm": 7.262462716485521,
"learning_rate": 8.939663771001632e-06,
"loss": 0.5317,
"step": 734
},
{
"epoch": 0.24,
"grad_norm": 1.7074620966901546,
"learning_rate": 8.93647051190913e-06,
"loss": 0.2161,
"step": 735
},
{
"epoch": 0.24,
"grad_norm": 1.563088640604172,
"learning_rate": 8.93327302384437e-06,
"loss": 0.2535,
"step": 736
},
{
"epoch": 0.24,
"grad_norm": 1.7432968666284017,
"learning_rate": 8.930071310242429e-06,
"loss": 0.2418,
"step": 737
},
{
"epoch": 0.24,
"grad_norm": 5.530145271928336,
"learning_rate": 8.926865374542928e-06,
"loss": 0.6239,
"step": 738
},
{
"epoch": 0.24,
"grad_norm": 1.6816103340065633,
"learning_rate": 8.92365522019002e-06,
"loss": 0.2183,
"step": 739
},
{
"epoch": 0.24,
"grad_norm": 1.3399913301251825,
"learning_rate": 8.920440850632395e-06,
"loss": 0.2205,
"step": 740
},
{
"epoch": 0.24,
"grad_norm": 11.550609549065676,
"learning_rate": 8.917222269323263e-06,
"loss": 0.5839,
"step": 741
},
{
"epoch": 0.24,
"grad_norm": 5.165694148077282,
"learning_rate": 8.91399947972037e-06,
"loss": 0.5495,
"step": 742
},
{
"epoch": 0.24,
"grad_norm": 7.330150211344736,
"learning_rate": 8.91077248528597e-06,
"loss": 0.5782,
"step": 743
},
{
"epoch": 0.24,
"grad_norm": 13.151204937371048,
"learning_rate": 8.907541289486847e-06,
"loss": 0.6479,
"step": 744
},
{
"epoch": 0.24,
"grad_norm": 1.519064133046649,
"learning_rate": 8.904305895794292e-06,
"loss": 0.2459,
"step": 745
},
{
"epoch": 0.24,
"grad_norm": 1.6022800920512217,
"learning_rate": 8.901066307684102e-06,
"loss": 0.2458,
"step": 746
},
{
"epoch": 0.24,
"grad_norm": 1.7147403426955812,
"learning_rate": 8.89782252863659e-06,
"loss": 0.2569,
"step": 747
},
{
"epoch": 0.24,
"grad_norm": 5.670430578581199,
"learning_rate": 8.894574562136561e-06,
"loss": 0.5057,
"step": 748
},
{
"epoch": 0.24,
"grad_norm": 6.959529161210214,
"learning_rate": 8.89132241167333e-06,
"loss": 0.6522,
"step": 749
},
{
"epoch": 0.24,
"grad_norm": 1.7820823733746547,
"learning_rate": 8.888066080740692e-06,
"loss": 0.279,
"step": 750
},
{
"epoch": 0.24,
"grad_norm": 7.361769640465293,
"learning_rate": 8.88480557283695e-06,
"loss": 0.6678,
"step": 751
},
{
"epoch": 0.24,
"grad_norm": 5.163392974539672,
"learning_rate": 8.88154089146488e-06,
"loss": 0.5459,
"step": 752
},
{
"epoch": 0.24,
"grad_norm": 6.24247169542295,
"learning_rate": 8.878272040131748e-06,
"loss": 0.595,
"step": 753
},
{
"epoch": 0.24,
"grad_norm": 1.463345046277402,
"learning_rate": 8.874999022349303e-06,
"loss": 0.2448,
"step": 754
},
{
"epoch": 0.24,
"grad_norm": 6.242443414538638,
"learning_rate": 8.871721841633762e-06,
"loss": 0.6578,
"step": 755
},
{
"epoch": 0.24,
"grad_norm": 5.682520955037908,
"learning_rate": 8.868440501505822e-06,
"loss": 0.5833,
"step": 756
},
{
"epoch": 0.24,
"grad_norm": 6.484549304514638,
"learning_rate": 8.865155005490643e-06,
"loss": 0.7468,
"step": 757
},
{
"epoch": 0.24,
"grad_norm": 6.302754066055148,
"learning_rate": 8.861865357117852e-06,
"loss": 0.7221,
"step": 758
},
{
"epoch": 0.24,
"grad_norm": 5.857443360175216,
"learning_rate": 8.858571559921539e-06,
"loss": 0.6406,
"step": 759
},
{
"epoch": 0.24,
"grad_norm": 1.7623095964863735,
"learning_rate": 8.855273617440243e-06,
"loss": 0.2485,
"step": 760
},
{
"epoch": 0.24,
"grad_norm": 5.210709853076465,
"learning_rate": 8.851971533216968e-06,
"loss": 0.5855,
"step": 761
},
{
"epoch": 0.24,
"grad_norm": 8.365648509155578,
"learning_rate": 8.848665310799156e-06,
"loss": 0.5802,
"step": 762
},
{
"epoch": 0.24,
"grad_norm": 1.5648919156529912,
"learning_rate": 8.845354953738706e-06,
"loss": 0.2344,
"step": 763
},
{
"epoch": 0.24,
"grad_norm": 8.703297721624168,
"learning_rate": 8.84204046559195e-06,
"loss": 0.7219,
"step": 764
},
{
"epoch": 0.24,
"grad_norm": 6.170541209244984,
"learning_rate": 8.83872184991966e-06,
"loss": 0.5098,
"step": 765
},
{
"epoch": 0.25,
"grad_norm": 1.7385020557898248,
"learning_rate": 8.835399110287046e-06,
"loss": 0.2236,
"step": 766
},
{
"epoch": 0.25,
"grad_norm": 1.4661230490009658,
"learning_rate": 8.832072250263746e-06,
"loss": 0.2413,
"step": 767
},
{
"epoch": 0.25,
"grad_norm": 1.65953158170735,
"learning_rate": 8.82874127342382e-06,
"loss": 0.2573,
"step": 768
},
{
"epoch": 0.25,
"grad_norm": 1.5080627608844777,
"learning_rate": 8.82540618334576e-06,
"loss": 0.2457,
"step": 769
},
{
"epoch": 0.25,
"grad_norm": 9.832298571768307,
"learning_rate": 8.82206698361247e-06,
"loss": 0.7066,
"step": 770
},
{
"epoch": 0.25,
"grad_norm": 8.277558372822316,
"learning_rate": 8.818723677811269e-06,
"loss": 0.5355,
"step": 771
},
{
"epoch": 0.25,
"grad_norm": 1.4418416223623052,
"learning_rate": 8.815376269533893e-06,
"loss": 0.2066,
"step": 772
},
{
"epoch": 0.25,
"grad_norm": 9.649849845477053,
"learning_rate": 8.812024762376477e-06,
"loss": 0.6768,
"step": 773
},
{
"epoch": 0.25,
"grad_norm": 5.843745363878495,
"learning_rate": 8.808669159939568e-06,
"loss": 0.6621,
"step": 774
},
{
"epoch": 0.25,
"grad_norm": 9.414511673736458,
"learning_rate": 8.805309465828105e-06,
"loss": 0.7382,
"step": 775
},
{
"epoch": 0.25,
"grad_norm": 1.516642979165497,
"learning_rate": 8.80194568365143e-06,
"loss": 0.2167,
"step": 776
},
{
"epoch": 0.25,
"grad_norm": 3.990212164513675,
"learning_rate": 8.798577817023269e-06,
"loss": 0.4577,
"step": 777
},
{
"epoch": 0.25,
"grad_norm": 1.5896855461224302,
"learning_rate": 8.795205869561742e-06,
"loss": 0.231,
"step": 778
},
{
"epoch": 0.25,
"grad_norm": 6.974568598248131,
"learning_rate": 8.79182984488935e-06,
"loss": 0.6463,
"step": 779
},
{
"epoch": 0.25,
"grad_norm": 12.322774136067109,
"learning_rate": 8.788449746632976e-06,
"loss": 0.6105,
"step": 780
},
{
"epoch": 0.25,
"grad_norm": 1.669707596316299,
"learning_rate": 8.78506557842388e-06,
"loss": 0.2275,
"step": 781
},
{
"epoch": 0.25,
"grad_norm": 10.283945570112477,
"learning_rate": 8.781677343897687e-06,
"loss": 0.7113,
"step": 782
},
{
"epoch": 0.25,
"grad_norm": 9.663439942965859,
"learning_rate": 8.778285046694403e-06,
"loss": 0.5229,
"step": 783
},
{
"epoch": 0.25,
"grad_norm": 1.512582270088964,
"learning_rate": 8.77488869045839e-06,
"loss": 0.2626,
"step": 784
},
{
"epoch": 0.25,
"grad_norm": 6.957722562450112,
"learning_rate": 8.771488278838368e-06,
"loss": 0.5329,
"step": 785
},
{
"epoch": 0.25,
"grad_norm": 8.546833167469059,
"learning_rate": 8.768083815487428e-06,
"loss": 0.6326,
"step": 786
},
{
"epoch": 0.25,
"grad_norm": 6.569317927329583,
"learning_rate": 8.764675304062992e-06,
"loss": 0.6385,
"step": 787
},
{
"epoch": 0.25,
"grad_norm": 6.826173570363225,
"learning_rate": 8.76126274822685e-06,
"loss": 0.4427,
"step": 788
},
{
"epoch": 0.25,
"grad_norm": 6.370630972136984,
"learning_rate": 8.75784615164513e-06,
"loss": 0.4011,
"step": 789
},
{
"epoch": 0.25,
"grad_norm": 14.141899407748205,
"learning_rate": 8.754425517988298e-06,
"loss": 0.6297,
"step": 790
},
{
"epoch": 0.25,
"grad_norm": 25.04984916586043,
"learning_rate": 8.751000850931162e-06,
"loss": 0.5998,
"step": 791
},
{
"epoch": 0.25,
"grad_norm": 1.4086297275415713,
"learning_rate": 8.74757215415286e-06,
"loss": 0.2335,
"step": 792
},
{
"epoch": 0.25,
"grad_norm": 5.8832008847198685,
"learning_rate": 8.74413943133686e-06,
"loss": 0.6533,
"step": 793
},
{
"epoch": 0.25,
"grad_norm": 6.128009992673158,
"learning_rate": 8.740702686170955e-06,
"loss": 0.5133,
"step": 794
},
{
"epoch": 0.25,
"grad_norm": 1.8100722093658241,
"learning_rate": 8.73726192234726e-06,
"loss": 0.2315,
"step": 795
},
{
"epoch": 0.25,
"grad_norm": 8.121105568280397,
"learning_rate": 8.733817143562207e-06,
"loss": 0.4929,
"step": 796
},
{
"epoch": 0.26,
"grad_norm": 6.063649005918384,
"learning_rate": 8.73036835351654e-06,
"loss": 0.4996,
"step": 797
},
{
"epoch": 0.26,
"grad_norm": 1.4302425013841211,
"learning_rate": 8.726915555915317e-06,
"loss": 0.1863,
"step": 798
},
{
"epoch": 0.26,
"grad_norm": 1.6939262130664616,
"learning_rate": 8.723458754467893e-06,
"loss": 0.1969,
"step": 799
},
{
"epoch": 0.26,
"grad_norm": 7.056615528678235,
"learning_rate": 8.719997952887932e-06,
"loss": 0.5954,
"step": 800
},
{
"epoch": 0.26,
"grad_norm": 1.3693791069491372,
"learning_rate": 8.71653315489339e-06,
"loss": 0.1708,
"step": 801
},
{
"epoch": 0.26,
"grad_norm": 1.605845650568055,
"learning_rate": 8.71306436420652e-06,
"loss": 0.1999,
"step": 802
},
{
"epoch": 0.26,
"grad_norm": 1.6213491814636538,
"learning_rate": 8.709591584553865e-06,
"loss": 0.2564,
"step": 803
},
{
"epoch": 0.26,
"grad_norm": 1.5534122030041937,
"learning_rate": 8.706114819666249e-06,
"loss": 0.2171,
"step": 804
},
{
"epoch": 0.26,
"grad_norm": 24.46971085214679,
"learning_rate": 8.702634073278784e-06,
"loss": 0.5805,
"step": 805
},
{
"epoch": 0.26,
"grad_norm": 10.299870968946921,
"learning_rate": 8.699149349130848e-06,
"loss": 0.7832,
"step": 806
},
{
"epoch": 0.26,
"grad_norm": 1.6564460747680942,
"learning_rate": 8.695660650966109e-06,
"loss": 0.2721,
"step": 807
},
{
"epoch": 0.26,
"grad_norm": 4.500603362360334,
"learning_rate": 8.692167982532487e-06,
"loss": 0.4253,
"step": 808
},
{
"epoch": 0.26,
"grad_norm": 10.538332097952509,
"learning_rate": 8.688671347582178e-06,
"loss": 0.5227,
"step": 809
},
{
"epoch": 0.26,
"grad_norm": 1.464779623110201,
"learning_rate": 8.685170749871638e-06,
"loss": 0.2154,
"step": 810
},
{
"epoch": 0.26,
"grad_norm": 8.657587901907425,
"learning_rate": 8.681666193161578e-06,
"loss": 0.5902,
"step": 811
},
{
"epoch": 0.26,
"grad_norm": 9.225749603194462,
"learning_rate": 8.67815768121696e-06,
"loss": 0.6304,
"step": 812
},
{
"epoch": 0.26,
"grad_norm": 1.6130060216926752,
"learning_rate": 8.674645217807e-06,
"loss": 0.2261,
"step": 813
},
{
"epoch": 0.26,
"grad_norm": 7.367222456767956,
"learning_rate": 8.671128806705159e-06,
"loss": 0.6718,
"step": 814
},
{
"epoch": 0.26,
"grad_norm": 5.761031895494796,
"learning_rate": 8.667608451689135e-06,
"loss": 0.5507,
"step": 815
},
{
"epoch": 0.26,
"grad_norm": 6.8020377060599735,
"learning_rate": 8.664084156540864e-06,
"loss": 0.6018,
"step": 816
},
{
"epoch": 0.26,
"grad_norm": 7.149889953402013,
"learning_rate": 8.660555925046518e-06,
"loss": 0.535,
"step": 817
},
{
"epoch": 0.26,
"grad_norm": 9.66930716361746,
"learning_rate": 8.657023760996497e-06,
"loss": 0.7477,
"step": 818
},
{
"epoch": 0.26,
"grad_norm": 1.8553835758820338,
"learning_rate": 8.653487668185419e-06,
"loss": 0.2639,
"step": 819
},
{
"epoch": 0.26,
"grad_norm": 1.5807595552460332,
"learning_rate": 8.649947650412135e-06,
"loss": 0.2533,
"step": 820
},
{
"epoch": 0.26,
"grad_norm": 5.003858299431936,
"learning_rate": 8.646403711479702e-06,
"loss": 0.584,
"step": 821
},
{
"epoch": 0.26,
"grad_norm": 18.940436427722357,
"learning_rate": 8.642855855195394e-06,
"loss": 0.5363,
"step": 822
},
{
"epoch": 0.26,
"grad_norm": 1.5435972264861446,
"learning_rate": 8.639304085370692e-06,
"loss": 0.2301,
"step": 823
},
{
"epoch": 0.26,
"grad_norm": 7.591995588519707,
"learning_rate": 8.635748405821285e-06,
"loss": 0.6262,
"step": 824
},
{
"epoch": 0.26,
"grad_norm": 1.4897553699726007,
"learning_rate": 8.632188820367056e-06,
"loss": 0.1984,
"step": 825
},
{
"epoch": 0.26,
"grad_norm": 1.6942123556365838,
"learning_rate": 8.62862533283209e-06,
"loss": 0.2532,
"step": 826
},
{
"epoch": 0.26,
"grad_norm": 1.6469115110904957,
"learning_rate": 8.625057947044662e-06,
"loss": 0.2305,
"step": 827
},
{
"epoch": 0.26,
"grad_norm": 6.303157276945475,
"learning_rate": 8.62148666683723e-06,
"loss": 0.6848,
"step": 828
},
{
"epoch": 0.27,
"grad_norm": 20.989116059598974,
"learning_rate": 8.617911496046446e-06,
"loss": 0.7894,
"step": 829
},
{
"epoch": 0.27,
"grad_norm": 6.561870313214275,
"learning_rate": 8.614332438513132e-06,
"loss": 0.6544,
"step": 830
},
{
"epoch": 0.27,
"grad_norm": 5.14236901016574,
"learning_rate": 8.610749498082291e-06,
"loss": 0.6395,
"step": 831
},
{
"epoch": 0.27,
"grad_norm": 1.5235194242647812,
"learning_rate": 8.607162678603097e-06,
"loss": 0.2324,
"step": 832
},
{
"epoch": 0.27,
"grad_norm": 1.704837203299887,
"learning_rate": 8.603571983928888e-06,
"loss": 0.2514,
"step": 833
},
{
"epoch": 0.27,
"grad_norm": 1.4187248790707914,
"learning_rate": 8.599977417917169e-06,
"loss": 0.2461,
"step": 834
},
{
"epoch": 0.27,
"grad_norm": 4.701976212131134,
"learning_rate": 8.5963789844296e-06,
"loss": 0.5067,
"step": 835
},
{
"epoch": 0.27,
"grad_norm": 1.4414253542300597,
"learning_rate": 8.592776687332003e-06,
"loss": 0.2048,
"step": 836
},
{
"epoch": 0.27,
"grad_norm": 6.611317201824275,
"learning_rate": 8.58917053049434e-06,
"loss": 0.5289,
"step": 837
},
{
"epoch": 0.27,
"grad_norm": 1.4949147393495994,
"learning_rate": 8.58556051779073e-06,
"loss": 0.2724,
"step": 838
},
{
"epoch": 0.27,
"grad_norm": 1.6284172525696397,
"learning_rate": 8.581946653099427e-06,
"loss": 0.2875,
"step": 839
},
{
"epoch": 0.27,
"grad_norm": 1.6197613046378103,
"learning_rate": 8.578328940302827e-06,
"loss": 0.2145,
"step": 840
},
{
"epoch": 0.27,
"grad_norm": 4.714725732777521,
"learning_rate": 8.574707383287459e-06,
"loss": 0.6025,
"step": 841
},
{
"epoch": 0.27,
"grad_norm": 6.3927437325020255,
"learning_rate": 8.571081985943984e-06,
"loss": 0.6115,
"step": 842
},
{
"epoch": 0.27,
"grad_norm": 8.096428448758006,
"learning_rate": 8.567452752167183e-06,
"loss": 0.6769,
"step": 843
},
{
"epoch": 0.27,
"grad_norm": 1.5775772470528593,
"learning_rate": 8.563819685855963e-06,
"loss": 0.233,
"step": 844
},
{
"epoch": 0.27,
"grad_norm": 1.4985075939845311,
"learning_rate": 8.560182790913349e-06,
"loss": 0.21,
"step": 845
},
{
"epoch": 0.27,
"grad_norm": 1.4528883699852078,
"learning_rate": 8.556542071246476e-06,
"loss": 0.2423,
"step": 846
},
{
"epoch": 0.27,
"grad_norm": 1.8045829129907685,
"learning_rate": 8.552897530766592e-06,
"loss": 0.3062,
"step": 847
},
{
"epoch": 0.27,
"grad_norm": 6.363121819025957,
"learning_rate": 8.549249173389045e-06,
"loss": 0.5701,
"step": 848
},
{
"epoch": 0.27,
"grad_norm": 5.323940029256627,
"learning_rate": 8.545597003033286e-06,
"loss": 0.5023,
"step": 849
},
{
"epoch": 0.27,
"grad_norm": 2.027599692607025,
"learning_rate": 8.54194102362286e-06,
"loss": 0.2693,
"step": 850
},
{
"epoch": 0.27,
"grad_norm": 6.546242260539323,
"learning_rate": 8.538281239085411e-06,
"loss": 0.6131,
"step": 851
},
{
"epoch": 0.27,
"grad_norm": 7.295077814114934,
"learning_rate": 8.534617653352661e-06,
"loss": 0.5568,
"step": 852
},
{
"epoch": 0.27,
"grad_norm": 6.484864693221408,
"learning_rate": 8.530950270360425e-06,
"loss": 0.5634,
"step": 853
},
{
"epoch": 0.27,
"grad_norm": 1.5403880030840937,
"learning_rate": 8.52727909404859e-06,
"loss": 0.2438,
"step": 854
},
{
"epoch": 0.27,
"grad_norm": 5.07206173015186,
"learning_rate": 8.523604128361123e-06,
"loss": 0.6061,
"step": 855
},
{
"epoch": 0.27,
"grad_norm": 1.6685551330454524,
"learning_rate": 8.519925377246057e-06,
"loss": 0.2368,
"step": 856
},
{
"epoch": 0.27,
"grad_norm": 1.728742188440905,
"learning_rate": 8.516242844655498e-06,
"loss": 0.2153,
"step": 857
},
{
"epoch": 0.27,
"grad_norm": 1.5952897553270604,
"learning_rate": 8.512556534545612e-06,
"loss": 0.2266,
"step": 858
},
{
"epoch": 0.27,
"grad_norm": 1.4863335402974511,
"learning_rate": 8.50886645087662e-06,
"loss": 0.2076,
"step": 859
},
{
"epoch": 0.28,
"grad_norm": 11.340967979123214,
"learning_rate": 8.5051725976128e-06,
"loss": 0.6309,
"step": 860
},
{
"epoch": 0.28,
"grad_norm": 6.899729097187553,
"learning_rate": 8.50147497872248e-06,
"loss": 0.7037,
"step": 861
},
{
"epoch": 0.28,
"grad_norm": 6.422692032652846,
"learning_rate": 8.497773598178033e-06,
"loss": 0.6942,
"step": 862
},
{
"epoch": 0.28,
"grad_norm": 1.4342106968594903,
"learning_rate": 8.494068459955871e-06,
"loss": 0.2169,
"step": 863
},
{
"epoch": 0.28,
"grad_norm": 11.421594355547967,
"learning_rate": 8.490359568036446e-06,
"loss": 0.5014,
"step": 864
},
{
"epoch": 0.28,
"grad_norm": 1.6416992908316417,
"learning_rate": 8.486646926404243e-06,
"loss": 0.2854,
"step": 865
},
{
"epoch": 0.28,
"grad_norm": 1.5490326018370097,
"learning_rate": 8.48293053904777e-06,
"loss": 0.2449,
"step": 866
},
{
"epoch": 0.28,
"grad_norm": 7.257007373529935,
"learning_rate": 8.479210409959565e-06,
"loss": 0.6813,
"step": 867
},
{
"epoch": 0.28,
"grad_norm": 1.8945067281458927,
"learning_rate": 8.475486543136181e-06,
"loss": 0.2896,
"step": 868
},
{
"epoch": 0.28,
"grad_norm": 9.4701021587884,
"learning_rate": 8.471758942578193e-06,
"loss": 0.6194,
"step": 869
},
{
"epoch": 0.28,
"grad_norm": 10.511886956772441,
"learning_rate": 8.46802761229018e-06,
"loss": 0.6511,
"step": 870
},
{
"epoch": 0.28,
"grad_norm": 7.863057086989649,
"learning_rate": 8.464292556280734e-06,
"loss": 0.5907,
"step": 871
},
{
"epoch": 0.28,
"grad_norm": 6.167361796987621,
"learning_rate": 8.46055377856244e-06,
"loss": 0.5901,
"step": 872
},
{
"epoch": 0.28,
"grad_norm": 19.680661510408303,
"learning_rate": 8.456811283151896e-06,
"loss": 0.5894,
"step": 873
},
{
"epoch": 0.28,
"grad_norm": 9.438303117934163,
"learning_rate": 8.453065074069682e-06,
"loss": 0.5033,
"step": 874
},
{
"epoch": 0.28,
"grad_norm": 1.625431846534336,
"learning_rate": 8.449315155340369e-06,
"loss": 0.2799,
"step": 875
},
{
"epoch": 0.28,
"grad_norm": 1.7554253384961724,
"learning_rate": 8.44556153099252e-06,
"loss": 0.3268,
"step": 876
},
{
"epoch": 0.28,
"grad_norm": 6.1580376296166435,
"learning_rate": 8.441804205058672e-06,
"loss": 0.6697,
"step": 877
},
{
"epoch": 0.28,
"grad_norm": 7.051159034338603,
"learning_rate": 8.43804318157534e-06,
"loss": 0.578,
"step": 878
},
{
"epoch": 0.28,
"grad_norm": 6.7336555036387455,
"learning_rate": 8.434278464583018e-06,
"loss": 0.5324,
"step": 879
},
{
"epoch": 0.28,
"grad_norm": 8.150776766355575,
"learning_rate": 8.430510058126156e-06,
"loss": 0.636,
"step": 880
},
{
"epoch": 0.28,
"grad_norm": 1.704230168862192,
"learning_rate": 8.426737966253176e-06,
"loss": 0.2553,
"step": 881
},
{
"epoch": 0.28,
"grad_norm": 1.576742469550776,
"learning_rate": 8.422962193016459e-06,
"loss": 0.2505,
"step": 882
},
{
"epoch": 0.28,
"grad_norm": 1.722189642524467,
"learning_rate": 8.41918274247234e-06,
"loss": 0.2113,
"step": 883
},
{
"epoch": 0.28,
"grad_norm": 4.841587903078907,
"learning_rate": 8.415399618681101e-06,
"loss": 0.6088,
"step": 884
},
{
"epoch": 0.28,
"grad_norm": 1.5448985811556761,
"learning_rate": 8.411612825706976e-06,
"loss": 0.227,
"step": 885
},
{
"epoch": 0.28,
"grad_norm": 6.33255782512184,
"learning_rate": 8.407822367618135e-06,
"loss": 0.5294,
"step": 886
},
{
"epoch": 0.28,
"grad_norm": 5.7490443677960466,
"learning_rate": 8.40402824848669e-06,
"loss": 0.6894,
"step": 887
},
{
"epoch": 0.28,
"grad_norm": 8.311970930734226,
"learning_rate": 8.400230472388684e-06,
"loss": 0.6214,
"step": 888
},
{
"epoch": 0.28,
"grad_norm": 3.927420266585678,
"learning_rate": 8.396429043404088e-06,
"loss": 0.3584,
"step": 889
},
{
"epoch": 0.28,
"grad_norm": 1.8058260712439282,
"learning_rate": 8.3926239656168e-06,
"loss": 0.2764,
"step": 890
},
{
"epoch": 0.29,
"grad_norm": 6.524393247368285,
"learning_rate": 8.388815243114637e-06,
"loss": 0.5819,
"step": 891
},
{
"epoch": 0.29,
"grad_norm": 65.01476758786156,
"learning_rate": 8.385002879989328e-06,
"loss": 0.5696,
"step": 892
},
{
"epoch": 0.29,
"grad_norm": 1.6470301998126498,
"learning_rate": 8.381186880336518e-06,
"loss": 0.2538,
"step": 893
},
{
"epoch": 0.29,
"grad_norm": 7.253730717024666,
"learning_rate": 8.377367248255757e-06,
"loss": 0.5736,
"step": 894
},
{
"epoch": 0.29,
"grad_norm": 8.346427209524279,
"learning_rate": 8.373543987850494e-06,
"loss": 0.6371,
"step": 895
},
{
"epoch": 0.29,
"grad_norm": 1.4447458999529055,
"learning_rate": 8.369717103228084e-06,
"loss": 0.2204,
"step": 896
},
{
"epoch": 0.29,
"grad_norm": 16.640222927391598,
"learning_rate": 8.365886598499766e-06,
"loss": 0.4546,
"step": 897
},
{
"epoch": 0.29,
"grad_norm": 1.705159341486315,
"learning_rate": 8.362052477780677e-06,
"loss": 0.2985,
"step": 898
},
{
"epoch": 0.29,
"grad_norm": 1.8126938156568462,
"learning_rate": 8.35821474518983e-06,
"loss": 0.2424,
"step": 899
},
{
"epoch": 0.29,
"grad_norm": 1.9265518644124338,
"learning_rate": 8.354373404850124e-06,
"loss": 0.2831,
"step": 900
},
{
"epoch": 0.29,
"grad_norm": 4.312216463404106,
"learning_rate": 8.350528460888334e-06,
"loss": 0.5011,
"step": 901
},
{
"epoch": 0.29,
"grad_norm": 1.6362315327967591,
"learning_rate": 8.346679917435104e-06,
"loss": 0.2444,
"step": 902
},
{
"epoch": 0.29,
"grad_norm": 1.5451004685493321,
"learning_rate": 8.342827778624943e-06,
"loss": 0.2263,
"step": 903
},
{
"epoch": 0.29,
"grad_norm": 1.634051725367432,
"learning_rate": 8.33897204859623e-06,
"loss": 0.2642,
"step": 904
},
{
"epoch": 0.29,
"grad_norm": 10.006264049410033,
"learning_rate": 8.335112731491192e-06,
"loss": 0.5239,
"step": 905
},
{
"epoch": 0.29,
"grad_norm": 1.499144688672292,
"learning_rate": 8.331249831455921e-06,
"loss": 0.21,
"step": 906
},
{
"epoch": 0.29,
"grad_norm": 6.83080401550471,
"learning_rate": 8.327383352640347e-06,
"loss": 0.7192,
"step": 907
},
{
"epoch": 0.29,
"grad_norm": 5.445123052559625,
"learning_rate": 8.323513299198252e-06,
"loss": 0.593,
"step": 908
},
{
"epoch": 0.29,
"grad_norm": 1.5800330102245062,
"learning_rate": 8.319639675287255e-06,
"loss": 0.1956,
"step": 909
},
{
"epoch": 0.29,
"grad_norm": 10.985925353289685,
"learning_rate": 8.315762485068815e-06,
"loss": 0.5086,
"step": 910
},
{
"epoch": 0.29,
"grad_norm": 7.066589444646886,
"learning_rate": 8.311881732708213e-06,
"loss": 0.5215,
"step": 911
},
{
"epoch": 0.29,
"grad_norm": 7.67825901174198,
"learning_rate": 8.307997422374569e-06,
"loss": 0.8038,
"step": 912
},
{
"epoch": 0.29,
"grad_norm": 6.304539661629028,
"learning_rate": 8.304109558240817e-06,
"loss": 0.5006,
"step": 913
},
{
"epoch": 0.29,
"grad_norm": 1.6000598294765127,
"learning_rate": 8.300218144483709e-06,
"loss": 0.2031,
"step": 914
},
{
"epoch": 0.29,
"grad_norm": 6.873152965555242,
"learning_rate": 8.296323185283816e-06,
"loss": 0.6036,
"step": 915
},
{
"epoch": 0.29,
"grad_norm": 1.8715148595581925,
"learning_rate": 8.292424684825514e-06,
"loss": 0.249,
"step": 916
},
{
"epoch": 0.29,
"grad_norm": 1.7200882059628804,
"learning_rate": 8.28852264729698e-06,
"loss": 0.2683,
"step": 917
},
{
"epoch": 0.29,
"grad_norm": 6.2879962991369345,
"learning_rate": 8.284617076890199e-06,
"loss": 0.4912,
"step": 918
},
{
"epoch": 0.29,
"grad_norm": 1.6424687050291746,
"learning_rate": 8.280707977800944e-06,
"loss": 0.2321,
"step": 919
},
{
"epoch": 0.29,
"grad_norm": 8.060673556802215,
"learning_rate": 8.276795354228785e-06,
"loss": 0.6667,
"step": 920
},
{
"epoch": 0.29,
"grad_norm": 1.6034982772062687,
"learning_rate": 8.272879210377074e-06,
"loss": 0.2779,
"step": 921
},
{
"epoch": 0.3,
"grad_norm": 6.265308314957551,
"learning_rate": 8.268959550452946e-06,
"loss": 0.6491,
"step": 922
},
{
"epoch": 0.3,
"grad_norm": 5.224539104730754,
"learning_rate": 8.265036378667312e-06,
"loss": 0.6368,
"step": 923
},
{
"epoch": 0.3,
"grad_norm": 4.9642791771130055,
"learning_rate": 8.261109699234862e-06,
"loss": 0.6846,
"step": 924
},
{
"epoch": 0.3,
"grad_norm": 5.326075465035541,
"learning_rate": 8.257179516374045e-06,
"loss": 0.5553,
"step": 925
},
{
"epoch": 0.3,
"grad_norm": 5.123716962012845,
"learning_rate": 8.253245834307079e-06,
"loss": 0.5297,
"step": 926
},
{
"epoch": 0.3,
"grad_norm": 8.63248593059496,
"learning_rate": 8.249308657259943e-06,
"loss": 0.5384,
"step": 927
},
{
"epoch": 0.3,
"grad_norm": 8.948336121415593,
"learning_rate": 8.245367989462368e-06,
"loss": 0.4715,
"step": 928
},
{
"epoch": 0.3,
"grad_norm": 1.5286834573921628,
"learning_rate": 8.241423835147833e-06,
"loss": 0.2124,
"step": 929
},
{
"epoch": 0.3,
"grad_norm": 6.778579936718943,
"learning_rate": 8.237476198553567e-06,
"loss": 0.5749,
"step": 930
},
{
"epoch": 0.3,
"grad_norm": 8.735820903898144,
"learning_rate": 8.233525083920536e-06,
"loss": 0.6569,
"step": 931
},
{
"epoch": 0.3,
"grad_norm": 1.6071581878864913,
"learning_rate": 8.229570495493447e-06,
"loss": 0.2267,
"step": 932
},
{
"epoch": 0.3,
"grad_norm": 1.4098869673911527,
"learning_rate": 8.225612437520736e-06,
"loss": 0.2043,
"step": 933
},
{
"epoch": 0.3,
"grad_norm": 1.4849291540429495,
"learning_rate": 8.221650914254566e-06,
"loss": 0.2583,
"step": 934
},
{
"epoch": 0.3,
"grad_norm": 1.6908420580341865,
"learning_rate": 8.217685929950823e-06,
"loss": 0.2791,
"step": 935
},
{
"epoch": 0.3,
"grad_norm": 1.577991292993094,
"learning_rate": 8.213717488869113e-06,
"loss": 0.2564,
"step": 936
},
{
"epoch": 0.3,
"grad_norm": 1.6304667338549887,
"learning_rate": 8.209745595272755e-06,
"loss": 0.2492,
"step": 937
},
{
"epoch": 0.3,
"grad_norm": 6.12028252403479,
"learning_rate": 8.205770253428775e-06,
"loss": 0.4604,
"step": 938
},
{
"epoch": 0.3,
"grad_norm": 1.5198702030985125,
"learning_rate": 8.201791467607905e-06,
"loss": 0.2431,
"step": 939
},
{
"epoch": 0.3,
"grad_norm": 1.4871758189086686,
"learning_rate": 8.197809242084575e-06,
"loss": 0.2491,
"step": 940
},
{
"epoch": 0.3,
"grad_norm": 5.207452305123646,
"learning_rate": 8.193823581136919e-06,
"loss": 0.5642,
"step": 941
},
{
"epoch": 0.3,
"grad_norm": 1.7255330273398901,
"learning_rate": 8.189834489046746e-06,
"loss": 0.2537,
"step": 942
},
{
"epoch": 0.3,
"grad_norm": 1.343723206415828,
"learning_rate": 8.185841970099566e-06,
"loss": 0.1964,
"step": 943
},
{
"epoch": 0.3,
"grad_norm": 8.361566314702689,
"learning_rate": 8.181846028584563e-06,
"loss": 0.5342,
"step": 944
},
{
"epoch": 0.3,
"grad_norm": 1.4091082835237403,
"learning_rate": 8.177846668794598e-06,
"loss": 0.1915,
"step": 945
},
{
"epoch": 0.3,
"grad_norm": 5.632022424143484,
"learning_rate": 8.173843895026207e-06,
"loss": 0.5986,
"step": 946
},
{
"epoch": 0.3,
"grad_norm": 1.810578946218269,
"learning_rate": 8.169837711579591e-06,
"loss": 0.2462,
"step": 947
},
{
"epoch": 0.3,
"grad_norm": 5.474212152359759,
"learning_rate": 8.165828122758615e-06,
"loss": 0.6495,
"step": 948
},
{
"epoch": 0.3,
"grad_norm": 5.909343469433645,
"learning_rate": 8.161815132870806e-06,
"loss": 0.6268,
"step": 949
},
{
"epoch": 0.3,
"grad_norm": 1.650687508215565,
"learning_rate": 8.157798746227337e-06,
"loss": 0.1904,
"step": 950
},
{
"epoch": 0.3,
"grad_norm": 1.5497901471006141,
"learning_rate": 8.153778967143035e-06,
"loss": 0.2185,
"step": 951
},
{
"epoch": 0.3,
"grad_norm": 1.590537747184238,
"learning_rate": 8.149755799936377e-06,
"loss": 0.2029,
"step": 952
},
{
"epoch": 0.3,
"grad_norm": 1.8145458158756518,
"learning_rate": 8.145729248929466e-06,
"loss": 0.3129,
"step": 953
},
{
"epoch": 0.31,
"grad_norm": 6.599237686710324,
"learning_rate": 8.141699318448053e-06,
"loss": 0.6723,
"step": 954
},
{
"epoch": 0.31,
"grad_norm": 5.0690496052178835,
"learning_rate": 8.137666012821514e-06,
"loss": 0.6524,
"step": 955
},
{
"epoch": 0.31,
"grad_norm": 1.687080582405302,
"learning_rate": 8.13362933638285e-06,
"loss": 0.2391,
"step": 956
},
{
"epoch": 0.31,
"grad_norm": 6.38689930953936,
"learning_rate": 8.129589293468689e-06,
"loss": 0.5736,
"step": 957
},
{
"epoch": 0.31,
"grad_norm": 1.5226156646274227,
"learning_rate": 8.125545888419269e-06,
"loss": 0.2518,
"step": 958
},
{
"epoch": 0.31,
"grad_norm": 1.4502611685092015,
"learning_rate": 8.12149912557844e-06,
"loss": 0.2387,
"step": 959
},
{
"epoch": 0.31,
"grad_norm": 1.4924326291951637,
"learning_rate": 8.117449009293668e-06,
"loss": 0.262,
"step": 960
},
{
"epoch": 0.31,
"grad_norm": 8.88618189798143,
"learning_rate": 8.113395543916012e-06,
"loss": 0.7492,
"step": 961
},
{
"epoch": 0.31,
"grad_norm": 1.5439044787564637,
"learning_rate": 8.109338733800132e-06,
"loss": 0.2688,
"step": 962
},
{
"epoch": 0.31,
"grad_norm": 6.043219000084835,
"learning_rate": 8.10527858330428e-06,
"loss": 0.6305,
"step": 963
},
{
"epoch": 0.31,
"grad_norm": 6.691904839136024,
"learning_rate": 8.101215096790305e-06,
"loss": 0.6562,
"step": 964
},
{
"epoch": 0.31,
"grad_norm": 7.741322245640578,
"learning_rate": 8.097148278623628e-06,
"loss": 0.6244,
"step": 965
},
{
"epoch": 0.31,
"grad_norm": 1.6772790622352332,
"learning_rate": 8.093078133173256e-06,
"loss": 0.2499,
"step": 966
},
{
"epoch": 0.31,
"grad_norm": 1.6939554958023613,
"learning_rate": 8.089004664811767e-06,
"loss": 0.2931,
"step": 967
},
{
"epoch": 0.31,
"grad_norm": 7.300929933908393,
"learning_rate": 8.084927877915314e-06,
"loss": 0.6952,
"step": 968
},
{
"epoch": 0.31,
"grad_norm": 1.6200759183085958,
"learning_rate": 8.080847776863609e-06,
"loss": 0.2202,
"step": 969
},
{
"epoch": 0.31,
"grad_norm": 1.577818449634306,
"learning_rate": 8.07676436603993e-06,
"loss": 0.2436,
"step": 970
},
{
"epoch": 0.31,
"grad_norm": 1.7568781422144297,
"learning_rate": 8.072677649831107e-06,
"loss": 0.2333,
"step": 971
},
{
"epoch": 0.31,
"grad_norm": 9.024773267499588,
"learning_rate": 8.068587632627521e-06,
"loss": 0.501,
"step": 972
},
{
"epoch": 0.31,
"grad_norm": 1.5420024123926483,
"learning_rate": 8.064494318823102e-06,
"loss": 0.2753,
"step": 973
},
{
"epoch": 0.31,
"grad_norm": 1.5656359437849388,
"learning_rate": 8.060397712815318e-06,
"loss": 0.2497,
"step": 974
},
{
"epoch": 0.31,
"grad_norm": 1.6247670447736333,
"learning_rate": 8.056297819005177e-06,
"loss": 0.2289,
"step": 975
},
{
"epoch": 0.31,
"grad_norm": 4.756855006115204,
"learning_rate": 8.052194641797217e-06,
"loss": 0.795,
"step": 976
},
{
"epoch": 0.31,
"grad_norm": 1.6605381181871688,
"learning_rate": 8.048088185599507e-06,
"loss": 0.2345,
"step": 977
},
{
"epoch": 0.31,
"grad_norm": 1.4577986656434714,
"learning_rate": 8.043978454823632e-06,
"loss": 0.2514,
"step": 978
},
{
"epoch": 0.31,
"grad_norm": 1.8563748924672185,
"learning_rate": 8.0398654538847e-06,
"loss": 0.3161,
"step": 979
},
{
"epoch": 0.31,
"grad_norm": 5.014797297051163,
"learning_rate": 8.035749187201333e-06,
"loss": 0.517,
"step": 980
},
{
"epoch": 0.31,
"grad_norm": 1.5680786373670434,
"learning_rate": 8.031629659195657e-06,
"loss": 0.2402,
"step": 981
},
{
"epoch": 0.31,
"grad_norm": 8.196102593968495,
"learning_rate": 8.027506874293304e-06,
"loss": 0.5746,
"step": 982
},
{
"epoch": 0.31,
"grad_norm": 7.278802967337519,
"learning_rate": 8.023380836923404e-06,
"loss": 0.7167,
"step": 983
},
{
"epoch": 0.31,
"grad_norm": 1.4157414766476062,
"learning_rate": 8.019251551518585e-06,
"loss": 0.2333,
"step": 984
},
{
"epoch": 0.32,
"grad_norm": 18.566101779205937,
"learning_rate": 8.015119022514958e-06,
"loss": 0.5788,
"step": 985
},
{
"epoch": 0.32,
"grad_norm": 5.5214903643031255,
"learning_rate": 8.010983254352127e-06,
"loss": 0.6308,
"step": 986
},
{
"epoch": 0.32,
"grad_norm": 5.844163978244507,
"learning_rate": 8.006844251473165e-06,
"loss": 0.6814,
"step": 987
},
{
"epoch": 0.32,
"grad_norm": 1.4867426534826607,
"learning_rate": 8.002702018324629e-06,
"loss": 0.1987,
"step": 988
},
{
"epoch": 0.32,
"grad_norm": 1.5159336503630736,
"learning_rate": 7.998556559356543e-06,
"loss": 0.2561,
"step": 989
},
{
"epoch": 0.32,
"grad_norm": 1.4554991897170064,
"learning_rate": 7.994407879022397e-06,
"loss": 0.1772,
"step": 990
},
{
"epoch": 0.32,
"grad_norm": 13.232086482270557,
"learning_rate": 7.990255981779139e-06,
"loss": 0.5657,
"step": 991
},
{
"epoch": 0.32,
"grad_norm": 6.499740410030531,
"learning_rate": 7.986100872087177e-06,
"loss": 0.6406,
"step": 992
},
{
"epoch": 0.32,
"grad_norm": 7.167695370460036,
"learning_rate": 7.981942554410371e-06,
"loss": 0.6665,
"step": 993
},
{
"epoch": 0.32,
"grad_norm": 1.8426339914572896,
"learning_rate": 7.97778103321602e-06,
"loss": 0.21,
"step": 994
},
{
"epoch": 0.32,
"grad_norm": 8.557321656981843,
"learning_rate": 7.973616312974876e-06,
"loss": 0.4842,
"step": 995
},
{
"epoch": 0.32,
"grad_norm": 5.874621150418908,
"learning_rate": 7.969448398161115e-06,
"loss": 0.6745,
"step": 996
},
{
"epoch": 0.32,
"grad_norm": 1.6665195438715732,
"learning_rate": 7.965277293252354e-06,
"loss": 0.2129,
"step": 997
},
{
"epoch": 0.32,
"grad_norm": 5.9711904071282245,
"learning_rate": 7.961103002729634e-06,
"loss": 0.475,
"step": 998
},
{
"epoch": 0.32,
"grad_norm": 1.700950579625396,
"learning_rate": 7.956925531077417e-06,
"loss": 0.2788,
"step": 999
},
{
"epoch": 0.32,
"grad_norm": 6.1089608261539485,
"learning_rate": 7.952744882783587e-06,
"loss": 0.5439,
"step": 1000
},
{
"epoch": 0.32,
"grad_norm": 1.689080045654782,
"learning_rate": 7.948561062339435e-06,
"loss": 0.2755,
"step": 1001
},
{
"epoch": 0.32,
"grad_norm": 1.7153482062599326,
"learning_rate": 7.944374074239665e-06,
"loss": 0.2422,
"step": 1002
},
{
"epoch": 0.32,
"grad_norm": 1.646721440739929,
"learning_rate": 7.940183922982381e-06,
"loss": 0.2675,
"step": 1003
},
{
"epoch": 0.32,
"grad_norm": 5.372490958973571,
"learning_rate": 7.935990613069087e-06,
"loss": 0.6422,
"step": 1004
},
{
"epoch": 0.32,
"grad_norm": 9.198653623452538,
"learning_rate": 7.931794149004675e-06,
"loss": 0.6502,
"step": 1005
},
{
"epoch": 0.32,
"grad_norm": 1.52594294469927,
"learning_rate": 7.927594535297433e-06,
"loss": 0.217,
"step": 1006
},
{
"epoch": 0.32,
"grad_norm": 6.280041790877249,
"learning_rate": 7.923391776459031e-06,
"loss": 0.6249,
"step": 1007
},
{
"epoch": 0.32,
"grad_norm": 5.868426083787363,
"learning_rate": 7.919185877004515e-06,
"loss": 0.6534,
"step": 1008
},
{
"epoch": 0.32,
"grad_norm": 6.731001321305892,
"learning_rate": 7.914976841452304e-06,
"loss": 0.7446,
"step": 1009
},
{
"epoch": 0.32,
"grad_norm": 1.5875925359845395,
"learning_rate": 7.91076467432419e-06,
"loss": 0.2311,
"step": 1010
},
{
"epoch": 0.32,
"grad_norm": 4.9586770613684035,
"learning_rate": 7.90654938014533e-06,
"loss": 0.6075,
"step": 1011
},
{
"epoch": 0.32,
"grad_norm": 7.0514251067612115,
"learning_rate": 7.902330963444234e-06,
"loss": 0.5834,
"step": 1012
},
{
"epoch": 0.32,
"grad_norm": 1.4457420322548138,
"learning_rate": 7.898109428752773e-06,
"loss": 0.2278,
"step": 1013
},
{
"epoch": 0.32,
"grad_norm": 5.223756030865263,
"learning_rate": 7.893884780606164e-06,
"loss": 0.4812,
"step": 1014
},
{
"epoch": 0.32,
"grad_norm": 1.5123115814863226,
"learning_rate": 7.889657023542973e-06,
"loss": 0.2431,
"step": 1015
},
{
"epoch": 0.33,
"grad_norm": 8.1927800996665,
"learning_rate": 7.885426162105101e-06,
"loss": 0.7178,
"step": 1016
},
{
"epoch": 0.33,
"grad_norm": 1.4361919853028358,
"learning_rate": 7.881192200837785e-06,
"loss": 0.2334,
"step": 1017
},
{
"epoch": 0.33,
"grad_norm": 1.6821320951520453,
"learning_rate": 7.876955144289594e-06,
"loss": 0.2727,
"step": 1018
},
{
"epoch": 0.33,
"grad_norm": 1.64615433229082,
"learning_rate": 7.872714997012421e-06,
"loss": 0.287,
"step": 1019
},
{
"epoch": 0.33,
"grad_norm": 1.5961821260575324,
"learning_rate": 7.868471763561482e-06,
"loss": 0.2612,
"step": 1020
},
{
"epoch": 0.33,
"grad_norm": 4.260052766123083,
"learning_rate": 7.864225448495304e-06,
"loss": 0.6239,
"step": 1021
},
{
"epoch": 0.33,
"grad_norm": 27.705341590163066,
"learning_rate": 7.85997605637573e-06,
"loss": 0.7627,
"step": 1022
},
{
"epoch": 0.33,
"grad_norm": 5.396292625469772,
"learning_rate": 7.855723591767903e-06,
"loss": 0.6497,
"step": 1023
},
{
"epoch": 0.33,
"grad_norm": 1.5134632432860375,
"learning_rate": 7.85146805924027e-06,
"loss": 0.2252,
"step": 1024
},
{
"epoch": 0.33,
"grad_norm": 8.969826699493574,
"learning_rate": 7.847209463364574e-06,
"loss": 0.5587,
"step": 1025
},
{
"epoch": 0.33,
"grad_norm": 8.041023991570867,
"learning_rate": 7.842947808715848e-06,
"loss": 0.6362,
"step": 1026
},
{
"epoch": 0.33,
"grad_norm": 1.5662874953195092,
"learning_rate": 7.83868309987241e-06,
"loss": 0.2194,
"step": 1027
},
{
"epoch": 0.33,
"grad_norm": 6.299266990989718,
"learning_rate": 7.834415341415862e-06,
"loss": 0.6061,
"step": 1028
},
{
"epoch": 0.33,
"grad_norm": 1.860859508577284,
"learning_rate": 7.830144537931082e-06,
"loss": 0.2787,
"step": 1029
},
{
"epoch": 0.33,
"grad_norm": 1.482805550334068,
"learning_rate": 7.825870694006217e-06,
"loss": 0.2164,
"step": 1030
},
{
"epoch": 0.33,
"grad_norm": 6.613164329297252,
"learning_rate": 7.82159381423268e-06,
"loss": 0.5754,
"step": 1031
},
{
"epoch": 0.33,
"grad_norm": 9.863889557878194,
"learning_rate": 7.817313903205148e-06,
"loss": 0.6181,
"step": 1032
},
{
"epoch": 0.33,
"grad_norm": 5.726291356356693,
"learning_rate": 7.813030965521554e-06,
"loss": 0.6551,
"step": 1033
},
{
"epoch": 0.33,
"grad_norm": 6.774162233097553,
"learning_rate": 7.80874500578308e-06,
"loss": 0.6264,
"step": 1034
},
{
"epoch": 0.33,
"grad_norm": 6.234214286889,
"learning_rate": 7.804456028594158e-06,
"loss": 0.6222,
"step": 1035
},
{
"epoch": 0.33,
"grad_norm": 1.9347285684404372,
"learning_rate": 7.80016403856246e-06,
"loss": 0.2751,
"step": 1036
},
{
"epoch": 0.33,
"grad_norm": 1.5520764220517966,
"learning_rate": 7.795869040298895e-06,
"loss": 0.2234,
"step": 1037
},
{
"epoch": 0.33,
"grad_norm": 6.052441876883189,
"learning_rate": 7.791571038417602e-06,
"loss": 0.4274,
"step": 1038
},
{
"epoch": 0.33,
"grad_norm": 5.895285255858338,
"learning_rate": 7.78727003753595e-06,
"loss": 0.6069,
"step": 1039
},
{
"epoch": 0.33,
"grad_norm": 10.79283859744142,
"learning_rate": 7.782966042274529e-06,
"loss": 0.5398,
"step": 1040
},
{
"epoch": 0.33,
"grad_norm": 5.795367350445128,
"learning_rate": 7.778659057257144e-06,
"loss": 0.5116,
"step": 1041
},
{
"epoch": 0.33,
"grad_norm": 7.115590126112468,
"learning_rate": 7.774349087110813e-06,
"loss": 0.5862,
"step": 1042
},
{
"epoch": 0.33,
"grad_norm": 19.186958069824044,
"learning_rate": 7.77003613646576e-06,
"loss": 0.6993,
"step": 1043
},
{
"epoch": 0.33,
"grad_norm": 1.5596763052378617,
"learning_rate": 7.765720209955414e-06,
"loss": 0.2379,
"step": 1044
},
{
"epoch": 0.33,
"grad_norm": 5.5453115203298875,
"learning_rate": 7.761401312216398e-06,
"loss": 0.7534,
"step": 1045
},
{
"epoch": 0.33,
"grad_norm": 1.4347707338100348,
"learning_rate": 7.757079447888529e-06,
"loss": 0.2546,
"step": 1046
},
{
"epoch": 0.34,
"grad_norm": 1.3853027058911302,
"learning_rate": 7.752754621614807e-06,
"loss": 0.1907,
"step": 1047
},
{
"epoch": 0.34,
"grad_norm": 9.12407220325666,
"learning_rate": 7.748426838041421e-06,
"loss": 0.6993,
"step": 1048
},
{
"epoch": 0.34,
"grad_norm": 1.4582603432101668,
"learning_rate": 7.744096101817731e-06,
"loss": 0.2337,
"step": 1049
},
{
"epoch": 0.34,
"grad_norm": 6.544452978143089,
"learning_rate": 7.73976241759627e-06,
"loss": 0.5982,
"step": 1050
},
{
"epoch": 0.34,
"grad_norm": 1.547444676269326,
"learning_rate": 7.73542579003274e-06,
"loss": 0.2192,
"step": 1051
},
{
"epoch": 0.34,
"grad_norm": 7.899907932162064,
"learning_rate": 7.731086223786006e-06,
"loss": 0.5546,
"step": 1052
},
{
"epoch": 0.34,
"grad_norm": 1.720644882947618,
"learning_rate": 7.726743723518087e-06,
"loss": 0.2878,
"step": 1053
},
{
"epoch": 0.34,
"grad_norm": 1.4463293971324411,
"learning_rate": 7.722398293894153e-06,
"loss": 0.2411,
"step": 1054
},
{
"epoch": 0.34,
"grad_norm": 6.399152675925439,
"learning_rate": 7.718049939582529e-06,
"loss": 0.6644,
"step": 1055
},
{
"epoch": 0.34,
"grad_norm": 1.535805744652878,
"learning_rate": 7.713698665254669e-06,
"loss": 0.2172,
"step": 1056
},
{
"epoch": 0.34,
"grad_norm": 1.8411273230737173,
"learning_rate": 7.70934447558518e-06,
"loss": 0.2479,
"step": 1057
},
{
"epoch": 0.34,
"grad_norm": 8.476171050864176,
"learning_rate": 7.704987375251782e-06,
"loss": 0.7114,
"step": 1058
},
{
"epoch": 0.34,
"grad_norm": 6.170794617861031,
"learning_rate": 7.70062736893534e-06,
"loss": 0.707,
"step": 1059
},
{
"epoch": 0.34,
"grad_norm": 1.5428935228716163,
"learning_rate": 7.696264461319831e-06,
"loss": 0.1995,
"step": 1060
},
{
"epoch": 0.34,
"grad_norm": 1.4706280562326233,
"learning_rate": 7.69189865709235e-06,
"loss": 0.2156,
"step": 1061
},
{
"epoch": 0.34,
"grad_norm": 1.5182855633267356,
"learning_rate": 7.687529960943107e-06,
"loss": 0.2155,
"step": 1062
},
{
"epoch": 0.34,
"grad_norm": 6.598787588558327,
"learning_rate": 7.683158377565415e-06,
"loss": 0.5596,
"step": 1063
},
{
"epoch": 0.34,
"grad_norm": 1.4701515988054943,
"learning_rate": 7.678783911655691e-06,
"loss": 0.2387,
"step": 1064
},
{
"epoch": 0.34,
"grad_norm": 1.7487342920858722,
"learning_rate": 7.674406567913447e-06,
"loss": 0.2849,
"step": 1065
},
{
"epoch": 0.34,
"grad_norm": 6.720506302872264,
"learning_rate": 7.67002635104129e-06,
"loss": 0.6633,
"step": 1066
},
{
"epoch": 0.34,
"grad_norm": 1.528372063446923,
"learning_rate": 7.66564326574491e-06,
"loss": 0.2445,
"step": 1067
},
{
"epoch": 0.34,
"grad_norm": 1.4659505765883247,
"learning_rate": 7.661257316733078e-06,
"loss": 0.2436,
"step": 1068
},
{
"epoch": 0.34,
"grad_norm": 1.5795243509696442,
"learning_rate": 7.656868508717648e-06,
"loss": 0.2671,
"step": 1069
},
{
"epoch": 0.34,
"grad_norm": 9.825712492400415,
"learning_rate": 7.652476846413537e-06,
"loss": 0.7208,
"step": 1070
},
{
"epoch": 0.34,
"grad_norm": 1.5701890378920915,
"learning_rate": 7.648082334538735e-06,
"loss": 0.262,
"step": 1071
},
{
"epoch": 0.34,
"grad_norm": 6.6832118611123095,
"learning_rate": 7.64368497781429e-06,
"loss": 0.5806,
"step": 1072
},
{
"epoch": 0.34,
"grad_norm": 1.6414381358525465,
"learning_rate": 7.639284780964307e-06,
"loss": 0.2409,
"step": 1073
},
{
"epoch": 0.34,
"grad_norm": 6.37110642480901,
"learning_rate": 7.634881748715941e-06,
"loss": 0.6684,
"step": 1074
},
{
"epoch": 0.34,
"grad_norm": 1.43825188221218,
"learning_rate": 7.630475885799395e-06,
"loss": 0.199,
"step": 1075
},
{
"epoch": 0.34,
"grad_norm": 5.637745509627005,
"learning_rate": 7.626067196947913e-06,
"loss": 0.7578,
"step": 1076
},
{
"epoch": 0.34,
"grad_norm": 7.540397124778485,
"learning_rate": 7.621655686897771e-06,
"loss": 0.6169,
"step": 1077
},
{
"epoch": 0.34,
"grad_norm": 1.5383291305985427,
"learning_rate": 7.617241360388282e-06,
"loss": 0.1869,
"step": 1078
},
{
"epoch": 0.35,
"grad_norm": 1.5769069469444448,
"learning_rate": 7.612824222161781e-06,
"loss": 0.2225,
"step": 1079
},
{
"epoch": 0.35,
"grad_norm": 1.6398866286392093,
"learning_rate": 7.608404276963623e-06,
"loss": 0.2062,
"step": 1080
},
{
"epoch": 0.35,
"grad_norm": 5.470655820078232,
"learning_rate": 7.60398152954218e-06,
"loss": 0.6265,
"step": 1081
},
{
"epoch": 0.35,
"grad_norm": 1.6552078121199802,
"learning_rate": 7.599555984648836e-06,
"loss": 0.2076,
"step": 1082
},
{
"epoch": 0.35,
"grad_norm": 1.6012296119356733,
"learning_rate": 7.595127647037976e-06,
"loss": 0.1988,
"step": 1083
},
{
"epoch": 0.35,
"grad_norm": 6.939505698849627,
"learning_rate": 7.590696521466992e-06,
"loss": 0.5608,
"step": 1084
},
{
"epoch": 0.35,
"grad_norm": 4.910673417408083,
"learning_rate": 7.586262612696263e-06,
"loss": 0.677,
"step": 1085
},
{
"epoch": 0.35,
"grad_norm": 1.6897523696154988,
"learning_rate": 7.5818259254891614e-06,
"loss": 0.2501,
"step": 1086
},
{
"epoch": 0.35,
"grad_norm": 1.3434317652867565,
"learning_rate": 7.577386464612049e-06,
"loss": 0.1795,
"step": 1087
},
{
"epoch": 0.35,
"grad_norm": 1.6902559742508552,
"learning_rate": 7.572944234834261e-06,
"loss": 0.2267,
"step": 1088
},
{
"epoch": 0.35,
"grad_norm": 1.3883211446125365,
"learning_rate": 7.568499240928109e-06,
"loss": 0.2141,
"step": 1089
},
{
"epoch": 0.35,
"grad_norm": 1.5298055785686242,
"learning_rate": 7.5640514876688765e-06,
"loss": 0.2406,
"step": 1090
},
{
"epoch": 0.35,
"grad_norm": 9.496424132872413,
"learning_rate": 7.559600979834809e-06,
"loss": 0.4984,
"step": 1091
},
{
"epoch": 0.35,
"grad_norm": 1.6057294345638613,
"learning_rate": 7.555147722207111e-06,
"loss": 0.2431,
"step": 1092
},
{
"epoch": 0.35,
"grad_norm": 5.860705570333863,
"learning_rate": 7.550691719569944e-06,
"loss": 0.5838,
"step": 1093
},
{
"epoch": 0.35,
"grad_norm": 1.5642369137762708,
"learning_rate": 7.546232976710413e-06,
"loss": 0.2329,
"step": 1094
},
{
"epoch": 0.35,
"grad_norm": 1.5083245316947056,
"learning_rate": 7.541771498418575e-06,
"loss": 0.2231,
"step": 1095
},
{
"epoch": 0.35,
"grad_norm": 12.391806214588945,
"learning_rate": 7.537307289487419e-06,
"loss": 0.6165,
"step": 1096
},
{
"epoch": 0.35,
"grad_norm": 6.900426095465898,
"learning_rate": 7.532840354712868e-06,
"loss": 0.5279,
"step": 1097
},
{
"epoch": 0.35,
"grad_norm": 1.6678388256247385,
"learning_rate": 7.5283706988937765e-06,
"loss": 0.2434,
"step": 1098
},
{
"epoch": 0.35,
"grad_norm": 7.705637433534857,
"learning_rate": 7.523898326831921e-06,
"loss": 0.516,
"step": 1099
},
{
"epoch": 0.35,
"grad_norm": 1.405010079202137,
"learning_rate": 7.5194232433319955e-06,
"loss": 0.2058,
"step": 1100
},
{
"epoch": 0.35,
"grad_norm": 6.5585545569480015,
"learning_rate": 7.514945453201608e-06,
"loss": 0.5136,
"step": 1101
},
{
"epoch": 0.35,
"grad_norm": 36.403557439297025,
"learning_rate": 7.510464961251271e-06,
"loss": 0.5227,
"step": 1102
},
{
"epoch": 0.35,
"grad_norm": 4.83369309587166,
"learning_rate": 7.505981772294404e-06,
"loss": 0.4469,
"step": 1103
},
{
"epoch": 0.35,
"grad_norm": 11.349035410601026,
"learning_rate": 7.501495891147322e-06,
"loss": 0.7265,
"step": 1104
},
{
"epoch": 0.35,
"grad_norm": 1.5579260386667464,
"learning_rate": 7.497007322629231e-06,
"loss": 0.2236,
"step": 1105
},
{
"epoch": 0.35,
"grad_norm": 10.065921900057672,
"learning_rate": 7.492516071562226e-06,
"loss": 0.5644,
"step": 1106
},
{
"epoch": 0.35,
"grad_norm": 1.8056548718693688,
"learning_rate": 7.488022142771282e-06,
"loss": 0.264,
"step": 1107
},
{
"epoch": 0.35,
"grad_norm": 1.727491729848117,
"learning_rate": 7.483525541084253e-06,
"loss": 0.2353,
"step": 1108
},
{
"epoch": 0.35,
"grad_norm": 1.6648267876413188,
"learning_rate": 7.479026271331864e-06,
"loss": 0.2699,
"step": 1109
},
{
"epoch": 0.36,
"grad_norm": 5.528195596975851,
"learning_rate": 7.4745243383477055e-06,
"loss": 0.5324,
"step": 1110
},
{
"epoch": 0.36,
"grad_norm": 1.483370410699109,
"learning_rate": 7.470019746968226e-06,
"loss": 0.2263,
"step": 1111
},
{
"epoch": 0.36,
"grad_norm": 9.464428186655587,
"learning_rate": 7.4655125020327376e-06,
"loss": 0.7973,
"step": 1112
},
{
"epoch": 0.36,
"grad_norm": 1.513103440189699,
"learning_rate": 7.461002608383396e-06,
"loss": 0.2109,
"step": 1113
},
{
"epoch": 0.36,
"grad_norm": 1.6308949314556291,
"learning_rate": 7.456490070865206e-06,
"loss": 0.2618,
"step": 1114
},
{
"epoch": 0.36,
"grad_norm": 5.951063225201904,
"learning_rate": 7.4519748943260126e-06,
"loss": 0.5295,
"step": 1115
},
{
"epoch": 0.36,
"grad_norm": 12.78982777413769,
"learning_rate": 7.447457083616494e-06,
"loss": 0.644,
"step": 1116
},
{
"epoch": 0.36,
"grad_norm": 1.517175737638629,
"learning_rate": 7.44293664359016e-06,
"loss": 0.2783,
"step": 1117
},
{
"epoch": 0.36,
"grad_norm": 1.31606475875191,
"learning_rate": 7.438413579103344e-06,
"loss": 0.2124,
"step": 1118
},
{
"epoch": 0.36,
"grad_norm": 1.5883162286889605,
"learning_rate": 7.433887895015199e-06,
"loss": 0.2407,
"step": 1119
},
{
"epoch": 0.36,
"grad_norm": 4.557176169061043,
"learning_rate": 7.429359596187694e-06,
"loss": 0.4328,
"step": 1120
},
{
"epoch": 0.36,
"grad_norm": 1.6339977285742284,
"learning_rate": 7.424828687485606e-06,
"loss": 0.198,
"step": 1121
},
{
"epoch": 0.36,
"grad_norm": 1.3224105771875427,
"learning_rate": 7.420295173776515e-06,
"loss": 0.2403,
"step": 1122
},
{
"epoch": 0.36,
"grad_norm": 1.4829810191238573,
"learning_rate": 7.415759059930799e-06,
"loss": 0.2191,
"step": 1123
},
{
"epoch": 0.36,
"grad_norm": 1.4337356279338318,
"learning_rate": 7.411220350821631e-06,
"loss": 0.2743,
"step": 1124
},
{
"epoch": 0.36,
"grad_norm": 1.65314659862354,
"learning_rate": 7.406679051324972e-06,
"loss": 0.2609,
"step": 1125
},
{
"epoch": 0.36,
"grad_norm": 1.7342291144299689,
"learning_rate": 7.402135166319567e-06,
"loss": 0.2289,
"step": 1126
},
{
"epoch": 0.36,
"grad_norm": 1.696792075138794,
"learning_rate": 7.397588700686933e-06,
"loss": 0.27,
"step": 1127
},
{
"epoch": 0.36,
"grad_norm": 1.6044938667398625,
"learning_rate": 7.393039659311366e-06,
"loss": 0.2504,
"step": 1128
},
{
"epoch": 0.36,
"grad_norm": 5.503861269795774,
"learning_rate": 7.388488047079927e-06,
"loss": 0.4895,
"step": 1129
},
{
"epoch": 0.36,
"grad_norm": 1.587935245498577,
"learning_rate": 7.383933868882438e-06,
"loss": 0.2838,
"step": 1130
},
{
"epoch": 0.36,
"grad_norm": 1.4158476436413507,
"learning_rate": 7.379377129611478e-06,
"loss": 0.2073,
"step": 1131
},
{
"epoch": 0.36,
"grad_norm": 1.7373105615765783,
"learning_rate": 7.374817834162378e-06,
"loss": 0.2975,
"step": 1132
},
{
"epoch": 0.36,
"grad_norm": 8.831666955944666,
"learning_rate": 7.3702559874332125e-06,
"loss": 0.5983,
"step": 1133
},
{
"epoch": 0.36,
"grad_norm": 9.31665713413904,
"learning_rate": 7.3656915943247984e-06,
"loss": 0.6335,
"step": 1134
},
{
"epoch": 0.36,
"grad_norm": 7.06706466947314,
"learning_rate": 7.3611246597406925e-06,
"loss": 0.4158,
"step": 1135
},
{
"epoch": 0.36,
"grad_norm": 1.5915787530412477,
"learning_rate": 7.356555188587178e-06,
"loss": 0.1906,
"step": 1136
},
{
"epoch": 0.36,
"grad_norm": 1.4657555028102185,
"learning_rate": 7.351983185773259e-06,
"loss": 0.1868,
"step": 1137
},
{
"epoch": 0.36,
"grad_norm": 9.539314795725554,
"learning_rate": 7.347408656210666e-06,
"loss": 0.7162,
"step": 1138
},
{
"epoch": 0.36,
"grad_norm": 1.6430391123504855,
"learning_rate": 7.342831604813844e-06,
"loss": 0.2382,
"step": 1139
},
{
"epoch": 0.36,
"grad_norm": 6.29943915826192,
"learning_rate": 7.338252036499941e-06,
"loss": 0.5422,
"step": 1140
},
{
"epoch": 0.37,
"grad_norm": 1.5739552981093123,
"learning_rate": 7.333669956188815e-06,
"loss": 0.2203,
"step": 1141
},
{
"epoch": 0.37,
"grad_norm": 5.891490773197444,
"learning_rate": 7.3290853688030196e-06,
"loss": 0.5411,
"step": 1142
},
{
"epoch": 0.37,
"grad_norm": 1.4076848345888082,
"learning_rate": 7.324498279267803e-06,
"loss": 0.2263,
"step": 1143
},
{
"epoch": 0.37,
"grad_norm": 1.771801958389905,
"learning_rate": 7.319908692511103e-06,
"loss": 0.2898,
"step": 1144
},
{
"epoch": 0.37,
"grad_norm": 1.5087526253506416,
"learning_rate": 7.315316613463535e-06,
"loss": 0.277,
"step": 1145
},
{
"epoch": 0.37,
"grad_norm": 5.112445795077407,
"learning_rate": 7.310722047058396e-06,
"loss": 0.5377,
"step": 1146
},
{
"epoch": 0.37,
"grad_norm": 5.482099175081986,
"learning_rate": 7.306124998231655e-06,
"loss": 0.6483,
"step": 1147
},
{
"epoch": 0.37,
"grad_norm": 1.3755375793547229,
"learning_rate": 7.301525471921949e-06,
"loss": 0.2425,
"step": 1148
},
{
"epoch": 0.37,
"grad_norm": 6.510124397597574,
"learning_rate": 7.296923473070571e-06,
"loss": 0.5505,
"step": 1149
},
{
"epoch": 0.37,
"grad_norm": 3.8451392454881526,
"learning_rate": 7.292319006621477e-06,
"loss": 0.4919,
"step": 1150
},
{
"epoch": 0.37,
"grad_norm": 1.6767803172131721,
"learning_rate": 7.2877120775212685e-06,
"loss": 0.2552,
"step": 1151
},
{
"epoch": 0.37,
"grad_norm": 5.311592966112747,
"learning_rate": 7.283102690719198e-06,
"loss": 0.6434,
"step": 1152
},
{
"epoch": 0.37,
"grad_norm": 1.6591025243854203,
"learning_rate": 7.278490851167155e-06,
"loss": 0.2732,
"step": 1153
},
{
"epoch": 0.37,
"grad_norm": 5.11335338371546,
"learning_rate": 7.2738765638196625e-06,
"loss": 0.4616,
"step": 1154
},
{
"epoch": 0.37,
"grad_norm": 1.4705208467577287,
"learning_rate": 7.269259833633877e-06,
"loss": 0.237,
"step": 1155
},
{
"epoch": 0.37,
"grad_norm": 5.8603109418712185,
"learning_rate": 7.264640665569577e-06,
"loss": 0.7292,
"step": 1156
},
{
"epoch": 0.37,
"grad_norm": 5.671324260946241,
"learning_rate": 7.26001906458916e-06,
"loss": 0.6222,
"step": 1157
},
{
"epoch": 0.37,
"grad_norm": 1.542420695545935,
"learning_rate": 7.255395035657639e-06,
"loss": 0.2652,
"step": 1158
},
{
"epoch": 0.37,
"grad_norm": 1.6422364296970795,
"learning_rate": 7.250768583742634e-06,
"loss": 0.2404,
"step": 1159
},
{
"epoch": 0.37,
"grad_norm": 1.6714535837388254,
"learning_rate": 7.246139713814365e-06,
"loss": 0.2571,
"step": 1160
},
{
"epoch": 0.37,
"grad_norm": 1.6316225727344515,
"learning_rate": 7.241508430845656e-06,
"loss": 0.2256,
"step": 1161
},
{
"epoch": 0.37,
"grad_norm": 1.4787343653996288,
"learning_rate": 7.236874739811921e-06,
"loss": 0.2382,
"step": 1162
},
{
"epoch": 0.37,
"grad_norm": 1.4072637031481081,
"learning_rate": 7.232238645691157e-06,
"loss": 0.2002,
"step": 1163
},
{
"epoch": 0.37,
"grad_norm": 1.426067123824302,
"learning_rate": 7.227600153463947e-06,
"loss": 0.1755,
"step": 1164
},
{
"epoch": 0.37,
"grad_norm": 4.655704700840468,
"learning_rate": 7.222959268113452e-06,
"loss": 0.3598,
"step": 1165
},
{
"epoch": 0.37,
"grad_norm": 5.370522241713215,
"learning_rate": 7.218315994625397e-06,
"loss": 0.557,
"step": 1166
},
{
"epoch": 0.37,
"grad_norm": 1.5889383930450987,
"learning_rate": 7.213670337988079e-06,
"loss": 0.2034,
"step": 1167
},
{
"epoch": 0.37,
"grad_norm": 1.8072550518122952,
"learning_rate": 7.209022303192351e-06,
"loss": 0.2207,
"step": 1168
},
{
"epoch": 0.37,
"grad_norm": 83.64482496451608,
"learning_rate": 7.204371895231623e-06,
"loss": 0.6215,
"step": 1169
},
{
"epoch": 0.37,
"grad_norm": 1.7440266241540439,
"learning_rate": 7.199719119101858e-06,
"loss": 0.2275,
"step": 1170
},
{
"epoch": 0.37,
"grad_norm": 7.512565989250296,
"learning_rate": 7.195063979801554e-06,
"loss": 0.4643,
"step": 1171
},
{
"epoch": 0.38,
"grad_norm": 6.892103848259442,
"learning_rate": 7.190406482331757e-06,
"loss": 0.5403,
"step": 1172
},
{
"epoch": 0.38,
"grad_norm": 1.4777073711805213,
"learning_rate": 7.18574663169604e-06,
"loss": 0.2513,
"step": 1173
},
{
"epoch": 0.38,
"grad_norm": 7.282807661223366,
"learning_rate": 7.1810844329005095e-06,
"loss": 0.5989,
"step": 1174
},
{
"epoch": 0.38,
"grad_norm": 1.5132827019244917,
"learning_rate": 7.176419890953788e-06,
"loss": 0.2253,
"step": 1175
},
{
"epoch": 0.38,
"grad_norm": 7.717783537027684,
"learning_rate": 7.171753010867023e-06,
"loss": 0.6799,
"step": 1176
},
{
"epoch": 0.38,
"grad_norm": 5.943915152849767,
"learning_rate": 7.167083797653866e-06,
"loss": 0.5902,
"step": 1177
},
{
"epoch": 0.38,
"grad_norm": 1.4586529974257227,
"learning_rate": 7.162412256330481e-06,
"loss": 0.202,
"step": 1178
},
{
"epoch": 0.38,
"grad_norm": 7.186611443861163,
"learning_rate": 7.157738391915531e-06,
"loss": 0.5302,
"step": 1179
},
{
"epoch": 0.38,
"grad_norm": 1.6834896407533133,
"learning_rate": 7.153062209430174e-06,
"loss": 0.251,
"step": 1180
},
{
"epoch": 0.38,
"grad_norm": 7.405116272777637,
"learning_rate": 7.148383713898058e-06,
"loss": 0.5743,
"step": 1181
},
{
"epoch": 0.38,
"grad_norm": 1.522999107758133,
"learning_rate": 7.143702910345318e-06,
"loss": 0.2723,
"step": 1182
},
{
"epoch": 0.38,
"grad_norm": 1.880200730593367,
"learning_rate": 7.139019803800569e-06,
"loss": 0.2816,
"step": 1183
},
{
"epoch": 0.38,
"grad_norm": 1.3816806243398323,
"learning_rate": 7.134334399294897e-06,
"loss": 0.2367,
"step": 1184
},
{
"epoch": 0.38,
"grad_norm": 1.7284420755099217,
"learning_rate": 7.129646701861858e-06,
"loss": 0.2544,
"step": 1185
},
{
"epoch": 0.38,
"grad_norm": 1.629550120138974,
"learning_rate": 7.124956716537471e-06,
"loss": 0.2068,
"step": 1186
},
{
"epoch": 0.38,
"grad_norm": 1.6243307078750684,
"learning_rate": 7.120264448360214e-06,
"loss": 0.213,
"step": 1187
},
{
"epoch": 0.38,
"grad_norm": 10.565396265308518,
"learning_rate": 7.115569902371018e-06,
"loss": 0.5904,
"step": 1188
},
{
"epoch": 0.38,
"grad_norm": 5.465892487151138,
"learning_rate": 7.110873083613259e-06,
"loss": 0.6009,
"step": 1189
},
{
"epoch": 0.38,
"grad_norm": 1.5358502320339789,
"learning_rate": 7.106173997132755e-06,
"loss": 0.2387,
"step": 1190
},
{
"epoch": 0.38,
"grad_norm": 1.363560919662825,
"learning_rate": 7.101472647977761e-06,
"loss": 0.2115,
"step": 1191
},
{
"epoch": 0.38,
"grad_norm": 4.92920993447301,
"learning_rate": 7.096769041198964e-06,
"loss": 0.5299,
"step": 1192
},
{
"epoch": 0.38,
"grad_norm": 1.6398438629568042,
"learning_rate": 7.0920631818494745e-06,
"loss": 0.2603,
"step": 1193
},
{
"epoch": 0.38,
"grad_norm": 1.3740360168319345,
"learning_rate": 7.087355074984823e-06,
"loss": 0.198,
"step": 1194
},
{
"epoch": 0.38,
"grad_norm": 1.5816758489089993,
"learning_rate": 7.082644725662954e-06,
"loss": 0.2146,
"step": 1195
},
{
"epoch": 0.38,
"grad_norm": 5.105693805152416,
"learning_rate": 7.077932138944225e-06,
"loss": 0.5887,
"step": 1196
},
{
"epoch": 0.38,
"grad_norm": 23.6755027232162,
"learning_rate": 7.073217319891391e-06,
"loss": 0.6281,
"step": 1197
},
{
"epoch": 0.38,
"grad_norm": 8.074180552610876,
"learning_rate": 7.068500273569612e-06,
"loss": 0.6841,
"step": 1198
},
{
"epoch": 0.38,
"grad_norm": 1.398156113046471,
"learning_rate": 7.063781005046433e-06,
"loss": 0.1879,
"step": 1199
},
{
"epoch": 0.38,
"grad_norm": 11.617325223575968,
"learning_rate": 7.059059519391794e-06,
"loss": 0.7071,
"step": 1200
},
{
"epoch": 0.38,
"grad_norm": 10.09826264231247,
"learning_rate": 7.054335821678012e-06,
"loss": 0.4581,
"step": 1201
},
{
"epoch": 0.38,
"grad_norm": 1.7538931020652984,
"learning_rate": 7.049609916979782e-06,
"loss": 0.2602,
"step": 1202
},
{
"epoch": 0.38,
"grad_norm": 3.735135329799002,
"learning_rate": 7.044881810374169e-06,
"loss": 0.3716,
"step": 1203
},
{
"epoch": 0.39,
"grad_norm": 4.869733006281636,
"learning_rate": 7.040151506940605e-06,
"loss": 0.2934,
"step": 1204
},
{
"epoch": 0.39,
"grad_norm": 7.683606301965944,
"learning_rate": 7.035419011760882e-06,
"loss": 0.697,
"step": 1205
},
{
"epoch": 0.39,
"grad_norm": 1.6759892898397664,
"learning_rate": 7.0306843299191465e-06,
"loss": 0.2519,
"step": 1206
},
{
"epoch": 0.39,
"grad_norm": 4.443344577883213,
"learning_rate": 7.0259474665018915e-06,
"loss": 0.5027,
"step": 1207
},
{
"epoch": 0.39,
"grad_norm": 1.4516786600799045,
"learning_rate": 7.0212084265979575e-06,
"loss": 0.2009,
"step": 1208
},
{
"epoch": 0.39,
"grad_norm": 5.018830872924124,
"learning_rate": 7.016467215298519e-06,
"loss": 0.702,
"step": 1209
},
{
"epoch": 0.39,
"grad_norm": 13.372418015625748,
"learning_rate": 7.011723837697091e-06,
"loss": 0.7114,
"step": 1210
},
{
"epoch": 0.39,
"grad_norm": 6.277932946105866,
"learning_rate": 7.0069782988895056e-06,
"loss": 0.5805,
"step": 1211
},
{
"epoch": 0.39,
"grad_norm": 1.531528513144936,
"learning_rate": 7.002230603973924e-06,
"loss": 0.1991,
"step": 1212
},
{
"epoch": 0.39,
"grad_norm": 1.8223707208415971,
"learning_rate": 6.9974807580508205e-06,
"loss": 0.2809,
"step": 1213
},
{
"epoch": 0.39,
"grad_norm": 8.546674650105915,
"learning_rate": 6.992728766222982e-06,
"loss": 0.6171,
"step": 1214
},
{
"epoch": 0.39,
"grad_norm": 9.548484544610723,
"learning_rate": 6.987974633595498e-06,
"loss": 0.5622,
"step": 1215
},
{
"epoch": 0.39,
"grad_norm": 1.5120183426116132,
"learning_rate": 6.9832183652757625e-06,
"loss": 0.2235,
"step": 1216
},
{
"epoch": 0.39,
"grad_norm": 6.71791971273723,
"learning_rate": 6.978459966373458e-06,
"loss": 0.5058,
"step": 1217
},
{
"epoch": 0.39,
"grad_norm": 6.364431409221105,
"learning_rate": 6.973699442000561e-06,
"loss": 0.4824,
"step": 1218
},
{
"epoch": 0.39,
"grad_norm": 5.7766999911114985,
"learning_rate": 6.96893679727133e-06,
"loss": 0.4832,
"step": 1219
},
{
"epoch": 0.39,
"grad_norm": 1.7531010618552019,
"learning_rate": 6.9641720373022996e-06,
"loss": 0.1971,
"step": 1220
},
{
"epoch": 0.39,
"grad_norm": 1.6570198426139906,
"learning_rate": 6.959405167212278e-06,
"loss": 0.2393,
"step": 1221
},
{
"epoch": 0.39,
"grad_norm": 1.5456440395214717,
"learning_rate": 6.954636192122339e-06,
"loss": 0.2604,
"step": 1222
},
{
"epoch": 0.39,
"grad_norm": 1.6624183851105219,
"learning_rate": 6.949865117155823e-06,
"loss": 0.2813,
"step": 1223
},
{
"epoch": 0.39,
"grad_norm": 8.06647912493275,
"learning_rate": 6.94509194743832e-06,
"loss": 0.5583,
"step": 1224
},
{
"epoch": 0.39,
"grad_norm": 7.806322784742366,
"learning_rate": 6.940316688097675e-06,
"loss": 0.3821,
"step": 1225
},
{
"epoch": 0.39,
"grad_norm": 1.4797776369891873,
"learning_rate": 6.935539344263971e-06,
"loss": 0.2116,
"step": 1226
},
{
"epoch": 0.39,
"grad_norm": 1.7083919322940107,
"learning_rate": 6.93075992106954e-06,
"loss": 0.2568,
"step": 1227
},
{
"epoch": 0.39,
"grad_norm": 9.226085074768402,
"learning_rate": 6.925978423648941e-06,
"loss": 0.6846,
"step": 1228
},
{
"epoch": 0.39,
"grad_norm": 1.375716705377596,
"learning_rate": 6.921194857138963e-06,
"loss": 0.197,
"step": 1229
},
{
"epoch": 0.39,
"grad_norm": 5.685446489524216,
"learning_rate": 6.91640922667862e-06,
"loss": 0.4744,
"step": 1230
},
{
"epoch": 0.39,
"grad_norm": 1.5215358669218515,
"learning_rate": 6.911621537409139e-06,
"loss": 0.2391,
"step": 1231
},
{
"epoch": 0.39,
"grad_norm": 6.511798129596943,
"learning_rate": 6.906831794473963e-06,
"loss": 0.517,
"step": 1232
},
{
"epoch": 0.39,
"grad_norm": 4.6931560492107725,
"learning_rate": 6.9020400030187394e-06,
"loss": 0.593,
"step": 1233
},
{
"epoch": 0.39,
"grad_norm": 7.712398832752451,
"learning_rate": 6.897246168191317e-06,
"loss": 0.5824,
"step": 1234
},
{
"epoch": 0.4,
"grad_norm": 1.613022044089724,
"learning_rate": 6.892450295141737e-06,
"loss": 0.2741,
"step": 1235
},
{
"epoch": 0.4,
"grad_norm": 7.279473121046338,
"learning_rate": 6.887652389022236e-06,
"loss": 0.438,
"step": 1236
},
{
"epoch": 0.4,
"grad_norm": 5.838290307338631,
"learning_rate": 6.88285245498723e-06,
"loss": 0.5796,
"step": 1237
},
{
"epoch": 0.4,
"grad_norm": 1.5404039634116309,
"learning_rate": 6.878050498193314e-06,
"loss": 0.2412,
"step": 1238
},
{
"epoch": 0.4,
"grad_norm": 14.262886472471997,
"learning_rate": 6.873246523799256e-06,
"loss": 0.6166,
"step": 1239
},
{
"epoch": 0.4,
"grad_norm": 7.073604111640909,
"learning_rate": 6.868440536965997e-06,
"loss": 0.6247,
"step": 1240
},
{
"epoch": 0.4,
"grad_norm": 1.5119057367294504,
"learning_rate": 6.863632542856632e-06,
"loss": 0.22,
"step": 1241
},
{
"epoch": 0.4,
"grad_norm": 6.770024997584175,
"learning_rate": 6.858822546636417e-06,
"loss": 0.5132,
"step": 1242
},
{
"epoch": 0.4,
"grad_norm": 4.621401190894274,
"learning_rate": 6.854010553472757e-06,
"loss": 0.5888,
"step": 1243
},
{
"epoch": 0.4,
"grad_norm": 5.731610298277731,
"learning_rate": 6.849196568535201e-06,
"loss": 0.6065,
"step": 1244
},
{
"epoch": 0.4,
"grad_norm": 14.911449842987928,
"learning_rate": 6.8443805969954445e-06,
"loss": 0.6184,
"step": 1245
},
{
"epoch": 0.4,
"grad_norm": 4.60898253192161,
"learning_rate": 6.839562644027311e-06,
"loss": 0.5104,
"step": 1246
},
{
"epoch": 0.4,
"grad_norm": 6.096953598140278,
"learning_rate": 6.834742714806754e-06,
"loss": 0.5489,
"step": 1247
},
{
"epoch": 0.4,
"grad_norm": 1.5113402009838244,
"learning_rate": 6.8299208145118475e-06,
"loss": 0.2437,
"step": 1248
},
{
"epoch": 0.4,
"grad_norm": 5.071904749982615,
"learning_rate": 6.825096948322791e-06,
"loss": 0.4732,
"step": 1249
},
{
"epoch": 0.4,
"grad_norm": 5.92365925359732,
"learning_rate": 6.820271121421889e-06,
"loss": 0.4908,
"step": 1250
},
{
"epoch": 0.4,
"grad_norm": 1.4776199050644825,
"learning_rate": 6.815443338993554e-06,
"loss": 0.2662,
"step": 1251
},
{
"epoch": 0.4,
"grad_norm": 34.18987797681408,
"learning_rate": 6.810613606224299e-06,
"loss": 0.5637,
"step": 1252
},
{
"epoch": 0.4,
"grad_norm": 1.5500535111980442,
"learning_rate": 6.805781928302732e-06,
"loss": 0.238,
"step": 1253
},
{
"epoch": 0.4,
"grad_norm": 1.4603160104682158,
"learning_rate": 6.800948310419554e-06,
"loss": 0.2363,
"step": 1254
},
{
"epoch": 0.4,
"grad_norm": 7.992049335749039,
"learning_rate": 6.796112757767547e-06,
"loss": 0.6291,
"step": 1255
},
{
"epoch": 0.4,
"grad_norm": 6.296197488841442,
"learning_rate": 6.7912752755415716e-06,
"loss": 0.5768,
"step": 1256
},
{
"epoch": 0.4,
"grad_norm": 5.462929177553609,
"learning_rate": 6.786435868938561e-06,
"loss": 0.6117,
"step": 1257
},
{
"epoch": 0.4,
"grad_norm": 1.4717241337016855,
"learning_rate": 6.78159454315752e-06,
"loss": 0.2492,
"step": 1258
},
{
"epoch": 0.4,
"grad_norm": 9.776704836987184,
"learning_rate": 6.776751303399509e-06,
"loss": 0.5362,
"step": 1259
},
{
"epoch": 0.4,
"grad_norm": 6.608262013803274,
"learning_rate": 6.771906154867649e-06,
"loss": 0.5421,
"step": 1260
},
{
"epoch": 0.4,
"grad_norm": 1.5844296226578645,
"learning_rate": 6.767059102767109e-06,
"loss": 0.2356,
"step": 1261
},
{
"epoch": 0.4,
"grad_norm": 6.090627408845525,
"learning_rate": 6.7622101523051045e-06,
"loss": 0.6988,
"step": 1262
},
{
"epoch": 0.4,
"grad_norm": 1.522165906490889,
"learning_rate": 6.757359308690889e-06,
"loss": 0.2305,
"step": 1263
},
{
"epoch": 0.4,
"grad_norm": 1.751553003654809,
"learning_rate": 6.7525065771357546e-06,
"loss": 0.2702,
"step": 1264
},
{
"epoch": 0.4,
"grad_norm": 1.55487397284913,
"learning_rate": 6.7476519628530145e-06,
"loss": 0.27,
"step": 1265
},
{
"epoch": 0.41,
"grad_norm": 1.5713202363010224,
"learning_rate": 6.742795471058009e-06,
"loss": 0.2193,
"step": 1266
},
{
"epoch": 0.41,
"grad_norm": 1.5682347759148718,
"learning_rate": 6.737937106968094e-06,
"loss": 0.1929,
"step": 1267
},
{
"epoch": 0.41,
"grad_norm": 10.448845373480625,
"learning_rate": 6.7330768758026374e-06,
"loss": 0.6081,
"step": 1268
},
{
"epoch": 0.41,
"grad_norm": 1.4756514362204363,
"learning_rate": 6.728214782783013e-06,
"loss": 0.2493,
"step": 1269
},
{
"epoch": 0.41,
"grad_norm": 8.736394717506005,
"learning_rate": 6.723350833132596e-06,
"loss": 0.5611,
"step": 1270
},
{
"epoch": 0.41,
"grad_norm": 1.432299091066597,
"learning_rate": 6.7184850320767505e-06,
"loss": 0.1889,
"step": 1271
},
{
"epoch": 0.41,
"grad_norm": 6.2082942303491455,
"learning_rate": 6.7136173848428375e-06,
"loss": 0.5751,
"step": 1272
},
{
"epoch": 0.41,
"grad_norm": 5.900510845746761,
"learning_rate": 6.708747896660196e-06,
"loss": 0.5811,
"step": 1273
},
{
"epoch": 0.41,
"grad_norm": 7.232392559033582,
"learning_rate": 6.703876572760144e-06,
"loss": 0.5332,
"step": 1274
},
{
"epoch": 0.41,
"grad_norm": 5.012627379636099,
"learning_rate": 6.6990034183759726e-06,
"loss": 0.421,
"step": 1275
},
{
"epoch": 0.41,
"grad_norm": 1.6039349162455028,
"learning_rate": 6.694128438742939e-06,
"loss": 0.2281,
"step": 1276
},
{
"epoch": 0.41,
"grad_norm": 7.378904891237365,
"learning_rate": 6.689251639098261e-06,
"loss": 0.5378,
"step": 1277
},
{
"epoch": 0.41,
"grad_norm": 1.6722139873092432,
"learning_rate": 6.684373024681112e-06,
"loss": 0.2682,
"step": 1278
},
{
"epoch": 0.41,
"grad_norm": 1.6777328407422487,
"learning_rate": 6.679492600732614e-06,
"loss": 0.2174,
"step": 1279
},
{
"epoch": 0.41,
"grad_norm": 6.0752801994029415,
"learning_rate": 6.674610372495832e-06,
"loss": 0.6384,
"step": 1280
},
{
"epoch": 0.41,
"grad_norm": 6.741101694208744,
"learning_rate": 6.669726345215776e-06,
"loss": 0.7203,
"step": 1281
},
{
"epoch": 0.41,
"grad_norm": 1.6406867327439791,
"learning_rate": 6.66484052413938e-06,
"loss": 0.2416,
"step": 1282
},
{
"epoch": 0.41,
"grad_norm": 7.514032353916119,
"learning_rate": 6.659952914515508e-06,
"loss": 0.485,
"step": 1283
},
{
"epoch": 0.41,
"grad_norm": 1.5597751611658328,
"learning_rate": 6.65506352159495e-06,
"loss": 0.2443,
"step": 1284
},
{
"epoch": 0.41,
"grad_norm": 1.5405462003001633,
"learning_rate": 6.650172350630406e-06,
"loss": 0.2741,
"step": 1285
},
{
"epoch": 0.41,
"grad_norm": 6.525607991866356,
"learning_rate": 6.645279406876488e-06,
"loss": 0.5759,
"step": 1286
},
{
"epoch": 0.41,
"grad_norm": 5.002358514388684,
"learning_rate": 6.640384695589714e-06,
"loss": 0.4653,
"step": 1287
},
{
"epoch": 0.41,
"grad_norm": 1.4727288285377615,
"learning_rate": 6.635488222028497e-06,
"loss": 0.2807,
"step": 1288
},
{
"epoch": 0.41,
"grad_norm": 1.5045382510946763,
"learning_rate": 6.630589991453148e-06,
"loss": 0.2361,
"step": 1289
},
{
"epoch": 0.41,
"grad_norm": 1.6865073317669421,
"learning_rate": 6.6256900091258644e-06,
"loss": 0.2688,
"step": 1290
},
{
"epoch": 0.41,
"grad_norm": 1.820786742344778,
"learning_rate": 6.620788280310722e-06,
"loss": 0.2648,
"step": 1291
},
{
"epoch": 0.41,
"grad_norm": 1.7462808336866076,
"learning_rate": 6.615884810273678e-06,
"loss": 0.2464,
"step": 1292
},
{
"epoch": 0.41,
"grad_norm": 1.5298051068868284,
"learning_rate": 6.610979604282557e-06,
"loss": 0.2221,
"step": 1293
},
{
"epoch": 0.41,
"grad_norm": 5.872999684855236,
"learning_rate": 6.606072667607048e-06,
"loss": 0.6946,
"step": 1294
},
{
"epoch": 0.41,
"grad_norm": 7.095836416798553,
"learning_rate": 6.601164005518702e-06,
"loss": 0.5355,
"step": 1295
},
{
"epoch": 0.41,
"grad_norm": 1.694822554317753,
"learning_rate": 6.59625362329092e-06,
"loss": 0.2686,
"step": 1296
},
{
"epoch": 0.42,
"grad_norm": 1.7911594577885972,
"learning_rate": 6.591341526198955e-06,
"loss": 0.2835,
"step": 1297
},
{
"epoch": 0.42,
"grad_norm": 1.7186980047557423,
"learning_rate": 6.586427719519901e-06,
"loss": 0.2505,
"step": 1298
},
{
"epoch": 0.42,
"grad_norm": 1.477694989730813,
"learning_rate": 6.581512208532685e-06,
"loss": 0.2179,
"step": 1299
},
{
"epoch": 0.42,
"grad_norm": 8.594806131062327,
"learning_rate": 6.576594998518071e-06,
"loss": 0.5909,
"step": 1300
},
{
"epoch": 0.42,
"grad_norm": 5.231386413737722,
"learning_rate": 6.5716760947586425e-06,
"loss": 0.4832,
"step": 1301
},
{
"epoch": 0.42,
"grad_norm": 1.5295569495021915,
"learning_rate": 6.566755502538806e-06,
"loss": 0.246,
"step": 1302
},
{
"epoch": 0.42,
"grad_norm": 8.168014180758991,
"learning_rate": 6.561833227144784e-06,
"loss": 0.5241,
"step": 1303
},
{
"epoch": 0.42,
"grad_norm": 1.43115436817828,
"learning_rate": 6.556909273864601e-06,
"loss": 0.1902,
"step": 1304
},
{
"epoch": 0.42,
"grad_norm": 1.4397401401153558,
"learning_rate": 6.551983647988089e-06,
"loss": 0.2052,
"step": 1305
},
{
"epoch": 0.42,
"grad_norm": 1.4140575314568167,
"learning_rate": 6.547056354806874e-06,
"loss": 0.2099,
"step": 1306
},
{
"epoch": 0.42,
"grad_norm": 13.958535014184495,
"learning_rate": 6.542127399614376e-06,
"loss": 0.5045,
"step": 1307
},
{
"epoch": 0.42,
"grad_norm": 5.929928698067259,
"learning_rate": 6.5371967877058e-06,
"loss": 0.6021,
"step": 1308
},
{
"epoch": 0.42,
"grad_norm": 6.4073521805396165,
"learning_rate": 6.532264524378128e-06,
"loss": 0.5263,
"step": 1309
},
{
"epoch": 0.42,
"grad_norm": 1.4476342489402703,
"learning_rate": 6.52733061493012e-06,
"loss": 0.24,
"step": 1310
},
{
"epoch": 0.42,
"grad_norm": 1.5255415468091642,
"learning_rate": 6.522395064662299e-06,
"loss": 0.2699,
"step": 1311
},
{
"epoch": 0.42,
"grad_norm": 1.4021509689977685,
"learning_rate": 6.517457878876958e-06,
"loss": 0.2091,
"step": 1312
},
{
"epoch": 0.42,
"grad_norm": 1.5747361561140838,
"learning_rate": 6.512519062878142e-06,
"loss": 0.2619,
"step": 1313
},
{
"epoch": 0.42,
"grad_norm": 7.755109727547206,
"learning_rate": 6.507578621971646e-06,
"loss": 0.6403,
"step": 1314
},
{
"epoch": 0.42,
"grad_norm": 5.277901904623132,
"learning_rate": 6.502636561465018e-06,
"loss": 0.5602,
"step": 1315
},
{
"epoch": 0.42,
"grad_norm": 6.702260641203853,
"learning_rate": 6.497692886667537e-06,
"loss": 0.6175,
"step": 1316
},
{
"epoch": 0.42,
"grad_norm": 1.3942715927968807,
"learning_rate": 6.492747602890223e-06,
"loss": 0.2102,
"step": 1317
},
{
"epoch": 0.42,
"grad_norm": 1.776422204585946,
"learning_rate": 6.487800715445822e-06,
"loss": 0.2451,
"step": 1318
},
{
"epoch": 0.42,
"grad_norm": 1.4579178378320363,
"learning_rate": 6.4828522296488014e-06,
"loss": 0.2442,
"step": 1319
},
{
"epoch": 0.42,
"grad_norm": 4.907790366492386,
"learning_rate": 6.477902150815347e-06,
"loss": 0.5934,
"step": 1320
},
{
"epoch": 0.42,
"grad_norm": 1.604508114703477,
"learning_rate": 6.472950484263359e-06,
"loss": 0.2447,
"step": 1321
},
{
"epoch": 0.42,
"grad_norm": 8.215101926760493,
"learning_rate": 6.467997235312437e-06,
"loss": 0.5843,
"step": 1322
},
{
"epoch": 0.42,
"grad_norm": 5.134432922013831,
"learning_rate": 6.463042409283885e-06,
"loss": 0.6326,
"step": 1323
},
{
"epoch": 0.42,
"grad_norm": 11.270702143795317,
"learning_rate": 6.458086011500703e-06,
"loss": 0.5792,
"step": 1324
},
{
"epoch": 0.42,
"grad_norm": 1.5642748476276402,
"learning_rate": 6.453128047287573e-06,
"loss": 0.2395,
"step": 1325
},
{
"epoch": 0.42,
"grad_norm": 5.643908819098535,
"learning_rate": 6.448168521970865e-06,
"loss": 0.6205,
"step": 1326
},
{
"epoch": 0.42,
"grad_norm": 5.88595989332353,
"learning_rate": 6.443207440878624e-06,
"loss": 0.5761,
"step": 1327
},
{
"epoch": 0.42,
"grad_norm": 6.492703074635773,
"learning_rate": 6.438244809340568e-06,
"loss": 0.6278,
"step": 1328
},
{
"epoch": 0.43,
"grad_norm": 5.962111127877899,
"learning_rate": 6.43328063268808e-06,
"loss": 0.6743,
"step": 1329
},
{
"epoch": 0.43,
"grad_norm": 1.661193805884559,
"learning_rate": 6.428314916254203e-06,
"loss": 0.2352,
"step": 1330
},
{
"epoch": 0.43,
"grad_norm": 4.8410545725964695,
"learning_rate": 6.423347665373633e-06,
"loss": 0.6717,
"step": 1331
},
{
"epoch": 0.43,
"grad_norm": 5.857134308919506,
"learning_rate": 6.418378885382716e-06,
"loss": 0.3853,
"step": 1332
},
{
"epoch": 0.43,
"grad_norm": 1.654657491282791,
"learning_rate": 6.41340858161944e-06,
"loss": 0.2793,
"step": 1333
},
{
"epoch": 0.43,
"grad_norm": 1.5100294686951206,
"learning_rate": 6.408436759423431e-06,
"loss": 0.2342,
"step": 1334
},
{
"epoch": 0.43,
"grad_norm": 1.5015655233250866,
"learning_rate": 6.403463424135943e-06,
"loss": 0.2294,
"step": 1335
},
{
"epoch": 0.43,
"grad_norm": 4.342949530298343,
"learning_rate": 6.398488581099859e-06,
"loss": 0.688,
"step": 1336
},
{
"epoch": 0.43,
"grad_norm": 4.847372303171137,
"learning_rate": 6.393512235659681e-06,
"loss": 0.6396,
"step": 1337
},
{
"epoch": 0.43,
"grad_norm": 1.559018842354788,
"learning_rate": 6.388534393161525e-06,
"loss": 0.2347,
"step": 1338
},
{
"epoch": 0.43,
"grad_norm": 5.652103439251986,
"learning_rate": 6.383555058953115e-06,
"loss": 0.6426,
"step": 1339
},
{
"epoch": 0.43,
"grad_norm": 1.64044125352812,
"learning_rate": 6.378574238383776e-06,
"loss": 0.2685,
"step": 1340
},
{
"epoch": 0.43,
"grad_norm": 7.090198166881005,
"learning_rate": 6.373591936804433e-06,
"loss": 0.5847,
"step": 1341
},
{
"epoch": 0.43,
"grad_norm": 4.883742202613736,
"learning_rate": 6.3686081595676e-06,
"loss": 0.464,
"step": 1342
},
{
"epoch": 0.43,
"grad_norm": 6.09957916816325,
"learning_rate": 6.3636229120273766e-06,
"loss": 0.5592,
"step": 1343
},
{
"epoch": 0.43,
"grad_norm": 5.403859884825113,
"learning_rate": 6.3586361995394415e-06,
"loss": 0.5085,
"step": 1344
},
{
"epoch": 0.43,
"grad_norm": 5.554581031595441,
"learning_rate": 6.353648027461048e-06,
"loss": 0.4029,
"step": 1345
},
{
"epoch": 0.43,
"grad_norm": 16.1020169268518,
"learning_rate": 6.348658401151018e-06,
"loss": 0.6541,
"step": 1346
},
{
"epoch": 0.43,
"grad_norm": 1.4614761809805563,
"learning_rate": 6.343667325969736e-06,
"loss": 0.2115,
"step": 1347
},
{
"epoch": 0.43,
"grad_norm": 5.740622425139341,
"learning_rate": 6.3386748072791395e-06,
"loss": 0.5811,
"step": 1348
},
{
"epoch": 0.43,
"grad_norm": 4.76093657013544,
"learning_rate": 6.33368085044272e-06,
"loss": 0.6175,
"step": 1349
},
{
"epoch": 0.43,
"grad_norm": 4.631229554250956,
"learning_rate": 6.328685460825512e-06,
"loss": 0.506,
"step": 1350
},
{
"epoch": 0.43,
"grad_norm": 1.648768239858883,
"learning_rate": 6.323688643794094e-06,
"loss": 0.2979,
"step": 1351
},
{
"epoch": 0.43,
"grad_norm": 20.025290556974912,
"learning_rate": 6.318690404716572e-06,
"loss": 0.4681,
"step": 1352
},
{
"epoch": 0.43,
"grad_norm": 14.346862070384379,
"learning_rate": 6.313690748962582e-06,
"loss": 0.4402,
"step": 1353
},
{
"epoch": 0.43,
"grad_norm": 1.4828472003197997,
"learning_rate": 6.3086896819032814e-06,
"loss": 0.2206,
"step": 1354
},
{
"epoch": 0.43,
"grad_norm": 7.0220406456740445,
"learning_rate": 6.303687208911348e-06,
"loss": 0.6517,
"step": 1355
},
{
"epoch": 0.43,
"grad_norm": 1.5489288875091363,
"learning_rate": 6.298683335360962e-06,
"loss": 0.2078,
"step": 1356
},
{
"epoch": 0.43,
"grad_norm": 6.176570408327461,
"learning_rate": 6.293678066627816e-06,
"loss": 0.4571,
"step": 1357
},
{
"epoch": 0.43,
"grad_norm": 10.591530096478122,
"learning_rate": 6.288671408089098e-06,
"loss": 0.6328,
"step": 1358
},
{
"epoch": 0.43,
"grad_norm": 6.31658187255168,
"learning_rate": 6.283663365123486e-06,
"loss": 0.4785,
"step": 1359
},
{
"epoch": 0.44,
"grad_norm": 6.758008842878058,
"learning_rate": 6.278653943111152e-06,
"loss": 0.4837,
"step": 1360
},
{
"epoch": 0.44,
"grad_norm": 1.4104171677419424,
"learning_rate": 6.273643147433743e-06,
"loss": 0.2289,
"step": 1361
},
{
"epoch": 0.44,
"grad_norm": 1.6197009224422692,
"learning_rate": 6.268630983474388e-06,
"loss": 0.2829,
"step": 1362
},
{
"epoch": 0.44,
"grad_norm": 6.249525275292694,
"learning_rate": 6.263617456617681e-06,
"loss": 0.6438,
"step": 1363
},
{
"epoch": 0.44,
"grad_norm": 1.513923936350329,
"learning_rate": 6.258602572249683e-06,
"loss": 0.2671,
"step": 1364
},
{
"epoch": 0.44,
"grad_norm": 1.7552401802120616,
"learning_rate": 6.2535863357579105e-06,
"loss": 0.2371,
"step": 1365
},
{
"epoch": 0.44,
"grad_norm": 1.731838791910952,
"learning_rate": 6.248568752531337e-06,
"loss": 0.2664,
"step": 1366
},
{
"epoch": 0.44,
"grad_norm": 4.4219639243732995,
"learning_rate": 6.243549827960378e-06,
"loss": 0.7413,
"step": 1367
},
{
"epoch": 0.44,
"grad_norm": 1.5935077451476027,
"learning_rate": 6.238529567436892e-06,
"loss": 0.2352,
"step": 1368
},
{
"epoch": 0.44,
"grad_norm": 6.377410112066094,
"learning_rate": 6.233507976354174e-06,
"loss": 0.5239,
"step": 1369
},
{
"epoch": 0.44,
"grad_norm": 14.826981063929932,
"learning_rate": 6.228485060106948e-06,
"loss": 0.6102,
"step": 1370
},
{
"epoch": 0.44,
"grad_norm": 3.709700351161225,
"learning_rate": 6.223460824091358e-06,
"loss": 0.3748,
"step": 1371
},
{
"epoch": 0.44,
"grad_norm": 7.541911782934291,
"learning_rate": 6.218435273704973e-06,
"loss": 0.6081,
"step": 1372
},
{
"epoch": 0.44,
"grad_norm": 5.034766786731644,
"learning_rate": 6.213408414346765e-06,
"loss": 0.533,
"step": 1373
},
{
"epoch": 0.44,
"grad_norm": 6.106431140775753,
"learning_rate": 6.208380251417122e-06,
"loss": 0.587,
"step": 1374
},
{
"epoch": 0.44,
"grad_norm": 5.62746791692136,
"learning_rate": 6.203350790317825e-06,
"loss": 0.586,
"step": 1375
},
{
"epoch": 0.44,
"grad_norm": 1.5934871745788728,
"learning_rate": 6.198320036452051e-06,
"loss": 0.2625,
"step": 1376
},
{
"epoch": 0.44,
"grad_norm": 1.7027648894463596,
"learning_rate": 6.193287995224371e-06,
"loss": 0.2191,
"step": 1377
},
{
"epoch": 0.44,
"grad_norm": 9.100797592288487,
"learning_rate": 6.18825467204073e-06,
"loss": 0.5967,
"step": 1378
},
{
"epoch": 0.44,
"grad_norm": 1.4300400133146662,
"learning_rate": 6.183220072308459e-06,
"loss": 0.2114,
"step": 1379
},
{
"epoch": 0.44,
"grad_norm": 8.307304161283804,
"learning_rate": 6.178184201436256e-06,
"loss": 0.7205,
"step": 1380
},
{
"epoch": 0.44,
"grad_norm": 8.519423645264656,
"learning_rate": 6.173147064834183e-06,
"loss": 0.5529,
"step": 1381
},
{
"epoch": 0.44,
"grad_norm": 8.721720890194614,
"learning_rate": 6.168108667913666e-06,
"loss": 0.7219,
"step": 1382
},
{
"epoch": 0.44,
"grad_norm": 10.656532411122027,
"learning_rate": 6.163069016087483e-06,
"loss": 0.6465,
"step": 1383
},
{
"epoch": 0.44,
"grad_norm": 1.7054559572063817,
"learning_rate": 6.158028114769758e-06,
"loss": 0.2313,
"step": 1384
},
{
"epoch": 0.44,
"grad_norm": 1.5034160019470488,
"learning_rate": 6.152985969375962e-06,
"loss": 0.2213,
"step": 1385
},
{
"epoch": 0.44,
"grad_norm": 1.441204816973364,
"learning_rate": 6.147942585322898e-06,
"loss": 0.185,
"step": 1386
},
{
"epoch": 0.44,
"grad_norm": 6.463243020843492,
"learning_rate": 6.142897968028704e-06,
"loss": 0.5157,
"step": 1387
},
{
"epoch": 0.44,
"grad_norm": 1.6634596678438656,
"learning_rate": 6.137852122912839e-06,
"loss": 0.2605,
"step": 1388
},
{
"epoch": 0.44,
"grad_norm": 1.5140618946971276,
"learning_rate": 6.1328050553960804e-06,
"loss": 0.2057,
"step": 1389
},
{
"epoch": 0.44,
"grad_norm": 1.3827811621096124,
"learning_rate": 6.1277567709005245e-06,
"loss": 0.2351,
"step": 1390
},
{
"epoch": 0.45,
"grad_norm": 12.096242813456865,
"learning_rate": 6.122707274849572e-06,
"loss": 0.691,
"step": 1391
},
{
"epoch": 0.45,
"grad_norm": 1.5592714009568418,
"learning_rate": 6.117656572667921e-06,
"loss": 0.2206,
"step": 1392
},
{
"epoch": 0.45,
"grad_norm": 1.2934985017598122,
"learning_rate": 6.112604669781572e-06,
"loss": 0.1876,
"step": 1393
},
{
"epoch": 0.45,
"grad_norm": 1.5518426348580512,
"learning_rate": 6.107551571617813e-06,
"loss": 0.1925,
"step": 1394
},
{
"epoch": 0.45,
"grad_norm": 5.958954702269303,
"learning_rate": 6.1024972836052135e-06,
"loss": 0.6241,
"step": 1395
},
{
"epoch": 0.45,
"grad_norm": 5.442321985431483,
"learning_rate": 6.0974418111736235e-06,
"loss": 0.6867,
"step": 1396
},
{
"epoch": 0.45,
"grad_norm": 1.551766771257588,
"learning_rate": 6.092385159754165e-06,
"loss": 0.1898,
"step": 1397
},
{
"epoch": 0.45,
"grad_norm": 4.001212140548543,
"learning_rate": 6.0873273347792275e-06,
"loss": 0.5045,
"step": 1398
},
{
"epoch": 0.45,
"grad_norm": 1.7793938294035376,
"learning_rate": 6.0822683416824625e-06,
"loss": 0.2023,
"step": 1399
},
{
"epoch": 0.45,
"grad_norm": 1.63866212649055,
"learning_rate": 6.077208185898772e-06,
"loss": 0.2622,
"step": 1400
},
{
"epoch": 0.45,
"grad_norm": 3.381632394924379,
"learning_rate": 6.07214687286431e-06,
"loss": 0.2216,
"step": 1401
},
{
"epoch": 0.45,
"grad_norm": 1.5895154817359234,
"learning_rate": 6.067084408016475e-06,
"loss": 0.2351,
"step": 1402
},
{
"epoch": 0.45,
"grad_norm": 1.443672079594385,
"learning_rate": 6.0620207967939e-06,
"loss": 0.1934,
"step": 1403
},
{
"epoch": 0.45,
"grad_norm": 7.115039207145125,
"learning_rate": 6.0569560446364495e-06,
"loss": 0.6115,
"step": 1404
},
{
"epoch": 0.45,
"grad_norm": 7.315327542828079,
"learning_rate": 6.051890156985217e-06,
"loss": 0.6131,
"step": 1405
},
{
"epoch": 0.45,
"grad_norm": 5.564007578956525,
"learning_rate": 6.046823139282515e-06,
"loss": 0.5503,
"step": 1406
},
{
"epoch": 0.45,
"grad_norm": 8.161419091740283,
"learning_rate": 6.041754996971866e-06,
"loss": 0.5399,
"step": 1407
},
{
"epoch": 0.45,
"grad_norm": 5.94974351366777,
"learning_rate": 6.036685735498004e-06,
"loss": 0.6738,
"step": 1408
},
{
"epoch": 0.45,
"grad_norm": 6.733711412578767,
"learning_rate": 6.031615360306867e-06,
"loss": 0.3184,
"step": 1409
},
{
"epoch": 0.45,
"grad_norm": 1.7126580681320984,
"learning_rate": 6.026543876845586e-06,
"loss": 0.2007,
"step": 1410
},
{
"epoch": 0.45,
"grad_norm": 1.7011935235958762,
"learning_rate": 6.021471290562484e-06,
"loss": 0.287,
"step": 1411
},
{
"epoch": 0.45,
"grad_norm": 5.4274651443922375,
"learning_rate": 6.016397606907069e-06,
"loss": 0.4691,
"step": 1412
},
{
"epoch": 0.45,
"grad_norm": 5.239967968092811,
"learning_rate": 6.011322831330028e-06,
"loss": 0.6135,
"step": 1413
},
{
"epoch": 0.45,
"grad_norm": 1.6278847693376925,
"learning_rate": 6.0062469692832205e-06,
"loss": 0.2015,
"step": 1414
},
{
"epoch": 0.45,
"grad_norm": 1.511697518375073,
"learning_rate": 6.001170026219673e-06,
"loss": 0.2016,
"step": 1415
},
{
"epoch": 0.45,
"grad_norm": 1.5004007832623714,
"learning_rate": 5.996092007593572e-06,
"loss": 0.2587,
"step": 1416
},
{
"epoch": 0.45,
"grad_norm": 5.52428019575851,
"learning_rate": 5.9910129188602665e-06,
"loss": 0.4986,
"step": 1417
},
{
"epoch": 0.45,
"grad_norm": 6.192378356562513,
"learning_rate": 5.985932765476246e-06,
"loss": 0.5822,
"step": 1418
},
{
"epoch": 0.45,
"grad_norm": 4.408229036197582,
"learning_rate": 5.9808515528991486e-06,
"loss": 0.6131,
"step": 1419
},
{
"epoch": 0.45,
"grad_norm": 3.9940885634068106,
"learning_rate": 5.975769286587747e-06,
"loss": 0.5009,
"step": 1420
},
{
"epoch": 0.45,
"grad_norm": 1.5060988705786211,
"learning_rate": 5.970685972001953e-06,
"loss": 0.1932,
"step": 1421
},
{
"epoch": 0.46,
"grad_norm": 1.7045467912876315,
"learning_rate": 5.965601614602798e-06,
"loss": 0.2377,
"step": 1422
},
{
"epoch": 0.46,
"grad_norm": 1.614092221012137,
"learning_rate": 5.960516219852433e-06,
"loss": 0.251,
"step": 1423
},
{
"epoch": 0.46,
"grad_norm": 1.7816140159741642,
"learning_rate": 5.955429793214129e-06,
"loss": 0.2594,
"step": 1424
},
{
"epoch": 0.46,
"grad_norm": 5.581341187828204,
"learning_rate": 5.950342340152261e-06,
"loss": 0.5742,
"step": 1425
},
{
"epoch": 0.46,
"grad_norm": 7.234871787931018,
"learning_rate": 5.945253866132308e-06,
"loss": 0.5095,
"step": 1426
},
{
"epoch": 0.46,
"grad_norm": 1.4986143435034935,
"learning_rate": 5.940164376620847e-06,
"loss": 0.294,
"step": 1427
},
{
"epoch": 0.46,
"grad_norm": 1.562984444963245,
"learning_rate": 5.935073877085546e-06,
"loss": 0.1948,
"step": 1428
},
{
"epoch": 0.46,
"grad_norm": 6.374888940526056,
"learning_rate": 5.9299823729951544e-06,
"loss": 0.5791,
"step": 1429
},
{
"epoch": 0.46,
"grad_norm": 7.329003624019677,
"learning_rate": 5.9248898698195054e-06,
"loss": 0.5772,
"step": 1430
},
{
"epoch": 0.46,
"grad_norm": 5.348094808840994,
"learning_rate": 5.919796373029504e-06,
"loss": 0.6713,
"step": 1431
},
{
"epoch": 0.46,
"grad_norm": 1.5162721511146773,
"learning_rate": 5.914701888097121e-06,
"loss": 0.2235,
"step": 1432
},
{
"epoch": 0.46,
"grad_norm": 1.5933679221185735,
"learning_rate": 5.90960642049539e-06,
"loss": 0.2001,
"step": 1433
},
{
"epoch": 0.46,
"grad_norm": 1.5968570997261389,
"learning_rate": 5.904509975698399e-06,
"loss": 0.2059,
"step": 1434
},
{
"epoch": 0.46,
"grad_norm": 6.032773887097328,
"learning_rate": 5.8994125591812914e-06,
"loss": 0.6078,
"step": 1435
},
{
"epoch": 0.46,
"grad_norm": 14.010395787236638,
"learning_rate": 5.894314176420247e-06,
"loss": 0.5693,
"step": 1436
},
{
"epoch": 0.46,
"grad_norm": 6.1418982593984355,
"learning_rate": 5.889214832892489e-06,
"loss": 0.4619,
"step": 1437
},
{
"epoch": 0.46,
"grad_norm": 1.748778932171975,
"learning_rate": 5.8841145340762665e-06,
"loss": 0.2725,
"step": 1438
},
{
"epoch": 0.46,
"grad_norm": 1.5967154667979515,
"learning_rate": 5.879013285450863e-06,
"loss": 0.2116,
"step": 1439
},
{
"epoch": 0.46,
"grad_norm": 5.50949370617981,
"learning_rate": 5.873911092496577e-06,
"loss": 0.7226,
"step": 1440
},
{
"epoch": 0.46,
"grad_norm": 5.423469449952084,
"learning_rate": 5.8688079606947226e-06,
"loss": 0.6379,
"step": 1441
},
{
"epoch": 0.46,
"grad_norm": 8.37037831983272,
"learning_rate": 5.8637038955276225e-06,
"loss": 0.5217,
"step": 1442
},
{
"epoch": 0.46,
"grad_norm": 1.549702917190697,
"learning_rate": 5.858598902478604e-06,
"loss": 0.2595,
"step": 1443
},
{
"epoch": 0.46,
"grad_norm": 1.5640442062254063,
"learning_rate": 5.853492987031989e-06,
"loss": 0.2442,
"step": 1444
},
{
"epoch": 0.46,
"grad_norm": 5.853289519816507,
"learning_rate": 5.8483861546730915e-06,
"loss": 0.6658,
"step": 1445
},
{
"epoch": 0.46,
"grad_norm": 7.383185087894194,
"learning_rate": 5.843278410888208e-06,
"loss": 0.5993,
"step": 1446
},
{
"epoch": 0.46,
"grad_norm": 6.915037700741412,
"learning_rate": 5.838169761164616e-06,
"loss": 0.6638,
"step": 1447
},
{
"epoch": 0.46,
"grad_norm": 1.6018368980030406,
"learning_rate": 5.83306021099057e-06,
"loss": 0.2413,
"step": 1448
},
{
"epoch": 0.46,
"grad_norm": 5.669865895754325,
"learning_rate": 5.827949765855285e-06,
"loss": 0.7527,
"step": 1449
},
{
"epoch": 0.46,
"grad_norm": 6.946321758995887,
"learning_rate": 5.822838431248943e-06,
"loss": 0.7266,
"step": 1450
},
{
"epoch": 0.46,
"grad_norm": 8.295686551610215,
"learning_rate": 5.817726212662678e-06,
"loss": 0.564,
"step": 1451
},
{
"epoch": 0.46,
"grad_norm": 1.3597518824025032,
"learning_rate": 5.812613115588575e-06,
"loss": 0.1907,
"step": 1452
},
{
"epoch": 0.46,
"grad_norm": 1.532874896372509,
"learning_rate": 5.807499145519663e-06,
"loss": 0.2264,
"step": 1453
},
{
"epoch": 0.47,
"grad_norm": 11.40493517560385,
"learning_rate": 5.802384307949909e-06,
"loss": 0.5242,
"step": 1454
},
{
"epoch": 0.47,
"grad_norm": 6.279974313265418,
"learning_rate": 5.79726860837421e-06,
"loss": 0.4193,
"step": 1455
},
{
"epoch": 0.47,
"grad_norm": 7.086086409607318,
"learning_rate": 5.792152052288391e-06,
"loss": 0.4238,
"step": 1456
},
{
"epoch": 0.47,
"grad_norm": 5.433958488568387,
"learning_rate": 5.787034645189199e-06,
"loss": 0.6271,
"step": 1457
},
{
"epoch": 0.47,
"grad_norm": 1.590299256922156,
"learning_rate": 5.7819163925742915e-06,
"loss": 0.244,
"step": 1458
},
{
"epoch": 0.47,
"grad_norm": 5.871403509877293,
"learning_rate": 5.776797299942236e-06,
"loss": 0.4004,
"step": 1459
},
{
"epoch": 0.47,
"grad_norm": 1.5616498231554254,
"learning_rate": 5.771677372792502e-06,
"loss": 0.251,
"step": 1460
},
{
"epoch": 0.47,
"grad_norm": 6.874638425737895,
"learning_rate": 5.766556616625456e-06,
"loss": 0.4635,
"step": 1461
},
{
"epoch": 0.47,
"grad_norm": 5.844574655902825,
"learning_rate": 5.7614350369423555e-06,
"loss": 0.5394,
"step": 1462
},
{
"epoch": 0.47,
"grad_norm": 1.5094171919456973,
"learning_rate": 5.7563126392453415e-06,
"loss": 0.2052,
"step": 1463
},
{
"epoch": 0.47,
"grad_norm": 6.914812909343156,
"learning_rate": 5.751189429037435e-06,
"loss": 0.5199,
"step": 1464
},
{
"epoch": 0.47,
"grad_norm": 1.4516814450314157,
"learning_rate": 5.746065411822528e-06,
"loss": 0.2077,
"step": 1465
},
{
"epoch": 0.47,
"grad_norm": 10.75699520458192,
"learning_rate": 5.740940593105383e-06,
"loss": 0.6669,
"step": 1466
},
{
"epoch": 0.47,
"grad_norm": 1.4071175629833979,
"learning_rate": 5.73581497839162e-06,
"loss": 0.1879,
"step": 1467
},
{
"epoch": 0.47,
"grad_norm": 1.7856542360993077,
"learning_rate": 5.730688573187715e-06,
"loss": 0.2152,
"step": 1468
},
{
"epoch": 0.47,
"grad_norm": 4.801735894205422,
"learning_rate": 5.725561383000994e-06,
"loss": 0.5234,
"step": 1469
},
{
"epoch": 0.47,
"grad_norm": 8.100454032430392,
"learning_rate": 5.720433413339627e-06,
"loss": 0.6928,
"step": 1470
},
{
"epoch": 0.47,
"grad_norm": 6.165357334442091,
"learning_rate": 5.71530466971262e-06,
"loss": 0.7244,
"step": 1471
},
{
"epoch": 0.47,
"grad_norm": 1.4082644657868995,
"learning_rate": 5.710175157629812e-06,
"loss": 0.179,
"step": 1472
},
{
"epoch": 0.47,
"grad_norm": 5.230557380399823,
"learning_rate": 5.705044882601862e-06,
"loss": 0.6223,
"step": 1473
},
{
"epoch": 0.47,
"grad_norm": 6.311395519487412,
"learning_rate": 5.69991385014026e-06,
"loss": 0.5669,
"step": 1474
},
{
"epoch": 0.47,
"grad_norm": 1.5884491952741175,
"learning_rate": 5.694782065757298e-06,
"loss": 0.2397,
"step": 1475
},
{
"epoch": 0.47,
"grad_norm": 1.5967611503008956,
"learning_rate": 5.689649534966083e-06,
"loss": 0.2117,
"step": 1476
},
{
"epoch": 0.47,
"grad_norm": 9.269060222334259,
"learning_rate": 5.684516263280519e-06,
"loss": 0.4528,
"step": 1477
},
{
"epoch": 0.47,
"grad_norm": 1.5570685995516629,
"learning_rate": 5.679382256215311e-06,
"loss": 0.2377,
"step": 1478
},
{
"epoch": 0.47,
"grad_norm": 1.5572563031572018,
"learning_rate": 5.674247519285951e-06,
"loss": 0.2151,
"step": 1479
},
{
"epoch": 0.47,
"grad_norm": 5.291556244427816,
"learning_rate": 5.6691120580087126e-06,
"loss": 0.5447,
"step": 1480
},
{
"epoch": 0.47,
"grad_norm": 1.6515665279687242,
"learning_rate": 5.6639758779006535e-06,
"loss": 0.2395,
"step": 1481
},
{
"epoch": 0.47,
"grad_norm": 6.089072374992358,
"learning_rate": 5.6588389844796e-06,
"loss": 0.575,
"step": 1482
},
{
"epoch": 0.47,
"grad_norm": 1.5221716539675059,
"learning_rate": 5.653701383264147e-06,
"loss": 0.2671,
"step": 1483
},
{
"epoch": 0.47,
"grad_norm": 1.6032218271717784,
"learning_rate": 5.648563079773646e-06,
"loss": 0.2181,
"step": 1484
},
{
"epoch": 0.48,
"grad_norm": 1.4587673717871135,
"learning_rate": 5.6434240795282045e-06,
"loss": 0.1982,
"step": 1485
},
{
"epoch": 0.48,
"grad_norm": 1.812150431885437,
"learning_rate": 5.63828438804868e-06,
"loss": 0.273,
"step": 1486
},
{
"epoch": 0.48,
"grad_norm": 1.3381332936511572,
"learning_rate": 5.6331440108566735e-06,
"loss": 0.2072,
"step": 1487
},
{
"epoch": 0.48,
"grad_norm": 1.611767812418644,
"learning_rate": 5.628002953474521e-06,
"loss": 0.2202,
"step": 1488
},
{
"epoch": 0.48,
"grad_norm": 7.9373186172470716,
"learning_rate": 5.622861221425286e-06,
"loss": 0.6506,
"step": 1489
},
{
"epoch": 0.48,
"grad_norm": 1.4949252067454528,
"learning_rate": 5.617718820232762e-06,
"loss": 0.2792,
"step": 1490
},
{
"epoch": 0.48,
"grad_norm": 1.694110592549428,
"learning_rate": 5.612575755421459e-06,
"loss": 0.2656,
"step": 1491
},
{
"epoch": 0.48,
"grad_norm": 1.7161503822743196,
"learning_rate": 5.607432032516601e-06,
"loss": 0.2191,
"step": 1492
},
{
"epoch": 0.48,
"grad_norm": 1.4554034554643032,
"learning_rate": 5.602287657044116e-06,
"loss": 0.1987,
"step": 1493
},
{
"epoch": 0.48,
"grad_norm": 5.5181690890907795,
"learning_rate": 5.597142634530639e-06,
"loss": 0.4919,
"step": 1494
},
{
"epoch": 0.48,
"grad_norm": 1.507976163980248,
"learning_rate": 5.5919969705034914e-06,
"loss": 0.2824,
"step": 1495
},
{
"epoch": 0.48,
"grad_norm": 13.592867993844482,
"learning_rate": 5.586850670490694e-06,
"loss": 0.559,
"step": 1496
},
{
"epoch": 0.48,
"grad_norm": 6.669820839398386,
"learning_rate": 5.581703740020943e-06,
"loss": 0.4962,
"step": 1497
},
{
"epoch": 0.48,
"grad_norm": 1.547748409577588,
"learning_rate": 5.576556184623615e-06,
"loss": 0.2775,
"step": 1498
},
{
"epoch": 0.48,
"grad_norm": 1.5626471518217038,
"learning_rate": 5.571408009828757e-06,
"loss": 0.1881,
"step": 1499
},
{
"epoch": 0.48,
"grad_norm": 1.6137393051541997,
"learning_rate": 5.56625922116708e-06,
"loss": 0.2485,
"step": 1500
},
{
"epoch": 0.48,
"grad_norm": 9.060923338470827,
"learning_rate": 5.561109824169962e-06,
"loss": 0.5879,
"step": 1501
},
{
"epoch": 0.48,
"grad_norm": 1.5275029545197205,
"learning_rate": 5.555959824369426e-06,
"loss": 0.2227,
"step": 1502
},
{
"epoch": 0.48,
"grad_norm": 5.890581980889761,
"learning_rate": 5.550809227298144e-06,
"loss": 0.5153,
"step": 1503
},
{
"epoch": 0.48,
"grad_norm": 9.476490387947244,
"learning_rate": 5.545658038489433e-06,
"loss": 0.6009,
"step": 1504
},
{
"epoch": 0.48,
"grad_norm": 1.5572148092298645,
"learning_rate": 5.540506263477243e-06,
"loss": 0.2361,
"step": 1505
},
{
"epoch": 0.48,
"grad_norm": 8.160987697726984,
"learning_rate": 5.535353907796155e-06,
"loss": 0.6372,
"step": 1506
},
{
"epoch": 0.48,
"grad_norm": 1.6589959079076817,
"learning_rate": 5.530200976981375e-06,
"loss": 0.2323,
"step": 1507
},
{
"epoch": 0.48,
"grad_norm": 1.4941811975869728,
"learning_rate": 5.525047476568722e-06,
"loss": 0.209,
"step": 1508
},
{
"epoch": 0.48,
"grad_norm": 1.4608107235234649,
"learning_rate": 5.519893412094631e-06,
"loss": 0.191,
"step": 1509
},
{
"epoch": 0.48,
"grad_norm": 1.5807509840278664,
"learning_rate": 5.514738789096146e-06,
"loss": 0.2184,
"step": 1510
},
{
"epoch": 0.48,
"grad_norm": 5.140081435868278,
"learning_rate": 5.509583613110904e-06,
"loss": 0.5034,
"step": 1511
},
{
"epoch": 0.48,
"grad_norm": 5.338634886118849,
"learning_rate": 5.504427889677141e-06,
"loss": 0.5531,
"step": 1512
},
{
"epoch": 0.48,
"grad_norm": 5.834479825244042,
"learning_rate": 5.499271624333676e-06,
"loss": 0.5415,
"step": 1513
},
{
"epoch": 0.48,
"grad_norm": 1.409874549266494,
"learning_rate": 5.494114822619918e-06,
"loss": 0.2146,
"step": 1514
},
{
"epoch": 0.48,
"grad_norm": 8.157767358611192,
"learning_rate": 5.488957490075846e-06,
"loss": 0.4026,
"step": 1515
},
{
"epoch": 0.49,
"grad_norm": 5.29294075997288,
"learning_rate": 5.483799632242012e-06,
"loss": 0.4952,
"step": 1516
},
{
"epoch": 0.49,
"grad_norm": 7.020074969303334,
"learning_rate": 5.478641254659528e-06,
"loss": 0.619,
"step": 1517
},
{
"epoch": 0.49,
"grad_norm": 6.501749877133997,
"learning_rate": 5.473482362870073e-06,
"loss": 0.7001,
"step": 1518
},
{
"epoch": 0.49,
"grad_norm": 9.145386640976326,
"learning_rate": 5.468322962415871e-06,
"loss": 0.8264,
"step": 1519
},
{
"epoch": 0.49,
"grad_norm": 1.4552497346840383,
"learning_rate": 5.463163058839694e-06,
"loss": 0.1967,
"step": 1520
},
{
"epoch": 0.49,
"grad_norm": 6.150379067884863,
"learning_rate": 5.4580026576848565e-06,
"loss": 0.4477,
"step": 1521
},
{
"epoch": 0.49,
"grad_norm": 6.856821377217917,
"learning_rate": 5.452841764495203e-06,
"loss": 0.5338,
"step": 1522
},
{
"epoch": 0.49,
"grad_norm": 5.548740046702573,
"learning_rate": 5.4476803848151146e-06,
"loss": 0.4569,
"step": 1523
},
{
"epoch": 0.49,
"grad_norm": 5.2199746505908315,
"learning_rate": 5.442518524189489e-06,
"loss": 0.5058,
"step": 1524
},
{
"epoch": 0.49,
"grad_norm": 1.569638947005932,
"learning_rate": 5.4373561881637405e-06,
"loss": 0.2068,
"step": 1525
},
{
"epoch": 0.49,
"grad_norm": 6.571124488387639,
"learning_rate": 5.432193382283794e-06,
"loss": 0.59,
"step": 1526
},
{
"epoch": 0.49,
"grad_norm": 11.072398210438648,
"learning_rate": 5.4270301120960856e-06,
"loss": 0.5034,
"step": 1527
},
{
"epoch": 0.49,
"grad_norm": 1.6604978324242805,
"learning_rate": 5.421866383147541e-06,
"loss": 0.2269,
"step": 1528
},
{
"epoch": 0.49,
"grad_norm": 1.5631886466397849,
"learning_rate": 5.416702200985585e-06,
"loss": 0.2451,
"step": 1529
},
{
"epoch": 0.49,
"grad_norm": 1.6920966611788442,
"learning_rate": 5.411537571158127e-06,
"loss": 0.2147,
"step": 1530
},
{
"epoch": 0.49,
"grad_norm": 1.428778094040855,
"learning_rate": 5.406372499213557e-06,
"loss": 0.1904,
"step": 1531
},
{
"epoch": 0.49,
"grad_norm": 8.719729696636715,
"learning_rate": 5.401206990700741e-06,
"loss": 0.7195,
"step": 1532
},
{
"epoch": 0.49,
"grad_norm": 6.322321002079452,
"learning_rate": 5.396041051169016e-06,
"loss": 0.2925,
"step": 1533
},
{
"epoch": 0.49,
"grad_norm": 1.4600090811959099,
"learning_rate": 5.390874686168176e-06,
"loss": 0.2174,
"step": 1534
},
{
"epoch": 0.49,
"grad_norm": 8.808701673971717,
"learning_rate": 5.385707901248478e-06,
"loss": 0.51,
"step": 1535
},
{
"epoch": 0.49,
"grad_norm": 5.090679953288352,
"learning_rate": 5.380540701960627e-06,
"loss": 0.6264,
"step": 1536
},
{
"epoch": 0.49,
"grad_norm": 7.9727189340200235,
"learning_rate": 5.375373093855774e-06,
"loss": 0.5375,
"step": 1537
},
{
"epoch": 0.49,
"grad_norm": 1.5756568587127695,
"learning_rate": 5.37020508248551e-06,
"loss": 0.2231,
"step": 1538
},
{
"epoch": 0.49,
"grad_norm": 7.208546903518983,
"learning_rate": 5.365036673401857e-06,
"loss": 0.6135,
"step": 1539
},
{
"epoch": 0.49,
"grad_norm": 6.451098663402852,
"learning_rate": 5.359867872157267e-06,
"loss": 0.6545,
"step": 1540
},
{
"epoch": 0.49,
"grad_norm": 5.373817665970237,
"learning_rate": 5.354698684304613e-06,
"loss": 0.5989,
"step": 1541
},
{
"epoch": 0.49,
"grad_norm": 6.9457673284712955,
"learning_rate": 5.3495291153971806e-06,
"loss": 0.4982,
"step": 1542
},
{
"epoch": 0.49,
"grad_norm": 1.9560317530021443,
"learning_rate": 5.344359170988668e-06,
"loss": 0.236,
"step": 1543
},
{
"epoch": 0.49,
"grad_norm": 1.6454647228387338,
"learning_rate": 5.339188856633173e-06,
"loss": 0.2521,
"step": 1544
},
{
"epoch": 0.49,
"grad_norm": 7.26682281445475,
"learning_rate": 5.3340181778851954e-06,
"loss": 0.6656,
"step": 1545
},
{
"epoch": 0.49,
"grad_norm": 1.565229116455418,
"learning_rate": 5.328847140299624e-06,
"loss": 0.2452,
"step": 1546
},
{
"epoch": 0.5,
"grad_norm": 1.4624006842320267,
"learning_rate": 5.323675749431732e-06,
"loss": 0.2404,
"step": 1547
},
{
"epoch": 0.5,
"grad_norm": 1.3092376971382669,
"learning_rate": 5.318504010837175e-06,
"loss": 0.2074,
"step": 1548
},
{
"epoch": 0.5,
"grad_norm": 1.3964277051842584,
"learning_rate": 5.313331930071981e-06,
"loss": 0.2302,
"step": 1549
},
{
"epoch": 0.5,
"grad_norm": 5.998195505824346,
"learning_rate": 5.308159512692544e-06,
"loss": 0.5598,
"step": 1550
},
{
"epoch": 0.5,
"grad_norm": 1.506562101070047,
"learning_rate": 5.302986764255621e-06,
"loss": 0.2524,
"step": 1551
},
{
"epoch": 0.5,
"grad_norm": 1.6797851981172873,
"learning_rate": 5.297813690318325e-06,
"loss": 0.2354,
"step": 1552
},
{
"epoch": 0.5,
"grad_norm": 7.9833282259171545,
"learning_rate": 5.292640296438116e-06,
"loss": 0.5724,
"step": 1553
},
{
"epoch": 0.5,
"grad_norm": 1.351785654007111,
"learning_rate": 5.287466588172804e-06,
"loss": 0.1724,
"step": 1554
},
{
"epoch": 0.5,
"grad_norm": 6.805419920563617,
"learning_rate": 5.2822925710805305e-06,
"loss": 0.6574,
"step": 1555
},
{
"epoch": 0.5,
"grad_norm": 1.4500618472699565,
"learning_rate": 5.27711825071977e-06,
"loss": 0.2205,
"step": 1556
},
{
"epoch": 0.5,
"grad_norm": 1.4666057939408428,
"learning_rate": 5.2719436326493255e-06,
"loss": 0.2131,
"step": 1557
},
{
"epoch": 0.5,
"grad_norm": 9.876809659366371,
"learning_rate": 5.266768722428318e-06,
"loss": 0.4681,
"step": 1558
},
{
"epoch": 0.5,
"grad_norm": 1.6895598053477852,
"learning_rate": 5.261593525616181e-06,
"loss": 0.2415,
"step": 1559
},
{
"epoch": 0.5,
"grad_norm": 33.141230223671855,
"learning_rate": 5.256418047772659e-06,
"loss": 0.6495,
"step": 1560
},
{
"epoch": 0.5,
"grad_norm": 7.222142527434707,
"learning_rate": 5.251242294457796e-06,
"loss": 0.5127,
"step": 1561
},
{
"epoch": 0.5,
"grad_norm": 1.606258762350376,
"learning_rate": 5.2460662712319335e-06,
"loss": 0.1983,
"step": 1562
},
{
"epoch": 0.5,
"grad_norm": 5.820040399203535,
"learning_rate": 5.240889983655701e-06,
"loss": 0.7071,
"step": 1563
},
{
"epoch": 0.5,
"grad_norm": 1.5914402191905133,
"learning_rate": 5.235713437290012e-06,
"loss": 0.2751,
"step": 1564
},
{
"epoch": 0.5,
"grad_norm": 6.225603346384584,
"learning_rate": 5.230536637696062e-06,
"loss": 0.5746,
"step": 1565
},
{
"epoch": 0.5,
"grad_norm": 1.541989696517418,
"learning_rate": 5.225359590435312e-06,
"loss": 0.2241,
"step": 1566
},
{
"epoch": 0.5,
"grad_norm": 8.6583511717209,
"learning_rate": 5.220182301069499e-06,
"loss": 0.5356,
"step": 1567
},
{
"epoch": 0.5,
"grad_norm": 1.5076977854306777,
"learning_rate": 5.215004775160608e-06,
"loss": 0.2231,
"step": 1568
},
{
"epoch": 0.5,
"grad_norm": 1.6451346099309636,
"learning_rate": 5.209827018270886e-06,
"loss": 0.2346,
"step": 1569
},
{
"epoch": 0.5,
"grad_norm": 1.475130054029139,
"learning_rate": 5.204649035962825e-06,
"loss": 0.1982,
"step": 1570
},
{
"epoch": 0.5,
"grad_norm": 8.804277991122476,
"learning_rate": 5.199470833799164e-06,
"loss": 0.5918,
"step": 1571
},
{
"epoch": 0.5,
"grad_norm": 6.092780082625882,
"learning_rate": 5.1942924173428725e-06,
"loss": 0.5275,
"step": 1572
},
{
"epoch": 0.5,
"grad_norm": 1.6915166302676385,
"learning_rate": 5.18911379215715e-06,
"loss": 0.2502,
"step": 1573
},
{
"epoch": 0.5,
"grad_norm": 1.6615219075229888,
"learning_rate": 5.1839349638054245e-06,
"loss": 0.2571,
"step": 1574
},
{
"epoch": 0.5,
"grad_norm": 5.776297222782462,
"learning_rate": 5.178755937851341e-06,
"loss": 0.6465,
"step": 1575
},
{
"epoch": 0.5,
"grad_norm": 1.5680198666699618,
"learning_rate": 5.173576719858755e-06,
"loss": 0.2135,
"step": 1576
},
{
"epoch": 0.5,
"grad_norm": 1.5331842990028555,
"learning_rate": 5.168397315391729e-06,
"loss": 0.2177,
"step": 1577
},
{
"epoch": 0.5,
"grad_norm": 1.7123322876331615,
"learning_rate": 5.1632177300145255e-06,
"loss": 0.2162,
"step": 1578
},
{
"epoch": 0.51,
"grad_norm": 5.657687983274009,
"learning_rate": 5.1580379692916025e-06,
"loss": 0.6432,
"step": 1579
},
{
"epoch": 0.51,
"grad_norm": 6.8565715982878075,
"learning_rate": 5.152858038787608e-06,
"loss": 0.6184,
"step": 1580
},
{
"epoch": 0.51,
"grad_norm": 1.2964004826638862,
"learning_rate": 5.147677944067368e-06,
"loss": 0.1906,
"step": 1581
},
{
"epoch": 0.51,
"grad_norm": 8.914120125918338,
"learning_rate": 5.142497690695888e-06,
"loss": 0.5578,
"step": 1582
},
{
"epoch": 0.51,
"grad_norm": 1.4918586765278812,
"learning_rate": 5.137317284238344e-06,
"loss": 0.1902,
"step": 1583
},
{
"epoch": 0.51,
"grad_norm": 4.910516822516931,
"learning_rate": 5.1321367302600726e-06,
"loss": 0.5745,
"step": 1584
},
{
"epoch": 0.51,
"grad_norm": 1.4232854829167805,
"learning_rate": 5.126956034326573e-06,
"loss": 0.2139,
"step": 1585
},
{
"epoch": 0.51,
"grad_norm": 1.6422619704895127,
"learning_rate": 5.121775202003499e-06,
"loss": 0.2442,
"step": 1586
},
{
"epoch": 0.51,
"grad_norm": 10.239078670288134,
"learning_rate": 5.116594238856645e-06,
"loss": 0.6108,
"step": 1587
},
{
"epoch": 0.51,
"grad_norm": 1.5131236382195519,
"learning_rate": 5.111413150451948e-06,
"loss": 0.2163,
"step": 1588
},
{
"epoch": 0.51,
"grad_norm": 1.4550396146124138,
"learning_rate": 5.1062319423554815e-06,
"loss": 0.2177,
"step": 1589
},
{
"epoch": 0.51,
"grad_norm": 6.032387469093304,
"learning_rate": 5.101050620133447e-06,
"loss": 0.5713,
"step": 1590
},
{
"epoch": 0.51,
"grad_norm": 1.5872937712989676,
"learning_rate": 5.095869189352166e-06,
"loss": 0.2172,
"step": 1591
},
{
"epoch": 0.51,
"grad_norm": 8.086742056216735,
"learning_rate": 5.090687655578078e-06,
"loss": 0.539,
"step": 1592
},
{
"epoch": 0.51,
"grad_norm": 7.058019487453007,
"learning_rate": 5.0855060243777366e-06,
"loss": 0.497,
"step": 1593
},
{
"epoch": 0.51,
"grad_norm": 1.5755013536979725,
"learning_rate": 5.080324301317795e-06,
"loss": 0.2408,
"step": 1594
},
{
"epoch": 0.51,
"grad_norm": 8.28028752357728,
"learning_rate": 5.0751424919650085e-06,
"loss": 0.5223,
"step": 1595
},
{
"epoch": 0.51,
"grad_norm": 7.1948558854661595,
"learning_rate": 5.069960601886224e-06,
"loss": 0.4149,
"step": 1596
},
{
"epoch": 0.51,
"grad_norm": 1.6331384794955384,
"learning_rate": 5.064778636648371e-06,
"loss": 0.2335,
"step": 1597
},
{
"epoch": 0.51,
"grad_norm": 1.600789803430444,
"learning_rate": 5.05959660181847e-06,
"loss": 0.217,
"step": 1598
},
{
"epoch": 0.51,
"grad_norm": 6.480435576720631,
"learning_rate": 5.054414502963605e-06,
"loss": 0.6898,
"step": 1599
},
{
"epoch": 0.51,
"grad_norm": 6.300727891687989,
"learning_rate": 5.049232345650936e-06,
"loss": 0.524,
"step": 1600
},
{
"epoch": 0.51,
"grad_norm": 7.718258211695682,
"learning_rate": 5.044050135447682e-06,
"loss": 0.6407,
"step": 1601
},
{
"epoch": 0.51,
"grad_norm": 6.390117065260238,
"learning_rate": 5.038867877921124e-06,
"loss": 0.5661,
"step": 1602
},
{
"epoch": 0.51,
"grad_norm": 1.5551064516429625,
"learning_rate": 5.033685578638586e-06,
"loss": 0.2444,
"step": 1603
},
{
"epoch": 0.51,
"grad_norm": 1.366443591531705,
"learning_rate": 5.028503243167443e-06,
"loss": 0.2058,
"step": 1604
},
{
"epoch": 0.51,
"grad_norm": 1.5408277425842203,
"learning_rate": 5.023320877075107e-06,
"loss": 0.2366,
"step": 1605
},
{
"epoch": 0.51,
"grad_norm": 28.713074028896308,
"learning_rate": 5.0181384859290215e-06,
"loss": 0.6233,
"step": 1606
},
{
"epoch": 0.51,
"grad_norm": 5.933668894742834,
"learning_rate": 5.01295607529666e-06,
"loss": 0.5112,
"step": 1607
},
{
"epoch": 0.51,
"grad_norm": 6.807756503769824,
"learning_rate": 5.007773650745514e-06,
"loss": 0.5858,
"step": 1608
},
{
"epoch": 0.51,
"grad_norm": 1.456530499385341,
"learning_rate": 5.0025912178430925e-06,
"loss": 0.178,
"step": 1609
},
{
"epoch": 0.52,
"grad_norm": 1.2971745276954956,
"learning_rate": 4.997408782156909e-06,
"loss": 0.1656,
"step": 1610
},
{
"epoch": 0.52,
"grad_norm": 4.7365125043629295,
"learning_rate": 4.9922263492544885e-06,
"loss": 0.5786,
"step": 1611
},
{
"epoch": 0.52,
"grad_norm": 1.5329901231772882,
"learning_rate": 4.987043924703342e-06,
"loss": 0.1941,
"step": 1612
},
{
"epoch": 0.52,
"grad_norm": 5.556001088065364,
"learning_rate": 4.981861514070979e-06,
"loss": 0.5425,
"step": 1613
},
{
"epoch": 0.52,
"grad_norm": 6.003284521046527,
"learning_rate": 4.976679122924896e-06,
"loss": 0.5762,
"step": 1614
},
{
"epoch": 0.52,
"grad_norm": 1.4350546509665651,
"learning_rate": 4.971496756832557e-06,
"loss": 0.1769,
"step": 1615
},
{
"epoch": 0.52,
"grad_norm": 11.116719286554067,
"learning_rate": 4.966314421361416e-06,
"loss": 0.7031,
"step": 1616
},
{
"epoch": 0.52,
"grad_norm": 5.856978400288504,
"learning_rate": 4.9611321220788775e-06,
"loss": 0.5082,
"step": 1617
},
{
"epoch": 0.52,
"grad_norm": 6.127190486034729,
"learning_rate": 4.955949864552318e-06,
"loss": 0.436,
"step": 1618
},
{
"epoch": 0.52,
"grad_norm": 7.534478150983952,
"learning_rate": 4.950767654349067e-06,
"loss": 0.7365,
"step": 1619
},
{
"epoch": 0.52,
"grad_norm": 1.4807038799566166,
"learning_rate": 4.945585497036396e-06,
"loss": 0.1882,
"step": 1620
},
{
"epoch": 0.52,
"grad_norm": 1.4785272497673148,
"learning_rate": 4.940403398181531e-06,
"loss": 0.1919,
"step": 1621
},
{
"epoch": 0.52,
"grad_norm": 1.7903098995121822,
"learning_rate": 4.935221363351631e-06,
"loss": 0.2484,
"step": 1622
},
{
"epoch": 0.52,
"grad_norm": 6.8985010613029925,
"learning_rate": 4.930039398113779e-06,
"loss": 0.6661,
"step": 1623
},
{
"epoch": 0.52,
"grad_norm": 7.243755959377869,
"learning_rate": 4.924857508034994e-06,
"loss": 0.6885,
"step": 1624
},
{
"epoch": 0.52,
"grad_norm": 7.642501092348366,
"learning_rate": 4.919675698682206e-06,
"loss": 0.6593,
"step": 1625
},
{
"epoch": 0.52,
"grad_norm": 1.5911392117340943,
"learning_rate": 4.914493975622263e-06,
"loss": 0.2475,
"step": 1626
},
{
"epoch": 0.52,
"grad_norm": 1.4100831001147551,
"learning_rate": 4.909312344421923e-06,
"loss": 0.2414,
"step": 1627
},
{
"epoch": 0.52,
"grad_norm": 6.571236025451696,
"learning_rate": 4.904130810647836e-06,
"loss": 0.4527,
"step": 1628
},
{
"epoch": 0.52,
"grad_norm": 6.669335144079173,
"learning_rate": 4.898949379866556e-06,
"loss": 0.6132,
"step": 1629
},
{
"epoch": 0.52,
"grad_norm": 1.4163218798256578,
"learning_rate": 4.893768057644519e-06,
"loss": 0.1848,
"step": 1630
},
{
"epoch": 0.52,
"grad_norm": 11.033302150040155,
"learning_rate": 4.888586849548053e-06,
"loss": 0.7577,
"step": 1631
},
{
"epoch": 0.52,
"grad_norm": 1.511734054726008,
"learning_rate": 4.883405761143357e-06,
"loss": 0.242,
"step": 1632
},
{
"epoch": 0.52,
"grad_norm": 7.732217467007269,
"learning_rate": 4.878224797996502e-06,
"loss": 0.634,
"step": 1633
},
{
"epoch": 0.52,
"grad_norm": 1.5410664491465915,
"learning_rate": 4.873043965673427e-06,
"loss": 0.2178,
"step": 1634
},
{
"epoch": 0.52,
"grad_norm": 1.648199921207977,
"learning_rate": 4.86786326973993e-06,
"loss": 0.1893,
"step": 1635
},
{
"epoch": 0.52,
"grad_norm": 1.6163905000816106,
"learning_rate": 4.862682715761658e-06,
"loss": 0.2211,
"step": 1636
},
{
"epoch": 0.52,
"grad_norm": 7.6480927880722005,
"learning_rate": 4.857502309304114e-06,
"loss": 0.4649,
"step": 1637
},
{
"epoch": 0.52,
"grad_norm": 7.508552081177089,
"learning_rate": 4.852322055932633e-06,
"loss": 0.5303,
"step": 1638
},
{
"epoch": 0.52,
"grad_norm": 3.464004710850647,
"learning_rate": 4.8471419612123925e-06,
"loss": 0.4644,
"step": 1639
},
{
"epoch": 0.52,
"grad_norm": 1.6136388326074615,
"learning_rate": 4.841962030708398e-06,
"loss": 0.2076,
"step": 1640
},
{
"epoch": 0.53,
"grad_norm": 7.585705888438502,
"learning_rate": 4.836782269985475e-06,
"loss": 0.5187,
"step": 1641
},
{
"epoch": 0.53,
"grad_norm": 1.4571905814330208,
"learning_rate": 4.831602684608274e-06,
"loss": 0.2184,
"step": 1642
},
{
"epoch": 0.53,
"grad_norm": 1.5722559874679716,
"learning_rate": 4.826423280141247e-06,
"loss": 0.2403,
"step": 1643
},
{
"epoch": 0.53,
"grad_norm": 4.848711773204869,
"learning_rate": 4.82124406214866e-06,
"loss": 0.604,
"step": 1644
},
{
"epoch": 0.53,
"grad_norm": 8.078099268633892,
"learning_rate": 4.816065036194576e-06,
"loss": 0.6213,
"step": 1645
},
{
"epoch": 0.53,
"grad_norm": 1.4522697917992005,
"learning_rate": 4.810886207842852e-06,
"loss": 0.1996,
"step": 1646
},
{
"epoch": 0.53,
"grad_norm": 9.659059380062939,
"learning_rate": 4.80570758265713e-06,
"loss": 0.5396,
"step": 1647
},
{
"epoch": 0.53,
"grad_norm": 1.3500567745829162,
"learning_rate": 4.800529166200837e-06,
"loss": 0.1811,
"step": 1648
},
{
"epoch": 0.53,
"grad_norm": 1.409901723972015,
"learning_rate": 4.795350964037174e-06,
"loss": 0.193,
"step": 1649
},
{
"epoch": 0.53,
"grad_norm": 1.67728842944199,
"learning_rate": 4.790172981729116e-06,
"loss": 0.2073,
"step": 1650
},
{
"epoch": 0.53,
"grad_norm": 1.4747163179967033,
"learning_rate": 4.784995224839394e-06,
"loss": 0.2011,
"step": 1651
},
{
"epoch": 0.53,
"grad_norm": 1.3918472947201297,
"learning_rate": 4.779817698930502e-06,
"loss": 0.1807,
"step": 1652
},
{
"epoch": 0.53,
"grad_norm": 1.5452694954497928,
"learning_rate": 4.774640409564688e-06,
"loss": 0.2433,
"step": 1653
},
{
"epoch": 0.53,
"grad_norm": 1.482549087730426,
"learning_rate": 4.76946336230394e-06,
"loss": 0.2244,
"step": 1654
},
{
"epoch": 0.53,
"grad_norm": 1.7114952280499294,
"learning_rate": 4.76428656270999e-06,
"loss": 0.1936,
"step": 1655
},
{
"epoch": 0.53,
"grad_norm": 1.7086166774833853,
"learning_rate": 4.759110016344302e-06,
"loss": 0.2624,
"step": 1656
},
{
"epoch": 0.53,
"grad_norm": 5.141583085656002,
"learning_rate": 4.753933728768069e-06,
"loss": 0.7628,
"step": 1657
},
{
"epoch": 0.53,
"grad_norm": 1.5872873234003366,
"learning_rate": 4.748757705542205e-06,
"loss": 0.2484,
"step": 1658
},
{
"epoch": 0.53,
"grad_norm": 5.550112130971094,
"learning_rate": 4.743581952227342e-06,
"loss": 0.4721,
"step": 1659
},
{
"epoch": 0.53,
"grad_norm": 8.536380150480248,
"learning_rate": 4.73840647438382e-06,
"loss": 0.6757,
"step": 1660
},
{
"epoch": 0.53,
"grad_norm": 4.989894442579678,
"learning_rate": 4.733231277571683e-06,
"loss": 0.6371,
"step": 1661
},
{
"epoch": 0.53,
"grad_norm": 1.4358952778672174,
"learning_rate": 4.7280563673506745e-06,
"loss": 0.205,
"step": 1662
},
{
"epoch": 0.53,
"grad_norm": 6.398696302319853,
"learning_rate": 4.722881749280232e-06,
"loss": 0.6624,
"step": 1663
},
{
"epoch": 0.53,
"grad_norm": 1.5544463809533282,
"learning_rate": 4.717707428919471e-06,
"loss": 0.2099,
"step": 1664
},
{
"epoch": 0.53,
"grad_norm": 1.5735953592365777,
"learning_rate": 4.712533411827197e-06,
"loss": 0.2089,
"step": 1665
},
{
"epoch": 0.53,
"grad_norm": 1.9030007651152627,
"learning_rate": 4.707359703561885e-06,
"loss": 0.2436,
"step": 1666
},
{
"epoch": 0.53,
"grad_norm": 13.291749202832973,
"learning_rate": 4.702186309681677e-06,
"loss": 0.5519,
"step": 1667
},
{
"epoch": 0.53,
"grad_norm": 1.410983901672467,
"learning_rate": 4.697013235744382e-06,
"loss": 0.1887,
"step": 1668
},
{
"epoch": 0.53,
"grad_norm": 1.613523732340002,
"learning_rate": 4.6918404873074574e-06,
"loss": 0.2727,
"step": 1669
},
{
"epoch": 0.53,
"grad_norm": 1.6072885094968894,
"learning_rate": 4.68666806992802e-06,
"loss": 0.2198,
"step": 1670
},
{
"epoch": 0.53,
"grad_norm": 1.5578545006744975,
"learning_rate": 4.681495989162826e-06,
"loss": 0.1873,
"step": 1671
},
{
"epoch": 0.54,
"grad_norm": 1.3940148056278308,
"learning_rate": 4.676324250568269e-06,
"loss": 0.2252,
"step": 1672
},
{
"epoch": 0.54,
"grad_norm": 5.288350557992271,
"learning_rate": 4.671152859700377e-06,
"loss": 0.5956,
"step": 1673
},
{
"epoch": 0.54,
"grad_norm": 18.703974121950782,
"learning_rate": 4.665981822114805e-06,
"loss": 0.68,
"step": 1674
},
{
"epoch": 0.54,
"grad_norm": 1.5045278407254707,
"learning_rate": 4.660811143366828e-06,
"loss": 0.2309,
"step": 1675
},
{
"epoch": 0.54,
"grad_norm": 1.7567489603891941,
"learning_rate": 4.655640829011335e-06,
"loss": 0.2693,
"step": 1676
},
{
"epoch": 0.54,
"grad_norm": 12.370221926729593,
"learning_rate": 4.65047088460282e-06,
"loss": 0.5591,
"step": 1677
},
{
"epoch": 0.54,
"grad_norm": 6.638345947199636,
"learning_rate": 4.645301315695387e-06,
"loss": 0.6048,
"step": 1678
},
{
"epoch": 0.54,
"grad_norm": 7.489636741394892,
"learning_rate": 4.6401321278427334e-06,
"loss": 0.5245,
"step": 1679
},
{
"epoch": 0.54,
"grad_norm": 6.883036801328955,
"learning_rate": 4.634963326598143e-06,
"loss": 0.4518,
"step": 1680
},
{
"epoch": 0.54,
"grad_norm": 6.186896493463807,
"learning_rate": 4.629794917514492e-06,
"loss": 0.4841,
"step": 1681
},
{
"epoch": 0.54,
"grad_norm": 1.3125884874686817,
"learning_rate": 4.624626906144227e-06,
"loss": 0.2029,
"step": 1682
},
{
"epoch": 0.54,
"grad_norm": 9.82696660510622,
"learning_rate": 4.619459298039373e-06,
"loss": 0.7887,
"step": 1683
},
{
"epoch": 0.54,
"grad_norm": 1.4749125491531885,
"learning_rate": 4.614292098751524e-06,
"loss": 0.194,
"step": 1684
},
{
"epoch": 0.54,
"grad_norm": 1.4302320856558082,
"learning_rate": 4.609125313831826e-06,
"loss": 0.2203,
"step": 1685
},
{
"epoch": 0.54,
"grad_norm": 1.295901379646909,
"learning_rate": 4.603958948830985e-06,
"loss": 0.2134,
"step": 1686
},
{
"epoch": 0.54,
"grad_norm": 7.043975863801591,
"learning_rate": 4.5987930092992596e-06,
"loss": 0.6505,
"step": 1687
},
{
"epoch": 0.54,
"grad_norm": 4.988966009411801,
"learning_rate": 4.593627500786444e-06,
"loss": 0.5901,
"step": 1688
},
{
"epoch": 0.54,
"grad_norm": 7.100701994793637,
"learning_rate": 4.588462428841875e-06,
"loss": 0.4935,
"step": 1689
},
{
"epoch": 0.54,
"grad_norm": 1.5612284060847004,
"learning_rate": 4.5832977990144165e-06,
"loss": 0.2397,
"step": 1690
},
{
"epoch": 0.54,
"grad_norm": 7.763162077786609,
"learning_rate": 4.578133616852462e-06,
"loss": 0.5933,
"step": 1691
},
{
"epoch": 0.54,
"grad_norm": 1.6277933828415267,
"learning_rate": 4.572969887903916e-06,
"loss": 0.2468,
"step": 1692
},
{
"epoch": 0.54,
"grad_norm": 9.051605590968945,
"learning_rate": 4.5678066177162065e-06,
"loss": 0.5205,
"step": 1693
},
{
"epoch": 0.54,
"grad_norm": 1.5020282985899696,
"learning_rate": 4.562643811836263e-06,
"loss": 0.2084,
"step": 1694
},
{
"epoch": 0.54,
"grad_norm": 1.5209349923279736,
"learning_rate": 4.557481475810512e-06,
"loss": 0.199,
"step": 1695
},
{
"epoch": 0.54,
"grad_norm": 1.5830508474830205,
"learning_rate": 4.5523196151848846e-06,
"loss": 0.2403,
"step": 1696
},
{
"epoch": 0.54,
"grad_norm": 1.5070688422079719,
"learning_rate": 4.547158235504797e-06,
"loss": 0.2077,
"step": 1697
},
{
"epoch": 0.54,
"grad_norm": 5.724310924018424,
"learning_rate": 4.541997342315145e-06,
"loss": 0.6048,
"step": 1698
},
{
"epoch": 0.54,
"grad_norm": 1.665470954850008,
"learning_rate": 4.536836941160308e-06,
"loss": 0.2369,
"step": 1699
},
{
"epoch": 0.54,
"grad_norm": 1.437798091753683,
"learning_rate": 4.5316770375841315e-06,
"loss": 0.2101,
"step": 1700
},
{
"epoch": 0.54,
"grad_norm": 5.1224393390996115,
"learning_rate": 4.526517637129927e-06,
"loss": 0.6111,
"step": 1701
},
{
"epoch": 0.54,
"grad_norm": 1.673943747936093,
"learning_rate": 4.5213587453404736e-06,
"loss": 0.2374,
"step": 1702
},
{
"epoch": 0.54,
"grad_norm": 7.690904641655284,
"learning_rate": 4.5162003677579905e-06,
"loss": 0.4557,
"step": 1703
},
{
"epoch": 0.55,
"grad_norm": 14.562195036058197,
"learning_rate": 4.511042509924157e-06,
"loss": 0.6341,
"step": 1704
},
{
"epoch": 0.55,
"grad_norm": 6.243810690232639,
"learning_rate": 4.505885177380083e-06,
"loss": 0.5888,
"step": 1705
},
{
"epoch": 0.55,
"grad_norm": 1.475041588060696,
"learning_rate": 4.5007283756663245e-06,
"loss": 0.2086,
"step": 1706
},
{
"epoch": 0.55,
"grad_norm": 1.6999297366948942,
"learning_rate": 4.495572110322862e-06,
"loss": 0.2425,
"step": 1707
},
{
"epoch": 0.55,
"grad_norm": 1.4591341691196764,
"learning_rate": 4.490416386889097e-06,
"loss": 0.2335,
"step": 1708
},
{
"epoch": 0.55,
"grad_norm": 1.5909566837696103,
"learning_rate": 4.485261210903854e-06,
"loss": 0.2214,
"step": 1709
},
{
"epoch": 0.55,
"grad_norm": 1.6263574903008335,
"learning_rate": 4.48010658790537e-06,
"loss": 0.2573,
"step": 1710
},
{
"epoch": 0.55,
"grad_norm": 1.715216576143055,
"learning_rate": 4.47495252343128e-06,
"loss": 0.2013,
"step": 1711
},
{
"epoch": 0.55,
"grad_norm": 7.54061829603288,
"learning_rate": 4.469799023018628e-06,
"loss": 0.552,
"step": 1712
},
{
"epoch": 0.55,
"grad_norm": 5.734174964300568,
"learning_rate": 4.464646092203846e-06,
"loss": 0.6602,
"step": 1713
},
{
"epoch": 0.55,
"grad_norm": 1.6703194095537768,
"learning_rate": 4.459493736522759e-06,
"loss": 0.2743,
"step": 1714
},
{
"epoch": 0.55,
"grad_norm": 16.523338714753198,
"learning_rate": 4.4543419615105685e-06,
"loss": 0.5262,
"step": 1715
},
{
"epoch": 0.55,
"grad_norm": 1.470165111573854,
"learning_rate": 4.449190772701857e-06,
"loss": 0.2122,
"step": 1716
},
{
"epoch": 0.55,
"grad_norm": 5.174557962848891,
"learning_rate": 4.444040175630577e-06,
"loss": 0.4696,
"step": 1717
},
{
"epoch": 0.55,
"grad_norm": 1.473974371288623,
"learning_rate": 4.438890175830039e-06,
"loss": 0.2049,
"step": 1718
},
{
"epoch": 0.55,
"grad_norm": 6.846259281417841,
"learning_rate": 4.433740778832919e-06,
"loss": 0.5117,
"step": 1719
},
{
"epoch": 0.55,
"grad_norm": 1.618092015002244,
"learning_rate": 4.428591990171246e-06,
"loss": 0.2407,
"step": 1720
},
{
"epoch": 0.55,
"grad_norm": 1.314876889469853,
"learning_rate": 4.423443815376387e-06,
"loss": 0.2395,
"step": 1721
},
{
"epoch": 0.55,
"grad_norm": 1.3907786379474978,
"learning_rate": 4.41829625997906e-06,
"loss": 0.1988,
"step": 1722
},
{
"epoch": 0.55,
"grad_norm": 5.320405944111289,
"learning_rate": 4.413149329509307e-06,
"loss": 0.5652,
"step": 1723
},
{
"epoch": 0.55,
"grad_norm": 6.131506633508225,
"learning_rate": 4.4080030294965085e-06,
"loss": 0.6283,
"step": 1724
},
{
"epoch": 0.55,
"grad_norm": 7.11126164609056,
"learning_rate": 4.402857365469364e-06,
"loss": 0.6257,
"step": 1725
},
{
"epoch": 0.55,
"grad_norm": 5.173440219100077,
"learning_rate": 4.397712342955885e-06,
"loss": 0.4752,
"step": 1726
},
{
"epoch": 0.55,
"grad_norm": 1.5879725972573628,
"learning_rate": 4.392567967483401e-06,
"loss": 0.1979,
"step": 1727
},
{
"epoch": 0.55,
"grad_norm": 1.6411051746008234,
"learning_rate": 4.387424244578543e-06,
"loss": 0.2097,
"step": 1728
},
{
"epoch": 0.55,
"grad_norm": 1.3874887488821213,
"learning_rate": 4.38228117976724e-06,
"loss": 0.1998,
"step": 1729
},
{
"epoch": 0.55,
"grad_norm": 6.474706912612365,
"learning_rate": 4.377138778574716e-06,
"loss": 0.598,
"step": 1730
},
{
"epoch": 0.55,
"grad_norm": 1.4060198079872865,
"learning_rate": 4.371997046525481e-06,
"loss": 0.1733,
"step": 1731
},
{
"epoch": 0.55,
"grad_norm": 5.102142597037899,
"learning_rate": 4.366855989143326e-06,
"loss": 0.4742,
"step": 1732
},
{
"epoch": 0.55,
"grad_norm": 1.6057890375213386,
"learning_rate": 4.3617156119513206e-06,
"loss": 0.2359,
"step": 1733
},
{
"epoch": 0.55,
"grad_norm": 6.140387765097553,
"learning_rate": 4.356575920471796e-06,
"loss": 0.5508,
"step": 1734
},
{
"epoch": 0.56,
"grad_norm": 4.3850640405697385,
"learning_rate": 4.351436920226357e-06,
"loss": 0.4268,
"step": 1735
},
{
"epoch": 0.56,
"grad_norm": 1.5197048783350897,
"learning_rate": 4.346298616735855e-06,
"loss": 0.2046,
"step": 1736
},
{
"epoch": 0.56,
"grad_norm": 20.629712192925226,
"learning_rate": 4.3411610155204e-06,
"loss": 0.4235,
"step": 1737
},
{
"epoch": 0.56,
"grad_norm": 1.6883210321669588,
"learning_rate": 4.336024122099348e-06,
"loss": 0.1832,
"step": 1738
},
{
"epoch": 0.56,
"grad_norm": 1.5904537055421788,
"learning_rate": 4.330887941991288e-06,
"loss": 0.2142,
"step": 1739
},
{
"epoch": 0.56,
"grad_norm": 5.795803994129138,
"learning_rate": 4.325752480714052e-06,
"loss": 0.5504,
"step": 1740
},
{
"epoch": 0.56,
"grad_norm": 6.523796739514021,
"learning_rate": 4.320617743784691e-06,
"loss": 0.6077,
"step": 1741
},
{
"epoch": 0.56,
"grad_norm": 1.5539773029900492,
"learning_rate": 4.315483736719482e-06,
"loss": 0.1987,
"step": 1742
},
{
"epoch": 0.56,
"grad_norm": 1.6922857302290626,
"learning_rate": 4.310350465033919e-06,
"loss": 0.2046,
"step": 1743
},
{
"epoch": 0.56,
"grad_norm": 1.7881382147495553,
"learning_rate": 4.305217934242703e-06,
"loss": 0.2259,
"step": 1744
},
{
"epoch": 0.56,
"grad_norm": 7.370882923128106,
"learning_rate": 4.30008614985974e-06,
"loss": 0.6432,
"step": 1745
},
{
"epoch": 0.56,
"grad_norm": 6.986938487466245,
"learning_rate": 4.294955117398139e-06,
"loss": 0.5934,
"step": 1746
},
{
"epoch": 0.56,
"grad_norm": 1.6701181700532506,
"learning_rate": 4.28982484237019e-06,
"loss": 0.2474,
"step": 1747
},
{
"epoch": 0.56,
"grad_norm": 1.7166998278994992,
"learning_rate": 4.284695330287383e-06,
"loss": 0.2933,
"step": 1748
},
{
"epoch": 0.56,
"grad_norm": 1.504265962284257,
"learning_rate": 4.279566586660375e-06,
"loss": 0.2139,
"step": 1749
},
{
"epoch": 0.56,
"grad_norm": 4.411824646932481,
"learning_rate": 4.274438616999007e-06,
"loss": 0.5801,
"step": 1750
},
{
"epoch": 0.56,
"grad_norm": 1.6267117477861073,
"learning_rate": 4.269311426812287e-06,
"loss": 0.1945,
"step": 1751
},
{
"epoch": 0.56,
"grad_norm": 1.4356191284625996,
"learning_rate": 4.264185021608382e-06,
"loss": 0.2397,
"step": 1752
},
{
"epoch": 0.56,
"grad_norm": 1.5724914920821444,
"learning_rate": 4.259059406894619e-06,
"loss": 0.2002,
"step": 1753
},
{
"epoch": 0.56,
"grad_norm": 1.7246348319553122,
"learning_rate": 4.253934588177473e-06,
"loss": 0.2042,
"step": 1754
},
{
"epoch": 0.56,
"grad_norm": 6.5861733210418905,
"learning_rate": 4.248810570962567e-06,
"loss": 0.5944,
"step": 1755
},
{
"epoch": 0.56,
"grad_norm": 1.6896236515184517,
"learning_rate": 4.24368736075466e-06,
"loss": 0.2512,
"step": 1756
},
{
"epoch": 0.56,
"grad_norm": 1.3707584100361558,
"learning_rate": 4.238564963057646e-06,
"loss": 0.1468,
"step": 1757
},
{
"epoch": 0.56,
"grad_norm": 6.754086049900066,
"learning_rate": 4.233443383374545e-06,
"loss": 0.4956,
"step": 1758
},
{
"epoch": 0.56,
"grad_norm": 1.7097356157742607,
"learning_rate": 4.228322627207499e-06,
"loss": 0.204,
"step": 1759
},
{
"epoch": 0.56,
"grad_norm": 1.407742853131279,
"learning_rate": 4.223202700057765e-06,
"loss": 0.1992,
"step": 1760
},
{
"epoch": 0.56,
"grad_norm": 3.8199802577515234,
"learning_rate": 4.21808360742571e-06,
"loss": 0.5266,
"step": 1761
},
{
"epoch": 0.56,
"grad_norm": 4.401863554668091,
"learning_rate": 4.212965354810802e-06,
"loss": 0.4576,
"step": 1762
},
{
"epoch": 0.56,
"grad_norm": 4.212513214675979,
"learning_rate": 4.207847947711609e-06,
"loss": 0.3691,
"step": 1763
},
{
"epoch": 0.56,
"grad_norm": 7.627512871078331,
"learning_rate": 4.202731391625793e-06,
"loss": 0.5152,
"step": 1764
},
{
"epoch": 0.56,
"grad_norm": 1.7060818304324066,
"learning_rate": 4.1976156920500935e-06,
"loss": 0.2203,
"step": 1765
},
{
"epoch": 0.57,
"grad_norm": 1.5876225424702133,
"learning_rate": 4.19250085448034e-06,
"loss": 0.2687,
"step": 1766
},
{
"epoch": 0.57,
"grad_norm": 6.748841244034787,
"learning_rate": 4.187386884411426e-06,
"loss": 0.6513,
"step": 1767
},
{
"epoch": 0.57,
"grad_norm": 1.7081167230192895,
"learning_rate": 4.182273787337323e-06,
"loss": 0.2453,
"step": 1768
},
{
"epoch": 0.57,
"grad_norm": 6.566783302797043,
"learning_rate": 4.177161568751058e-06,
"loss": 0.4728,
"step": 1769
},
{
"epoch": 0.57,
"grad_norm": 1.3607097035796645,
"learning_rate": 4.172050234144716e-06,
"loss": 0.1577,
"step": 1770
},
{
"epoch": 0.57,
"grad_norm": 5.071976306728582,
"learning_rate": 4.16693978900943e-06,
"loss": 0.5186,
"step": 1771
},
{
"epoch": 0.57,
"grad_norm": 1.4231539023670694,
"learning_rate": 4.161830238835386e-06,
"loss": 0.2001,
"step": 1772
},
{
"epoch": 0.57,
"grad_norm": 6.42368716816888,
"learning_rate": 4.156721589111794e-06,
"loss": 0.5327,
"step": 1773
},
{
"epoch": 0.57,
"grad_norm": 7.420872560774164,
"learning_rate": 4.151613845326912e-06,
"loss": 0.595,
"step": 1774
},
{
"epoch": 0.57,
"grad_norm": 8.846826652876,
"learning_rate": 4.146507012968013e-06,
"loss": 0.508,
"step": 1775
},
{
"epoch": 0.57,
"grad_norm": 1.7543235755795679,
"learning_rate": 4.141401097521396e-06,
"loss": 0.2541,
"step": 1776
},
{
"epoch": 0.57,
"grad_norm": 1.5510284346806966,
"learning_rate": 4.136296104472378e-06,
"loss": 0.2492,
"step": 1777
},
{
"epoch": 0.57,
"grad_norm": 5.458911366718936,
"learning_rate": 4.131192039305278e-06,
"loss": 0.4002,
"step": 1778
},
{
"epoch": 0.57,
"grad_norm": 1.4430146067488399,
"learning_rate": 4.1260889075034254e-06,
"loss": 0.1913,
"step": 1779
},
{
"epoch": 0.57,
"grad_norm": 7.781277777619646,
"learning_rate": 4.120986714549139e-06,
"loss": 0.7018,
"step": 1780
},
{
"epoch": 0.57,
"grad_norm": 1.4839708473717983,
"learning_rate": 4.115885465923734e-06,
"loss": 0.1654,
"step": 1781
},
{
"epoch": 0.57,
"grad_norm": 1.4460528605904932,
"learning_rate": 4.110785167107514e-06,
"loss": 0.1957,
"step": 1782
},
{
"epoch": 0.57,
"grad_norm": 6.773365435616381,
"learning_rate": 4.1056858235797545e-06,
"loss": 0.5647,
"step": 1783
},
{
"epoch": 0.57,
"grad_norm": 12.220426218358293,
"learning_rate": 4.100587440818709e-06,
"loss": 0.5426,
"step": 1784
},
{
"epoch": 0.57,
"grad_norm": 22.772817762692874,
"learning_rate": 4.0954900243016016e-06,
"loss": 0.7357,
"step": 1785
},
{
"epoch": 0.57,
"grad_norm": 7.48829174411786,
"learning_rate": 4.090393579504612e-06,
"loss": 0.7169,
"step": 1786
},
{
"epoch": 0.57,
"grad_norm": 4.648309740903713,
"learning_rate": 4.085298111902882e-06,
"loss": 0.5263,
"step": 1787
},
{
"epoch": 0.57,
"grad_norm": 1.597602967806243,
"learning_rate": 4.080203626970498e-06,
"loss": 0.2189,
"step": 1788
},
{
"epoch": 0.57,
"grad_norm": 6.071825457725986,
"learning_rate": 4.0751101301804945e-06,
"loss": 0.4255,
"step": 1789
},
{
"epoch": 0.57,
"grad_norm": 1.652792052327222,
"learning_rate": 4.070017627004847e-06,
"loss": 0.2202,
"step": 1790
},
{
"epoch": 0.57,
"grad_norm": 1.5537026369081302,
"learning_rate": 4.0649261229144554e-06,
"loss": 0.23,
"step": 1791
},
{
"epoch": 0.57,
"grad_norm": 8.08258986927151,
"learning_rate": 4.059835623379155e-06,
"loss": 0.5197,
"step": 1792
},
{
"epoch": 0.57,
"grad_norm": 7.320119345147985,
"learning_rate": 4.054746133867693e-06,
"loss": 0.5979,
"step": 1793
},
{
"epoch": 0.57,
"grad_norm": 1.4035316461225102,
"learning_rate": 4.0496576598477396e-06,
"loss": 0.2177,
"step": 1794
},
{
"epoch": 0.57,
"grad_norm": 4.939010069100584,
"learning_rate": 4.044570206785874e-06,
"loss": 0.5821,
"step": 1795
},
{
"epoch": 0.57,
"grad_norm": 5.746077810353174,
"learning_rate": 4.039483780147568e-06,
"loss": 0.6111,
"step": 1796
},
{
"epoch": 0.58,
"grad_norm": 1.6318997310079097,
"learning_rate": 4.0343983853972045e-06,
"loss": 0.278,
"step": 1797
},
{
"epoch": 0.58,
"grad_norm": 1.6138834178838686,
"learning_rate": 4.029314027998049e-06,
"loss": 0.2164,
"step": 1798
},
{
"epoch": 0.58,
"grad_norm": 1.5217006808164721,
"learning_rate": 4.024230713412253e-06,
"loss": 0.2481,
"step": 1799
},
{
"epoch": 0.58,
"grad_norm": 1.560099534240655,
"learning_rate": 4.019148447100855e-06,
"loss": 0.1939,
"step": 1800
},
{
"epoch": 0.58,
"grad_norm": 4.902827913888488,
"learning_rate": 4.014067234523756e-06,
"loss": 0.4879,
"step": 1801
},
{
"epoch": 0.58,
"grad_norm": 1.3308791069045471,
"learning_rate": 4.008987081139734e-06,
"loss": 0.1588,
"step": 1802
},
{
"epoch": 0.58,
"grad_norm": 5.461107239679109,
"learning_rate": 4.0039079924064285e-06,
"loss": 0.5968,
"step": 1803
},
{
"epoch": 0.58,
"grad_norm": 5.210652903402672,
"learning_rate": 3.998829973780329e-06,
"loss": 0.497,
"step": 1804
},
{
"epoch": 0.58,
"grad_norm": 1.4707306984844475,
"learning_rate": 3.993753030716783e-06,
"loss": 0.236,
"step": 1805
},
{
"epoch": 0.58,
"grad_norm": 1.5171120911365463,
"learning_rate": 3.988677168669974e-06,
"loss": 0.2225,
"step": 1806
},
{
"epoch": 0.58,
"grad_norm": 4.87098980089652,
"learning_rate": 3.983602393092931e-06,
"loss": 0.4864,
"step": 1807
},
{
"epoch": 0.58,
"grad_norm": 1.4358674306353407,
"learning_rate": 3.978528709437518e-06,
"loss": 0.2057,
"step": 1808
},
{
"epoch": 0.58,
"grad_norm": 1.3934230347776113,
"learning_rate": 3.973456123154415e-06,
"loss": 0.1959,
"step": 1809
},
{
"epoch": 0.58,
"grad_norm": 1.477054205162155,
"learning_rate": 3.9683846396931345e-06,
"loss": 0.1982,
"step": 1810
},
{
"epoch": 0.58,
"grad_norm": 8.558841450833153,
"learning_rate": 3.9633142645019965e-06,
"loss": 0.6265,
"step": 1811
},
{
"epoch": 0.58,
"grad_norm": 1.5627936620922227,
"learning_rate": 3.958245003028136e-06,
"loss": 0.2235,
"step": 1812
},
{
"epoch": 0.58,
"grad_norm": 6.318760709208686,
"learning_rate": 3.953176860717488e-06,
"loss": 0.5263,
"step": 1813
},
{
"epoch": 0.58,
"grad_norm": 7.759952604054854,
"learning_rate": 3.948109843014784e-06,
"loss": 0.6218,
"step": 1814
},
{
"epoch": 0.58,
"grad_norm": 5.424748184559697,
"learning_rate": 3.9430439553635504e-06,
"loss": 0.457,
"step": 1815
},
{
"epoch": 0.58,
"grad_norm": 1.5948020264956497,
"learning_rate": 3.937979203206103e-06,
"loss": 0.1881,
"step": 1816
},
{
"epoch": 0.58,
"grad_norm": 1.607966744580618,
"learning_rate": 3.932915591983526e-06,
"loss": 0.2084,
"step": 1817
},
{
"epoch": 0.58,
"grad_norm": 1.4836365539928573,
"learning_rate": 3.927853127135692e-06,
"loss": 0.2011,
"step": 1818
},
{
"epoch": 0.58,
"grad_norm": 1.5935039901049737,
"learning_rate": 3.92279181410123e-06,
"loss": 0.2311,
"step": 1819
},
{
"epoch": 0.58,
"grad_norm": 1.478616159116555,
"learning_rate": 3.917731658317538e-06,
"loss": 0.2153,
"step": 1820
},
{
"epoch": 0.58,
"grad_norm": 6.201648254842364,
"learning_rate": 3.912672665220773e-06,
"loss": 0.4949,
"step": 1821
},
{
"epoch": 0.58,
"grad_norm": 1.707593293804673,
"learning_rate": 3.907614840245836e-06,
"loss": 0.2136,
"step": 1822
},
{
"epoch": 0.58,
"grad_norm": 12.780929778792782,
"learning_rate": 3.90255818882638e-06,
"loss": 0.6331,
"step": 1823
},
{
"epoch": 0.58,
"grad_norm": 1.5792322617919698,
"learning_rate": 3.897502716394789e-06,
"loss": 0.2206,
"step": 1824
},
{
"epoch": 0.58,
"grad_norm": 1.580528741954114,
"learning_rate": 3.892448428382189e-06,
"loss": 0.1995,
"step": 1825
},
{
"epoch": 0.58,
"grad_norm": 6.1834818314351425,
"learning_rate": 3.887395330218429e-06,
"loss": 0.6776,
"step": 1826
},
{
"epoch": 0.58,
"grad_norm": 6.116530477185117,
"learning_rate": 3.8823434273320794e-06,
"loss": 0.4948,
"step": 1827
},
{
"epoch": 0.58,
"grad_norm": 39.93379751672083,
"learning_rate": 3.877292725150429e-06,
"loss": 0.7192,
"step": 1828
},
{
"epoch": 0.59,
"grad_norm": 1.4710177923811774,
"learning_rate": 3.872243229099476e-06,
"loss": 0.2086,
"step": 1829
},
{
"epoch": 0.59,
"grad_norm": 1.4747346622685849,
"learning_rate": 3.86719494460392e-06,
"loss": 0.1929,
"step": 1830
},
{
"epoch": 0.59,
"grad_norm": 9.177001305514132,
"learning_rate": 3.8621478770871645e-06,
"loss": 0.5628,
"step": 1831
},
{
"epoch": 0.59,
"grad_norm": 1.6173533928294679,
"learning_rate": 3.857102031971298e-06,
"loss": 0.2614,
"step": 1832
},
{
"epoch": 0.59,
"grad_norm": 10.674067426470812,
"learning_rate": 3.852057414677102e-06,
"loss": 0.7517,
"step": 1833
},
{
"epoch": 0.59,
"grad_norm": 1.671545325079946,
"learning_rate": 3.84701403062404e-06,
"loss": 0.2241,
"step": 1834
},
{
"epoch": 0.59,
"grad_norm": 1.4176956897590025,
"learning_rate": 3.841971885230243e-06,
"loss": 0.2066,
"step": 1835
},
{
"epoch": 0.59,
"grad_norm": 1.5737701194084075,
"learning_rate": 3.83693098391252e-06,
"loss": 0.2385,
"step": 1836
},
{
"epoch": 0.59,
"grad_norm": 6.775264960285367,
"learning_rate": 3.8318913320863355e-06,
"loss": 0.5762,
"step": 1837
},
{
"epoch": 0.59,
"grad_norm": 5.734145522241342,
"learning_rate": 3.826852935165818e-06,
"loss": 0.453,
"step": 1838
},
{
"epoch": 0.59,
"grad_norm": 1.5178221139864456,
"learning_rate": 3.8218157985637465e-06,
"loss": 0.2138,
"step": 1839
},
{
"epoch": 0.59,
"grad_norm": 6.409268309251431,
"learning_rate": 3.816779927691542e-06,
"loss": 0.5105,
"step": 1840
},
{
"epoch": 0.59,
"grad_norm": 5.486427847842529,
"learning_rate": 3.811745327959271e-06,
"loss": 0.4761,
"step": 1841
},
{
"epoch": 0.59,
"grad_norm": 1.6145787439209798,
"learning_rate": 3.8067120047756313e-06,
"loss": 0.2377,
"step": 1842
},
{
"epoch": 0.59,
"grad_norm": 7.775836928688941,
"learning_rate": 3.801679963547949e-06,
"loss": 0.7424,
"step": 1843
},
{
"epoch": 0.59,
"grad_norm": 1.6393563920290177,
"learning_rate": 3.7966492096821773e-06,
"loss": 0.2145,
"step": 1844
},
{
"epoch": 0.59,
"grad_norm": 1.554227386371731,
"learning_rate": 3.7916197485828793e-06,
"loss": 0.2287,
"step": 1845
},
{
"epoch": 0.59,
"grad_norm": 64.51241735007792,
"learning_rate": 3.786591585653235e-06,
"loss": 0.6451,
"step": 1846
},
{
"epoch": 0.59,
"grad_norm": 5.94324955915427,
"learning_rate": 3.7815647262950293e-06,
"loss": 0.5017,
"step": 1847
},
{
"epoch": 0.59,
"grad_norm": 1.627161395834181,
"learning_rate": 3.7765391759086424e-06,
"loss": 0.2171,
"step": 1848
},
{
"epoch": 0.59,
"grad_norm": 9.63365608585251,
"learning_rate": 3.771514939893055e-06,
"loss": 0.5464,
"step": 1849
},
{
"epoch": 0.59,
"grad_norm": 6.662502043180906,
"learning_rate": 3.766492023645827e-06,
"loss": 0.6075,
"step": 1850
},
{
"epoch": 0.59,
"grad_norm": 1.3520781910506796,
"learning_rate": 3.761470432563109e-06,
"loss": 0.1646,
"step": 1851
},
{
"epoch": 0.59,
"grad_norm": 1.4609425559575497,
"learning_rate": 3.7564501720396242e-06,
"loss": 0.1679,
"step": 1852
},
{
"epoch": 0.59,
"grad_norm": 1.5524679112039492,
"learning_rate": 3.7514312474686643e-06,
"loss": 0.1711,
"step": 1853
},
{
"epoch": 0.59,
"grad_norm": 1.4775660733216538,
"learning_rate": 3.74641366424209e-06,
"loss": 0.183,
"step": 1854
},
{
"epoch": 0.59,
"grad_norm": 9.881574715535093,
"learning_rate": 3.7413974277503183e-06,
"loss": 0.5769,
"step": 1855
},
{
"epoch": 0.59,
"grad_norm": 26.213951475540537,
"learning_rate": 3.7363825433823187e-06,
"loss": 0.5921,
"step": 1856
},
{
"epoch": 0.59,
"grad_norm": 1.4810382533508477,
"learning_rate": 3.7313690165256134e-06,
"loss": 0.2302,
"step": 1857
},
{
"epoch": 0.59,
"grad_norm": 4.144681656116014,
"learning_rate": 3.7263568525662574e-06,
"loss": 0.5283,
"step": 1858
},
{
"epoch": 0.59,
"grad_norm": 14.430189983347654,
"learning_rate": 3.7213460568888493e-06,
"loss": 0.6371,
"step": 1859
},
{
"epoch": 0.6,
"grad_norm": 5.732369200430303,
"learning_rate": 3.716336634876516e-06,
"loss": 0.508,
"step": 1860
},
{
"epoch": 0.6,
"grad_norm": 7.109457208818201,
"learning_rate": 3.711328591910904e-06,
"loss": 0.4142,
"step": 1861
},
{
"epoch": 0.6,
"grad_norm": 1.4623094872575517,
"learning_rate": 3.7063219333721857e-06,
"loss": 0.1638,
"step": 1862
},
{
"epoch": 0.6,
"grad_norm": 1.339292140127946,
"learning_rate": 3.7013166646390384e-06,
"loss": 0.175,
"step": 1863
},
{
"epoch": 0.6,
"grad_norm": 1.4369749908600948,
"learning_rate": 3.6963127910886526e-06,
"loss": 0.1886,
"step": 1864
},
{
"epoch": 0.6,
"grad_norm": 1.3876469665691422,
"learning_rate": 3.691310318096719e-06,
"loss": 0.1957,
"step": 1865
},
{
"epoch": 0.6,
"grad_norm": 15.726410514421962,
"learning_rate": 3.6863092510374198e-06,
"loss": 0.5672,
"step": 1866
},
{
"epoch": 0.6,
"grad_norm": 1.7560756066611702,
"learning_rate": 3.68130959528343e-06,
"loss": 0.2264,
"step": 1867
},
{
"epoch": 0.6,
"grad_norm": 9.844124211427951,
"learning_rate": 3.6763113562059077e-06,
"loss": 0.4824,
"step": 1868
},
{
"epoch": 0.6,
"grad_norm": 6.583757731411973,
"learning_rate": 3.6713145391744877e-06,
"loss": 0.5138,
"step": 1869
},
{
"epoch": 0.6,
"grad_norm": 1.542809947764467,
"learning_rate": 3.6663191495572827e-06,
"loss": 0.2279,
"step": 1870
},
{
"epoch": 0.6,
"grad_norm": 1.6469751073956655,
"learning_rate": 3.661325192720862e-06,
"loss": 0.2698,
"step": 1871
},
{
"epoch": 0.6,
"grad_norm": 1.4259574440751757,
"learning_rate": 3.6563326740302664e-06,
"loss": 0.1936,
"step": 1872
},
{
"epoch": 0.6,
"grad_norm": 1.6102929765619705,
"learning_rate": 3.6513415988489824e-06,
"loss": 0.2107,
"step": 1873
},
{
"epoch": 0.6,
"grad_norm": 11.98613733766446,
"learning_rate": 3.6463519725389516e-06,
"loss": 0.5655,
"step": 1874
},
{
"epoch": 0.6,
"grad_norm": 4.931491582455747,
"learning_rate": 3.64136380046056e-06,
"loss": 0.5029,
"step": 1875
},
{
"epoch": 0.6,
"grad_norm": 1.740292176898178,
"learning_rate": 3.6363770879726247e-06,
"loss": 0.2635,
"step": 1876
},
{
"epoch": 0.6,
"grad_norm": 5.246034674002168,
"learning_rate": 3.6313918404324e-06,
"loss": 0.4744,
"step": 1877
},
{
"epoch": 0.6,
"grad_norm": 1.5416381623598616,
"learning_rate": 3.6264080631955683e-06,
"loss": 0.2057,
"step": 1878
},
{
"epoch": 0.6,
"grad_norm": 6.5065831334552025,
"learning_rate": 3.621425761616224e-06,
"loss": 0.4561,
"step": 1879
},
{
"epoch": 0.6,
"grad_norm": 1.5709150371409581,
"learning_rate": 3.616444941046887e-06,
"loss": 0.226,
"step": 1880
},
{
"epoch": 0.6,
"grad_norm": 1.7661564422053209,
"learning_rate": 3.6114656068384767e-06,
"loss": 0.2397,
"step": 1881
},
{
"epoch": 0.6,
"grad_norm": 1.6447190302236576,
"learning_rate": 3.6064877643403194e-06,
"loss": 0.2028,
"step": 1882
},
{
"epoch": 0.6,
"grad_norm": 7.133717608567513,
"learning_rate": 3.601511418900143e-06,
"loss": 0.6751,
"step": 1883
},
{
"epoch": 0.6,
"grad_norm": 8.186006171230737,
"learning_rate": 3.5965365758640587e-06,
"loss": 0.5874,
"step": 1884
},
{
"epoch": 0.6,
"grad_norm": 1.7959754256515827,
"learning_rate": 3.591563240576572e-06,
"loss": 0.2749,
"step": 1885
},
{
"epoch": 0.6,
"grad_norm": 8.066879169307333,
"learning_rate": 3.5865914183805606e-06,
"loss": 0.7024,
"step": 1886
},
{
"epoch": 0.6,
"grad_norm": 1.561140650314446,
"learning_rate": 3.581621114617284e-06,
"loss": 0.2427,
"step": 1887
},
{
"epoch": 0.6,
"grad_norm": 1.5876581831452745,
"learning_rate": 3.5766523346263682e-06,
"loss": 0.1859,
"step": 1888
},
{
"epoch": 0.6,
"grad_norm": 4.346267893932182,
"learning_rate": 3.571685083745798e-06,
"loss": 0.4841,
"step": 1889
},
{
"epoch": 0.6,
"grad_norm": 5.402297344543703,
"learning_rate": 3.56671936731192e-06,
"loss": 0.5467,
"step": 1890
},
{
"epoch": 0.61,
"grad_norm": 22.42184732934803,
"learning_rate": 3.561755190659434e-06,
"loss": 0.6674,
"step": 1891
},
{
"epoch": 0.61,
"grad_norm": 1.4982944172410966,
"learning_rate": 3.556792559121377e-06,
"loss": 0.1991,
"step": 1892
},
{
"epoch": 0.61,
"grad_norm": 1.4595813614244566,
"learning_rate": 3.5518314780291384e-06,
"loss": 0.2148,
"step": 1893
},
{
"epoch": 0.61,
"grad_norm": 1.4316698683372513,
"learning_rate": 3.5468719527124294e-06,
"loss": 0.2419,
"step": 1894
},
{
"epoch": 0.61,
"grad_norm": 9.347706963596861,
"learning_rate": 3.541913988499299e-06,
"loss": 0.5543,
"step": 1895
},
{
"epoch": 0.61,
"grad_norm": 6.88633177109033,
"learning_rate": 3.5369575907161167e-06,
"loss": 0.6354,
"step": 1896
},
{
"epoch": 0.61,
"grad_norm": 1.4594224251034926,
"learning_rate": 3.5320027646875643e-06,
"loss": 0.2086,
"step": 1897
},
{
"epoch": 0.61,
"grad_norm": 13.25718892488323,
"learning_rate": 3.5270495157366434e-06,
"loss": 0.5,
"step": 1898
},
{
"epoch": 0.61,
"grad_norm": 1.423121637160957,
"learning_rate": 3.5220978491846534e-06,
"loss": 0.2008,
"step": 1899
},
{
"epoch": 0.61,
"grad_norm": 5.130360114988962,
"learning_rate": 3.517147770351199e-06,
"loss": 0.6193,
"step": 1900
},
{
"epoch": 0.61,
"grad_norm": 5.320821455316036,
"learning_rate": 3.5121992845541797e-06,
"loss": 0.4942,
"step": 1901
},
{
"epoch": 0.61,
"grad_norm": 1.659820858403778,
"learning_rate": 3.507252397109777e-06,
"loss": 0.2019,
"step": 1902
},
{
"epoch": 0.61,
"grad_norm": 2.63154526082194,
"learning_rate": 3.5023071133324627e-06,
"loss": 0.2248,
"step": 1903
},
{
"epoch": 0.61,
"grad_norm": 7.022826563452933,
"learning_rate": 3.497363438534984e-06,
"loss": 0.5328,
"step": 1904
},
{
"epoch": 0.61,
"grad_norm": 5.697366547300721,
"learning_rate": 3.4924213780283545e-06,
"loss": 0.5742,
"step": 1905
},
{
"epoch": 0.61,
"grad_norm": 1.331484318545027,
"learning_rate": 3.4874809371218608e-06,
"loss": 0.1748,
"step": 1906
},
{
"epoch": 0.61,
"grad_norm": 6.555156381913028,
"learning_rate": 3.4825421211230437e-06,
"loss": 0.4464,
"step": 1907
},
{
"epoch": 0.61,
"grad_norm": 1.831238929483396,
"learning_rate": 3.4776049353377016e-06,
"loss": 0.2653,
"step": 1908
},
{
"epoch": 0.61,
"grad_norm": 1.5539349573467844,
"learning_rate": 3.4726693850698824e-06,
"loss": 0.1831,
"step": 1909
},
{
"epoch": 0.61,
"grad_norm": 1.4921531483106225,
"learning_rate": 3.467735475621873e-06,
"loss": 0.2183,
"step": 1910
},
{
"epoch": 0.61,
"grad_norm": 6.822048837955331,
"learning_rate": 3.4628032122942024e-06,
"loss": 0.6846,
"step": 1911
},
{
"epoch": 0.61,
"grad_norm": 1.4375609709992334,
"learning_rate": 3.4578726003856245e-06,
"loss": 0.1897,
"step": 1912
},
{
"epoch": 0.61,
"grad_norm": 8.331973779115568,
"learning_rate": 3.4529436451931263e-06,
"loss": 0.6711,
"step": 1913
},
{
"epoch": 0.61,
"grad_norm": 1.433757835791655,
"learning_rate": 3.448016352011914e-06,
"loss": 0.206,
"step": 1914
},
{
"epoch": 0.61,
"grad_norm": 1.4097043159816938,
"learning_rate": 3.4430907261354e-06,
"loss": 0.2271,
"step": 1915
},
{
"epoch": 0.61,
"grad_norm": 1.4640202341252477,
"learning_rate": 3.438166772855218e-06,
"loss": 0.2438,
"step": 1916
},
{
"epoch": 0.61,
"grad_norm": 7.217793275103471,
"learning_rate": 3.4332444974611946e-06,
"loss": 0.5843,
"step": 1917
},
{
"epoch": 0.61,
"grad_norm": 1.6274207665218545,
"learning_rate": 3.428323905241358e-06,
"loss": 0.2075,
"step": 1918
},
{
"epoch": 0.61,
"grad_norm": 1.542948114277552,
"learning_rate": 3.4234050014819308e-06,
"loss": 0.1905,
"step": 1919
},
{
"epoch": 0.61,
"grad_norm": 4.881173284230082,
"learning_rate": 3.4184877914673155e-06,
"loss": 0.5326,
"step": 1920
},
{
"epoch": 0.61,
"grad_norm": 1.6121710493528836,
"learning_rate": 3.4135722804801004e-06,
"loss": 0.2501,
"step": 1921
},
{
"epoch": 0.62,
"grad_norm": 6.562998966937795,
"learning_rate": 3.4086584738010455e-06,
"loss": 0.7177,
"step": 1922
},
{
"epoch": 0.62,
"grad_norm": 5.635588077680637,
"learning_rate": 3.4037463767090807e-06,
"loss": 0.7366,
"step": 1923
},
{
"epoch": 0.62,
"grad_norm": 5.703916710895146,
"learning_rate": 3.3988359944812997e-06,
"loss": 0.5989,
"step": 1924
},
{
"epoch": 0.62,
"grad_norm": 5.786545352497819,
"learning_rate": 3.3939273323929533e-06,
"loss": 0.5596,
"step": 1925
},
{
"epoch": 0.62,
"grad_norm": 1.4666539704166872,
"learning_rate": 3.3890203957174437e-06,
"loss": 0.1994,
"step": 1926
},
{
"epoch": 0.62,
"grad_norm": 1.336478320422882,
"learning_rate": 3.3841151897263234e-06,
"loss": 0.1835,
"step": 1927
},
{
"epoch": 0.62,
"grad_norm": 6.148092605217953,
"learning_rate": 3.379211719689278e-06,
"loss": 0.4879,
"step": 1928
},
{
"epoch": 0.62,
"grad_norm": 1.7210581135618828,
"learning_rate": 3.3743099908741385e-06,
"loss": 0.236,
"step": 1929
},
{
"epoch": 0.62,
"grad_norm": 4.728235540536196,
"learning_rate": 3.3694100085468535e-06,
"loss": 0.5054,
"step": 1930
},
{
"epoch": 0.62,
"grad_norm": 4.778817295690519,
"learning_rate": 3.364511777971504e-06,
"loss": 0.5797,
"step": 1931
},
{
"epoch": 0.62,
"grad_norm": 1.6879178455255748,
"learning_rate": 3.3596153044102897e-06,
"loss": 0.2031,
"step": 1932
},
{
"epoch": 0.62,
"grad_norm": 6.524561783199086,
"learning_rate": 3.354720593123514e-06,
"loss": 0.4807,
"step": 1933
},
{
"epoch": 0.62,
"grad_norm": 1.4707396174392589,
"learning_rate": 3.349827649369596e-06,
"loss": 0.2166,
"step": 1934
},
{
"epoch": 0.62,
"grad_norm": 1.4140887072173238,
"learning_rate": 3.3449364784050515e-06,
"loss": 0.1924,
"step": 1935
},
{
"epoch": 0.62,
"grad_norm": 1.5230715057418331,
"learning_rate": 3.3400470854844925e-06,
"loss": 0.2251,
"step": 1936
},
{
"epoch": 0.62,
"grad_norm": 1.588316726208856,
"learning_rate": 3.3351594758606222e-06,
"loss": 0.1941,
"step": 1937
},
{
"epoch": 0.62,
"grad_norm": 1.392968028630388,
"learning_rate": 3.3302736547842263e-06,
"loss": 0.203,
"step": 1938
},
{
"epoch": 0.62,
"grad_norm": 15.627180374394712,
"learning_rate": 3.3253896275041677e-06,
"loss": 0.6433,
"step": 1939
},
{
"epoch": 0.62,
"grad_norm": 1.4727777997541183,
"learning_rate": 3.3205073992673885e-06,
"loss": 0.1627,
"step": 1940
},
{
"epoch": 0.62,
"grad_norm": 1.3929575755280745,
"learning_rate": 3.3156269753188895e-06,
"loss": 0.2365,
"step": 1941
},
{
"epoch": 0.62,
"grad_norm": 1.6450741699218787,
"learning_rate": 3.310748360901741e-06,
"loss": 0.2574,
"step": 1942
},
{
"epoch": 0.62,
"grad_norm": 1.4319705292160325,
"learning_rate": 3.3058715612570623e-06,
"loss": 0.1816,
"step": 1943
},
{
"epoch": 0.62,
"grad_norm": 1.907979158558452,
"learning_rate": 3.300996581624028e-06,
"loss": 0.2189,
"step": 1944
},
{
"epoch": 0.62,
"grad_norm": 1.5258151364041213,
"learning_rate": 3.2961234272398578e-06,
"loss": 0.22,
"step": 1945
},
{
"epoch": 0.62,
"grad_norm": 1.5108684651351383,
"learning_rate": 3.291252103339806e-06,
"loss": 0.2239,
"step": 1946
},
{
"epoch": 0.62,
"grad_norm": 8.675314497598764,
"learning_rate": 3.2863826151571654e-06,
"loss": 0.6347,
"step": 1947
},
{
"epoch": 0.62,
"grad_norm": 1.5687204871979519,
"learning_rate": 3.2815149679232507e-06,
"loss": 0.2125,
"step": 1948
},
{
"epoch": 0.62,
"grad_norm": 1.5618198177694862,
"learning_rate": 3.276649166867406e-06,
"loss": 0.1645,
"step": 1949
},
{
"epoch": 0.62,
"grad_norm": 21.69616175468184,
"learning_rate": 3.271785217216987e-06,
"loss": 0.5923,
"step": 1950
},
{
"epoch": 0.62,
"grad_norm": 1.6603910334654677,
"learning_rate": 3.266923124197363e-06,
"loss": 0.2036,
"step": 1951
},
{
"epoch": 0.62,
"grad_norm": 9.831953297013888,
"learning_rate": 3.2620628930319065e-06,
"loss": 0.5955,
"step": 1952
},
{
"epoch": 0.62,
"grad_norm": 1.4383358845394665,
"learning_rate": 3.257204528941993e-06,
"loss": 0.1878,
"step": 1953
},
{
"epoch": 0.63,
"grad_norm": 1.8937038713171377,
"learning_rate": 3.2523480371469863e-06,
"loss": 0.2566,
"step": 1954
},
{
"epoch": 0.63,
"grad_norm": 1.6601631341773913,
"learning_rate": 3.2474934228642475e-06,
"loss": 0.2116,
"step": 1955
},
{
"epoch": 0.63,
"grad_norm": 1.4946472491543998,
"learning_rate": 3.242640691309111e-06,
"loss": 0.2136,
"step": 1956
},
{
"epoch": 0.63,
"grad_norm": 1.663572259417549,
"learning_rate": 3.2377898476948964e-06,
"loss": 0.2139,
"step": 1957
},
{
"epoch": 0.63,
"grad_norm": 1.563525039515537,
"learning_rate": 3.2329408972328934e-06,
"loss": 0.2232,
"step": 1958
},
{
"epoch": 0.63,
"grad_norm": 8.282674321367239,
"learning_rate": 3.2280938451323524e-06,
"loss": 0.5165,
"step": 1959
},
{
"epoch": 0.63,
"grad_norm": 5.514702796745394,
"learning_rate": 3.223248696600493e-06,
"loss": 0.5625,
"step": 1960
},
{
"epoch": 0.63,
"grad_norm": 7.337272723423271,
"learning_rate": 3.2184054568424817e-06,
"loss": 0.652,
"step": 1961
},
{
"epoch": 0.63,
"grad_norm": 13.4565422202732,
"learning_rate": 3.2135641310614383e-06,
"loss": 0.4632,
"step": 1962
},
{
"epoch": 0.63,
"grad_norm": 8.20574380936658,
"learning_rate": 3.20872472445843e-06,
"loss": 0.5901,
"step": 1963
},
{
"epoch": 0.63,
"grad_norm": 1.6472218244461125,
"learning_rate": 3.203887242232455e-06,
"loss": 0.2726,
"step": 1964
},
{
"epoch": 0.63,
"grad_norm": 5.353518645247188,
"learning_rate": 3.1990516895804467e-06,
"loss": 0.6127,
"step": 1965
},
{
"epoch": 0.63,
"grad_norm": 6.213613480442003,
"learning_rate": 3.1942180716972698e-06,
"loss": 0.6286,
"step": 1966
},
{
"epoch": 0.63,
"grad_norm": 1.7739841634082338,
"learning_rate": 3.189386393775703e-06,
"loss": 0.2519,
"step": 1967
},
{
"epoch": 0.63,
"grad_norm": 1.6210267913735106,
"learning_rate": 3.1845566610064487e-06,
"loss": 0.1995,
"step": 1968
},
{
"epoch": 0.63,
"grad_norm": 1.5761620412579846,
"learning_rate": 3.179728878578112e-06,
"loss": 0.2101,
"step": 1969
},
{
"epoch": 0.63,
"grad_norm": 1.4856745471000272,
"learning_rate": 3.1749030516772084e-06,
"loss": 0.2027,
"step": 1970
},
{
"epoch": 0.63,
"grad_norm": 1.486780761283473,
"learning_rate": 3.170079185488153e-06,
"loss": 0.2093,
"step": 1971
},
{
"epoch": 0.63,
"grad_norm": 10.021905881679904,
"learning_rate": 3.165257285193248e-06,
"loss": 0.5723,
"step": 1972
},
{
"epoch": 0.63,
"grad_norm": 6.322326068998638,
"learning_rate": 3.1604373559726915e-06,
"loss": 0.5558,
"step": 1973
},
{
"epoch": 0.63,
"grad_norm": 9.687341983036632,
"learning_rate": 3.1556194030045563e-06,
"loss": 0.5726,
"step": 1974
},
{
"epoch": 0.63,
"grad_norm": 6.1359474444950735,
"learning_rate": 3.1508034314647994e-06,
"loss": 0.5188,
"step": 1975
},
{
"epoch": 0.63,
"grad_norm": 9.759980812467012,
"learning_rate": 3.1459894465272467e-06,
"loss": 0.7004,
"step": 1976
},
{
"epoch": 0.63,
"grad_norm": 9.084618702056398,
"learning_rate": 3.1411774533635854e-06,
"loss": 0.5408,
"step": 1977
},
{
"epoch": 0.63,
"grad_norm": 8.909269377302472,
"learning_rate": 3.136367457143369e-06,
"loss": 0.465,
"step": 1978
},
{
"epoch": 0.63,
"grad_norm": 8.94043320476312,
"learning_rate": 3.1315594630340052e-06,
"loss": 0.6813,
"step": 1979
},
{
"epoch": 0.63,
"grad_norm": 6.183749546562952,
"learning_rate": 3.1267534762007435e-06,
"loss": 0.4669,
"step": 1980
},
{
"epoch": 0.63,
"grad_norm": 1.5741234273784965,
"learning_rate": 3.1219495018066888e-06,
"loss": 0.2195,
"step": 1981
},
{
"epoch": 0.63,
"grad_norm": 7.276163102620715,
"learning_rate": 3.1171475450127717e-06,
"loss": 0.636,
"step": 1982
},
{
"epoch": 0.63,
"grad_norm": 1.6501460615291372,
"learning_rate": 3.112347610977764e-06,
"loss": 0.2233,
"step": 1983
},
{
"epoch": 0.63,
"grad_norm": 6.706422678256099,
"learning_rate": 3.1075497048582635e-06,
"loss": 0.5977,
"step": 1984
},
{
"epoch": 0.64,
"grad_norm": 5.121825184827592,
"learning_rate": 3.102753831808685e-06,
"loss": 0.5635,
"step": 1985
},
{
"epoch": 0.64,
"grad_norm": 5.944671423831112,
"learning_rate": 3.097959996981263e-06,
"loss": 0.6126,
"step": 1986
},
{
"epoch": 0.64,
"grad_norm": 1.5199018358150311,
"learning_rate": 3.093168205526038e-06,
"loss": 0.1821,
"step": 1987
},
{
"epoch": 0.64,
"grad_norm": 5.402984492584964,
"learning_rate": 3.0883784625908618e-06,
"loss": 0.5082,
"step": 1988
},
{
"epoch": 0.64,
"grad_norm": 8.79597122978233,
"learning_rate": 3.083590773321383e-06,
"loss": 0.6937,
"step": 1989
},
{
"epoch": 0.64,
"grad_norm": 6.055763599004475,
"learning_rate": 3.0788051428610377e-06,
"loss": 0.5702,
"step": 1990
},
{
"epoch": 0.64,
"grad_norm": 1.539020068194516,
"learning_rate": 3.0740215763510617e-06,
"loss": 0.2656,
"step": 1991
},
{
"epoch": 0.64,
"grad_norm": 1.4104076247022932,
"learning_rate": 3.069240078930461e-06,
"loss": 0.1914,
"step": 1992
},
{
"epoch": 0.64,
"grad_norm": 6.2384152458290965,
"learning_rate": 3.0644606557360303e-06,
"loss": 0.554,
"step": 1993
},
{
"epoch": 0.64,
"grad_norm": 5.780603707969804,
"learning_rate": 3.0596833119023283e-06,
"loss": 0.5852,
"step": 1994
},
{
"epoch": 0.64,
"grad_norm": 1.4946923984893086,
"learning_rate": 3.054908052561681e-06,
"loss": 0.2216,
"step": 1995
},
{
"epoch": 0.64,
"grad_norm": 1.5449652616854799,
"learning_rate": 3.0501348828441767e-06,
"loss": 0.2157,
"step": 1996
},
{
"epoch": 0.64,
"grad_norm": 7.333546997801793,
"learning_rate": 3.0453638078776614e-06,
"loss": 0.5461,
"step": 1997
},
{
"epoch": 0.64,
"grad_norm": 1.411111171589759,
"learning_rate": 3.0405948327877233e-06,
"loss": 0.1716,
"step": 1998
},
{
"epoch": 0.64,
"grad_norm": 9.2317474522265,
"learning_rate": 3.0358279626977034e-06,
"loss": 0.6057,
"step": 1999
},
{
"epoch": 0.64,
"grad_norm": 1.6595407476924195,
"learning_rate": 3.0310632027286717e-06,
"loss": 0.252,
"step": 2000
},
{
"epoch": 0.64,
"grad_norm": 10.38123760792797,
"learning_rate": 3.026300557999439e-06,
"loss": 0.4916,
"step": 2001
},
{
"epoch": 0.64,
"grad_norm": 9.01720415482311,
"learning_rate": 3.021540033626544e-06,
"loss": 0.6574,
"step": 2002
},
{
"epoch": 0.64,
"grad_norm": 1.4450001469176612,
"learning_rate": 3.0167816347242396e-06,
"loss": 0.1954,
"step": 2003
},
{
"epoch": 0.64,
"grad_norm": 1.549734064948244,
"learning_rate": 3.012025366404504e-06,
"loss": 0.1883,
"step": 2004
},
{
"epoch": 0.64,
"grad_norm": 1.409288695489273,
"learning_rate": 3.00727123377702e-06,
"loss": 0.1711,
"step": 2005
},
{
"epoch": 0.64,
"grad_norm": 1.496332161334161,
"learning_rate": 3.002519241949181e-06,
"loss": 0.1661,
"step": 2006
},
{
"epoch": 0.64,
"grad_norm": 1.5805932084747782,
"learning_rate": 2.997769396026078e-06,
"loss": 0.2469,
"step": 2007
},
{
"epoch": 0.64,
"grad_norm": 1.5664354665584634,
"learning_rate": 2.9930217011104957e-06,
"loss": 0.2136,
"step": 2008
},
{
"epoch": 0.64,
"grad_norm": 1.6462334538493848,
"learning_rate": 2.98827616230291e-06,
"loss": 0.2298,
"step": 2009
},
{
"epoch": 0.64,
"grad_norm": 6.35756523628157,
"learning_rate": 2.9835327847014816e-06,
"loss": 0.5649,
"step": 2010
},
{
"epoch": 0.64,
"grad_norm": 6.929907060457112,
"learning_rate": 2.9787915734020446e-06,
"loss": 0.3497,
"step": 2011
},
{
"epoch": 0.64,
"grad_norm": 9.218474483538316,
"learning_rate": 2.9740525334981105e-06,
"loss": 0.5576,
"step": 2012
},
{
"epoch": 0.64,
"grad_norm": 8.80653089641477,
"learning_rate": 2.9693156700808556e-06,
"loss": 0.5257,
"step": 2013
},
{
"epoch": 0.64,
"grad_norm": 1.537386140580507,
"learning_rate": 2.9645809882391187e-06,
"loss": 0.2227,
"step": 2014
},
{
"epoch": 0.64,
"grad_norm": 5.894783871681709,
"learning_rate": 2.959848493059396e-06,
"loss": 0.5558,
"step": 2015
},
{
"epoch": 0.65,
"grad_norm": 1.4413244205235274,
"learning_rate": 2.9551181896258317e-06,
"loss": 0.1972,
"step": 2016
},
{
"epoch": 0.65,
"grad_norm": 8.829090357580167,
"learning_rate": 2.9503900830202202e-06,
"loss": 0.645,
"step": 2017
},
{
"epoch": 0.65,
"grad_norm": 1.4472880455418229,
"learning_rate": 2.9456641783219897e-06,
"loss": 0.1827,
"step": 2018
},
{
"epoch": 0.65,
"grad_norm": 14.388491914537449,
"learning_rate": 2.9409404806082077e-06,
"loss": 0.5649,
"step": 2019
},
{
"epoch": 0.65,
"grad_norm": 1.49466934015538,
"learning_rate": 2.936218994953568e-06,
"loss": 0.2012,
"step": 2020
},
{
"epoch": 0.65,
"grad_norm": 1.6108952372793497,
"learning_rate": 2.93149972643039e-06,
"loss": 0.2223,
"step": 2021
},
{
"epoch": 0.65,
"grad_norm": 1.4561213432155709,
"learning_rate": 2.9267826801086103e-06,
"loss": 0.2059,
"step": 2022
},
{
"epoch": 0.65,
"grad_norm": 1.3975976715050646,
"learning_rate": 2.9220678610557773e-06,
"loss": 0.188,
"step": 2023
},
{
"epoch": 0.65,
"grad_norm": 5.734050672169014,
"learning_rate": 2.9173552743370454e-06,
"loss": 0.5869,
"step": 2024
},
{
"epoch": 0.65,
"grad_norm": 1.382280356501303,
"learning_rate": 2.912644925015179e-06,
"loss": 0.1759,
"step": 2025
},
{
"epoch": 0.65,
"grad_norm": 1.540302821087291,
"learning_rate": 2.9079368181505263e-06,
"loss": 0.1892,
"step": 2026
},
{
"epoch": 0.65,
"grad_norm": 5.756507654015998,
"learning_rate": 2.9032309588010372e-06,
"loss": 0.589,
"step": 2027
},
{
"epoch": 0.65,
"grad_norm": 1.423251323977148,
"learning_rate": 2.8985273520222414e-06,
"loss": 0.1678,
"step": 2028
},
{
"epoch": 0.65,
"grad_norm": 14.511239475297987,
"learning_rate": 2.893826002867247e-06,
"loss": 0.5658,
"step": 2029
},
{
"epoch": 0.65,
"grad_norm": 5.579483150865824,
"learning_rate": 2.889126916386744e-06,
"loss": 0.6073,
"step": 2030
},
{
"epoch": 0.65,
"grad_norm": 6.3032834228556505,
"learning_rate": 2.884430097628984e-06,
"loss": 0.5893,
"step": 2031
},
{
"epoch": 0.65,
"grad_norm": 10.049849649109069,
"learning_rate": 2.879735551639787e-06,
"loss": 0.7286,
"step": 2032
},
{
"epoch": 0.65,
"grad_norm": 1.7248569077162244,
"learning_rate": 2.8750432834625312e-06,
"loss": 0.2495,
"step": 2033
},
{
"epoch": 0.65,
"grad_norm": 1.4096401965762477,
"learning_rate": 2.8703532981381437e-06,
"loss": 0.1887,
"step": 2034
},
{
"epoch": 0.65,
"grad_norm": 17.53524436300472,
"learning_rate": 2.8656656007051055e-06,
"loss": 0.5363,
"step": 2035
},
{
"epoch": 0.65,
"grad_norm": 1.5426792696958962,
"learning_rate": 2.860980196199432e-06,
"loss": 0.2052,
"step": 2036
},
{
"epoch": 0.65,
"grad_norm": 1.4129423465102582,
"learning_rate": 2.8562970896546815e-06,
"loss": 0.2227,
"step": 2037
},
{
"epoch": 0.65,
"grad_norm": 1.3850626142230287,
"learning_rate": 2.8516162861019437e-06,
"loss": 0.1734,
"step": 2038
},
{
"epoch": 0.65,
"grad_norm": 1.5490721017643914,
"learning_rate": 2.846937790569828e-06,
"loss": 0.2004,
"step": 2039
},
{
"epoch": 0.65,
"grad_norm": 1.4591522754651152,
"learning_rate": 2.84226160808447e-06,
"loss": 0.1587,
"step": 2040
},
{
"epoch": 0.65,
"grad_norm": 7.18258853851953,
"learning_rate": 2.837587743669521e-06,
"loss": 0.6012,
"step": 2041
},
{
"epoch": 0.65,
"grad_norm": 6.390632791410251,
"learning_rate": 2.8329162023461355e-06,
"loss": 0.6074,
"step": 2042
},
{
"epoch": 0.65,
"grad_norm": 4.020579988074557,
"learning_rate": 2.82824698913298e-06,
"loss": 0.3801,
"step": 2043
},
{
"epoch": 0.65,
"grad_norm": 7.657631858464726,
"learning_rate": 2.823580109046212e-06,
"loss": 0.4631,
"step": 2044
},
{
"epoch": 0.65,
"grad_norm": 1.703102975789837,
"learning_rate": 2.8189155670994913e-06,
"loss": 0.2326,
"step": 2045
},
{
"epoch": 0.65,
"grad_norm": 5.848069849088212,
"learning_rate": 2.814253368303961e-06,
"loss": 0.5003,
"step": 2046
},
{
"epoch": 0.66,
"grad_norm": 7.306567833977227,
"learning_rate": 2.809593517668243e-06,
"loss": 0.5175,
"step": 2047
},
{
"epoch": 0.66,
"grad_norm": 1.56651860206685,
"learning_rate": 2.804936020198447e-06,
"loss": 0.2633,
"step": 2048
},
{
"epoch": 0.66,
"grad_norm": 1.4504924419562817,
"learning_rate": 2.800280880898143e-06,
"loss": 0.1824,
"step": 2049
},
{
"epoch": 0.66,
"grad_norm": 1.7417020399544938,
"learning_rate": 2.795628104768376e-06,
"loss": 0.2284,
"step": 2050
},
{
"epoch": 0.66,
"grad_norm": 1.4453823335646736,
"learning_rate": 2.79097769680765e-06,
"loss": 0.2183,
"step": 2051
},
{
"epoch": 0.66,
"grad_norm": 1.5264151059448143,
"learning_rate": 2.7863296620119217e-06,
"loss": 0.182,
"step": 2052
},
{
"epoch": 0.66,
"grad_norm": 1.6659410056454296,
"learning_rate": 2.781684005374604e-06,
"loss": 0.2121,
"step": 2053
},
{
"epoch": 0.66,
"grad_norm": 9.928166120867077,
"learning_rate": 2.777040731886549e-06,
"loss": 0.4689,
"step": 2054
},
{
"epoch": 0.66,
"grad_norm": 5.856001861805286,
"learning_rate": 2.7723998465360537e-06,
"loss": 0.6054,
"step": 2055
},
{
"epoch": 0.66,
"grad_norm": 1.664206349083662,
"learning_rate": 2.7677613543088432e-06,
"loss": 0.2158,
"step": 2056
},
{
"epoch": 0.66,
"grad_norm": 1.4771185514429073,
"learning_rate": 2.7631252601880816e-06,
"loss": 0.2255,
"step": 2057
},
{
"epoch": 0.66,
"grad_norm": 5.576621706196896,
"learning_rate": 2.7584915691543444e-06,
"loss": 0.4679,
"step": 2058
},
{
"epoch": 0.66,
"grad_norm": 1.8554856852860553,
"learning_rate": 2.753860286185637e-06,
"loss": 0.2524,
"step": 2059
},
{
"epoch": 0.66,
"grad_norm": 1.507848522901589,
"learning_rate": 2.7492314162573687e-06,
"loss": 0.2138,
"step": 2060
},
{
"epoch": 0.66,
"grad_norm": 5.643669411238887,
"learning_rate": 2.744604964342364e-06,
"loss": 0.5348,
"step": 2061
},
{
"epoch": 0.66,
"grad_norm": 1.4696225760376407,
"learning_rate": 2.7399809354108415e-06,
"loss": 0.2237,
"step": 2062
},
{
"epoch": 0.66,
"grad_norm": 1.4569540399198748,
"learning_rate": 2.735359334430424e-06,
"loss": 0.2011,
"step": 2063
},
{
"epoch": 0.66,
"grad_norm": 1.654621339761889,
"learning_rate": 2.7307401663661247e-06,
"loss": 0.2536,
"step": 2064
},
{
"epoch": 0.66,
"grad_norm": 1.64194064592359,
"learning_rate": 2.7261234361803383e-06,
"loss": 0.1966,
"step": 2065
},
{
"epoch": 0.66,
"grad_norm": 1.5042566355669824,
"learning_rate": 2.721509148832847e-06,
"loss": 0.1871,
"step": 2066
},
{
"epoch": 0.66,
"grad_norm": 11.15477235002936,
"learning_rate": 2.7168973092808025e-06,
"loss": 0.6684,
"step": 2067
},
{
"epoch": 0.66,
"grad_norm": 1.500220526937373,
"learning_rate": 2.7122879224787315e-06,
"loss": 0.2112,
"step": 2068
},
{
"epoch": 0.66,
"grad_norm": 1.5089956192186091,
"learning_rate": 2.7076809933785254e-06,
"loss": 0.209,
"step": 2069
},
{
"epoch": 0.66,
"grad_norm": 1.3978138088369234,
"learning_rate": 2.70307652692943e-06,
"loss": 0.232,
"step": 2070
},
{
"epoch": 0.66,
"grad_norm": 5.935467932603042,
"learning_rate": 2.6984745280780524e-06,
"loss": 0.5995,
"step": 2071
},
{
"epoch": 0.66,
"grad_norm": 7.033255000629101,
"learning_rate": 2.6938750017683457e-06,
"loss": 0.5448,
"step": 2072
},
{
"epoch": 0.66,
"grad_norm": 5.77239610023653,
"learning_rate": 2.6892779529416045e-06,
"loss": 0.5068,
"step": 2073
},
{
"epoch": 0.66,
"grad_norm": 1.535161586473928,
"learning_rate": 2.6846833865364674e-06,
"loss": 0.2284,
"step": 2074
},
{
"epoch": 0.66,
"grad_norm": 1.494737736566833,
"learning_rate": 2.6800913074888984e-06,
"loss": 0.2188,
"step": 2075
},
{
"epoch": 0.66,
"grad_norm": 4.584594545726581,
"learning_rate": 2.6755017207321964e-06,
"loss": 0.5806,
"step": 2076
},
{
"epoch": 0.66,
"grad_norm": 6.140984993945612,
"learning_rate": 2.6709146311969813e-06,
"loss": 0.6306,
"step": 2077
},
{
"epoch": 0.66,
"grad_norm": 1.5974617544111862,
"learning_rate": 2.666330043811185e-06,
"loss": 0.2068,
"step": 2078
},
{
"epoch": 0.67,
"grad_norm": 1.4237024176576276,
"learning_rate": 2.66174796350006e-06,
"loss": 0.1976,
"step": 2079
},
{
"epoch": 0.67,
"grad_norm": 6.873143281215649,
"learning_rate": 2.657168395186157e-06,
"loss": 0.5466,
"step": 2080
},
{
"epoch": 0.67,
"grad_norm": 10.481236055001368,
"learning_rate": 2.6525913437893346e-06,
"loss": 0.5907,
"step": 2081
},
{
"epoch": 0.67,
"grad_norm": 7.5131083787464314,
"learning_rate": 2.648016814226742e-06,
"loss": 0.6489,
"step": 2082
},
{
"epoch": 0.67,
"grad_norm": 5.960104977478367,
"learning_rate": 2.6434448114128252e-06,
"loss": 0.5608,
"step": 2083
},
{
"epoch": 0.67,
"grad_norm": 5.589418506171804,
"learning_rate": 2.6388753402593083e-06,
"loss": 0.4707,
"step": 2084
},
{
"epoch": 0.67,
"grad_norm": 1.480444693363657,
"learning_rate": 2.6343084056752032e-06,
"loss": 0.1878,
"step": 2085
},
{
"epoch": 0.67,
"grad_norm": 1.4776866473270172,
"learning_rate": 2.6297440125667904e-06,
"loss": 0.1888,
"step": 2086
},
{
"epoch": 0.67,
"grad_norm": 1.526571669167614,
"learning_rate": 2.6251821658376265e-06,
"loss": 0.2001,
"step": 2087
},
{
"epoch": 0.67,
"grad_norm": 7.8001761727734475,
"learning_rate": 2.620622870388524e-06,
"loss": 0.5157,
"step": 2088
},
{
"epoch": 0.67,
"grad_norm": 5.482754910414564,
"learning_rate": 2.616066131117563e-06,
"loss": 0.5156,
"step": 2089
},
{
"epoch": 0.67,
"grad_norm": 8.68340690123736,
"learning_rate": 2.6115119529200748e-06,
"loss": 0.748,
"step": 2090
},
{
"epoch": 0.67,
"grad_norm": 8.952323657950684,
"learning_rate": 2.6069603406886347e-06,
"loss": 0.7035,
"step": 2091
},
{
"epoch": 0.67,
"grad_norm": 5.986362463099499,
"learning_rate": 2.60241129931307e-06,
"loss": 0.494,
"step": 2092
},
{
"epoch": 0.67,
"grad_norm": 1.439463408245643,
"learning_rate": 2.597864833680436e-06,
"loss": 0.2374,
"step": 2093
},
{
"epoch": 0.67,
"grad_norm": 5.119700884405849,
"learning_rate": 2.593320948675029e-06,
"loss": 0.531,
"step": 2094
},
{
"epoch": 0.67,
"grad_norm": 1.5447946877587033,
"learning_rate": 2.588779649178371e-06,
"loss": 0.1656,
"step": 2095
},
{
"epoch": 0.67,
"grad_norm": 10.055606302840184,
"learning_rate": 2.5842409400692026e-06,
"loss": 0.4637,
"step": 2096
},
{
"epoch": 0.67,
"grad_norm": 5.548653527200506,
"learning_rate": 2.579704826223488e-06,
"loss": 0.5466,
"step": 2097
},
{
"epoch": 0.67,
"grad_norm": 9.668830579623814,
"learning_rate": 2.575171312514395e-06,
"loss": 0.5984,
"step": 2098
},
{
"epoch": 0.67,
"grad_norm": 6.052766473122458,
"learning_rate": 2.570640403812306e-06,
"loss": 0.5132,
"step": 2099
},
{
"epoch": 0.67,
"grad_norm": 5.124825402309653,
"learning_rate": 2.5661121049848026e-06,
"loss": 0.5369,
"step": 2100
},
{
"epoch": 0.67,
"grad_norm": 4.929832561363666,
"learning_rate": 2.5615864208966573e-06,
"loss": 0.4245,
"step": 2101
},
{
"epoch": 0.67,
"grad_norm": 1.807616728517506,
"learning_rate": 2.55706335640984e-06,
"loss": 0.232,
"step": 2102
},
{
"epoch": 0.67,
"grad_norm": 1.4950067489323393,
"learning_rate": 2.552542916383507e-06,
"loss": 0.1892,
"step": 2103
},
{
"epoch": 0.67,
"grad_norm": 1.5777636616102162,
"learning_rate": 2.5480251056739874e-06,
"loss": 0.1931,
"step": 2104
},
{
"epoch": 0.67,
"grad_norm": 1.5057852293550578,
"learning_rate": 2.543509929134794e-06,
"loss": 0.2454,
"step": 2105
},
{
"epoch": 0.67,
"grad_norm": 6.032793642333784,
"learning_rate": 2.5389973916166037e-06,
"loss": 0.5323,
"step": 2106
},
{
"epoch": 0.67,
"grad_norm": 1.539488745621972,
"learning_rate": 2.534487497967262e-06,
"loss": 0.2214,
"step": 2107
},
{
"epoch": 0.67,
"grad_norm": 1.8573540084986893,
"learning_rate": 2.529980253031774e-06,
"loss": 0.2008,
"step": 2108
},
{
"epoch": 0.67,
"grad_norm": 7.887172142365945,
"learning_rate": 2.5254756616522953e-06,
"loss": 0.6405,
"step": 2109
},
{
"epoch": 0.68,
"grad_norm": 12.12537771401812,
"learning_rate": 2.5209737286681367e-06,
"loss": 0.5544,
"step": 2110
},
{
"epoch": 0.68,
"grad_norm": 6.755619318097899,
"learning_rate": 2.5164744589157488e-06,
"loss": 0.4186,
"step": 2111
},
{
"epoch": 0.68,
"grad_norm": 1.293112090324478,
"learning_rate": 2.5119778572287195e-06,
"loss": 0.155,
"step": 2112
},
{
"epoch": 0.68,
"grad_norm": 1.3807406177529438,
"learning_rate": 2.5074839284377774e-06,
"loss": 0.1717,
"step": 2113
},
{
"epoch": 0.68,
"grad_norm": 9.441092800844213,
"learning_rate": 2.5029926773707713e-06,
"loss": 0.5546,
"step": 2114
},
{
"epoch": 0.68,
"grad_norm": 1.3544900079158484,
"learning_rate": 2.49850410885268e-06,
"loss": 0.1522,
"step": 2115
},
{
"epoch": 0.68,
"grad_norm": 5.412249658146296,
"learning_rate": 2.4940182277055987e-06,
"loss": 0.4664,
"step": 2116
},
{
"epoch": 0.68,
"grad_norm": 9.5821734481972,
"learning_rate": 2.4895350387487304e-06,
"loss": 0.7129,
"step": 2117
},
{
"epoch": 0.68,
"grad_norm": 9.044995221975011,
"learning_rate": 2.485054546798395e-06,
"loss": 0.5705,
"step": 2118
},
{
"epoch": 0.68,
"grad_norm": 1.4767352774929565,
"learning_rate": 2.4805767566680057e-06,
"loss": 0.1969,
"step": 2119
},
{
"epoch": 0.68,
"grad_norm": 1.5576494940386558,
"learning_rate": 2.4761016731680792e-06,
"loss": 0.1951,
"step": 2120
},
{
"epoch": 0.68,
"grad_norm": 1.5841094849229551,
"learning_rate": 2.4716293011062248e-06,
"loss": 0.2328,
"step": 2121
},
{
"epoch": 0.68,
"grad_norm": 4.941718067928212,
"learning_rate": 2.467159645287133e-06,
"loss": 0.396,
"step": 2122
},
{
"epoch": 0.68,
"grad_norm": 1.333176738276689,
"learning_rate": 2.4626927105125834e-06,
"loss": 0.169,
"step": 2123
},
{
"epoch": 0.68,
"grad_norm": 1.4017110709066132,
"learning_rate": 2.4582285015814263e-06,
"loss": 0.1784,
"step": 2124
},
{
"epoch": 0.68,
"grad_norm": 1.4012363237470569,
"learning_rate": 2.4537670232895866e-06,
"loss": 0.1968,
"step": 2125
},
{
"epoch": 0.68,
"grad_norm": 1.668346328054327,
"learning_rate": 2.4493082804300585e-06,
"loss": 0.2318,
"step": 2126
},
{
"epoch": 0.68,
"grad_norm": 7.387875598852549,
"learning_rate": 2.4448522777928903e-06,
"loss": 0.6096,
"step": 2127
},
{
"epoch": 0.68,
"grad_norm": 7.470739210749414,
"learning_rate": 2.4403990201651915e-06,
"loss": 0.5272,
"step": 2128
},
{
"epoch": 0.68,
"grad_norm": 1.363422577842959,
"learning_rate": 2.435948512331125e-06,
"loss": 0.2037,
"step": 2129
},
{
"epoch": 0.68,
"grad_norm": 6.057297144444324,
"learning_rate": 2.4315007590718913e-06,
"loss": 0.5972,
"step": 2130
},
{
"epoch": 0.68,
"grad_norm": 1.5487673475235315,
"learning_rate": 2.427055765165741e-06,
"loss": 0.1995,
"step": 2131
},
{
"epoch": 0.68,
"grad_norm": 1.4962584550323683,
"learning_rate": 2.4226135353879516e-06,
"loss": 0.1925,
"step": 2132
},
{
"epoch": 0.68,
"grad_norm": 1.5106017838876058,
"learning_rate": 2.4181740745108377e-06,
"loss": 0.2128,
"step": 2133
},
{
"epoch": 0.68,
"grad_norm": 1.5513108161316838,
"learning_rate": 2.413737387303739e-06,
"loss": 0.2068,
"step": 2134
},
{
"epoch": 0.68,
"grad_norm": 1.5451192131024105,
"learning_rate": 2.4093034785330087e-06,
"loss": 0.1922,
"step": 2135
},
{
"epoch": 0.68,
"grad_norm": 8.49502966881639,
"learning_rate": 2.4048723529620246e-06,
"loss": 0.5727,
"step": 2136
},
{
"epoch": 0.68,
"grad_norm": 9.300165349897595,
"learning_rate": 2.4004440153511642e-06,
"loss": 0.6384,
"step": 2137
},
{
"epoch": 0.68,
"grad_norm": 8.31084953343189,
"learning_rate": 2.396018470457821e-06,
"loss": 0.608,
"step": 2138
},
{
"epoch": 0.68,
"grad_norm": 1.575142105462621,
"learning_rate": 2.3915957230363783e-06,
"loss": 0.1931,
"step": 2139
},
{
"epoch": 0.68,
"grad_norm": 1.591976965943769,
"learning_rate": 2.3871757778382216e-06,
"loss": 0.2191,
"step": 2140
},
{
"epoch": 0.69,
"grad_norm": 1.4538578312332406,
"learning_rate": 2.3827586396117207e-06,
"loss": 0.2013,
"step": 2141
},
{
"epoch": 0.69,
"grad_norm": 1.4533705881801298,
"learning_rate": 2.378344313102231e-06,
"loss": 0.1762,
"step": 2142
},
{
"epoch": 0.69,
"grad_norm": 1.8439687180760513,
"learning_rate": 2.373932803052089e-06,
"loss": 0.2151,
"step": 2143
},
{
"epoch": 0.69,
"grad_norm": 1.5904218951550402,
"learning_rate": 2.369524114200607e-06,
"loss": 0.2218,
"step": 2144
},
{
"epoch": 0.69,
"grad_norm": 10.188160335340218,
"learning_rate": 2.3651182512840604e-06,
"loss": 0.4334,
"step": 2145
},
{
"epoch": 0.69,
"grad_norm": 1.393277693000533,
"learning_rate": 2.360715219035694e-06,
"loss": 0.239,
"step": 2146
},
{
"epoch": 0.69,
"grad_norm": 1.6821612536370698,
"learning_rate": 2.356315022185712e-06,
"loss": 0.2136,
"step": 2147
},
{
"epoch": 0.69,
"grad_norm": 6.536278091744725,
"learning_rate": 2.3519176654612657e-06,
"loss": 0.6949,
"step": 2148
},
{
"epoch": 0.69,
"grad_norm": 1.5439508048161585,
"learning_rate": 2.3475231535864653e-06,
"loss": 0.2314,
"step": 2149
},
{
"epoch": 0.69,
"grad_norm": 4.809688001029302,
"learning_rate": 2.3431314912823543e-06,
"loss": 0.458,
"step": 2150
},
{
"epoch": 0.69,
"grad_norm": 1.5992254269863293,
"learning_rate": 2.338742683266923e-06,
"loss": 0.2103,
"step": 2151
},
{
"epoch": 0.69,
"grad_norm": 5.189303577879517,
"learning_rate": 2.3343567342550933e-06,
"loss": 0.6068,
"step": 2152
},
{
"epoch": 0.69,
"grad_norm": 1.6646510638154512,
"learning_rate": 2.329973648958712e-06,
"loss": 0.2296,
"step": 2153
},
{
"epoch": 0.69,
"grad_norm": 1.5316039306664393,
"learning_rate": 2.3255934320865555e-06,
"loss": 0.1989,
"step": 2154
},
{
"epoch": 0.69,
"grad_norm": 6.156795151941741,
"learning_rate": 2.3212160883443107e-06,
"loss": 0.6604,
"step": 2155
},
{
"epoch": 0.69,
"grad_norm": 1.4808369477187273,
"learning_rate": 2.316841622434586e-06,
"loss": 0.235,
"step": 2156
},
{
"epoch": 0.69,
"grad_norm": 8.765104106256636,
"learning_rate": 2.3124700390568945e-06,
"loss": 0.4996,
"step": 2157
},
{
"epoch": 0.69,
"grad_norm": 9.551495354691971,
"learning_rate": 2.30810134290765e-06,
"loss": 0.6904,
"step": 2158
},
{
"epoch": 0.69,
"grad_norm": 1.5267502119057248,
"learning_rate": 2.3037355386801683e-06,
"loss": 0.1824,
"step": 2159
},
{
"epoch": 0.69,
"grad_norm": 1.520976510571733,
"learning_rate": 2.2993726310646603e-06,
"loss": 0.2111,
"step": 2160
},
{
"epoch": 0.69,
"grad_norm": 1.6117412402555655,
"learning_rate": 2.2950126247482178e-06,
"loss": 0.2201,
"step": 2161
},
{
"epoch": 0.69,
"grad_norm": 6.984267858215095,
"learning_rate": 2.2906555244148233e-06,
"loss": 0.5403,
"step": 2162
},
{
"epoch": 0.69,
"grad_norm": 6.084749533351431,
"learning_rate": 2.2863013347453305e-06,
"loss": 0.6068,
"step": 2163
},
{
"epoch": 0.69,
"grad_norm": 1.6333208376879278,
"learning_rate": 2.2819500604174733e-06,
"loss": 0.2174,
"step": 2164
},
{
"epoch": 0.69,
"grad_norm": 8.002520614785373,
"learning_rate": 2.277601706105847e-06,
"loss": 0.5838,
"step": 2165
},
{
"epoch": 0.69,
"grad_norm": 4.938421877014078,
"learning_rate": 2.2732562764819157e-06,
"loss": 0.4133,
"step": 2166
},
{
"epoch": 0.69,
"grad_norm": 8.594016694676238,
"learning_rate": 2.2689137762139952e-06,
"loss": 0.5487,
"step": 2167
},
{
"epoch": 0.69,
"grad_norm": 8.935725275208199,
"learning_rate": 2.264574209967262e-06,
"loss": 0.7306,
"step": 2168
},
{
"epoch": 0.69,
"grad_norm": 1.6260834195103835,
"learning_rate": 2.260237582403732e-06,
"loss": 0.1896,
"step": 2169
},
{
"epoch": 0.69,
"grad_norm": 11.481929304026814,
"learning_rate": 2.2559038981822724e-06,
"loss": 0.5342,
"step": 2170
},
{
"epoch": 0.69,
"grad_norm": 5.7505730354611035,
"learning_rate": 2.2515731619585814e-06,
"loss": 0.3837,
"step": 2171
},
{
"epoch": 0.7,
"grad_norm": 5.632461495267201,
"learning_rate": 2.247245378385195e-06,
"loss": 0.578,
"step": 2172
},
{
"epoch": 0.7,
"grad_norm": 5.280938569839824,
"learning_rate": 2.242920552111473e-06,
"loss": 0.5747,
"step": 2173
},
{
"epoch": 0.7,
"grad_norm": 1.622621353949337,
"learning_rate": 2.238598687783603e-06,
"loss": 0.2271,
"step": 2174
},
{
"epoch": 0.7,
"grad_norm": 1.4094720336722553,
"learning_rate": 2.234279790044588e-06,
"loss": 0.2039,
"step": 2175
},
{
"epoch": 0.7,
"grad_norm": 1.466984919198312,
"learning_rate": 2.229963863534241e-06,
"loss": 0.2309,
"step": 2176
},
{
"epoch": 0.7,
"grad_norm": 11.239427326449347,
"learning_rate": 2.225650912889188e-06,
"loss": 0.4516,
"step": 2177
},
{
"epoch": 0.7,
"grad_norm": 1.6792138363705542,
"learning_rate": 2.221340942742858e-06,
"loss": 0.2394,
"step": 2178
},
{
"epoch": 0.7,
"grad_norm": 6.313849287440056,
"learning_rate": 2.2170339577254714e-06,
"loss": 0.5399,
"step": 2179
},
{
"epoch": 0.7,
"grad_norm": 5.135409647455387,
"learning_rate": 2.212729962464051e-06,
"loss": 0.512,
"step": 2180
},
{
"epoch": 0.7,
"grad_norm": 1.4052058341244174,
"learning_rate": 2.208428961582399e-06,
"loss": 0.2496,
"step": 2181
},
{
"epoch": 0.7,
"grad_norm": 1.4710826958438774,
"learning_rate": 2.2041309597011057e-06,
"loss": 0.2335,
"step": 2182
},
{
"epoch": 0.7,
"grad_norm": 25.617847624969013,
"learning_rate": 2.1998359614375412e-06,
"loss": 0.559,
"step": 2183
},
{
"epoch": 0.7,
"grad_norm": 1.6057405096713442,
"learning_rate": 2.1955439714058422e-06,
"loss": 0.2036,
"step": 2184
},
{
"epoch": 0.7,
"grad_norm": 1.4263831277019077,
"learning_rate": 2.191254994216922e-06,
"loss": 0.1784,
"step": 2185
},
{
"epoch": 0.7,
"grad_norm": 1.301358164273251,
"learning_rate": 2.186969034478448e-06,
"loss": 0.1634,
"step": 2186
},
{
"epoch": 0.7,
"grad_norm": 4.195040872383668,
"learning_rate": 2.182686096794852e-06,
"loss": 0.3323,
"step": 2187
},
{
"epoch": 0.7,
"grad_norm": 7.276705829606348,
"learning_rate": 2.1784061857673217e-06,
"loss": 0.5848,
"step": 2188
},
{
"epoch": 0.7,
"grad_norm": 1.609015303555973,
"learning_rate": 2.174129305993784e-06,
"loss": 0.229,
"step": 2189
},
{
"epoch": 0.7,
"grad_norm": 8.030857623344524,
"learning_rate": 2.1698554620689178e-06,
"loss": 0.4022,
"step": 2190
},
{
"epoch": 0.7,
"grad_norm": 1.636436337760029,
"learning_rate": 2.165584658584138e-06,
"loss": 0.222,
"step": 2191
},
{
"epoch": 0.7,
"grad_norm": 14.087436399279088,
"learning_rate": 2.16131690012759e-06,
"loss": 0.5201,
"step": 2192
},
{
"epoch": 0.7,
"grad_norm": 15.306812454070704,
"learning_rate": 2.157052191284154e-06,
"loss": 0.5923,
"step": 2193
},
{
"epoch": 0.7,
"grad_norm": 1.6706590067478497,
"learning_rate": 2.1527905366354292e-06,
"loss": 0.2799,
"step": 2194
},
{
"epoch": 0.7,
"grad_norm": 1.215860995659821,
"learning_rate": 2.1485319407597315e-06,
"loss": 0.1549,
"step": 2195
},
{
"epoch": 0.7,
"grad_norm": 4.921574667054535,
"learning_rate": 2.1442764082321e-06,
"loss": 0.5129,
"step": 2196
},
{
"epoch": 0.7,
"grad_norm": 5.732924842836198,
"learning_rate": 2.140023943624272e-06,
"loss": 0.6023,
"step": 2197
},
{
"epoch": 0.7,
"grad_norm": 8.102984763000496,
"learning_rate": 2.135774551504698e-06,
"loss": 0.4917,
"step": 2198
},
{
"epoch": 0.7,
"grad_norm": 1.6618262296117945,
"learning_rate": 2.1315282364385197e-06,
"loss": 0.2193,
"step": 2199
},
{
"epoch": 0.7,
"grad_norm": 1.3960460419289757,
"learning_rate": 2.1272850029875802e-06,
"loss": 0.1574,
"step": 2200
},
{
"epoch": 0.7,
"grad_norm": 11.603423005829917,
"learning_rate": 2.1230448557104087e-06,
"loss": 0.545,
"step": 2201
},
{
"epoch": 0.7,
"grad_norm": 5.042970308045986,
"learning_rate": 2.1188077991622174e-06,
"loss": 0.3641,
"step": 2202
},
{
"epoch": 0.7,
"grad_norm": 1.5474507435821743,
"learning_rate": 2.1145738378949004e-06,
"loss": 0.2371,
"step": 2203
},
{
"epoch": 0.71,
"grad_norm": 9.950803527815106,
"learning_rate": 2.110342976457029e-06,
"loss": 0.5266,
"step": 2204
},
{
"epoch": 0.71,
"grad_norm": 1.6618813500238472,
"learning_rate": 2.1061152193938355e-06,
"loss": 0.215,
"step": 2205
},
{
"epoch": 0.71,
"grad_norm": 6.830984374276211,
"learning_rate": 2.1018905712472285e-06,
"loss": 0.5277,
"step": 2206
},
{
"epoch": 0.71,
"grad_norm": 1.5052744627478791,
"learning_rate": 2.0976690365557673e-06,
"loss": 0.1999,
"step": 2207
},
{
"epoch": 0.71,
"grad_norm": 5.6409906387431565,
"learning_rate": 2.093450619854671e-06,
"loss": 0.6337,
"step": 2208
},
{
"epoch": 0.71,
"grad_norm": 1.657684945956667,
"learning_rate": 2.0892353256758107e-06,
"loss": 0.2178,
"step": 2209
},
{
"epoch": 0.71,
"grad_norm": 5.118560263864878,
"learning_rate": 2.0850231585476965e-06,
"loss": 0.4665,
"step": 2210
},
{
"epoch": 0.71,
"grad_norm": 1.8137236999164266,
"learning_rate": 2.0808141229954876e-06,
"loss": 0.2002,
"step": 2211
},
{
"epoch": 0.71,
"grad_norm": 9.817857709861432,
"learning_rate": 2.0766082235409695e-06,
"loss": 0.4937,
"step": 2212
},
{
"epoch": 0.71,
"grad_norm": 1.5934699742301135,
"learning_rate": 2.072405464702566e-06,
"loss": 0.2585,
"step": 2213
},
{
"epoch": 0.71,
"grad_norm": 6.077520047362195,
"learning_rate": 2.068205850995326e-06,
"loss": 0.5491,
"step": 2214
},
{
"epoch": 0.71,
"grad_norm": 5.861050995792146,
"learning_rate": 2.064009386930915e-06,
"loss": 0.5114,
"step": 2215
},
{
"epoch": 0.71,
"grad_norm": 1.5038002055573227,
"learning_rate": 2.0598160770176208e-06,
"loss": 0.1906,
"step": 2216
},
{
"epoch": 0.71,
"grad_norm": 1.6958036181030445,
"learning_rate": 2.0556259257603355e-06,
"loss": 0.2393,
"step": 2217
},
{
"epoch": 0.71,
"grad_norm": 1.6086477208936687,
"learning_rate": 2.0514389376605646e-06,
"loss": 0.2688,
"step": 2218
},
{
"epoch": 0.71,
"grad_norm": 1.43271043730868,
"learning_rate": 2.0472551172164152e-06,
"loss": 0.2063,
"step": 2219
},
{
"epoch": 0.71,
"grad_norm": 6.381175730682259,
"learning_rate": 2.0430744689225833e-06,
"loss": 0.57,
"step": 2220
},
{
"epoch": 0.71,
"grad_norm": 1.3943765638105143,
"learning_rate": 2.0388969972703688e-06,
"loss": 0.2283,
"step": 2221
},
{
"epoch": 0.71,
"grad_norm": 8.011989023626537,
"learning_rate": 2.0347227067476478e-06,
"loss": 0.3946,
"step": 2222
},
{
"epoch": 0.71,
"grad_norm": 1.3876916030957878,
"learning_rate": 2.030551601838887e-06,
"loss": 0.1907,
"step": 2223
},
{
"epoch": 0.71,
"grad_norm": 6.369440824496463,
"learning_rate": 2.0263836870251277e-06,
"loss": 0.6874,
"step": 2224
},
{
"epoch": 0.71,
"grad_norm": 1.392030096155692,
"learning_rate": 2.0222189667839805e-06,
"loss": 0.1721,
"step": 2225
},
{
"epoch": 0.71,
"grad_norm": 7.505820135642883,
"learning_rate": 2.01805744558963e-06,
"loss": 0.5525,
"step": 2226
},
{
"epoch": 0.71,
"grad_norm": 10.217081467195106,
"learning_rate": 2.013899127912824e-06,
"loss": 0.5245,
"step": 2227
},
{
"epoch": 0.71,
"grad_norm": 8.771085852403852,
"learning_rate": 2.009744018220863e-06,
"loss": 0.4655,
"step": 2228
},
{
"epoch": 0.71,
"grad_norm": 6.354959097655579,
"learning_rate": 2.005592120977606e-06,
"loss": 0.5604,
"step": 2229
},
{
"epoch": 0.71,
"grad_norm": 1.7218011393895385,
"learning_rate": 2.0014434406434584e-06,
"loss": 0.1957,
"step": 2230
},
{
"epoch": 0.71,
"grad_norm": 1.4055159786799856,
"learning_rate": 1.9972979816753717e-06,
"loss": 0.1682,
"step": 2231
},
{
"epoch": 0.71,
"grad_norm": 4.084421952028699,
"learning_rate": 1.9931557485268365e-06,
"loss": 0.3722,
"step": 2232
},
{
"epoch": 0.71,
"grad_norm": 5.361724567386323,
"learning_rate": 1.9890167456478748e-06,
"loss": 0.5068,
"step": 2233
},
{
"epoch": 0.71,
"grad_norm": 1.60003653371184,
"learning_rate": 1.984880977485041e-06,
"loss": 0.211,
"step": 2234
},
{
"epoch": 0.72,
"grad_norm": 7.482603126691497,
"learning_rate": 1.980748448481416e-06,
"loss": 0.4358,
"step": 2235
},
{
"epoch": 0.72,
"grad_norm": 6.141325282590449,
"learning_rate": 1.9766191630765964e-06,
"loss": 0.5306,
"step": 2236
},
{
"epoch": 0.72,
"grad_norm": 1.4516625230105962,
"learning_rate": 1.9724931257066988e-06,
"loss": 0.2317,
"step": 2237
},
{
"epoch": 0.72,
"grad_norm": 9.343327315293724,
"learning_rate": 1.9683703408043447e-06,
"loss": 0.6164,
"step": 2238
},
{
"epoch": 0.72,
"grad_norm": 5.24934798833513,
"learning_rate": 1.9642508127986676e-06,
"loss": 0.5279,
"step": 2239
},
{
"epoch": 0.72,
"grad_norm": 1.4895377988466674,
"learning_rate": 1.9601345461153005e-06,
"loss": 0.2156,
"step": 2240
},
{
"epoch": 0.72,
"grad_norm": 1.5036358630247681,
"learning_rate": 1.9560215451763685e-06,
"loss": 0.1936,
"step": 2241
},
{
"epoch": 0.72,
"grad_norm": 9.300260916579875,
"learning_rate": 1.951911814400495e-06,
"loss": 0.5325,
"step": 2242
},
{
"epoch": 0.72,
"grad_norm": 1.370160034971188,
"learning_rate": 1.9478053582027826e-06,
"loss": 0.1512,
"step": 2243
},
{
"epoch": 0.72,
"grad_norm": 1.465307815297895,
"learning_rate": 1.9437021809948232e-06,
"loss": 0.2026,
"step": 2244
},
{
"epoch": 0.72,
"grad_norm": 9.277551552673337,
"learning_rate": 1.9396022871846836e-06,
"loss": 0.6607,
"step": 2245
},
{
"epoch": 0.72,
"grad_norm": 5.387263280662652,
"learning_rate": 1.935505681176899e-06,
"loss": 0.5917,
"step": 2246
},
{
"epoch": 0.72,
"grad_norm": 1.349938726512857,
"learning_rate": 1.9314123673724805e-06,
"loss": 0.1762,
"step": 2247
},
{
"epoch": 0.72,
"grad_norm": 1.6763593693800607,
"learning_rate": 1.9273223501688943e-06,
"loss": 0.1944,
"step": 2248
},
{
"epoch": 0.72,
"grad_norm": 5.549808705544746,
"learning_rate": 1.9232356339600717e-06,
"loss": 0.6513,
"step": 2249
},
{
"epoch": 0.72,
"grad_norm": 6.377562270114407,
"learning_rate": 1.919152223136391e-06,
"loss": 0.4733,
"step": 2250
},
{
"epoch": 0.72,
"grad_norm": 7.852463979276551,
"learning_rate": 1.9150721220846884e-06,
"loss": 0.7424,
"step": 2251
},
{
"epoch": 0.72,
"grad_norm": 1.687946465511382,
"learning_rate": 1.910995335188234e-06,
"loss": 0.2251,
"step": 2252
},
{
"epoch": 0.72,
"grad_norm": 1.5891175267595996,
"learning_rate": 1.906921866826747e-06,
"loss": 0.2003,
"step": 2253
},
{
"epoch": 0.72,
"grad_norm": 6.101562294379387,
"learning_rate": 1.9028517213763737e-06,
"loss": 0.5808,
"step": 2254
},
{
"epoch": 0.72,
"grad_norm": 7.967450356183768,
"learning_rate": 1.8987849032096973e-06,
"loss": 0.4792,
"step": 2255
},
{
"epoch": 0.72,
"grad_norm": 1.42460389872086,
"learning_rate": 1.89472141669572e-06,
"loss": 0.1894,
"step": 2256
},
{
"epoch": 0.72,
"grad_norm": 7.633483715732261,
"learning_rate": 1.8906612661998698e-06,
"loss": 0.5442,
"step": 2257
},
{
"epoch": 0.72,
"grad_norm": 8.4505173264,
"learning_rate": 1.8866044560839902e-06,
"loss": 0.5843,
"step": 2258
},
{
"epoch": 0.72,
"grad_norm": 6.345391544979886,
"learning_rate": 1.8825509907063328e-06,
"loss": 0.4357,
"step": 2259
},
{
"epoch": 0.72,
"grad_norm": 1.7058142928047633,
"learning_rate": 1.8785008744215606e-06,
"loss": 0.2384,
"step": 2260
},
{
"epoch": 0.72,
"grad_norm": 1.4893028857490218,
"learning_rate": 1.874454111580733e-06,
"loss": 0.1768,
"step": 2261
},
{
"epoch": 0.72,
"grad_norm": 6.414925517521466,
"learning_rate": 1.8704107065313116e-06,
"loss": 0.6891,
"step": 2262
},
{
"epoch": 0.72,
"grad_norm": 1.6135157017856627,
"learning_rate": 1.8663706636171503e-06,
"loss": 0.2245,
"step": 2263
},
{
"epoch": 0.72,
"grad_norm": 15.830859870957802,
"learning_rate": 1.8623339871784869e-06,
"loss": 0.571,
"step": 2264
},
{
"epoch": 0.72,
"grad_norm": 9.115565072336496,
"learning_rate": 1.8583006815519473e-06,
"loss": 0.4791,
"step": 2265
},
{
"epoch": 0.73,
"grad_norm": 1.3492396073479147,
"learning_rate": 1.8542707510705355e-06,
"loss": 0.1572,
"step": 2266
},
{
"epoch": 0.73,
"grad_norm": 4.694271699344701,
"learning_rate": 1.8502442000636246e-06,
"loss": 0.3593,
"step": 2267
},
{
"epoch": 0.73,
"grad_norm": 1.5952609772858102,
"learning_rate": 1.846221032856965e-06,
"loss": 0.2319,
"step": 2268
},
{
"epoch": 0.73,
"grad_norm": 1.5754968134503997,
"learning_rate": 1.8422012537726646e-06,
"loss": 0.2189,
"step": 2269
},
{
"epoch": 0.73,
"grad_norm": 1.5868406215734856,
"learning_rate": 1.8381848671291953e-06,
"loss": 0.1898,
"step": 2270
},
{
"epoch": 0.73,
"grad_norm": 8.171116086303511,
"learning_rate": 1.8341718772413852e-06,
"loss": 0.5808,
"step": 2271
},
{
"epoch": 0.73,
"grad_norm": 1.449524747276802,
"learning_rate": 1.8301622884204096e-06,
"loss": 0.1995,
"step": 2272
},
{
"epoch": 0.73,
"grad_norm": 6.486439349672904,
"learning_rate": 1.8261561049737946e-06,
"loss": 0.5245,
"step": 2273
},
{
"epoch": 0.73,
"grad_norm": 10.82732267235109,
"learning_rate": 1.8221533312054024e-06,
"loss": 0.6953,
"step": 2274
},
{
"epoch": 0.73,
"grad_norm": 1.679977002278223,
"learning_rate": 1.818153971415439e-06,
"loss": 0.2156,
"step": 2275
},
{
"epoch": 0.73,
"grad_norm": 1.58293077012093,
"learning_rate": 1.8141580299004342e-06,
"loss": 0.2454,
"step": 2276
},
{
"epoch": 0.73,
"grad_norm": 1.4055543468615495,
"learning_rate": 1.8101655109532552e-06,
"loss": 0.2021,
"step": 2277
},
{
"epoch": 0.73,
"grad_norm": 6.1193649017324026,
"learning_rate": 1.8061764188630831e-06,
"loss": 0.5663,
"step": 2278
},
{
"epoch": 0.73,
"grad_norm": 1.3305418674019964,
"learning_rate": 1.8021907579154257e-06,
"loss": 0.176,
"step": 2279
},
{
"epoch": 0.73,
"grad_norm": 10.00885650373271,
"learning_rate": 1.7982085323920973e-06,
"loss": 0.5901,
"step": 2280
},
{
"epoch": 0.73,
"grad_norm": 1.8071652982180177,
"learning_rate": 1.7942297465712282e-06,
"loss": 0.259,
"step": 2281
},
{
"epoch": 0.73,
"grad_norm": 1.675935122364972,
"learning_rate": 1.7902544047272468e-06,
"loss": 0.2302,
"step": 2282
},
{
"epoch": 0.73,
"grad_norm": 6.776791922815715,
"learning_rate": 1.7862825111308873e-06,
"loss": 0.664,
"step": 2283
},
{
"epoch": 0.73,
"grad_norm": 6.865538111907284,
"learning_rate": 1.7823140700491786e-06,
"loss": 0.5949,
"step": 2284
},
{
"epoch": 0.73,
"grad_norm": 5.802272910455937,
"learning_rate": 1.7783490857454354e-06,
"loss": 0.3682,
"step": 2285
},
{
"epoch": 0.73,
"grad_norm": 6.560312135377362,
"learning_rate": 1.7743875624792662e-06,
"loss": 0.5582,
"step": 2286
},
{
"epoch": 0.73,
"grad_norm": 1.5913109833044743,
"learning_rate": 1.770429504506554e-06,
"loss": 0.2239,
"step": 2287
},
{
"epoch": 0.73,
"grad_norm": 7.079366669373585,
"learning_rate": 1.7664749160794642e-06,
"loss": 0.6495,
"step": 2288
},
{
"epoch": 0.73,
"grad_norm": 8.09757716625291,
"learning_rate": 1.7625238014464358e-06,
"loss": 0.4482,
"step": 2289
},
{
"epoch": 0.73,
"grad_norm": 10.57674043946392,
"learning_rate": 1.7585761648521688e-06,
"loss": 0.5998,
"step": 2290
},
{
"epoch": 0.73,
"grad_norm": 12.421223041564375,
"learning_rate": 1.7546320105376346e-06,
"loss": 0.6031,
"step": 2291
},
{
"epoch": 0.73,
"grad_norm": 5.3650225502835776,
"learning_rate": 1.750691342740058e-06,
"loss": 0.7267,
"step": 2292
},
{
"epoch": 0.73,
"grad_norm": 8.879833657416945,
"learning_rate": 1.746754165692921e-06,
"loss": 0.5464,
"step": 2293
},
{
"epoch": 0.73,
"grad_norm": 1.5718775127322624,
"learning_rate": 1.742820483625957e-06,
"loss": 0.2074,
"step": 2294
},
{
"epoch": 0.73,
"grad_norm": 8.820128082658634,
"learning_rate": 1.7388903007651398e-06,
"loss": 0.4889,
"step": 2295
},
{
"epoch": 0.73,
"grad_norm": 6.283571796495923,
"learning_rate": 1.7349636213326876e-06,
"loss": 0.5186,
"step": 2296
},
{
"epoch": 0.74,
"grad_norm": 7.058987036721699,
"learning_rate": 1.7310404495470557e-06,
"loss": 0.4451,
"step": 2297
},
{
"epoch": 0.74,
"grad_norm": 1.563485148949926,
"learning_rate": 1.727120789622927e-06,
"loss": 0.179,
"step": 2298
},
{
"epoch": 0.74,
"grad_norm": 6.014529966282654,
"learning_rate": 1.7232046457712164e-06,
"loss": 0.4753,
"step": 2299
},
{
"epoch": 0.74,
"grad_norm": 8.167024305801668,
"learning_rate": 1.7192920221990566e-06,
"loss": 0.5438,
"step": 2300
},
{
"epoch": 0.74,
"grad_norm": 1.4582656606915507,
"learning_rate": 1.7153829231098018e-06,
"loss": 0.1758,
"step": 2301
},
{
"epoch": 0.74,
"grad_norm": 5.973447495717049,
"learning_rate": 1.7114773527030215e-06,
"loss": 0.461,
"step": 2302
},
{
"epoch": 0.74,
"grad_norm": 11.599549739132604,
"learning_rate": 1.7075753151744885e-06,
"loss": 0.5915,
"step": 2303
},
{
"epoch": 0.74,
"grad_norm": 6.068815615507931,
"learning_rate": 1.7036768147161853e-06,
"loss": 0.6128,
"step": 2304
},
{
"epoch": 0.74,
"grad_norm": 1.8422198212418037,
"learning_rate": 1.6997818555162915e-06,
"loss": 0.1973,
"step": 2305
},
{
"epoch": 0.74,
"grad_norm": 1.715902514534363,
"learning_rate": 1.6958904417591853e-06,
"loss": 0.2236,
"step": 2306
},
{
"epoch": 0.74,
"grad_norm": 1.60276075786763,
"learning_rate": 1.6920025776254334e-06,
"loss": 0.1898,
"step": 2307
},
{
"epoch": 0.74,
"grad_norm": 7.014840318131868,
"learning_rate": 1.6881182672917879e-06,
"loss": 0.478,
"step": 2308
},
{
"epoch": 0.74,
"grad_norm": 1.5300462322055253,
"learning_rate": 1.6842375149311868e-06,
"loss": 0.1978,
"step": 2309
},
{
"epoch": 0.74,
"grad_norm": 1.5219358965207703,
"learning_rate": 1.680360324712746e-06,
"loss": 0.1873,
"step": 2310
},
{
"epoch": 0.74,
"grad_norm": 1.55773836258357,
"learning_rate": 1.6764867008017493e-06,
"loss": 0.1992,
"step": 2311
},
{
"epoch": 0.74,
"grad_norm": 1.6277802076092838,
"learning_rate": 1.672616647359655e-06,
"loss": 0.1953,
"step": 2312
},
{
"epoch": 0.74,
"grad_norm": 1.5023181181423733,
"learning_rate": 1.668750168544081e-06,
"loss": 0.2323,
"step": 2313
},
{
"epoch": 0.74,
"grad_norm": 1.4795023942115406,
"learning_rate": 1.664887268508808e-06,
"loss": 0.178,
"step": 2314
},
{
"epoch": 0.74,
"grad_norm": 6.677022427612835,
"learning_rate": 1.6610279514037725e-06,
"loss": 0.4622,
"step": 2315
},
{
"epoch": 0.74,
"grad_norm": 1.406265038685406,
"learning_rate": 1.657172221375058e-06,
"loss": 0.1629,
"step": 2316
},
{
"epoch": 0.74,
"grad_norm": 1.6849794458922327,
"learning_rate": 1.6533200825648993e-06,
"loss": 0.1943,
"step": 2317
},
{
"epoch": 0.74,
"grad_norm": 7.780100197924541,
"learning_rate": 1.6494715391116671e-06,
"loss": 0.6186,
"step": 2318
},
{
"epoch": 0.74,
"grad_norm": 5.329751180976738,
"learning_rate": 1.6456265951498763e-06,
"loss": 0.6589,
"step": 2319
},
{
"epoch": 0.74,
"grad_norm": 1.537633057394514,
"learning_rate": 1.641785254810172e-06,
"loss": 0.1985,
"step": 2320
},
{
"epoch": 0.74,
"grad_norm": 1.4374966867681902,
"learning_rate": 1.6379475222193248e-06,
"loss": 0.2356,
"step": 2321
},
{
"epoch": 0.74,
"grad_norm": 1.479679886386861,
"learning_rate": 1.6341134015002352e-06,
"loss": 0.2364,
"step": 2322
},
{
"epoch": 0.74,
"grad_norm": 5.8586855752619025,
"learning_rate": 1.6302828967719175e-06,
"loss": 0.5224,
"step": 2323
},
{
"epoch": 0.74,
"grad_norm": 1.2614347887209656,
"learning_rate": 1.626456012149506e-06,
"loss": 0.1834,
"step": 2324
},
{
"epoch": 0.74,
"grad_norm": 7.745335126732027,
"learning_rate": 1.6226327517442453e-06,
"loss": 0.6006,
"step": 2325
},
{
"epoch": 0.74,
"grad_norm": 25.738539555946595,
"learning_rate": 1.6188131196634827e-06,
"loss": 0.5638,
"step": 2326
},
{
"epoch": 0.74,
"grad_norm": 1.515876565578083,
"learning_rate": 1.6149971200106723e-06,
"loss": 0.2098,
"step": 2327
},
{
"epoch": 0.74,
"grad_norm": 4.679477770645862,
"learning_rate": 1.6111847568853645e-06,
"loss": 0.4238,
"step": 2328
},
{
"epoch": 0.75,
"grad_norm": 1.5000856150130015,
"learning_rate": 1.6073760343831996e-06,
"loss": 0.1832,
"step": 2329
},
{
"epoch": 0.75,
"grad_norm": 1.6461882429471486,
"learning_rate": 1.603570956595913e-06,
"loss": 0.2228,
"step": 2330
},
{
"epoch": 0.75,
"grad_norm": 1.5807675669285757,
"learning_rate": 1.5997695276113168e-06,
"loss": 0.2178,
"step": 2331
},
{
"epoch": 0.75,
"grad_norm": 8.247096928630851,
"learning_rate": 1.595971751513311e-06,
"loss": 0.4801,
"step": 2332
},
{
"epoch": 0.75,
"grad_norm": 1.484294044808653,
"learning_rate": 1.5921776323818655e-06,
"loss": 0.1864,
"step": 2333
},
{
"epoch": 0.75,
"grad_norm": 4.369584217105914,
"learning_rate": 1.5883871742930257e-06,
"loss": 0.5279,
"step": 2334
},
{
"epoch": 0.75,
"grad_norm": 7.771794175699381,
"learning_rate": 1.5846003813188993e-06,
"loss": 0.4916,
"step": 2335
},
{
"epoch": 0.75,
"grad_norm": 6.767542280678469,
"learning_rate": 1.5808172575276615e-06,
"loss": 0.517,
"step": 2336
},
{
"epoch": 0.75,
"grad_norm": 16.037289163882527,
"learning_rate": 1.5770378069835412e-06,
"loss": 0.598,
"step": 2337
},
{
"epoch": 0.75,
"grad_norm": 1.6150945428755419,
"learning_rate": 1.5732620337468258e-06,
"loss": 0.2426,
"step": 2338
},
{
"epoch": 0.75,
"grad_norm": 4.8715631816307265,
"learning_rate": 1.5694899418738462e-06,
"loss": 0.6858,
"step": 2339
},
{
"epoch": 0.75,
"grad_norm": 1.6023643760317052,
"learning_rate": 1.5657215354169841e-06,
"loss": 0.185,
"step": 2340
},
{
"epoch": 0.75,
"grad_norm": 1.4888072128943393,
"learning_rate": 1.561956818424661e-06,
"loss": 0.1841,
"step": 2341
},
{
"epoch": 0.75,
"grad_norm": 1.6005429190663583,
"learning_rate": 1.5581957949413295e-06,
"loss": 0.2522,
"step": 2342
},
{
"epoch": 0.75,
"grad_norm": 1.4101795444918526,
"learning_rate": 1.554438469007482e-06,
"loss": 0.1682,
"step": 2343
},
{
"epoch": 0.75,
"grad_norm": 1.5328717521022854,
"learning_rate": 1.5506848446596317e-06,
"loss": 0.2017,
"step": 2344
},
{
"epoch": 0.75,
"grad_norm": 11.364712037092591,
"learning_rate": 1.546934925930319e-06,
"loss": 0.5613,
"step": 2345
},
{
"epoch": 0.75,
"grad_norm": 1.3647957100737897,
"learning_rate": 1.5431887168481051e-06,
"loss": 0.1412,
"step": 2346
},
{
"epoch": 0.75,
"grad_norm": 1.4670332075404113,
"learning_rate": 1.5394462214375593e-06,
"loss": 0.1962,
"step": 2347
},
{
"epoch": 0.75,
"grad_norm": 6.586589393567223,
"learning_rate": 1.5357074437192688e-06,
"loss": 0.4516,
"step": 2348
},
{
"epoch": 0.75,
"grad_norm": 10.56356967244964,
"learning_rate": 1.5319723877098202e-06,
"loss": 0.5722,
"step": 2349
},
{
"epoch": 0.75,
"grad_norm": 27.533933965993338,
"learning_rate": 1.5282410574218072e-06,
"loss": 0.6225,
"step": 2350
},
{
"epoch": 0.75,
"grad_norm": 7.3389769770249,
"learning_rate": 1.5245134568638197e-06,
"loss": 0.6991,
"step": 2351
},
{
"epoch": 0.75,
"grad_norm": 7.6747107205316025,
"learning_rate": 1.5207895900404363e-06,
"loss": 0.655,
"step": 2352
},
{
"epoch": 0.75,
"grad_norm": 1.7133153620797101,
"learning_rate": 1.5170694609522306e-06,
"loss": 0.2325,
"step": 2353
},
{
"epoch": 0.75,
"grad_norm": 5.833319223599012,
"learning_rate": 1.5133530735957586e-06,
"loss": 0.5796,
"step": 2354
},
{
"epoch": 0.75,
"grad_norm": 1.4469803480828578,
"learning_rate": 1.5096404319635533e-06,
"loss": 0.1964,
"step": 2355
},
{
"epoch": 0.75,
"grad_norm": 8.851584956787143,
"learning_rate": 1.50593154004413e-06,
"loss": 0.6571,
"step": 2356
},
{
"epoch": 0.75,
"grad_norm": 1.6610199134694212,
"learning_rate": 1.502226401821968e-06,
"loss": 0.2382,
"step": 2357
},
{
"epoch": 0.75,
"grad_norm": 4.581817614080536,
"learning_rate": 1.498525021277521e-06,
"loss": 0.482,
"step": 2358
},
{
"epoch": 0.75,
"grad_norm": 7.45310859680198,
"learning_rate": 1.4948274023872005e-06,
"loss": 0.5531,
"step": 2359
},
{
"epoch": 0.76,
"grad_norm": 1.750951578789252,
"learning_rate": 1.4911335491233818e-06,
"loss": 0.2255,
"step": 2360
},
{
"epoch": 0.76,
"grad_norm": 1.440148079355167,
"learning_rate": 1.487443465454389e-06,
"loss": 0.1655,
"step": 2361
},
{
"epoch": 0.76,
"grad_norm": 1.6263530608794818,
"learning_rate": 1.483757155344503e-06,
"loss": 0.2022,
"step": 2362
},
{
"epoch": 0.76,
"grad_norm": 6.509908589441633,
"learning_rate": 1.4800746227539437e-06,
"loss": 0.5018,
"step": 2363
},
{
"epoch": 0.76,
"grad_norm": 1.478431881245627,
"learning_rate": 1.4763958716388798e-06,
"loss": 0.2005,
"step": 2364
},
{
"epoch": 0.76,
"grad_norm": 11.073599191026318,
"learning_rate": 1.4727209059514114e-06,
"loss": 0.6426,
"step": 2365
},
{
"epoch": 0.76,
"grad_norm": 21.181615696021776,
"learning_rate": 1.4690497296395773e-06,
"loss": 0.5923,
"step": 2366
},
{
"epoch": 0.76,
"grad_norm": 17.85850577782123,
"learning_rate": 1.46538234664734e-06,
"loss": 0.499,
"step": 2367
},
{
"epoch": 0.76,
"grad_norm": 7.225088359123627,
"learning_rate": 1.4617187609145906e-06,
"loss": 0.4634,
"step": 2368
},
{
"epoch": 0.76,
"grad_norm": 5.4330131602081595,
"learning_rate": 1.4580589763771413e-06,
"loss": 0.49,
"step": 2369
},
{
"epoch": 0.76,
"grad_norm": 6.320076970953097,
"learning_rate": 1.4544029969667167e-06,
"loss": 0.6917,
"step": 2370
},
{
"epoch": 0.76,
"grad_norm": 1.5329530470500947,
"learning_rate": 1.4507508266109565e-06,
"loss": 0.184,
"step": 2371
},
{
"epoch": 0.76,
"grad_norm": 1.501294050131622,
"learning_rate": 1.4471024692334101e-06,
"loss": 0.2084,
"step": 2372
},
{
"epoch": 0.76,
"grad_norm": 4.755780190366352,
"learning_rate": 1.4434579287535244e-06,
"loss": 0.5128,
"step": 2373
},
{
"epoch": 0.76,
"grad_norm": 6.602086049509877,
"learning_rate": 1.439817209086653e-06,
"loss": 0.5971,
"step": 2374
},
{
"epoch": 0.76,
"grad_norm": 1.6857665566710058,
"learning_rate": 1.4361803141440384e-06,
"loss": 0.2171,
"step": 2375
},
{
"epoch": 0.76,
"grad_norm": 6.353716756607063,
"learning_rate": 1.432547247832819e-06,
"loss": 0.5053,
"step": 2376
},
{
"epoch": 0.76,
"grad_norm": 8.005481917673409,
"learning_rate": 1.4289180140560189e-06,
"loss": 0.6893,
"step": 2377
},
{
"epoch": 0.76,
"grad_norm": 7.207337731211934,
"learning_rate": 1.4252926167125413e-06,
"loss": 0.618,
"step": 2378
},
{
"epoch": 0.76,
"grad_norm": 1.4856239702812042,
"learning_rate": 1.421671059697175e-06,
"loss": 0.2069,
"step": 2379
},
{
"epoch": 0.76,
"grad_norm": 7.293121056948293,
"learning_rate": 1.418053346900574e-06,
"loss": 0.5381,
"step": 2380
},
{
"epoch": 0.76,
"grad_norm": 10.743860752985295,
"learning_rate": 1.4144394822092712e-06,
"loss": 0.5178,
"step": 2381
},
{
"epoch": 0.76,
"grad_norm": 7.942226089157316,
"learning_rate": 1.4108294695056606e-06,
"loss": 0.5673,
"step": 2382
},
{
"epoch": 0.76,
"grad_norm": 6.01833332655479,
"learning_rate": 1.4072233126679985e-06,
"loss": 0.525,
"step": 2383
},
{
"epoch": 0.76,
"grad_norm": 5.736861752132404,
"learning_rate": 1.4036210155703989e-06,
"loss": 0.3515,
"step": 2384
},
{
"epoch": 0.76,
"grad_norm": 1.386143205504466,
"learning_rate": 1.4000225820828317e-06,
"loss": 0.1874,
"step": 2385
},
{
"epoch": 0.76,
"grad_norm": 14.73927995898969,
"learning_rate": 1.3964280160711119e-06,
"loss": 0.5035,
"step": 2386
},
{
"epoch": 0.76,
"grad_norm": 1.421278867736656,
"learning_rate": 1.3928373213969038e-06,
"loss": 0.2144,
"step": 2387
},
{
"epoch": 0.76,
"grad_norm": 6.8491800401674325,
"learning_rate": 1.38925050191771e-06,
"loss": 0.493,
"step": 2388
},
{
"epoch": 0.76,
"grad_norm": 1.4477905976408243,
"learning_rate": 1.3856675614868687e-06,
"loss": 0.1524,
"step": 2389
},
{
"epoch": 0.76,
"grad_norm": 1.6095206288289274,
"learning_rate": 1.3820885039535564e-06,
"loss": 0.2371,
"step": 2390
},
{
"epoch": 0.77,
"grad_norm": 6.888230687986665,
"learning_rate": 1.378513333162771e-06,
"loss": 0.5841,
"step": 2391
},
{
"epoch": 0.77,
"grad_norm": 9.045799662925818,
"learning_rate": 1.3749420529553414e-06,
"loss": 0.5309,
"step": 2392
},
{
"epoch": 0.77,
"grad_norm": 6.077391830064108,
"learning_rate": 1.3713746671679112e-06,
"loss": 0.6062,
"step": 2393
},
{
"epoch": 0.77,
"grad_norm": 1.8349420626947697,
"learning_rate": 1.3678111796329446e-06,
"loss": 0.2625,
"step": 2394
},
{
"epoch": 0.77,
"grad_norm": 9.73021569182454,
"learning_rate": 1.3642515941787171e-06,
"loss": 0.4867,
"step": 2395
},
{
"epoch": 0.77,
"grad_norm": 13.698370345073796,
"learning_rate": 1.3606959146293086e-06,
"loss": 0.4307,
"step": 2396
},
{
"epoch": 0.77,
"grad_norm": 1.6944837465065132,
"learning_rate": 1.3571441448046086e-06,
"loss": 0.2304,
"step": 2397
},
{
"epoch": 0.77,
"grad_norm": 1.3612756215247916,
"learning_rate": 1.3535962885202997e-06,
"loss": 0.1837,
"step": 2398
},
{
"epoch": 0.77,
"grad_norm": 1.7508574085618989,
"learning_rate": 1.350052349587866e-06,
"loss": 0.2136,
"step": 2399
},
{
"epoch": 0.77,
"grad_norm": 5.945963849598769,
"learning_rate": 1.3465123318145817e-06,
"loss": 0.612,
"step": 2400
},
{
"epoch": 0.77,
"grad_norm": 1.3563017391440364,
"learning_rate": 1.342976239003505e-06,
"loss": 0.1595,
"step": 2401
},
{
"epoch": 0.77,
"grad_norm": 1.4513470956592684,
"learning_rate": 1.339444074953482e-06,
"loss": 0.1939,
"step": 2402
},
{
"epoch": 0.77,
"grad_norm": 1.5405561506611012,
"learning_rate": 1.335915843459137e-06,
"loss": 0.1618,
"step": 2403
},
{
"epoch": 0.77,
"grad_norm": 1.55313805260523,
"learning_rate": 1.3323915483108662e-06,
"loss": 0.1962,
"step": 2404
},
{
"epoch": 0.77,
"grad_norm": 1.402394552080233,
"learning_rate": 1.3288711932948427e-06,
"loss": 0.1921,
"step": 2405
},
{
"epoch": 0.77,
"grad_norm": 1.4552915386785408,
"learning_rate": 1.3253547821930002e-06,
"loss": 0.1888,
"step": 2406
},
{
"epoch": 0.77,
"grad_norm": 10.677169419957384,
"learning_rate": 1.3218423187830409e-06,
"loss": 0.6102,
"step": 2407
},
{
"epoch": 0.77,
"grad_norm": 1.3661472541973056,
"learning_rate": 1.3183338068384243e-06,
"loss": 0.2131,
"step": 2408
},
{
"epoch": 0.77,
"grad_norm": 8.069416045414519,
"learning_rate": 1.3148292501283627e-06,
"loss": 0.5967,
"step": 2409
},
{
"epoch": 0.77,
"grad_norm": 1.4718935332732668,
"learning_rate": 1.3113286524178232e-06,
"loss": 0.2282,
"step": 2410
},
{
"epoch": 0.77,
"grad_norm": 1.7259519323112857,
"learning_rate": 1.3078320174675141e-06,
"loss": 0.2041,
"step": 2411
},
{
"epoch": 0.77,
"grad_norm": 1.684452130814182,
"learning_rate": 1.3043393490338918e-06,
"loss": 0.2216,
"step": 2412
},
{
"epoch": 0.77,
"grad_norm": 1.6210119180826756,
"learning_rate": 1.3008506508691516e-06,
"loss": 0.186,
"step": 2413
},
{
"epoch": 0.77,
"grad_norm": 1.6543582716795695,
"learning_rate": 1.2973659267212173e-06,
"loss": 0.1759,
"step": 2414
},
{
"epoch": 0.77,
"grad_norm": 1.7221113167648243,
"learning_rate": 1.2938851803337516e-06,
"loss": 0.2711,
"step": 2415
},
{
"epoch": 0.77,
"grad_norm": 1.3452649138346509,
"learning_rate": 1.290408415446136e-06,
"loss": 0.1468,
"step": 2416
},
{
"epoch": 0.77,
"grad_norm": 11.631055232311622,
"learning_rate": 1.2869356357934815e-06,
"loss": 0.5374,
"step": 2417
},
{
"epoch": 0.77,
"grad_norm": 4.822561779391185,
"learning_rate": 1.2834668451066118e-06,
"loss": 0.397,
"step": 2418
},
{
"epoch": 0.77,
"grad_norm": 5.747961116244306,
"learning_rate": 1.2800020471120717e-06,
"loss": 0.5021,
"step": 2419
},
{
"epoch": 0.77,
"grad_norm": 5.616191178312223,
"learning_rate": 1.276541245532109e-06,
"loss": 0.5156,
"step": 2420
},
{
"epoch": 0.77,
"grad_norm": 1.6064079114412415,
"learning_rate": 1.2730844440846862e-06,
"loss": 0.2126,
"step": 2421
},
{
"epoch": 0.78,
"grad_norm": 8.419000170413655,
"learning_rate": 1.2696316464834607e-06,
"loss": 0.5254,
"step": 2422
},
{
"epoch": 0.78,
"grad_norm": 1.4723558832246442,
"learning_rate": 1.2661828564377948e-06,
"loss": 0.2147,
"step": 2423
},
{
"epoch": 0.78,
"grad_norm": 7.659881335092557,
"learning_rate": 1.2627380776527415e-06,
"loss": 0.6609,
"step": 2424
},
{
"epoch": 0.78,
"grad_norm": 1.5219241063364994,
"learning_rate": 1.259297313829046e-06,
"loss": 0.2274,
"step": 2425
},
{
"epoch": 0.78,
"grad_norm": 6.3997301053784135,
"learning_rate": 1.255860568663142e-06,
"loss": 0.4295,
"step": 2426
},
{
"epoch": 0.78,
"grad_norm": 1.3787264057905275,
"learning_rate": 1.2524278458471411e-06,
"loss": 0.2019,
"step": 2427
},
{
"epoch": 0.78,
"grad_norm": 1.6013440692023229,
"learning_rate": 1.248999149068838e-06,
"loss": 0.1796,
"step": 2428
},
{
"epoch": 0.78,
"grad_norm": 1.728939143806417,
"learning_rate": 1.2455744820117028e-06,
"loss": 0.1958,
"step": 2429
},
{
"epoch": 0.78,
"grad_norm": 1.464485738013023,
"learning_rate": 1.2421538483548706e-06,
"loss": 0.1655,
"step": 2430
},
{
"epoch": 0.78,
"grad_norm": 7.034298389357844,
"learning_rate": 1.2387372517731505e-06,
"loss": 0.4589,
"step": 2431
},
{
"epoch": 0.78,
"grad_norm": 12.37757862861587,
"learning_rate": 1.2353246959370086e-06,
"loss": 0.5021,
"step": 2432
},
{
"epoch": 0.78,
"grad_norm": 6.9792442549148435,
"learning_rate": 1.2319161845125744e-06,
"loss": 0.4972,
"step": 2433
},
{
"epoch": 0.78,
"grad_norm": 1.6712922440507088,
"learning_rate": 1.228511721161631e-06,
"loss": 0.1924,
"step": 2434
},
{
"epoch": 0.78,
"grad_norm": 1.5713949409100816,
"learning_rate": 1.2251113095416113e-06,
"loss": 0.2149,
"step": 2435
},
{
"epoch": 0.78,
"grad_norm": 1.46398599487867,
"learning_rate": 1.2217149533055976e-06,
"loss": 0.1773,
"step": 2436
},
{
"epoch": 0.78,
"grad_norm": 5.132408324494143,
"learning_rate": 1.2183226561023132e-06,
"loss": 0.5753,
"step": 2437
},
{
"epoch": 0.78,
"grad_norm": 5.857664332671467,
"learning_rate": 1.2149344215761216e-06,
"loss": 0.5602,
"step": 2438
},
{
"epoch": 0.78,
"grad_norm": 1.5876521548650295,
"learning_rate": 1.2115502533670253e-06,
"loss": 0.2446,
"step": 2439
},
{
"epoch": 0.78,
"grad_norm": 1.4493141431593062,
"learning_rate": 1.2081701551106506e-06,
"loss": 0.1996,
"step": 2440
},
{
"epoch": 0.78,
"grad_norm": 1.776534678525459,
"learning_rate": 1.20479413043826e-06,
"loss": 0.2469,
"step": 2441
},
{
"epoch": 0.78,
"grad_norm": 1.5689220844918574,
"learning_rate": 1.201422182976732e-06,
"loss": 0.1944,
"step": 2442
},
{
"epoch": 0.78,
"grad_norm": 14.335545365917188,
"learning_rate": 1.1980543163485726e-06,
"loss": 0.5762,
"step": 2443
},
{
"epoch": 0.78,
"grad_norm": 1.6300431013975771,
"learning_rate": 1.1946905341718951e-06,
"loss": 0.2157,
"step": 2444
},
{
"epoch": 0.78,
"grad_norm": 5.633850187021617,
"learning_rate": 1.1913308400604339e-06,
"loss": 0.6298,
"step": 2445
},
{
"epoch": 0.78,
"grad_norm": 1.3547056318444164,
"learning_rate": 1.1879752376235231e-06,
"loss": 0.1967,
"step": 2446
},
{
"epoch": 0.78,
"grad_norm": 9.750923071373556,
"learning_rate": 1.1846237304661095e-06,
"loss": 0.545,
"step": 2447
},
{
"epoch": 0.78,
"grad_norm": 6.662999758882776,
"learning_rate": 1.181276322188732e-06,
"loss": 0.6131,
"step": 2448
},
{
"epoch": 0.78,
"grad_norm": 1.6677086643354662,
"learning_rate": 1.1779330163875325e-06,
"loss": 0.2169,
"step": 2449
},
{
"epoch": 0.78,
"grad_norm": 1.6074696369988541,
"learning_rate": 1.1745938166542414e-06,
"loss": 0.1886,
"step": 2450
},
{
"epoch": 0.78,
"grad_norm": 4.811853847327237,
"learning_rate": 1.1712587265761799e-06,
"loss": 0.5485,
"step": 2451
},
{
"epoch": 0.78,
"grad_norm": 9.59561361609413,
"learning_rate": 1.1679277497362563e-06,
"loss": 0.5142,
"step": 2452
},
{
"epoch": 0.78,
"grad_norm": 23.86988713960673,
"learning_rate": 1.1646008897129546e-06,
"loss": 0.4215,
"step": 2453
},
{
"epoch": 0.79,
"grad_norm": 5.807472920956867,
"learning_rate": 1.161278150080341e-06,
"loss": 0.4971,
"step": 2454
},
{
"epoch": 0.79,
"grad_norm": 1.5991223030970334,
"learning_rate": 1.157959534408052e-06,
"loss": 0.2095,
"step": 2455
},
{
"epoch": 0.79,
"grad_norm": 1.5688822910998863,
"learning_rate": 1.1546450462612951e-06,
"loss": 0.1677,
"step": 2456
},
{
"epoch": 0.79,
"grad_norm": 1.6127490615241922,
"learning_rate": 1.151334689200845e-06,
"loss": 0.1992,
"step": 2457
},
{
"epoch": 0.79,
"grad_norm": 4.250124362846092,
"learning_rate": 1.1480284667830343e-06,
"loss": 0.6511,
"step": 2458
},
{
"epoch": 0.79,
"grad_norm": 6.532244488593466,
"learning_rate": 1.1447263825597577e-06,
"loss": 0.4948,
"step": 2459
},
{
"epoch": 0.79,
"grad_norm": 1.7963430910867482,
"learning_rate": 1.1414284400784643e-06,
"loss": 0.2555,
"step": 2460
},
{
"epoch": 0.79,
"grad_norm": 6.245490741574394,
"learning_rate": 1.1381346428821482e-06,
"loss": 0.5099,
"step": 2461
},
{
"epoch": 0.79,
"grad_norm": 9.041639519313504,
"learning_rate": 1.134844994509358e-06,
"loss": 0.5677,
"step": 2462
},
{
"epoch": 0.79,
"grad_norm": 1.481988512128748,
"learning_rate": 1.1315594984941786e-06,
"loss": 0.2139,
"step": 2463
},
{
"epoch": 0.79,
"grad_norm": 1.380496281928763,
"learning_rate": 1.1282781583662372e-06,
"loss": 0.1537,
"step": 2464
},
{
"epoch": 0.79,
"grad_norm": 5.073454771621429,
"learning_rate": 1.1250009776506982e-06,
"loss": 0.5818,
"step": 2465
},
{
"epoch": 0.79,
"grad_norm": 1.3944048376058802,
"learning_rate": 1.1217279598682518e-06,
"loss": 0.1951,
"step": 2466
},
{
"epoch": 0.79,
"grad_norm": 6.083484458011643,
"learning_rate": 1.118459108535122e-06,
"loss": 0.5768,
"step": 2467
},
{
"epoch": 0.79,
"grad_norm": 1.621953302541929,
"learning_rate": 1.1151944271630517e-06,
"loss": 0.2339,
"step": 2468
},
{
"epoch": 0.79,
"grad_norm": 1.3666553916208857,
"learning_rate": 1.1119339192593077e-06,
"loss": 0.2036,
"step": 2469
},
{
"epoch": 0.79,
"grad_norm": 1.5413769186843314,
"learning_rate": 1.1086775883266725e-06,
"loss": 0.2168,
"step": 2470
},
{
"epoch": 0.79,
"grad_norm": 1.518061414783004,
"learning_rate": 1.1054254378634399e-06,
"loss": 0.1752,
"step": 2471
},
{
"epoch": 0.79,
"grad_norm": 1.464210153980354,
"learning_rate": 1.102177471363412e-06,
"loss": 0.1809,
"step": 2472
},
{
"epoch": 0.79,
"grad_norm": 1.5872269482408712,
"learning_rate": 1.0989336923158999e-06,
"loss": 0.1802,
"step": 2473
},
{
"epoch": 0.79,
"grad_norm": 1.6405936373674377,
"learning_rate": 1.0956941042057106e-06,
"loss": 0.2145,
"step": 2474
},
{
"epoch": 0.79,
"grad_norm": 36.82423488510403,
"learning_rate": 1.0924587105131546e-06,
"loss": 0.4966,
"step": 2475
},
{
"epoch": 0.79,
"grad_norm": 1.6724867978947762,
"learning_rate": 1.0892275147140307e-06,
"loss": 0.2153,
"step": 2476
},
{
"epoch": 0.79,
"grad_norm": 1.6272667557449185,
"learning_rate": 1.086000520279632e-06,
"loss": 0.2137,
"step": 2477
},
{
"epoch": 0.79,
"grad_norm": 1.6051885665171814,
"learning_rate": 1.0827777306767384e-06,
"loss": 0.1802,
"step": 2478
},
{
"epoch": 0.79,
"grad_norm": 1.5636775827738592,
"learning_rate": 1.0795591493676072e-06,
"loss": 0.1948,
"step": 2479
},
{
"epoch": 0.79,
"grad_norm": 1.4944126627336913,
"learning_rate": 1.0763447798099813e-06,
"loss": 0.2312,
"step": 2480
},
{
"epoch": 0.79,
"grad_norm": 1.5618556069148841,
"learning_rate": 1.0731346254570735e-06,
"loss": 0.1841,
"step": 2481
},
{
"epoch": 0.79,
"grad_norm": 1.301146600753639,
"learning_rate": 1.0699286897575718e-06,
"loss": 0.1857,
"step": 2482
},
{
"epoch": 0.79,
"grad_norm": 6.672526601056036,
"learning_rate": 1.066726976155632e-06,
"loss": 0.2723,
"step": 2483
},
{
"epoch": 0.79,
"grad_norm": 8.854185007252035,
"learning_rate": 1.0635294880908702e-06,
"loss": 0.529,
"step": 2484
},
{
"epoch": 0.8,
"grad_norm": 7.174645636796516,
"learning_rate": 1.0603362289983687e-06,
"loss": 0.5951,
"step": 2485
},
{
"epoch": 0.8,
"grad_norm": 1.5295702173480263,
"learning_rate": 1.0571472023086604e-06,
"loss": 0.2012,
"step": 2486
},
{
"epoch": 0.8,
"grad_norm": 6.908237707717649,
"learning_rate": 1.053962411447736e-06,
"loss": 0.5908,
"step": 2487
},
{
"epoch": 0.8,
"grad_norm": 1.5778324264783983,
"learning_rate": 1.0507818598370355e-06,
"loss": 0.1894,
"step": 2488
},
{
"epoch": 0.8,
"grad_norm": 6.674324706171196,
"learning_rate": 1.0476055508934408e-06,
"loss": 0.5811,
"step": 2489
},
{
"epoch": 0.8,
"grad_norm": 4.897314396887511,
"learning_rate": 1.0444334880292794e-06,
"loss": 0.6365,
"step": 2490
},
{
"epoch": 0.8,
"grad_norm": 7.945153171999199,
"learning_rate": 1.0412656746523182e-06,
"loss": 0.5195,
"step": 2491
},
{
"epoch": 0.8,
"grad_norm": 1.3993321163668972,
"learning_rate": 1.0381021141657526e-06,
"loss": 0.1745,
"step": 2492
},
{
"epoch": 0.8,
"grad_norm": 1.6648621363034495,
"learning_rate": 1.0349428099682173e-06,
"loss": 0.2262,
"step": 2493
},
{
"epoch": 0.8,
"grad_norm": 8.39262549028051,
"learning_rate": 1.0317877654537672e-06,
"loss": 0.573,
"step": 2494
},
{
"epoch": 0.8,
"grad_norm": 13.261737124946308,
"learning_rate": 1.0286369840118859e-06,
"loss": 0.5983,
"step": 2495
},
{
"epoch": 0.8,
"grad_norm": 1.4379975384462143,
"learning_rate": 1.025490469027477e-06,
"loss": 0.184,
"step": 2496
},
{
"epoch": 0.8,
"grad_norm": 1.4363675604056922,
"learning_rate": 1.0223482238808557e-06,
"loss": 0.1803,
"step": 2497
},
{
"epoch": 0.8,
"grad_norm": 5.067103834534466,
"learning_rate": 1.0192102519477565e-06,
"loss": 0.5164,
"step": 2498
},
{
"epoch": 0.8,
"grad_norm": 1.6172231052222457,
"learning_rate": 1.016076556599318e-06,
"loss": 0.2087,
"step": 2499
},
{
"epoch": 0.8,
"grad_norm": 1.3916450698179588,
"learning_rate": 1.0129471412020886e-06,
"loss": 0.1704,
"step": 2500
},
{
"epoch": 0.8,
"grad_norm": 6.379376112173397,
"learning_rate": 1.0098220091180145e-06,
"loss": 0.6694,
"step": 2501
},
{
"epoch": 0.8,
"grad_norm": 1.414347592392862,
"learning_rate": 1.006701163704445e-06,
"loss": 0.1983,
"step": 2502
},
{
"epoch": 0.8,
"grad_norm": 1.610670843022095,
"learning_rate": 1.0035846083141193e-06,
"loss": 0.2061,
"step": 2503
},
{
"epoch": 0.8,
"grad_norm": 1.8901554645136118,
"learning_rate": 1.0004723462951732e-06,
"loss": 0.2325,
"step": 2504
},
{
"epoch": 0.8,
"grad_norm": 1.4386781328899296,
"learning_rate": 9.973643809911238e-07,
"loss": 0.212,
"step": 2505
},
{
"epoch": 0.8,
"grad_norm": 1.5778673041286801,
"learning_rate": 9.942607157408784e-07,
"loss": 0.1916,
"step": 2506
},
{
"epoch": 0.8,
"grad_norm": 1.4616468940958067,
"learning_rate": 9.911613538787196e-07,
"loss": 0.1631,
"step": 2507
},
{
"epoch": 0.8,
"grad_norm": 1.4308317265407906,
"learning_rate": 9.880662987343103e-07,
"loss": 0.1697,
"step": 2508
},
{
"epoch": 0.8,
"grad_norm": 4.6578055875903495,
"learning_rate": 9.849755536326866e-07,
"loss": 0.5274,
"step": 2509
},
{
"epoch": 0.8,
"grad_norm": 1.5709526897428592,
"learning_rate": 9.818891218942511e-07,
"loss": 0.2525,
"step": 2510
},
{
"epoch": 0.8,
"grad_norm": 1.4416668419248302,
"learning_rate": 9.78807006834777e-07,
"loss": 0.1782,
"step": 2511
},
{
"epoch": 0.8,
"grad_norm": 1.6251745102042936,
"learning_rate": 9.757292117653955e-07,
"loss": 0.2108,
"step": 2512
},
{
"epoch": 0.8,
"grad_norm": 6.9719931867493505,
"learning_rate": 9.726557399925995e-07,
"loss": 0.5532,
"step": 2513
},
{
"epoch": 0.8,
"grad_norm": 1.7711361425380343,
"learning_rate": 9.695865948182392e-07,
"loss": 0.1984,
"step": 2514
},
{
"epoch": 0.8,
"grad_norm": 7.717615445109604,
"learning_rate": 9.66521779539511e-07,
"loss": 0.544,
"step": 2515
},
{
"epoch": 0.81,
"grad_norm": 1.4683742252197753,
"learning_rate": 9.63461297448966e-07,
"loss": 0.1882,
"step": 2516
},
{
"epoch": 0.81,
"grad_norm": 1.5808992229471432,
"learning_rate": 9.604051518344948e-07,
"loss": 0.2048,
"step": 2517
},
{
"epoch": 0.81,
"grad_norm": 12.282569863775484,
"learning_rate": 9.57353345979332e-07,
"loss": 0.5281,
"step": 2518
},
{
"epoch": 0.81,
"grad_norm": 1.6271893351123086,
"learning_rate": 9.543058831620528e-07,
"loss": 0.2557,
"step": 2519
},
{
"epoch": 0.81,
"grad_norm": 1.6138184596302778,
"learning_rate": 9.512627666565588e-07,
"loss": 0.2418,
"step": 2520
},
{
"epoch": 0.81,
"grad_norm": 1.6122296182738112,
"learning_rate": 9.482239997320903e-07,
"loss": 0.1874,
"step": 2521
},
{
"epoch": 0.81,
"grad_norm": 1.4472149233201144,
"learning_rate": 9.451895856532117e-07,
"loss": 0.221,
"step": 2522
},
{
"epoch": 0.81,
"grad_norm": 14.97645190084707,
"learning_rate": 9.421595276798084e-07,
"loss": 0.6133,
"step": 2523
},
{
"epoch": 0.81,
"grad_norm": 1.4519990327298435,
"learning_rate": 9.39133829067092e-07,
"loss": 0.2109,
"step": 2524
},
{
"epoch": 0.81,
"grad_norm": 1.4610290448462637,
"learning_rate": 9.361124930655841e-07,
"loss": 0.2018,
"step": 2525
},
{
"epoch": 0.81,
"grad_norm": 1.498347722449172,
"learning_rate": 9.330955229211259e-07,
"loss": 0.1776,
"step": 2526
},
{
"epoch": 0.81,
"grad_norm": 1.5958498700482844,
"learning_rate": 9.300829218748625e-07,
"loss": 0.2672,
"step": 2527
},
{
"epoch": 0.81,
"grad_norm": 6.203034648831612,
"learning_rate": 9.270746931632501e-07,
"loss": 0.4314,
"step": 2528
},
{
"epoch": 0.81,
"grad_norm": 1.3300040390686352,
"learning_rate": 9.240708400180437e-07,
"loss": 0.1795,
"step": 2529
},
{
"epoch": 0.81,
"grad_norm": 14.3616155812818,
"learning_rate": 9.210713656663023e-07,
"loss": 0.5088,
"step": 2530
},
{
"epoch": 0.81,
"grad_norm": 5.884538514544935,
"learning_rate": 9.180762733303745e-07,
"loss": 0.533,
"step": 2531
},
{
"epoch": 0.81,
"grad_norm": 7.814694426867154,
"learning_rate": 9.150855662279079e-07,
"loss": 0.4018,
"step": 2532
},
{
"epoch": 0.81,
"grad_norm": 1.3973635596438183,
"learning_rate": 9.120992475718333e-07,
"loss": 0.1903,
"step": 2533
},
{
"epoch": 0.81,
"grad_norm": 1.4383665344471661,
"learning_rate": 9.091173205703708e-07,
"loss": 0.2065,
"step": 2534
},
{
"epoch": 0.81,
"grad_norm": 6.294289198358693,
"learning_rate": 9.061397884270217e-07,
"loss": 0.5422,
"step": 2535
},
{
"epoch": 0.81,
"grad_norm": 4.9857271791072035,
"learning_rate": 9.031666543405637e-07,
"loss": 0.5583,
"step": 2536
},
{
"epoch": 0.81,
"grad_norm": 8.327374247823952,
"learning_rate": 9.001979215050544e-07,
"loss": 0.4995,
"step": 2537
},
{
"epoch": 0.81,
"grad_norm": 4.488245645407394,
"learning_rate": 8.972335931098159e-07,
"loss": 0.466,
"step": 2538
},
{
"epoch": 0.81,
"grad_norm": 1.4002523589411047,
"learning_rate": 8.942736723394458e-07,
"loss": 0.2085,
"step": 2539
},
{
"epoch": 0.81,
"grad_norm": 4.889026022921027,
"learning_rate": 8.913181623738032e-07,
"loss": 0.6764,
"step": 2540
},
{
"epoch": 0.81,
"grad_norm": 7.750492978828258,
"learning_rate": 8.883670663880078e-07,
"loss": 0.6702,
"step": 2541
},
{
"epoch": 0.81,
"grad_norm": 5.127825878036379,
"learning_rate": 8.854203875524403e-07,
"loss": 0.5095,
"step": 2542
},
{
"epoch": 0.81,
"grad_norm": 1.799711649663046,
"learning_rate": 8.824781290327317e-07,
"loss": 0.2318,
"step": 2543
},
{
"epoch": 0.81,
"grad_norm": 1.6051235284074827,
"learning_rate": 8.795402939897679e-07,
"loss": 0.1928,
"step": 2544
},
{
"epoch": 0.81,
"grad_norm": 7.044225533853197,
"learning_rate": 8.766068855796833e-07,
"loss": 0.617,
"step": 2545
},
{
"epoch": 0.81,
"grad_norm": 8.052121845683034,
"learning_rate": 8.736779069538521e-07,
"loss": 0.4959,
"step": 2546
},
{
"epoch": 0.82,
"grad_norm": 17.23300986125481,
"learning_rate": 8.707533612588948e-07,
"loss": 0.569,
"step": 2547
},
{
"epoch": 0.82,
"grad_norm": 5.282669940799093,
"learning_rate": 8.67833251636665e-07,
"loss": 0.4632,
"step": 2548
},
{
"epoch": 0.82,
"grad_norm": 1.4320389198263321,
"learning_rate": 8.649175812242532e-07,
"loss": 0.1857,
"step": 2549
},
{
"epoch": 0.82,
"grad_norm": 7.580851961765514,
"learning_rate": 8.62006353153983e-07,
"loss": 0.8111,
"step": 2550
},
{
"epoch": 0.82,
"grad_norm": 1.544662525765582,
"learning_rate": 8.590995705533994e-07,
"loss": 0.209,
"step": 2551
},
{
"epoch": 0.82,
"grad_norm": 7.149161656017897,
"learning_rate": 8.561972365452775e-07,
"loss": 0.4482,
"step": 2552
},
{
"epoch": 0.82,
"grad_norm": 1.6270817149166188,
"learning_rate": 8.532993542476108e-07,
"loss": 0.2326,
"step": 2553
},
{
"epoch": 0.82,
"grad_norm": 7.320842607727228,
"learning_rate": 8.504059267736097e-07,
"loss": 0.5003,
"step": 2554
},
{
"epoch": 0.82,
"grad_norm": 1.2733848199707007,
"learning_rate": 8.475169572316988e-07,
"loss": 0.1798,
"step": 2555
},
{
"epoch": 0.82,
"grad_norm": 1.5418719853348621,
"learning_rate": 8.446324487255164e-07,
"loss": 0.1742,
"step": 2556
},
{
"epoch": 0.82,
"grad_norm": 5.99967558608403,
"learning_rate": 8.417524043539038e-07,
"loss": 0.5723,
"step": 2557
},
{
"epoch": 0.82,
"grad_norm": 1.3442831750304751,
"learning_rate": 8.388768272109105e-07,
"loss": 0.1884,
"step": 2558
},
{
"epoch": 0.82,
"grad_norm": 6.470561310477854,
"learning_rate": 8.36005720385783e-07,
"loss": 0.605,
"step": 2559
},
{
"epoch": 0.82,
"grad_norm": 10.15361411600289,
"learning_rate": 8.331390869629702e-07,
"loss": 0.7166,
"step": 2560
},
{
"epoch": 0.82,
"grad_norm": 1.3861308641502719,
"learning_rate": 8.302769300221098e-07,
"loss": 0.1641,
"step": 2561
},
{
"epoch": 0.82,
"grad_norm": 1.2012542474337997,
"learning_rate": 8.274192526380337e-07,
"loss": 0.1419,
"step": 2562
},
{
"epoch": 0.82,
"grad_norm": 1.5051072521255415,
"learning_rate": 8.24566057880763e-07,
"loss": 0.1897,
"step": 2563
},
{
"epoch": 0.82,
"grad_norm": 1.4835039914520893,
"learning_rate": 8.217173488154972e-07,
"loss": 0.2138,
"step": 2564
},
{
"epoch": 0.82,
"grad_norm": 5.488317253938668,
"learning_rate": 8.188731285026219e-07,
"loss": 0.5416,
"step": 2565
},
{
"epoch": 0.82,
"grad_norm": 12.708079568791273,
"learning_rate": 8.160333999977004e-07,
"loss": 0.5586,
"step": 2566
},
{
"epoch": 0.82,
"grad_norm": 7.844079556825954,
"learning_rate": 8.131981663514665e-07,
"loss": 0.5967,
"step": 2567
},
{
"epoch": 0.82,
"grad_norm": 5.514556203894869,
"learning_rate": 8.103674306098291e-07,
"loss": 0.4291,
"step": 2568
},
{
"epoch": 0.82,
"grad_norm": 11.294018881746007,
"learning_rate": 8.075411958138623e-07,
"loss": 0.3269,
"step": 2569
},
{
"epoch": 0.82,
"grad_norm": 1.5568098518084332,
"learning_rate": 8.047194649998063e-07,
"loss": 0.2458,
"step": 2570
},
{
"epoch": 0.82,
"grad_norm": 6.6662345305717725,
"learning_rate": 8.019022411990634e-07,
"loss": 0.6224,
"step": 2571
},
{
"epoch": 0.82,
"grad_norm": 5.594513397899825,
"learning_rate": 7.99089527438191e-07,
"loss": 0.5713,
"step": 2572
},
{
"epoch": 0.82,
"grad_norm": 1.5219324026730405,
"learning_rate": 7.962813267389052e-07,
"loss": 0.2215,
"step": 2573
},
{
"epoch": 0.82,
"grad_norm": 6.447484952604837,
"learning_rate": 7.93477642118069e-07,
"loss": 0.7177,
"step": 2574
},
{
"epoch": 0.82,
"grad_norm": 1.6396772877082404,
"learning_rate": 7.906784765876985e-07,
"loss": 0.2205,
"step": 2575
},
{
"epoch": 0.82,
"grad_norm": 1.3336659870784024,
"learning_rate": 7.878838331549538e-07,
"loss": 0.1731,
"step": 2576
},
{
"epoch": 0.82,
"grad_norm": 1.3515039283045287,
"learning_rate": 7.850937148221332e-07,
"loss": 0.2089,
"step": 2577
},
{
"epoch": 0.82,
"grad_norm": 1.4923380627301124,
"learning_rate": 7.823081245866776e-07,
"loss": 0.1906,
"step": 2578
},
{
"epoch": 0.83,
"grad_norm": 1.5277699646651623,
"learning_rate": 7.795270654411635e-07,
"loss": 0.2103,
"step": 2579
},
{
"epoch": 0.83,
"grad_norm": 1.3240100567701212,
"learning_rate": 7.767505403732961e-07,
"loss": 0.1629,
"step": 2580
},
{
"epoch": 0.83,
"grad_norm": 6.597982018865997,
"learning_rate": 7.739785523659144e-07,
"loss": 0.3574,
"step": 2581
},
{
"epoch": 0.83,
"grad_norm": 1.4229439072388805,
"learning_rate": 7.712111043969772e-07,
"loss": 0.1817,
"step": 2582
},
{
"epoch": 0.83,
"grad_norm": 8.611152415745416,
"learning_rate": 7.684481994395726e-07,
"loss": 0.395,
"step": 2583
},
{
"epoch": 0.83,
"grad_norm": 1.8485251598213055,
"learning_rate": 7.656898404619029e-07,
"loss": 0.2343,
"step": 2584
},
{
"epoch": 0.83,
"grad_norm": 6.762929749362675,
"learning_rate": 7.629360304272882e-07,
"loss": 0.4831,
"step": 2585
},
{
"epoch": 0.83,
"grad_norm": 5.0445718794221355,
"learning_rate": 7.601867722941642e-07,
"loss": 0.3216,
"step": 2586
},
{
"epoch": 0.83,
"grad_norm": 9.592577004766241,
"learning_rate": 7.57442069016071e-07,
"loss": 0.6185,
"step": 2587
},
{
"epoch": 0.83,
"grad_norm": 6.077894120077208,
"learning_rate": 7.547019235416609e-07,
"loss": 0.6236,
"step": 2588
},
{
"epoch": 0.83,
"grad_norm": 1.5405329002127461,
"learning_rate": 7.519663388146886e-07,
"loss": 0.1953,
"step": 2589
},
{
"epoch": 0.83,
"grad_norm": 6.577754669292901,
"learning_rate": 7.492353177740047e-07,
"loss": 0.376,
"step": 2590
},
{
"epoch": 0.83,
"grad_norm": 9.408829788343258,
"learning_rate": 7.465088633535639e-07,
"loss": 0.5448,
"step": 2591
},
{
"epoch": 0.83,
"grad_norm": 1.5320519268321242,
"learning_rate": 7.437869784824086e-07,
"loss": 0.2195,
"step": 2592
},
{
"epoch": 0.83,
"grad_norm": 1.5231665827699195,
"learning_rate": 7.410696660846761e-07,
"loss": 0.1723,
"step": 2593
},
{
"epoch": 0.83,
"grad_norm": 1.6874867821002582,
"learning_rate": 7.383569290795911e-07,
"loss": 0.2481,
"step": 2594
},
{
"epoch": 0.83,
"grad_norm": 1.5968693057949326,
"learning_rate": 7.356487703814602e-07,
"loss": 0.181,
"step": 2595
},
{
"epoch": 0.83,
"grad_norm": 7.840406851525264,
"learning_rate": 7.329451928996745e-07,
"loss": 0.5325,
"step": 2596
},
{
"epoch": 0.83,
"grad_norm": 1.4844619258708431,
"learning_rate": 7.302461995387033e-07,
"loss": 0.1758,
"step": 2597
},
{
"epoch": 0.83,
"grad_norm": 16.29674965582375,
"learning_rate": 7.275517931980886e-07,
"loss": 0.6096,
"step": 2598
},
{
"epoch": 0.83,
"grad_norm": 21.19066863166103,
"learning_rate": 7.24861976772448e-07,
"loss": 0.7152,
"step": 2599
},
{
"epoch": 0.83,
"grad_norm": 11.317114303086758,
"learning_rate": 7.22176753151464e-07,
"loss": 0.6161,
"step": 2600
},
{
"epoch": 0.83,
"grad_norm": 6.986948783906964,
"learning_rate": 7.194961252198885e-07,
"loss": 0.5213,
"step": 2601
},
{
"epoch": 0.83,
"grad_norm": 4.893252000606057,
"learning_rate": 7.168200958575361e-07,
"loss": 0.4924,
"step": 2602
},
{
"epoch": 0.83,
"grad_norm": 6.200996314158357,
"learning_rate": 7.141486679392778e-07,
"loss": 0.5967,
"step": 2603
},
{
"epoch": 0.83,
"grad_norm": 1.5078257564939408,
"learning_rate": 7.114818443350463e-07,
"loss": 0.1926,
"step": 2604
},
{
"epoch": 0.83,
"grad_norm": 5.839746009856321,
"learning_rate": 7.088196279098225e-07,
"loss": 0.6757,
"step": 2605
},
{
"epoch": 0.83,
"grad_norm": 1.2910292736416038,
"learning_rate": 7.061620215236415e-07,
"loss": 0.1497,
"step": 2606
},
{
"epoch": 0.83,
"grad_norm": 6.45423444267534,
"learning_rate": 7.035090280315854e-07,
"loss": 0.54,
"step": 2607
},
{
"epoch": 0.83,
"grad_norm": 1.5139209625655168,
"learning_rate": 7.008606502837784e-07,
"loss": 0.1677,
"step": 2608
},
{
"epoch": 0.83,
"grad_norm": 8.843478951192482,
"learning_rate": 6.982168911253895e-07,
"loss": 0.64,
"step": 2609
},
{
"epoch": 0.84,
"grad_norm": 9.254331185836687,
"learning_rate": 6.955777533966212e-07,
"loss": 0.4762,
"step": 2610
},
{
"epoch": 0.84,
"grad_norm": 6.6073102726749795,
"learning_rate": 6.929432399327174e-07,
"loss": 0.5791,
"step": 2611
},
{
"epoch": 0.84,
"grad_norm": 1.5608241342559577,
"learning_rate": 6.903133535639467e-07,
"loss": 0.2073,
"step": 2612
},
{
"epoch": 0.84,
"grad_norm": 1.5981984971688308,
"learning_rate": 6.876880971156147e-07,
"loss": 0.2374,
"step": 2613
},
{
"epoch": 0.84,
"grad_norm": 1.3580211791343035,
"learning_rate": 6.850674734080454e-07,
"loss": 0.1544,
"step": 2614
},
{
"epoch": 0.84,
"grad_norm": 1.6692509484909754,
"learning_rate": 6.824514852565922e-07,
"loss": 0.1843,
"step": 2615
},
{
"epoch": 0.84,
"grad_norm": 1.731129371209292,
"learning_rate": 6.798401354716233e-07,
"loss": 0.1988,
"step": 2616
},
{
"epoch": 0.84,
"grad_norm": 1.584868453254953,
"learning_rate": 6.772334268585296e-07,
"loss": 0.2202,
"step": 2617
},
{
"epoch": 0.84,
"grad_norm": 1.6211274381694143,
"learning_rate": 6.746313622177097e-07,
"loss": 0.2285,
"step": 2618
},
{
"epoch": 0.84,
"grad_norm": 1.6356376831839594,
"learning_rate": 6.720339443445772e-07,
"loss": 0.1855,
"step": 2619
},
{
"epoch": 0.84,
"grad_norm": 3.2565345030647532,
"learning_rate": 6.694411760295538e-07,
"loss": 0.531,
"step": 2620
},
{
"epoch": 0.84,
"grad_norm": 13.194009235153622,
"learning_rate": 6.66853060058063e-07,
"loss": 0.4903,
"step": 2621
},
{
"epoch": 0.84,
"grad_norm": 1.5463835963314558,
"learning_rate": 6.642695992105347e-07,
"loss": 0.2219,
"step": 2622
},
{
"epoch": 0.84,
"grad_norm": 1.4566373371656223,
"learning_rate": 6.61690796262392e-07,
"loss": 0.1668,
"step": 2623
},
{
"epoch": 0.84,
"grad_norm": 1.5464011596894065,
"learning_rate": 6.591166539840599e-07,
"loss": 0.218,
"step": 2624
},
{
"epoch": 0.84,
"grad_norm": 1.4726230979560666,
"learning_rate": 6.565471751409541e-07,
"loss": 0.2045,
"step": 2625
},
{
"epoch": 0.84,
"grad_norm": 7.53648323357261,
"learning_rate": 6.539823624934777e-07,
"loss": 0.6538,
"step": 2626
},
{
"epoch": 0.84,
"grad_norm": 4.3811779723790885,
"learning_rate": 6.514222187970248e-07,
"loss": 0.3418,
"step": 2627
},
{
"epoch": 0.84,
"grad_norm": 6.082964849648111,
"learning_rate": 6.488667468019727e-07,
"loss": 0.567,
"step": 2628
},
{
"epoch": 0.84,
"grad_norm": 7.3904723461789725,
"learning_rate": 6.46315949253678e-07,
"loss": 0.606,
"step": 2629
},
{
"epoch": 0.84,
"grad_norm": 1.3947875291118377,
"learning_rate": 6.437698288924777e-07,
"loss": 0.1841,
"step": 2630
},
{
"epoch": 0.84,
"grad_norm": 6.009049736365674,
"learning_rate": 6.412283884536818e-07,
"loss": 0.5414,
"step": 2631
},
{
"epoch": 0.84,
"grad_norm": 1.4672149857669146,
"learning_rate": 6.38691630667575e-07,
"loss": 0.2192,
"step": 2632
},
{
"epoch": 0.84,
"grad_norm": 11.53404226265188,
"learning_rate": 6.36159558259411e-07,
"loss": 0.8374,
"step": 2633
},
{
"epoch": 0.84,
"grad_norm": 1.2702099350975073,
"learning_rate": 6.336321739494072e-07,
"loss": 0.1739,
"step": 2634
},
{
"epoch": 0.84,
"grad_norm": 8.096994483604018,
"learning_rate": 6.31109480452749e-07,
"loss": 0.5601,
"step": 2635
},
{
"epoch": 0.84,
"grad_norm": 1.3277480034853422,
"learning_rate": 6.285914804795784e-07,
"loss": 0.1473,
"step": 2636
},
{
"epoch": 0.84,
"grad_norm": 1.4812499772472136,
"learning_rate": 6.260781767349983e-07,
"loss": 0.1971,
"step": 2637
},
{
"epoch": 0.84,
"grad_norm": 5.538018979603673,
"learning_rate": 6.235695719190632e-07,
"loss": 0.5932,
"step": 2638
},
{
"epoch": 0.84,
"grad_norm": 1.5124177468422904,
"learning_rate": 6.210656687267835e-07,
"loss": 0.1811,
"step": 2639
},
{
"epoch": 0.84,
"grad_norm": 5.738558629682813,
"learning_rate": 6.185664698481137e-07,
"loss": 0.3961,
"step": 2640
},
{
"epoch": 0.85,
"grad_norm": 7.5116895308050475,
"learning_rate": 6.160719779679597e-07,
"loss": 0.5401,
"step": 2641
},
{
"epoch": 0.85,
"grad_norm": 8.889935222961554,
"learning_rate": 6.135821957661658e-07,
"loss": 0.6612,
"step": 2642
},
{
"epoch": 0.85,
"grad_norm": 1.6046307953312196,
"learning_rate": 6.110971259175208e-07,
"loss": 0.1817,
"step": 2643
},
{
"epoch": 0.85,
"grad_norm": 6.892177321413231,
"learning_rate": 6.086167710917479e-07,
"loss": 0.5152,
"step": 2644
},
{
"epoch": 0.85,
"grad_norm": 13.071280804593417,
"learning_rate": 6.061411339535062e-07,
"loss": 0.6484,
"step": 2645
},
{
"epoch": 0.85,
"grad_norm": 1.3419632389583118,
"learning_rate": 6.036702171623876e-07,
"loss": 0.1383,
"step": 2646
},
{
"epoch": 0.85,
"grad_norm": 6.157792301410421,
"learning_rate": 6.012040233729105e-07,
"loss": 0.5189,
"step": 2647
},
{
"epoch": 0.85,
"grad_norm": 7.7104212355617925,
"learning_rate": 5.987425552345222e-07,
"loss": 0.5179,
"step": 2648
},
{
"epoch": 0.85,
"grad_norm": 1.5615036983923993,
"learning_rate": 5.962858153915896e-07,
"loss": 0.1928,
"step": 2649
},
{
"epoch": 0.85,
"grad_norm": 12.993945008932329,
"learning_rate": 5.938338064834037e-07,
"loss": 0.4566,
"step": 2650
},
{
"epoch": 0.85,
"grad_norm": 1.4056054595363345,
"learning_rate": 5.913865311441714e-07,
"loss": 0.1978,
"step": 2651
},
{
"epoch": 0.85,
"grad_norm": 1.367541351930958,
"learning_rate": 5.889439920030127e-07,
"loss": 0.1538,
"step": 2652
},
{
"epoch": 0.85,
"grad_norm": 6.393546120631601,
"learning_rate": 5.865061916839615e-07,
"loss": 0.5159,
"step": 2653
},
{
"epoch": 0.85,
"grad_norm": 1.4226337155233368,
"learning_rate": 5.840731328059629e-07,
"loss": 0.205,
"step": 2654
},
{
"epoch": 0.85,
"grad_norm": 6.86782039259812,
"learning_rate": 5.816448179828616e-07,
"loss": 0.6033,
"step": 2655
},
{
"epoch": 0.85,
"grad_norm": 1.6104876835275903,
"learning_rate": 5.792212498234134e-07,
"loss": 0.2062,
"step": 2656
},
{
"epoch": 0.85,
"grad_norm": 9.049602233806667,
"learning_rate": 5.768024309312681e-07,
"loss": 0.4272,
"step": 2657
},
{
"epoch": 0.85,
"grad_norm": 9.592778712216669,
"learning_rate": 5.74388363904978e-07,
"loss": 0.5094,
"step": 2658
},
{
"epoch": 0.85,
"grad_norm": 5.250655127944203,
"learning_rate": 5.719790513379891e-07,
"loss": 0.5485,
"step": 2659
},
{
"epoch": 0.85,
"grad_norm": 9.41096800686093,
"learning_rate": 5.695744958186383e-07,
"loss": 0.4675,
"step": 2660
},
{
"epoch": 0.85,
"grad_norm": 10.780266583961469,
"learning_rate": 5.671746999301542e-07,
"loss": 0.5982,
"step": 2661
},
{
"epoch": 0.85,
"grad_norm": 1.710761266629091,
"learning_rate": 5.647796662506493e-07,
"loss": 0.2093,
"step": 2662
},
{
"epoch": 0.85,
"grad_norm": 4.654933300278924,
"learning_rate": 5.623893973531225e-07,
"loss": 0.4755,
"step": 2663
},
{
"epoch": 0.85,
"grad_norm": 5.2427890587191275,
"learning_rate": 5.600038958054538e-07,
"loss": 0.5326,
"step": 2664
},
{
"epoch": 0.85,
"grad_norm": 6.693717545797934,
"learning_rate": 5.576231641703994e-07,
"loss": 0.4351,
"step": 2665
},
{
"epoch": 0.85,
"grad_norm": 4.523179412732861,
"learning_rate": 5.552472050055946e-07,
"loss": 0.568,
"step": 2666
},
{
"epoch": 0.85,
"grad_norm": 5.287751299486752,
"learning_rate": 5.528760208635436e-07,
"loss": 0.6917,
"step": 2667
},
{
"epoch": 0.85,
"grad_norm": 1.5086347552640127,
"learning_rate": 5.505096142916233e-07,
"loss": 0.1786,
"step": 2668
},
{
"epoch": 0.85,
"grad_norm": 5.505592709604087,
"learning_rate": 5.481479878320784e-07,
"loss": 0.4923,
"step": 2669
},
{
"epoch": 0.85,
"grad_norm": 1.276453692652011,
"learning_rate": 5.457911440220154e-07,
"loss": 0.1714,
"step": 2670
},
{
"epoch": 0.85,
"grad_norm": 5.3060623381904115,
"learning_rate": 5.434390853934063e-07,
"loss": 0.5048,
"step": 2671
},
{
"epoch": 0.86,
"grad_norm": 7.244293658133257,
"learning_rate": 5.410918144730815e-07,
"loss": 0.4543,
"step": 2672
},
{
"epoch": 0.86,
"grad_norm": 6.256776796215696,
"learning_rate": 5.387493337827254e-07,
"loss": 0.6208,
"step": 2673
},
{
"epoch": 0.86,
"grad_norm": 7.584912804732227,
"learning_rate": 5.364116458388802e-07,
"loss": 0.5999,
"step": 2674
},
{
"epoch": 0.86,
"grad_norm": 7.699756945569905,
"learning_rate": 5.340787531529346e-07,
"loss": 0.6547,
"step": 2675
},
{
"epoch": 0.86,
"grad_norm": 1.605575414031511,
"learning_rate": 5.3175065823113e-07,
"loss": 0.1968,
"step": 2676
},
{
"epoch": 0.86,
"grad_norm": 1.4817619941410574,
"learning_rate": 5.294273635745517e-07,
"loss": 0.2291,
"step": 2677
},
{
"epoch": 0.86,
"grad_norm": 5.90107264061013,
"learning_rate": 5.271088716791273e-07,
"loss": 0.513,
"step": 2678
},
{
"epoch": 0.86,
"grad_norm": 6.983756070460959,
"learning_rate": 5.24795185035627e-07,
"loss": 0.4819,
"step": 2679
},
{
"epoch": 0.86,
"grad_norm": 5.190412595121703,
"learning_rate": 5.224863061296553e-07,
"loss": 0.3798,
"step": 2680
},
{
"epoch": 0.86,
"grad_norm": 1.6978196461390462,
"learning_rate": 5.201822374416549e-07,
"loss": 0.1984,
"step": 2681
},
{
"epoch": 0.86,
"grad_norm": 9.240812347198206,
"learning_rate": 5.178829814469006e-07,
"loss": 0.6435,
"step": 2682
},
{
"epoch": 0.86,
"grad_norm": 1.656681270068995,
"learning_rate": 5.155885406154937e-07,
"loss": 0.2336,
"step": 2683
},
{
"epoch": 0.86,
"grad_norm": 1.5786867044648638,
"learning_rate": 5.132989174123659e-07,
"loss": 0.2072,
"step": 2684
},
{
"epoch": 0.86,
"grad_norm": 5.044095479711521,
"learning_rate": 5.110141142972735e-07,
"loss": 0.4497,
"step": 2685
},
{
"epoch": 0.86,
"grad_norm": 1.593603943847763,
"learning_rate": 5.087341337247914e-07,
"loss": 0.2106,
"step": 2686
},
{
"epoch": 0.86,
"grad_norm": 6.618972684768107,
"learning_rate": 5.064589781443163e-07,
"loss": 0.7105,
"step": 2687
},
{
"epoch": 0.86,
"grad_norm": 1.4376629646749215,
"learning_rate": 5.041886500000603e-07,
"loss": 0.1872,
"step": 2688
},
{
"epoch": 0.86,
"grad_norm": 1.440392194206104,
"learning_rate": 5.019231517310491e-07,
"loss": 0.1735,
"step": 2689
},
{
"epoch": 0.86,
"grad_norm": 6.088923357509856,
"learning_rate": 4.996624857711219e-07,
"loss": 0.5553,
"step": 2690
},
{
"epoch": 0.86,
"grad_norm": 1.7043003296847394,
"learning_rate": 4.97406654548922e-07,
"loss": 0.1879,
"step": 2691
},
{
"epoch": 0.86,
"grad_norm": 1.5775563243885704,
"learning_rate": 4.951556604879049e-07,
"loss": 0.1709,
"step": 2692
},
{
"epoch": 0.86,
"grad_norm": 1.4559291771321798,
"learning_rate": 4.929095060063227e-07,
"loss": 0.1728,
"step": 2693
},
{
"epoch": 0.86,
"grad_norm": 7.343298387449492,
"learning_rate": 4.906681935172342e-07,
"loss": 0.6308,
"step": 2694
},
{
"epoch": 0.86,
"grad_norm": 8.254690281488546,
"learning_rate": 4.88431725428492e-07,
"loss": 0.5289,
"step": 2695
},
{
"epoch": 0.86,
"grad_norm": 1.5805793183405417,
"learning_rate": 4.862001041427488e-07,
"loss": 0.2157,
"step": 2696
},
{
"epoch": 0.86,
"grad_norm": 1.757438267597915,
"learning_rate": 4.839733320574457e-07,
"loss": 0.2086,
"step": 2697
},
{
"epoch": 0.86,
"grad_norm": 8.33445307266451,
"learning_rate": 4.817514115648164e-07,
"loss": 0.418,
"step": 2698
},
{
"epoch": 0.86,
"grad_norm": 7.548588114648093,
"learning_rate": 4.795343450518825e-07,
"loss": 0.4581,
"step": 2699
},
{
"epoch": 0.86,
"grad_norm": 1.3352919122460238,
"learning_rate": 4.773221349004531e-07,
"loss": 0.1991,
"step": 2700
},
{
"epoch": 0.86,
"grad_norm": 1.3387781501315952,
"learning_rate": 4.7511478348711447e-07,
"loss": 0.1672,
"step": 2701
},
{
"epoch": 0.86,
"grad_norm": 1.7704078493779831,
"learning_rate": 4.729122931832392e-07,
"loss": 0.1869,
"step": 2702
},
{
"epoch": 0.86,
"grad_norm": 6.0396203658077,
"learning_rate": 4.707146663549744e-07,
"loss": 0.5129,
"step": 2703
},
{
"epoch": 0.87,
"grad_norm": 6.582816211876843,
"learning_rate": 4.685219053632423e-07,
"loss": 0.4932,
"step": 2704
},
{
"epoch": 0.87,
"grad_norm": 1.5537253110385736,
"learning_rate": 4.663340125637389e-07,
"loss": 0.2203,
"step": 2705
},
{
"epoch": 0.87,
"grad_norm": 1.4877347829698297,
"learning_rate": 4.6415099030692914e-07,
"loss": 0.1647,
"step": 2706
},
{
"epoch": 0.87,
"grad_norm": 5.9450673868837525,
"learning_rate": 4.619728409380453e-07,
"loss": 0.4525,
"step": 2707
},
{
"epoch": 0.87,
"grad_norm": 8.06939617997545,
"learning_rate": 4.597995667970878e-07,
"loss": 0.5758,
"step": 2708
},
{
"epoch": 0.87,
"grad_norm": 12.473711205485506,
"learning_rate": 4.5763117021881467e-07,
"loss": 0.3835,
"step": 2709
},
{
"epoch": 0.87,
"grad_norm": 1.3235374068538472,
"learning_rate": 4.5546765353274846e-07,
"loss": 0.1844,
"step": 2710
},
{
"epoch": 0.87,
"grad_norm": 4.862552219195205,
"learning_rate": 4.5330901906316506e-07,
"loss": 0.5605,
"step": 2711
},
{
"epoch": 0.87,
"grad_norm": 1.4153040869832534,
"learning_rate": 4.511552691290988e-07,
"loss": 0.1872,
"step": 2712
},
{
"epoch": 0.87,
"grad_norm": 5.210784876496466,
"learning_rate": 4.490064060443361e-07,
"loss": 0.4642,
"step": 2713
},
{
"epoch": 0.87,
"grad_norm": 1.5480203986117018,
"learning_rate": 4.468624321174109e-07,
"loss": 0.1905,
"step": 2714
},
{
"epoch": 0.87,
"grad_norm": 1.43235037299231,
"learning_rate": 4.4472334965160736e-07,
"loss": 0.1997,
"step": 2715
},
{
"epoch": 0.87,
"grad_norm": 1.5503954668745454,
"learning_rate": 4.4258916094495394e-07,
"loss": 0.1583,
"step": 2716
},
{
"epoch": 0.87,
"grad_norm": 7.398457461161245,
"learning_rate": 4.4045986829022e-07,
"loss": 0.5537,
"step": 2717
},
{
"epoch": 0.87,
"grad_norm": 7.218432948067189,
"learning_rate": 4.38335473974919e-07,
"loss": 0.6647,
"step": 2718
},
{
"epoch": 0.87,
"grad_norm": 1.4794803957855214,
"learning_rate": 4.362159802812971e-07,
"loss": 0.1837,
"step": 2719
},
{
"epoch": 0.87,
"grad_norm": 1.4278596696760588,
"learning_rate": 4.341013894863405e-07,
"loss": 0.1818,
"step": 2720
},
{
"epoch": 0.87,
"grad_norm": 5.591059833775441,
"learning_rate": 4.3199170386176325e-07,
"loss": 0.6646,
"step": 2721
},
{
"epoch": 0.87,
"grad_norm": 9.657243782582162,
"learning_rate": 4.2988692567401515e-07,
"loss": 0.4096,
"step": 2722
},
{
"epoch": 0.87,
"grad_norm": 7.34427746688386,
"learning_rate": 4.2778705718426907e-07,
"loss": 0.5594,
"step": 2723
},
{
"epoch": 0.87,
"grad_norm": 1.495038666260333,
"learning_rate": 4.2569210064842716e-07,
"loss": 0.1581,
"step": 2724
},
{
"epoch": 0.87,
"grad_norm": 1.5329584343646685,
"learning_rate": 4.236020583171108e-07,
"loss": 0.1958,
"step": 2725
},
{
"epoch": 0.87,
"grad_norm": 7.7896070161012645,
"learning_rate": 4.215169324356666e-07,
"loss": 0.6856,
"step": 2726
},
{
"epoch": 0.87,
"grad_norm": 7.771467751815555,
"learning_rate": 4.194367252441545e-07,
"loss": 0.6252,
"step": 2727
},
{
"epoch": 0.87,
"grad_norm": 1.6693745729825031,
"learning_rate": 4.1736143897735394e-07,
"loss": 0.2033,
"step": 2728
},
{
"epoch": 0.87,
"grad_norm": 1.331737824452968,
"learning_rate": 4.152910758647577e-07,
"loss": 0.187,
"step": 2729
},
{
"epoch": 0.87,
"grad_norm": 5.952112384295271,
"learning_rate": 4.1322563813056606e-07,
"loss": 0.7214,
"step": 2730
},
{
"epoch": 0.87,
"grad_norm": 1.8790949725849464,
"learning_rate": 4.111651279936929e-07,
"loss": 0.2495,
"step": 2731
},
{
"epoch": 0.87,
"grad_norm": 1.4096065548075443,
"learning_rate": 4.091095476677531e-07,
"loss": 0.1673,
"step": 2732
},
{
"epoch": 0.87,
"grad_norm": 1.5124954739782766,
"learning_rate": 4.070588993610697e-07,
"loss": 0.154,
"step": 2733
},
{
"epoch": 0.87,
"grad_norm": 1.5687885929449497,
"learning_rate": 4.050131852766659e-07,
"loss": 0.2173,
"step": 2734
},
{
"epoch": 0.88,
"grad_norm": 6.275592701555711,
"learning_rate": 4.029724076122621e-07,
"loss": 0.4758,
"step": 2735
},
{
"epoch": 0.88,
"grad_norm": 1.4748542433756284,
"learning_rate": 4.009365685602795e-07,
"loss": 0.2158,
"step": 2736
},
{
"epoch": 0.88,
"grad_norm": 9.414075729888554,
"learning_rate": 3.989056703078292e-07,
"loss": 0.5527,
"step": 2737
},
{
"epoch": 0.88,
"grad_norm": 1.5713542490218366,
"learning_rate": 3.968797150367171e-07,
"loss": 0.2018,
"step": 2738
},
{
"epoch": 0.88,
"grad_norm": 1.5449491396397508,
"learning_rate": 3.948587049234398e-07,
"loss": 0.1901,
"step": 2739
},
{
"epoch": 0.88,
"grad_norm": 1.6307394390097811,
"learning_rate": 3.928426421391773e-07,
"loss": 0.1784,
"step": 2740
},
{
"epoch": 0.88,
"grad_norm": 6.998713371935027,
"learning_rate": 3.9083152884979935e-07,
"loss": 0.5584,
"step": 2741
},
{
"epoch": 0.88,
"grad_norm": 1.4164373784508115,
"learning_rate": 3.8882536721585486e-07,
"loss": 0.1768,
"step": 2742
},
{
"epoch": 0.88,
"grad_norm": 7.755153311756699,
"learning_rate": 3.868241593925742e-07,
"loss": 0.6357,
"step": 2743
},
{
"epoch": 0.88,
"grad_norm": 1.5425594331206072,
"learning_rate": 3.848279075298678e-07,
"loss": 0.1762,
"step": 2744
},
{
"epoch": 0.88,
"grad_norm": 1.4271275279711206,
"learning_rate": 3.828366137723183e-07,
"loss": 0.2099,
"step": 2745
},
{
"epoch": 0.88,
"grad_norm": 7.057266626679748,
"learning_rate": 3.80850280259184e-07,
"loss": 0.6185,
"step": 2746
},
{
"epoch": 0.88,
"grad_norm": 1.4564045410306423,
"learning_rate": 3.7886890912439633e-07,
"loss": 0.1933,
"step": 2747
},
{
"epoch": 0.88,
"grad_norm": 1.4734666899751858,
"learning_rate": 3.768925024965503e-07,
"loss": 0.178,
"step": 2748
},
{
"epoch": 0.88,
"grad_norm": 5.7696830680755,
"learning_rate": 3.749210624989125e-07,
"loss": 0.4184,
"step": 2749
},
{
"epoch": 0.88,
"grad_norm": 1.3766507432246797,
"learning_rate": 3.729545912494115e-07,
"loss": 0.1684,
"step": 2750
},
{
"epoch": 0.88,
"grad_norm": 1.4692231082345608,
"learning_rate": 3.7099309086063794e-07,
"loss": 0.1902,
"step": 2751
},
{
"epoch": 0.88,
"grad_norm": 1.6021302055115212,
"learning_rate": 3.6903656343984293e-07,
"loss": 0.1716,
"step": 2752
},
{
"epoch": 0.88,
"grad_norm": 1.5941335703704298,
"learning_rate": 3.670850110889346e-07,
"loss": 0.1941,
"step": 2753
},
{
"epoch": 0.88,
"grad_norm": 6.651698021511934,
"learning_rate": 3.651384359044774e-07,
"loss": 0.498,
"step": 2754
},
{
"epoch": 0.88,
"grad_norm": 5.789963956744085,
"learning_rate": 3.631968399776864e-07,
"loss": 0.5474,
"step": 2755
},
{
"epoch": 0.88,
"grad_norm": 10.234126217846844,
"learning_rate": 3.6126022539442975e-07,
"loss": 0.473,
"step": 2756
},
{
"epoch": 0.88,
"grad_norm": 1.8953151152540306,
"learning_rate": 3.593285942352237e-07,
"loss": 0.2352,
"step": 2757
},
{
"epoch": 0.88,
"grad_norm": 5.586124086834919,
"learning_rate": 3.5740194857523e-07,
"loss": 0.4172,
"step": 2758
},
{
"epoch": 0.88,
"grad_norm": 6.343090688464675,
"learning_rate": 3.554802904842547e-07,
"loss": 0.6174,
"step": 2759
},
{
"epoch": 0.88,
"grad_norm": 5.142444498296554,
"learning_rate": 3.5356362202674687e-07,
"loss": 0.5368,
"step": 2760
},
{
"epoch": 0.88,
"grad_norm": 7.738189510492381,
"learning_rate": 3.516519452617922e-07,
"loss": 0.5712,
"step": 2761
},
{
"epoch": 0.88,
"grad_norm": 1.4335777220529844,
"learning_rate": 3.4974526224311744e-07,
"loss": 0.1897,
"step": 2762
},
{
"epoch": 0.88,
"grad_norm": 1.5217264160894917,
"learning_rate": 3.478435750190817e-07,
"loss": 0.234,
"step": 2763
},
{
"epoch": 0.88,
"grad_norm": 8.599142512430719,
"learning_rate": 3.459468856326792e-07,
"loss": 0.4893,
"step": 2764
},
{
"epoch": 0.88,
"grad_norm": 5.339084031965163,
"learning_rate": 3.4405519612153326e-07,
"loss": 0.4573,
"step": 2765
},
{
"epoch": 0.89,
"grad_norm": 8.37944674597712,
"learning_rate": 3.4216850851789663e-07,
"loss": 0.69,
"step": 2766
},
{
"epoch": 0.89,
"grad_norm": 5.712275397578137,
"learning_rate": 3.402868248486485e-07,
"loss": 0.5051,
"step": 2767
},
{
"epoch": 0.89,
"grad_norm": 9.48171144073477,
"learning_rate": 3.3841014713529184e-07,
"loss": 0.5432,
"step": 2768
},
{
"epoch": 0.89,
"grad_norm": 19.7958685386682,
"learning_rate": 3.3653847739395174e-07,
"loss": 0.616,
"step": 2769
},
{
"epoch": 0.89,
"grad_norm": 7.384955125014153,
"learning_rate": 3.346718176353747e-07,
"loss": 0.4825,
"step": 2770
},
{
"epoch": 0.89,
"grad_norm": 1.546675489425708,
"learning_rate": 3.3281016986492165e-07,
"loss": 0.2247,
"step": 2771
},
{
"epoch": 0.89,
"grad_norm": 7.383348616381246,
"learning_rate": 3.3095353608257385e-07,
"loss": 0.4587,
"step": 2772
},
{
"epoch": 0.89,
"grad_norm": 1.384415863590384,
"learning_rate": 3.2910191828292083e-07,
"loss": 0.1825,
"step": 2773
},
{
"epoch": 0.89,
"grad_norm": 12.185491676837916,
"learning_rate": 3.2725531845516744e-07,
"loss": 0.6282,
"step": 2774
},
{
"epoch": 0.89,
"grad_norm": 1.4675587976043334,
"learning_rate": 3.254137385831263e-07,
"loss": 0.2427,
"step": 2775
},
{
"epoch": 0.89,
"grad_norm": 7.1582183676316795,
"learning_rate": 3.2357718064521594e-07,
"loss": 0.562,
"step": 2776
},
{
"epoch": 0.89,
"grad_norm": 1.6649165418380594,
"learning_rate": 3.217456466144614e-07,
"loss": 0.1921,
"step": 2777
},
{
"epoch": 0.89,
"grad_norm": 8.367329099086248,
"learning_rate": 3.199191384584893e-07,
"loss": 0.5557,
"step": 2778
},
{
"epoch": 0.89,
"grad_norm": 9.522871424688466,
"learning_rate": 3.180976581395295e-07,
"loss": 0.4402,
"step": 2779
},
{
"epoch": 0.89,
"grad_norm": 6.946008058570308,
"learning_rate": 3.1628120761440616e-07,
"loss": 0.573,
"step": 2780
},
{
"epoch": 0.89,
"grad_norm": 5.949544153040697,
"learning_rate": 3.144697888345427e-07,
"loss": 0.5591,
"step": 2781
},
{
"epoch": 0.89,
"grad_norm": 1.563219206555618,
"learning_rate": 3.1266340374595693e-07,
"loss": 0.2009,
"step": 2782
},
{
"epoch": 0.89,
"grad_norm": 9.840995642921778,
"learning_rate": 3.108620542892593e-07,
"loss": 0.5893,
"step": 2783
},
{
"epoch": 0.89,
"grad_norm": 5.5277411461745105,
"learning_rate": 3.0906574239964795e-07,
"loss": 0.4025,
"step": 2784
},
{
"epoch": 0.89,
"grad_norm": 5.9312829901751485,
"learning_rate": 3.072744700069119e-07,
"loss": 0.4705,
"step": 2785
},
{
"epoch": 0.89,
"grad_norm": 1.4692101723238575,
"learning_rate": 3.054882390354241e-07,
"loss": 0.2353,
"step": 2786
},
{
"epoch": 0.89,
"grad_norm": 11.367735027012158,
"learning_rate": 3.0370705140414293e-07,
"loss": 0.4232,
"step": 2787
},
{
"epoch": 0.89,
"grad_norm": 7.789201059559417,
"learning_rate": 3.019309090266087e-07,
"loss": 0.6217,
"step": 2788
},
{
"epoch": 0.89,
"grad_norm": 1.3680402034081511,
"learning_rate": 3.0015981381094073e-07,
"loss": 0.1725,
"step": 2789
},
{
"epoch": 0.89,
"grad_norm": 8.714863976063446,
"learning_rate": 2.9839376765983583e-07,
"loss": 0.6529,
"step": 2790
},
{
"epoch": 0.89,
"grad_norm": 1.4995256520141007,
"learning_rate": 2.9663277247056923e-07,
"loss": 0.1802,
"step": 2791
},
{
"epoch": 0.89,
"grad_norm": 1.404499609913541,
"learning_rate": 2.9487683013498523e-07,
"loss": 0.2168,
"step": 2792
},
{
"epoch": 0.89,
"grad_norm": 6.106240227015083,
"learning_rate": 2.93125942539505e-07,
"loss": 0.4867,
"step": 2793
},
{
"epoch": 0.89,
"grad_norm": 5.292866306582783,
"learning_rate": 2.913801115651144e-07,
"loss": 0.4967,
"step": 2794
},
{
"epoch": 0.89,
"grad_norm": 5.731707102771506,
"learning_rate": 2.896393390873714e-07,
"loss": 0.5005,
"step": 2795
},
{
"epoch": 0.89,
"grad_norm": 1.538387808018448,
"learning_rate": 2.8790362697639685e-07,
"loss": 0.1781,
"step": 2796
},
{
"epoch": 0.9,
"grad_norm": 6.141995741207109,
"learning_rate": 2.8617297709687577e-07,
"loss": 0.5539,
"step": 2797
},
{
"epoch": 0.9,
"grad_norm": 1.766898595287886,
"learning_rate": 2.8444739130805587e-07,
"loss": 0.2714,
"step": 2798
},
{
"epoch": 0.9,
"grad_norm": 5.086068720235225,
"learning_rate": 2.827268714637421e-07,
"loss": 0.547,
"step": 2799
},
{
"epoch": 0.9,
"grad_norm": 1.5542713166653377,
"learning_rate": 2.810114194122998e-07,
"loss": 0.2125,
"step": 2800
},
{
"epoch": 0.9,
"grad_norm": 6.313678448452166,
"learning_rate": 2.793010369966487e-07,
"loss": 0.5274,
"step": 2801
},
{
"epoch": 0.9,
"grad_norm": 6.565773969501368,
"learning_rate": 2.7759572605426057e-07,
"loss": 0.6617,
"step": 2802
},
{
"epoch": 0.9,
"grad_norm": 6.368977047106074,
"learning_rate": 2.7589548841716274e-07,
"loss": 0.3688,
"step": 2803
},
{
"epoch": 0.9,
"grad_norm": 1.2415580407502018,
"learning_rate": 2.7420032591192856e-07,
"loss": 0.1499,
"step": 2804
},
{
"epoch": 0.9,
"grad_norm": 1.6188798679668956,
"learning_rate": 2.7251024035968134e-07,
"loss": 0.2165,
"step": 2805
},
{
"epoch": 0.9,
"grad_norm": 6.7213647514169805,
"learning_rate": 2.7082523357608856e-07,
"loss": 0.5418,
"step": 2806
},
{
"epoch": 0.9,
"grad_norm": 1.3919765447813295,
"learning_rate": 2.6914530737136346e-07,
"loss": 0.1883,
"step": 2807
},
{
"epoch": 0.9,
"grad_norm": 7.590480561051072,
"learning_rate": 2.674704635502584e-07,
"loss": 0.4917,
"step": 2808
},
{
"epoch": 0.9,
"grad_norm": 1.5450975740073691,
"learning_rate": 2.658007039120697e-07,
"loss": 0.2432,
"step": 2809
},
{
"epoch": 0.9,
"grad_norm": 6.150846024520127,
"learning_rate": 2.64136030250628e-07,
"loss": 0.4327,
"step": 2810
},
{
"epoch": 0.9,
"grad_norm": 6.517313862487519,
"learning_rate": 2.6247644435430263e-07,
"loss": 0.5871,
"step": 2811
},
{
"epoch": 0.9,
"grad_norm": 8.345085681229417,
"learning_rate": 2.6082194800599424e-07,
"loss": 0.6578,
"step": 2812
},
{
"epoch": 0.9,
"grad_norm": 1.558151080338734,
"learning_rate": 2.591725429831382e-07,
"loss": 0.2097,
"step": 2813
},
{
"epoch": 0.9,
"grad_norm": 1.5451625551303971,
"learning_rate": 2.57528231057701e-07,
"loss": 0.2227,
"step": 2814
},
{
"epoch": 0.9,
"grad_norm": 1.890145544796304,
"learning_rate": 2.558890139961745e-07,
"loss": 0.1953,
"step": 2815
},
{
"epoch": 0.9,
"grad_norm": 1.5366257554269054,
"learning_rate": 2.5425489355957956e-07,
"loss": 0.1981,
"step": 2816
},
{
"epoch": 0.9,
"grad_norm": 5.594092602461184,
"learning_rate": 2.526258715034602e-07,
"loss": 0.5163,
"step": 2817
},
{
"epoch": 0.9,
"grad_norm": 1.2872462268534564,
"learning_rate": 2.510019495778837e-07,
"loss": 0.1846,
"step": 2818
},
{
"epoch": 0.9,
"grad_norm": 1.4696999230227057,
"learning_rate": 2.4938312952744016e-07,
"loss": 0.2071,
"step": 2819
},
{
"epoch": 0.9,
"grad_norm": 5.3706244592045485,
"learning_rate": 2.477694130912356e-07,
"loss": 0.6052,
"step": 2820
},
{
"epoch": 0.9,
"grad_norm": 1.6093203291021525,
"learning_rate": 2.461608020028944e-07,
"loss": 0.1687,
"step": 2821
},
{
"epoch": 0.9,
"grad_norm": 6.522675308570204,
"learning_rate": 2.445572979905575e-07,
"loss": 0.5339,
"step": 2822
},
{
"epoch": 0.9,
"grad_norm": 7.055478940763568,
"learning_rate": 2.4295890277687695e-07,
"loss": 0.5388,
"step": 2823
},
{
"epoch": 0.9,
"grad_norm": 1.5084545161877478,
"learning_rate": 2.4136561807901916e-07,
"loss": 0.205,
"step": 2824
},
{
"epoch": 0.9,
"grad_norm": 1.7137511742906086,
"learning_rate": 2.397774456086577e-07,
"loss": 0.2395,
"step": 2825
},
{
"epoch": 0.9,
"grad_norm": 1.4841156326024527,
"learning_rate": 2.3819438707197495e-07,
"loss": 0.1946,
"step": 2826
},
{
"epoch": 0.9,
"grad_norm": 6.070685735254147,
"learning_rate": 2.3661644416966057e-07,
"loss": 0.6072,
"step": 2827
},
{
"epoch": 0.9,
"grad_norm": 8.512142589977733,
"learning_rate": 2.3504361859690628e-07,
"loss": 0.4394,
"step": 2828
},
{
"epoch": 0.91,
"grad_norm": 7.079742604152353,
"learning_rate": 2.3347591204340881e-07,
"loss": 0.5408,
"step": 2829
},
{
"epoch": 0.91,
"grad_norm": 11.829621193001326,
"learning_rate": 2.3191332619336204e-07,
"loss": 0.6588,
"step": 2830
},
{
"epoch": 0.91,
"grad_norm": 20.39111290338053,
"learning_rate": 2.3035586272546207e-07,
"loss": 0.5723,
"step": 2831
},
{
"epoch": 0.91,
"grad_norm": 1.4352689502580414,
"learning_rate": 2.2880352331290102e-07,
"loss": 0.1685,
"step": 2832
},
{
"epoch": 0.91,
"grad_norm": 5.10892056271425,
"learning_rate": 2.2725630962336542e-07,
"loss": 0.5223,
"step": 2833
},
{
"epoch": 0.91,
"grad_norm": 5.983565286314907,
"learning_rate": 2.2571422331903458e-07,
"loss": 0.6039,
"step": 2834
},
{
"epoch": 0.91,
"grad_norm": 4.629975595643532,
"learning_rate": 2.2417726605658164e-07,
"loss": 0.5059,
"step": 2835
},
{
"epoch": 0.91,
"grad_norm": 1.1957005697326524,
"learning_rate": 2.226454394871669e-07,
"loss": 0.1483,
"step": 2836
},
{
"epoch": 0.91,
"grad_norm": 1.392513449302165,
"learning_rate": 2.2111874525644228e-07,
"loss": 0.1925,
"step": 2837
},
{
"epoch": 0.91,
"grad_norm": 1.989833788465268,
"learning_rate": 2.1959718500454196e-07,
"loss": 0.2532,
"step": 2838
},
{
"epoch": 0.91,
"grad_norm": 6.223678565243159,
"learning_rate": 2.1808076036608783e-07,
"loss": 0.5268,
"step": 2839
},
{
"epoch": 0.91,
"grad_norm": 6.780954869756683,
"learning_rate": 2.165694729701834e-07,
"loss": 0.4708,
"step": 2840
},
{
"epoch": 0.91,
"grad_norm": 10.461543864853596,
"learning_rate": 2.1506332444041212e-07,
"loss": 0.5094,
"step": 2841
},
{
"epoch": 0.91,
"grad_norm": 7.972329492424538,
"learning_rate": 2.1356231639483917e-07,
"loss": 0.4595,
"step": 2842
},
{
"epoch": 0.91,
"grad_norm": 7.202713021400867,
"learning_rate": 2.1206645044600404e-07,
"loss": 0.5811,
"step": 2843
},
{
"epoch": 0.91,
"grad_norm": 8.93053592284642,
"learning_rate": 2.1057572820092576e-07,
"loss": 0.579,
"step": 2844
},
{
"epoch": 0.91,
"grad_norm": 1.5590482278159425,
"learning_rate": 2.0909015126109488e-07,
"loss": 0.1961,
"step": 2845
},
{
"epoch": 0.91,
"grad_norm": 1.4354444595115425,
"learning_rate": 2.0760972122247425e-07,
"loss": 0.1854,
"step": 2846
},
{
"epoch": 0.91,
"grad_norm": 1.426341737969202,
"learning_rate": 2.061344396754994e-07,
"loss": 0.1916,
"step": 2847
},
{
"epoch": 0.91,
"grad_norm": 6.850055342787105,
"learning_rate": 2.04664308205072e-07,
"loss": 0.4271,
"step": 2848
},
{
"epoch": 0.91,
"grad_norm": 7.668832901374139,
"learning_rate": 2.0319932839056365e-07,
"loss": 0.6114,
"step": 2849
},
{
"epoch": 0.91,
"grad_norm": 5.6236889361003,
"learning_rate": 2.0173950180581047e-07,
"loss": 0.5771,
"step": 2850
},
{
"epoch": 0.91,
"grad_norm": 6.235638019635725,
"learning_rate": 2.002848300191118e-07,
"loss": 0.5185,
"step": 2851
},
{
"epoch": 0.91,
"grad_norm": 6.138322497141576,
"learning_rate": 1.988353145932298e-07,
"loss": 0.5961,
"step": 2852
},
{
"epoch": 0.91,
"grad_norm": 1.4757457669273937,
"learning_rate": 1.9739095708538714e-07,
"loss": 0.1807,
"step": 2853
},
{
"epoch": 0.91,
"grad_norm": 1.7252598905388794,
"learning_rate": 1.9595175904726481e-07,
"loss": 0.2329,
"step": 2854
},
{
"epoch": 0.91,
"grad_norm": 27.86582099478461,
"learning_rate": 1.9451772202500163e-07,
"loss": 0.5558,
"step": 2855
},
{
"epoch": 0.91,
"grad_norm": 1.5933301652280694,
"learning_rate": 1.9308884755919132e-07,
"loss": 0.1907,
"step": 2856
},
{
"epoch": 0.91,
"grad_norm": 7.4008487216339764,
"learning_rate": 1.9166513718488155e-07,
"loss": 0.4857,
"step": 2857
},
{
"epoch": 0.91,
"grad_norm": 5.879695016121743,
"learning_rate": 1.902465924315733e-07,
"loss": 0.5451,
"step": 2858
},
{
"epoch": 0.91,
"grad_norm": 8.25822028400388,
"learning_rate": 1.8883321482321583e-07,
"loss": 0.615,
"step": 2859
},
{
"epoch": 0.92,
"grad_norm": 10.175180891593763,
"learning_rate": 1.8742500587820955e-07,
"loss": 0.567,
"step": 2860
},
{
"epoch": 0.92,
"grad_norm": 1.5893833108605107,
"learning_rate": 1.86021967109401e-07,
"loss": 0.1981,
"step": 2861
},
{
"epoch": 0.92,
"grad_norm": 1.3030970949694585,
"learning_rate": 1.8462410002408228e-07,
"loss": 0.1716,
"step": 2862
},
{
"epoch": 0.92,
"grad_norm": 1.5856352844453432,
"learning_rate": 1.8323140612399038e-07,
"loss": 0.1863,
"step": 2863
},
{
"epoch": 0.92,
"grad_norm": 1.5339491261603877,
"learning_rate": 1.8184388690530242e-07,
"loss": 0.1794,
"step": 2864
},
{
"epoch": 0.92,
"grad_norm": 1.7042141915449476,
"learning_rate": 1.804615438586399e-07,
"loss": 0.2322,
"step": 2865
},
{
"epoch": 0.92,
"grad_norm": 8.131326556983366,
"learning_rate": 1.7908437846906158e-07,
"loss": 0.6032,
"step": 2866
},
{
"epoch": 0.92,
"grad_norm": 7.562849471737348,
"learning_rate": 1.7771239221606285e-07,
"loss": 0.6066,
"step": 2867
},
{
"epoch": 0.92,
"grad_norm": 1.384289973847297,
"learning_rate": 1.7634558657357748e-07,
"loss": 0.1973,
"step": 2868
},
{
"epoch": 0.92,
"grad_norm": 10.089983241935549,
"learning_rate": 1.7498396300997146e-07,
"loss": 0.5804,
"step": 2869
},
{
"epoch": 0.92,
"grad_norm": 1.6044307892692768,
"learning_rate": 1.736275229880441e-07,
"loss": 0.2389,
"step": 2870
},
{
"epoch": 0.92,
"grad_norm": 7.241276701968791,
"learning_rate": 1.7227626796502807e-07,
"loss": 0.4521,
"step": 2871
},
{
"epoch": 0.92,
"grad_norm": 1.6115718489535953,
"learning_rate": 1.7093019939258327e-07,
"loss": 0.1495,
"step": 2872
},
{
"epoch": 0.92,
"grad_norm": 5.819339971265935,
"learning_rate": 1.6958931871679908e-07,
"loss": 0.5626,
"step": 2873
},
{
"epoch": 0.92,
"grad_norm": 1.4566726170315303,
"learning_rate": 1.6825362737818985e-07,
"loss": 0.2031,
"step": 2874
},
{
"epoch": 0.92,
"grad_norm": 5.250363755381174,
"learning_rate": 1.6692312681169775e-07,
"loss": 0.7009,
"step": 2875
},
{
"epoch": 0.92,
"grad_norm": 4.456766188220367,
"learning_rate": 1.6559781844668666e-07,
"loss": 0.4214,
"step": 2876
},
{
"epoch": 0.92,
"grad_norm": 7.7821418717005235,
"learning_rate": 1.6427770370694208e-07,
"loss": 0.5192,
"step": 2877
},
{
"epoch": 0.92,
"grad_norm": 1.6356190170377005,
"learning_rate": 1.6296278401067122e-07,
"loss": 0.1901,
"step": 2878
},
{
"epoch": 0.92,
"grad_norm": 1.7066681565007213,
"learning_rate": 1.6165306077049969e-07,
"loss": 0.2045,
"step": 2879
},
{
"epoch": 0.92,
"grad_norm": 6.874786697193116,
"learning_rate": 1.603485353934703e-07,
"loss": 0.5165,
"step": 2880
},
{
"epoch": 0.92,
"grad_norm": 6.215627299105615,
"learning_rate": 1.5904920928104196e-07,
"loss": 0.6417,
"step": 2881
},
{
"epoch": 0.92,
"grad_norm": 23.282797246982042,
"learning_rate": 1.577550838290881e-07,
"loss": 0.5461,
"step": 2882
},
{
"epoch": 0.92,
"grad_norm": 10.936842811991326,
"learning_rate": 1.564661604278944e-07,
"loss": 0.5793,
"step": 2883
},
{
"epoch": 0.92,
"grad_norm": 7.105473846049208,
"learning_rate": 1.5518244046215936e-07,
"loss": 0.5209,
"step": 2884
},
{
"epoch": 0.92,
"grad_norm": 1.6291138445685007,
"learning_rate": 1.539039253109892e-07,
"loss": 0.2067,
"step": 2885
},
{
"epoch": 0.92,
"grad_norm": 1.5330733203641582,
"learning_rate": 1.526306163479019e-07,
"loss": 0.1849,
"step": 2886
},
{
"epoch": 0.92,
"grad_norm": 8.577477788398626,
"learning_rate": 1.5136251494081822e-07,
"loss": 0.7031,
"step": 2887
},
{
"epoch": 0.92,
"grad_norm": 17.50367381791444,
"learning_rate": 1.5009962245206845e-07,
"loss": 0.5257,
"step": 2888
},
{
"epoch": 0.92,
"grad_norm": 6.824661403341482,
"learning_rate": 1.488419402383834e-07,
"loss": 0.5009,
"step": 2889
},
{
"epoch": 0.92,
"grad_norm": 7.750865837232404,
"learning_rate": 1.4758946965089894e-07,
"loss": 0.6712,
"step": 2890
},
{
"epoch": 0.93,
"grad_norm": 1.495357227840504,
"learning_rate": 1.4634221203515097e-07,
"loss": 0.2044,
"step": 2891
},
{
"epoch": 0.93,
"grad_norm": 1.6973063844464513,
"learning_rate": 1.4510016873107657e-07,
"loss": 0.1886,
"step": 2892
},
{
"epoch": 0.93,
"grad_norm": 1.4555547735917695,
"learning_rate": 1.4386334107300727e-07,
"loss": 0.1951,
"step": 2893
},
{
"epoch": 0.93,
"grad_norm": 6.284820189802007,
"learning_rate": 1.4263173038967627e-07,
"loss": 0.5042,
"step": 2894
},
{
"epoch": 0.93,
"grad_norm": 13.227742098011161,
"learning_rate": 1.4140533800420853e-07,
"loss": 0.6508,
"step": 2895
},
{
"epoch": 0.93,
"grad_norm": 88.15415177230122,
"learning_rate": 1.401841652341246e-07,
"loss": 0.6007,
"step": 2896
},
{
"epoch": 0.93,
"grad_norm": 5.796768955970198,
"learning_rate": 1.389682133913378e-07,
"loss": 0.7432,
"step": 2897
},
{
"epoch": 0.93,
"grad_norm": 6.879847456235132,
"learning_rate": 1.3775748378215047e-07,
"loss": 0.6708,
"step": 2898
},
{
"epoch": 0.93,
"grad_norm": 4.851883532350419,
"learning_rate": 1.3655197770725826e-07,
"loss": 0.3758,
"step": 2899
},
{
"epoch": 0.93,
"grad_norm": 9.977095689739897,
"learning_rate": 1.3535169646174073e-07,
"loss": 0.4572,
"step": 2900
},
{
"epoch": 0.93,
"grad_norm": 6.6640551359882725,
"learning_rate": 1.3415664133506812e-07,
"loss": 0.5807,
"step": 2901
},
{
"epoch": 0.93,
"grad_norm": 6.526859432151342,
"learning_rate": 1.3296681361109564e-07,
"loss": 0.4813,
"step": 2902
},
{
"epoch": 0.93,
"grad_norm": 1.3440258772018676,
"learning_rate": 1.3178221456806028e-07,
"loss": 0.1984,
"step": 2903
},
{
"epoch": 0.93,
"grad_norm": 1.4401183304907947,
"learning_rate": 1.3060284547858403e-07,
"loss": 0.208,
"step": 2904
},
{
"epoch": 0.93,
"grad_norm": 6.329726556651034,
"learning_rate": 1.2942870760966952e-07,
"loss": 0.5821,
"step": 2905
},
{
"epoch": 0.93,
"grad_norm": 5.094978910867227,
"learning_rate": 1.282598022226994e-07,
"loss": 0.5231,
"step": 2906
},
{
"epoch": 0.93,
"grad_norm": 1.4060308590002704,
"learning_rate": 1.270961305734364e-07,
"loss": 0.2206,
"step": 2907
},
{
"epoch": 0.93,
"grad_norm": 1.644107385283967,
"learning_rate": 1.2593769391201827e-07,
"loss": 0.2131,
"step": 2908
},
{
"epoch": 0.93,
"grad_norm": 1.8235233450387875,
"learning_rate": 1.247844934829606e-07,
"loss": 0.1971,
"step": 2909
},
{
"epoch": 0.93,
"grad_norm": 6.718301175320187,
"learning_rate": 1.2363653052515302e-07,
"loss": 0.4507,
"step": 2910
},
{
"epoch": 0.93,
"grad_norm": 1.676197969737339,
"learning_rate": 1.2249380627185781e-07,
"loss": 0.1956,
"step": 2911
},
{
"epoch": 0.93,
"grad_norm": 8.67306685375281,
"learning_rate": 1.2135632195071133e-07,
"loss": 0.5648,
"step": 2912
},
{
"epoch": 0.93,
"grad_norm": 1.594319802332187,
"learning_rate": 1.202240787837178e-07,
"loss": 0.2167,
"step": 2913
},
{
"epoch": 0.93,
"grad_norm": 6.7491465730094395,
"learning_rate": 1.1909707798725412e-07,
"loss": 0.6679,
"step": 2914
},
{
"epoch": 0.93,
"grad_norm": 1.7309112303750012,
"learning_rate": 1.1797532077206187e-07,
"loss": 0.2096,
"step": 2915
},
{
"epoch": 0.93,
"grad_norm": 1.3304986946071597,
"learning_rate": 1.1685880834325203e-07,
"loss": 0.16,
"step": 2916
},
{
"epoch": 0.93,
"grad_norm": 1.3626329682508962,
"learning_rate": 1.1574754190030014e-07,
"loss": 0.1922,
"step": 2917
},
{
"epoch": 0.93,
"grad_norm": 7.166258200913389,
"learning_rate": 1.1464152263704565e-07,
"loss": 0.6461,
"step": 2918
},
{
"epoch": 0.93,
"grad_norm": 1.3522158691801092,
"learning_rate": 1.1354075174169088e-07,
"loss": 0.1929,
"step": 2919
},
{
"epoch": 0.93,
"grad_norm": 1.6023939170404724,
"learning_rate": 1.12445230396801e-07,
"loss": 0.2059,
"step": 2920
},
{
"epoch": 0.93,
"grad_norm": 6.090268642091465,
"learning_rate": 1.1135495977930011e-07,
"loss": 0.5342,
"step": 2921
},
{
"epoch": 0.94,
"grad_norm": 1.5883808411565767,
"learning_rate": 1.1026994106047296e-07,
"loss": 0.1993,
"step": 2922
},
{
"epoch": 0.94,
"grad_norm": 1.6860915572927744,
"learning_rate": 1.0919017540595933e-07,
"loss": 0.1826,
"step": 2923
},
{
"epoch": 0.94,
"grad_norm": 8.721284754711782,
"learning_rate": 1.0811566397575912e-07,
"loss": 0.358,
"step": 2924
},
{
"epoch": 0.94,
"grad_norm": 3.935675864876987,
"learning_rate": 1.0704640792422616e-07,
"loss": 0.4267,
"step": 2925
},
{
"epoch": 0.94,
"grad_norm": 8.056284718040663,
"learning_rate": 1.0598240840006658e-07,
"loss": 0.5781,
"step": 2926
},
{
"epoch": 0.94,
"grad_norm": 6.63408809117204,
"learning_rate": 1.0492366654634211e-07,
"loss": 0.4878,
"step": 2927
},
{
"epoch": 0.94,
"grad_norm": 1.3656309333900636,
"learning_rate": 1.0387018350046519e-07,
"loss": 0.2072,
"step": 2928
},
{
"epoch": 0.94,
"grad_norm": 6.951365718753954,
"learning_rate": 1.0282196039419823e-07,
"loss": 0.5279,
"step": 2929
},
{
"epoch": 0.94,
"grad_norm": 1.4346216121931041,
"learning_rate": 1.0177899835365323e-07,
"loss": 0.202,
"step": 2930
},
{
"epoch": 0.94,
"grad_norm": 1.6888835421357935,
"learning_rate": 1.0074129849928948e-07,
"loss": 0.2275,
"step": 2931
},
{
"epoch": 0.94,
"grad_norm": 1.4423057325247473,
"learning_rate": 9.970886194591467e-08,
"loss": 0.1978,
"step": 2932
},
{
"epoch": 0.94,
"grad_norm": 1.4267009542011637,
"learning_rate": 9.8681689802681e-08,
"loss": 0.1821,
"step": 2933
},
{
"epoch": 0.94,
"grad_norm": 4.973516740039685,
"learning_rate": 9.765978317308522e-08,
"loss": 0.4752,
"step": 2934
},
{
"epoch": 0.94,
"grad_norm": 4.725959405022734,
"learning_rate": 9.664314315496692e-08,
"loss": 0.503,
"step": 2935
},
{
"epoch": 0.94,
"grad_norm": 1.4000403717757461,
"learning_rate": 9.5631770840508e-08,
"loss": 0.2098,
"step": 2936
},
{
"epoch": 0.94,
"grad_norm": 1.512264392172657,
"learning_rate": 9.462566731623213e-08,
"loss": 0.1908,
"step": 2937
},
{
"epoch": 0.94,
"grad_norm": 6.575995188879126,
"learning_rate": 9.36248336630019e-08,
"loss": 0.5007,
"step": 2938
},
{
"epoch": 0.94,
"grad_norm": 1.5617906757502826,
"learning_rate": 9.262927095601782e-08,
"loss": 0.2095,
"step": 2939
},
{
"epoch": 0.94,
"grad_norm": 1.6751730406524747,
"learning_rate": 9.163898026481876e-08,
"loss": 0.2249,
"step": 2940
},
{
"epoch": 0.94,
"grad_norm": 9.041493149958258,
"learning_rate": 9.065396265327986e-08,
"loss": 0.4572,
"step": 2941
},
{
"epoch": 0.94,
"grad_norm": 1.4750842071035337,
"learning_rate": 8.967421917961072e-08,
"loss": 0.1997,
"step": 2942
},
{
"epoch": 0.94,
"grad_norm": 8.271970625422963,
"learning_rate": 8.869975089635552e-08,
"loss": 0.4652,
"step": 2943
},
{
"epoch": 0.94,
"grad_norm": 1.3942467670470393,
"learning_rate": 8.773055885039072e-08,
"loss": 0.1652,
"step": 2944
},
{
"epoch": 0.94,
"grad_norm": 6.763267989712541,
"learning_rate": 8.676664408292457e-08,
"loss": 0.5331,
"step": 2945
},
{
"epoch": 0.94,
"grad_norm": 5.2386080492282,
"learning_rate": 8.580800762949704e-08,
"loss": 0.5473,
"step": 2946
},
{
"epoch": 0.94,
"grad_norm": 1.6535396390189872,
"learning_rate": 8.485465051997488e-08,
"loss": 0.2036,
"step": 2947
},
{
"epoch": 0.94,
"grad_norm": 1.5508936364876473,
"learning_rate": 8.39065737785566e-08,
"loss": 0.1836,
"step": 2948
},
{
"epoch": 0.94,
"grad_norm": 4.588146624149047,
"learning_rate": 8.296377842376524e-08,
"loss": 0.5541,
"step": 2949
},
{
"epoch": 0.94,
"grad_norm": 1.6431556861222913,
"learning_rate": 8.202626546845172e-08,
"loss": 0.2131,
"step": 2950
},
{
"epoch": 0.94,
"grad_norm": 6.7805094436163085,
"learning_rate": 8.109403591979148e-08,
"loss": 0.5631,
"step": 2951
},
{
"epoch": 0.94,
"grad_norm": 9.769084565696684,
"learning_rate": 8.016709077928397e-08,
"loss": 0.5768,
"step": 2952
},
{
"epoch": 0.94,
"grad_norm": 8.357217432789115,
"learning_rate": 7.924543104275095e-08,
"loss": 0.4799,
"step": 2953
},
{
"epoch": 0.95,
"grad_norm": 1.297962053997005,
"learning_rate": 7.832905770033705e-08,
"loss": 0.1345,
"step": 2954
},
{
"epoch": 0.95,
"grad_norm": 1.5274934813053564,
"learning_rate": 7.7417971736507e-08,
"loss": 0.2046,
"step": 2955
},
{
"epoch": 0.95,
"grad_norm": 13.267871007882906,
"learning_rate": 7.651217413004674e-08,
"loss": 0.5781,
"step": 2956
},
{
"epoch": 0.95,
"grad_norm": 7.938808149163866,
"learning_rate": 7.561166585405789e-08,
"loss": 0.4829,
"step": 2957
},
{
"epoch": 0.95,
"grad_norm": 1.4567831332091496,
"learning_rate": 7.47164478759621e-08,
"loss": 0.1964,
"step": 2958
},
{
"epoch": 0.95,
"grad_norm": 5.838862693988253,
"learning_rate": 7.382652115749789e-08,
"loss": 0.5398,
"step": 2959
},
{
"epoch": 0.95,
"grad_norm": 1.5771838835063452,
"learning_rate": 7.294188665471769e-08,
"loss": 0.1818,
"step": 2960
},
{
"epoch": 0.95,
"grad_norm": 1.430463795467331,
"learning_rate": 7.206254531799018e-08,
"loss": 0.1971,
"step": 2961
},
{
"epoch": 0.95,
"grad_norm": 1.4473265007418736,
"learning_rate": 7.118849809199524e-08,
"loss": 0.1747,
"step": 2962
},
{
"epoch": 0.95,
"grad_norm": 7.001770352559234,
"learning_rate": 7.031974591572732e-08,
"loss": 0.5199,
"step": 2963
},
{
"epoch": 0.95,
"grad_norm": 1.4584409439658936,
"learning_rate": 6.945628972249208e-08,
"loss": 0.1997,
"step": 2964
},
{
"epoch": 0.95,
"grad_norm": 1.5603112849189318,
"learning_rate": 6.859813043990526e-08,
"loss": 0.2087,
"step": 2965
},
{
"epoch": 0.95,
"grad_norm": 1.5989705437486081,
"learning_rate": 6.77452689898922e-08,
"loss": 0.2121,
"step": 2966
},
{
"epoch": 0.95,
"grad_norm": 1.5299259420967457,
"learning_rate": 6.689770628868609e-08,
"loss": 0.2284,
"step": 2967
},
{
"epoch": 0.95,
"grad_norm": 5.453348824888012,
"learning_rate": 6.605544324682855e-08,
"loss": 0.4395,
"step": 2968
},
{
"epoch": 0.95,
"grad_norm": 1.5849018751349726,
"learning_rate": 6.521848076916859e-08,
"loss": 0.2086,
"step": 2969
},
{
"epoch": 0.95,
"grad_norm": 1.5379039372042327,
"learning_rate": 6.438681975485805e-08,
"loss": 0.215,
"step": 2970
},
{
"epoch": 0.95,
"grad_norm": 11.850872203494525,
"learning_rate": 6.356046109735614e-08,
"loss": 0.6073,
"step": 2971
},
{
"epoch": 0.95,
"grad_norm": 1.6067848040383592,
"learning_rate": 6.273940568442327e-08,
"loss": 0.1977,
"step": 2972
},
{
"epoch": 0.95,
"grad_norm": 1.399031388079973,
"learning_rate": 6.192365439812553e-08,
"loss": 0.1573,
"step": 2973
},
{
"epoch": 0.95,
"grad_norm": 6.356556169923487,
"learning_rate": 6.111320811482802e-08,
"loss": 0.463,
"step": 2974
},
{
"epoch": 0.95,
"grad_norm": 1.6608622010185468,
"learning_rate": 6.030806770519815e-08,
"loss": 0.1943,
"step": 2975
},
{
"epoch": 0.95,
"grad_norm": 1.3672535592434387,
"learning_rate": 5.9508234034202364e-08,
"loss": 0.1661,
"step": 2976
},
{
"epoch": 0.95,
"grad_norm": 1.6686974930318208,
"learning_rate": 5.871370796110665e-08,
"loss": 0.193,
"step": 2977
},
{
"epoch": 0.95,
"grad_norm": 7.863053421390452,
"learning_rate": 5.7924490339474335e-08,
"loss": 0.5465,
"step": 2978
},
{
"epoch": 0.95,
"grad_norm": 5.980751998759158,
"learning_rate": 5.7140582017167764e-08,
"loss": 0.5738,
"step": 2979
},
{
"epoch": 0.95,
"grad_norm": 1.7375085215686261,
"learning_rate": 5.636198383634217e-08,
"loss": 0.1701,
"step": 2980
},
{
"epoch": 0.95,
"grad_norm": 1.7604058039303363,
"learning_rate": 5.558869663345123e-08,
"loss": 0.246,
"step": 2981
},
{
"epoch": 0.95,
"grad_norm": 1.5834894119079832,
"learning_rate": 5.482072123924098e-08,
"loss": 0.1968,
"step": 2982
},
{
"epoch": 0.95,
"grad_norm": 1.4063289344126568,
"learning_rate": 5.405805847875256e-08,
"loss": 0.1884,
"step": 2983
},
{
"epoch": 0.95,
"grad_norm": 5.697631443303541,
"learning_rate": 5.330070917131724e-08,
"loss": 0.527,
"step": 2984
},
{
"epoch": 0.96,
"grad_norm": 6.388984077995515,
"learning_rate": 5.2548674130561974e-08,
"loss": 0.3975,
"step": 2985
},
{
"epoch": 0.96,
"grad_norm": 1.5987057012740025,
"learning_rate": 5.1801954164399925e-08,
"loss": 0.2432,
"step": 2986
},
{
"epoch": 0.96,
"grad_norm": 5.812013938920401,
"learning_rate": 5.106055007503774e-08,
"loss": 0.4925,
"step": 2987
},
{
"epoch": 0.96,
"grad_norm": 1.5797247671065668,
"learning_rate": 5.0324462658969395e-08,
"loss": 0.2229,
"step": 2988
},
{
"epoch": 0.96,
"grad_norm": 1.664656061602779,
"learning_rate": 4.959369270697789e-08,
"loss": 0.2654,
"step": 2989
},
{
"epoch": 0.96,
"grad_norm": 8.105206792904527,
"learning_rate": 4.886824100413412e-08,
"loss": 0.7076,
"step": 2990
},
{
"epoch": 0.96,
"grad_norm": 23.236434026650407,
"learning_rate": 4.814810832979411e-08,
"loss": 0.6483,
"step": 2991
},
{
"epoch": 0.96,
"grad_norm": 5.308885397734057,
"learning_rate": 4.743329545760122e-08,
"loss": 0.5117,
"step": 2992
},
{
"epoch": 0.96,
"grad_norm": 14.907848429189544,
"learning_rate": 4.67238031554812e-08,
"loss": 0.6733,
"step": 2993
},
{
"epoch": 0.96,
"grad_norm": 6.4774314255112,
"learning_rate": 4.6019632185647645e-08,
"loss": 0.5325,
"step": 2994
},
{
"epoch": 0.96,
"grad_norm": 6.644308814873751,
"learning_rate": 4.532078330459433e-08,
"loss": 0.575,
"step": 2995
},
{
"epoch": 0.96,
"grad_norm": 1.6465106542055439,
"learning_rate": 4.4627257263098465e-08,
"loss": 0.175,
"step": 2996
},
{
"epoch": 0.96,
"grad_norm": 6.993332934686698,
"learning_rate": 4.393905480621907e-08,
"loss": 0.464,
"step": 2997
},
{
"epoch": 0.96,
"grad_norm": 1.4112580103582617,
"learning_rate": 4.3256176673295846e-08,
"loss": 0.1727,
"step": 2998
},
{
"epoch": 0.96,
"grad_norm": 7.449952228864845,
"learning_rate": 4.2578623597949174e-08,
"loss": 0.5289,
"step": 2999
},
{
"epoch": 0.96,
"grad_norm": 6.983314068003288,
"learning_rate": 4.1906396308077356e-08,
"loss": 0.4838,
"step": 3000
},
{
"epoch": 0.96,
"grad_norm": 1.420455900288509,
"learning_rate": 4.123949552585826e-08,
"loss": 0.2371,
"step": 3001
},
{
"epoch": 0.96,
"grad_norm": 5.960526883885027,
"learning_rate": 4.0577921967747126e-08,
"loss": 0.3775,
"step": 3002
},
{
"epoch": 0.96,
"grad_norm": 1.5666246212700372,
"learning_rate": 3.9921676344475966e-08,
"loss": 0.2083,
"step": 3003
},
{
"epoch": 0.96,
"grad_norm": 1.4196458859952457,
"learning_rate": 3.927075936105307e-08,
"loss": 0.1898,
"step": 3004
},
{
"epoch": 0.96,
"grad_norm": 10.699323728753926,
"learning_rate": 3.8625171716762385e-08,
"loss": 0.5733,
"step": 3005
},
{
"epoch": 0.96,
"grad_norm": 5.490490638625815,
"learning_rate": 3.7984914105162474e-08,
"loss": 0.3556,
"step": 3006
},
{
"epoch": 0.96,
"grad_norm": 1.7482767819557443,
"learning_rate": 3.7349987214084784e-08,
"loss": 0.1785,
"step": 3007
},
{
"epoch": 0.96,
"grad_norm": 1.4680399434949956,
"learning_rate": 3.672039172563646e-08,
"loss": 0.2078,
"step": 3008
},
{
"epoch": 0.96,
"grad_norm": 1.4830412849161663,
"learning_rate": 3.609612831619369e-08,
"loss": 0.1598,
"step": 3009
},
{
"epoch": 0.96,
"grad_norm": 14.456193489536421,
"learning_rate": 3.547719765640778e-08,
"loss": 0.5513,
"step": 3010
},
{
"epoch": 0.96,
"grad_norm": 6.669946189400554,
"learning_rate": 3.4863600411197404e-08,
"loss": 0.5237,
"step": 3011
},
{
"epoch": 0.96,
"grad_norm": 1.3913977980338987,
"learning_rate": 3.425533723975527e-08,
"loss": 0.1985,
"step": 3012
},
{
"epoch": 0.96,
"grad_norm": 1.519397239367645,
"learning_rate": 3.365240879554144e-08,
"loss": 0.2387,
"step": 3013
},
{
"epoch": 0.96,
"grad_norm": 1.5463128250860247,
"learning_rate": 3.3054815726285e-08,
"loss": 0.1895,
"step": 3014
},
{
"epoch": 0.96,
"grad_norm": 6.35177466171103,
"learning_rate": 3.2462558673983516e-08,
"loss": 0.4928,
"step": 3015
},
{
"epoch": 0.97,
"grad_norm": 7.576803306496446,
"learning_rate": 3.1875638274902476e-08,
"loss": 0.5856,
"step": 3016
},
{
"epoch": 0.97,
"grad_norm": 1.4257716411064958,
"learning_rate": 3.129405515957307e-08,
"loss": 0.1759,
"step": 3017
},
{
"epoch": 0.97,
"grad_norm": 1.6215858795651334,
"learning_rate": 3.071780995279439e-08,
"loss": 0.252,
"step": 3018
},
{
"epoch": 0.97,
"grad_norm": 5.048807746845461,
"learning_rate": 3.014690327362846e-08,
"loss": 0.438,
"step": 3019
},
{
"epoch": 0.97,
"grad_norm": 8.851945826700035,
"learning_rate": 2.9581335735404672e-08,
"loss": 0.5805,
"step": 3020
},
{
"epoch": 0.97,
"grad_norm": 1.4454792312951033,
"learning_rate": 2.9021107945714777e-08,
"loss": 0.1805,
"step": 3021
},
{
"epoch": 0.97,
"grad_norm": 1.4381068991020554,
"learning_rate": 2.8466220506414565e-08,
"loss": 0.1667,
"step": 3022
},
{
"epoch": 0.97,
"grad_norm": 1.6272251766125279,
"learning_rate": 2.79166740136233e-08,
"loss": 0.2039,
"step": 3023
},
{
"epoch": 0.97,
"grad_norm": 1.3805092466249094,
"learning_rate": 2.7372469057721506e-08,
"loss": 0.1887,
"step": 3024
},
{
"epoch": 0.97,
"grad_norm": 1.5745020096815718,
"learning_rate": 2.6833606223351515e-08,
"loss": 0.2059,
"step": 3025
},
{
"epoch": 0.97,
"grad_norm": 7.362129018288768,
"learning_rate": 2.6300086089416366e-08,
"loss": 0.6999,
"step": 3026
},
{
"epoch": 0.97,
"grad_norm": 1.774494427703464,
"learning_rate": 2.577190922908035e-08,
"loss": 0.2284,
"step": 3027
},
{
"epoch": 0.97,
"grad_norm": 1.5132740417702437,
"learning_rate": 2.5249076209767353e-08,
"loss": 0.2053,
"step": 3028
},
{
"epoch": 0.97,
"grad_norm": 1.7297296384375203,
"learning_rate": 2.473158759315808e-08,
"loss": 0.1827,
"step": 3029
},
{
"epoch": 0.97,
"grad_norm": 5.490226694397722,
"learning_rate": 2.421944393519504e-08,
"loss": 0.4102,
"step": 3030
},
{
"epoch": 0.97,
"grad_norm": 1.559266414410157,
"learning_rate": 2.3712645786075905e-08,
"loss": 0.1818,
"step": 3031
},
{
"epoch": 0.97,
"grad_norm": 10.133659475015277,
"learning_rate": 2.3211193690257373e-08,
"loss": 0.499,
"step": 3032
},
{
"epoch": 0.97,
"grad_norm": 5.3456419932725066,
"learning_rate": 2.271508818645185e-08,
"loss": 0.3714,
"step": 3033
},
{
"epoch": 0.97,
"grad_norm": 1.397147772347432,
"learning_rate": 2.222432980762912e-08,
"loss": 0.1636,
"step": 3034
},
{
"epoch": 0.97,
"grad_norm": 1.526139789617029,
"learning_rate": 2.1738919081012446e-08,
"loss": 0.2318,
"step": 3035
},
{
"epoch": 0.97,
"grad_norm": 1.5863043449471008,
"learning_rate": 2.1258856528081906e-08,
"loss": 0.1641,
"step": 3036
},
{
"epoch": 0.97,
"grad_norm": 1.8684179072745595,
"learning_rate": 2.0784142664571626e-08,
"loss": 0.2071,
"step": 3037
},
{
"epoch": 0.97,
"grad_norm": 11.788992003108227,
"learning_rate": 2.031477800046866e-08,
"loss": 0.5973,
"step": 3038
},
{
"epoch": 0.97,
"grad_norm": 7.604835128526027,
"learning_rate": 1.9850763040014654e-08,
"loss": 0.5794,
"step": 3039
},
{
"epoch": 0.97,
"grad_norm": 8.514897010317375,
"learning_rate": 1.939209828170363e-08,
"loss": 0.461,
"step": 3040
},
{
"epoch": 0.97,
"grad_norm": 10.669757021023933,
"learning_rate": 1.8938784218281435e-08,
"loss": 0.7521,
"step": 3041
},
{
"epoch": 0.97,
"grad_norm": 1.5168614554290425,
"learning_rate": 1.849082133674518e-08,
"loss": 0.1889,
"step": 3042
},
{
"epoch": 0.97,
"grad_norm": 4.147775107296752,
"learning_rate": 1.80482101183449e-08,
"loss": 0.3864,
"step": 3043
},
{
"epoch": 0.97,
"grad_norm": 9.244428175932155,
"learning_rate": 1.761095103858024e-08,
"loss": 0.5296,
"step": 3044
},
{
"epoch": 0.97,
"grad_norm": 7.505218349293431,
"learning_rate": 1.717904456720043e-08,
"loss": 0.6889,
"step": 3045
},
{
"epoch": 0.97,
"grad_norm": 1.352085160477461,
"learning_rate": 1.675249116820543e-08,
"loss": 0.1641,
"step": 3046
},
{
"epoch": 0.98,
"grad_norm": 1.299654688631552,
"learning_rate": 1.6331291299844233e-08,
"loss": 0.1653,
"step": 3047
},
{
"epoch": 0.98,
"grad_norm": 1.5684991611977552,
"learning_rate": 1.5915445414613208e-08,
"loss": 0.2617,
"step": 3048
},
{
"epoch": 0.98,
"grad_norm": 1.4482670914575277,
"learning_rate": 1.550495395925944e-08,
"loss": 0.1941,
"step": 3049
},
{
"epoch": 0.98,
"grad_norm": 7.256742985265447,
"learning_rate": 1.5099817374774615e-08,
"loss": 0.548,
"step": 3050
},
{
"epoch": 0.98,
"grad_norm": 7.699936238621277,
"learning_rate": 1.4700036096400028e-08,
"loss": 0.5615,
"step": 3051
},
{
"epoch": 0.98,
"grad_norm": 9.119117485970706,
"learning_rate": 1.4305610553623228e-08,
"loss": 0.6034,
"step": 3052
},
{
"epoch": 0.98,
"grad_norm": 1.6421789152262496,
"learning_rate": 1.3916541170176934e-08,
"loss": 0.176,
"step": 3053
},
{
"epoch": 0.98,
"grad_norm": 1.3522274306030595,
"learning_rate": 1.3532828364041239e-08,
"loss": 0.194,
"step": 3054
},
{
"epoch": 0.98,
"grad_norm": 1.6474357079295912,
"learning_rate": 1.3154472547440289e-08,
"loss": 0.1873,
"step": 3055
},
{
"epoch": 0.98,
"grad_norm": 5.44204046323545,
"learning_rate": 1.2781474126845051e-08,
"loss": 0.6603,
"step": 3056
},
{
"epoch": 0.98,
"grad_norm": 1.6273549373967076,
"learning_rate": 1.241383350296832e-08,
"loss": 0.1915,
"step": 3057
},
{
"epoch": 0.98,
"grad_norm": 6.286750400189176,
"learning_rate": 1.2051551070769719e-08,
"loss": 0.4309,
"step": 3058
},
{
"epoch": 0.98,
"grad_norm": 5.135787208051596,
"learning_rate": 1.1694627219450694e-08,
"loss": 0.4062,
"step": 3059
},
{
"epoch": 0.98,
"grad_norm": 7.165414563360992,
"learning_rate": 1.134306233245619e-08,
"loss": 0.5891,
"step": 3060
},
{
"epoch": 0.98,
"grad_norm": 1.6275508401703538,
"learning_rate": 1.0996856787475197e-08,
"loss": 0.2313,
"step": 3061
},
{
"epoch": 0.98,
"grad_norm": 6.526103785267677,
"learning_rate": 1.0656010956437979e-08,
"loss": 0.6737,
"step": 3062
},
{
"epoch": 0.98,
"grad_norm": 1.411322668377023,
"learning_rate": 1.0320525205516629e-08,
"loss": 0.1676,
"step": 3063
},
{
"epoch": 0.98,
"grad_norm": 9.193227665398865,
"learning_rate": 9.990399895125624e-09,
"loss": 0.434,
"step": 3064
},
{
"epoch": 0.98,
"grad_norm": 1.488605324614986,
"learning_rate": 9.665635379920157e-09,
"loss": 0.1869,
"step": 3065
},
{
"epoch": 0.98,
"grad_norm": 5.801227737969345,
"learning_rate": 9.346232008797252e-09,
"loss": 0.5074,
"step": 3066
},
{
"epoch": 0.98,
"grad_norm": 6.595733425369078,
"learning_rate": 9.032190124893536e-09,
"loss": 0.3494,
"step": 3067
},
{
"epoch": 0.98,
"grad_norm": 5.873193491313792,
"learning_rate": 8.723510065585806e-09,
"loss": 0.5724,
"step": 3068
},
{
"epoch": 0.98,
"grad_norm": 1.536886334269207,
"learning_rate": 8.42019216249046e-09,
"loss": 0.2056,
"step": 3069
},
{
"epoch": 0.98,
"grad_norm": 7.617227766345291,
"learning_rate": 8.122236741464618e-09,
"loss": 0.457,
"step": 3070
},
{
"epoch": 0.98,
"grad_norm": 1.6439233519395178,
"learning_rate": 7.82964412260223e-09,
"loss": 0.2278,
"step": 3071
},
{
"epoch": 0.98,
"grad_norm": 7.584597301220255,
"learning_rate": 7.542414620237414e-09,
"loss": 0.5963,
"step": 3072
},
{
"epoch": 0.98,
"grad_norm": 1.7329227900430477,
"learning_rate": 7.260548542943335e-09,
"loss": 0.2189,
"step": 3073
},
{
"epoch": 0.98,
"grad_norm": 1.7161071849637957,
"learning_rate": 6.984046193528881e-09,
"loss": 0.2063,
"step": 3074
},
{
"epoch": 0.98,
"grad_norm": 6.313601738654207,
"learning_rate": 6.712907869043661e-09,
"loss": 0.4828,
"step": 3075
},
{
"epoch": 0.98,
"grad_norm": 1.309216968825294,
"learning_rate": 6.447133860771893e-09,
"loss": 0.1278,
"step": 3076
},
{
"epoch": 0.98,
"grad_norm": 5.732384056113262,
"learning_rate": 6.186724454236847e-09,
"loss": 0.6362,
"step": 3077
},
{
"epoch": 0.98,
"grad_norm": 1.569061524189713,
"learning_rate": 5.9316799291969654e-09,
"loss": 0.2119,
"step": 3078
},
{
"epoch": 0.99,
"grad_norm": 1.4694221335985644,
"learning_rate": 5.682000559649181e-09,
"loss": 0.2094,
"step": 3079
},
{
"epoch": 0.99,
"grad_norm": 9.261448007385779,
"learning_rate": 5.437686613823934e-09,
"loss": 0.3961,
"step": 3080
},
{
"epoch": 0.99,
"grad_norm": 1.6067208100390094,
"learning_rate": 5.198738354190158e-09,
"loss": 0.219,
"step": 3081
},
{
"epoch": 0.99,
"grad_norm": 6.1009075406485005,
"learning_rate": 4.9651560374514015e-09,
"loss": 0.4553,
"step": 3082
},
{
"epoch": 0.99,
"grad_norm": 1.4256870486709714,
"learning_rate": 4.736939914545824e-09,
"loss": 0.1788,
"step": 3083
},
{
"epoch": 0.99,
"grad_norm": 5.378017075157903,
"learning_rate": 4.514090230647305e-09,
"loss": 0.4654,
"step": 3084
},
{
"epoch": 0.99,
"grad_norm": 6.112138137560136,
"learning_rate": 4.296607225164895e-09,
"loss": 0.6969,
"step": 3085
},
{
"epoch": 0.99,
"grad_norm": 7.250462609629612,
"learning_rate": 4.084491131741697e-09,
"loss": 0.7226,
"step": 3086
},
{
"epoch": 0.99,
"grad_norm": 8.628188351336053,
"learning_rate": 3.877742178254873e-09,
"loss": 0.7685,
"step": 3087
},
{
"epoch": 0.99,
"grad_norm": 7.162283701232727,
"learning_rate": 3.6763605868167516e-09,
"loss": 0.6627,
"step": 3088
},
{
"epoch": 0.99,
"grad_norm": 6.397798704627545,
"learning_rate": 3.4803465737714983e-09,
"loss": 0.4927,
"step": 3089
},
{
"epoch": 0.99,
"grad_norm": 5.4537538091253195,
"learning_rate": 3.289700349698999e-09,
"loss": 0.6303,
"step": 3090
},
{
"epoch": 0.99,
"grad_norm": 5.454843813066502,
"learning_rate": 3.104422119411532e-09,
"loss": 0.3822,
"step": 3091
},
{
"epoch": 0.99,
"grad_norm": 1.5629152223697327,
"learning_rate": 2.9245120819543226e-09,
"loss": 0.188,
"step": 3092
},
{
"epoch": 0.99,
"grad_norm": 17.27712515755136,
"learning_rate": 2.749970430605542e-09,
"loss": 0.5413,
"step": 3093
},
{
"epoch": 0.99,
"grad_norm": 1.5181043823476956,
"learning_rate": 2.5807973528768626e-09,
"loss": 0.1834,
"step": 3094
},
{
"epoch": 0.99,
"grad_norm": 7.209806498981695,
"learning_rate": 2.416993030511239e-09,
"loss": 0.6215,
"step": 3095
},
{
"epoch": 0.99,
"grad_norm": 5.914505942903512,
"learning_rate": 2.258557639484571e-09,
"loss": 0.5087,
"step": 3096
},
{
"epoch": 0.99,
"grad_norm": 4.674993130466707,
"learning_rate": 2.1054913500051512e-09,
"loss": 0.4659,
"step": 3097
},
{
"epoch": 0.99,
"grad_norm": 1.5402829311423731,
"learning_rate": 1.957794326513107e-09,
"loss": 0.1981,
"step": 3098
},
{
"epoch": 0.99,
"grad_norm": 1.517322625685105,
"learning_rate": 1.8154667276798488e-09,
"loss": 0.1802,
"step": 3099
},
{
"epoch": 0.99,
"grad_norm": 1.5378631489322545,
"learning_rate": 1.6785087064086213e-09,
"loss": 0.1637,
"step": 3100
},
{
"epoch": 0.99,
"grad_norm": 1.4670537938773458,
"learning_rate": 1.546920409834507e-09,
"loss": 0.1683,
"step": 3101
},
{
"epoch": 0.99,
"grad_norm": 5.2917356066256245,
"learning_rate": 1.4207019793238686e-09,
"loss": 0.5907,
"step": 3102
},
{
"epoch": 0.99,
"grad_norm": 6.98601503007583,
"learning_rate": 1.299853550472685e-09,
"loss": 0.476,
"step": 3103
},
{
"epoch": 0.99,
"grad_norm": 7.200946600616249,
"learning_rate": 1.1843752531104368e-09,
"loss": 0.4721,
"step": 3104
},
{
"epoch": 0.99,
"grad_norm": 1.589855680885164,
"learning_rate": 1.0742672112951103e-09,
"loss": 0.2493,
"step": 3105
},
{
"epoch": 0.99,
"grad_norm": 6.235917287828203,
"learning_rate": 9.695295433170826e-10,
"loss": 0.5953,
"step": 3106
},
{
"epoch": 0.99,
"grad_norm": 5.989546147230207,
"learning_rate": 8.701623616963472e-10,
"loss": 0.5155,
"step": 3107
},
{
"epoch": 0.99,
"grad_norm": 8.608292594450589,
"learning_rate": 7.761657731836236e-10,
"loss": 0.5289,
"step": 3108
},
{
"epoch": 0.99,
"grad_norm": 1.2868548811530105,
"learning_rate": 6.87539878759802e-10,
"loss": 0.1499,
"step": 3109
},
{
"epoch": 1.0,
"grad_norm": 1.5380312363926991,
"learning_rate": 6.042847736364987e-10,
"loss": 0.1935,
"step": 3110
},
{
"epoch": 1.0,
"grad_norm": 4.746891666683149,
"learning_rate": 5.264005472549461e-10,
"loss": 0.3446,
"step": 3111
},
{
"epoch": 1.0,
"grad_norm": 9.518489206428116,
"learning_rate": 4.538872832865471e-10,
"loss": 0.4071,
"step": 3112
},
{
"epoch": 1.0,
"grad_norm": 1.7664412590705008,
"learning_rate": 3.867450596328759e-10,
"loss": 0.2381,
"step": 3113
},
{
"epoch": 1.0,
"grad_norm": 1.6988190436169681,
"learning_rate": 3.2497394842512244e-10,
"loss": 0.218,
"step": 3114
},
{
"epoch": 1.0,
"grad_norm": 1.4416306644380266,
"learning_rate": 2.685740160240924e-10,
"loss": 0.2128,
"step": 3115
},
{
"epoch": 1.0,
"grad_norm": 7.825396015056772,
"learning_rate": 2.1754532302076247e-10,
"loss": 0.511,
"step": 3116
},
{
"epoch": 1.0,
"grad_norm": 1.6608648888600894,
"learning_rate": 1.718879242357252e-10,
"loss": 0.2151,
"step": 3117
},
{
"epoch": 1.0,
"grad_norm": 5.650930222088315,
"learning_rate": 1.316018687191889e-10,
"loss": 0.5032,
"step": 3118
},
{
"epoch": 1.0,
"grad_norm": 15.307567535576162,
"learning_rate": 9.668719974986751e-11,
"loss": 0.6285,
"step": 3119
},
{
"epoch": 1.0,
"grad_norm": 8.882364937663029,
"learning_rate": 6.714395483720105e-11,
"loss": 0.5117,
"step": 3120
},
{
"epoch": 1.0,
"grad_norm": 6.942683694133464,
"learning_rate": 4.297216571969021e-11,
"loss": 0.5333,
"step": 3121
},
{
"epoch": 1.0,
"grad_norm": 1.722721440266414,
"learning_rate": 2.417185836545155e-11,
"loss": 0.2157,
"step": 3122
},
{
"epoch": 1.0,
"grad_norm": 1.501297152856583,
"learning_rate": 1.0743052971107225e-11,
"loss": 0.1943,
"step": 3123
},
{
"epoch": 1.0,
"grad_norm": 7.410022358554075,
"learning_rate": 2.6857639640054387e-12,
"loss": 0.4378,
"step": 3124
},
{
"epoch": 1.0,
"grad_norm": 6.343821428663217,
"learning_rate": 0.0,
"loss": 0.378,
"step": 3125
},
{
"epoch": 1.0,
"step": 3125,
"total_flos": 887644537620480.0,
"train_loss": 0.39999343316555025,
"train_runtime": 11344.852,
"train_samples_per_second": 4.407,
"train_steps_per_second": 0.275
}
],
"logging_steps": 1.0,
"max_steps": 3125,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"total_flos": 887644537620480.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}