{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 822,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003658536585365854,
"grad_norm": 6.277445410210355,
"learning_rate": 0.0,
"loss": 0.8949,
"step": 1
},
{
"epoch": 0.007317073170731708,
"grad_norm": 6.189966552075687,
"learning_rate": 1.204819277108434e-07,
"loss": 0.875,
"step": 2
},
{
"epoch": 0.01097560975609756,
"grad_norm": 6.164450292976063,
"learning_rate": 2.409638554216868e-07,
"loss": 0.8753,
"step": 3
},
{
"epoch": 0.014634146341463415,
"grad_norm": 6.197109316032434,
"learning_rate": 3.614457831325301e-07,
"loss": 0.8511,
"step": 4
},
{
"epoch": 0.018292682926829267,
"grad_norm": 6.423202590634042,
"learning_rate": 4.819277108433736e-07,
"loss": 0.8748,
"step": 5
},
{
"epoch": 0.02195121951219512,
"grad_norm": 6.25214947511438,
"learning_rate": 6.024096385542169e-07,
"loss": 0.8883,
"step": 6
},
{
"epoch": 0.025609756097560974,
"grad_norm": 5.959365118165624,
"learning_rate": 7.228915662650602e-07,
"loss": 0.8739,
"step": 7
},
{
"epoch": 0.02926829268292683,
"grad_norm": 5.699309208501387,
"learning_rate": 8.433734939759036e-07,
"loss": 0.8641,
"step": 8
},
{
"epoch": 0.032926829268292684,
"grad_norm": 5.896427299318614,
"learning_rate": 9.638554216867472e-07,
"loss": 0.8376,
"step": 9
},
{
"epoch": 0.036585365853658534,
"grad_norm": 4.457488780662371,
"learning_rate": 1.0843373493975905e-06,
"loss": 0.8127,
"step": 10
},
{
"epoch": 0.04024390243902439,
"grad_norm": 4.83357688640288,
"learning_rate": 1.2048192771084338e-06,
"loss": 0.8569,
"step": 11
},
{
"epoch": 0.04390243902439024,
"grad_norm": 4.606224868944638,
"learning_rate": 1.3253012048192773e-06,
"loss": 0.8594,
"step": 12
},
{
"epoch": 0.0475609756097561,
"grad_norm": 2.939566450225436,
"learning_rate": 1.4457831325301204e-06,
"loss": 0.7608,
"step": 13
},
{
"epoch": 0.05121951219512195,
"grad_norm": 2.520318983948017,
"learning_rate": 1.566265060240964e-06,
"loss": 0.7226,
"step": 14
},
{
"epoch": 0.054878048780487805,
"grad_norm": 2.5228998490908845,
"learning_rate": 1.6867469879518073e-06,
"loss": 0.7426,
"step": 15
},
{
"epoch": 0.05853658536585366,
"grad_norm": 2.3043155456582545,
"learning_rate": 1.8072289156626508e-06,
"loss": 0.7395,
"step": 16
},
{
"epoch": 0.06219512195121951,
"grad_norm": 2.0517255621206454,
"learning_rate": 1.9277108433734943e-06,
"loss": 0.7115,
"step": 17
},
{
"epoch": 0.06585365853658537,
"grad_norm": 2.263867308051565,
"learning_rate": 2.0481927710843377e-06,
"loss": 0.6575,
"step": 18
},
{
"epoch": 0.06951219512195123,
"grad_norm": 2.3534448895798623,
"learning_rate": 2.168674698795181e-06,
"loss": 0.6783,
"step": 19
},
{
"epoch": 0.07317073170731707,
"grad_norm": 2.5801360082960314,
"learning_rate": 2.2891566265060243e-06,
"loss": 0.6791,
"step": 20
},
{
"epoch": 0.07682926829268293,
"grad_norm": 2.091602122954677,
"learning_rate": 2.4096385542168676e-06,
"loss": 0.6447,
"step": 21
},
{
"epoch": 0.08048780487804878,
"grad_norm": 1.8784090908022497,
"learning_rate": 2.530120481927711e-06,
"loss": 0.6341,
"step": 22
},
{
"epoch": 0.08414634146341464,
"grad_norm": 1.6942257363329816,
"learning_rate": 2.6506024096385547e-06,
"loss": 0.6357,
"step": 23
},
{
"epoch": 0.08780487804878048,
"grad_norm": 1.6095157334175276,
"learning_rate": 2.771084337349398e-06,
"loss": 0.6345,
"step": 24
},
{
"epoch": 0.09146341463414634,
"grad_norm": 1.7414308121430007,
"learning_rate": 2.891566265060241e-06,
"loss": 0.602,
"step": 25
},
{
"epoch": 0.0951219512195122,
"grad_norm": 1.8575881807291224,
"learning_rate": 3.012048192771085e-06,
"loss": 0.6362,
"step": 26
},
{
"epoch": 0.09878048780487805,
"grad_norm": 1.7851864056717361,
"learning_rate": 3.132530120481928e-06,
"loss": 0.612,
"step": 27
},
{
"epoch": 0.1024390243902439,
"grad_norm": 1.440836469753712,
"learning_rate": 3.2530120481927713e-06,
"loss": 0.5809,
"step": 28
},
{
"epoch": 0.10609756097560975,
"grad_norm": 1.0609497126438352,
"learning_rate": 3.3734939759036146e-06,
"loss": 0.5171,
"step": 29
},
{
"epoch": 0.10975609756097561,
"grad_norm": 1.14774673610937,
"learning_rate": 3.4939759036144583e-06,
"loss": 0.542,
"step": 30
},
{
"epoch": 0.11341463414634147,
"grad_norm": 1.1996784532412303,
"learning_rate": 3.6144578313253016e-06,
"loss": 0.5745,
"step": 31
},
{
"epoch": 0.11707317073170732,
"grad_norm": 1.1058886982759129,
"learning_rate": 3.7349397590361445e-06,
"loss": 0.535,
"step": 32
},
{
"epoch": 0.12073170731707317,
"grad_norm": 1.1654070589558059,
"learning_rate": 3.855421686746989e-06,
"loss": 0.5394,
"step": 33
},
{
"epoch": 0.12439024390243902,
"grad_norm": 1.0975202327930376,
"learning_rate": 3.975903614457832e-06,
"loss": 0.5235,
"step": 34
},
{
"epoch": 0.12804878048780488,
"grad_norm": 1.0349455335423454,
"learning_rate": 4.096385542168675e-06,
"loss": 0.5248,
"step": 35
},
{
"epoch": 0.13170731707317074,
"grad_norm": 1.0886433836793852,
"learning_rate": 4.216867469879519e-06,
"loss": 0.5217,
"step": 36
},
{
"epoch": 0.1353658536585366,
"grad_norm": 1.0243537702205467,
"learning_rate": 4.337349397590362e-06,
"loss": 0.5269,
"step": 37
},
{
"epoch": 0.13902439024390245,
"grad_norm": 0.8706005558780466,
"learning_rate": 4.457831325301205e-06,
"loss": 0.5224,
"step": 38
},
{
"epoch": 0.14268292682926828,
"grad_norm": 0.9201501100893733,
"learning_rate": 4.578313253012049e-06,
"loss": 0.489,
"step": 39
},
{
"epoch": 0.14634146341463414,
"grad_norm": 0.9395976154976792,
"learning_rate": 4.698795180722892e-06,
"loss": 0.5119,
"step": 40
},
{
"epoch": 0.15,
"grad_norm": 0.8427641210847073,
"learning_rate": 4.819277108433735e-06,
"loss": 0.4676,
"step": 41
},
{
"epoch": 0.15365853658536585,
"grad_norm": 0.8083286994818667,
"learning_rate": 4.939759036144578e-06,
"loss": 0.4929,
"step": 42
},
{
"epoch": 0.1573170731707317,
"grad_norm": 0.8645108522630794,
"learning_rate": 5.060240963855422e-06,
"loss": 0.4892,
"step": 43
},
{
"epoch": 0.16097560975609757,
"grad_norm": 0.9003614919042666,
"learning_rate": 5.180722891566266e-06,
"loss": 0.4814,
"step": 44
},
{
"epoch": 0.16463414634146342,
"grad_norm": 0.741593728453306,
"learning_rate": 5.301204819277109e-06,
"loss": 0.5015,
"step": 45
},
{
"epoch": 0.16829268292682928,
"grad_norm": 0.7562265354454827,
"learning_rate": 5.421686746987952e-06,
"loss": 0.4696,
"step": 46
},
{
"epoch": 0.1719512195121951,
"grad_norm": 0.7656658535399147,
"learning_rate": 5.542168674698796e-06,
"loss": 0.4765,
"step": 47
},
{
"epoch": 0.17560975609756097,
"grad_norm": 0.8167177748510339,
"learning_rate": 5.66265060240964e-06,
"loss": 0.4794,
"step": 48
},
{
"epoch": 0.17926829268292682,
"grad_norm": 0.8458910289909082,
"learning_rate": 5.783132530120482e-06,
"loss": 0.4677,
"step": 49
},
{
"epoch": 0.18292682926829268,
"grad_norm": 0.7346378269144168,
"learning_rate": 5.9036144578313255e-06,
"loss": 0.4733,
"step": 50
},
{
"epoch": 0.18658536585365854,
"grad_norm": 0.740671844456031,
"learning_rate": 6.02409638554217e-06,
"loss": 0.474,
"step": 51
},
{
"epoch": 0.1902439024390244,
"grad_norm": 0.8016621843811075,
"learning_rate": 6.144578313253012e-06,
"loss": 0.4726,
"step": 52
},
{
"epoch": 0.19390243902439025,
"grad_norm": 0.687264969205199,
"learning_rate": 6.265060240963856e-06,
"loss": 0.4361,
"step": 53
},
{
"epoch": 0.1975609756097561,
"grad_norm": 0.6904700215176642,
"learning_rate": 6.385542168674699e-06,
"loss": 0.4585,
"step": 54
},
{
"epoch": 0.20121951219512196,
"grad_norm": 0.7341819290585861,
"learning_rate": 6.5060240963855425e-06,
"loss": 0.4351,
"step": 55
},
{
"epoch": 0.2048780487804878,
"grad_norm": 0.7027985388306266,
"learning_rate": 6.626506024096386e-06,
"loss": 0.4514,
"step": 56
},
{
"epoch": 0.20853658536585365,
"grad_norm": 0.7865290192260256,
"learning_rate": 6.746987951807229e-06,
"loss": 0.4329,
"step": 57
},
{
"epoch": 0.2121951219512195,
"grad_norm": 0.80407188175117,
"learning_rate": 6.867469879518073e-06,
"loss": 0.4435,
"step": 58
},
{
"epoch": 0.21585365853658536,
"grad_norm": 0.7172401190061911,
"learning_rate": 6.987951807228917e-06,
"loss": 0.4513,
"step": 59
},
{
"epoch": 0.21951219512195122,
"grad_norm": 0.8101533915145436,
"learning_rate": 7.1084337349397595e-06,
"loss": 0.4503,
"step": 60
},
{
"epoch": 0.22317073170731708,
"grad_norm": 0.804368270905315,
"learning_rate": 7.228915662650603e-06,
"loss": 0.45,
"step": 61
},
{
"epoch": 0.22682926829268293,
"grad_norm": 0.9006759847187962,
"learning_rate": 7.349397590361447e-06,
"loss": 0.4655,
"step": 62
},
{
"epoch": 0.2304878048780488,
"grad_norm": 0.7424358924212873,
"learning_rate": 7.469879518072289e-06,
"loss": 0.451,
"step": 63
},
{
"epoch": 0.23414634146341465,
"grad_norm": 0.7737469569927911,
"learning_rate": 7.590361445783133e-06,
"loss": 0.435,
"step": 64
},
{
"epoch": 0.23780487804878048,
"grad_norm": 0.7292890277210534,
"learning_rate": 7.710843373493977e-06,
"loss": 0.4578,
"step": 65
},
{
"epoch": 0.24146341463414633,
"grad_norm": 0.7375239690173336,
"learning_rate": 7.83132530120482e-06,
"loss": 0.4248,
"step": 66
},
{
"epoch": 0.2451219512195122,
"grad_norm": 0.7635364447903387,
"learning_rate": 7.951807228915663e-06,
"loss": 0.4389,
"step": 67
},
{
"epoch": 0.24878048780487805,
"grad_norm": 0.8154765127796729,
"learning_rate": 8.072289156626508e-06,
"loss": 0.4465,
"step": 68
},
{
"epoch": 0.2524390243902439,
"grad_norm": 0.7035246328533662,
"learning_rate": 8.19277108433735e-06,
"loss": 0.4458,
"step": 69
},
{
"epoch": 0.25609756097560976,
"grad_norm": 0.7489537524137279,
"learning_rate": 8.313253012048194e-06,
"loss": 0.446,
"step": 70
},
{
"epoch": 0.2597560975609756,
"grad_norm": 0.737620333742491,
"learning_rate": 8.433734939759038e-06,
"loss": 0.4161,
"step": 71
},
{
"epoch": 0.2634146341463415,
"grad_norm": 0.7850264326588713,
"learning_rate": 8.55421686746988e-06,
"loss": 0.4458,
"step": 72
},
{
"epoch": 0.26707317073170733,
"grad_norm": 0.6816894813147583,
"learning_rate": 8.674698795180724e-06,
"loss": 0.4131,
"step": 73
},
{
"epoch": 0.2707317073170732,
"grad_norm": 0.7565446389184619,
"learning_rate": 8.795180722891567e-06,
"loss": 0.428,
"step": 74
},
{
"epoch": 0.27439024390243905,
"grad_norm": 0.701791274439343,
"learning_rate": 8.91566265060241e-06,
"loss": 0.4178,
"step": 75
},
{
"epoch": 0.2780487804878049,
"grad_norm": 0.7340761193641352,
"learning_rate": 9.036144578313254e-06,
"loss": 0.4427,
"step": 76
},
{
"epoch": 0.2817073170731707,
"grad_norm": 0.7618958840715552,
"learning_rate": 9.156626506024097e-06,
"loss": 0.4286,
"step": 77
},
{
"epoch": 0.28536585365853656,
"grad_norm": 0.8304904668902089,
"learning_rate": 9.27710843373494e-06,
"loss": 0.4514,
"step": 78
},
{
"epoch": 0.2890243902439024,
"grad_norm": 0.7030300483997182,
"learning_rate": 9.397590361445785e-06,
"loss": 0.4241,
"step": 79
},
{
"epoch": 0.2926829268292683,
"grad_norm": 0.6805879055096362,
"learning_rate": 9.518072289156628e-06,
"loss": 0.4172,
"step": 80
},
{
"epoch": 0.29634146341463413,
"grad_norm": 0.8356922060920742,
"learning_rate": 9.63855421686747e-06,
"loss": 0.4364,
"step": 81
},
{
"epoch": 0.3,
"grad_norm": 0.7503110247952592,
"learning_rate": 9.759036144578315e-06,
"loss": 0.4413,
"step": 82
},
{
"epoch": 0.30365853658536585,
"grad_norm": 0.7061215452986729,
"learning_rate": 9.879518072289156e-06,
"loss": 0.4338,
"step": 83
},
{
"epoch": 0.3073170731707317,
"grad_norm": 0.7634467193523269,
"learning_rate": 1e-05,
"loss": 0.4321,
"step": 84
},
{
"epoch": 0.31097560975609756,
"grad_norm": 0.7210623022309584,
"learning_rate": 9.999954819584226e-06,
"loss": 0.4194,
"step": 85
},
{
"epoch": 0.3146341463414634,
"grad_norm": 0.721915503004975,
"learning_rate": 9.99981927915341e-06,
"loss": 0.4054,
"step": 86
},
{
"epoch": 0.3182926829268293,
"grad_norm": 0.7175117036906977,
"learning_rate": 9.999593381157061e-06,
"loss": 0.4064,
"step": 87
},
{
"epoch": 0.32195121951219513,
"grad_norm": 0.7700973325270625,
"learning_rate": 9.999277129677647e-06,
"loss": 0.4353,
"step": 88
},
{
"epoch": 0.325609756097561,
"grad_norm": 0.6814147419837185,
"learning_rate": 9.998870530430517e-06,
"loss": 0.4194,
"step": 89
},
{
"epoch": 0.32926829268292684,
"grad_norm": 0.7678648920525801,
"learning_rate": 9.998373590763798e-06,
"loss": 0.4162,
"step": 90
},
{
"epoch": 0.3329268292682927,
"grad_norm": 0.7311404514370706,
"learning_rate": 9.997786319658269e-06,
"loss": 0.4289,
"step": 91
},
{
"epoch": 0.33658536585365856,
"grad_norm": 0.7611041087207226,
"learning_rate": 9.99710872772719e-06,
"loss": 0.4301,
"step": 92
},
{
"epoch": 0.3402439024390244,
"grad_norm": 0.828204792592121,
"learning_rate": 9.996340827216114e-06,
"loss": 0.4265,
"step": 93
},
{
"epoch": 0.3439024390243902,
"grad_norm": 0.7616248679645873,
"learning_rate": 9.995482632002666e-06,
"loss": 0.4332,
"step": 94
},
{
"epoch": 0.3475609756097561,
"grad_norm": 0.7439027924570666,
"learning_rate": 9.9945341575963e-06,
"loss": 0.4278,
"step": 95
},
{
"epoch": 0.35121951219512193,
"grad_norm": 0.8590239356762064,
"learning_rate": 9.993495421137991e-06,
"loss": 0.4233,
"step": 96
},
{
"epoch": 0.3548780487804878,
"grad_norm": 0.6934508286067458,
"learning_rate": 9.992366441399968e-06,
"loss": 0.398,
"step": 97
},
{
"epoch": 0.35853658536585364,
"grad_norm": 0.7663186915430809,
"learning_rate": 9.991147238785335e-06,
"loss": 0.4631,
"step": 98
},
{
"epoch": 0.3621951219512195,
"grad_norm": 0.7156930809406346,
"learning_rate": 9.989837835327724e-06,
"loss": 0.4208,
"step": 99
},
{
"epoch": 0.36585365853658536,
"grad_norm": 0.8002982909933671,
"learning_rate": 9.988438254690896e-06,
"loss": 0.4152,
"step": 100
},
{
"epoch": 0.3695121951219512,
"grad_norm": 0.7294127443277085,
"learning_rate": 9.986948522168301e-06,
"loss": 0.413,
"step": 101
},
{
"epoch": 0.37317073170731707,
"grad_norm": 0.8058181737161048,
"learning_rate": 9.985368664682636e-06,
"loss": 0.392,
"step": 102
},
{
"epoch": 0.37682926829268293,
"grad_norm": 0.7274807055972744,
"learning_rate": 9.983698710785345e-06,
"loss": 0.4224,
"step": 103
},
{
"epoch": 0.3804878048780488,
"grad_norm": 0.6993436655613549,
"learning_rate": 9.981938690656117e-06,
"loss": 0.4088,
"step": 104
},
{
"epoch": 0.38414634146341464,
"grad_norm": 0.7940440414933521,
"learning_rate": 9.980088636102323e-06,
"loss": 0.4072,
"step": 105
},
{
"epoch": 0.3878048780487805,
"grad_norm": 0.8290354247980892,
"learning_rate": 9.97814858055846e-06,
"loss": 0.4131,
"step": 106
},
{
"epoch": 0.39146341463414636,
"grad_norm": 0.7441200330283944,
"learning_rate": 9.976118559085535e-06,
"loss": 0.3919,
"step": 107
},
{
"epoch": 0.3951219512195122,
"grad_norm": 0.6950410289839455,
"learning_rate": 9.973998608370433e-06,
"loss": 0.3959,
"step": 108
},
{
"epoch": 0.39878048780487807,
"grad_norm": 0.8708483538432179,
"learning_rate": 9.971788766725254e-06,
"loss": 0.3971,
"step": 109
},
{
"epoch": 0.4024390243902439,
"grad_norm": 0.759084878650044,
"learning_rate": 9.969489074086626e-06,
"loss": 0.409,
"step": 110
},
{
"epoch": 0.4060975609756098,
"grad_norm": 0.7871272877847223,
"learning_rate": 9.967099572014977e-06,
"loss": 0.421,
"step": 111
},
{
"epoch": 0.4097560975609756,
"grad_norm": 0.7903507778760472,
"learning_rate": 9.964620303693784e-06,
"loss": 0.4047,
"step": 112
},
{
"epoch": 0.41341463414634144,
"grad_norm": 0.723841099163788,
"learning_rate": 9.962051313928796e-06,
"loss": 0.4092,
"step": 113
},
{
"epoch": 0.4170731707317073,
"grad_norm": 0.6639342794876416,
"learning_rate": 9.959392649147226e-06,
"loss": 0.4027,
"step": 114
},
{
"epoch": 0.42073170731707316,
"grad_norm": 0.807078988818577,
"learning_rate": 9.956644357396905e-06,
"loss": 0.4278,
"step": 115
},
{
"epoch": 0.424390243902439,
"grad_norm": 0.7650258943283277,
"learning_rate": 9.953806488345417e-06,
"loss": 0.4088,
"step": 116
},
{
"epoch": 0.42804878048780487,
"grad_norm": 0.7150263286569847,
"learning_rate": 9.950879093279204e-06,
"loss": 0.4004,
"step": 117
},
{
"epoch": 0.4317073170731707,
"grad_norm": 0.7284032347300947,
"learning_rate": 9.947862225102637e-06,
"loss": 0.4172,
"step": 118
},
{
"epoch": 0.4353658536585366,
"grad_norm": 0.7646864504880536,
"learning_rate": 9.944755938337063e-06,
"loss": 0.3873,
"step": 119
},
{
"epoch": 0.43902439024390244,
"grad_norm": 0.6983159212265896,
"learning_rate": 9.941560289119808e-06,
"loss": 0.3884,
"step": 120
},
{
"epoch": 0.4426829268292683,
"grad_norm": 0.767699892100864,
"learning_rate": 9.938275335203176e-06,
"loss": 0.4419,
"step": 121
},
{
"epoch": 0.44634146341463415,
"grad_norm": 0.7705493162673682,
"learning_rate": 9.934901135953402e-06,
"loss": 0.389,
"step": 122
},
{
"epoch": 0.45,
"grad_norm": 0.7664723925888519,
"learning_rate": 9.931437752349579e-06,
"loss": 0.4073,
"step": 123
},
{
"epoch": 0.45365853658536587,
"grad_norm": 0.8462520333011967,
"learning_rate": 9.927885246982548e-06,
"loss": 0.4074,
"step": 124
},
{
"epoch": 0.4573170731707317,
"grad_norm": 0.7775485328681504,
"learning_rate": 9.924243684053778e-06,
"loss": 0.3931,
"step": 125
},
{
"epoch": 0.4609756097560976,
"grad_norm": 0.7690878823572883,
"learning_rate": 9.920513129374198e-06,
"loss": 0.4163,
"step": 126
},
{
"epoch": 0.46463414634146344,
"grad_norm": 0.8486196279486743,
"learning_rate": 9.916693650363014e-06,
"loss": 0.4063,
"step": 127
},
{
"epoch": 0.4682926829268293,
"grad_norm": 0.7060741443892173,
"learning_rate": 9.912785316046487e-06,
"loss": 0.4029,
"step": 128
},
{
"epoch": 0.4719512195121951,
"grad_norm": 0.7610752199019202,
"learning_rate": 9.908788197056682e-06,
"loss": 0.4172,
"step": 129
},
{
"epoch": 0.47560975609756095,
"grad_norm": 0.7252945736486252,
"learning_rate": 9.9047023656302e-06,
"loss": 0.3935,
"step": 130
},
{
"epoch": 0.4792682926829268,
"grad_norm": 0.767038892879291,
"learning_rate": 9.900527895606868e-06,
"loss": 0.3972,
"step": 131
},
{
"epoch": 0.48292682926829267,
"grad_norm": 0.643692685037689,
"learning_rate": 9.8962648624284e-06,
"loss": 0.4225,
"step": 132
},
{
"epoch": 0.4865853658536585,
"grad_norm": 0.8783566981280354,
"learning_rate": 9.891913343137041e-06,
"loss": 0.4416,
"step": 133
},
{
"epoch": 0.4902439024390244,
"grad_norm": 0.7185000940147632,
"learning_rate": 9.887473416374169e-06,
"loss": 0.3992,
"step": 134
},
{
"epoch": 0.49390243902439024,
"grad_norm": 0.6892638783815311,
"learning_rate": 9.882945162378884e-06,
"loss": 0.4198,
"step": 135
},
{
"epoch": 0.4975609756097561,
"grad_norm": 0.696543424071767,
"learning_rate": 9.87832866298654e-06,
"loss": 0.3896,
"step": 136
},
{
"epoch": 0.501219512195122,
"grad_norm": 0.7371986690918538,
"learning_rate": 9.873624001627286e-06,
"loss": 0.406,
"step": 137
},
{
"epoch": 0.5048780487804878,
"grad_norm": 0.7393120814879373,
"learning_rate": 9.868831263324543e-06,
"loss": 0.3948,
"step": 138
},
{
"epoch": 0.5085365853658537,
"grad_norm": 0.7660439237825246,
"learning_rate": 9.863950534693474e-06,
"loss": 0.4069,
"step": 139
},
{
"epoch": 0.5121951219512195,
"grad_norm": 0.7272706576678343,
"learning_rate": 9.858981903939419e-06,
"loss": 0.4024,
"step": 140
},
{
"epoch": 0.5158536585365854,
"grad_norm": 0.7481707493506783,
"learning_rate": 9.853925460856299e-06,
"loss": 0.418,
"step": 141
},
{
"epoch": 0.5195121951219512,
"grad_norm": 0.7115691589222297,
"learning_rate": 9.848781296824994e-06,
"loss": 0.3632,
"step": 142
},
{
"epoch": 0.5231707317073171,
"grad_norm": 0.7631193372021012,
"learning_rate": 9.843549504811695e-06,
"loss": 0.3966,
"step": 143
},
{
"epoch": 0.526829268292683,
"grad_norm": 0.6756136811278868,
"learning_rate": 9.838230179366213e-06,
"loss": 0.3766,
"step": 144
},
{
"epoch": 0.5304878048780488,
"grad_norm": 0.7161082479999907,
"learning_rate": 9.832823416620285e-06,
"loss": 0.3734,
"step": 145
},
{
"epoch": 0.5341463414634147,
"grad_norm": 0.8034199359900721,
"learning_rate": 9.827329314285825e-06,
"loss": 0.3947,
"step": 146
},
{
"epoch": 0.5378048780487805,
"grad_norm": 0.7413237614885045,
"learning_rate": 9.821747971653164e-06,
"loss": 0.3951,
"step": 147
},
{
"epoch": 0.5414634146341464,
"grad_norm": 0.7175495817807958,
"learning_rate": 9.816079489589257e-06,
"loss": 0.4057,
"step": 148
},
{
"epoch": 0.5451219512195122,
"grad_norm": 0.7937438771947285,
"learning_rate": 9.810323970535851e-06,
"loss": 0.4029,
"step": 149
},
{
"epoch": 0.5487804878048781,
"grad_norm": 0.7432618293923435,
"learning_rate": 9.804481518507645e-06,
"loss": 0.4375,
"step": 150
},
{
"epoch": 0.552439024390244,
"grad_norm": 0.6993854930910921,
"learning_rate": 9.798552239090404e-06,
"loss": 0.3791,
"step": 151
},
{
"epoch": 0.5560975609756098,
"grad_norm": 0.7198012758680304,
"learning_rate": 9.792536239439052e-06,
"loss": 0.401,
"step": 152
},
{
"epoch": 0.5597560975609757,
"grad_norm": 0.6415672054488112,
"learning_rate": 9.786433628275735e-06,
"loss": 0.3783,
"step": 153
},
{
"epoch": 0.5634146341463414,
"grad_norm": 0.6920734346987311,
"learning_rate": 9.780244515887856e-06,
"loss": 0.4043,
"step": 154
},
{
"epoch": 0.5670731707317073,
"grad_norm": 0.7503756171018907,
"learning_rate": 9.773969014126084e-06,
"loss": 0.3724,
"step": 155
},
{
"epoch": 0.5707317073170731,
"grad_norm": 0.7117225351020292,
"learning_rate": 9.76760723640233e-06,
"loss": 0.384,
"step": 156
},
{
"epoch": 0.574390243902439,
"grad_norm": 0.6734357880050217,
"learning_rate": 9.7611592976877e-06,
"loss": 0.3837,
"step": 157
},
{
"epoch": 0.5780487804878048,
"grad_norm": 0.7337246817368871,
"learning_rate": 9.754625314510416e-06,
"loss": 0.3997,
"step": 158
},
{
"epoch": 0.5817073170731707,
"grad_norm": 0.7045366717146077,
"learning_rate": 9.748005404953705e-06,
"loss": 0.381,
"step": 159
},
{
"epoch": 0.5853658536585366,
"grad_norm": 0.6963782507139761,
"learning_rate": 9.741299688653676e-06,
"loss": 0.38,
"step": 160
},
{
"epoch": 0.5890243902439024,
"grad_norm": 0.7948834479120092,
"learning_rate": 9.734508286797148e-06,
"loss": 0.389,
"step": 161
},
{
"epoch": 0.5926829268292683,
"grad_norm": 0.7105686277711006,
"learning_rate": 9.727631322119467e-06,
"loss": 0.3886,
"step": 162
},
{
"epoch": 0.5963414634146341,
"grad_norm": 0.6521720402273905,
"learning_rate": 9.72066891890228e-06,
"loss": 0.3655,
"step": 163
},
{
"epoch": 0.6,
"grad_norm": 0.7132643040658287,
"learning_rate": 9.713621202971297e-06,
"loss": 0.385,
"step": 164
},
{
"epoch": 0.6036585365853658,
"grad_norm": 0.7206674268681118,
"learning_rate": 9.706488301694013e-06,
"loss": 0.394,
"step": 165
},
{
"epoch": 0.6073170731707317,
"grad_norm": 0.7241063229535367,
"learning_rate": 9.699270343977403e-06,
"loss": 0.3693,
"step": 166
},
{
"epoch": 0.6109756097560975,
"grad_norm": 0.6824731673625768,
"learning_rate": 9.691967460265604e-06,
"loss": 0.3856,
"step": 167
},
{
"epoch": 0.6146341463414634,
"grad_norm": 0.760082427210616,
"learning_rate": 9.684579782537542e-06,
"loss": 0.3548,
"step": 168
},
{
"epoch": 0.6182926829268293,
"grad_norm": 0.772003373608569,
"learning_rate": 9.677107444304556e-06,
"loss": 0.3743,
"step": 169
},
{
"epoch": 0.6219512195121951,
"grad_norm": 0.8083887172564229,
"learning_rate": 9.66955058060799e-06,
"loss": 0.3815,
"step": 170
},
{
"epoch": 0.625609756097561,
"grad_norm": 0.8277017660896508,
"learning_rate": 9.661909328016739e-06,
"loss": 0.3837,
"step": 171
},
{
"epoch": 0.6292682926829268,
"grad_norm": 0.7445701030987295,
"learning_rate": 9.654183824624789e-06,
"loss": 0.3909,
"step": 172
},
{
"epoch": 0.6329268292682927,
"grad_norm": 0.7277845875454032,
"learning_rate": 9.646374210048723e-06,
"loss": 0.3739,
"step": 173
},
{
"epoch": 0.6365853658536585,
"grad_norm": 0.9082732763858772,
"learning_rate": 9.638480625425197e-06,
"loss": 0.4024,
"step": 174
},
{
"epoch": 0.6402439024390244,
"grad_norm": 0.6982452948788327,
"learning_rate": 9.630503213408383e-06,
"loss": 0.3744,
"step": 175
},
{
"epoch": 0.6439024390243903,
"grad_norm": 0.6744263389412143,
"learning_rate": 9.622442118167396e-06,
"loss": 0.3894,
"step": 176
},
{
"epoch": 0.6475609756097561,
"grad_norm": 0.7729120265080925,
"learning_rate": 9.614297485383693e-06,
"loss": 0.3584,
"step": 177
},
{
"epoch": 0.651219512195122,
"grad_norm": 0.7028666356034352,
"learning_rate": 9.606069462248432e-06,
"loss": 0.3617,
"step": 178
},
{
"epoch": 0.6548780487804878,
"grad_norm": 0.7402052980733588,
"learning_rate": 9.597758197459814e-06,
"loss": 0.4102,
"step": 179
},
{
"epoch": 0.6585365853658537,
"grad_norm": 0.7506506066011618,
"learning_rate": 9.589363841220398e-06,
"loss": 0.3989,
"step": 180
},
{
"epoch": 0.6621951219512195,
"grad_norm": 0.7794807064224093,
"learning_rate": 9.580886545234387e-06,
"loss": 0.3777,
"step": 181
},
{
"epoch": 0.6658536585365854,
"grad_norm": 0.6495178330993069,
"learning_rate": 9.572326462704884e-06,
"loss": 0.3861,
"step": 182
},
{
"epoch": 0.6695121951219513,
"grad_norm": 0.7946496357370726,
"learning_rate": 9.563683748331123e-06,
"loss": 0.3852,
"step": 183
},
{
"epoch": 0.6731707317073171,
"grad_norm": 0.7486601013846049,
"learning_rate": 9.554958558305678e-06,
"loss": 0.397,
"step": 184
},
{
"epoch": 0.676829268292683,
"grad_norm": 0.7088712034584526,
"learning_rate": 9.546151050311632e-06,
"loss": 0.3902,
"step": 185
},
{
"epoch": 0.6804878048780488,
"grad_norm": 0.6932437360491843,
"learning_rate": 9.537261383519736e-06,
"loss": 0.3433,
"step": 186
},
{
"epoch": 0.6841463414634147,
"grad_norm": 0.7183303900647539,
"learning_rate": 9.528289718585523e-06,
"loss": 0.3749,
"step": 187
},
{
"epoch": 0.6878048780487804,
"grad_norm": 0.6748508130997356,
"learning_rate": 9.519236217646419e-06,
"loss": 0.3748,
"step": 188
},
{
"epoch": 0.6914634146341463,
"grad_norm": 0.6706028292433445,
"learning_rate": 9.510101044318795e-06,
"loss": 0.3757,
"step": 189
},
{
"epoch": 0.6951219512195121,
"grad_norm": 0.6870273832569191,
"learning_rate": 9.500884363695025e-06,
"loss": 0.3638,
"step": 190
},
{
"epoch": 0.698780487804878,
"grad_norm": 0.7321391217873353,
"learning_rate": 9.49158634234049e-06,
"loss": 0.3799,
"step": 191
},
{
"epoch": 0.7024390243902439,
"grad_norm": 0.6003693164345862,
"learning_rate": 9.482207148290585e-06,
"loss": 0.3804,
"step": 192
},
{
"epoch": 0.7060975609756097,
"grad_norm": 0.6583230239921988,
"learning_rate": 9.472746951047657e-06,
"loss": 0.3725,
"step": 193
},
{
"epoch": 0.7097560975609756,
"grad_norm": 0.6742830191100991,
"learning_rate": 9.463205921577972e-06,
"loss": 0.3662,
"step": 194
},
{
"epoch": 0.7134146341463414,
"grad_norm": 0.7010665460150319,
"learning_rate": 9.453584232308593e-06,
"loss": 0.3623,
"step": 195
},
{
"epoch": 0.7170731707317073,
"grad_norm": 0.6448071014559433,
"learning_rate": 9.443882057124294e-06,
"loss": 0.357,
"step": 196
},
{
"epoch": 0.7207317073170731,
"grad_norm": 0.709113132802626,
"learning_rate": 9.434099571364396e-06,
"loss": 0.385,
"step": 197
},
{
"epoch": 0.724390243902439,
"grad_norm": 0.7570745833677422,
"learning_rate": 9.424236951819612e-06,
"loss": 0.3851,
"step": 198
},
{
"epoch": 0.7280487804878049,
"grad_norm": 0.642564214786846,
"learning_rate": 9.41429437672884e-06,
"loss": 0.3773,
"step": 199
},
{
"epoch": 0.7317073170731707,
"grad_norm": 0.68482160328781,
"learning_rate": 9.40427202577595e-06,
"loss": 0.3911,
"step": 200
},
{
"epoch": 0.7353658536585366,
"grad_norm": 0.690309120558691,
"learning_rate": 9.394170080086538e-06,
"loss": 0.3843,
"step": 201
},
{
"epoch": 0.7390243902439024,
"grad_norm": 0.6769970564023863,
"learning_rate": 9.383988722224642e-06,
"loss": 0.3887,
"step": 202
},
{
"epoch": 0.7426829268292683,
"grad_norm": 0.7074513706865134,
"learning_rate": 9.37372813618946e-06,
"loss": 0.3793,
"step": 203
},
{
"epoch": 0.7463414634146341,
"grad_norm": 0.6777055688042114,
"learning_rate": 9.363388507412005e-06,
"loss": 0.3938,
"step": 204
},
{
"epoch": 0.75,
"grad_norm": 0.6720337268026889,
"learning_rate": 9.35297002275177e-06,
"loss": 0.3859,
"step": 205
},
{
"epoch": 0.7536585365853659,
"grad_norm": 0.655107581741595,
"learning_rate": 9.342472870493342e-06,
"loss": 0.386,
"step": 206
},
{
"epoch": 0.7573170731707317,
"grad_norm": 0.6851327055789187,
"learning_rate": 9.331897240343001e-06,
"loss": 0.392,
"step": 207
},
{
"epoch": 0.7609756097560976,
"grad_norm": 0.6556370821127617,
"learning_rate": 9.321243323425298e-06,
"loss": 0.3851,
"step": 208
},
{
"epoch": 0.7646341463414634,
"grad_norm": 0.6401412447193511,
"learning_rate": 9.310511312279586e-06,
"loss": 0.365,
"step": 209
},
{
"epoch": 0.7682926829268293,
"grad_norm": 0.6762861190866665,
"learning_rate": 9.29970140085656e-06,
"loss": 0.3721,
"step": 210
},
{
"epoch": 0.7719512195121951,
"grad_norm": 0.6713698619241105,
"learning_rate": 9.288813784514733e-06,
"loss": 0.3768,
"step": 211
},
{
"epoch": 0.775609756097561,
"grad_norm": 0.6782913708342606,
"learning_rate": 9.277848660016921e-06,
"loss": 0.3826,
"step": 212
},
{
"epoch": 0.7792682926829269,
"grad_norm": 0.7857162080237674,
"learning_rate": 9.266806225526677e-06,
"loss": 0.3876,
"step": 213
},
{
"epoch": 0.7829268292682927,
"grad_norm": 0.7058571807350672,
"learning_rate": 9.255686680604712e-06,
"loss": 0.3775,
"step": 214
},
{
"epoch": 0.7865853658536586,
"grad_norm": 0.6470360391059924,
"learning_rate": 9.244490226205294e-06,
"loss": 0.3676,
"step": 215
},
{
"epoch": 0.7902439024390244,
"grad_norm": 0.693553517207013,
"learning_rate": 9.233217064672607e-06,
"loss": 0.3685,
"step": 216
},
{
"epoch": 0.7939024390243903,
"grad_norm": 0.7721074742274187,
"learning_rate": 9.221867399737101e-06,
"loss": 0.3955,
"step": 217
},
{
"epoch": 0.7975609756097561,
"grad_norm": 0.6937965952731138,
"learning_rate": 9.21044143651181e-06,
"loss": 0.3842,
"step": 218
},
{
"epoch": 0.801219512195122,
"grad_norm": 0.669785255096806,
"learning_rate": 9.198939381488638e-06,
"loss": 0.3701,
"step": 219
},
{
"epoch": 0.8048780487804879,
"grad_norm": 0.68468277903504,
"learning_rate": 9.187361442534641e-06,
"loss": 0.3744,
"step": 220
},
{
"epoch": 0.8085365853658537,
"grad_norm": 0.7085279773560048,
"learning_rate": 9.175707828888255e-06,
"loss": 0.3592,
"step": 221
},
{
"epoch": 0.8121951219512196,
"grad_norm": 0.6347781601971861,
"learning_rate": 9.163978751155522e-06,
"loss": 0.3704,
"step": 222
},
{
"epoch": 0.8158536585365853,
"grad_norm": 0.6200919359866742,
"learning_rate": 9.152174421306288e-06,
"loss": 0.3604,
"step": 223
},
{
"epoch": 0.8195121951219512,
"grad_norm": 0.6154635559910653,
"learning_rate": 9.140295052670365e-06,
"loss": 0.3851,
"step": 224
},
{
"epoch": 0.823170731707317,
"grad_norm": 0.647566035539756,
"learning_rate": 9.128340859933677e-06,
"loss": 0.364,
"step": 225
},
{
"epoch": 0.8268292682926829,
"grad_norm": 0.6163426618840316,
"learning_rate": 9.116312059134386e-06,
"loss": 0.3703,
"step": 226
},
{
"epoch": 0.8304878048780487,
"grad_norm": 0.6813695201145943,
"learning_rate": 9.104208867658977e-06,
"loss": 0.3801,
"step": 227
},
{
"epoch": 0.8341463414634146,
"grad_norm": 0.6447781117989946,
"learning_rate": 9.092031504238343e-06,
"loss": 0.371,
"step": 228
},
{
"epoch": 0.8378048780487805,
"grad_norm": 0.6488889361560148,
"learning_rate": 9.079780188943819e-06,
"loss": 0.3611,
"step": 229
},
{
"epoch": 0.8414634146341463,
"grad_norm": 0.6892176659739193,
"learning_rate": 9.067455143183213e-06,
"loss": 0.3645,
"step": 230
},
{
"epoch": 0.8451219512195122,
"grad_norm": 0.6615908246281889,
"learning_rate": 9.0550565896968e-06,
"loss": 0.3717,
"step": 231
},
{
"epoch": 0.848780487804878,
"grad_norm": 0.6608091922176914,
"learning_rate": 9.042584752553303e-06,
"loss": 0.3939,
"step": 232
},
{
"epoch": 0.8524390243902439,
"grad_norm": 0.7781223334917082,
"learning_rate": 9.030039857145836e-06,
"loss": 0.3608,
"step": 233
},
{
"epoch": 0.8560975609756097,
"grad_norm": 0.6356612947154711,
"learning_rate": 9.017422130187834e-06,
"loss": 0.3691,
"step": 234
},
{
"epoch": 0.8597560975609756,
"grad_norm": 0.6673516460275246,
"learning_rate": 9.004731799708961e-06,
"loss": 0.3587,
"step": 235
},
{
"epoch": 0.8634146341463415,
"grad_norm": 0.756626298213216,
"learning_rate": 8.991969095050976e-06,
"loss": 0.3814,
"step": 236
},
{
"epoch": 0.8670731707317073,
"grad_norm": 0.7356372585319284,
"learning_rate": 8.979134246863598e-06,
"loss": 0.3651,
"step": 237
},
{
"epoch": 0.8707317073170732,
"grad_norm": 0.7125847173556636,
"learning_rate": 8.966227487100346e-06,
"loss": 0.3758,
"step": 238
},
{
"epoch": 0.874390243902439,
"grad_norm": 0.7382930519755999,
"learning_rate": 8.953249049014324e-06,
"loss": 0.3871,
"step": 239
},
{
"epoch": 0.8780487804878049,
"grad_norm": 0.7666434750440383,
"learning_rate": 8.94019916715402e-06,
"loss": 0.3807,
"step": 240
},
{
"epoch": 0.8817073170731707,
"grad_norm": 0.6976662371355642,
"learning_rate": 8.927078077359076e-06,
"loss": 0.3721,
"step": 241
},
{
"epoch": 0.8853658536585366,
"grad_norm": 0.7860831512575027,
"learning_rate": 8.913886016756007e-06,
"loss": 0.3754,
"step": 242
},
{
"epoch": 0.8890243902439025,
"grad_norm": 0.7069750574806342,
"learning_rate": 8.900623223753923e-06,
"loss": 0.3904,
"step": 243
},
{
"epoch": 0.8926829268292683,
"grad_norm": 0.7292537500467997,
"learning_rate": 8.887289938040229e-06,
"loss": 0.388,
"step": 244
},
{
"epoch": 0.8963414634146342,
"grad_norm": 0.7915538577084609,
"learning_rate": 8.873886400576279e-06,
"loss": 0.3836,
"step": 245
},
{
"epoch": 0.9,
"grad_norm": 0.7452425515775055,
"learning_rate": 8.860412853593033e-06,
"loss": 0.3736,
"step": 246
},
{
"epoch": 0.9036585365853659,
"grad_norm": 0.648977235676723,
"learning_rate": 8.846869540586671e-06,
"loss": 0.3678,
"step": 247
},
{
"epoch": 0.9073170731707317,
"grad_norm": 0.6864106518210626,
"learning_rate": 8.8332567063142e-06,
"loss": 0.3823,
"step": 248
},
{
"epoch": 0.9109756097560976,
"grad_norm": 0.7349118351330677,
"learning_rate": 8.819574596789025e-06,
"loss": 0.3761,
"step": 249
},
{
"epoch": 0.9146341463414634,
"grad_norm": 0.7633616936437505,
"learning_rate": 8.805823459276501e-06,
"loss": 0.388,
"step": 250
},
{
"epoch": 0.9182926829268293,
"grad_norm": 0.6524916307250452,
"learning_rate": 8.792003542289478e-06,
"loss": 0.3747,
"step": 251
},
{
"epoch": 0.9219512195121952,
"grad_norm": 0.6536188734248112,
"learning_rate": 8.77811509558379e-06,
"loss": 0.3664,
"step": 252
},
{
"epoch": 0.925609756097561,
"grad_norm": 0.7531682065790463,
"learning_rate": 8.764158370153755e-06,
"loss": 0.3727,
"step": 253
},
{
"epoch": 0.9292682926829269,
"grad_norm": 0.693728850312522,
"learning_rate": 8.75013361822764e-06,
"loss": 0.3674,
"step": 254
},
{
"epoch": 0.9329268292682927,
"grad_norm": 0.657737265214606,
"learning_rate": 8.736041093263092e-06,
"loss": 0.376,
"step": 255
},
{
"epoch": 0.9365853658536586,
"grad_norm": 0.7496080034820701,
"learning_rate": 8.721881049942565e-06,
"loss": 0.3684,
"step": 256
},
{
"epoch": 0.9402439024390243,
"grad_norm": 0.6683122746778191,
"learning_rate": 8.707653744168718e-06,
"loss": 0.3653,
"step": 257
},
{
"epoch": 0.9439024390243902,
"grad_norm": 0.6546707237838714,
"learning_rate": 8.693359433059789e-06,
"loss": 0.3747,
"step": 258
},
{
"epoch": 0.947560975609756,
"grad_norm": 0.6924361208939677,
"learning_rate": 8.67899837494494e-06,
"loss": 0.3826,
"step": 259
},
{
"epoch": 0.9512195121951219,
"grad_norm": 0.7048197646181643,
"learning_rate": 8.664570829359608e-06,
"loss": 0.3713,
"step": 260
},
{
"epoch": 0.9548780487804878,
"grad_norm": 0.6717794456103233,
"learning_rate": 8.650077057040794e-06,
"loss": 0.3863,
"step": 261
},
{
"epoch": 0.9585365853658536,
"grad_norm": 0.6360542228266761,
"learning_rate": 8.635517319922359e-06,
"loss": 0.357,
"step": 262
},
{
"epoch": 0.9621951219512195,
"grad_norm": 0.605097486114699,
"learning_rate": 8.620891881130297e-06,
"loss": 0.3478,
"step": 263
},
{
"epoch": 0.9658536585365853,
"grad_norm": 0.7759870929652042,
"learning_rate": 8.606201004977967e-06,
"loss": 0.3847,
"step": 264
},
{
"epoch": 0.9695121951219512,
"grad_norm": 0.6911181654981698,
"learning_rate": 8.591444956961333e-06,
"loss": 0.3703,
"step": 265
},
{
"epoch": 0.973170731707317,
"grad_norm": 0.6853370502539734,
"learning_rate": 8.57662400375414e-06,
"loss": 0.3644,
"step": 266
},
{
"epoch": 0.9768292682926829,
"grad_norm": 0.6849837602731838,
"learning_rate": 8.561738413203124e-06,
"loss": 0.3742,
"step": 267
},
{
"epoch": 0.9804878048780488,
"grad_norm": 0.6924423604236972,
"learning_rate": 8.546788454323153e-06,
"loss": 0.3696,
"step": 268
},
{
"epoch": 0.9841463414634146,
"grad_norm": 0.6672850829111578,
"learning_rate": 8.53177439729237e-06,
"loss": 0.3672,
"step": 269
},
{
"epoch": 0.9878048780487805,
"grad_norm": 0.6884397982659812,
"learning_rate": 8.516696513447308e-06,
"loss": 0.3695,
"step": 270
},
{
"epoch": 0.9914634146341463,
"grad_norm": 0.6448421759053435,
"learning_rate": 8.501555075277997e-06,
"loss": 0.3525,
"step": 271
},
{
"epoch": 0.9951219512195122,
"grad_norm": 0.643104212334238,
"learning_rate": 8.486350356423021e-06,
"loss": 0.3523,
"step": 272
},
{
"epoch": 0.998780487804878,
"grad_norm": 0.639193169537897,
"learning_rate": 8.471082631664588e-06,
"loss": 0.3636,
"step": 273
},
{
"epoch": 1.0,
"grad_norm": 0.639193169537897,
"learning_rate": 8.455752176923561e-06,
"loss": 0.3744,
"step": 274
},
{
"epoch": 1.0036585365853659,
"grad_norm": 1.1473923148744305,
"learning_rate": 8.440359269254468e-06,
"loss": 0.3204,
"step": 275
},
{
"epoch": 1.0073170731707317,
"grad_norm": 0.6738354361167268,
"learning_rate": 8.424904186840495e-06,
"loss": 0.3185,
"step": 276
},
{
"epoch": 1.0109756097560976,
"grad_norm": 0.569651838834323,
"learning_rate": 8.40938720898846e-06,
"loss": 0.3242,
"step": 277
},
{
"epoch": 1.0146341463414634,
"grad_norm": 0.6512424474189245,
"learning_rate": 8.393808616123771e-06,
"loss": 0.3378,
"step": 278
},
{
"epoch": 1.0182926829268293,
"grad_norm": 0.6936858791581263,
"learning_rate": 8.378168689785346e-06,
"loss": 0.3308,
"step": 279
},
{
"epoch": 1.0219512195121951,
"grad_norm": 0.6563652415572625,
"learning_rate": 8.36246771262054e-06,
"loss": 0.3094,
"step": 280
},
{
"epoch": 1.025609756097561,
"grad_norm": 0.6298667529680904,
"learning_rate": 8.346705968380015e-06,
"loss": 0.3323,
"step": 281
},
{
"epoch": 1.0292682926829269,
"grad_norm": 0.6639245800866381,
"learning_rate": 8.330883741912644e-06,
"loss": 0.3209,
"step": 282
},
{
"epoch": 1.0329268292682927,
"grad_norm": 0.6286428100246292,
"learning_rate": 8.315001319160327e-06,
"loss": 0.3136,
"step": 283
},
{
"epoch": 1.0365853658536586,
"grad_norm": 0.617323964575384,
"learning_rate": 8.299058987152854e-06,
"loss": 0.292,
"step": 284
},
{
"epoch": 1.0402439024390244,
"grad_norm": 0.6666320305150188,
"learning_rate": 8.283057034002699e-06,
"loss": 0.3275,
"step": 285
},
{
"epoch": 1.0439024390243903,
"grad_norm": 0.6428680669935322,
"learning_rate": 8.26699574889982e-06,
"loss": 0.3153,
"step": 286
},
{
"epoch": 1.0475609756097561,
"grad_norm": 0.637715026411299,
"learning_rate": 8.250875422106434e-06,
"loss": 0.3228,
"step": 287
},
{
"epoch": 1.051219512195122,
"grad_norm": 0.6522273141210564,
"learning_rate": 8.234696344951767e-06,
"loss": 0.3066,
"step": 288
},
{
"epoch": 1.0548780487804879,
"grad_norm": 0.6454285200212009,
"learning_rate": 8.21845880982679e-06,
"loss": 0.3268,
"step": 289
},
{
"epoch": 1.0585365853658537,
"grad_norm": 0.5990451449609265,
"learning_rate": 8.202163110178945e-06,
"loss": 0.3152,
"step": 290
},
{
"epoch": 1.0621951219512196,
"grad_norm": 0.6588671859842463,
"learning_rate": 8.185809540506818e-06,
"loss": 0.3108,
"step": 291
},
{
"epoch": 1.0658536585365854,
"grad_norm": 0.625175506582043,
"learning_rate": 8.169398396354844e-06,
"loss": 0.2888,
"step": 292
},
{
"epoch": 1.0695121951219513,
"grad_norm": 0.7086099623038007,
"learning_rate": 8.152929974307949e-06,
"loss": 0.3096,
"step": 293
},
{
"epoch": 1.0731707317073171,
"grad_norm": 0.612473791527845,
"learning_rate": 8.136404571986194e-06,
"loss": 0.3147,
"step": 294
},
{
"epoch": 1.076829268292683,
"grad_norm": 0.7424912205691075,
"learning_rate": 8.1198224880394e-06,
"loss": 0.316,
"step": 295
},
{
"epoch": 1.0804878048780489,
"grad_norm": 0.6947218892107693,
"learning_rate": 8.103184022141746e-06,
"loss": 0.3188,
"step": 296
},
{
"epoch": 1.0841463414634147,
"grad_norm": 0.6501089480964608,
"learning_rate": 8.08648947498635e-06,
"loss": 0.3076,
"step": 297
},
{
"epoch": 1.0878048780487806,
"grad_norm": 0.7085015308115117,
"learning_rate": 8.069739148279851e-06,
"loss": 0.3263,
"step": 298
},
{
"epoch": 1.0914634146341464,
"grad_norm": 0.6977632322290905,
"learning_rate": 8.052933344736937e-06,
"loss": 0.3352,
"step": 299
},
{
"epoch": 1.0951219512195123,
"grad_norm": 0.6237640578371839,
"learning_rate": 8.036072368074883e-06,
"loss": 0.3078,
"step": 300
},
{
"epoch": 1.0987804878048781,
"grad_norm": 0.7293112733748618,
"learning_rate": 8.019156523008065e-06,
"loss": 0.3109,
"step": 301
},
{
"epoch": 1.102439024390244,
"grad_norm": 0.6523525486092905,
"learning_rate": 8.002186115242447e-06,
"loss": 0.3204,
"step": 302
},
{
"epoch": 1.1060975609756099,
"grad_norm": 0.6831085429924993,
"learning_rate": 7.985161451470061e-06,
"loss": 0.3188,
"step": 303
},
{
"epoch": 1.1097560975609757,
"grad_norm": 0.6789690434687307,
"learning_rate": 7.968082839363462e-06,
"loss": 0.3266,
"step": 304
},
{
"epoch": 1.1134146341463416,
"grad_norm": 0.6348758617481739,
"learning_rate": 7.95095058757017e-06,
"loss": 0.3121,
"step": 305
},
{
"epoch": 1.1170731707317074,
"grad_norm": 0.6546771590816294,
"learning_rate": 7.933765005707085e-06,
"loss": 0.3207,
"step": 306
},
{
"epoch": 1.120731707317073,
"grad_norm": 0.6642730324041832,
"learning_rate": 7.916526404354905e-06,
"loss": 0.3052,
"step": 307
},
{
"epoch": 1.1243902439024391,
"grad_norm": 0.6578600118558376,
"learning_rate": 7.899235095052497e-06,
"loss": 0.3042,
"step": 308
},
{
"epoch": 1.1280487804878048,
"grad_norm": 0.6315343541006101,
"learning_rate": 7.881891390291281e-06,
"loss": 0.3127,
"step": 309
},
{
"epoch": 1.1317073170731708,
"grad_norm": 0.688043709337069,
"learning_rate": 7.864495603509571e-06,
"loss": 0.3268,
"step": 310
},
{
"epoch": 1.1353658536585365,
"grad_norm": 0.6626118603817104,
"learning_rate": 7.84704804908692e-06,
"loss": 0.3151,
"step": 311
},
{
"epoch": 1.1390243902439026,
"grad_norm": 0.624691719057794,
"learning_rate": 7.829549042338436e-06,
"loss": 0.3647,
"step": 312
},
{
"epoch": 1.1426829268292682,
"grad_norm": 0.6357333460431095,
"learning_rate": 7.811998899509076e-06,
"loss": 0.2989,
"step": 313
},
{
"epoch": 1.146341463414634,
"grad_norm": 0.627087816143451,
"learning_rate": 7.794397937767941e-06,
"loss": 0.3195,
"step": 314
},
{
"epoch": 1.15,
"grad_norm": 0.6715288963393866,
"learning_rate": 7.77674647520254e-06,
"loss": 0.2984,
"step": 315
},
{
"epoch": 1.1536585365853658,
"grad_norm": 0.6767962115518564,
"learning_rate": 7.759044830813036e-06,
"loss": 0.3053,
"step": 316
},
{
"epoch": 1.1573170731707316,
"grad_norm": 0.6207594775740026,
"learning_rate": 7.741293324506493e-06,
"loss": 0.3019,
"step": 317
},
{
"epoch": 1.1609756097560975,
"grad_norm": 0.661920788242436,
"learning_rate": 7.723492277091089e-06,
"loss": 0.3154,
"step": 318
},
{
"epoch": 1.1646341463414633,
"grad_norm": 0.6036319823158318,
"learning_rate": 7.705642010270306e-06,
"loss": 0.3088,
"step": 319
},
{
"epoch": 1.1682926829268292,
"grad_norm": 0.6796660873801121,
"learning_rate": 7.687742846637141e-06,
"loss": 0.3224,
"step": 320
},
{
"epoch": 1.171951219512195,
"grad_norm": 0.6198409436731228,
"learning_rate": 7.66979510966825e-06,
"loss": 0.3243,
"step": 321
},
{
"epoch": 1.175609756097561,
"grad_norm": 0.6178487393901908,
"learning_rate": 7.651799123718126e-06,
"loss": 0.3315,
"step": 322
},
{
"epoch": 1.1792682926829268,
"grad_norm": 0.6620709841506736,
"learning_rate": 7.63375521401322e-06,
"loss": 0.3286,
"step": 323
},
{
"epoch": 1.1829268292682926,
"grad_norm": 0.6496124170177628,
"learning_rate": 7.615663706646063e-06,
"loss": 0.3096,
"step": 324
},
{
"epoch": 1.1865853658536585,
"grad_norm": 0.5587747317986588,
"learning_rate": 7.597524928569391e-06,
"loss": 0.3208,
"step": 325
},
{
"epoch": 1.1902439024390243,
"grad_norm": 0.6701337541172385,
"learning_rate": 7.579339207590216e-06,
"loss": 0.3142,
"step": 326
},
{
"epoch": 1.1939024390243902,
"grad_norm": 0.6825894061442603,
"learning_rate": 7.561106872363911e-06,
"loss": 0.3081,
"step": 327
},
{
"epoch": 1.197560975609756,
"grad_norm": 0.6201375755843246,
"learning_rate": 7.542828252388271e-06,
"loss": 0.3307,
"step": 328
},
{
"epoch": 1.201219512195122,
"grad_norm": 0.6404653599994984,
"learning_rate": 7.524503677997557e-06,
"loss": 0.3088,
"step": 329
},
{
"epoch": 1.2048780487804878,
"grad_norm": 0.6681252945789206,
"learning_rate": 7.506133480356523e-06,
"loss": 0.3062,
"step": 330
},
{
"epoch": 1.2085365853658536,
"grad_norm": 0.6348173893075612,
"learning_rate": 7.487717991454441e-06,
"loss": 0.2998,
"step": 331
},
{
"epoch": 1.2121951219512195,
"grad_norm": 0.6476835069549743,
"learning_rate": 7.469257544099081e-06,
"loss": 0.3228,
"step": 332
},
{
"epoch": 1.2158536585365853,
"grad_norm": 0.6873002671793336,
"learning_rate": 7.450752471910725e-06,
"loss": 0.3177,
"step": 333
},
{
"epoch": 1.2195121951219512,
"grad_norm": 0.5995570200932562,
"learning_rate": 7.432203109316112e-06,
"loss": 0.3157,
"step": 334
},
{
"epoch": 1.223170731707317,
"grad_norm": 0.7020387116477846,
"learning_rate": 7.413609791542407e-06,
"loss": 0.3221,
"step": 335
},
{
"epoch": 1.226829268292683,
"grad_norm": 0.6813547839022456,
"learning_rate": 7.394972854611142e-06,
"loss": 0.3105,
"step": 336
},
{
"epoch": 1.2304878048780488,
"grad_norm": 0.6156200538885269,
"learning_rate": 7.376292635332142e-06,
"loss": 0.3019,
"step": 337
},
{
"epoch": 1.2341463414634146,
"grad_norm": 0.6645540728481677,
"learning_rate": 7.3575694712974335e-06,
"loss": 0.3083,
"step": 338
},
{
"epoch": 1.2378048780487805,
"grad_norm": 0.6692767414956421,
"learning_rate": 7.338803700875153e-06,
"loss": 0.3088,
"step": 339
},
{
"epoch": 1.2414634146341463,
"grad_norm": 0.6616979629249964,
"learning_rate": 7.319995663203425e-06,
"loss": 0.3174,
"step": 340
},
{
"epoch": 1.2451219512195122,
"grad_norm": 0.7614157176916696,
"learning_rate": 7.301145698184233e-06,
"loss": 0.3117,
"step": 341
},
{
"epoch": 1.248780487804878,
"grad_norm": 0.5694694334732344,
"learning_rate": 7.282254146477281e-06,
"loss": 0.2977,
"step": 342
},
{
"epoch": 1.252439024390244,
"grad_norm": 0.613501009109194,
"learning_rate": 7.263321349493833e-06,
"loss": 0.3029,
"step": 343
},
{
"epoch": 1.2560975609756098,
"grad_norm": 0.6185975460458262,
"learning_rate": 7.244347649390542e-06,
"loss": 0.3052,
"step": 344
},
{
"epoch": 1.2597560975609756,
"grad_norm": 0.6450583790490763,
"learning_rate": 7.225333389063276e-06,
"loss": 0.3203,
"step": 345
},
{
"epoch": 1.2634146341463415,
"grad_norm": 0.5848954020900502,
"learning_rate": 7.206278912140907e-06,
"loss": 0.3071,
"step": 346
},
{
"epoch": 1.2670731707317073,
"grad_norm": 0.6761598102383233,
"learning_rate": 7.187184562979112e-06,
"loss": 0.3149,
"step": 347
},
{
"epoch": 1.2707317073170732,
"grad_norm": 0.6333122175147837,
"learning_rate": 7.168050686654144e-06,
"loss": 0.3287,
"step": 348
},
{
"epoch": 1.274390243902439,
"grad_norm": 0.6142569877993334,
"learning_rate": 7.148877628956598e-06,
"loss": 0.3108,
"step": 349
},
{
"epoch": 1.278048780487805,
"grad_norm": 0.5992913682315228,
"learning_rate": 7.1296657363851644e-06,
"loss": 0.3083,
"step": 350
},
{
"epoch": 1.2817073170731708,
"grad_norm": 0.693923292682762,
"learning_rate": 7.110415356140357e-06,
"loss": 0.3241,
"step": 351
},
{
"epoch": 1.2853658536585366,
"grad_norm": 0.6135479274958128,
"learning_rate": 7.091126836118249e-06,
"loss": 0.301,
"step": 352
},
{
"epoch": 1.2890243902439025,
"grad_norm": 0.5671663217628563,
"learning_rate": 7.071800524904185e-06,
"loss": 0.3025,
"step": 353
},
{
"epoch": 1.2926829268292683,
"grad_norm": 0.6340067432971223,
"learning_rate": 7.052436771766474e-06,
"loss": 0.3056,
"step": 354
},
{
"epoch": 1.2963414634146342,
"grad_norm": 0.6749369096026892,
"learning_rate": 7.033035926650084e-06,
"loss": 0.3165,
"step": 355
},
{
"epoch": 1.3,
"grad_norm": 0.6601655491194676,
"learning_rate": 7.0135983401703125e-06,
"loss": 0.3082,
"step": 356
},
{
"epoch": 1.303658536585366,
"grad_norm": 0.5790814002165854,
"learning_rate": 6.994124363606457e-06,
"loss": 0.3016,
"step": 357
},
{
"epoch": 1.3073170731707318,
"grad_norm": 0.6462497543472036,
"learning_rate": 6.974614348895459e-06,
"loss": 0.3227,
"step": 358
},
{
"epoch": 1.3109756097560976,
"grad_norm": 0.6818223846641225,
"learning_rate": 6.95506864862555e-06,
"loss": 0.3144,
"step": 359
},
{
"epoch": 1.3146341463414635,
"grad_norm": 0.5953658678828784,
"learning_rate": 6.9354876160298764e-06,
"loss": 0.2803,
"step": 360
},
{
"epoch": 1.3182926829268293,
"grad_norm": 0.6480981830399114,
"learning_rate": 6.915871604980115e-06,
"loss": 0.3359,
"step": 361
},
{
"epoch": 1.3219512195121952,
"grad_norm": 0.7143958836163122,
"learning_rate": 6.89622096998008e-06,
"loss": 0.3112,
"step": 362
},
{
"epoch": 1.325609756097561,
"grad_norm": 0.6281272945430706,
"learning_rate": 6.876536066159315e-06,
"loss": 0.318,
"step": 363
},
{
"epoch": 1.329268292682927,
"grad_norm": 0.6195212204195637,
"learning_rate": 6.856817249266676e-06,
"loss": 0.3101,
"step": 364
},
{
"epoch": 1.3329268292682928,
"grad_norm": 0.67165922544889,
"learning_rate": 6.837064875663901e-06,
"loss": 0.3055,
"step": 365
},
{
"epoch": 1.3365853658536586,
"grad_norm": 0.6195823500964106,
"learning_rate": 6.817279302319171e-06,
"loss": 0.3302,
"step": 366
},
{
"epoch": 1.3402439024390245,
"grad_norm": 0.6100082115147878,
"learning_rate": 6.797460886800658e-06,
"loss": 0.3242,
"step": 367
},
{
"epoch": 1.34390243902439,
"grad_norm": 0.6440688199214216,
"learning_rate": 6.777609987270064e-06,
"loss": 0.3217,
"step": 368
},
{
"epoch": 1.3475609756097562,
"grad_norm": 0.5621112226548872,
"learning_rate": 6.757726962476145e-06,
"loss": 0.3042,
"step": 369
},
{
"epoch": 1.3512195121951218,
"grad_norm": 0.6592929751452373,
"learning_rate": 6.737812171748234e-06,
"loss": 0.3092,
"step": 370
},
{
"epoch": 1.354878048780488,
"grad_norm": 0.5840319742865415,
"learning_rate": 6.717865974989739e-06,
"loss": 0.3069,
"step": 371
},
{
"epoch": 1.3585365853658535,
"grad_norm": 0.6479111812916528,
"learning_rate": 6.6978887326716455e-06,
"loss": 0.3047,
"step": 372
},
{
"epoch": 1.3621951219512196,
"grad_norm": 0.6405731778314568,
"learning_rate": 6.677880805825998e-06,
"loss": 0.313,
"step": 373
},
{
"epoch": 1.3658536585365852,
"grad_norm": 0.596212161768833,
"learning_rate": 6.6578425560393835e-06,
"loss": 0.3035,
"step": 374
},
{
"epoch": 1.3695121951219513,
"grad_norm": 0.6045424729604323,
"learning_rate": 6.6377743454463785e-06,
"loss": 0.3281,
"step": 375
},
{
"epoch": 1.373170731707317,
"grad_norm": 0.6128263053984974,
"learning_rate": 6.617676536723024e-06,
"loss": 0.2897,
"step": 376
},
{
"epoch": 1.376829268292683,
"grad_norm": 0.5787213510400789,
"learning_rate": 6.597549493080263e-06,
"loss": 0.2921,
"step": 377
},
{
"epoch": 1.3804878048780487,
"grad_norm": 0.6279325443526219,
"learning_rate": 6.577393578257375e-06,
"loss": 0.3077,
"step": 378
},
{
"epoch": 1.3841463414634148,
"grad_norm": 0.6900536831097186,
"learning_rate": 6.557209156515403e-06,
"loss": 0.3362,
"step": 379
},
{
"epoch": 1.3878048780487804,
"grad_norm": 0.6253403111205682,
"learning_rate": 6.536996592630578e-06,
"loss": 0.2967,
"step": 380
},
{
"epoch": 1.3914634146341465,
"grad_norm": 0.6104584449859696,
"learning_rate": 6.516756251887711e-06,
"loss": 0.3184,
"step": 381
},
{
"epoch": 1.395121951219512,
"grad_norm": 0.6281312306081094,
"learning_rate": 6.496488500073608e-06,
"loss": 0.3126,
"step": 382
},
{
"epoch": 1.3987804878048782,
"grad_norm": 0.5854353977761765,
"learning_rate": 6.476193703470454e-06,
"loss": 0.3034,
"step": 383
},
{
"epoch": 1.4024390243902438,
"grad_norm": 0.5924846776205195,
"learning_rate": 6.455872228849182e-06,
"loss": 0.3122,
"step": 384
},
{
"epoch": 1.40609756097561,
"grad_norm": 0.629553578206909,
"learning_rate": 6.435524443462865e-06,
"loss": 0.3085,
"step": 385
},
{
"epoch": 1.4097560975609755,
"grad_norm": 0.6347794227496868,
"learning_rate": 6.415150715040066e-06,
"loss": 0.3043,
"step": 386
},
{
"epoch": 1.4134146341463414,
"grad_norm": 0.6367266740240046,
"learning_rate": 6.394751411778188e-06,
"loss": 0.3031,
"step": 387
},
{
"epoch": 1.4170731707317072,
"grad_norm": 0.5747360730840103,
"learning_rate": 6.374326902336838e-06,
"loss": 0.2992,
"step": 388
},
{
"epoch": 1.420731707317073,
"grad_norm": 0.6449249430626156,
"learning_rate": 6.353877555831144e-06,
"loss": 0.302,
"step": 389
},
{
"epoch": 1.424390243902439,
"grad_norm": 0.6542878218567583,
"learning_rate": 6.3334037418250975e-06,
"loss": 0.3015,
"step": 390
},
{
"epoch": 1.4280487804878048,
"grad_norm": 0.6025024314363183,
"learning_rate": 6.312905830324871e-06,
"loss": 0.3028,
"step": 391
},
{
"epoch": 1.4317073170731707,
"grad_norm": 0.6067694802806707,
"learning_rate": 6.292384191772128e-06,
"loss": 0.315,
"step": 392
},
{
"epoch": 1.4353658536585365,
"grad_norm": 0.6232527972793704,
"learning_rate": 6.271839197037337e-06,
"loss": 0.3042,
"step": 393
},
{
"epoch": 1.4390243902439024,
"grad_norm": 0.6135565219677418,
"learning_rate": 6.251271217413059e-06,
"loss": 0.3121,
"step": 394
},
{
"epoch": 1.4426829268292682,
"grad_norm": 0.6028567158447554,
"learning_rate": 6.230680624607237e-06,
"loss": 0.2943,
"step": 395
},
{
"epoch": 1.446341463414634,
"grad_norm": 0.5680751035754046,
"learning_rate": 6.210067790736496e-06,
"loss": 0.3121,
"step": 396
},
{
"epoch": 1.45,
"grad_norm": 0.6435335111059491,
"learning_rate": 6.189433088319394e-06,
"loss": 0.3001,
"step": 397
},
{
"epoch": 1.4536585365853658,
"grad_norm": 0.6316933527533821,
"learning_rate": 6.1687768902697045e-06,
"loss": 0.3176,
"step": 398
},
{
"epoch": 1.4573170731707317,
"grad_norm": 0.6257119035903327,
"learning_rate": 6.148099569889675e-06,
"loss": 0.3124,
"step": 399
},
{
"epoch": 1.4609756097560975,
"grad_norm": 0.6504617753214657,
"learning_rate": 6.127401500863281e-06,
"loss": 0.2971,
"step": 400
},
{
"epoch": 1.4646341463414634,
"grad_norm": 0.6085074214868041,
"learning_rate": 6.106683057249461e-06,
"loss": 0.2966,
"step": 401
},
{
"epoch": 1.4682926829268292,
"grad_norm": 0.6654694916787747,
"learning_rate": 6.085944613475381e-06,
"loss": 0.3062,
"step": 402
},
{
"epoch": 1.471951219512195,
"grad_norm": 0.656833640936344,
"learning_rate": 6.065186544329641e-06,
"loss": 0.3069,
"step": 403
},
{
"epoch": 1.475609756097561,
"grad_norm": 0.639981866077395,
"learning_rate": 6.044409224955522e-06,
"loss": 0.3031,
"step": 404
},
{
"epoch": 1.4792682926829268,
"grad_norm": 0.651818391326447,
"learning_rate": 6.023613030844194e-06,
"loss": 0.2899,
"step": 405
},
{
"epoch": 1.4829268292682927,
"grad_norm": 0.5860131898598232,
"learning_rate": 6.0027983378279355e-06,
"loss": 0.3151,
"step": 406
},
{
"epoch": 1.4865853658536585,
"grad_norm": 0.6329564555310796,
"learning_rate": 5.981965522073341e-06,
"loss": 0.3141,
"step": 407
},
{
"epoch": 1.4902439024390244,
"grad_norm": 0.5343136578984965,
"learning_rate": 5.96111496007452e-06,
"loss": 0.301,
"step": 408
},
{
"epoch": 1.4939024390243902,
"grad_norm": 0.6878892449113025,
"learning_rate": 5.940247028646299e-06,
"loss": 0.2918,
"step": 409
},
{
"epoch": 1.497560975609756,
"grad_norm": 0.6584719645530801,
"learning_rate": 5.919362104917403e-06,
"loss": 0.3039,
"step": 410
},
{
"epoch": 1.501219512195122,
"grad_norm": 0.6061856674952371,
"learning_rate": 5.898460566323649e-06,
"loss": 0.3071,
"step": 411
},
{
"epoch": 1.5048780487804878,
"grad_norm": 0.6329216805871392,
"learning_rate": 5.877542790601116e-06,
"loss": 0.2989,
"step": 412
},
{
"epoch": 1.5085365853658537,
"grad_norm": 0.5811394474667385,
"learning_rate": 5.856609155779327e-06,
"loss": 0.3084,
"step": 413
},
{
"epoch": 1.5121951219512195,
"grad_norm": 0.6624194190479747,
"learning_rate": 5.835660040174413e-06,
"loss": 0.3199,
"step": 414
},
{
"epoch": 1.5158536585365854,
"grad_norm": 0.5877849057929369,
"learning_rate": 5.814695822382274e-06,
"loss": 0.3062,
"step": 415
},
{
"epoch": 1.5195121951219512,
"grad_norm": 0.6324026124729334,
"learning_rate": 5.793716881271742e-06,
"loss": 0.3069,
"step": 416
},
{
"epoch": 1.523170731707317,
"grad_norm": 0.6541754157726875,
"learning_rate": 5.772723595977728e-06,
"loss": 0.322,
"step": 417
},
{
"epoch": 1.526829268292683,
"grad_norm": 0.6455997368435114,
"learning_rate": 5.751716345894377e-06,
"loss": 0.3034,
"step": 418
},
{
"epoch": 1.5304878048780488,
"grad_norm": 0.6120672144668279,
"learning_rate": 5.730695510668204e-06,
"loss": 0.3049,
"step": 419
},
{
"epoch": 1.5341463414634147,
"grad_norm": 0.6831050808314805,
"learning_rate": 5.709661470191241e-06,
"loss": 0.3114,
"step": 420
},
{
"epoch": 1.5378048780487805,
"grad_norm": 0.60423345440851,
"learning_rate": 5.688614604594165e-06,
"loss": 0.306,
"step": 421
},
{
"epoch": 1.5414634146341464,
"grad_norm": 0.6579384904880561,
"learning_rate": 5.66755529423943e-06,
"loss": 0.3129,
"step": 422
},
{
"epoch": 1.5451219512195122,
"grad_norm": 0.5655478798902573,
"learning_rate": 5.646483919714398e-06,
"loss": 0.2725,
"step": 423
},
{
"epoch": 1.548780487804878,
"grad_norm": 0.6876809669509347,
"learning_rate": 5.625400861824452e-06,
"loss": 0.3031,
"step": 424
},
{
"epoch": 1.552439024390244,
"grad_norm": 0.6359182181607586,
"learning_rate": 5.60430650158612e-06,
"loss": 0.3114,
"step": 425
},
{
"epoch": 1.5560975609756098,
"grad_norm": 0.6082595644514966,
"learning_rate": 5.583201220220189e-06,
"loss": 0.325,
"step": 426
},
{
"epoch": 1.5597560975609757,
"grad_norm": 0.6466616658146095,
"learning_rate": 5.562085399144815e-06,
"loss": 0.3052,
"step": 427
},
{
"epoch": 1.5634146341463415,
"grad_norm": 0.5830672803700676,
"learning_rate": 5.5409594199686265e-06,
"loss": 0.2983,
"step": 428
},
{
"epoch": 1.5670731707317072,
"grad_norm": 0.6190570589105842,
"learning_rate": 5.519823664483834e-06,
"loss": 0.2926,
"step": 429
},
{
"epoch": 1.5707317073170732,
"grad_norm": 0.6679466145592807,
"learning_rate": 5.4986785146593255e-06,
"loss": 0.2981,
"step": 430
},
{
"epoch": 1.5743902439024389,
"grad_norm": 0.6059070844416552,
"learning_rate": 5.477524352633764e-06,
"loss": 0.3112,
"step": 431
},
{
"epoch": 1.578048780487805,
"grad_norm": 0.6339996136313506,
"learning_rate": 5.4563615607086865e-06,
"loss": 0.3128,
"step": 432
},
{
"epoch": 1.5817073170731706,
"grad_norm": 0.638994673364677,
"learning_rate": 5.435190521341584e-06,
"loss": 0.3152,
"step": 433
},
{
"epoch": 1.5853658536585367,
"grad_norm": 0.6234136145013041,
"learning_rate": 5.414011617139004e-06,
"loss": 0.2789,
"step": 434
},
{
"epoch": 1.5890243902439023,
"grad_norm": 0.615079581586681,
"learning_rate": 5.392825230849626e-06,
"loss": 0.3015,
"step": 435
},
{
"epoch": 1.5926829268292684,
"grad_norm": 0.6205254532976391,
"learning_rate": 5.371631745357344e-06,
"loss": 0.2921,
"step": 436
},
{
"epoch": 1.596341463414634,
"grad_norm": 0.6076702662224442,
"learning_rate": 5.3504315436743545e-06,
"loss": 0.2941,
"step": 437
},
{
"epoch": 1.6,
"grad_norm": 0.6169851323637991,
"learning_rate": 5.329225008934228e-06,
"loss": 0.3107,
"step": 438
},
{
"epoch": 1.6036585365853657,
"grad_norm": 0.5916553236301388,
"learning_rate": 5.308012524384986e-06,
"loss": 0.3044,
"step": 439
},
{
"epoch": 1.6073170731707318,
"grad_norm": 0.6681712481979999,
"learning_rate": 5.286794473382178e-06,
"loss": 0.3116,
"step": 440
},
{
"epoch": 1.6109756097560974,
"grad_norm": 0.5631447516908423,
"learning_rate": 5.2655712393819504e-06,
"loss": 0.3226,
"step": 441
},
{
"epoch": 1.6146341463414635,
"grad_norm": 0.6284610548650537,
"learning_rate": 5.244343205934118e-06,
"loss": 0.2988,
"step": 442
},
{
"epoch": 1.6182926829268292,
"grad_norm": 0.6183892400516535,
"learning_rate": 5.223110756675231e-06,
"loss": 0.3129,
"step": 443
},
{
"epoch": 1.6219512195121952,
"grad_norm": 0.599291363344575,
"learning_rate": 5.201874275321642e-06,
"loss": 0.315,
"step": 444
},
{
"epoch": 1.6256097560975609,
"grad_norm": 0.6001954294328656,
"learning_rate": 5.1806341456625785e-06,
"loss": 0.298,
"step": 445
},
{
"epoch": 1.629268292682927,
"grad_norm": 0.6149268111955434,
"learning_rate": 5.159390751553191e-06,
"loss": 0.2865,
"step": 446
},
{
"epoch": 1.6329268292682926,
"grad_norm": 0.6512843754790982,
"learning_rate": 5.138144476907634e-06,
"loss": 0.3039,
"step": 447
},
{
"epoch": 1.6365853658536587,
"grad_norm": 0.650047956082531,
"learning_rate": 5.116895705692117e-06,
"loss": 0.3114,
"step": 448
},
{
"epoch": 1.6402439024390243,
"grad_norm": 0.6029542336663615,
"learning_rate": 5.095644821917964e-06,
"loss": 0.3068,
"step": 449
},
{
"epoch": 1.6439024390243904,
"grad_norm": 0.6007591221577205,
"learning_rate": 5.0743922096346836e-06,
"loss": 0.2996,
"step": 450
},
{
"epoch": 1.647560975609756,
"grad_norm": 0.603771559433405,
"learning_rate": 5.053138252923019e-06,
"loss": 0.2952,
"step": 451
},
{
"epoch": 1.651219512195122,
"grad_norm": 0.55661515069207,
"learning_rate": 5.031883335888009e-06,
"loss": 0.3096,
"step": 452
},
{
"epoch": 1.6548780487804877,
"grad_norm": 0.6416454330032674,
"learning_rate": 5.010627842652049e-06,
"loss": 0.2974,
"step": 453
},
{
"epoch": 1.6585365853658538,
"grad_norm": 0.6063267101737585,
"learning_rate": 4.989372157347951e-06,
"loss": 0.2835,
"step": 454
},
{
"epoch": 1.6621951219512194,
"grad_norm": 0.58786831681036,
"learning_rate": 4.968116664111992e-06,
"loss": 0.3042,
"step": 455
},
{
"epoch": 1.6658536585365855,
"grad_norm": 0.6382722947842386,
"learning_rate": 4.946861747076983e-06,
"loss": 0.2992,
"step": 456
},
{
"epoch": 1.6695121951219511,
"grad_norm": 0.6277617762282729,
"learning_rate": 4.925607790365319e-06,
"loss": 0.3032,
"step": 457
},
{
"epoch": 1.6731707317073172,
"grad_norm": 0.6298104723656519,
"learning_rate": 4.904355178082038e-06,
"loss": 0.314,
"step": 458
},
{
"epoch": 1.6768292682926829,
"grad_norm": 0.6107292006492634,
"learning_rate": 4.883104294307884e-06,
"loss": 0.2942,
"step": 459
},
{
"epoch": 1.680487804878049,
"grad_norm": 0.6176967723038016,
"learning_rate": 4.861855523092366e-06,
"loss": 0.2997,
"step": 460
},
{
"epoch": 1.6841463414634146,
"grad_norm": 0.5913665790260473,
"learning_rate": 4.840609248446809e-06,
"loss": 0.3184,
"step": 461
},
{
"epoch": 1.6878048780487804,
"grad_norm": 0.5953694773185858,
"learning_rate": 4.819365854337423e-06,
"loss": 0.3165,
"step": 462
},
{
"epoch": 1.6914634146341463,
"grad_norm": 0.5707973515045525,
"learning_rate": 4.7981257246783595e-06,
"loss": 0.3104,
"step": 463
},
{
"epoch": 1.6951219512195121,
"grad_norm": 0.6156767325990874,
"learning_rate": 4.776889243324772e-06,
"loss": 0.3151,
"step": 464
},
{
"epoch": 1.698780487804878,
"grad_norm": 0.5877462803769106,
"learning_rate": 4.755656794065884e-06,
"loss": 0.2832,
"step": 465
},
{
"epoch": 1.7024390243902439,
"grad_norm": 0.6205361496175391,
"learning_rate": 4.73442876061805e-06,
"loss": 0.3045,
"step": 466
},
{
"epoch": 1.7060975609756097,
"grad_norm": 0.6228470039101209,
"learning_rate": 4.713205526617822e-06,
"loss": 0.322,
"step": 467
},
{
"epoch": 1.7097560975609756,
"grad_norm": 0.5803220572591428,
"learning_rate": 4.691987475615016e-06,
"loss": 0.2973,
"step": 468
},
{
"epoch": 1.7134146341463414,
"grad_norm": 0.5954242966594924,
"learning_rate": 4.670774991065774e-06,
"loss": 0.2946,
"step": 469
},
{
"epoch": 1.7170731707317073,
"grad_norm": 0.6008261653591337,
"learning_rate": 4.649568456325645e-06,
"loss": 0.2955,
"step": 470
},
{
"epoch": 1.7207317073170731,
"grad_norm": 0.5937214372456813,
"learning_rate": 4.6283682546426564e-06,
"loss": 0.2826,
"step": 471
},
{
"epoch": 1.724390243902439,
"grad_norm": 0.5234628318336602,
"learning_rate": 4.607174769150375e-06,
"loss": 0.2825,
"step": 472
},
{
"epoch": 1.7280487804878049,
"grad_norm": 0.6189630230422453,
"learning_rate": 4.5859883828609965e-06,
"loss": 0.3186,
"step": 473
},
{
"epoch": 1.7317073170731707,
"grad_norm": 0.5793938651866555,
"learning_rate": 4.564809478658419e-06,
"loss": 0.2977,
"step": 474
},
{
"epoch": 1.7353658536585366,
"grad_norm": 0.5736103569893777,
"learning_rate": 4.543638439291317e-06,
"loss": 0.286,
"step": 475
},
{
"epoch": 1.7390243902439024,
"grad_norm": 0.6106146033891122,
"learning_rate": 4.5224756473662365e-06,
"loss": 0.3086,
"step": 476
},
{
"epoch": 1.7426829268292683,
"grad_norm": 0.648329612248683,
"learning_rate": 4.501321485340676e-06,
"loss": 0.3107,
"step": 477
},
{
"epoch": 1.7463414634146341,
"grad_norm": 0.572713588951205,
"learning_rate": 4.480176335516167e-06,
"loss": 0.3003,
"step": 478
},
{
"epoch": 1.75,
"grad_norm": 0.6136351450374243,
"learning_rate": 4.459040580031374e-06,
"loss": 0.2967,
"step": 479
},
{
"epoch": 1.7536585365853659,
"grad_norm": 0.6499237376964561,
"learning_rate": 4.437914600855187e-06,
"loss": 0.3084,
"step": 480
},
{
"epoch": 1.7573170731707317,
"grad_norm": 0.647740834520244,
"learning_rate": 4.41679877977981e-06,
"loss": 0.3137,
"step": 481
},
{
"epoch": 1.7609756097560976,
"grad_norm": 0.6089886321475284,
"learning_rate": 4.3956934984138815e-06,
"loss": 0.307,
"step": 482
},
{
"epoch": 1.7646341463414634,
"grad_norm": 0.5882692243451826,
"learning_rate": 4.374599138175551e-06,
"loss": 0.2998,
"step": 483
},
{
"epoch": 1.7682926829268293,
"grad_norm": 0.6433244483480277,
"learning_rate": 4.353516080285603e-06,
"loss": 0.299,
"step": 484
},
{
"epoch": 1.7719512195121951,
"grad_norm": 0.6514720927170986,
"learning_rate": 4.332444705760571e-06,
"loss": 0.3129,
"step": 485
},
{
"epoch": 1.775609756097561,
"grad_norm": 0.5863962303909052,
"learning_rate": 4.3113853954058385e-06,
"loss": 0.3134,
"step": 486
},
{
"epoch": 1.7792682926829269,
"grad_norm": 0.5733292141072323,
"learning_rate": 4.2903385298087595e-06,
"loss": 0.2879,
"step": 487
},
{
"epoch": 1.7829268292682927,
"grad_norm": 0.5946111816840389,
"learning_rate": 4.269304489331797e-06,
"loss": 0.2974,
"step": 488
},
{
"epoch": 1.7865853658536586,
"grad_norm": 0.609851345774362,
"learning_rate": 4.248283654105624e-06,
"loss": 0.2921,
"step": 489
},
{
"epoch": 1.7902439024390244,
"grad_norm": 0.6321610405118764,
"learning_rate": 4.2272764040222724e-06,
"loss": 0.317,
"step": 490
},
{
"epoch": 1.7939024390243903,
"grad_norm": 0.6372499429582906,
"learning_rate": 4.206283118728258e-06,
"loss": 0.3029,
"step": 491
},
{
"epoch": 1.7975609756097561,
"grad_norm": 0.5820995527331286,
"learning_rate": 4.185304177617725e-06,
"loss": 0.2844,
"step": 492
},
{
"epoch": 1.801219512195122,
"grad_norm": 0.6175292122161644,
"learning_rate": 4.164339959825587e-06,
"loss": 0.2997,
"step": 493
},
{
"epoch": 1.8048780487804879,
"grad_norm": 0.5681456854505189,
"learning_rate": 4.1433908442206735e-06,
"loss": 0.2965,
"step": 494
},
{
"epoch": 1.8085365853658537,
"grad_norm": 0.5924758008964693,
"learning_rate": 4.122457209398886e-06,
"loss": 0.2829,
"step": 495
},
{
"epoch": 1.8121951219512196,
"grad_norm": 0.5996535608299406,
"learning_rate": 4.101539433676354e-06,
"loss": 0.3034,
"step": 496
},
{
"epoch": 1.8158536585365854,
"grad_norm": 0.5829324041998495,
"learning_rate": 4.0806378950825996e-06,
"loss": 0.3024,
"step": 497
},
{
"epoch": 1.819512195121951,
"grad_norm": 0.6081335311388287,
"learning_rate": 4.059752971353702e-06,
"loss": 0.3052,
"step": 498
},
{
"epoch": 1.8231707317073171,
"grad_norm": 0.5616344855594736,
"learning_rate": 4.038885039925481e-06,
"loss": 0.298,
"step": 499
},
{
"epoch": 1.8268292682926828,
"grad_norm": 0.5767878756565518,
"learning_rate": 4.018034477926661e-06,
"loss": 0.3089,
"step": 500
},
{
"epoch": 1.8304878048780489,
"grad_norm": 0.6019923918904264,
"learning_rate": 3.997201662172065e-06,
"loss": 0.285,
"step": 501
},
{
"epoch": 1.8341463414634145,
"grad_norm": 0.5689341408873392,
"learning_rate": 3.976386969155807e-06,
"loss": 0.301,
"step": 502
},
{
"epoch": 1.8378048780487806,
"grad_norm": 0.6170699137265363,
"learning_rate": 3.9555907750444785e-06,
"loss": 0.3092,
"step": 503
},
{
"epoch": 1.8414634146341462,
"grad_norm": 0.628745537543142,
"learning_rate": 3.934813455670359e-06,
"loss": 0.3016,
"step": 504
},
{
"epoch": 1.8451219512195123,
"grad_norm": 0.6152669681872653,
"learning_rate": 3.914055386524621e-06,
"loss": 0.2917,
"step": 505
},
{
"epoch": 1.848780487804878,
"grad_norm": 0.6118927040375713,
"learning_rate": 3.89331694275054e-06,
"loss": 0.309,
"step": 506
},
{
"epoch": 1.852439024390244,
"grad_norm": 0.5871831149186963,
"learning_rate": 3.872598499136723e-06,
"loss": 0.3063,
"step": 507
},
{
"epoch": 1.8560975609756096,
"grad_norm": 0.5923860193562817,
"learning_rate": 3.851900430110326e-06,
"loss": 0.3043,
"step": 508
},
{
"epoch": 1.8597560975609757,
"grad_norm": 0.5702777294822635,
"learning_rate": 3.831223109730296e-06,
"loss": 0.276,
"step": 509
},
{
"epoch": 1.8634146341463413,
"grad_norm": 0.5818433208728103,
"learning_rate": 3.810566911680607e-06,
"loss": 0.2936,
"step": 510
},
{
"epoch": 1.8670731707317074,
"grad_norm": 0.6678513890057626,
"learning_rate": 3.789932209263506e-06,
"loss": 0.297,
"step": 511
},
{
"epoch": 1.870731707317073,
"grad_norm": 0.6355121966400645,
"learning_rate": 3.769319375392764e-06,
"loss": 0.3006,
"step": 512
},
{
"epoch": 1.8743902439024391,
"grad_norm": 0.5771949043253944,
"learning_rate": 3.7487287825869445e-06,
"loss": 0.3007,
"step": 513
},
{
"epoch": 1.8780487804878048,
"grad_norm": 0.6045825081482409,
"learning_rate": 3.7281608029626636e-06,
"loss": 0.3039,
"step": 514
},
{
"epoch": 1.8817073170731708,
"grad_norm": 0.5878998409142763,
"learning_rate": 3.707615808227872e-06,
"loss": 0.2949,
"step": 515
},
{
"epoch": 1.8853658536585365,
"grad_norm": 0.6231546765696477,
"learning_rate": 3.6870941696751307e-06,
"loss": 0.295,
"step": 516
},
{
"epoch": 1.8890243902439026,
"grad_norm": 0.5832664311621261,
"learning_rate": 3.6665962581749046e-06,
"loss": 0.3059,
"step": 517
},
{
"epoch": 1.8926829268292682,
"grad_norm": 0.5722327245982995,
"learning_rate": 3.646122444168858e-06,
"loss": 0.3083,
"step": 518
},
{
"epoch": 1.8963414634146343,
"grad_norm": 0.5967837161066656,
"learning_rate": 3.6256730976631637e-06,
"loss": 0.3126,
"step": 519
},
{
"epoch": 1.9,
"grad_norm": 0.6048351316153445,
"learning_rate": 3.6052485882218124e-06,
"loss": 0.3004,
"step": 520
},
{
"epoch": 1.903658536585366,
"grad_norm": 0.6245134644651108,
"learning_rate": 3.5848492849599357e-06,
"loss": 0.2994,
"step": 521
},
{
"epoch": 1.9073170731707316,
"grad_norm": 0.5619671228019864,
"learning_rate": 3.564475556537136e-06,
"loss": 0.2899,
"step": 522
},
{
"epoch": 1.9109756097560977,
"grad_norm": 0.5927941945328951,
"learning_rate": 3.54412777115082e-06,
"loss": 0.3211,
"step": 523
},
{
"epoch": 1.9146341463414633,
"grad_norm": 0.6136632759095979,
"learning_rate": 3.5238062965295493e-06,
"loss": 0.2929,
"step": 524
},
{
"epoch": 1.9182926829268294,
"grad_norm": 0.6254834746812936,
"learning_rate": 3.5035114999263918e-06,
"loss": 0.2937,
"step": 525
},
{
"epoch": 1.921951219512195,
"grad_norm": 0.6045756197387149,
"learning_rate": 3.48324374811229e-06,
"loss": 0.3078,
"step": 526
},
{
"epoch": 1.9256097560975611,
"grad_norm": 0.6140608673385813,
"learning_rate": 3.463003407369424e-06,
"loss": 0.3162,
"step": 527
},
{
"epoch": 1.9292682926829268,
"grad_norm": 0.5738010260945632,
"learning_rate": 3.442790843484598e-06,
"loss": 0.2968,
"step": 528
},
{
"epoch": 1.9329268292682928,
"grad_norm": 0.6161416586651335,
"learning_rate": 3.4226064217426276e-06,
"loss": 0.2923,
"step": 529
},
{
"epoch": 1.9365853658536585,
"grad_norm": 0.5935224400360348,
"learning_rate": 3.4024505069197387e-06,
"loss": 0.2903,
"step": 530
},
{
"epoch": 1.9402439024390243,
"grad_norm": 0.60896816099721,
"learning_rate": 3.382323463276977e-06,
"loss": 0.307,
"step": 531
},
{
"epoch": 1.9439024390243902,
"grad_norm": 0.639666502462028,
"learning_rate": 3.362225654553623e-06,
"loss": 0.3048,
"step": 532
},
{
"epoch": 1.947560975609756,
"grad_norm": 0.569997113145275,
"learning_rate": 3.3421574439606186e-06,
"loss": 0.2926,
"step": 533
},
{
"epoch": 1.951219512195122,
"grad_norm": 0.6010854819630191,
"learning_rate": 3.322119194174003e-06,
"loss": 0.2962,
"step": 534
},
{
"epoch": 1.9548780487804878,
"grad_norm": 0.6088175492399895,
"learning_rate": 3.3021112673283574e-06,
"loss": 0.2832,
"step": 535
},
{
"epoch": 1.9585365853658536,
"grad_norm": 0.5883233331917551,
"learning_rate": 3.282134025010263e-06,
"loss": 0.3008,
"step": 536
},
{
"epoch": 1.9621951219512195,
"grad_norm": 0.5572804861289024,
"learning_rate": 3.2621878282517684e-06,
"loss": 0.2888,
"step": 537
},
{
"epoch": 1.9658536585365853,
"grad_norm": 0.5796243962499318,
"learning_rate": 3.2422730375238566e-06,
"loss": 0.2803,
"step": 538
},
{
"epoch": 1.9695121951219512,
"grad_norm": 0.5935266061720855,
"learning_rate": 3.222390012729938e-06,
"loss": 0.2994,
"step": 539
},
{
"epoch": 1.973170731707317,
"grad_norm": 0.5662598655298823,
"learning_rate": 3.2025391131993443e-06,
"loss": 0.2883,
"step": 540
},
{
"epoch": 1.976829268292683,
"grad_norm": 0.5753627844134035,
"learning_rate": 3.182720697680831e-06,
"loss": 0.2844,
"step": 541
},
{
"epoch": 1.9804878048780488,
"grad_norm": 0.637210434542176,
"learning_rate": 3.1629351243361007e-06,
"loss": 0.3185,
"step": 542
},
{
"epoch": 1.9841463414634146,
"grad_norm": 0.5627361281713845,
"learning_rate": 3.1431827507333257e-06,
"loss": 0.2941,
"step": 543
},
{
"epoch": 1.9878048780487805,
"grad_norm": 0.5959644102363714,
"learning_rate": 3.1234639338406867e-06,
"loss": 0.3001,
"step": 544
},
{
"epoch": 1.9914634146341463,
"grad_norm": 0.6077022077078119,
"learning_rate": 3.103779030019922e-06,
"loss": 0.308,
"step": 545
},
{
"epoch": 1.9951219512195122,
"grad_norm": 0.6082143409326339,
"learning_rate": 3.0841283950198875e-06,
"loss": 0.2887,
"step": 546
},
{
"epoch": 1.998780487804878,
"grad_norm": 0.5525286653885901,
"learning_rate": 3.064512383970124e-06,
"loss": 0.2702,
"step": 547
},
{
"epoch": 2.0,
"grad_norm": 0.5525286653885901,
"learning_rate": 3.044931351374451e-06,
"loss": 0.2863,
"step": 548
},
{
"epoch": 2.0036585365853656,
"grad_norm": 1.1491146976563753,
"learning_rate": 3.025385651104542e-06,
"loss": 0.2299,
"step": 549
},
{
"epoch": 2.0073170731707317,
"grad_norm": 0.5812839047916396,
"learning_rate": 3.0058756363935447e-06,
"loss": 0.2315,
"step": 550
},
{
"epoch": 2.0109756097560973,
"grad_norm": 0.599577291742412,
"learning_rate": 2.9864016598296896e-06,
"loss": 0.2454,
"step": 551
},
{
"epoch": 2.0146341463414634,
"grad_norm": 0.5633881485245802,
"learning_rate": 2.9669640733499184e-06,
"loss": 0.243,
"step": 552
},
{
"epoch": 2.018292682926829,
"grad_norm": 0.5392393159339521,
"learning_rate": 2.9475632282335265e-06,
"loss": 0.2488,
"step": 553
},
{
"epoch": 2.021951219512195,
"grad_norm": 0.6167736588501275,
"learning_rate": 2.928199475095816e-06,
"loss": 0.2482,
"step": 554
},
{
"epoch": 2.0256097560975608,
"grad_norm": 0.6763236210901113,
"learning_rate": 2.908873163881752e-06,
"loss": 0.2512,
"step": 555
},
{
"epoch": 2.029268292682927,
"grad_norm": 0.6732027253244217,
"learning_rate": 2.8895846438596462e-06,
"loss": 0.2346,
"step": 556
},
{
"epoch": 2.0329268292682925,
"grad_norm": 0.5689881452286486,
"learning_rate": 2.870334263614838e-06,
"loss": 0.2458,
"step": 557
},
{
"epoch": 2.0365853658536586,
"grad_norm": 0.6342251062567795,
"learning_rate": 2.8511223710434016e-06,
"loss": 0.2391,
"step": 558
},
{
"epoch": 2.040243902439024,
"grad_norm": 0.5367197192306216,
"learning_rate": 2.8319493133458575e-06,
"loss": 0.2364,
"step": 559
},
{
"epoch": 2.0439024390243903,
"grad_norm": 0.6229233192794811,
"learning_rate": 2.8128154370208895e-06,
"loss": 0.2372,
"step": 560
},
{
"epoch": 2.047560975609756,
"grad_norm": 0.6223140169848902,
"learning_rate": 2.7937210878590947e-06,
"loss": 0.2516,
"step": 561
},
{
"epoch": 2.051219512195122,
"grad_norm": 0.6351534138607783,
"learning_rate": 2.774666610936727e-06,
"loss": 0.24,
"step": 562
},
{
"epoch": 2.0548780487804876,
"grad_norm": 0.6000383406556633,
"learning_rate": 2.755652350609459e-06,
"loss": 0.242,
"step": 563
},
{
"epoch": 2.0585365853658537,
"grad_norm": 0.5808977074271768,
"learning_rate": 2.736678650506168e-06,
"loss": 0.2398,
"step": 564
},
{
"epoch": 2.0621951219512193,
"grad_norm": 0.6202232826347814,
"learning_rate": 2.71774585352272e-06,
"loss": 0.2596,
"step": 565
},
{
"epoch": 2.0658536585365854,
"grad_norm": 0.676317002155401,
"learning_rate": 2.6988543018157667e-06,
"loss": 0.2601,
"step": 566
},
{
"epoch": 2.069512195121951,
"grad_norm": 0.6261727462539696,
"learning_rate": 2.6800043367965754e-06,
"loss": 0.2357,
"step": 567
},
{
"epoch": 2.073170731707317,
"grad_norm": 0.5675481331948632,
"learning_rate": 2.6611962991248487e-06,
"loss": 0.2553,
"step": 568
},
{
"epoch": 2.0768292682926828,
"grad_norm": 0.5682792620688895,
"learning_rate": 2.642430528702568e-06,
"loss": 0.2423,
"step": 569
},
{
"epoch": 2.080487804878049,
"grad_norm": 0.6210900397441359,
"learning_rate": 2.6237073646678596e-06,
"loss": 0.2497,
"step": 570
},
{
"epoch": 2.0841463414634145,
"grad_norm": 0.6185039051605767,
"learning_rate": 2.60502714538886e-06,
"loss": 0.2527,
"step": 571
},
{
"epoch": 2.0878048780487806,
"grad_norm": 0.5830870526935391,
"learning_rate": 2.5863902084575943e-06,
"loss": 0.2419,
"step": 572
},
{
"epoch": 2.091463414634146,
"grad_norm": 0.5473178750068909,
"learning_rate": 2.5677968906838907e-06,
"loss": 0.232,
"step": 573
},
{
"epoch": 2.0951219512195123,
"grad_norm": 0.5587329875528281,
"learning_rate": 2.5492475280892757e-06,
"loss": 0.2495,
"step": 574
},
{
"epoch": 2.098780487804878,
"grad_norm": 0.5596060847586221,
"learning_rate": 2.5307424559009196e-06,
"loss": 0.2379,
"step": 575
},
{
"epoch": 2.102439024390244,
"grad_norm": 0.5447954930321499,
"learning_rate": 2.512282008545561e-06,
"loss": 0.242,
"step": 576
},
{
"epoch": 2.1060975609756096,
"grad_norm": 0.6243081793349892,
"learning_rate": 2.4938665196434775e-06,
"loss": 0.2559,
"step": 577
},
{
"epoch": 2.1097560975609757,
"grad_norm": 0.5983554972287728,
"learning_rate": 2.4754963220024452e-06,
"loss": 0.2383,
"step": 578
},
{
"epoch": 2.1134146341463413,
"grad_norm": 0.5601175525317587,
"learning_rate": 2.4571717476117302e-06,
"loss": 0.237,
"step": 579
},
{
"epoch": 2.1170731707317074,
"grad_norm": 0.5703488707814759,
"learning_rate": 2.4388931276360898e-06,
"loss": 0.2518,
"step": 580
},
{
"epoch": 2.120731707317073,
"grad_norm": 0.5762453114940407,
"learning_rate": 2.4206607924097857e-06,
"loss": 0.2442,
"step": 581
},
{
"epoch": 2.124390243902439,
"grad_norm": 0.5983912760451229,
"learning_rate": 2.4024750714306093e-06,
"loss": 0.2597,
"step": 582
},
{
"epoch": 2.1280487804878048,
"grad_norm": 0.5984265659585949,
"learning_rate": 2.384336293353938e-06,
"loss": 0.2311,
"step": 583
},
{
"epoch": 2.131707317073171,
"grad_norm": 0.604350894913572,
"learning_rate": 2.3662447859867837e-06,
"loss": 0.2535,
"step": 584
},
{
"epoch": 2.1353658536585365,
"grad_norm": 0.535421144423019,
"learning_rate": 2.3482008762818727e-06,
"loss": 0.2234,
"step": 585
},
{
"epoch": 2.1390243902439026,
"grad_norm": 0.5421114833925664,
"learning_rate": 2.3302048903317497e-06,
"loss": 0.2612,
"step": 586
},
{
"epoch": 2.142682926829268,
"grad_norm": 0.6046980678553843,
"learning_rate": 2.312257153362862e-06,
"loss": 0.2387,
"step": 587
},
{
"epoch": 2.1463414634146343,
"grad_norm": 0.6109825890260535,
"learning_rate": 2.2943579897296947e-06,
"loss": 0.2369,
"step": 588
},
{
"epoch": 2.15,
"grad_norm": 0.5778782524223457,
"learning_rate": 2.2765077229089146e-06,
"loss": 0.2306,
"step": 589
},
{
"epoch": 2.153658536585366,
"grad_norm": 0.5706488907922543,
"learning_rate": 2.2587066754935088e-06,
"loss": 0.2406,
"step": 590
},
{
"epoch": 2.1573170731707316,
"grad_norm": 0.5950872809237095,
"learning_rate": 2.240955169186965e-06,
"loss": 0.2506,
"step": 591
},
{
"epoch": 2.1609756097560977,
"grad_norm": 0.5700219691586104,
"learning_rate": 2.223253524797463e-06,
"loss": 0.2453,
"step": 592
},
{
"epoch": 2.1646341463414633,
"grad_norm": 0.5995511680667281,
"learning_rate": 2.2056020622320614e-06,
"loss": 0.2495,
"step": 593
},
{
"epoch": 2.1682926829268294,
"grad_norm": 0.6136735169642444,
"learning_rate": 2.1880011004909253e-06,
"loss": 0.2306,
"step": 594
},
{
"epoch": 2.171951219512195,
"grad_norm": 0.6256640342184288,
"learning_rate": 2.170450957661566e-06,
"loss": 0.2329,
"step": 595
},
{
"epoch": 2.175609756097561,
"grad_norm": 0.548577114684068,
"learning_rate": 2.15295195091308e-06,
"loss": 0.2275,
"step": 596
},
{
"epoch": 2.1792682926829268,
"grad_norm": 0.5687383781384396,
"learning_rate": 2.135504396490429e-06,
"loss": 0.2321,
"step": 597
},
{
"epoch": 2.182926829268293,
"grad_norm": 0.5579917295229633,
"learning_rate": 2.1181086097087204e-06,
"loss": 0.2318,
"step": 598
},
{
"epoch": 2.1865853658536585,
"grad_norm": 0.580363538298193,
"learning_rate": 2.1007649049475046e-06,
"loss": 0.2482,
"step": 599
},
{
"epoch": 2.1902439024390246,
"grad_norm": 0.5341887911643112,
"learning_rate": 2.083473595645096e-06,
"loss": 0.2454,
"step": 600
},
{
"epoch": 2.19390243902439,
"grad_norm": 0.6095901733738088,
"learning_rate": 2.066234994292916e-06,
"loss": 0.2525,
"step": 601
},
{
"epoch": 2.1975609756097563,
"grad_norm": 0.59396704070015,
"learning_rate": 2.0490494124298314e-06,
"loss": 0.2372,
"step": 602
},
{
"epoch": 2.201219512195122,
"grad_norm": 0.6081435264800101,
"learning_rate": 2.031917160636537e-06,
"loss": 0.2461,
"step": 603
},
{
"epoch": 2.204878048780488,
"grad_norm": 0.5609971726135861,
"learning_rate": 2.01483854852994e-06,
"loss": 0.2345,
"step": 604
},
{
"epoch": 2.2085365853658536,
"grad_norm": 0.5923610417844827,
"learning_rate": 1.997813884757555e-06,
"loss": 0.2486,
"step": 605
},
{
"epoch": 2.2121951219512197,
"grad_norm": 0.5783000329707068,
"learning_rate": 1.980843476991936e-06,
"loss": 0.2334,
"step": 606
},
{
"epoch": 2.2158536585365853,
"grad_norm": 0.5534361667872443,
"learning_rate": 1.9639276319251166e-06,
"loss": 0.2569,
"step": 607
},
{
"epoch": 2.2195121951219514,
"grad_norm": 0.5721660940023355,
"learning_rate": 1.947066655263064e-06,
"loss": 0.2516,
"step": 608
},
{
"epoch": 2.223170731707317,
"grad_norm": 0.5824791898002271,
"learning_rate": 1.93026085172015e-06,
"loss": 0.2321,
"step": 609
},
{
"epoch": 2.226829268292683,
"grad_norm": 0.6026540847795739,
"learning_rate": 1.91351052501365e-06,
"loss": 0.2487,
"step": 610
},
{
"epoch": 2.2304878048780488,
"grad_norm": 0.5698599518813655,
"learning_rate": 1.8968159778582572e-06,
"loss": 0.2662,
"step": 611
},
{
"epoch": 2.234146341463415,
"grad_norm": 0.6004371446298645,
"learning_rate": 1.8801775119606009e-06,
"loss": 0.2462,
"step": 612
},
{
"epoch": 2.2378048780487805,
"grad_norm": 0.5767977787961263,
"learning_rate": 1.8635954280138058e-06,
"loss": 0.25,
"step": 613
},
{
"epoch": 2.241463414634146,
"grad_norm": 0.5761299967173042,
"learning_rate": 1.8470700256920527e-06,
"loss": 0.231,
"step": 614
},
{
"epoch": 2.245121951219512,
"grad_norm": 0.6029460653486376,
"learning_rate": 1.8306016036451584e-06,
"loss": 0.2481,
"step": 615
},
{
"epoch": 2.2487804878048783,
"grad_norm": 0.603799230865757,
"learning_rate": 1.8141904594931836e-06,
"loss": 0.2491,
"step": 616
},
{
"epoch": 2.252439024390244,
"grad_norm": 0.5651935206892335,
"learning_rate": 1.7978368898210585e-06,
"loss": 0.2221,
"step": 617
},
{
"epoch": 2.2560975609756095,
"grad_norm": 0.6155871393031543,
"learning_rate": 1.7815411901732093e-06,
"loss": 0.2418,
"step": 618
},
{
"epoch": 2.2597560975609756,
"grad_norm": 0.5822183713085968,
"learning_rate": 1.765303655048234e-06,
"loss": 0.2418,
"step": 619
},
{
"epoch": 2.2634146341463417,
"grad_norm": 0.6162680679231551,
"learning_rate": 1.7491245778935673e-06,
"loss": 0.2328,
"step": 620
},
{
"epoch": 2.2670731707317073,
"grad_norm": 0.5734640882021516,
"learning_rate": 1.733004251100182e-06,
"loss": 0.2399,
"step": 621
},
{
"epoch": 2.270731707317073,
"grad_norm": 0.5802580047444629,
"learning_rate": 1.7169429659973024e-06,
"loss": 0.2335,
"step": 622
},
{
"epoch": 2.274390243902439,
"grad_norm": 0.5885677954780522,
"learning_rate": 1.7009410128471481e-06,
"loss": 0.2313,
"step": 623
},
{
"epoch": 2.278048780487805,
"grad_norm": 0.5805060869510791,
"learning_rate": 1.6849986808396746e-06,
"loss": 0.2489,
"step": 624
},
{
"epoch": 2.2817073170731708,
"grad_norm": 0.5692390222760896,
"learning_rate": 1.6691162580873576e-06,
"loss": 0.2544,
"step": 625
},
{
"epoch": 2.2853658536585364,
"grad_norm": 0.6250325114294697,
"learning_rate": 1.6532940316199853e-06,
"loss": 0.2529,
"step": 626
},
{
"epoch": 2.2890243902439025,
"grad_norm": 0.5634074649409233,
"learning_rate": 1.6375322873794635e-06,
"loss": 0.226,
"step": 627
},
{
"epoch": 2.292682926829268,
"grad_norm": 0.5118509902258667,
"learning_rate": 1.6218313102146544e-06,
"loss": 0.2369,
"step": 628
},
{
"epoch": 2.296341463414634,
"grad_norm": 0.5720341341375265,
"learning_rate": 1.60619138387623e-06,
"loss": 0.243,
"step": 629
},
{
"epoch": 2.3,
"grad_norm": 0.614966015901642,
"learning_rate": 1.5906127910115414e-06,
"loss": 0.2276,
"step": 630
},
{
"epoch": 2.303658536585366,
"grad_norm": 0.5378503989863892,
"learning_rate": 1.5750958131595072e-06,
"loss": 0.2126,
"step": 631
},
{
"epoch": 2.3073170731707315,
"grad_norm": 0.5645461452595779,
"learning_rate": 1.559640730745534e-06,
"loss": 0.2478,
"step": 632
},
{
"epoch": 2.3109756097560976,
"grad_norm": 0.5478752076017742,
"learning_rate": 1.5442478230764412e-06,
"loss": 0.2877,
"step": 633
},
{
"epoch": 2.3146341463414632,
"grad_norm": 0.5960823599634195,
"learning_rate": 1.528917368335413e-06,
"loss": 0.2497,
"step": 634
},
{
"epoch": 2.3182926829268293,
"grad_norm": 0.5756343891738284,
"learning_rate": 1.5136496435769804e-06,
"loss": 0.2432,
"step": 635
},
{
"epoch": 2.321951219512195,
"grad_norm": 0.558342457601641,
"learning_rate": 1.4984449247220046e-06,
"loss": 0.2285,
"step": 636
},
{
"epoch": 2.325609756097561,
"grad_norm": 0.5562662816415436,
"learning_rate": 1.4833034865526913e-06,
"loss": 0.2395,
"step": 637
},
{
"epoch": 2.3292682926829267,
"grad_norm": 0.5789722140208441,
"learning_rate": 1.4682256027076313e-06,
"loss": 0.2557,
"step": 638
},
{
"epoch": 2.3329268292682928,
"grad_norm": 0.6242635332827038,
"learning_rate": 1.4532115456768485e-06,
"loss": 0.2551,
"step": 639
},
{
"epoch": 2.3365853658536584,
"grad_norm": 0.6275804931184046,
"learning_rate": 1.4382615867968768e-06,
"loss": 0.2363,
"step": 640
},
{
"epoch": 2.3402439024390245,
"grad_norm": 0.5319917903538446,
"learning_rate": 1.4233759962458604e-06,
"loss": 0.2384,
"step": 641
},
{
"epoch": 2.34390243902439,
"grad_norm": 0.5755574311949859,
"learning_rate": 1.4085550430386696e-06,
"loss": 0.2377,
"step": 642
},
{
"epoch": 2.347560975609756,
"grad_norm": 0.5723847122260605,
"learning_rate": 1.3937989950220321e-06,
"loss": 0.239,
"step": 643
},
{
"epoch": 2.351219512195122,
"grad_norm": 0.6309713290014296,
"learning_rate": 1.3791081188697047e-06,
"loss": 0.2441,
"step": 644
},
{
"epoch": 2.354878048780488,
"grad_norm": 0.5355989960002594,
"learning_rate": 1.3644826800776434e-06,
"loss": 0.2412,
"step": 645
},
{
"epoch": 2.3585365853658535,
"grad_norm": 0.581694065503623,
"learning_rate": 1.3499229429592087e-06,
"loss": 0.243,
"step": 646
},
{
"epoch": 2.3621951219512196,
"grad_norm": 0.597316677443761,
"learning_rate": 1.3354291706403926e-06,
"loss": 0.2477,
"step": 647
},
{
"epoch": 2.3658536585365852,
"grad_norm": 0.555894151254139,
"learning_rate": 1.3210016250550605e-06,
"loss": 0.228,
"step": 648
},
{
"epoch": 2.3695121951219513,
"grad_norm": 0.5746285862835825,
"learning_rate": 1.3066405669402126e-06,
"loss": 0.2465,
"step": 649
},
{
"epoch": 2.373170731707317,
"grad_norm": 0.6008983785495297,
"learning_rate": 1.2923462558312827e-06,
"loss": 0.2502,
"step": 650
},
{
"epoch": 2.376829268292683,
"grad_norm": 0.5750611972493124,
"learning_rate": 1.2781189500574354e-06,
"loss": 0.2622,
"step": 651
},
{
"epoch": 2.3804878048780487,
"grad_norm": 0.5678813804956473,
"learning_rate": 1.26395890673691e-06,
"loss": 0.245,
"step": 652
},
{
"epoch": 2.3841463414634148,
"grad_norm": 0.597798261155427,
"learning_rate": 1.2498663817723604e-06,
"loss": 0.247,
"step": 653
},
{
"epoch": 2.3878048780487804,
"grad_norm": 0.5512370776781218,
"learning_rate": 1.2358416298462456e-06,
"loss": 0.235,
"step": 654
},
{
"epoch": 2.3914634146341465,
"grad_norm": 0.5707273910330682,
"learning_rate": 1.2218849044162112e-06,
"loss": 0.2416,
"step": 655
},
{
"epoch": 2.395121951219512,
"grad_norm": 0.5839165892388363,
"learning_rate": 1.2079964577105241e-06,
"loss": 0.2536,
"step": 656
},
{
"epoch": 2.398780487804878,
"grad_norm": 0.5839125276922039,
"learning_rate": 1.194176540723499e-06,
"loss": 0.2479,
"step": 657
},
{
"epoch": 2.402439024390244,
"grad_norm": 0.5345728457615189,
"learning_rate": 1.1804254032109774e-06,
"loss": 0.236,
"step": 658
},
{
"epoch": 2.40609756097561,
"grad_norm": 0.5439161399421847,
"learning_rate": 1.1667432936858002e-06,
"loss": 0.2389,
"step": 659
},
{
"epoch": 2.4097560975609755,
"grad_norm": 0.5698271145521743,
"learning_rate": 1.1531304594133297e-06,
"loss": 0.2324,
"step": 660
},
{
"epoch": 2.4134146341463416,
"grad_norm": 0.5160638872772833,
"learning_rate": 1.139587146406969e-06,
"loss": 0.2314,
"step": 661
},
{
"epoch": 2.4170731707317072,
"grad_norm": 0.5351749178167325,
"learning_rate": 1.1261135994237204e-06,
"loss": 0.2353,
"step": 662
},
{
"epoch": 2.4207317073170733,
"grad_norm": 0.5698909345124533,
"learning_rate": 1.1127100619597715e-06,
"loss": 0.2523,
"step": 663
},
{
"epoch": 2.424390243902439,
"grad_norm": 0.5686270061728808,
"learning_rate": 1.0993767762460777e-06,
"loss": 0.2381,
"step": 664
},
{
"epoch": 2.428048780487805,
"grad_norm": 0.6150451380411043,
"learning_rate": 1.0861139832439938e-06,
"loss": 0.2365,
"step": 665
},
{
"epoch": 2.4317073170731707,
"grad_norm": 0.5493712156080535,
"learning_rate": 1.0729219226409242e-06,
"loss": 0.2361,
"step": 666
},
{
"epoch": 2.4353658536585368,
"grad_norm": 0.571359468750548,
"learning_rate": 1.0598008328459797e-06,
"loss": 0.2281,
"step": 667
},
{
"epoch": 2.4390243902439024,
"grad_norm": 0.567709087384941,
"learning_rate": 1.0467509509856772e-06,
"loss": 0.2234,
"step": 668
},
{
"epoch": 2.4426829268292685,
"grad_norm": 0.5373694816069636,
"learning_rate": 1.0337725128996544e-06,
"loss": 0.2348,
"step": 669
},
{
"epoch": 2.446341463414634,
"grad_norm": 0.6077276375613963,
"learning_rate": 1.020865753136402e-06,
"loss": 0.2323,
"step": 670
},
{
"epoch": 2.45,
"grad_norm": 0.5085168375614834,
"learning_rate": 1.008030904949026e-06,
"loss": 0.2205,
"step": 671
},
{
"epoch": 2.453658536585366,
"grad_norm": 0.5897640252687923,
"learning_rate": 9.952682002910412e-07,
"loss": 0.2398,
"step": 672
},
{
"epoch": 2.457317073170732,
"grad_norm": 0.5547114702268138,
"learning_rate": 9.825778698121663e-07,
"loss": 0.2377,
"step": 673
},
{
"epoch": 2.4609756097560975,
"grad_norm": 0.5983889476544828,
"learning_rate": 9.69960142854165e-07,
"loss": 0.2521,
"step": 674
},
{
"epoch": 2.4646341463414636,
"grad_norm": 0.5493142204332745,
"learning_rate": 9.574152474466986e-07,
"loss": 0.2572,
"step": 675
},
{
"epoch": 2.4682926829268292,
"grad_norm": 0.5719327457080328,
"learning_rate": 9.449434103032018e-07,
"loss": 0.2476,
"step": 676
},
{
"epoch": 2.471951219512195,
"grad_norm": 0.5785120866063957,
"learning_rate": 9.325448568167888e-07,
"loss": 0.2397,
"step": 677
},
{
"epoch": 2.475609756097561,
"grad_norm": 0.5671550898722578,
"learning_rate": 9.202198110561817e-07,
"loss": 0.2486,
"step": 678
},
{
"epoch": 2.479268292682927,
"grad_norm": 0.5765095536540357,
"learning_rate": 9.07968495761658e-07,
"loss": 0.2427,
"step": 679
},
{
"epoch": 2.4829268292682927,
"grad_norm": 0.5484215020688389,
"learning_rate": 8.957911323410229e-07,
"loss": 0.2417,
"step": 680
},
{
"epoch": 2.4865853658536583,
"grad_norm": 0.6023263022216668,
"learning_rate": 8.836879408656157e-07,
"loss": 0.2429,
"step": 681
},
{
"epoch": 2.4902439024390244,
"grad_norm": 0.5797747760934718,
"learning_rate": 8.716591400663249e-07,
"loss": 0.2281,
"step": 682
},
{
"epoch": 2.4939024390243905,
"grad_norm": 0.572555201842755,
"learning_rate": 8.59704947329637e-07,
"loss": 0.2401,
"step": 683
},
{
"epoch": 2.497560975609756,
"grad_norm": 0.5415634954670027,
"learning_rate": 8.478255786937129e-07,
"loss": 0.229,
"step": 684
},
{
"epoch": 2.5012195121951217,
"grad_norm": 0.5113662473102487,
"learning_rate": 8.360212488444797e-07,
"loss": 0.2422,
"step": 685
},
{
"epoch": 2.504878048780488,
"grad_norm": 0.603131680716288,
"learning_rate": 8.242921711117469e-07,
"loss": 0.2449,
"step": 686
},
{
"epoch": 2.508536585365854,
"grad_norm": 0.5711686630435312,
"learning_rate": 8.126385574653606e-07,
"loss": 0.2442,
"step": 687
},
{
"epoch": 2.5121951219512195,
"grad_norm": 0.5710312547579648,
"learning_rate": 8.010606185113628e-07,
"loss": 0.2509,
"step": 688
},
{
"epoch": 2.515853658536585,
"grad_norm": 0.6000591180723375,
"learning_rate": 7.89558563488192e-07,
"loss": 0.2322,
"step": 689
},
{
"epoch": 2.5195121951219512,
"grad_norm": 0.6188613367700759,
"learning_rate": 7.781326002628991e-07,
"loss": 0.2375,
"step": 690
},
{
"epoch": 2.5231707317073173,
"grad_norm": 0.6233174102298019,
"learning_rate": 7.667829353273943e-07,
"loss": 0.2379,
"step": 691
},
{
"epoch": 2.526829268292683,
"grad_norm": 0.549218628145928,
"learning_rate": 7.555097737947076e-07,
"loss": 0.2409,
"step": 692
},
{
"epoch": 2.5304878048780486,
"grad_norm": 0.5650966641823335,
"learning_rate": 7.443133193952884e-07,
"loss": 0.2493,
"step": 693
},
{
"epoch": 2.5341463414634147,
"grad_norm": 0.5794507269900794,
"learning_rate": 7.331937744733248e-07,
"loss": 0.2436,
"step": 694
},
{
"epoch": 2.5378048780487807,
"grad_norm": 0.6045788546182774,
"learning_rate": 7.221513399830798e-07,
"loss": 0.2614,
"step": 695
},
{
"epoch": 2.5414634146341464,
"grad_norm": 0.5938491421154302,
"learning_rate": 7.111862154852672e-07,
"loss": 0.2303,
"step": 696
},
{
"epoch": 2.545121951219512,
"grad_norm": 0.5442083357758543,
"learning_rate": 7.002985991434418e-07,
"loss": 0.245,
"step": 697
},
{
"epoch": 2.548780487804878,
"grad_norm": 0.5712927405061993,
"learning_rate": 6.894886877204155e-07,
"loss": 0.2395,
"step": 698
},
{
"epoch": 2.552439024390244,
"grad_norm": 0.5546735475244872,
"learning_rate": 6.78756676574704e-07,
"loss": 0.2296,
"step": 699
},
{
"epoch": 2.55609756097561,
"grad_norm": 0.5273016852140356,
"learning_rate": 6.681027596569988e-07,
"loss": 0.2305,
"step": 700
},
{
"epoch": 2.5597560975609754,
"grad_norm": 0.5613527654490458,
"learning_rate": 6.575271295066593e-07,
"loss": 0.2318,
"step": 701
},
{
"epoch": 2.5634146341463415,
"grad_norm": 0.5617712758782253,
"learning_rate": 6.470299772482307e-07,
"loss": 0.2424,
"step": 702
},
{
"epoch": 2.567073170731707,
"grad_norm": 0.5657700489243026,
"learning_rate": 6.366114925879962e-07,
"loss": 0.2368,
"step": 703
},
{
"epoch": 2.5707317073170732,
"grad_norm": 0.5708454774862058,
"learning_rate": 6.262718638105425e-07,
"loss": 0.2343,
"step": 704
},
{
"epoch": 2.574390243902439,
"grad_norm": 0.5409052342393588,
"learning_rate": 6.160112777753585e-07,
"loss": 0.2451,
"step": 705
},
{
"epoch": 2.578048780487805,
"grad_norm": 0.5513303692706916,
"learning_rate": 6.058299199134637e-07,
"loss": 0.2414,
"step": 706
},
{
"epoch": 2.5817073170731706,
"grad_norm": 0.5529279063327968,
"learning_rate": 5.957279742240507e-07,
"loss": 0.2272,
"step": 707
},
{
"epoch": 2.5853658536585367,
"grad_norm": 0.5405182573819264,
"learning_rate": 5.857056232711611e-07,
"loss": 0.2251,
"step": 708
},
{
"epoch": 2.5890243902439023,
"grad_norm": 0.5289491829969317,
"learning_rate": 5.757630481803889e-07,
"loss": 0.2375,
"step": 709
},
{
"epoch": 2.5926829268292684,
"grad_norm": 0.5452238409528825,
"learning_rate": 5.659004286356045e-07,
"loss": 0.2319,
"step": 710
},
{
"epoch": 2.596341463414634,
"grad_norm": 0.6172680232767701,
"learning_rate": 5.561179428757063e-07,
"loss": 0.241,
"step": 711
},
{
"epoch": 2.6,
"grad_norm": 0.5346562294926773,
"learning_rate": 5.464157676914078e-07,
"loss": 0.2358,
"step": 712
},
{
"epoch": 2.6036585365853657,
"grad_norm": 0.5682283402698644,
"learning_rate": 5.367940784220305e-07,
"loss": 0.246,
"step": 713
},
{
"epoch": 2.607317073170732,
"grad_norm": 0.5468426751424924,
"learning_rate": 5.272530489523425e-07,
"loss": 0.2589,
"step": 714
},
{
"epoch": 2.6109756097560974,
"grad_norm": 0.5748997644311935,
"learning_rate": 5.177928517094166e-07,
"loss": 0.2341,
"step": 715
},
{
"epoch": 2.6146341463414635,
"grad_norm": 0.5751311413606573,
"learning_rate": 5.0841365765951e-07,
"loss": 0.2486,
"step": 716
},
{
"epoch": 2.618292682926829,
"grad_norm": 0.5490008995229373,
"learning_rate": 4.991156363049765e-07,
"loss": 0.2442,
"step": 717
},
{
"epoch": 2.6219512195121952,
"grad_norm": 0.5750789304658456,
"learning_rate": 4.89898955681205e-07,
"loss": 0.2538,
"step": 718
},
{
"epoch": 2.625609756097561,
"grad_norm": 0.5989073598299415,
"learning_rate": 4.807637823535821e-07,
"loss": 0.2465,
"step": 719
},
{
"epoch": 2.629268292682927,
"grad_norm": 0.595299686305544,
"learning_rate": 4.7171028141447693e-07,
"loss": 0.2422,
"step": 720
},
{
"epoch": 2.6329268292682926,
"grad_norm": 0.5554287718554282,
"learning_rate": 4.627386164802661e-07,
"loss": 0.2527,
"step": 721
},
{
"epoch": 2.6365853658536587,
"grad_norm": 0.5661580070133602,
"learning_rate": 4.538489496883686e-07,
"loss": 0.2446,
"step": 722
},
{
"epoch": 2.6402439024390243,
"grad_norm": 0.5256299309600876,
"learning_rate": 4.450414416943233e-07,
"loss": 0.2343,
"step": 723
},
{
"epoch": 2.6439024390243904,
"grad_norm": 0.5594023249360274,
"learning_rate": 4.363162516688774e-07,
"loss": 0.2277,
"step": 724
},
{
"epoch": 2.647560975609756,
"grad_norm": 0.547174675788788,
"learning_rate": 4.2767353729511796e-07,
"loss": 0.2401,
"step": 725
},
{
"epoch": 2.651219512195122,
"grad_norm": 0.5698482629553446,
"learning_rate": 4.191134547656145e-07,
"loss": 0.2367,
"step": 726
},
{
"epoch": 2.6548780487804877,
"grad_norm": 0.5245951306910515,
"learning_rate": 4.1063615877960427e-07,
"loss": 0.2594,
"step": 727
},
{
"epoch": 2.658536585365854,
"grad_norm": 0.5429005359039983,
"learning_rate": 4.0224180254018807e-07,
"loss": 0.2364,
"step": 728
},
{
"epoch": 2.6621951219512194,
"grad_norm": 0.5582877597327396,
"learning_rate": 3.9393053775156955e-07,
"loss": 0.2412,
"step": 729
},
{
"epoch": 2.6658536585365855,
"grad_norm": 0.5966956037127704,
"learning_rate": 3.8570251461630735e-07,
"loss": 0.2292,
"step": 730
},
{
"epoch": 2.669512195121951,
"grad_norm": 0.5522903020151757,
"learning_rate": 3.775578818326048e-07,
"loss": 0.2335,
"step": 731
},
{
"epoch": 2.6731707317073172,
"grad_norm": 0.5229730045275881,
"learning_rate": 3.6949678659161827e-07,
"loss": 0.2399,
"step": 732
},
{
"epoch": 2.676829268292683,
"grad_norm": 0.5639593382334762,
"learning_rate": 3.615193745748036e-07,
"loss": 0.2283,
"step": 733
},
{
"epoch": 2.680487804878049,
"grad_norm": 0.5951326633000662,
"learning_rate": 3.536257899512768e-07,
"loss": 0.2441,
"step": 734
},
{
"epoch": 2.6841463414634146,
"grad_norm": 0.5688456984898208,
"learning_rate": 3.458161753752126e-07,
"loss": 0.2319,
"step": 735
},
{
"epoch": 2.68780487804878,
"grad_norm": 0.5290898917591977,
"learning_rate": 3.380906719832627e-07,
"loss": 0.2555,
"step": 736
},
{
"epoch": 2.6914634146341463,
"grad_norm": 0.5418544486569097,
"learning_rate": 3.3044941939201104e-07,
"loss": 0.2251,
"step": 737
},
{
"epoch": 2.6951219512195124,
"grad_norm": 0.5400829302955231,
"learning_rate": 3.228925556954443e-07,
"loss": 0.2469,
"step": 738
},
{
"epoch": 2.698780487804878,
"grad_norm": 0.5613160759441016,
"learning_rate": 3.1542021746245934e-07,
"loss": 0.2445,
"step": 739
},
{
"epoch": 2.7024390243902436,
"grad_norm": 0.5824767170530214,
"learning_rate": 3.080325397343969e-07,
"loss": 0.2408,
"step": 740
},
{
"epoch": 2.7060975609756097,
"grad_norm": 0.5395944310522093,
"learning_rate": 3.007296560225975e-07,
"loss": 0.2526,
"step": 741
},
{
"epoch": 2.709756097560976,
"grad_norm": 0.5242409861099263,
"learning_rate": 2.935116983059888e-07,
"loss": 0.2302,
"step": 742
},
{
"epoch": 2.7134146341463414,
"grad_norm": 0.5913937186515237,
"learning_rate": 2.8637879702870407e-07,
"loss": 0.2344,
"step": 743
},
{
"epoch": 2.717073170731707,
"grad_norm": 0.5906361021677703,
"learning_rate": 2.7933108109772066e-07,
"loss": 0.2351,
"step": 744
},
{
"epoch": 2.720731707317073,
"grad_norm": 0.5949066439617051,
"learning_rate": 2.7236867788053343e-07,
"loss": 0.2373,
"step": 745
},
{
"epoch": 2.7243902439024392,
"grad_norm": 0.5471386508447823,
"learning_rate": 2.6549171320285226e-07,
"loss": 0.2352,
"step": 746
},
{
"epoch": 2.728048780487805,
"grad_norm": 0.5552310997986438,
"learning_rate": 2.5870031134632543e-07,
"loss": 0.2278,
"step": 747
},
{
"epoch": 2.7317073170731705,
"grad_norm": 0.5591003186791976,
"learning_rate": 2.519945950462965e-07,
"loss": 0.236,
"step": 748
},
{
"epoch": 2.7353658536585366,
"grad_norm": 0.6216695411274681,
"learning_rate": 2.453746854895861e-07,
"loss": 0.2592,
"step": 749
},
{
"epoch": 2.7390243902439027,
"grad_norm": 0.5427018458395848,
"learning_rate": 2.388407023123007e-07,
"loss": 0.235,
"step": 750
},
{
"epoch": 2.7426829268292683,
"grad_norm": 0.5646945272289545,
"learning_rate": 2.3239276359767025e-07,
"loss": 0.2542,
"step": 751
},
{
"epoch": 2.746341463414634,
"grad_norm": 0.5571032201299158,
"learning_rate": 2.2603098587391737e-07,
"loss": 0.2386,
"step": 752
},
{
"epoch": 2.75,
"grad_norm": 0.544300301663097,
"learning_rate": 2.1975548411214577e-07,
"loss": 0.2376,
"step": 753
},
{
"epoch": 2.753658536585366,
"grad_norm": 0.5797633979529553,
"learning_rate": 2.1356637172426697e-07,
"loss": 0.245,
"step": 754
},
{
"epoch": 2.7573170731707317,
"grad_norm": 0.5332930378597391,
"learning_rate": 2.074637605609492e-07,
"loss": 0.2382,
"step": 755
},
{
"epoch": 2.7609756097560973,
"grad_norm": 0.5798577844773429,
"learning_rate": 2.0144776090959718e-07,
"loss": 0.2428,
"step": 756
},
{
"epoch": 2.7646341463414634,
"grad_norm": 0.6090360913976046,
"learning_rate": 1.9551848149235607e-07,
"loss": 0.245,
"step": 757
},
{
"epoch": 2.7682926829268295,
"grad_norm": 0.5839943430547335,
"learning_rate": 1.8967602946415088e-07,
"loss": 0.2563,
"step": 758
},
{
"epoch": 2.771951219512195,
"grad_norm": 0.5258252983191701,
"learning_rate": 1.8392051041074498e-07,
"loss": 0.2343,
"step": 759
},
{
"epoch": 2.7756097560975608,
"grad_norm": 0.5506279521494741,
"learning_rate": 1.782520283468364e-07,
"loss": 0.2446,
"step": 760
},
{
"epoch": 2.779268292682927,
"grad_norm": 0.5634761604767314,
"learning_rate": 1.7267068571417633e-07,
"loss": 0.2398,
"step": 761
},
{
"epoch": 2.782926829268293,
"grad_norm": 0.5980586546172675,
"learning_rate": 1.671765833797162e-07,
"loss": 0.2429,
"step": 762
},
{
"epoch": 2.7865853658536586,
"grad_norm": 0.5500214071241183,
"learning_rate": 1.6176982063378754e-07,
"loss": 0.2495,
"step": 763
},
{
"epoch": 2.790243902439024,
"grad_norm": 0.575157278926917,
"learning_rate": 1.5645049518830614e-07,
"loss": 0.2462,
"step": 764
},
{
"epoch": 2.7939024390243903,
"grad_norm": 0.5445436607053848,
"learning_rate": 1.512187031750062e-07,
"loss": 0.2324,
"step": 765
},
{
"epoch": 2.7975609756097564,
"grad_norm": 0.5329228786605588,
"learning_rate": 1.4607453914370185e-07,
"loss": 0.2338,
"step": 766
},
{
"epoch": 2.801219512195122,
"grad_norm": 0.5536006141829138,
"learning_rate": 1.410180960605817e-07,
"loss": 0.2276,
"step": 767
},
{
"epoch": 2.8048780487804876,
"grad_norm": 0.5501904220602625,
"learning_rate": 1.3604946530652695e-07,
"loss": 0.2493,
"step": 768
},
{
"epoch": 2.8085365853658537,
"grad_norm": 0.5609170603936632,
"learning_rate": 1.3116873667545827e-07,
"loss": 0.2351,
"step": 769
},
{
"epoch": 2.81219512195122,
"grad_norm": 0.5594953404382451,
"learning_rate": 1.263759983727142e-07,
"loss": 0.2418,
"step": 770
},
{
"epoch": 2.8158536585365854,
"grad_norm": 0.5575873120930057,
"learning_rate": 1.2167133701345979e-07,
"loss": 0.2291,
"step": 771
},
{
"epoch": 2.819512195121951,
"grad_norm": 0.5430632979708804,
"learning_rate": 1.1705483762111725e-07,
"loss": 0.2401,
"step": 772
},
{
"epoch": 2.823170731707317,
"grad_norm": 0.5474457581234221,
"learning_rate": 1.1252658362583102e-07,
"loss": 0.2497,
"step": 773
},
{
"epoch": 2.8268292682926828,
"grad_norm": 0.5688470281131273,
"learning_rate": 1.0808665686296072e-07,
"loss": 0.231,
"step": 774
},
{
"epoch": 2.830487804878049,
"grad_norm": 0.5254433758139256,
"learning_rate": 1.0373513757160114e-07,
"loss": 0.2255,
"step": 775
},
{
"epoch": 2.8341463414634145,
"grad_norm": 0.5578309858607658,
"learning_rate": 9.947210439313237e-08,
"loss": 0.2524,
"step": 776
},
{
"epoch": 2.8378048780487806,
"grad_norm": 0.5999128731542565,
"learning_rate": 9.529763436979923e-08,
"loss": 0.2532,
"step": 777
},
{
"epoch": 2.841463414634146,
"grad_norm": 0.5547526000184041,
"learning_rate": 9.121180294331844e-08,
"loss": 0.246,
"step": 778
},
{
"epoch": 2.8451219512195123,
"grad_norm": 0.6243311864358262,
"learning_rate": 8.721468395351428e-08,
"loss": 0.2481,
"step": 779
},
{
"epoch": 2.848780487804878,
"grad_norm": 0.5407753899453527,
"learning_rate": 8.33063496369868e-08,
"loss": 0.2482,
"step": 780
},
{
"epoch": 2.852439024390244,
"grad_norm": 0.5475952015446557,
"learning_rate": 7.948687062580341e-08,
"loss": 0.2431,
"step": 781
},
{
"epoch": 2.8560975609756096,
"grad_norm": 0.5804952728263567,
"learning_rate": 7.575631594622323e-08,
"loss": 0.2261,
"step": 782
},
{
"epoch": 2.8597560975609757,
"grad_norm": 0.5610293885829224,
"learning_rate": 7.211475301745264e-08,
"loss": 0.2589,
"step": 783
},
{
"epoch": 2.8634146341463413,
"grad_norm": 0.571595545059285,
"learning_rate": 6.856224765042163e-08,
"loss": 0.2307,
"step": 784
},
{
"epoch": 2.8670731707317074,
"grad_norm": 0.5078487348143493,
"learning_rate": 6.509886404659715e-08,
"loss": 0.2455,
"step": 785
},
{
"epoch": 2.870731707317073,
"grad_norm": 0.5626791514208339,
"learning_rate": 6.172466479682449e-08,
"loss": 0.2458,
"step": 786
},
{
"epoch": 2.874390243902439,
"grad_norm": 0.5917030136132226,
"learning_rate": 5.8439710880194287e-08,
"loss": 0.2314,
"step": 787
},
{
"epoch": 2.8780487804878048,
"grad_norm": 0.5405338171059761,
"learning_rate": 5.5244061662937944e-08,
"loss": 0.2348,
"step": 788
},
{
"epoch": 2.881707317073171,
"grad_norm": 0.5748908534192652,
"learning_rate": 5.213777489736227e-08,
"loss": 0.2552,
"step": 789
},
{
"epoch": 2.8853658536585365,
"grad_norm": 0.5942895745694118,
"learning_rate": 4.91209067207965e-08,
"loss": 0.2433,
"step": 790
},
{
"epoch": 2.8890243902439026,
"grad_norm": 0.5166417037795502,
"learning_rate": 4.6193511654584186e-08,
"loss": 0.2446,
"step": 791
},
{
"epoch": 2.892682926829268,
"grad_norm": 0.5507990948723721,
"learning_rate": 4.335564260309677e-08,
"loss": 0.2297,
"step": 792
},
{
"epoch": 2.8963414634146343,
"grad_norm": 0.5781950059218034,
"learning_rate": 4.06073508527749e-08,
"loss": 0.2421,
"step": 793
},
{
"epoch": 2.9,
"grad_norm": 0.5749833983841531,
"learning_rate": 3.794868607120417e-08,
"loss": 0.2341,
"step": 794
},
{
"epoch": 2.903658536585366,
"grad_norm": 0.5754752915871355,
"learning_rate": 3.537969630621752e-08,
"loss": 0.2641,
"step": 795
},
{
"epoch": 2.9073170731707316,
"grad_norm": 0.5791697991229936,
"learning_rate": 3.290042798502424e-08,
"loss": 0.2384,
"step": 796
},
{
"epoch": 2.9109756097560977,
"grad_norm": 0.5835307370273534,
"learning_rate": 3.051092591337401e-08,
"loss": 0.229,
"step": 797
},
{
"epoch": 2.9146341463414633,
"grad_norm": 0.5803370436780367,
"learning_rate": 2.8211233274745842e-08,
"loss": 0.2343,
"step": 798
},
{
"epoch": 2.9182926829268294,
"grad_norm": 0.5343132160568691,
"learning_rate": 2.600139162956761e-08,
"loss": 0.2608,
"step": 799
},
{
"epoch": 2.921951219512195,
"grad_norm": 0.5396281266368242,
"learning_rate": 2.388144091446498e-08,
"loss": 0.2433,
"step": 800
},
{
"epoch": 2.925609756097561,
"grad_norm": 0.5372475354081403,
"learning_rate": 2.185141944153979e-08,
"loss": 0.2335,
"step": 801
},
{
"epoch": 2.9292682926829268,
"grad_norm": 0.5591750381484034,
"learning_rate": 1.9911363897677228e-08,
"loss": 0.23,
"step": 802
},
{
"epoch": 2.932926829268293,
"grad_norm": 0.5408723741122057,
"learning_rate": 1.8061309343884724e-08,
"loss": 0.2289,
"step": 803
},
{
"epoch": 2.9365853658536585,
"grad_norm": 0.565184007294025,
"learning_rate": 1.6301289214655236e-08,
"loss": 0.2306,
"step": 804
},
{
"epoch": 2.9402439024390246,
"grad_norm": 0.5403935578429421,
"learning_rate": 1.4631335317365492e-08,
"loss": 0.2377,
"step": 805
},
{
"epoch": 2.94390243902439,
"grad_norm": 0.6324858665100218,
"learning_rate": 1.3051477831699798e-08,
"loss": 0.2416,
"step": 806
},
{
"epoch": 2.947560975609756,
"grad_norm": 0.5737715575991524,
"learning_rate": 1.1561745309105476e-08,
"loss": 0.242,
"step": 807
},
{
"epoch": 2.951219512195122,
"grad_norm": 0.5115828715456013,
"learning_rate": 1.0162164672276598e-08,
"loss": 0.2389,
"step": 808
},
{
"epoch": 2.954878048780488,
"grad_norm": 0.5537564527003395,
"learning_rate": 8.852761214666605e-09,
"loss": 0.2476,
"step": 809
},
{
"epoch": 2.9585365853658536,
"grad_norm": 0.5424642804569867,
"learning_rate": 7.633558600033675e-09,
"loss": 0.2597,
"step": 810
},
{
"epoch": 2.9621951219512193,
"grad_norm": 0.5705724169258695,
"learning_rate": 6.504578862009392e-09,
"loss": 0.2504,
"step": 811
},
{
"epoch": 2.9658536585365853,
"grad_norm": 0.5844841317025147,
"learning_rate": 5.4658424037029585e-09,
"loss": 0.2394,
"step": 812
},
{
"epoch": 2.9695121951219514,
"grad_norm": 0.5612989699157095,
"learning_rate": 4.5173679973337105e-09,
"loss": 0.2462,
"step": 813
},
{
"epoch": 2.973170731707317,
"grad_norm": 0.5745256781886607,
"learning_rate": 3.659172783887499e-09,
"loss": 0.2412,
"step": 814
},
{
"epoch": 2.9768292682926827,
"grad_norm": 0.5532600490739356,
"learning_rate": 2.89127227281194e-09,
"loss": 0.2325,
"step": 815
},
{
"epoch": 2.9804878048780488,
"grad_norm": 0.5558431587776496,
"learning_rate": 2.213680341732194e-09,
"loss": 0.241,
"step": 816
},
{
"epoch": 2.984146341463415,
"grad_norm": 0.5556399642651051,
"learning_rate": 1.6264092362028306e-09,
"loss": 0.2213,
"step": 817
},
{
"epoch": 2.9878048780487805,
"grad_norm": 0.5730682042333336,
"learning_rate": 1.1294695694841207e-09,
"loss": 0.2343,
"step": 818
},
{
"epoch": 2.991463414634146,
"grad_norm": 0.581621235538938,
"learning_rate": 7.228703223532974e-10,
"loss": 0.2399,
"step": 819
},
{
"epoch": 2.995121951219512,
"grad_norm": 0.5575710843072575,
"learning_rate": 4.0661884293913266e-10,
"loss": 0.2405,
"step": 820
},
{
"epoch": 2.9987804878048783,
"grad_norm": 0.5537487533940493,
"learning_rate": 1.8072084659093158e-10,
"loss": 0.2436,
"step": 821
},
{
"epoch": 3.0,
"grad_norm": 1.067961677556421,
"learning_rate": 4.518041577472598e-11,
"loss": 0.2422,
"step": 822
},
{
"epoch": 3.0,
"step": 822,
"total_flos": 370118663536640.0,
"train_loss": 0.32970622370422903,
"train_runtime": 8808.919,
"train_samples_per_second": 8.934,
"train_steps_per_second": 0.093
}
],
"logging_steps": 1,
"max_steps": 822,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": -822,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 370118663536640.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}