Wanff
Add fine-tuned model
259a13c
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017857142857142857,
"grad_norm": 0.6997888088226318,
"learning_rate": 9.982142857142858e-06,
"loss": 1.9503,
"step": 1
},
{
"epoch": 0.0035714285714285713,
"grad_norm": 0.7093144059181213,
"learning_rate": 9.964285714285714e-06,
"loss": 2.0277,
"step": 2
},
{
"epoch": 0.005357142857142857,
"grad_norm": 0.6343945264816284,
"learning_rate": 9.946428571428572e-06,
"loss": 1.9098,
"step": 3
},
{
"epoch": 0.007142857142857143,
"grad_norm": 0.6176633834838867,
"learning_rate": 9.92857142857143e-06,
"loss": 1.8188,
"step": 4
},
{
"epoch": 0.008928571428571428,
"grad_norm": 0.6151384115219116,
"learning_rate": 9.910714285714288e-06,
"loss": 1.9782,
"step": 5
},
{
"epoch": 0.010714285714285714,
"grad_norm": 0.6134297251701355,
"learning_rate": 9.892857142857143e-06,
"loss": 1.83,
"step": 6
},
{
"epoch": 0.0125,
"grad_norm": 0.6008761525154114,
"learning_rate": 9.875000000000001e-06,
"loss": 1.8764,
"step": 7
},
{
"epoch": 0.014285714285714285,
"grad_norm": 0.5730451941490173,
"learning_rate": 9.857142857142859e-06,
"loss": 1.8342,
"step": 8
},
{
"epoch": 0.01607142857142857,
"grad_norm": 0.5792098641395569,
"learning_rate": 9.839285714285715e-06,
"loss": 1.8666,
"step": 9
},
{
"epoch": 0.017857142857142856,
"grad_norm": 0.5382242202758789,
"learning_rate": 9.821428571428573e-06,
"loss": 1.9054,
"step": 10
},
{
"epoch": 0.019642857142857142,
"grad_norm": 0.46677500009536743,
"learning_rate": 9.803571428571428e-06,
"loss": 1.768,
"step": 11
},
{
"epoch": 0.02142857142857143,
"grad_norm": 0.47485437989234924,
"learning_rate": 9.785714285714286e-06,
"loss": 1.8044,
"step": 12
},
{
"epoch": 0.023214285714285715,
"grad_norm": 0.48358583450317383,
"learning_rate": 9.767857142857144e-06,
"loss": 1.7768,
"step": 13
},
{
"epoch": 0.025,
"grad_norm": 0.46866756677627563,
"learning_rate": 9.75e-06,
"loss": 1.8323,
"step": 14
},
{
"epoch": 0.026785714285714284,
"grad_norm": 0.4622134864330292,
"learning_rate": 9.732142857142858e-06,
"loss": 1.8337,
"step": 15
},
{
"epoch": 0.02857142857142857,
"grad_norm": 0.43944844603538513,
"learning_rate": 9.714285714285715e-06,
"loss": 1.7608,
"step": 16
},
{
"epoch": 0.030357142857142857,
"grad_norm": 0.4220949113368988,
"learning_rate": 9.696428571428573e-06,
"loss": 1.6767,
"step": 17
},
{
"epoch": 0.03214285714285714,
"grad_norm": 0.41527998447418213,
"learning_rate": 9.678571428571429e-06,
"loss": 1.7412,
"step": 18
},
{
"epoch": 0.033928571428571426,
"grad_norm": 0.43707507848739624,
"learning_rate": 9.660714285714287e-06,
"loss": 1.7807,
"step": 19
},
{
"epoch": 0.03571428571428571,
"grad_norm": 0.41880887746810913,
"learning_rate": 9.642857142857144e-06,
"loss": 1.7464,
"step": 20
},
{
"epoch": 0.0375,
"grad_norm": 0.4193197190761566,
"learning_rate": 9.625e-06,
"loss": 1.7347,
"step": 21
},
{
"epoch": 0.039285714285714285,
"grad_norm": 0.383999228477478,
"learning_rate": 9.607142857142858e-06,
"loss": 1.6653,
"step": 22
},
{
"epoch": 0.04107142857142857,
"grad_norm": 0.3941427171230316,
"learning_rate": 9.589285714285716e-06,
"loss": 1.7175,
"step": 23
},
{
"epoch": 0.04285714285714286,
"grad_norm": 0.386326402425766,
"learning_rate": 9.571428571428573e-06,
"loss": 1.6642,
"step": 24
},
{
"epoch": 0.044642857142857144,
"grad_norm": 0.3823203146457672,
"learning_rate": 9.55357142857143e-06,
"loss": 1.7253,
"step": 25
},
{
"epoch": 0.04642857142857143,
"grad_norm": 0.36001327633857727,
"learning_rate": 9.535714285714287e-06,
"loss": 1.7415,
"step": 26
},
{
"epoch": 0.048214285714285716,
"grad_norm": 0.36957600712776184,
"learning_rate": 9.517857142857143e-06,
"loss": 1.7296,
"step": 27
},
{
"epoch": 0.05,
"grad_norm": 0.37834134697914124,
"learning_rate": 9.5e-06,
"loss": 1.6701,
"step": 28
},
{
"epoch": 0.05178571428571429,
"grad_norm": 0.343870609998703,
"learning_rate": 9.482142857142858e-06,
"loss": 1.7299,
"step": 29
},
{
"epoch": 0.05357142857142857,
"grad_norm": 0.3333272933959961,
"learning_rate": 9.464285714285714e-06,
"loss": 1.6733,
"step": 30
},
{
"epoch": 0.055357142857142855,
"grad_norm": 0.3580067455768585,
"learning_rate": 9.446428571428572e-06,
"loss": 1.7065,
"step": 31
},
{
"epoch": 0.05714285714285714,
"grad_norm": 0.3354710638523102,
"learning_rate": 9.42857142857143e-06,
"loss": 1.6561,
"step": 32
},
{
"epoch": 0.05892857142857143,
"grad_norm": 0.4108162820339203,
"learning_rate": 9.410714285714286e-06,
"loss": 1.5889,
"step": 33
},
{
"epoch": 0.060714285714285714,
"grad_norm": 0.34176838397979736,
"learning_rate": 9.392857142857143e-06,
"loss": 1.6161,
"step": 34
},
{
"epoch": 0.0625,
"grad_norm": 0.3366299271583557,
"learning_rate": 9.375000000000001e-06,
"loss": 1.599,
"step": 35
},
{
"epoch": 0.06428571428571428,
"grad_norm": 0.3173324763774872,
"learning_rate": 9.357142857142859e-06,
"loss": 1.6042,
"step": 36
},
{
"epoch": 0.06607142857142857,
"grad_norm": 0.3377523124217987,
"learning_rate": 9.339285714285715e-06,
"loss": 1.6056,
"step": 37
},
{
"epoch": 0.06785714285714285,
"grad_norm": 0.31548359990119934,
"learning_rate": 9.321428571428572e-06,
"loss": 1.6573,
"step": 38
},
{
"epoch": 0.06964285714285715,
"grad_norm": 0.32269155979156494,
"learning_rate": 9.30357142857143e-06,
"loss": 1.5897,
"step": 39
},
{
"epoch": 0.07142857142857142,
"grad_norm": 0.30696460604667664,
"learning_rate": 9.285714285714288e-06,
"loss": 1.6129,
"step": 40
},
{
"epoch": 0.07321428571428572,
"grad_norm": 0.31791722774505615,
"learning_rate": 9.267857142857144e-06,
"loss": 1.6427,
"step": 41
},
{
"epoch": 0.075,
"grad_norm": 0.31335708498954773,
"learning_rate": 9.250000000000001e-06,
"loss": 1.5697,
"step": 42
},
{
"epoch": 0.07678571428571429,
"grad_norm": 0.31561946868896484,
"learning_rate": 9.232142857142859e-06,
"loss": 1.653,
"step": 43
},
{
"epoch": 0.07857142857142857,
"grad_norm": 0.3354925215244293,
"learning_rate": 9.214285714285715e-06,
"loss": 1.5793,
"step": 44
},
{
"epoch": 0.08035714285714286,
"grad_norm": 0.3125816583633423,
"learning_rate": 9.196428571428571e-06,
"loss": 1.5243,
"step": 45
},
{
"epoch": 0.08214285714285714,
"grad_norm": 0.3044743537902832,
"learning_rate": 9.178571428571429e-06,
"loss": 1.5449,
"step": 46
},
{
"epoch": 0.08392857142857142,
"grad_norm": 0.3276882469654083,
"learning_rate": 9.160714285714286e-06,
"loss": 1.65,
"step": 47
},
{
"epoch": 0.08571428571428572,
"grad_norm": 0.34233033657073975,
"learning_rate": 9.142857142857144e-06,
"loss": 1.5939,
"step": 48
},
{
"epoch": 0.0875,
"grad_norm": 0.36175018548965454,
"learning_rate": 9.125e-06,
"loss": 1.5721,
"step": 49
},
{
"epoch": 0.08928571428571429,
"grad_norm": 0.3349234461784363,
"learning_rate": 9.107142857142858e-06,
"loss": 1.615,
"step": 50
},
{
"epoch": 0.09107142857142857,
"grad_norm": 0.3084949553012848,
"learning_rate": 9.089285714285715e-06,
"loss": 1.5363,
"step": 51
},
{
"epoch": 0.09285714285714286,
"grad_norm": 0.30576226115226746,
"learning_rate": 9.071428571428573e-06,
"loss": 1.521,
"step": 52
},
{
"epoch": 0.09464285714285714,
"grad_norm": 0.3125852346420288,
"learning_rate": 9.053571428571429e-06,
"loss": 1.5768,
"step": 53
},
{
"epoch": 0.09642857142857143,
"grad_norm": 0.3131454586982727,
"learning_rate": 9.035714285714287e-06,
"loss": 1.5326,
"step": 54
},
{
"epoch": 0.09821428571428571,
"grad_norm": 0.3140488862991333,
"learning_rate": 9.017857142857144e-06,
"loss": 1.6095,
"step": 55
},
{
"epoch": 0.1,
"grad_norm": 0.3363245129585266,
"learning_rate": 9e-06,
"loss": 1.59,
"step": 56
},
{
"epoch": 0.10178571428571428,
"grad_norm": 0.3293783664703369,
"learning_rate": 8.982142857142858e-06,
"loss": 1.5722,
"step": 57
},
{
"epoch": 0.10357142857142858,
"grad_norm": 0.31743383407592773,
"learning_rate": 8.964285714285716e-06,
"loss": 1.4729,
"step": 58
},
{
"epoch": 0.10535714285714286,
"grad_norm": 0.3278064727783203,
"learning_rate": 8.946428571428573e-06,
"loss": 1.494,
"step": 59
},
{
"epoch": 0.10714285714285714,
"grad_norm": 0.3063907027244568,
"learning_rate": 8.92857142857143e-06,
"loss": 1.5308,
"step": 60
},
{
"epoch": 0.10892857142857143,
"grad_norm": 0.32603612542152405,
"learning_rate": 8.910714285714287e-06,
"loss": 1.505,
"step": 61
},
{
"epoch": 0.11071428571428571,
"grad_norm": 0.32034823298454285,
"learning_rate": 8.892857142857143e-06,
"loss": 1.5049,
"step": 62
},
{
"epoch": 0.1125,
"grad_norm": 0.2999460697174072,
"learning_rate": 8.875e-06,
"loss": 1.4992,
"step": 63
},
{
"epoch": 0.11428571428571428,
"grad_norm": 0.3157401382923126,
"learning_rate": 8.857142857142858e-06,
"loss": 1.5424,
"step": 64
},
{
"epoch": 0.11607142857142858,
"grad_norm": 0.46912819147109985,
"learning_rate": 8.839285714285714e-06,
"loss": 1.542,
"step": 65
},
{
"epoch": 0.11785714285714285,
"grad_norm": 0.31604939699172974,
"learning_rate": 8.821428571428572e-06,
"loss": 1.5034,
"step": 66
},
{
"epoch": 0.11964285714285715,
"grad_norm": 0.3068097233772278,
"learning_rate": 8.80357142857143e-06,
"loss": 1.5091,
"step": 67
},
{
"epoch": 0.12142857142857143,
"grad_norm": 0.30518803000450134,
"learning_rate": 8.785714285714286e-06,
"loss": 1.4497,
"step": 68
},
{
"epoch": 0.12321428571428572,
"grad_norm": 0.3579995632171631,
"learning_rate": 8.767857142857143e-06,
"loss": 1.5062,
"step": 69
},
{
"epoch": 0.125,
"grad_norm": 0.3688044250011444,
"learning_rate": 8.750000000000001e-06,
"loss": 1.5727,
"step": 70
},
{
"epoch": 0.12678571428571428,
"grad_norm": 0.32263967394828796,
"learning_rate": 8.732142857142859e-06,
"loss": 1.4353,
"step": 71
},
{
"epoch": 0.12857142857142856,
"grad_norm": 0.30005359649658203,
"learning_rate": 8.714285714285715e-06,
"loss": 1.468,
"step": 72
},
{
"epoch": 0.13035714285714287,
"grad_norm": 0.33392152190208435,
"learning_rate": 8.696428571428572e-06,
"loss": 1.5394,
"step": 73
},
{
"epoch": 0.13214285714285715,
"grad_norm": 0.36938440799713135,
"learning_rate": 8.67857142857143e-06,
"loss": 1.5137,
"step": 74
},
{
"epoch": 0.13392857142857142,
"grad_norm": 0.33022594451904297,
"learning_rate": 8.660714285714286e-06,
"loss": 1.4917,
"step": 75
},
{
"epoch": 0.1357142857142857,
"grad_norm": 0.35589250922203064,
"learning_rate": 8.642857142857144e-06,
"loss": 1.4468,
"step": 76
},
{
"epoch": 0.1375,
"grad_norm": 0.3233088552951813,
"learning_rate": 8.625000000000001e-06,
"loss": 1.4459,
"step": 77
},
{
"epoch": 0.1392857142857143,
"grad_norm": 0.3153448700904846,
"learning_rate": 8.607142857142859e-06,
"loss": 1.444,
"step": 78
},
{
"epoch": 0.14107142857142857,
"grad_norm": 0.3484407365322113,
"learning_rate": 8.589285714285715e-06,
"loss": 1.4911,
"step": 79
},
{
"epoch": 0.14285714285714285,
"grad_norm": 0.47459274530410767,
"learning_rate": 8.571428571428571e-06,
"loss": 1.5459,
"step": 80
},
{
"epoch": 0.14464285714285716,
"grad_norm": 0.31747081875801086,
"learning_rate": 8.553571428571429e-06,
"loss": 1.4274,
"step": 81
},
{
"epoch": 0.14642857142857144,
"grad_norm": 0.34533751010894775,
"learning_rate": 8.535714285714286e-06,
"loss": 1.5468,
"step": 82
},
{
"epoch": 0.14821428571428572,
"grad_norm": 0.44818830490112305,
"learning_rate": 8.517857142857144e-06,
"loss": 1.5172,
"step": 83
},
{
"epoch": 0.15,
"grad_norm": 0.3334464728832245,
"learning_rate": 8.5e-06,
"loss": 1.4703,
"step": 84
},
{
"epoch": 0.15178571428571427,
"grad_norm": 0.33317744731903076,
"learning_rate": 8.482142857142858e-06,
"loss": 1.4714,
"step": 85
},
{
"epoch": 0.15357142857142858,
"grad_norm": 0.3387729227542877,
"learning_rate": 8.464285714285715e-06,
"loss": 1.4725,
"step": 86
},
{
"epoch": 0.15535714285714286,
"grad_norm": 0.3361343443393707,
"learning_rate": 8.446428571428571e-06,
"loss": 1.4619,
"step": 87
},
{
"epoch": 0.15714285714285714,
"grad_norm": 0.3541395664215088,
"learning_rate": 8.428571428571429e-06,
"loss": 1.5199,
"step": 88
},
{
"epoch": 0.15892857142857142,
"grad_norm": 0.33765190839767456,
"learning_rate": 8.410714285714287e-06,
"loss": 1.4983,
"step": 89
},
{
"epoch": 0.16071428571428573,
"grad_norm": 0.452921599149704,
"learning_rate": 8.392857142857144e-06,
"loss": 1.4619,
"step": 90
},
{
"epoch": 0.1625,
"grad_norm": 0.3351086974143982,
"learning_rate": 8.375e-06,
"loss": 1.3916,
"step": 91
},
{
"epoch": 0.16428571428571428,
"grad_norm": 0.3732268810272217,
"learning_rate": 8.357142857142858e-06,
"loss": 1.484,
"step": 92
},
{
"epoch": 0.16607142857142856,
"grad_norm": 0.36562690138816833,
"learning_rate": 8.339285714285716e-06,
"loss": 1.4536,
"step": 93
},
{
"epoch": 0.16785714285714284,
"grad_norm": 0.32931259274482727,
"learning_rate": 8.321428571428573e-06,
"loss": 1.4649,
"step": 94
},
{
"epoch": 0.16964285714285715,
"grad_norm": 0.5226176381111145,
"learning_rate": 8.30357142857143e-06,
"loss": 1.4584,
"step": 95
},
{
"epoch": 0.17142857142857143,
"grad_norm": 0.5753301978111267,
"learning_rate": 8.285714285714287e-06,
"loss": 1.5382,
"step": 96
},
{
"epoch": 0.1732142857142857,
"grad_norm": 0.34495556354522705,
"learning_rate": 8.267857142857143e-06,
"loss": 1.4719,
"step": 97
},
{
"epoch": 0.175,
"grad_norm": 0.35743340849876404,
"learning_rate": 8.25e-06,
"loss": 1.4244,
"step": 98
},
{
"epoch": 0.1767857142857143,
"grad_norm": 0.3325170576572418,
"learning_rate": 8.232142857142857e-06,
"loss": 1.4362,
"step": 99
},
{
"epoch": 0.17857142857142858,
"grad_norm": 0.37087783217430115,
"learning_rate": 8.214285714285714e-06,
"loss": 1.4888,
"step": 100
},
{
"epoch": 0.18035714285714285,
"grad_norm": 0.34744587540626526,
"learning_rate": 8.196428571428572e-06,
"loss": 1.4836,
"step": 101
},
{
"epoch": 0.18214285714285713,
"grad_norm": 0.3370732367038727,
"learning_rate": 8.17857142857143e-06,
"loss": 1.4962,
"step": 102
},
{
"epoch": 0.18392857142857144,
"grad_norm": 0.36961331963539124,
"learning_rate": 8.160714285714286e-06,
"loss": 1.4483,
"step": 103
},
{
"epoch": 0.18571428571428572,
"grad_norm": 0.3708970844745636,
"learning_rate": 8.142857142857143e-06,
"loss": 1.4767,
"step": 104
},
{
"epoch": 0.1875,
"grad_norm": 0.34523463249206543,
"learning_rate": 8.125000000000001e-06,
"loss": 1.4583,
"step": 105
},
{
"epoch": 0.18928571428571428,
"grad_norm": 0.34832295775413513,
"learning_rate": 8.107142857142859e-06,
"loss": 1.4046,
"step": 106
},
{
"epoch": 0.19107142857142856,
"grad_norm": 0.3623919188976288,
"learning_rate": 8.089285714285715e-06,
"loss": 1.3721,
"step": 107
},
{
"epoch": 0.19285714285714287,
"grad_norm": 0.39359399676322937,
"learning_rate": 8.071428571428572e-06,
"loss": 1.5102,
"step": 108
},
{
"epoch": 0.19464285714285715,
"grad_norm": 0.4397876262664795,
"learning_rate": 8.05357142857143e-06,
"loss": 1.4742,
"step": 109
},
{
"epoch": 0.19642857142857142,
"grad_norm": 0.3398594856262207,
"learning_rate": 8.035714285714286e-06,
"loss": 1.4093,
"step": 110
},
{
"epoch": 0.1982142857142857,
"grad_norm": 0.44007447361946106,
"learning_rate": 8.017857142857144e-06,
"loss": 1.4157,
"step": 111
},
{
"epoch": 0.2,
"grad_norm": 0.384075790643692,
"learning_rate": 8.000000000000001e-06,
"loss": 1.4729,
"step": 112
},
{
"epoch": 0.2017857142857143,
"grad_norm": 0.460844486951828,
"learning_rate": 7.982142857142859e-06,
"loss": 1.4098,
"step": 113
},
{
"epoch": 0.20357142857142857,
"grad_norm": 0.3587513267993927,
"learning_rate": 7.964285714285715e-06,
"loss": 1.4758,
"step": 114
},
{
"epoch": 0.20535714285714285,
"grad_norm": 0.4182850420475006,
"learning_rate": 7.946428571428571e-06,
"loss": 1.3661,
"step": 115
},
{
"epoch": 0.20714285714285716,
"grad_norm": 0.41321325302124023,
"learning_rate": 7.928571428571429e-06,
"loss": 1.4011,
"step": 116
},
{
"epoch": 0.20892857142857144,
"grad_norm": 0.3603422939777374,
"learning_rate": 7.910714285714286e-06,
"loss": 1.412,
"step": 117
},
{
"epoch": 0.21071428571428572,
"grad_norm": 0.3570718467235565,
"learning_rate": 7.892857142857144e-06,
"loss": 1.4377,
"step": 118
},
{
"epoch": 0.2125,
"grad_norm": 0.358900785446167,
"learning_rate": 7.875e-06,
"loss": 1.3817,
"step": 119
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.3678073585033417,
"learning_rate": 7.857142857142858e-06,
"loss": 1.4669,
"step": 120
},
{
"epoch": 0.21607142857142858,
"grad_norm": 0.35429590940475464,
"learning_rate": 7.839285714285715e-06,
"loss": 1.4117,
"step": 121
},
{
"epoch": 0.21785714285714286,
"grad_norm": 0.3580191433429718,
"learning_rate": 7.821428571428571e-06,
"loss": 1.4172,
"step": 122
},
{
"epoch": 0.21964285714285714,
"grad_norm": 0.35501304268836975,
"learning_rate": 7.803571428571429e-06,
"loss": 1.4412,
"step": 123
},
{
"epoch": 0.22142857142857142,
"grad_norm": 0.35224324464797974,
"learning_rate": 7.785714285714287e-06,
"loss": 1.4106,
"step": 124
},
{
"epoch": 0.22321428571428573,
"grad_norm": 0.3726520836353302,
"learning_rate": 7.767857142857144e-06,
"loss": 1.3818,
"step": 125
},
{
"epoch": 0.225,
"grad_norm": 0.3572198748588562,
"learning_rate": 7.75e-06,
"loss": 1.3844,
"step": 126
},
{
"epoch": 0.22678571428571428,
"grad_norm": 0.3563896715641022,
"learning_rate": 7.732142857142858e-06,
"loss": 1.4052,
"step": 127
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.37256497144699097,
"learning_rate": 7.714285714285716e-06,
"loss": 1.4813,
"step": 128
},
{
"epoch": 0.23035714285714284,
"grad_norm": 0.3678889870643616,
"learning_rate": 7.696428571428572e-06,
"loss": 1.424,
"step": 129
},
{
"epoch": 0.23214285714285715,
"grad_norm": 0.37930241227149963,
"learning_rate": 7.67857142857143e-06,
"loss": 1.4352,
"step": 130
},
{
"epoch": 0.23392857142857143,
"grad_norm": 0.34848180413246155,
"learning_rate": 7.660714285714287e-06,
"loss": 1.3944,
"step": 131
},
{
"epoch": 0.2357142857142857,
"grad_norm": 0.4487292468547821,
"learning_rate": 7.642857142857143e-06,
"loss": 1.5334,
"step": 132
},
{
"epoch": 0.2375,
"grad_norm": 0.4080350399017334,
"learning_rate": 7.625e-06,
"loss": 1.3737,
"step": 133
},
{
"epoch": 0.2392857142857143,
"grad_norm": 0.3762721121311188,
"learning_rate": 7.6071428571428575e-06,
"loss": 1.4288,
"step": 134
},
{
"epoch": 0.24107142857142858,
"grad_norm": 0.38287535309791565,
"learning_rate": 7.589285714285714e-06,
"loss": 1.4398,
"step": 135
},
{
"epoch": 0.24285714285714285,
"grad_norm": 0.37439846992492676,
"learning_rate": 7.571428571428572e-06,
"loss": 1.4314,
"step": 136
},
{
"epoch": 0.24464285714285713,
"grad_norm": 0.3716781735420227,
"learning_rate": 7.553571428571429e-06,
"loss": 1.4043,
"step": 137
},
{
"epoch": 0.24642857142857144,
"grad_norm": 0.36782070994377136,
"learning_rate": 7.5357142857142865e-06,
"loss": 1.4057,
"step": 138
},
{
"epoch": 0.24821428571428572,
"grad_norm": 0.36489415168762207,
"learning_rate": 7.517857142857143e-06,
"loss": 1.3609,
"step": 139
},
{
"epoch": 0.25,
"grad_norm": 0.3928413391113281,
"learning_rate": 7.500000000000001e-06,
"loss": 1.427,
"step": 140
},
{
"epoch": 0.2517857142857143,
"grad_norm": 0.36631447076797485,
"learning_rate": 7.482142857142858e-06,
"loss": 1.3939,
"step": 141
},
{
"epoch": 0.25357142857142856,
"grad_norm": 0.3827175498008728,
"learning_rate": 7.464285714285715e-06,
"loss": 1.4078,
"step": 142
},
{
"epoch": 0.25535714285714284,
"grad_norm": 0.37270644307136536,
"learning_rate": 7.446428571428572e-06,
"loss": 1.4459,
"step": 143
},
{
"epoch": 0.2571428571428571,
"grad_norm": 0.3645498752593994,
"learning_rate": 7.428571428571429e-06,
"loss": 1.4207,
"step": 144
},
{
"epoch": 0.25892857142857145,
"grad_norm": 0.3855854868888855,
"learning_rate": 7.410714285714287e-06,
"loss": 1.4153,
"step": 145
},
{
"epoch": 0.26071428571428573,
"grad_norm": 0.40187859535217285,
"learning_rate": 7.392857142857144e-06,
"loss": 1.3937,
"step": 146
},
{
"epoch": 0.2625,
"grad_norm": 0.39412420988082886,
"learning_rate": 7.375000000000001e-06,
"loss": 1.3786,
"step": 147
},
{
"epoch": 0.2642857142857143,
"grad_norm": 0.3723837733268738,
"learning_rate": 7.357142857142858e-06,
"loss": 1.3455,
"step": 148
},
{
"epoch": 0.26607142857142857,
"grad_norm": 0.36095982789993286,
"learning_rate": 7.339285714285714e-06,
"loss": 1.3997,
"step": 149
},
{
"epoch": 0.26785714285714285,
"grad_norm": 0.4379737079143524,
"learning_rate": 7.321428571428572e-06,
"loss": 1.4803,
"step": 150
},
{
"epoch": 0.26964285714285713,
"grad_norm": 0.36974653601646423,
"learning_rate": 7.303571428571429e-06,
"loss": 1.4116,
"step": 151
},
{
"epoch": 0.2714285714285714,
"grad_norm": 0.3808487057685852,
"learning_rate": 7.285714285714286e-06,
"loss": 1.3931,
"step": 152
},
{
"epoch": 0.2732142857142857,
"grad_norm": 0.38126876950263977,
"learning_rate": 7.267857142857143e-06,
"loss": 1.3887,
"step": 153
},
{
"epoch": 0.275,
"grad_norm": 0.39696696400642395,
"learning_rate": 7.25e-06,
"loss": 1.4065,
"step": 154
},
{
"epoch": 0.2767857142857143,
"grad_norm": 0.3827652633190155,
"learning_rate": 7.232142857142858e-06,
"loss": 1.3768,
"step": 155
},
{
"epoch": 0.2785714285714286,
"grad_norm": 0.37400951981544495,
"learning_rate": 7.2142857142857145e-06,
"loss": 1.3904,
"step": 156
},
{
"epoch": 0.28035714285714286,
"grad_norm": 0.3787144720554352,
"learning_rate": 7.196428571428572e-06,
"loss": 1.4174,
"step": 157
},
{
"epoch": 0.28214285714285714,
"grad_norm": 0.378603994846344,
"learning_rate": 7.178571428571429e-06,
"loss": 1.348,
"step": 158
},
{
"epoch": 0.2839285714285714,
"grad_norm": 0.39289960265159607,
"learning_rate": 7.160714285714287e-06,
"loss": 1.3872,
"step": 159
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.38201239705085754,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.3866,
"step": 160
},
{
"epoch": 0.2875,
"grad_norm": 0.37428876757621765,
"learning_rate": 7.125e-06,
"loss": 1.3853,
"step": 161
},
{
"epoch": 0.2892857142857143,
"grad_norm": 0.3859560489654541,
"learning_rate": 7.107142857142858e-06,
"loss": 1.3971,
"step": 162
},
{
"epoch": 0.2910714285714286,
"grad_norm": 0.4165991246700287,
"learning_rate": 7.089285714285715e-06,
"loss": 1.4174,
"step": 163
},
{
"epoch": 0.29285714285714287,
"grad_norm": 0.43760836124420166,
"learning_rate": 7.0714285714285726e-06,
"loss": 1.3777,
"step": 164
},
{
"epoch": 0.29464285714285715,
"grad_norm": 0.37146738171577454,
"learning_rate": 7.053571428571429e-06,
"loss": 1.3949,
"step": 165
},
{
"epoch": 0.29642857142857143,
"grad_norm": 0.3897382915019989,
"learning_rate": 7.035714285714287e-06,
"loss": 1.3523,
"step": 166
},
{
"epoch": 0.2982142857142857,
"grad_norm": 0.4206221401691437,
"learning_rate": 7.017857142857143e-06,
"loss": 1.364,
"step": 167
},
{
"epoch": 0.3,
"grad_norm": 0.3997578024864197,
"learning_rate": 7e-06,
"loss": 1.3687,
"step": 168
},
{
"epoch": 0.30178571428571427,
"grad_norm": 0.3917233943939209,
"learning_rate": 6.9821428571428576e-06,
"loss": 1.3873,
"step": 169
},
{
"epoch": 0.30357142857142855,
"grad_norm": 0.4036829471588135,
"learning_rate": 6.964285714285714e-06,
"loss": 1.3728,
"step": 170
},
{
"epoch": 0.3053571428571429,
"grad_norm": 0.41000431776046753,
"learning_rate": 6.946428571428572e-06,
"loss": 1.298,
"step": 171
},
{
"epoch": 0.30714285714285716,
"grad_norm": 0.40925708413124084,
"learning_rate": 6.928571428571429e-06,
"loss": 1.2966,
"step": 172
},
{
"epoch": 0.30892857142857144,
"grad_norm": 0.43786993622779846,
"learning_rate": 6.910714285714286e-06,
"loss": 1.4269,
"step": 173
},
{
"epoch": 0.3107142857142857,
"grad_norm": 0.41700488328933716,
"learning_rate": 6.892857142857143e-06,
"loss": 1.3893,
"step": 174
},
{
"epoch": 0.3125,
"grad_norm": 0.5371460318565369,
"learning_rate": 6.875e-06,
"loss": 1.3981,
"step": 175
},
{
"epoch": 0.3142857142857143,
"grad_norm": 0.39283287525177,
"learning_rate": 6.857142857142858e-06,
"loss": 1.3413,
"step": 176
},
{
"epoch": 0.31607142857142856,
"grad_norm": 0.450718492269516,
"learning_rate": 6.839285714285715e-06,
"loss": 1.3852,
"step": 177
},
{
"epoch": 0.31785714285714284,
"grad_norm": 0.3878072500228882,
"learning_rate": 6.8214285714285724e-06,
"loss": 1.347,
"step": 178
},
{
"epoch": 0.3196428571428571,
"grad_norm": 0.3963475525379181,
"learning_rate": 6.803571428571429e-06,
"loss": 1.3509,
"step": 179
},
{
"epoch": 0.32142857142857145,
"grad_norm": 0.3902972638607025,
"learning_rate": 6.785714285714287e-06,
"loss": 1.4188,
"step": 180
},
{
"epoch": 0.32321428571428573,
"grad_norm": 0.4161182641983032,
"learning_rate": 6.767857142857144e-06,
"loss": 1.3397,
"step": 181
},
{
"epoch": 0.325,
"grad_norm": 0.4012393355369568,
"learning_rate": 6.750000000000001e-06,
"loss": 1.4165,
"step": 182
},
{
"epoch": 0.3267857142857143,
"grad_norm": 0.42954060435295105,
"learning_rate": 6.732142857142858e-06,
"loss": 1.3531,
"step": 183
},
{
"epoch": 0.32857142857142857,
"grad_norm": 0.3946102261543274,
"learning_rate": 6.714285714285714e-06,
"loss": 1.425,
"step": 184
},
{
"epoch": 0.33035714285714285,
"grad_norm": 0.4067486524581909,
"learning_rate": 6.696428571428571e-06,
"loss": 1.3773,
"step": 185
},
{
"epoch": 0.33214285714285713,
"grad_norm": 0.4191884696483612,
"learning_rate": 6.678571428571429e-06,
"loss": 1.2963,
"step": 186
},
{
"epoch": 0.3339285714285714,
"grad_norm": 0.3840485215187073,
"learning_rate": 6.660714285714286e-06,
"loss": 1.3377,
"step": 187
},
{
"epoch": 0.3357142857142857,
"grad_norm": 0.4106978476047516,
"learning_rate": 6.642857142857143e-06,
"loss": 1.3604,
"step": 188
},
{
"epoch": 0.3375,
"grad_norm": 0.4163394570350647,
"learning_rate": 6.625e-06,
"loss": 1.3599,
"step": 189
},
{
"epoch": 0.3392857142857143,
"grad_norm": 0.3918200433254242,
"learning_rate": 6.607142857142858e-06,
"loss": 1.3012,
"step": 190
},
{
"epoch": 0.3410714285714286,
"grad_norm": 0.4022958278656006,
"learning_rate": 6.589285714285715e-06,
"loss": 1.3567,
"step": 191
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.40892308950424194,
"learning_rate": 6.571428571428572e-06,
"loss": 1.3227,
"step": 192
},
{
"epoch": 0.34464285714285714,
"grad_norm": 0.4009507894515991,
"learning_rate": 6.553571428571429e-06,
"loss": 1.3213,
"step": 193
},
{
"epoch": 0.3464285714285714,
"grad_norm": 0.4105026125907898,
"learning_rate": 6.535714285714286e-06,
"loss": 1.3223,
"step": 194
},
{
"epoch": 0.3482142857142857,
"grad_norm": 0.4591858983039856,
"learning_rate": 6.517857142857144e-06,
"loss": 1.3754,
"step": 195
},
{
"epoch": 0.35,
"grad_norm": 0.4026089906692505,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.3453,
"step": 196
},
{
"epoch": 0.3517857142857143,
"grad_norm": 0.4099741280078888,
"learning_rate": 6.482142857142858e-06,
"loss": 1.3592,
"step": 197
},
{
"epoch": 0.3535714285714286,
"grad_norm": 0.4273921549320221,
"learning_rate": 6.464285714285715e-06,
"loss": 1.3119,
"step": 198
},
{
"epoch": 0.35535714285714287,
"grad_norm": 0.415178120136261,
"learning_rate": 6.446428571428573e-06,
"loss": 1.3219,
"step": 199
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.4362969398498535,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.3299,
"step": 200
},
{
"epoch": 0.35892857142857143,
"grad_norm": 0.4466477036476135,
"learning_rate": 6.410714285714287e-06,
"loss": 1.3376,
"step": 201
},
{
"epoch": 0.3607142857142857,
"grad_norm": 0.44762319326400757,
"learning_rate": 6.392857142857143e-06,
"loss": 1.3716,
"step": 202
},
{
"epoch": 0.3625,
"grad_norm": 0.42173707485198975,
"learning_rate": 6.375e-06,
"loss": 1.3652,
"step": 203
},
{
"epoch": 0.36428571428571427,
"grad_norm": 0.5742844343185425,
"learning_rate": 6.357142857142858e-06,
"loss": 1.3848,
"step": 204
},
{
"epoch": 0.36607142857142855,
"grad_norm": 0.437112957239151,
"learning_rate": 6.3392857142857145e-06,
"loss": 1.3121,
"step": 205
},
{
"epoch": 0.3678571428571429,
"grad_norm": 0.4026128351688385,
"learning_rate": 6.321428571428571e-06,
"loss": 1.3321,
"step": 206
},
{
"epoch": 0.36964285714285716,
"grad_norm": 0.4601041376590729,
"learning_rate": 6.303571428571429e-06,
"loss": 1.3313,
"step": 207
},
{
"epoch": 0.37142857142857144,
"grad_norm": 0.422014057636261,
"learning_rate": 6.285714285714286e-06,
"loss": 1.3062,
"step": 208
},
{
"epoch": 0.3732142857142857,
"grad_norm": 0.4235393702983856,
"learning_rate": 6.2678571428571435e-06,
"loss": 1.3259,
"step": 209
},
{
"epoch": 0.375,
"grad_norm": 0.3982096016407013,
"learning_rate": 6.25e-06,
"loss": 1.3089,
"step": 210
},
{
"epoch": 0.3767857142857143,
"grad_norm": 0.44556036591529846,
"learning_rate": 6.232142857142858e-06,
"loss": 1.3831,
"step": 211
},
{
"epoch": 0.37857142857142856,
"grad_norm": 0.4298821687698364,
"learning_rate": 6.214285714285715e-06,
"loss": 1.3776,
"step": 212
},
{
"epoch": 0.38035714285714284,
"grad_norm": 0.43510201573371887,
"learning_rate": 6.1964285714285725e-06,
"loss": 1.389,
"step": 213
},
{
"epoch": 0.3821428571428571,
"grad_norm": 0.455490380525589,
"learning_rate": 6.178571428571429e-06,
"loss": 1.4085,
"step": 214
},
{
"epoch": 0.38392857142857145,
"grad_norm": 0.5122373700141907,
"learning_rate": 6.160714285714286e-06,
"loss": 1.3985,
"step": 215
},
{
"epoch": 0.38571428571428573,
"grad_norm": 0.4266716241836548,
"learning_rate": 6.142857142857144e-06,
"loss": 1.3463,
"step": 216
},
{
"epoch": 0.3875,
"grad_norm": 0.4357399344444275,
"learning_rate": 6.125000000000001e-06,
"loss": 1.3664,
"step": 217
},
{
"epoch": 0.3892857142857143,
"grad_norm": 0.5229921340942383,
"learning_rate": 6.107142857142858e-06,
"loss": 1.2992,
"step": 218
},
{
"epoch": 0.39107142857142857,
"grad_norm": 0.4309137761592865,
"learning_rate": 6.089285714285714e-06,
"loss": 1.3435,
"step": 219
},
{
"epoch": 0.39285714285714285,
"grad_norm": 0.45066171884536743,
"learning_rate": 6.071428571428571e-06,
"loss": 1.3517,
"step": 220
},
{
"epoch": 0.39464285714285713,
"grad_norm": 0.4304474890232086,
"learning_rate": 6.053571428571429e-06,
"loss": 1.2906,
"step": 221
},
{
"epoch": 0.3964285714285714,
"grad_norm": 0.43573635816574097,
"learning_rate": 6.035714285714286e-06,
"loss": 1.2647,
"step": 222
},
{
"epoch": 0.3982142857142857,
"grad_norm": 0.5297744274139404,
"learning_rate": 6.017857142857143e-06,
"loss": 1.3169,
"step": 223
},
{
"epoch": 0.4,
"grad_norm": 0.5126820802688599,
"learning_rate": 6e-06,
"loss": 1.2604,
"step": 224
},
{
"epoch": 0.4017857142857143,
"grad_norm": 0.44824084639549255,
"learning_rate": 5.982142857142858e-06,
"loss": 1.3573,
"step": 225
},
{
"epoch": 0.4035714285714286,
"grad_norm": 0.6343708634376526,
"learning_rate": 5.964285714285715e-06,
"loss": 1.447,
"step": 226
},
{
"epoch": 0.40535714285714286,
"grad_norm": 0.4232017993927002,
"learning_rate": 5.9464285714285715e-06,
"loss": 1.3303,
"step": 227
},
{
"epoch": 0.40714285714285714,
"grad_norm": 0.44813570380210876,
"learning_rate": 5.928571428571429e-06,
"loss": 1.3647,
"step": 228
},
{
"epoch": 0.4089285714285714,
"grad_norm": 0.46437007188796997,
"learning_rate": 5.910714285714286e-06,
"loss": 1.3534,
"step": 229
},
{
"epoch": 0.4107142857142857,
"grad_norm": 0.4851243793964386,
"learning_rate": 5.892857142857144e-06,
"loss": 1.376,
"step": 230
},
{
"epoch": 0.4125,
"grad_norm": 0.43191760778427124,
"learning_rate": 5.8750000000000005e-06,
"loss": 1.3695,
"step": 231
},
{
"epoch": 0.4142857142857143,
"grad_norm": 0.4866816997528076,
"learning_rate": 5.857142857142858e-06,
"loss": 1.3624,
"step": 232
},
{
"epoch": 0.4160714285714286,
"grad_norm": 0.4390086531639099,
"learning_rate": 5.839285714285715e-06,
"loss": 1.3607,
"step": 233
},
{
"epoch": 0.41785714285714287,
"grad_norm": 0.6692396998405457,
"learning_rate": 5.821428571428573e-06,
"loss": 1.3467,
"step": 234
},
{
"epoch": 0.41964285714285715,
"grad_norm": 0.4408722221851349,
"learning_rate": 5.8035714285714295e-06,
"loss": 1.3205,
"step": 235
},
{
"epoch": 0.42142857142857143,
"grad_norm": 0.5886948704719543,
"learning_rate": 5.785714285714286e-06,
"loss": 1.3403,
"step": 236
},
{
"epoch": 0.4232142857142857,
"grad_norm": 0.6418564319610596,
"learning_rate": 5.767857142857143e-06,
"loss": 1.4085,
"step": 237
},
{
"epoch": 0.425,
"grad_norm": 0.46830517053604126,
"learning_rate": 5.75e-06,
"loss": 1.3981,
"step": 238
},
{
"epoch": 0.42678571428571427,
"grad_norm": 0.4939388036727905,
"learning_rate": 5.732142857142857e-06,
"loss": 1.3607,
"step": 239
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.4522291123867035,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.331,
"step": 240
},
{
"epoch": 0.4303571428571429,
"grad_norm": 0.45634329319000244,
"learning_rate": 5.696428571428571e-06,
"loss": 1.3513,
"step": 241
},
{
"epoch": 0.43214285714285716,
"grad_norm": 0.4688902795314789,
"learning_rate": 5.678571428571429e-06,
"loss": 1.314,
"step": 242
},
{
"epoch": 0.43392857142857144,
"grad_norm": 0.6173244714736938,
"learning_rate": 5.660714285714286e-06,
"loss": 1.3453,
"step": 243
},
{
"epoch": 0.4357142857142857,
"grad_norm": 0.4678085446357727,
"learning_rate": 5.6428571428571435e-06,
"loss": 1.3813,
"step": 244
},
{
"epoch": 0.4375,
"grad_norm": 0.4518805742263794,
"learning_rate": 5.625e-06,
"loss": 1.3408,
"step": 245
},
{
"epoch": 0.4392857142857143,
"grad_norm": 0.44260984659194946,
"learning_rate": 5.607142857142858e-06,
"loss": 1.3764,
"step": 246
},
{
"epoch": 0.44107142857142856,
"grad_norm": 0.4549272060394287,
"learning_rate": 5.589285714285715e-06,
"loss": 1.3144,
"step": 247
},
{
"epoch": 0.44285714285714284,
"grad_norm": 0.4923824071884155,
"learning_rate": 5.571428571428572e-06,
"loss": 1.3239,
"step": 248
},
{
"epoch": 0.4446428571428571,
"grad_norm": 0.45576098561286926,
"learning_rate": 5.553571428571429e-06,
"loss": 1.3326,
"step": 249
},
{
"epoch": 0.44642857142857145,
"grad_norm": 0.463734894990921,
"learning_rate": 5.535714285714286e-06,
"loss": 1.4091,
"step": 250
},
{
"epoch": 0.44821428571428573,
"grad_norm": 0.6547235250473022,
"learning_rate": 5.517857142857144e-06,
"loss": 1.3331,
"step": 251
},
{
"epoch": 0.45,
"grad_norm": 0.47086387872695923,
"learning_rate": 5.500000000000001e-06,
"loss": 1.2845,
"step": 252
},
{
"epoch": 0.4517857142857143,
"grad_norm": 0.490377813577652,
"learning_rate": 5.482142857142858e-06,
"loss": 1.3323,
"step": 253
},
{
"epoch": 0.45357142857142857,
"grad_norm": 0.44158506393432617,
"learning_rate": 5.464285714285714e-06,
"loss": 1.3184,
"step": 254
},
{
"epoch": 0.45535714285714285,
"grad_norm": 0.4678877592086792,
"learning_rate": 5.446428571428571e-06,
"loss": 1.3521,
"step": 255
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.437651664018631,
"learning_rate": 5.428571428571429e-06,
"loss": 1.2956,
"step": 256
},
{
"epoch": 0.4589285714285714,
"grad_norm": 0.46878582239151,
"learning_rate": 5.410714285714286e-06,
"loss": 1.2902,
"step": 257
},
{
"epoch": 0.4607142857142857,
"grad_norm": 0.46873459219932556,
"learning_rate": 5.392857142857143e-06,
"loss": 1.344,
"step": 258
},
{
"epoch": 0.4625,
"grad_norm": 0.4619480073451996,
"learning_rate": 5.375e-06,
"loss": 1.3006,
"step": 259
},
{
"epoch": 0.4642857142857143,
"grad_norm": 0.4431358277797699,
"learning_rate": 5.357142857142857e-06,
"loss": 1.3011,
"step": 260
},
{
"epoch": 0.4660714285714286,
"grad_norm": 0.4527128338813782,
"learning_rate": 5.339285714285715e-06,
"loss": 1.3425,
"step": 261
},
{
"epoch": 0.46785714285714286,
"grad_norm": 0.44812703132629395,
"learning_rate": 5.3214285714285715e-06,
"loss": 1.3324,
"step": 262
},
{
"epoch": 0.46964285714285714,
"grad_norm": 0.4387049973011017,
"learning_rate": 5.303571428571429e-06,
"loss": 1.2346,
"step": 263
},
{
"epoch": 0.4714285714285714,
"grad_norm": 0.49831196665763855,
"learning_rate": 5.285714285714286e-06,
"loss": 1.3146,
"step": 264
},
{
"epoch": 0.4732142857142857,
"grad_norm": 0.6227532029151917,
"learning_rate": 5.267857142857144e-06,
"loss": 1.3182,
"step": 265
},
{
"epoch": 0.475,
"grad_norm": 0.7037027478218079,
"learning_rate": 5.2500000000000006e-06,
"loss": 1.4061,
"step": 266
},
{
"epoch": 0.4767857142857143,
"grad_norm": 0.4282563328742981,
"learning_rate": 5.232142857142858e-06,
"loss": 1.3054,
"step": 267
},
{
"epoch": 0.4785714285714286,
"grad_norm": 0.46296414732933044,
"learning_rate": 5.214285714285715e-06,
"loss": 1.3316,
"step": 268
},
{
"epoch": 0.48035714285714287,
"grad_norm": 0.44861307740211487,
"learning_rate": 5.196428571428572e-06,
"loss": 1.3611,
"step": 269
},
{
"epoch": 0.48214285714285715,
"grad_norm": 0.6239879727363586,
"learning_rate": 5.1785714285714296e-06,
"loss": 1.3288,
"step": 270
},
{
"epoch": 0.48392857142857143,
"grad_norm": 0.4510335624217987,
"learning_rate": 5.160714285714286e-06,
"loss": 1.2894,
"step": 271
},
{
"epoch": 0.4857142857142857,
"grad_norm": 0.44531846046447754,
"learning_rate": 5.142857142857142e-06,
"loss": 1.3373,
"step": 272
},
{
"epoch": 0.4875,
"grad_norm": 0.527055025100708,
"learning_rate": 5.125e-06,
"loss": 1.2901,
"step": 273
},
{
"epoch": 0.48928571428571427,
"grad_norm": 0.4528893530368805,
"learning_rate": 5.107142857142857e-06,
"loss": 1.2937,
"step": 274
},
{
"epoch": 0.49107142857142855,
"grad_norm": 0.4583114981651306,
"learning_rate": 5.0892857142857146e-06,
"loss": 1.303,
"step": 275
},
{
"epoch": 0.4928571428571429,
"grad_norm": 0.43386757373809814,
"learning_rate": 5.071428571428571e-06,
"loss": 1.2684,
"step": 276
},
{
"epoch": 0.49464285714285716,
"grad_norm": 0.4504014253616333,
"learning_rate": 5.053571428571429e-06,
"loss": 1.3475,
"step": 277
},
{
"epoch": 0.49642857142857144,
"grad_norm": 0.5378859639167786,
"learning_rate": 5.035714285714286e-06,
"loss": 1.325,
"step": 278
},
{
"epoch": 0.4982142857142857,
"grad_norm": 0.45138758420944214,
"learning_rate": 5.017857142857144e-06,
"loss": 1.3187,
"step": 279
},
{
"epoch": 0.5,
"grad_norm": 0.446903258562088,
"learning_rate": 5e-06,
"loss": 1.2773,
"step": 280
},
{
"epoch": 0.5017857142857143,
"grad_norm": 0.5092656016349792,
"learning_rate": 4.982142857142857e-06,
"loss": 1.3701,
"step": 281
},
{
"epoch": 0.5035714285714286,
"grad_norm": 0.4944141209125519,
"learning_rate": 4.964285714285715e-06,
"loss": 1.3226,
"step": 282
},
{
"epoch": 0.5053571428571428,
"grad_norm": 0.4606040418148041,
"learning_rate": 4.946428571428572e-06,
"loss": 1.2847,
"step": 283
},
{
"epoch": 0.5071428571428571,
"grad_norm": 0.4338245689868927,
"learning_rate": 4.928571428571429e-06,
"loss": 1.3507,
"step": 284
},
{
"epoch": 0.5089285714285714,
"grad_norm": 0.469093918800354,
"learning_rate": 4.910714285714286e-06,
"loss": 1.2832,
"step": 285
},
{
"epoch": 0.5107142857142857,
"grad_norm": 0.47597774863243103,
"learning_rate": 4.892857142857143e-06,
"loss": 1.3471,
"step": 286
},
{
"epoch": 0.5125,
"grad_norm": 0.4709608554840088,
"learning_rate": 4.875e-06,
"loss": 1.2719,
"step": 287
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.47710120677948,
"learning_rate": 4.857142857142858e-06,
"loss": 1.328,
"step": 288
},
{
"epoch": 0.5160714285714286,
"grad_norm": 0.4538082480430603,
"learning_rate": 4.839285714285714e-06,
"loss": 1.2903,
"step": 289
},
{
"epoch": 0.5178571428571429,
"grad_norm": 0.45876508951187134,
"learning_rate": 4.821428571428572e-06,
"loss": 1.3186,
"step": 290
},
{
"epoch": 0.5196428571428572,
"grad_norm": 0.5364006757736206,
"learning_rate": 4.803571428571429e-06,
"loss": 1.2773,
"step": 291
},
{
"epoch": 0.5214285714285715,
"grad_norm": 0.5031774640083313,
"learning_rate": 4.785714285714287e-06,
"loss": 1.3626,
"step": 292
},
{
"epoch": 0.5232142857142857,
"grad_norm": 0.46341967582702637,
"learning_rate": 4.7678571428571434e-06,
"loss": 1.2932,
"step": 293
},
{
"epoch": 0.525,
"grad_norm": 0.47424939274787903,
"learning_rate": 4.75e-06,
"loss": 1.3062,
"step": 294
},
{
"epoch": 0.5267857142857143,
"grad_norm": 0.4689320921897888,
"learning_rate": 4.732142857142857e-06,
"loss": 1.3843,
"step": 295
},
{
"epoch": 0.5285714285714286,
"grad_norm": 0.490421861410141,
"learning_rate": 4.714285714285715e-06,
"loss": 1.3136,
"step": 296
},
{
"epoch": 0.5303571428571429,
"grad_norm": 0.44690409302711487,
"learning_rate": 4.696428571428572e-06,
"loss": 1.288,
"step": 297
},
{
"epoch": 0.5321428571428571,
"grad_norm": 0.45565712451934814,
"learning_rate": 4.678571428571429e-06,
"loss": 1.3222,
"step": 298
},
{
"epoch": 0.5339285714285714,
"grad_norm": 0.4677983820438385,
"learning_rate": 4.660714285714286e-06,
"loss": 1.3776,
"step": 299
},
{
"epoch": 0.5357142857142857,
"grad_norm": 0.4681625962257385,
"learning_rate": 4.642857142857144e-06,
"loss": 1.3122,
"step": 300
},
{
"epoch": 0.5375,
"grad_norm": 0.46499693393707275,
"learning_rate": 4.625000000000001e-06,
"loss": 1.2907,
"step": 301
},
{
"epoch": 0.5392857142857143,
"grad_norm": 0.4690093398094177,
"learning_rate": 4.6071428571428574e-06,
"loss": 1.2977,
"step": 302
},
{
"epoch": 0.5410714285714285,
"grad_norm": 0.4936232566833496,
"learning_rate": 4.589285714285714e-06,
"loss": 1.348,
"step": 303
},
{
"epoch": 0.5428571428571428,
"grad_norm": 0.5024741888046265,
"learning_rate": 4.571428571428572e-06,
"loss": 1.3295,
"step": 304
},
{
"epoch": 0.5446428571428571,
"grad_norm": 0.48183590173721313,
"learning_rate": 4.553571428571429e-06,
"loss": 1.2871,
"step": 305
},
{
"epoch": 0.5464285714285714,
"grad_norm": 0.5088504552841187,
"learning_rate": 4.5357142857142865e-06,
"loss": 1.3618,
"step": 306
},
{
"epoch": 0.5482142857142858,
"grad_norm": 0.46338167786598206,
"learning_rate": 4.517857142857143e-06,
"loss": 1.2916,
"step": 307
},
{
"epoch": 0.55,
"grad_norm": 0.5637802481651306,
"learning_rate": 4.5e-06,
"loss": 1.2854,
"step": 308
},
{
"epoch": 0.5517857142857143,
"grad_norm": 0.4742050766944885,
"learning_rate": 4.482142857142858e-06,
"loss": 1.2854,
"step": 309
},
{
"epoch": 0.5535714285714286,
"grad_norm": 0.4686720669269562,
"learning_rate": 4.464285714285715e-06,
"loss": 1.3777,
"step": 310
},
{
"epoch": 0.5553571428571429,
"grad_norm": 0.466203510761261,
"learning_rate": 4.4464285714285715e-06,
"loss": 1.3441,
"step": 311
},
{
"epoch": 0.5571428571428572,
"grad_norm": 0.4458286464214325,
"learning_rate": 4.428571428571429e-06,
"loss": 1.2773,
"step": 312
},
{
"epoch": 0.5589285714285714,
"grad_norm": 0.4533957839012146,
"learning_rate": 4.410714285714286e-06,
"loss": 1.3226,
"step": 313
},
{
"epoch": 0.5607142857142857,
"grad_norm": 0.44287845492362976,
"learning_rate": 4.392857142857143e-06,
"loss": 1.2981,
"step": 314
},
{
"epoch": 0.5625,
"grad_norm": 0.4448222219944,
"learning_rate": 4.3750000000000005e-06,
"loss": 1.3291,
"step": 315
},
{
"epoch": 0.5642857142857143,
"grad_norm": 0.473034530878067,
"learning_rate": 4.357142857142857e-06,
"loss": 1.3117,
"step": 316
},
{
"epoch": 0.5660714285714286,
"grad_norm": 0.46073392033576965,
"learning_rate": 4.339285714285715e-06,
"loss": 1.3144,
"step": 317
},
{
"epoch": 0.5678571428571428,
"grad_norm": 0.47019240260124207,
"learning_rate": 4.321428571428572e-06,
"loss": 1.3596,
"step": 318
},
{
"epoch": 0.5696428571428571,
"grad_norm": 0.6573873162269592,
"learning_rate": 4.3035714285714295e-06,
"loss": 1.3042,
"step": 319
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.47485339641571045,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.3376,
"step": 320
},
{
"epoch": 0.5732142857142857,
"grad_norm": 0.4572908878326416,
"learning_rate": 4.267857142857143e-06,
"loss": 1.3182,
"step": 321
},
{
"epoch": 0.575,
"grad_norm": 0.5253431797027588,
"learning_rate": 4.25e-06,
"loss": 1.2737,
"step": 322
},
{
"epoch": 0.5767857142857142,
"grad_norm": 0.5506420135498047,
"learning_rate": 4.232142857142858e-06,
"loss": 1.3182,
"step": 323
},
{
"epoch": 0.5785714285714286,
"grad_norm": 0.46640023589134216,
"learning_rate": 4.2142857142857145e-06,
"loss": 1.2722,
"step": 324
},
{
"epoch": 0.5803571428571429,
"grad_norm": 0.5084778666496277,
"learning_rate": 4.196428571428572e-06,
"loss": 1.2764,
"step": 325
},
{
"epoch": 0.5821428571428572,
"grad_norm": 0.47945091128349304,
"learning_rate": 4.178571428571429e-06,
"loss": 1.2511,
"step": 326
},
{
"epoch": 0.5839285714285715,
"grad_norm": 0.4847634732723236,
"learning_rate": 4.160714285714287e-06,
"loss": 1.3431,
"step": 327
},
{
"epoch": 0.5857142857142857,
"grad_norm": 0.46884429454803467,
"learning_rate": 4.1428571428571435e-06,
"loss": 1.2875,
"step": 328
},
{
"epoch": 0.5875,
"grad_norm": 0.47149261832237244,
"learning_rate": 4.125e-06,
"loss": 1.318,
"step": 329
},
{
"epoch": 0.5892857142857143,
"grad_norm": 0.47463729977607727,
"learning_rate": 4.107142857142857e-06,
"loss": 1.3138,
"step": 330
},
{
"epoch": 0.5910714285714286,
"grad_norm": 0.5062056183815002,
"learning_rate": 4.089285714285715e-06,
"loss": 1.2764,
"step": 331
},
{
"epoch": 0.5928571428571429,
"grad_norm": 0.45992037653923035,
"learning_rate": 4.071428571428572e-06,
"loss": 1.2663,
"step": 332
},
{
"epoch": 0.5946428571428571,
"grad_norm": 0.47592759132385254,
"learning_rate": 4.053571428571429e-06,
"loss": 1.3085,
"step": 333
},
{
"epoch": 0.5964285714285714,
"grad_norm": 0.545661211013794,
"learning_rate": 4.035714285714286e-06,
"loss": 1.2963,
"step": 334
},
{
"epoch": 0.5982142857142857,
"grad_norm": 0.48694783449172974,
"learning_rate": 4.017857142857143e-06,
"loss": 1.3279,
"step": 335
},
{
"epoch": 0.6,
"grad_norm": 0.4582449197769165,
"learning_rate": 4.000000000000001e-06,
"loss": 1.2888,
"step": 336
},
{
"epoch": 0.6017857142857143,
"grad_norm": 0.5217434167861938,
"learning_rate": 3.9821428571428575e-06,
"loss": 1.2232,
"step": 337
},
{
"epoch": 0.6035714285714285,
"grad_norm": 0.48110780119895935,
"learning_rate": 3.964285714285714e-06,
"loss": 1.2587,
"step": 338
},
{
"epoch": 0.6053571428571428,
"grad_norm": 0.5427893400192261,
"learning_rate": 3.946428571428572e-06,
"loss": 1.3105,
"step": 339
},
{
"epoch": 0.6071428571428571,
"grad_norm": 0.48423877358436584,
"learning_rate": 3.928571428571429e-06,
"loss": 1.2831,
"step": 340
},
{
"epoch": 0.6089285714285714,
"grad_norm": 0.47350701689720154,
"learning_rate": 3.910714285714286e-06,
"loss": 1.2791,
"step": 341
},
{
"epoch": 0.6107142857142858,
"grad_norm": 0.6432921886444092,
"learning_rate": 3.892857142857143e-06,
"loss": 1.3398,
"step": 342
},
{
"epoch": 0.6125,
"grad_norm": 0.47377124428749084,
"learning_rate": 3.875e-06,
"loss": 1.2665,
"step": 343
},
{
"epoch": 0.6142857142857143,
"grad_norm": 0.4444401264190674,
"learning_rate": 3.857142857142858e-06,
"loss": 1.2557,
"step": 344
},
{
"epoch": 0.6160714285714286,
"grad_norm": 0.4990551769733429,
"learning_rate": 3.839285714285715e-06,
"loss": 1.3146,
"step": 345
},
{
"epoch": 0.6178571428571429,
"grad_norm": 0.4426332414150238,
"learning_rate": 3.8214285714285715e-06,
"loss": 1.2646,
"step": 346
},
{
"epoch": 0.6196428571428572,
"grad_norm": 0.46762406826019287,
"learning_rate": 3.8035714285714288e-06,
"loss": 1.2472,
"step": 347
},
{
"epoch": 0.6214285714285714,
"grad_norm": 0.4878827929496765,
"learning_rate": 3.785714285714286e-06,
"loss": 1.3176,
"step": 348
},
{
"epoch": 0.6232142857142857,
"grad_norm": 0.4643489718437195,
"learning_rate": 3.7678571428571433e-06,
"loss": 1.2463,
"step": 349
},
{
"epoch": 0.625,
"grad_norm": 0.5744695067405701,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3041,
"step": 350
},
{
"epoch": 0.6267857142857143,
"grad_norm": 0.5968716144561768,
"learning_rate": 3.7321428571428573e-06,
"loss": 1.4068,
"step": 351
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.454670786857605,
"learning_rate": 3.7142857142857146e-06,
"loss": 1.2938,
"step": 352
},
{
"epoch": 0.6303571428571428,
"grad_norm": 0.4747679829597473,
"learning_rate": 3.696428571428572e-06,
"loss": 1.335,
"step": 353
},
{
"epoch": 0.6321428571428571,
"grad_norm": 0.47802817821502686,
"learning_rate": 3.678571428571429e-06,
"loss": 1.2879,
"step": 354
},
{
"epoch": 0.6339285714285714,
"grad_norm": 0.4985521733760834,
"learning_rate": 3.660714285714286e-06,
"loss": 1.3693,
"step": 355
},
{
"epoch": 0.6357142857142857,
"grad_norm": 0.4986526072025299,
"learning_rate": 3.642857142857143e-06,
"loss": 1.3127,
"step": 356
},
{
"epoch": 0.6375,
"grad_norm": 0.5105973482131958,
"learning_rate": 3.625e-06,
"loss": 1.286,
"step": 357
},
{
"epoch": 0.6392857142857142,
"grad_norm": 0.4753775894641876,
"learning_rate": 3.6071428571428573e-06,
"loss": 1.2683,
"step": 358
},
{
"epoch": 0.6410714285714286,
"grad_norm": 0.4934110641479492,
"learning_rate": 3.5892857142857145e-06,
"loss": 1.2676,
"step": 359
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.5534637570381165,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.3717,
"step": 360
},
{
"epoch": 0.6446428571428572,
"grad_norm": 0.49249303340911865,
"learning_rate": 3.553571428571429e-06,
"loss": 1.2703,
"step": 361
},
{
"epoch": 0.6464285714285715,
"grad_norm": 0.4779037833213806,
"learning_rate": 3.5357142857142863e-06,
"loss": 1.3419,
"step": 362
},
{
"epoch": 0.6482142857142857,
"grad_norm": 0.48425963521003723,
"learning_rate": 3.5178571428571435e-06,
"loss": 1.3201,
"step": 363
},
{
"epoch": 0.65,
"grad_norm": 0.497473806142807,
"learning_rate": 3.5e-06,
"loss": 1.25,
"step": 364
},
{
"epoch": 0.6517857142857143,
"grad_norm": 0.49431318044662476,
"learning_rate": 3.482142857142857e-06,
"loss": 1.3033,
"step": 365
},
{
"epoch": 0.6535714285714286,
"grad_norm": 0.657665491104126,
"learning_rate": 3.4642857142857145e-06,
"loss": 1.3362,
"step": 366
},
{
"epoch": 0.6553571428571429,
"grad_norm": 0.5138763189315796,
"learning_rate": 3.4464285714285717e-06,
"loss": 1.33,
"step": 367
},
{
"epoch": 0.6571428571428571,
"grad_norm": 0.5025115609169006,
"learning_rate": 3.428571428571429e-06,
"loss": 1.2995,
"step": 368
},
{
"epoch": 0.6589285714285714,
"grad_norm": 0.48064032196998596,
"learning_rate": 3.4107142857142862e-06,
"loss": 1.3438,
"step": 369
},
{
"epoch": 0.6607142857142857,
"grad_norm": 0.5210350751876831,
"learning_rate": 3.3928571428571435e-06,
"loss": 1.3845,
"step": 370
},
{
"epoch": 0.6625,
"grad_norm": 0.4877013862133026,
"learning_rate": 3.3750000000000003e-06,
"loss": 1.3046,
"step": 371
},
{
"epoch": 0.6642857142857143,
"grad_norm": 0.4923580288887024,
"learning_rate": 3.357142857142857e-06,
"loss": 1.2809,
"step": 372
},
{
"epoch": 0.6660714285714285,
"grad_norm": 0.48784345388412476,
"learning_rate": 3.3392857142857144e-06,
"loss": 1.2812,
"step": 373
},
{
"epoch": 0.6678571428571428,
"grad_norm": 0.5311793684959412,
"learning_rate": 3.3214285714285716e-06,
"loss": 1.4334,
"step": 374
},
{
"epoch": 0.6696428571428571,
"grad_norm": 0.4956177771091461,
"learning_rate": 3.303571428571429e-06,
"loss": 1.2847,
"step": 375
},
{
"epoch": 0.6714285714285714,
"grad_norm": 0.4931349456310272,
"learning_rate": 3.285714285714286e-06,
"loss": 1.3192,
"step": 376
},
{
"epoch": 0.6732142857142858,
"grad_norm": 0.5116465091705322,
"learning_rate": 3.267857142857143e-06,
"loss": 1.3024,
"step": 377
},
{
"epoch": 0.675,
"grad_norm": 0.499210387468338,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.2661,
"step": 378
},
{
"epoch": 0.6767857142857143,
"grad_norm": 0.5030224919319153,
"learning_rate": 3.2321428571428575e-06,
"loss": 1.2374,
"step": 379
},
{
"epoch": 0.6785714285714286,
"grad_norm": 0.503461480140686,
"learning_rate": 3.2142857142857147e-06,
"loss": 1.2834,
"step": 380
},
{
"epoch": 0.6803571428571429,
"grad_norm": 0.46770402789115906,
"learning_rate": 3.1964285714285716e-06,
"loss": 1.2843,
"step": 381
},
{
"epoch": 0.6821428571428572,
"grad_norm": 0.48501157760620117,
"learning_rate": 3.178571428571429e-06,
"loss": 1.2583,
"step": 382
},
{
"epoch": 0.6839285714285714,
"grad_norm": 0.4787866473197937,
"learning_rate": 3.1607142857142856e-06,
"loss": 1.3136,
"step": 383
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.5361959338188171,
"learning_rate": 3.142857142857143e-06,
"loss": 1.3137,
"step": 384
},
{
"epoch": 0.6875,
"grad_norm": 0.48236384987831116,
"learning_rate": 3.125e-06,
"loss": 1.2333,
"step": 385
},
{
"epoch": 0.6892857142857143,
"grad_norm": 0.4979291558265686,
"learning_rate": 3.1071428571428574e-06,
"loss": 1.2305,
"step": 386
},
{
"epoch": 0.6910714285714286,
"grad_norm": 0.4781268537044525,
"learning_rate": 3.0892857142857147e-06,
"loss": 1.2512,
"step": 387
},
{
"epoch": 0.6928571428571428,
"grad_norm": 0.539099931716919,
"learning_rate": 3.071428571428572e-06,
"loss": 1.2722,
"step": 388
},
{
"epoch": 0.6946428571428571,
"grad_norm": 0.47137516736984253,
"learning_rate": 3.053571428571429e-06,
"loss": 1.267,
"step": 389
},
{
"epoch": 0.6964285714285714,
"grad_norm": 0.4990849494934082,
"learning_rate": 3.0357142857142856e-06,
"loss": 1.2527,
"step": 390
},
{
"epoch": 0.6982142857142857,
"grad_norm": 0.4742002487182617,
"learning_rate": 3.017857142857143e-06,
"loss": 1.252,
"step": 391
},
{
"epoch": 0.7,
"grad_norm": 0.4833225607872009,
"learning_rate": 3e-06,
"loss": 1.2527,
"step": 392
},
{
"epoch": 0.7017857142857142,
"grad_norm": 0.5953601002693176,
"learning_rate": 2.9821428571428573e-06,
"loss": 1.2919,
"step": 393
},
{
"epoch": 0.7035714285714286,
"grad_norm": 0.4824086129665375,
"learning_rate": 2.9642857142857146e-06,
"loss": 1.2951,
"step": 394
},
{
"epoch": 0.7053571428571429,
"grad_norm": 0.524111270904541,
"learning_rate": 2.946428571428572e-06,
"loss": 1.2351,
"step": 395
},
{
"epoch": 0.7071428571428572,
"grad_norm": 0.5176703333854675,
"learning_rate": 2.928571428571429e-06,
"loss": 1.346,
"step": 396
},
{
"epoch": 0.7089285714285715,
"grad_norm": 0.5603062510490417,
"learning_rate": 2.9107142857142863e-06,
"loss": 1.368,
"step": 397
},
{
"epoch": 0.7107142857142857,
"grad_norm": 0.510238766670227,
"learning_rate": 2.892857142857143e-06,
"loss": 1.2433,
"step": 398
},
{
"epoch": 0.7125,
"grad_norm": 0.5014546513557434,
"learning_rate": 2.875e-06,
"loss": 1.3554,
"step": 399
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.6319053173065186,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.308,
"step": 400
},
{
"epoch": 0.7160714285714286,
"grad_norm": 0.5223023891448975,
"learning_rate": 2.8392857142857145e-06,
"loss": 1.3143,
"step": 401
},
{
"epoch": 0.7178571428571429,
"grad_norm": 0.48164916038513184,
"learning_rate": 2.8214285714285718e-06,
"loss": 1.296,
"step": 402
},
{
"epoch": 0.7196428571428571,
"grad_norm": 0.5192731618881226,
"learning_rate": 2.803571428571429e-06,
"loss": 1.2594,
"step": 403
},
{
"epoch": 0.7214285714285714,
"grad_norm": 0.5068328380584717,
"learning_rate": 2.785714285714286e-06,
"loss": 1.3369,
"step": 404
},
{
"epoch": 0.7232142857142857,
"grad_norm": 0.6028531193733215,
"learning_rate": 2.767857142857143e-06,
"loss": 1.2731,
"step": 405
},
{
"epoch": 0.725,
"grad_norm": 0.48241686820983887,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.2601,
"step": 406
},
{
"epoch": 0.7267857142857143,
"grad_norm": 0.4899542033672333,
"learning_rate": 2.732142857142857e-06,
"loss": 1.2853,
"step": 407
},
{
"epoch": 0.7285714285714285,
"grad_norm": 0.49059104919433594,
"learning_rate": 2.7142857142857144e-06,
"loss": 1.2885,
"step": 408
},
{
"epoch": 0.7303571428571428,
"grad_norm": 0.49142804741859436,
"learning_rate": 2.6964285714285717e-06,
"loss": 1.2797,
"step": 409
},
{
"epoch": 0.7321428571428571,
"grad_norm": 0.4777282178401947,
"learning_rate": 2.6785714285714285e-06,
"loss": 1.297,
"step": 410
},
{
"epoch": 0.7339285714285714,
"grad_norm": 0.5065960884094238,
"learning_rate": 2.6607142857142858e-06,
"loss": 1.284,
"step": 411
},
{
"epoch": 0.7357142857142858,
"grad_norm": 0.5673279166221619,
"learning_rate": 2.642857142857143e-06,
"loss": 1.2991,
"step": 412
},
{
"epoch": 0.7375,
"grad_norm": 0.5462106466293335,
"learning_rate": 2.6250000000000003e-06,
"loss": 1.3213,
"step": 413
},
{
"epoch": 0.7392857142857143,
"grad_norm": 0.5614867210388184,
"learning_rate": 2.6071428571428575e-06,
"loss": 1.2939,
"step": 414
},
{
"epoch": 0.7410714285714286,
"grad_norm": 0.478646844625473,
"learning_rate": 2.5892857142857148e-06,
"loss": 1.2683,
"step": 415
},
{
"epoch": 0.7428571428571429,
"grad_norm": 0.5402962565422058,
"learning_rate": 2.571428571428571e-06,
"loss": 1.2643,
"step": 416
},
{
"epoch": 0.7446428571428572,
"grad_norm": 0.4907895028591156,
"learning_rate": 2.5535714285714284e-06,
"loss": 1.2604,
"step": 417
},
{
"epoch": 0.7464285714285714,
"grad_norm": 0.4979659914970398,
"learning_rate": 2.5357142857142857e-06,
"loss": 1.265,
"step": 418
},
{
"epoch": 0.7482142857142857,
"grad_norm": 0.4915474057197571,
"learning_rate": 2.517857142857143e-06,
"loss": 1.2653,
"step": 419
},
{
"epoch": 0.75,
"grad_norm": 0.4773080050945282,
"learning_rate": 2.5e-06,
"loss": 1.2286,
"step": 420
},
{
"epoch": 0.7517857142857143,
"grad_norm": 0.5044196844100952,
"learning_rate": 2.4821428571428575e-06,
"loss": 1.3119,
"step": 421
},
{
"epoch": 0.7535714285714286,
"grad_norm": 0.49431711435317993,
"learning_rate": 2.4642857142857147e-06,
"loss": 1.2951,
"step": 422
},
{
"epoch": 0.7553571428571428,
"grad_norm": 0.49043866991996765,
"learning_rate": 2.4464285714285715e-06,
"loss": 1.277,
"step": 423
},
{
"epoch": 0.7571428571428571,
"grad_norm": 0.6170372366905212,
"learning_rate": 2.428571428571429e-06,
"loss": 1.3304,
"step": 424
},
{
"epoch": 0.7589285714285714,
"grad_norm": 0.4647572338581085,
"learning_rate": 2.410714285714286e-06,
"loss": 1.24,
"step": 425
},
{
"epoch": 0.7607142857142857,
"grad_norm": 0.49019157886505127,
"learning_rate": 2.3928571428571433e-06,
"loss": 1.2307,
"step": 426
},
{
"epoch": 0.7625,
"grad_norm": 0.6097828149795532,
"learning_rate": 2.375e-06,
"loss": 1.2541,
"step": 427
},
{
"epoch": 0.7642857142857142,
"grad_norm": 0.5185028314590454,
"learning_rate": 2.3571428571428574e-06,
"loss": 1.372,
"step": 428
},
{
"epoch": 0.7660714285714286,
"grad_norm": 0.49547451734542847,
"learning_rate": 2.3392857142857146e-06,
"loss": 1.2792,
"step": 429
},
{
"epoch": 0.7678571428571429,
"grad_norm": 0.5313171744346619,
"learning_rate": 2.321428571428572e-06,
"loss": 1.2909,
"step": 430
},
{
"epoch": 0.7696428571428572,
"grad_norm": 0.7000820636749268,
"learning_rate": 2.3035714285714287e-06,
"loss": 1.2131,
"step": 431
},
{
"epoch": 0.7714285714285715,
"grad_norm": 0.49376264214515686,
"learning_rate": 2.285714285714286e-06,
"loss": 1.2634,
"step": 432
},
{
"epoch": 0.7732142857142857,
"grad_norm": 0.5121849179267883,
"learning_rate": 2.2678571428571432e-06,
"loss": 1.3224,
"step": 433
},
{
"epoch": 0.775,
"grad_norm": 0.6177911162376404,
"learning_rate": 2.25e-06,
"loss": 1.3438,
"step": 434
},
{
"epoch": 0.7767857142857143,
"grad_norm": 0.682819128036499,
"learning_rate": 2.2321428571428573e-06,
"loss": 1.3062,
"step": 435
},
{
"epoch": 0.7785714285714286,
"grad_norm": 0.5072125792503357,
"learning_rate": 2.2142857142857146e-06,
"loss": 1.2587,
"step": 436
},
{
"epoch": 0.7803571428571429,
"grad_norm": 0.4885023832321167,
"learning_rate": 2.1964285714285714e-06,
"loss": 1.2627,
"step": 437
},
{
"epoch": 0.7821428571428571,
"grad_norm": 0.49310681223869324,
"learning_rate": 2.1785714285714286e-06,
"loss": 1.3108,
"step": 438
},
{
"epoch": 0.7839285714285714,
"grad_norm": 0.6132137775421143,
"learning_rate": 2.160714285714286e-06,
"loss": 1.2016,
"step": 439
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.5641778707504272,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.3018,
"step": 440
},
{
"epoch": 0.7875,
"grad_norm": 0.5050976872444153,
"learning_rate": 2.125e-06,
"loss": 1.3549,
"step": 441
},
{
"epoch": 0.7892857142857143,
"grad_norm": 0.5003477334976196,
"learning_rate": 2.1071428571428572e-06,
"loss": 1.3315,
"step": 442
},
{
"epoch": 0.7910714285714285,
"grad_norm": 0.4969649314880371,
"learning_rate": 2.0892857142857145e-06,
"loss": 1.3,
"step": 443
},
{
"epoch": 0.7928571428571428,
"grad_norm": 0.5957094430923462,
"learning_rate": 2.0714285714285717e-06,
"loss": 1.3144,
"step": 444
},
{
"epoch": 0.7946428571428571,
"grad_norm": 0.5155512690544128,
"learning_rate": 2.0535714285714286e-06,
"loss": 1.3148,
"step": 445
},
{
"epoch": 0.7964285714285714,
"grad_norm": 0.4873362183570862,
"learning_rate": 2.035714285714286e-06,
"loss": 1.2745,
"step": 446
},
{
"epoch": 0.7982142857142858,
"grad_norm": 0.4859369695186615,
"learning_rate": 2.017857142857143e-06,
"loss": 1.2988,
"step": 447
},
{
"epoch": 0.8,
"grad_norm": 0.4897172152996063,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2945,
"step": 448
},
{
"epoch": 0.8017857142857143,
"grad_norm": 0.5550750494003296,
"learning_rate": 1.982142857142857e-06,
"loss": 1.2986,
"step": 449
},
{
"epoch": 0.8035714285714286,
"grad_norm": 0.48994505405426025,
"learning_rate": 1.9642857142857144e-06,
"loss": 1.3026,
"step": 450
},
{
"epoch": 0.8053571428571429,
"grad_norm": 0.49379977583885193,
"learning_rate": 1.9464285714285717e-06,
"loss": 1.3036,
"step": 451
},
{
"epoch": 0.8071428571428572,
"grad_norm": 0.5711806416511536,
"learning_rate": 1.928571428571429e-06,
"loss": 1.4071,
"step": 452
},
{
"epoch": 0.8089285714285714,
"grad_norm": 0.5102342963218689,
"learning_rate": 1.9107142857142858e-06,
"loss": 1.2941,
"step": 453
},
{
"epoch": 0.8107142857142857,
"grad_norm": 0.49400219321250916,
"learning_rate": 1.892857142857143e-06,
"loss": 1.2947,
"step": 454
},
{
"epoch": 0.8125,
"grad_norm": 0.5019742250442505,
"learning_rate": 1.8750000000000003e-06,
"loss": 1.2813,
"step": 455
},
{
"epoch": 0.8142857142857143,
"grad_norm": 0.4831081032752991,
"learning_rate": 1.8571428571428573e-06,
"loss": 1.2455,
"step": 456
},
{
"epoch": 0.8160714285714286,
"grad_norm": 0.6246116161346436,
"learning_rate": 1.8392857142857146e-06,
"loss": 1.2438,
"step": 457
},
{
"epoch": 0.8178571428571428,
"grad_norm": 0.48425114154815674,
"learning_rate": 1.8214285714285716e-06,
"loss": 1.2531,
"step": 458
},
{
"epoch": 0.8196428571428571,
"grad_norm": 0.5049098134040833,
"learning_rate": 1.8035714285714286e-06,
"loss": 1.2603,
"step": 459
},
{
"epoch": 0.8214285714285714,
"grad_norm": 0.5139657258987427,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.2987,
"step": 460
},
{
"epoch": 0.8232142857142857,
"grad_norm": 0.4795459508895874,
"learning_rate": 1.7678571428571431e-06,
"loss": 1.2956,
"step": 461
},
{
"epoch": 0.825,
"grad_norm": 0.490296870470047,
"learning_rate": 1.75e-06,
"loss": 1.2379,
"step": 462
},
{
"epoch": 0.8267857142857142,
"grad_norm": 0.4932839572429657,
"learning_rate": 1.7321428571428572e-06,
"loss": 1.3099,
"step": 463
},
{
"epoch": 0.8285714285714286,
"grad_norm": 0.5037016272544861,
"learning_rate": 1.7142857142857145e-06,
"loss": 1.3468,
"step": 464
},
{
"epoch": 0.8303571428571429,
"grad_norm": 0.49343612790107727,
"learning_rate": 1.6964285714285717e-06,
"loss": 1.3217,
"step": 465
},
{
"epoch": 0.8321428571428572,
"grad_norm": 0.49327361583709717,
"learning_rate": 1.6785714285714286e-06,
"loss": 1.3203,
"step": 466
},
{
"epoch": 0.8339285714285715,
"grad_norm": 0.48709049820899963,
"learning_rate": 1.6607142857142858e-06,
"loss": 1.2245,
"step": 467
},
{
"epoch": 0.8357142857142857,
"grad_norm": 0.5107455253601074,
"learning_rate": 1.642857142857143e-06,
"loss": 1.2494,
"step": 468
},
{
"epoch": 0.8375,
"grad_norm": 0.5042998194694519,
"learning_rate": 1.6250000000000001e-06,
"loss": 1.2904,
"step": 469
},
{
"epoch": 0.8392857142857143,
"grad_norm": 0.4961761236190796,
"learning_rate": 1.6071428571428574e-06,
"loss": 1.2893,
"step": 470
},
{
"epoch": 0.8410714285714286,
"grad_norm": 0.4918581545352936,
"learning_rate": 1.5892857142857144e-06,
"loss": 1.2581,
"step": 471
},
{
"epoch": 0.8428571428571429,
"grad_norm": 0.4863058030605316,
"learning_rate": 1.5714285714285714e-06,
"loss": 1.3444,
"step": 472
},
{
"epoch": 0.8446428571428571,
"grad_norm": 0.47693178057670593,
"learning_rate": 1.5535714285714287e-06,
"loss": 1.2956,
"step": 473
},
{
"epoch": 0.8464285714285714,
"grad_norm": 0.5165431499481201,
"learning_rate": 1.535714285714286e-06,
"loss": 1.3898,
"step": 474
},
{
"epoch": 0.8482142857142857,
"grad_norm": 0.8129104375839233,
"learning_rate": 1.5178571428571428e-06,
"loss": 1.2759,
"step": 475
},
{
"epoch": 0.85,
"grad_norm": 0.5097749829292297,
"learning_rate": 1.5e-06,
"loss": 1.2506,
"step": 476
},
{
"epoch": 0.8517857142857143,
"grad_norm": 0.6216138601303101,
"learning_rate": 1.4821428571428573e-06,
"loss": 1.2784,
"step": 477
},
{
"epoch": 0.8535714285714285,
"grad_norm": 0.5131860971450806,
"learning_rate": 1.4642857142857145e-06,
"loss": 1.3087,
"step": 478
},
{
"epoch": 0.8553571428571428,
"grad_norm": 0.532474160194397,
"learning_rate": 1.4464285714285716e-06,
"loss": 1.3609,
"step": 479
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.4782165586948395,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.2722,
"step": 480
},
{
"epoch": 0.8589285714285714,
"grad_norm": 0.4915912449359894,
"learning_rate": 1.4107142857142859e-06,
"loss": 1.3194,
"step": 481
},
{
"epoch": 0.8607142857142858,
"grad_norm": 0.4973057806491852,
"learning_rate": 1.392857142857143e-06,
"loss": 1.2925,
"step": 482
},
{
"epoch": 0.8625,
"grad_norm": 0.4953904151916504,
"learning_rate": 1.3750000000000002e-06,
"loss": 1.2731,
"step": 483
},
{
"epoch": 0.8642857142857143,
"grad_norm": 0.48121222853660583,
"learning_rate": 1.3571428571428572e-06,
"loss": 1.2951,
"step": 484
},
{
"epoch": 0.8660714285714286,
"grad_norm": 0.497459352016449,
"learning_rate": 1.3392857142857143e-06,
"loss": 1.2501,
"step": 485
},
{
"epoch": 0.8678571428571429,
"grad_norm": 0.49168872833251953,
"learning_rate": 1.3214285714285715e-06,
"loss": 1.2746,
"step": 486
},
{
"epoch": 0.8696428571428572,
"grad_norm": 0.6732675433158875,
"learning_rate": 1.3035714285714288e-06,
"loss": 1.2376,
"step": 487
},
{
"epoch": 0.8714285714285714,
"grad_norm": 0.49309980869293213,
"learning_rate": 1.2857142857142856e-06,
"loss": 1.2153,
"step": 488
},
{
"epoch": 0.8732142857142857,
"grad_norm": 0.5174947381019592,
"learning_rate": 1.2678571428571428e-06,
"loss": 1.2484,
"step": 489
},
{
"epoch": 0.875,
"grad_norm": 0.4835157096385956,
"learning_rate": 1.25e-06,
"loss": 1.2991,
"step": 490
},
{
"epoch": 0.8767857142857143,
"grad_norm": 0.4993467330932617,
"learning_rate": 1.2321428571428574e-06,
"loss": 1.3207,
"step": 491
},
{
"epoch": 0.8785714285714286,
"grad_norm": 0.6864446401596069,
"learning_rate": 1.2142857142857144e-06,
"loss": 1.2658,
"step": 492
},
{
"epoch": 0.8803571428571428,
"grad_norm": 0.6041408181190491,
"learning_rate": 1.1964285714285717e-06,
"loss": 1.2204,
"step": 493
},
{
"epoch": 0.8821428571428571,
"grad_norm": 0.49740070104599,
"learning_rate": 1.1785714285714287e-06,
"loss": 1.2915,
"step": 494
},
{
"epoch": 0.8839285714285714,
"grad_norm": 0.504573404788971,
"learning_rate": 1.160714285714286e-06,
"loss": 1.2821,
"step": 495
},
{
"epoch": 0.8857142857142857,
"grad_norm": 0.5079774260520935,
"learning_rate": 1.142857142857143e-06,
"loss": 1.3058,
"step": 496
},
{
"epoch": 0.8875,
"grad_norm": 0.512030303478241,
"learning_rate": 1.125e-06,
"loss": 1.2856,
"step": 497
},
{
"epoch": 0.8892857142857142,
"grad_norm": 0.4908679723739624,
"learning_rate": 1.1071428571428573e-06,
"loss": 1.2765,
"step": 498
},
{
"epoch": 0.8910714285714286,
"grad_norm": 0.4868737757205963,
"learning_rate": 1.0892857142857143e-06,
"loss": 1.2691,
"step": 499
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.504216194152832,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.2786,
"step": 500
},
{
"epoch": 0.8946428571428572,
"grad_norm": 0.48668280243873596,
"learning_rate": 1.0535714285714286e-06,
"loss": 1.2701,
"step": 501
},
{
"epoch": 0.8964285714285715,
"grad_norm": 0.5180802345275879,
"learning_rate": 1.0357142857142859e-06,
"loss": 1.3073,
"step": 502
},
{
"epoch": 0.8982142857142857,
"grad_norm": 0.48071718215942383,
"learning_rate": 1.017857142857143e-06,
"loss": 1.2082,
"step": 503
},
{
"epoch": 0.9,
"grad_norm": 0.5288301706314087,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.326,
"step": 504
},
{
"epoch": 0.9017857142857143,
"grad_norm": 0.5516347289085388,
"learning_rate": 9.821428571428572e-07,
"loss": 1.2103,
"step": 505
},
{
"epoch": 0.9035714285714286,
"grad_norm": 0.4964546859264374,
"learning_rate": 9.642857142857145e-07,
"loss": 1.2661,
"step": 506
},
{
"epoch": 0.9053571428571429,
"grad_norm": 0.5081775784492493,
"learning_rate": 9.464285714285715e-07,
"loss": 1.2868,
"step": 507
},
{
"epoch": 0.9071428571428571,
"grad_norm": 0.5029599070549011,
"learning_rate": 9.285714285714287e-07,
"loss": 1.291,
"step": 508
},
{
"epoch": 0.9089285714285714,
"grad_norm": 0.5202727913856506,
"learning_rate": 9.107142857142858e-07,
"loss": 1.2899,
"step": 509
},
{
"epoch": 0.9107142857142857,
"grad_norm": 0.48697784543037415,
"learning_rate": 8.928571428571429e-07,
"loss": 1.2969,
"step": 510
},
{
"epoch": 0.9125,
"grad_norm": 0.4840863049030304,
"learning_rate": 8.75e-07,
"loss": 1.2627,
"step": 511
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.5034891963005066,
"learning_rate": 8.571428571428572e-07,
"loss": 1.3292,
"step": 512
},
{
"epoch": 0.9160714285714285,
"grad_norm": 0.5149267315864563,
"learning_rate": 8.392857142857143e-07,
"loss": 1.2654,
"step": 513
},
{
"epoch": 0.9178571428571428,
"grad_norm": 0.7686546444892883,
"learning_rate": 8.214285714285715e-07,
"loss": 1.27,
"step": 514
},
{
"epoch": 0.9196428571428571,
"grad_norm": 0.5021243691444397,
"learning_rate": 8.035714285714287e-07,
"loss": 1.3903,
"step": 515
},
{
"epoch": 0.9214285714285714,
"grad_norm": 0.5143928527832031,
"learning_rate": 7.857142857142857e-07,
"loss": 1.3113,
"step": 516
},
{
"epoch": 0.9232142857142858,
"grad_norm": 0.48257723450660706,
"learning_rate": 7.67857142857143e-07,
"loss": 1.2867,
"step": 517
},
{
"epoch": 0.925,
"grad_norm": 0.4944348633289337,
"learning_rate": 7.5e-07,
"loss": 1.2695,
"step": 518
},
{
"epoch": 0.9267857142857143,
"grad_norm": 0.6723678708076477,
"learning_rate": 7.321428571428573e-07,
"loss": 1.2364,
"step": 519
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.5064557194709778,
"learning_rate": 7.142857142857143e-07,
"loss": 1.2422,
"step": 520
},
{
"epoch": 0.9303571428571429,
"grad_norm": 0.5481106042861938,
"learning_rate": 6.964285714285715e-07,
"loss": 1.2502,
"step": 521
},
{
"epoch": 0.9321428571428572,
"grad_norm": 0.5038375854492188,
"learning_rate": 6.785714285714286e-07,
"loss": 1.3219,
"step": 522
},
{
"epoch": 0.9339285714285714,
"grad_norm": 0.5069401264190674,
"learning_rate": 6.607142857142858e-07,
"loss": 1.2895,
"step": 523
},
{
"epoch": 0.9357142857142857,
"grad_norm": 0.5010185241699219,
"learning_rate": 6.428571428571428e-07,
"loss": 1.2923,
"step": 524
},
{
"epoch": 0.9375,
"grad_norm": 0.4841311573982239,
"learning_rate": 6.25e-07,
"loss": 1.2651,
"step": 525
},
{
"epoch": 0.9392857142857143,
"grad_norm": 0.5106439590454102,
"learning_rate": 6.071428571428572e-07,
"loss": 1.2708,
"step": 526
},
{
"epoch": 0.9410714285714286,
"grad_norm": 0.4902538061141968,
"learning_rate": 5.892857142857143e-07,
"loss": 1.2703,
"step": 527
},
{
"epoch": 0.9428571428571428,
"grad_norm": 0.48699891567230225,
"learning_rate": 5.714285714285715e-07,
"loss": 1.2679,
"step": 528
},
{
"epoch": 0.9446428571428571,
"grad_norm": 0.5491330623626709,
"learning_rate": 5.535714285714286e-07,
"loss": 1.2921,
"step": 529
},
{
"epoch": 0.9464285714285714,
"grad_norm": 0.5123488903045654,
"learning_rate": 5.357142857142857e-07,
"loss": 1.2446,
"step": 530
},
{
"epoch": 0.9482142857142857,
"grad_norm": 0.6574044823646545,
"learning_rate": 5.178571428571429e-07,
"loss": 1.3066,
"step": 531
},
{
"epoch": 0.95,
"grad_norm": 0.5071901082992554,
"learning_rate": 5.000000000000001e-07,
"loss": 1.2726,
"step": 532
},
{
"epoch": 0.9517857142857142,
"grad_norm": 0.495935320854187,
"learning_rate": 4.821428571428572e-07,
"loss": 1.2758,
"step": 533
},
{
"epoch": 0.9535714285714286,
"grad_norm": 0.5281900763511658,
"learning_rate": 4.642857142857143e-07,
"loss": 1.2688,
"step": 534
},
{
"epoch": 0.9553571428571429,
"grad_norm": 0.49741920828819275,
"learning_rate": 4.4642857142857147e-07,
"loss": 1.2416,
"step": 535
},
{
"epoch": 0.9571428571428572,
"grad_norm": 0.5077670812606812,
"learning_rate": 4.285714285714286e-07,
"loss": 1.3029,
"step": 536
},
{
"epoch": 0.9589285714285715,
"grad_norm": 0.46955814957618713,
"learning_rate": 4.1071428571428577e-07,
"loss": 1.262,
"step": 537
},
{
"epoch": 0.9607142857142857,
"grad_norm": 0.4829843044281006,
"learning_rate": 3.9285714285714286e-07,
"loss": 1.2283,
"step": 538
},
{
"epoch": 0.9625,
"grad_norm": 0.47563934326171875,
"learning_rate": 3.75e-07,
"loss": 1.2962,
"step": 539
},
{
"epoch": 0.9642857142857143,
"grad_norm": 0.6257872581481934,
"learning_rate": 3.5714285714285716e-07,
"loss": 1.1969,
"step": 540
},
{
"epoch": 0.9660714285714286,
"grad_norm": 0.5398846864700317,
"learning_rate": 3.392857142857143e-07,
"loss": 1.2731,
"step": 541
},
{
"epoch": 0.9678571428571429,
"grad_norm": 0.48545607924461365,
"learning_rate": 3.214285714285714e-07,
"loss": 1.2559,
"step": 542
},
{
"epoch": 0.9696428571428571,
"grad_norm": 0.504273533821106,
"learning_rate": 3.035714285714286e-07,
"loss": 1.2457,
"step": 543
},
{
"epoch": 0.9714285714285714,
"grad_norm": 0.4949125051498413,
"learning_rate": 2.8571428571428575e-07,
"loss": 1.296,
"step": 544
},
{
"epoch": 0.9732142857142857,
"grad_norm": 0.5022711753845215,
"learning_rate": 2.6785714285714284e-07,
"loss": 1.2173,
"step": 545
},
{
"epoch": 0.975,
"grad_norm": 0.4856776297092438,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.3013,
"step": 546
},
{
"epoch": 0.9767857142857143,
"grad_norm": 0.508557915687561,
"learning_rate": 2.3214285714285716e-07,
"loss": 1.2674,
"step": 547
},
{
"epoch": 0.9785714285714285,
"grad_norm": 0.5153559446334839,
"learning_rate": 2.142857142857143e-07,
"loss": 1.2569,
"step": 548
},
{
"epoch": 0.9803571428571428,
"grad_norm": 0.49251508712768555,
"learning_rate": 1.9642857142857143e-07,
"loss": 1.2879,
"step": 549
},
{
"epoch": 0.9821428571428571,
"grad_norm": 0.49773454666137695,
"learning_rate": 1.7857142857142858e-07,
"loss": 1.2204,
"step": 550
},
{
"epoch": 0.9839285714285714,
"grad_norm": 0.5022481083869934,
"learning_rate": 1.607142857142857e-07,
"loss": 1.2859,
"step": 551
},
{
"epoch": 0.9857142857142858,
"grad_norm": 0.5000441670417786,
"learning_rate": 1.4285714285714287e-07,
"loss": 1.2217,
"step": 552
},
{
"epoch": 0.9875,
"grad_norm": 0.48934027552604675,
"learning_rate": 1.2500000000000002e-07,
"loss": 1.2892,
"step": 553
},
{
"epoch": 0.9892857142857143,
"grad_norm": 0.5030660033226013,
"learning_rate": 1.0714285714285716e-07,
"loss": 1.2903,
"step": 554
},
{
"epoch": 0.9910714285714286,
"grad_norm": 0.4894131124019623,
"learning_rate": 8.928571428571429e-08,
"loss": 1.3078,
"step": 555
},
{
"epoch": 0.9928571428571429,
"grad_norm": 0.6754637956619263,
"learning_rate": 7.142857142857144e-08,
"loss": 1.2988,
"step": 556
},
{
"epoch": 0.9946428571428572,
"grad_norm": 0.5316541194915771,
"learning_rate": 5.357142857142858e-08,
"loss": 1.2646,
"step": 557
},
{
"epoch": 0.9964285714285714,
"grad_norm": 0.6239261627197266,
"learning_rate": 3.571428571428572e-08,
"loss": 1.3093,
"step": 558
},
{
"epoch": 0.9982142857142857,
"grad_norm": 0.490326851606369,
"learning_rate": 1.785714285714286e-08,
"loss": 1.3136,
"step": 559
},
{
"epoch": 1.0,
"grad_norm": 0.49080222845077515,
"learning_rate": 0.0,
"loss": 1.2867,
"step": 560
}
],
"logging_steps": 1.0,
"max_steps": 560,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.959895737659556e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}