no_pipeline_math / trainer_state.json
marianna13's picture
Upload folder using huggingface_hub
3bfed70 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1440,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003472222222222222,
"grad_norm": 5.6689868084883654,
"learning_rate": 5.555555555555555e-07,
"loss": 0.876,
"step": 1
},
{
"epoch": 0.006944444444444444,
"grad_norm": 5.656438093524862,
"learning_rate": 1.111111111111111e-06,
"loss": 0.8718,
"step": 2
},
{
"epoch": 0.010416666666666666,
"grad_norm": 5.547289268485405,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.8636,
"step": 3
},
{
"epoch": 0.013888888888888888,
"grad_norm": 5.673415707884681,
"learning_rate": 2.222222222222222e-06,
"loss": 0.8824,
"step": 4
},
{
"epoch": 0.017361111111111112,
"grad_norm": 5.240294345520099,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.8563,
"step": 5
},
{
"epoch": 0.020833333333333332,
"grad_norm": 4.272810613406847,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8199,
"step": 6
},
{
"epoch": 0.024305555555555556,
"grad_norm": 2.2685991272193733,
"learning_rate": 3.88888888888889e-06,
"loss": 0.7568,
"step": 7
},
{
"epoch": 0.027777777777777776,
"grad_norm": 2.028178792401538,
"learning_rate": 4.444444444444444e-06,
"loss": 0.7485,
"step": 8
},
{
"epoch": 0.03125,
"grad_norm": 2.1236911806845473,
"learning_rate": 5e-06,
"loss": 0.7176,
"step": 9
},
{
"epoch": 0.034722222222222224,
"grad_norm": 3.3729211398567163,
"learning_rate": 5.555555555555557e-06,
"loss": 0.7357,
"step": 10
},
{
"epoch": 0.03819444444444445,
"grad_norm": 3.259391674591041,
"learning_rate": 6.111111111111112e-06,
"loss": 0.709,
"step": 11
},
{
"epoch": 0.041666666666666664,
"grad_norm": 2.9926292324873796,
"learning_rate": 6.666666666666667e-06,
"loss": 0.704,
"step": 12
},
{
"epoch": 0.04513888888888889,
"grad_norm": 2.66382030319957,
"learning_rate": 7.222222222222223e-06,
"loss": 0.6683,
"step": 13
},
{
"epoch": 0.04861111111111111,
"grad_norm": 2.3345720474288725,
"learning_rate": 7.77777777777778e-06,
"loss": 0.6581,
"step": 14
},
{
"epoch": 0.052083333333333336,
"grad_norm": 1.6384019103841372,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6456,
"step": 15
},
{
"epoch": 0.05555555555555555,
"grad_norm": 1.3665817465424255,
"learning_rate": 8.888888888888888e-06,
"loss": 0.6188,
"step": 16
},
{
"epoch": 0.059027777777777776,
"grad_norm": 1.8050485428474035,
"learning_rate": 9.444444444444445e-06,
"loss": 0.6289,
"step": 17
},
{
"epoch": 0.0625,
"grad_norm": 1.8521082144961154,
"learning_rate": 1e-05,
"loss": 0.6167,
"step": 18
},
{
"epoch": 0.06597222222222222,
"grad_norm": 1.2977695822726278,
"learning_rate": 1.0555555555555557e-05,
"loss": 0.6083,
"step": 19
},
{
"epoch": 0.06944444444444445,
"grad_norm": 0.9050793342553565,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.5946,
"step": 20
},
{
"epoch": 0.07291666666666667,
"grad_norm": 1.1391127461928305,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.5798,
"step": 21
},
{
"epoch": 0.0763888888888889,
"grad_norm": 0.9433031328354344,
"learning_rate": 1.2222222222222224e-05,
"loss": 0.5818,
"step": 22
},
{
"epoch": 0.0798611111111111,
"grad_norm": 0.6628303547284489,
"learning_rate": 1.2777777777777777e-05,
"loss": 0.5756,
"step": 23
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.9347756103973528,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.5705,
"step": 24
},
{
"epoch": 0.08680555555555555,
"grad_norm": 0.8106549440748502,
"learning_rate": 1.388888888888889e-05,
"loss": 0.5543,
"step": 25
},
{
"epoch": 0.09027777777777778,
"grad_norm": 0.567510927420338,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.5543,
"step": 26
},
{
"epoch": 0.09375,
"grad_norm": 0.6921706937343025,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.5549,
"step": 27
},
{
"epoch": 0.09722222222222222,
"grad_norm": 0.7027125894979898,
"learning_rate": 1.555555555555556e-05,
"loss": 0.5552,
"step": 28
},
{
"epoch": 0.10069444444444445,
"grad_norm": 0.4234236865726793,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.5496,
"step": 29
},
{
"epoch": 0.10416666666666667,
"grad_norm": 0.636600259227426,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.5476,
"step": 30
},
{
"epoch": 0.1076388888888889,
"grad_norm": 0.4880874196057493,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.5438,
"step": 31
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.4615043705286091,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.5417,
"step": 32
},
{
"epoch": 0.11458333333333333,
"grad_norm": 0.4909859309922507,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.5294,
"step": 33
},
{
"epoch": 0.11805555555555555,
"grad_norm": 0.4107390902800856,
"learning_rate": 1.888888888888889e-05,
"loss": 0.5259,
"step": 34
},
{
"epoch": 0.12152777777777778,
"grad_norm": 0.4843241567748437,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.5183,
"step": 35
},
{
"epoch": 0.125,
"grad_norm": 0.402362602696457,
"learning_rate": 2e-05,
"loss": 0.5233,
"step": 36
},
{
"epoch": 0.1284722222222222,
"grad_norm": 0.4849278121412402,
"learning_rate": 2.0555555555555555e-05,
"loss": 0.5166,
"step": 37
},
{
"epoch": 0.13194444444444445,
"grad_norm": 0.40344157604040815,
"learning_rate": 2.1111111111111114e-05,
"loss": 0.5242,
"step": 38
},
{
"epoch": 0.13541666666666666,
"grad_norm": 0.439230264488894,
"learning_rate": 2.1666666666666667e-05,
"loss": 0.5164,
"step": 39
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.4220862849771054,
"learning_rate": 2.2222222222222227e-05,
"loss": 0.5136,
"step": 40
},
{
"epoch": 0.1423611111111111,
"grad_norm": 0.5279859076306369,
"learning_rate": 2.277777777777778e-05,
"loss": 0.5122,
"step": 41
},
{
"epoch": 0.14583333333333334,
"grad_norm": 0.46244478908007053,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.5118,
"step": 42
},
{
"epoch": 0.14930555555555555,
"grad_norm": 0.4683599686272933,
"learning_rate": 2.388888888888889e-05,
"loss": 0.514,
"step": 43
},
{
"epoch": 0.1527777777777778,
"grad_norm": 0.5322715297704302,
"learning_rate": 2.444444444444445e-05,
"loss": 0.4971,
"step": 44
},
{
"epoch": 0.15625,
"grad_norm": 0.6507970131103591,
"learning_rate": 2.5e-05,
"loss": 0.4963,
"step": 45
},
{
"epoch": 0.1597222222222222,
"grad_norm": 0.714750788035846,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.5134,
"step": 46
},
{
"epoch": 0.16319444444444445,
"grad_norm": 0.7687601059434547,
"learning_rate": 2.6111111111111114e-05,
"loss": 0.5064,
"step": 47
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.8399521816080617,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.4906,
"step": 48
},
{
"epoch": 0.1701388888888889,
"grad_norm": 0.889997192400381,
"learning_rate": 2.7222222222222226e-05,
"loss": 0.5032,
"step": 49
},
{
"epoch": 0.1736111111111111,
"grad_norm": 1.0475698420911328,
"learning_rate": 2.777777777777778e-05,
"loss": 0.493,
"step": 50
},
{
"epoch": 0.17708333333333334,
"grad_norm": 1.118833819111554,
"learning_rate": 2.833333333333334e-05,
"loss": 0.4974,
"step": 51
},
{
"epoch": 0.18055555555555555,
"grad_norm": 0.8024919998359595,
"learning_rate": 2.888888888888889e-05,
"loss": 0.4865,
"step": 52
},
{
"epoch": 0.1840277777777778,
"grad_norm": 0.8606974505869477,
"learning_rate": 2.9444444444444448e-05,
"loss": 0.4926,
"step": 53
},
{
"epoch": 0.1875,
"grad_norm": 0.7881779672296356,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.4878,
"step": 54
},
{
"epoch": 0.1909722222222222,
"grad_norm": 0.6080495175198938,
"learning_rate": 3.0555555555555554e-05,
"loss": 0.4884,
"step": 55
},
{
"epoch": 0.19444444444444445,
"grad_norm": 0.807309170648098,
"learning_rate": 3.111111111111112e-05,
"loss": 0.4826,
"step": 56
},
{
"epoch": 0.19791666666666666,
"grad_norm": 0.9813713400574569,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.4858,
"step": 57
},
{
"epoch": 0.2013888888888889,
"grad_norm": 1.0361441991825402,
"learning_rate": 3.222222222222223e-05,
"loss": 0.4919,
"step": 58
},
{
"epoch": 0.2048611111111111,
"grad_norm": 0.8868025871110543,
"learning_rate": 3.277777777777778e-05,
"loss": 0.4824,
"step": 59
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.9288701717203051,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.4894,
"step": 60
},
{
"epoch": 0.21180555555555555,
"grad_norm": 1.0162786242178787,
"learning_rate": 3.388888888888889e-05,
"loss": 0.4859,
"step": 61
},
{
"epoch": 0.2152777777777778,
"grad_norm": 1.1593588998766855,
"learning_rate": 3.444444444444445e-05,
"loss": 0.4801,
"step": 62
},
{
"epoch": 0.21875,
"grad_norm": 1.0130142454064106,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.4867,
"step": 63
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.1339138891874543,
"learning_rate": 3.555555555555555e-05,
"loss": 0.4801,
"step": 64
},
{
"epoch": 0.22569444444444445,
"grad_norm": 0.9167679815009071,
"learning_rate": 3.6111111111111116e-05,
"loss": 0.472,
"step": 65
},
{
"epoch": 0.22916666666666666,
"grad_norm": 0.9957122622820357,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.4782,
"step": 66
},
{
"epoch": 0.2326388888888889,
"grad_norm": 1.2768722683777673,
"learning_rate": 3.722222222222223e-05,
"loss": 0.4794,
"step": 67
},
{
"epoch": 0.2361111111111111,
"grad_norm": 0.6981900383415166,
"learning_rate": 3.777777777777778e-05,
"loss": 0.4733,
"step": 68
},
{
"epoch": 0.23958333333333334,
"grad_norm": 1.0133076333409807,
"learning_rate": 3.833333333333334e-05,
"loss": 0.4752,
"step": 69
},
{
"epoch": 0.24305555555555555,
"grad_norm": 1.7404120248946109,
"learning_rate": 3.888888888888889e-05,
"loss": 0.4817,
"step": 70
},
{
"epoch": 0.2465277777777778,
"grad_norm": 0.6651429804201384,
"learning_rate": 3.944444444444445e-05,
"loss": 0.4797,
"step": 71
},
{
"epoch": 0.25,
"grad_norm": 2.3281889521219488,
"learning_rate": 4e-05,
"loss": 0.4813,
"step": 72
},
{
"epoch": 0.2534722222222222,
"grad_norm": 1.3476271982813572,
"learning_rate": 4.055555555555556e-05,
"loss": 0.4782,
"step": 73
},
{
"epoch": 0.2569444444444444,
"grad_norm": 2.591174546004534,
"learning_rate": 4.111111111111111e-05,
"loss": 0.4863,
"step": 74
},
{
"epoch": 0.2604166666666667,
"grad_norm": 2.755027938216314,
"learning_rate": 4.166666666666667e-05,
"loss": 0.4898,
"step": 75
},
{
"epoch": 0.2638888888888889,
"grad_norm": 1.0484653191558952,
"learning_rate": 4.222222222222223e-05,
"loss": 0.4714,
"step": 76
},
{
"epoch": 0.2673611111111111,
"grad_norm": 1.8159914872532417,
"learning_rate": 4.277777777777778e-05,
"loss": 0.486,
"step": 77
},
{
"epoch": 0.2708333333333333,
"grad_norm": 1.575533113972724,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.4998,
"step": 78
},
{
"epoch": 0.2743055555555556,
"grad_norm": 1.247189140013317,
"learning_rate": 4.38888888888889e-05,
"loss": 0.472,
"step": 79
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.8283011598184224,
"learning_rate": 4.444444444444445e-05,
"loss": 0.4896,
"step": 80
},
{
"epoch": 0.28125,
"grad_norm": 1.8109706505904477,
"learning_rate": 4.5e-05,
"loss": 0.4881,
"step": 81
},
{
"epoch": 0.2847222222222222,
"grad_norm": 1.057346335127151,
"learning_rate": 4.555555555555556e-05,
"loss": 0.4832,
"step": 82
},
{
"epoch": 0.2881944444444444,
"grad_norm": 1.7646694577951128,
"learning_rate": 4.611111111111111e-05,
"loss": 0.4832,
"step": 83
},
{
"epoch": 0.2916666666666667,
"grad_norm": 1.4140355829700804,
"learning_rate": 4.666666666666667e-05,
"loss": 0.4811,
"step": 84
},
{
"epoch": 0.2951388888888889,
"grad_norm": 1.2593733584850433,
"learning_rate": 4.722222222222223e-05,
"loss": 0.4663,
"step": 85
},
{
"epoch": 0.2986111111111111,
"grad_norm": 1.3968943399622709,
"learning_rate": 4.777777777777778e-05,
"loss": 0.4778,
"step": 86
},
{
"epoch": 0.3020833333333333,
"grad_norm": 1.1507601425129197,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.4725,
"step": 87
},
{
"epoch": 0.3055555555555556,
"grad_norm": 1.5302822908979552,
"learning_rate": 4.88888888888889e-05,
"loss": 0.4824,
"step": 88
},
{
"epoch": 0.3090277777777778,
"grad_norm": 0.9723391769006969,
"learning_rate": 4.944444444444445e-05,
"loss": 0.4777,
"step": 89
},
{
"epoch": 0.3125,
"grad_norm": 1.3992163636274015,
"learning_rate": 5e-05,
"loss": 0.4757,
"step": 90
},
{
"epoch": 0.3159722222222222,
"grad_norm": 0.799812937993386,
"learning_rate": 5.055555555555556e-05,
"loss": 0.4663,
"step": 91
},
{
"epoch": 0.3194444444444444,
"grad_norm": 0.9859358600047391,
"learning_rate": 5.111111111111111e-05,
"loss": 0.4683,
"step": 92
},
{
"epoch": 0.3229166666666667,
"grad_norm": 1.2225448020462069,
"learning_rate": 5.166666666666667e-05,
"loss": 0.4798,
"step": 93
},
{
"epoch": 0.3263888888888889,
"grad_norm": 1.1666553572392628,
"learning_rate": 5.222222222222223e-05,
"loss": 0.4738,
"step": 94
},
{
"epoch": 0.3298611111111111,
"grad_norm": 1.65630200439605,
"learning_rate": 5.2777777777777784e-05,
"loss": 0.4815,
"step": 95
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.0280119977292617,
"learning_rate": 5.333333333333333e-05,
"loss": 0.4631,
"step": 96
},
{
"epoch": 0.3368055555555556,
"grad_norm": 1.2759438689338,
"learning_rate": 5.3888888888888896e-05,
"loss": 0.4633,
"step": 97
},
{
"epoch": 0.3402777777777778,
"grad_norm": 1.2013225394816978,
"learning_rate": 5.444444444444445e-05,
"loss": 0.4644,
"step": 98
},
{
"epoch": 0.34375,
"grad_norm": 0.9467045454954154,
"learning_rate": 5.5e-05,
"loss": 0.4666,
"step": 99
},
{
"epoch": 0.3472222222222222,
"grad_norm": 1.0593628732980047,
"learning_rate": 5.555555555555556e-05,
"loss": 0.4642,
"step": 100
},
{
"epoch": 0.3506944444444444,
"grad_norm": 1.304024409909431,
"learning_rate": 5.6111111111111114e-05,
"loss": 0.4741,
"step": 101
},
{
"epoch": 0.3541666666666667,
"grad_norm": 0.9810723420807926,
"learning_rate": 5.666666666666668e-05,
"loss": 0.4622,
"step": 102
},
{
"epoch": 0.3576388888888889,
"grad_norm": 1.3123102030562221,
"learning_rate": 5.722222222222223e-05,
"loss": 0.4615,
"step": 103
},
{
"epoch": 0.3611111111111111,
"grad_norm": 1.178298412260806,
"learning_rate": 5.777777777777778e-05,
"loss": 0.4599,
"step": 104
},
{
"epoch": 0.3645833333333333,
"grad_norm": 1.4401758010598742,
"learning_rate": 5.833333333333333e-05,
"loss": 0.4595,
"step": 105
},
{
"epoch": 0.3680555555555556,
"grad_norm": 0.9403895604831765,
"learning_rate": 5.8888888888888896e-05,
"loss": 0.4646,
"step": 106
},
{
"epoch": 0.3715277777777778,
"grad_norm": 1.3214554536185026,
"learning_rate": 5.944444444444445e-05,
"loss": 0.4692,
"step": 107
},
{
"epoch": 0.375,
"grad_norm": 0.7898763356241624,
"learning_rate": 6.000000000000001e-05,
"loss": 0.4745,
"step": 108
},
{
"epoch": 0.3784722222222222,
"grad_norm": 1.3785771659946036,
"learning_rate": 6.055555555555556e-05,
"loss": 0.4656,
"step": 109
},
{
"epoch": 0.3819444444444444,
"grad_norm": 0.8438355733572759,
"learning_rate": 6.111111111111111e-05,
"loss": 0.4658,
"step": 110
},
{
"epoch": 0.3854166666666667,
"grad_norm": 0.9301929891119248,
"learning_rate": 6.166666666666667e-05,
"loss": 0.4676,
"step": 111
},
{
"epoch": 0.3888888888888889,
"grad_norm": 1.1359808326811387,
"learning_rate": 6.222222222222223e-05,
"loss": 0.4682,
"step": 112
},
{
"epoch": 0.3923611111111111,
"grad_norm": 0.6819036135597224,
"learning_rate": 6.277777777777778e-05,
"loss": 0.4638,
"step": 113
},
{
"epoch": 0.3958333333333333,
"grad_norm": 1.1784222230236077,
"learning_rate": 6.333333333333333e-05,
"loss": 0.4721,
"step": 114
},
{
"epoch": 0.3993055555555556,
"grad_norm": 1.0972045878518617,
"learning_rate": 6.38888888888889e-05,
"loss": 0.4564,
"step": 115
},
{
"epoch": 0.4027777777777778,
"grad_norm": 1.1942570539893864,
"learning_rate": 6.444444444444446e-05,
"loss": 0.4606,
"step": 116
},
{
"epoch": 0.40625,
"grad_norm": 1.3066111752440024,
"learning_rate": 6.500000000000001e-05,
"loss": 0.4574,
"step": 117
},
{
"epoch": 0.4097222222222222,
"grad_norm": 1.2705685345556148,
"learning_rate": 6.555555555555556e-05,
"loss": 0.4608,
"step": 118
},
{
"epoch": 0.4131944444444444,
"grad_norm": 1.0800640730680313,
"learning_rate": 6.611111111111111e-05,
"loss": 0.4689,
"step": 119
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.48126351719224,
"learning_rate": 6.666666666666667e-05,
"loss": 0.4651,
"step": 120
},
{
"epoch": 0.4201388888888889,
"grad_norm": 1.1069560673762247,
"learning_rate": 6.722222222222223e-05,
"loss": 0.4617,
"step": 121
},
{
"epoch": 0.4236111111111111,
"grad_norm": 1.5000892020623857,
"learning_rate": 6.777777777777778e-05,
"loss": 0.4659,
"step": 122
},
{
"epoch": 0.4270833333333333,
"grad_norm": 1.18006794714587,
"learning_rate": 6.833333333333333e-05,
"loss": 0.4618,
"step": 123
},
{
"epoch": 0.4305555555555556,
"grad_norm": 1.2462151808344257,
"learning_rate": 6.88888888888889e-05,
"loss": 0.4594,
"step": 124
},
{
"epoch": 0.4340277777777778,
"grad_norm": 1.0493304940666723,
"learning_rate": 6.944444444444446e-05,
"loss": 0.4625,
"step": 125
},
{
"epoch": 0.4375,
"grad_norm": 1.6573194961926394,
"learning_rate": 7.000000000000001e-05,
"loss": 0.4604,
"step": 126
},
{
"epoch": 0.4409722222222222,
"grad_norm": 0.8417297410097049,
"learning_rate": 7.055555555555556e-05,
"loss": 0.4522,
"step": 127
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.5116369884276502,
"learning_rate": 7.11111111111111e-05,
"loss": 0.4664,
"step": 128
},
{
"epoch": 0.4479166666666667,
"grad_norm": 1.098767994124789,
"learning_rate": 7.166666666666667e-05,
"loss": 0.454,
"step": 129
},
{
"epoch": 0.4513888888888889,
"grad_norm": 1.4427870933514884,
"learning_rate": 7.222222222222223e-05,
"loss": 0.4581,
"step": 130
},
{
"epoch": 0.4548611111111111,
"grad_norm": 1.131214917074712,
"learning_rate": 7.277777777777778e-05,
"loss": 0.463,
"step": 131
},
{
"epoch": 0.4583333333333333,
"grad_norm": 1.1124160125629599,
"learning_rate": 7.333333333333333e-05,
"loss": 0.455,
"step": 132
},
{
"epoch": 0.4618055555555556,
"grad_norm": 1.4234752545924882,
"learning_rate": 7.38888888888889e-05,
"loss": 0.4619,
"step": 133
},
{
"epoch": 0.4652777777777778,
"grad_norm": 1.1724697852891888,
"learning_rate": 7.444444444444446e-05,
"loss": 0.4494,
"step": 134
},
{
"epoch": 0.46875,
"grad_norm": 1.3419661610878133,
"learning_rate": 7.500000000000001e-05,
"loss": 0.4628,
"step": 135
},
{
"epoch": 0.4722222222222222,
"grad_norm": 1.057112319107508,
"learning_rate": 7.555555555555556e-05,
"loss": 0.4547,
"step": 136
},
{
"epoch": 0.4756944444444444,
"grad_norm": 1.3297790190386298,
"learning_rate": 7.611111111111112e-05,
"loss": 0.4658,
"step": 137
},
{
"epoch": 0.4791666666666667,
"grad_norm": 1.080019562308979,
"learning_rate": 7.666666666666668e-05,
"loss": 0.4519,
"step": 138
},
{
"epoch": 0.4826388888888889,
"grad_norm": 1.0209172735208736,
"learning_rate": 7.722222222222223e-05,
"loss": 0.4571,
"step": 139
},
{
"epoch": 0.4861111111111111,
"grad_norm": 1.284571191682376,
"learning_rate": 7.777777777777778e-05,
"loss": 0.4632,
"step": 140
},
{
"epoch": 0.4895833333333333,
"grad_norm": 1.243779273272225,
"learning_rate": 7.833333333333333e-05,
"loss": 0.4556,
"step": 141
},
{
"epoch": 0.4930555555555556,
"grad_norm": 1.5929814480067013,
"learning_rate": 7.88888888888889e-05,
"loss": 0.4628,
"step": 142
},
{
"epoch": 0.4965277777777778,
"grad_norm": 0.8996686117779537,
"learning_rate": 7.944444444444446e-05,
"loss": 0.4616,
"step": 143
},
{
"epoch": 0.5,
"grad_norm": 1.4114637381579962,
"learning_rate": 8e-05,
"loss": 0.4579,
"step": 144
},
{
"epoch": 0.5034722222222222,
"grad_norm": 1.1413142228974857,
"learning_rate": 7.999988247790486e-05,
"loss": 0.4524,
"step": 145
},
{
"epoch": 0.5069444444444444,
"grad_norm": 1.2535207264099173,
"learning_rate": 7.999952991230999e-05,
"loss": 0.4547,
"step": 146
},
{
"epoch": 0.5104166666666666,
"grad_norm": 0.944579007044323,
"learning_rate": 7.99989423052871e-05,
"loss": 0.449,
"step": 147
},
{
"epoch": 0.5138888888888888,
"grad_norm": 1.1702494630139326,
"learning_rate": 7.999811966028904e-05,
"loss": 0.4542,
"step": 148
},
{
"epoch": 0.5173611111111112,
"grad_norm": 1.2981796057689705,
"learning_rate": 7.999706198214977e-05,
"loss": 0.4499,
"step": 149
},
{
"epoch": 0.5208333333333334,
"grad_norm": 0.9795884203661855,
"learning_rate": 7.99957692770843e-05,
"loss": 0.4427,
"step": 150
},
{
"epoch": 0.5243055555555556,
"grad_norm": 1.2125274010863194,
"learning_rate": 7.999424155268872e-05,
"loss": 0.4554,
"step": 151
},
{
"epoch": 0.5277777777777778,
"grad_norm": 0.7608263775470029,
"learning_rate": 7.999247881794007e-05,
"loss": 0.4543,
"step": 152
},
{
"epoch": 0.53125,
"grad_norm": 0.8988441693273684,
"learning_rate": 7.999048108319636e-05,
"loss": 0.454,
"step": 153
},
{
"epoch": 0.5347222222222222,
"grad_norm": 0.9604446856356788,
"learning_rate": 7.998824836019654e-05,
"loss": 0.4518,
"step": 154
},
{
"epoch": 0.5381944444444444,
"grad_norm": 1.465682474436967,
"learning_rate": 7.998578066206027e-05,
"loss": 0.4553,
"step": 155
},
{
"epoch": 0.5416666666666666,
"grad_norm": 0.9953984939217686,
"learning_rate": 7.998307800328803e-05,
"loss": 0.4487,
"step": 156
},
{
"epoch": 0.5451388888888888,
"grad_norm": 1.14883284369783,
"learning_rate": 7.998014039976093e-05,
"loss": 0.4504,
"step": 157
},
{
"epoch": 0.5486111111111112,
"grad_norm": 1.1463500627507957,
"learning_rate": 7.99769678687406e-05,
"loss": 0.4458,
"step": 158
},
{
"epoch": 0.5520833333333334,
"grad_norm": 1.7406399039819629,
"learning_rate": 7.997356042886921e-05,
"loss": 0.4476,
"step": 159
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.9677522133096546,
"learning_rate": 7.996991810016922e-05,
"loss": 0.454,
"step": 160
},
{
"epoch": 0.5590277777777778,
"grad_norm": 2.3399797692246738,
"learning_rate": 7.996604090404331e-05,
"loss": 0.46,
"step": 161
},
{
"epoch": 0.5625,
"grad_norm": 1.769583224289047,
"learning_rate": 7.996192886327432e-05,
"loss": 0.4635,
"step": 162
},
{
"epoch": 0.5659722222222222,
"grad_norm": 1.5805102265354327,
"learning_rate": 7.995758200202502e-05,
"loss": 0.4532,
"step": 163
},
{
"epoch": 0.5694444444444444,
"grad_norm": 1.3241297914657204,
"learning_rate": 7.995300034583802e-05,
"loss": 0.4514,
"step": 164
},
{
"epoch": 0.5729166666666666,
"grad_norm": 1.2125784678413019,
"learning_rate": 7.994818392163563e-05,
"loss": 0.4451,
"step": 165
},
{
"epoch": 0.5763888888888888,
"grad_norm": 1.2227272948654673,
"learning_rate": 7.994313275771963e-05,
"loss": 0.4479,
"step": 166
},
{
"epoch": 0.5798611111111112,
"grad_norm": 0.9281571542890602,
"learning_rate": 7.993784688377122e-05,
"loss": 0.4501,
"step": 167
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.8929936296876618,
"learning_rate": 7.993232633085074e-05,
"loss": 0.445,
"step": 168
},
{
"epoch": 0.5868055555555556,
"grad_norm": 1.238670375147291,
"learning_rate": 7.992657113139751e-05,
"loss": 0.455,
"step": 169
},
{
"epoch": 0.5902777777777778,
"grad_norm": 1.0415211643774225,
"learning_rate": 7.992058131922974e-05,
"loss": 0.4427,
"step": 170
},
{
"epoch": 0.59375,
"grad_norm": 1.5710938827548118,
"learning_rate": 7.991435692954414e-05,
"loss": 0.4468,
"step": 171
},
{
"epoch": 0.5972222222222222,
"grad_norm": 0.8839063532349627,
"learning_rate": 7.990789799891592e-05,
"loss": 0.4445,
"step": 172
},
{
"epoch": 0.6006944444444444,
"grad_norm": 1.697345060366342,
"learning_rate": 7.99012045652984e-05,
"loss": 0.4552,
"step": 173
},
{
"epoch": 0.6041666666666666,
"grad_norm": 0.9164694087509572,
"learning_rate": 7.98942766680229e-05,
"loss": 0.4547,
"step": 174
},
{
"epoch": 0.6076388888888888,
"grad_norm": 1.5276845461738495,
"learning_rate": 7.988711434779849e-05,
"loss": 0.4538,
"step": 175
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.8693062056237906,
"learning_rate": 7.987971764671168e-05,
"loss": 0.4468,
"step": 176
},
{
"epoch": 0.6145833333333334,
"grad_norm": 1.3482324568848507,
"learning_rate": 7.987208660822631e-05,
"loss": 0.4393,
"step": 177
},
{
"epoch": 0.6180555555555556,
"grad_norm": 1.0523927613249133,
"learning_rate": 7.986422127718312e-05,
"loss": 0.4468,
"step": 178
},
{
"epoch": 0.6215277777777778,
"grad_norm": 1.2102039495362458,
"learning_rate": 7.985612169979964e-05,
"loss": 0.4473,
"step": 179
},
{
"epoch": 0.625,
"grad_norm": 1.041791299494975,
"learning_rate": 7.984778792366983e-05,
"loss": 0.4482,
"step": 180
},
{
"epoch": 0.6284722222222222,
"grad_norm": 0.9702144942742554,
"learning_rate": 7.983921999776381e-05,
"loss": 0.4456,
"step": 181
},
{
"epoch": 0.6319444444444444,
"grad_norm": 0.9634257079158082,
"learning_rate": 7.983041797242766e-05,
"loss": 0.4464,
"step": 182
},
{
"epoch": 0.6354166666666666,
"grad_norm": 1.2159853741470283,
"learning_rate": 7.982138189938296e-05,
"loss": 0.4495,
"step": 183
},
{
"epoch": 0.6388888888888888,
"grad_norm": 0.9230250746367079,
"learning_rate": 7.981211183172663e-05,
"loss": 0.4473,
"step": 184
},
{
"epoch": 0.6423611111111112,
"grad_norm": 0.8276373290152851,
"learning_rate": 7.980260782393058e-05,
"loss": 0.4439,
"step": 185
},
{
"epoch": 0.6458333333333334,
"grad_norm": 0.9210417068141048,
"learning_rate": 7.979286993184134e-05,
"loss": 0.4481,
"step": 186
},
{
"epoch": 0.6493055555555556,
"grad_norm": 0.9826533521079801,
"learning_rate": 7.978289821267976e-05,
"loss": 0.4466,
"step": 187
},
{
"epoch": 0.6527777777777778,
"grad_norm": 1.116467479939209,
"learning_rate": 7.977269272504075e-05,
"loss": 0.4426,
"step": 188
},
{
"epoch": 0.65625,
"grad_norm": 1.231932756904634,
"learning_rate": 7.976225352889278e-05,
"loss": 0.445,
"step": 189
},
{
"epoch": 0.6597222222222222,
"grad_norm": 0.5726339489471363,
"learning_rate": 7.975158068557771e-05,
"loss": 0.4398,
"step": 190
},
{
"epoch": 0.6631944444444444,
"grad_norm": 0.8001387375399556,
"learning_rate": 7.974067425781025e-05,
"loss": 0.4398,
"step": 191
},
{
"epoch": 0.6666666666666666,
"grad_norm": 1.2250064920535346,
"learning_rate": 7.972953430967773e-05,
"loss": 0.4411,
"step": 192
},
{
"epoch": 0.6701388888888888,
"grad_norm": 0.6534744526647841,
"learning_rate": 7.971816090663963e-05,
"loss": 0.4502,
"step": 193
},
{
"epoch": 0.6736111111111112,
"grad_norm": 0.5725567793011336,
"learning_rate": 7.970655411552728e-05,
"loss": 0.4389,
"step": 194
},
{
"epoch": 0.6770833333333334,
"grad_norm": 0.7150266154502478,
"learning_rate": 7.96947140045434e-05,
"loss": 0.4359,
"step": 195
},
{
"epoch": 0.6805555555555556,
"grad_norm": 0.6861133770491901,
"learning_rate": 7.96826406432617e-05,
"loss": 0.4293,
"step": 196
},
{
"epoch": 0.6840277777777778,
"grad_norm": 0.5267677299616069,
"learning_rate": 7.967033410262653e-05,
"loss": 0.4411,
"step": 197
},
{
"epoch": 0.6875,
"grad_norm": 0.7064713153906993,
"learning_rate": 7.965779445495243e-05,
"loss": 0.4409,
"step": 198
},
{
"epoch": 0.6909722222222222,
"grad_norm": 0.7898210534003849,
"learning_rate": 7.964502177392363e-05,
"loss": 0.4414,
"step": 199
},
{
"epoch": 0.6944444444444444,
"grad_norm": 1.0600333307371903,
"learning_rate": 7.963201613459381e-05,
"loss": 0.4497,
"step": 200
},
{
"epoch": 0.6979166666666666,
"grad_norm": 1.3046328917134082,
"learning_rate": 7.961877761338545e-05,
"loss": 0.442,
"step": 201
},
{
"epoch": 0.7013888888888888,
"grad_norm": 0.844190070594073,
"learning_rate": 7.960530628808944e-05,
"loss": 0.4377,
"step": 202
},
{
"epoch": 0.7048611111111112,
"grad_norm": 0.9955414997984852,
"learning_rate": 7.959160223786475e-05,
"loss": 0.4377,
"step": 203
},
{
"epoch": 0.7083333333333334,
"grad_norm": 1.0186927163629396,
"learning_rate": 7.957766554323778e-05,
"loss": 0.4407,
"step": 204
},
{
"epoch": 0.7118055555555556,
"grad_norm": 1.6996204406204833,
"learning_rate": 7.956349628610204e-05,
"loss": 0.4465,
"step": 205
},
{
"epoch": 0.7152777777777778,
"grad_norm": 0.8404953299598265,
"learning_rate": 7.954909454971756e-05,
"loss": 0.4422,
"step": 206
},
{
"epoch": 0.71875,
"grad_norm": 1.9618717386693068,
"learning_rate": 7.953446041871044e-05,
"loss": 0.4514,
"step": 207
},
{
"epoch": 0.7222222222222222,
"grad_norm": 1.0737999034049264,
"learning_rate": 7.951959397907237e-05,
"loss": 0.442,
"step": 208
},
{
"epoch": 0.7256944444444444,
"grad_norm": 2.2317162303577955,
"learning_rate": 7.950449531816011e-05,
"loss": 0.4645,
"step": 209
},
{
"epoch": 0.7291666666666666,
"grad_norm": 2.073214361985945,
"learning_rate": 7.948916452469497e-05,
"loss": 0.4612,
"step": 210
},
{
"epoch": 0.7326388888888888,
"grad_norm": 1.0851082661931841,
"learning_rate": 7.947360168876231e-05,
"loss": 0.4396,
"step": 211
},
{
"epoch": 0.7361111111111112,
"grad_norm": 1.3106986814089487,
"learning_rate": 7.945780690181096e-05,
"loss": 0.4517,
"step": 212
},
{
"epoch": 0.7395833333333334,
"grad_norm": 0.8570698706313222,
"learning_rate": 7.944178025665277e-05,
"loss": 0.4538,
"step": 213
},
{
"epoch": 0.7430555555555556,
"grad_norm": 1.3386762857789183,
"learning_rate": 7.942552184746196e-05,
"loss": 0.4416,
"step": 214
},
{
"epoch": 0.7465277777777778,
"grad_norm": 1.0014581322428484,
"learning_rate": 7.940903176977469e-05,
"loss": 0.4523,
"step": 215
},
{
"epoch": 0.75,
"grad_norm": 1.0734898711169398,
"learning_rate": 7.939231012048833e-05,
"loss": 0.4447,
"step": 216
},
{
"epoch": 0.7534722222222222,
"grad_norm": 0.993341523519617,
"learning_rate": 7.937535699786107e-05,
"loss": 0.45,
"step": 217
},
{
"epoch": 0.7569444444444444,
"grad_norm": 0.6652791910828864,
"learning_rate": 7.935817250151124e-05,
"loss": 0.4324,
"step": 218
},
{
"epoch": 0.7604166666666666,
"grad_norm": 0.6971295516673278,
"learning_rate": 7.934075673241672e-05,
"loss": 0.4426,
"step": 219
},
{
"epoch": 0.7638888888888888,
"grad_norm": 0.673431333145143,
"learning_rate": 7.932310979291441e-05,
"loss": 0.4324,
"step": 220
},
{
"epoch": 0.7673611111111112,
"grad_norm": 0.7732137215225624,
"learning_rate": 7.930523178669956e-05,
"loss": 0.4454,
"step": 221
},
{
"epoch": 0.7708333333333334,
"grad_norm": 0.4823440611357369,
"learning_rate": 7.928712281882523e-05,
"loss": 0.4291,
"step": 222
},
{
"epoch": 0.7743055555555556,
"grad_norm": 0.7868778607572823,
"learning_rate": 7.92687829957016e-05,
"loss": 0.4369,
"step": 223
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.6577701332320577,
"learning_rate": 7.925021242509539e-05,
"loss": 0.4424,
"step": 224
},
{
"epoch": 0.78125,
"grad_norm": 0.5424251079261185,
"learning_rate": 7.923141121612922e-05,
"loss": 0.4403,
"step": 225
},
{
"epoch": 0.7847222222222222,
"grad_norm": 0.5054224207988611,
"learning_rate": 7.921237947928097e-05,
"loss": 0.4392,
"step": 226
},
{
"epoch": 0.7881944444444444,
"grad_norm": 0.605462887935492,
"learning_rate": 7.91931173263831e-05,
"loss": 0.4398,
"step": 227
},
{
"epoch": 0.7916666666666666,
"grad_norm": 0.5551065001075223,
"learning_rate": 7.917362487062207e-05,
"loss": 0.4349,
"step": 228
},
{
"epoch": 0.7951388888888888,
"grad_norm": 0.6153818427714622,
"learning_rate": 7.915390222653756e-05,
"loss": 0.4298,
"step": 229
},
{
"epoch": 0.7986111111111112,
"grad_norm": 0.7260586067886241,
"learning_rate": 7.913394951002191e-05,
"loss": 0.4391,
"step": 230
},
{
"epoch": 0.8020833333333334,
"grad_norm": 0.8350027035555211,
"learning_rate": 7.911376683831937e-05,
"loss": 0.4423,
"step": 231
},
{
"epoch": 0.8055555555555556,
"grad_norm": 1.0219604029866298,
"learning_rate": 7.909335433002543e-05,
"loss": 0.4336,
"step": 232
},
{
"epoch": 0.8090277777777778,
"grad_norm": 1.1709001073322873,
"learning_rate": 7.907271210508612e-05,
"loss": 0.4281,
"step": 233
},
{
"epoch": 0.8125,
"grad_norm": 0.8473876791721466,
"learning_rate": 7.905184028479734e-05,
"loss": 0.4335,
"step": 234
},
{
"epoch": 0.8159722222222222,
"grad_norm": 0.9483895312739647,
"learning_rate": 7.903073899180408e-05,
"loss": 0.4354,
"step": 235
},
{
"epoch": 0.8194444444444444,
"grad_norm": 1.211636095809243,
"learning_rate": 7.900940835009974e-05,
"loss": 0.4403,
"step": 236
},
{
"epoch": 0.8229166666666666,
"grad_norm": 0.754367132994784,
"learning_rate": 7.89878484850254e-05,
"loss": 0.431,
"step": 237
},
{
"epoch": 0.8263888888888888,
"grad_norm": 0.850436115404034,
"learning_rate": 7.89660595232691e-05,
"loss": 0.4361,
"step": 238
},
{
"epoch": 0.8298611111111112,
"grad_norm": 0.9631686880042966,
"learning_rate": 7.894404159286507e-05,
"loss": 0.4377,
"step": 239
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.8809972957235778,
"learning_rate": 7.892179482319297e-05,
"loss": 0.4412,
"step": 240
},
{
"epoch": 0.8368055555555556,
"grad_norm": 0.7087009807332131,
"learning_rate": 7.889931934497713e-05,
"loss": 0.4384,
"step": 241
},
{
"epoch": 0.8402777777777778,
"grad_norm": 0.6235179197671002,
"learning_rate": 7.887661529028583e-05,
"loss": 0.4396,
"step": 242
},
{
"epoch": 0.84375,
"grad_norm": 0.5941580300026621,
"learning_rate": 7.885368279253045e-05,
"loss": 0.4312,
"step": 243
},
{
"epoch": 0.8472222222222222,
"grad_norm": 0.7287526088534313,
"learning_rate": 7.883052198646481e-05,
"loss": 0.4319,
"step": 244
},
{
"epoch": 0.8506944444444444,
"grad_norm": 0.8550617714802732,
"learning_rate": 7.880713300818417e-05,
"loss": 0.4265,
"step": 245
},
{
"epoch": 0.8541666666666666,
"grad_norm": 1.070298714307543,
"learning_rate": 7.878351599512465e-05,
"loss": 0.4298,
"step": 246
},
{
"epoch": 0.8576388888888888,
"grad_norm": 1.0005703836024833,
"learning_rate": 7.875967108606229e-05,
"loss": 0.4304,
"step": 247
},
{
"epoch": 0.8611111111111112,
"grad_norm": 0.8656053628499113,
"learning_rate": 7.873559842111225e-05,
"loss": 0.4184,
"step": 248
},
{
"epoch": 0.8645833333333334,
"grad_norm": 0.8664562224515696,
"learning_rate": 7.871129814172805e-05,
"loss": 0.4344,
"step": 249
},
{
"epoch": 0.8680555555555556,
"grad_norm": 1.0238452521162664,
"learning_rate": 7.868677039070067e-05,
"loss": 0.4312,
"step": 250
},
{
"epoch": 0.8715277777777778,
"grad_norm": 1.0099355495434388,
"learning_rate": 7.866201531215776e-05,
"loss": 0.4302,
"step": 251
},
{
"epoch": 0.875,
"grad_norm": 0.8416275019579411,
"learning_rate": 7.863703305156273e-05,
"loss": 0.4284,
"step": 252
},
{
"epoch": 0.8784722222222222,
"grad_norm": 0.8099259050315379,
"learning_rate": 7.8611823755714e-05,
"loss": 0.4344,
"step": 253
},
{
"epoch": 0.8819444444444444,
"grad_norm": 0.6678679683627219,
"learning_rate": 7.858638757274398e-05,
"loss": 0.4231,
"step": 254
},
{
"epoch": 0.8854166666666666,
"grad_norm": 0.47895420973208647,
"learning_rate": 7.856072465211839e-05,
"loss": 0.4206,
"step": 255
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.009939643607457,
"learning_rate": 7.853483514463521e-05,
"loss": 0.4288,
"step": 256
},
{
"epoch": 0.8923611111111112,
"grad_norm": 1.4189309407266693,
"learning_rate": 7.850871920242394e-05,
"loss": 0.4337,
"step": 257
},
{
"epoch": 0.8958333333333334,
"grad_norm": 0.37058893592615144,
"learning_rate": 7.848237697894453e-05,
"loss": 0.4254,
"step": 258
},
{
"epoch": 0.8993055555555556,
"grad_norm": 1.2115738610190847,
"learning_rate": 7.84558086289867e-05,
"loss": 0.424,
"step": 259
},
{
"epoch": 0.9027777777777778,
"grad_norm": 0.8637909862046065,
"learning_rate": 7.842901430866882e-05,
"loss": 0.4224,
"step": 260
},
{
"epoch": 0.90625,
"grad_norm": 0.8555605365080853,
"learning_rate": 7.840199417543716e-05,
"loss": 0.4215,
"step": 261
},
{
"epoch": 0.9097222222222222,
"grad_norm": 0.8338852542700611,
"learning_rate": 7.837474838806481e-05,
"loss": 0.4253,
"step": 262
},
{
"epoch": 0.9131944444444444,
"grad_norm": 0.7277509949557855,
"learning_rate": 7.834727710665091e-05,
"loss": 0.4237,
"step": 263
},
{
"epoch": 0.9166666666666666,
"grad_norm": 1.0597249447053136,
"learning_rate": 7.831958049261956e-05,
"loss": 0.435,
"step": 264
},
{
"epoch": 0.9201388888888888,
"grad_norm": 0.7628459806053108,
"learning_rate": 7.829165870871897e-05,
"loss": 0.4271,
"step": 265
},
{
"epoch": 0.9236111111111112,
"grad_norm": 0.3800778344556053,
"learning_rate": 7.82635119190205e-05,
"loss": 0.4234,
"step": 266
},
{
"epoch": 0.9270833333333334,
"grad_norm": 0.72949914163169,
"learning_rate": 7.823514028891758e-05,
"loss": 0.4254,
"step": 267
},
{
"epoch": 0.9305555555555556,
"grad_norm": 0.8810789298670233,
"learning_rate": 7.820654398512492e-05,
"loss": 0.4202,
"step": 268
},
{
"epoch": 0.9340277777777778,
"grad_norm": 0.9953625938611481,
"learning_rate": 7.817772317567739e-05,
"loss": 0.4263,
"step": 269
},
{
"epoch": 0.9375,
"grad_norm": 1.0841275303637594,
"learning_rate": 7.814867802992907e-05,
"loss": 0.4271,
"step": 270
},
{
"epoch": 0.9409722222222222,
"grad_norm": 0.8243338663917711,
"learning_rate": 7.811940871855232e-05,
"loss": 0.429,
"step": 271
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.7969445305450061,
"learning_rate": 7.808991541353662e-05,
"loss": 0.4293,
"step": 272
},
{
"epoch": 0.9479166666666666,
"grad_norm": 0.8402063861721795,
"learning_rate": 7.806019828818776e-05,
"loss": 0.4305,
"step": 273
},
{
"epoch": 0.9513888888888888,
"grad_norm": 0.7708535810728068,
"learning_rate": 7.803025751712667e-05,
"loss": 0.4308,
"step": 274
},
{
"epoch": 0.9548611111111112,
"grad_norm": 0.6791688966743965,
"learning_rate": 7.800009327628845e-05,
"loss": 0.4299,
"step": 275
},
{
"epoch": 0.9583333333333334,
"grad_norm": 0.9406991087495775,
"learning_rate": 7.796970574292136e-05,
"loss": 0.4248,
"step": 276
},
{
"epoch": 0.9618055555555556,
"grad_norm": 1.2117325105562007,
"learning_rate": 7.793909509558572e-05,
"loss": 0.4202,
"step": 277
},
{
"epoch": 0.9652777777777778,
"grad_norm": 0.6831708132582254,
"learning_rate": 7.790826151415289e-05,
"loss": 0.4257,
"step": 278
},
{
"epoch": 0.96875,
"grad_norm": 0.859673440712125,
"learning_rate": 7.787720517980424e-05,
"loss": 0.4183,
"step": 279
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.9039026634213447,
"learning_rate": 7.784592627503004e-05,
"loss": 0.4184,
"step": 280
},
{
"epoch": 0.9756944444444444,
"grad_norm": 0.7704704639747162,
"learning_rate": 7.781442498362838e-05,
"loss": 0.4245,
"step": 281
},
{
"epoch": 0.9791666666666666,
"grad_norm": 1.021065945036308,
"learning_rate": 7.77827014907042e-05,
"loss": 0.4224,
"step": 282
},
{
"epoch": 0.9826388888888888,
"grad_norm": 1.0035320025992345,
"learning_rate": 7.775075598266803e-05,
"loss": 0.4188,
"step": 283
},
{
"epoch": 0.9861111111111112,
"grad_norm": 0.733225627159732,
"learning_rate": 7.771858864723504e-05,
"loss": 0.4139,
"step": 284
},
{
"epoch": 0.9895833333333334,
"grad_norm": 0.5129363335278152,
"learning_rate": 7.768619967342386e-05,
"loss": 0.4295,
"step": 285
},
{
"epoch": 0.9930555555555556,
"grad_norm": 0.44152260527622333,
"learning_rate": 7.76535892515555e-05,
"loss": 0.4329,
"step": 286
},
{
"epoch": 0.9965277777777778,
"grad_norm": 0.5476141384850192,
"learning_rate": 7.76207575732522e-05,
"loss": 0.4225,
"step": 287
},
{
"epoch": 1.0,
"grad_norm": 0.5830357412083533,
"learning_rate": 7.758770483143634e-05,
"loss": 0.4257,
"step": 288
},
{
"epoch": 1.0034722222222223,
"grad_norm": 0.7063916284367442,
"learning_rate": 7.755443122032931e-05,
"loss": 0.4051,
"step": 289
},
{
"epoch": 1.0069444444444444,
"grad_norm": 0.667202666738724,
"learning_rate": 7.752093693545032e-05,
"loss": 0.4003,
"step": 290
},
{
"epoch": 1.0104166666666667,
"grad_norm": 0.6230156026991575,
"learning_rate": 7.74872221736153e-05,
"loss": 0.4062,
"step": 291
},
{
"epoch": 1.0138888888888888,
"grad_norm": 0.6047415209235458,
"learning_rate": 7.745328713293573e-05,
"loss": 0.399,
"step": 292
},
{
"epoch": 1.0173611111111112,
"grad_norm": 0.4623574863446841,
"learning_rate": 7.741913201281746e-05,
"loss": 0.4107,
"step": 293
},
{
"epoch": 1.0208333333333333,
"grad_norm": 0.44829714098160994,
"learning_rate": 7.738475701395955e-05,
"loss": 0.402,
"step": 294
},
{
"epoch": 1.0243055555555556,
"grad_norm": 0.5583386260853201,
"learning_rate": 7.735016233835308e-05,
"loss": 0.4037,
"step": 295
},
{
"epoch": 1.0277777777777777,
"grad_norm": 0.5849993828315929,
"learning_rate": 7.731534818928004e-05,
"loss": 0.4038,
"step": 296
},
{
"epoch": 1.03125,
"grad_norm": 0.5530322885658703,
"learning_rate": 7.728031477131195e-05,
"loss": 0.4057,
"step": 297
},
{
"epoch": 1.0347222222222223,
"grad_norm": 0.5809657912163412,
"learning_rate": 7.724506229030888e-05,
"loss": 0.4008,
"step": 298
},
{
"epoch": 1.0381944444444444,
"grad_norm": 0.7320117850928769,
"learning_rate": 7.72095909534181e-05,
"loss": 0.4069,
"step": 299
},
{
"epoch": 1.0416666666666667,
"grad_norm": 0.8148876222477555,
"learning_rate": 7.71739009690729e-05,
"loss": 0.4148,
"step": 300
},
{
"epoch": 1.0451388888888888,
"grad_norm": 0.8604742887394918,
"learning_rate": 7.713799254699136e-05,
"loss": 0.4132,
"step": 301
},
{
"epoch": 1.0486111111111112,
"grad_norm": 0.8494756814809938,
"learning_rate": 7.710186589817515e-05,
"loss": 0.4056,
"step": 302
},
{
"epoch": 1.0520833333333333,
"grad_norm": 0.8006898767741991,
"learning_rate": 7.706552123490822e-05,
"loss": 0.4031,
"step": 303
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.7547949909298812,
"learning_rate": 7.702895877075563e-05,
"loss": 0.4084,
"step": 304
},
{
"epoch": 1.0590277777777777,
"grad_norm": 0.5910172696126633,
"learning_rate": 7.699217872056223e-05,
"loss": 0.4075,
"step": 305
},
{
"epoch": 1.0625,
"grad_norm": 0.4769866947519716,
"learning_rate": 7.695518130045147e-05,
"loss": 0.4028,
"step": 306
},
{
"epoch": 1.0659722222222223,
"grad_norm": 0.5949945873140698,
"learning_rate": 7.691796672782406e-05,
"loss": 0.398,
"step": 307
},
{
"epoch": 1.0694444444444444,
"grad_norm": 0.5723602950104976,
"learning_rate": 7.688053522135675e-05,
"loss": 0.4053,
"step": 308
},
{
"epoch": 1.0729166666666667,
"grad_norm": 0.4849220557566718,
"learning_rate": 7.684288700100095e-05,
"loss": 0.3934,
"step": 309
},
{
"epoch": 1.0763888888888888,
"grad_norm": 0.5878092037060889,
"learning_rate": 7.680502228798157e-05,
"loss": 0.4059,
"step": 310
},
{
"epoch": 1.0798611111111112,
"grad_norm": 0.7914196489636522,
"learning_rate": 7.676694130479563e-05,
"loss": 0.3971,
"step": 311
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.9299321919759843,
"learning_rate": 7.672864427521097e-05,
"loss": 0.4123,
"step": 312
},
{
"epoch": 1.0868055555555556,
"grad_norm": 0.8715987505249858,
"learning_rate": 7.669013142426496e-05,
"loss": 0.4055,
"step": 313
},
{
"epoch": 1.0902777777777777,
"grad_norm": 0.8096045081793677,
"learning_rate": 7.665140297826313e-05,
"loss": 0.4021,
"step": 314
},
{
"epoch": 1.09375,
"grad_norm": 0.8475178262681384,
"learning_rate": 7.66124591647779e-05,
"loss": 0.4023,
"step": 315
},
{
"epoch": 1.0972222222222223,
"grad_norm": 0.7826778208724321,
"learning_rate": 7.657330021264718e-05,
"loss": 0.3982,
"step": 316
},
{
"epoch": 1.1006944444444444,
"grad_norm": 0.6181458437106809,
"learning_rate": 7.65339263519731e-05,
"loss": 0.4038,
"step": 317
},
{
"epoch": 1.1041666666666667,
"grad_norm": 0.5257606234932206,
"learning_rate": 7.649433781412058e-05,
"loss": 0.3975,
"step": 318
},
{
"epoch": 1.1076388888888888,
"grad_norm": 0.5137603347420444,
"learning_rate": 7.645453483171601e-05,
"loss": 0.4054,
"step": 319
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.6494379013037576,
"learning_rate": 7.641451763864587e-05,
"loss": 0.3967,
"step": 320
},
{
"epoch": 1.1145833333333333,
"grad_norm": 0.7419787340823062,
"learning_rate": 7.637428647005541e-05,
"loss": 0.3956,
"step": 321
},
{
"epoch": 1.1180555555555556,
"grad_norm": 0.6989839067475451,
"learning_rate": 7.633384156234718e-05,
"loss": 0.4003,
"step": 322
},
{
"epoch": 1.1215277777777777,
"grad_norm": 0.6901694456258389,
"learning_rate": 7.629318315317968e-05,
"loss": 0.4026,
"step": 323
},
{
"epoch": 1.125,
"grad_norm": 0.7233257128268635,
"learning_rate": 7.625231148146601e-05,
"loss": 0.4087,
"step": 324
},
{
"epoch": 1.1284722222222223,
"grad_norm": 0.7506785296869003,
"learning_rate": 7.621122678737236e-05,
"loss": 0.3997,
"step": 325
},
{
"epoch": 1.1319444444444444,
"grad_norm": 0.7590348348849132,
"learning_rate": 7.616992931231671e-05,
"loss": 0.4021,
"step": 326
},
{
"epoch": 1.1354166666666667,
"grad_norm": 0.6901940604570691,
"learning_rate": 7.612841929896737e-05,
"loss": 0.4065,
"step": 327
},
{
"epoch": 1.1388888888888888,
"grad_norm": 0.580026833291539,
"learning_rate": 7.608669699124153e-05,
"loss": 0.3979,
"step": 328
},
{
"epoch": 1.1423611111111112,
"grad_norm": 0.5236840254807037,
"learning_rate": 7.604476263430379e-05,
"loss": 0.3998,
"step": 329
},
{
"epoch": 1.1458333333333333,
"grad_norm": 0.5415803185886238,
"learning_rate": 7.600261647456485e-05,
"loss": 0.4003,
"step": 330
},
{
"epoch": 1.1493055555555556,
"grad_norm": 0.4862624810527434,
"learning_rate": 7.596025875967998e-05,
"loss": 0.4044,
"step": 331
},
{
"epoch": 1.1527777777777777,
"grad_norm": 0.5339213319556734,
"learning_rate": 7.591768973854753e-05,
"loss": 0.4035,
"step": 332
},
{
"epoch": 1.15625,
"grad_norm": 0.6310106513888443,
"learning_rate": 7.587490966130754e-05,
"loss": 0.3997,
"step": 333
},
{
"epoch": 1.1597222222222223,
"grad_norm": 0.5550205488151554,
"learning_rate": 7.58319187793402e-05,
"loss": 0.3967,
"step": 334
},
{
"epoch": 1.1631944444444444,
"grad_norm": 0.40985639820095615,
"learning_rate": 7.578871734526449e-05,
"loss": 0.3979,
"step": 335
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.42924572007540923,
"learning_rate": 7.57453056129365e-05,
"loss": 0.4059,
"step": 336
},
{
"epoch": 1.1701388888888888,
"grad_norm": 0.3770504743326086,
"learning_rate": 7.570168383744815e-05,
"loss": 0.3977,
"step": 337
},
{
"epoch": 1.1736111111111112,
"grad_norm": 0.4453962885153323,
"learning_rate": 7.565785227512555e-05,
"loss": 0.3986,
"step": 338
},
{
"epoch": 1.1770833333333333,
"grad_norm": 0.6401462585148607,
"learning_rate": 7.561381118352757e-05,
"loss": 0.4006,
"step": 339
},
{
"epoch": 1.1805555555555556,
"grad_norm": 0.8576068224785208,
"learning_rate": 7.556956082144425e-05,
"loss": 0.4028,
"step": 340
},
{
"epoch": 1.1840277777777777,
"grad_norm": 0.970046404986134,
"learning_rate": 7.552510144889538e-05,
"loss": 0.395,
"step": 341
},
{
"epoch": 1.1875,
"grad_norm": 0.9237808454959463,
"learning_rate": 7.548043332712887e-05,
"loss": 0.3966,
"step": 342
},
{
"epoch": 1.1909722222222223,
"grad_norm": 0.7963471169582702,
"learning_rate": 7.54355567186193e-05,
"loss": 0.4029,
"step": 343
},
{
"epoch": 1.1944444444444444,
"grad_norm": 0.6884696150231001,
"learning_rate": 7.539047188706631e-05,
"loss": 0.4096,
"step": 344
},
{
"epoch": 1.1979166666666667,
"grad_norm": 0.6376467029816646,
"learning_rate": 7.534517909739312e-05,
"loss": 0.3982,
"step": 345
},
{
"epoch": 1.2013888888888888,
"grad_norm": 0.5931365038229,
"learning_rate": 7.529967861574487e-05,
"loss": 0.4077,
"step": 346
},
{
"epoch": 1.2048611111111112,
"grad_norm": 0.6606373517604195,
"learning_rate": 7.525397070948716e-05,
"loss": 0.4088,
"step": 347
},
{
"epoch": 1.2083333333333333,
"grad_norm": 0.8320458571146847,
"learning_rate": 7.520805564720444e-05,
"loss": 0.4018,
"step": 348
},
{
"epoch": 1.2118055555555556,
"grad_norm": 0.8998502488634423,
"learning_rate": 7.516193369869846e-05,
"loss": 0.4091,
"step": 349
},
{
"epoch": 1.2152777777777777,
"grad_norm": 0.8032832906987262,
"learning_rate": 7.511560513498658e-05,
"loss": 0.3993,
"step": 350
},
{
"epoch": 1.21875,
"grad_norm": 0.6642271849354356,
"learning_rate": 7.506907022830032e-05,
"loss": 0.3994,
"step": 351
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.564826521818374,
"learning_rate": 7.502232925208365e-05,
"loss": 0.399,
"step": 352
},
{
"epoch": 1.2256944444444444,
"grad_norm": 0.47406836067731883,
"learning_rate": 7.497538248099144e-05,
"loss": 0.3943,
"step": 353
},
{
"epoch": 1.2291666666666667,
"grad_norm": 0.46564678159712414,
"learning_rate": 7.492823019088785e-05,
"loss": 0.3979,
"step": 354
},
{
"epoch": 1.2326388888888888,
"grad_norm": 0.5727503778599136,
"learning_rate": 7.488087265884466e-05,
"loss": 0.3972,
"step": 355
},
{
"epoch": 1.2361111111111112,
"grad_norm": 0.5506165925693018,
"learning_rate": 7.483331016313969e-05,
"loss": 0.4008,
"step": 356
},
{
"epoch": 1.2395833333333333,
"grad_norm": 0.5430765774399919,
"learning_rate": 7.478554298325517e-05,
"loss": 0.4005,
"step": 357
},
{
"epoch": 1.2430555555555556,
"grad_norm": 0.5256534569365339,
"learning_rate": 7.473757139987602e-05,
"loss": 0.394,
"step": 358
},
{
"epoch": 1.2465277777777777,
"grad_norm": 0.5452588129629314,
"learning_rate": 7.468939569488833e-05,
"loss": 0.4006,
"step": 359
},
{
"epoch": 1.25,
"grad_norm": 0.5375005795015402,
"learning_rate": 7.464101615137756e-05,
"loss": 0.4002,
"step": 360
},
{
"epoch": 1.2534722222222223,
"grad_norm": 0.6640797930885677,
"learning_rate": 7.459243305362697e-05,
"loss": 0.3965,
"step": 361
},
{
"epoch": 1.2569444444444444,
"grad_norm": 6.509958727869088e+28,
"learning_rate": 7.454364668711595e-05,
"loss": 0.3984,
"step": 362
},
{
"epoch": 1.2604166666666667,
"grad_norm": 1.8590515577626585,
"learning_rate": 7.44946573385183e-05,
"loss": 0.416,
"step": 363
},
{
"epoch": 1.2638888888888888,
"grad_norm": 0.7545962814502798,
"learning_rate": 7.444546529570055e-05,
"loss": 0.4087,
"step": 364
},
{
"epoch": 1.2673611111111112,
"grad_norm": 0.9078399896381715,
"learning_rate": 7.439607084772032e-05,
"loss": 0.4021,
"step": 365
},
{
"epoch": 1.2708333333333333,
"grad_norm": 0.9915786959692952,
"learning_rate": 7.434647428482453e-05,
"loss": 0.4076,
"step": 366
},
{
"epoch": 1.2743055555555556,
"grad_norm": 1.1432908983480914,
"learning_rate": 7.42966758984478e-05,
"loss": 0.411,
"step": 367
},
{
"epoch": 1.2777777777777777,
"grad_norm": 0.7908216713685846,
"learning_rate": 7.424667598121067e-05,
"loss": 0.4048,
"step": 368
},
{
"epoch": 1.28125,
"grad_norm": 0.7840804475389772,
"learning_rate": 7.419647482691788e-05,
"loss": 0.3979,
"step": 369
},
{
"epoch": 1.2847222222222223,
"grad_norm": 0.7927335911266363,
"learning_rate": 7.414607273055666e-05,
"loss": 0.4041,
"step": 370
},
{
"epoch": 1.2881944444444444,
"grad_norm": 0.7111887896878814,
"learning_rate": 7.409546998829503e-05,
"loss": 0.3922,
"step": 371
},
{
"epoch": 1.2916666666666667,
"grad_norm": 0.7947054781680923,
"learning_rate": 7.404466689747999e-05,
"loss": 0.4059,
"step": 372
},
{
"epoch": 1.2951388888888888,
"grad_norm": 0.7019883001980964,
"learning_rate": 7.399366375663584e-05,
"loss": 0.4024,
"step": 373
},
{
"epoch": 1.2986111111111112,
"grad_norm": 0.6218653316631517,
"learning_rate": 7.394246086546236e-05,
"loss": 0.4013,
"step": 374
},
{
"epoch": 1.3020833333333333,
"grad_norm": 0.7190549947235102,
"learning_rate": 7.389105852483312e-05,
"loss": 0.4069,
"step": 375
},
{
"epoch": 1.3055555555555556,
"grad_norm": 0.768859503130449,
"learning_rate": 7.383945703679365e-05,
"loss": 0.3958,
"step": 376
},
{
"epoch": 1.3090277777777777,
"grad_norm": 0.534674762997295,
"learning_rate": 7.37876567045597e-05,
"loss": 0.3969,
"step": 377
},
{
"epoch": 1.3125,
"grad_norm": 0.5445743842452613,
"learning_rate": 7.373565783251544e-05,
"loss": 0.4004,
"step": 378
},
{
"epoch": 1.3159722222222223,
"grad_norm": 0.49103846814991575,
"learning_rate": 7.368346072621169e-05,
"loss": 0.3961,
"step": 379
},
{
"epoch": 1.3194444444444444,
"grad_norm": 0.47339147058706754,
"learning_rate": 7.363106569236413e-05,
"loss": 0.4058,
"step": 380
},
{
"epoch": 1.3229166666666667,
"grad_norm": 0.4912336379485776,
"learning_rate": 7.357847303885146e-05,
"loss": 0.3935,
"step": 381
},
{
"epoch": 1.3263888888888888,
"grad_norm": 0.47424561892980627,
"learning_rate": 7.352568307471363e-05,
"loss": 0.3962,
"step": 382
},
{
"epoch": 1.3298611111111112,
"grad_norm": 0.4530340325962746,
"learning_rate": 7.347269611014997e-05,
"loss": 0.4043,
"step": 383
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.4950169140082056,
"learning_rate": 7.341951245651747e-05,
"loss": 0.4042,
"step": 384
},
{
"epoch": 1.3368055555555556,
"grad_norm": 0.5020864609707268,
"learning_rate": 7.336613242632882e-05,
"loss": 0.3981,
"step": 385
},
{
"epoch": 1.3402777777777777,
"grad_norm": 0.3617561837680056,
"learning_rate": 7.33125563332507e-05,
"loss": 0.3943,
"step": 386
},
{
"epoch": 1.34375,
"grad_norm": 0.39426317679870326,
"learning_rate": 7.325878449210182e-05,
"loss": 0.4017,
"step": 387
},
{
"epoch": 1.3472222222222223,
"grad_norm": 0.36781313949402294,
"learning_rate": 7.320481721885116e-05,
"loss": 0.4054,
"step": 388
},
{
"epoch": 1.3506944444444444,
"grad_norm": 0.3743748114329641,
"learning_rate": 7.315065483061608e-05,
"loss": 0.3972,
"step": 389
},
{
"epoch": 1.3541666666666667,
"grad_norm": 0.4147718236807753,
"learning_rate": 7.309629764566042e-05,
"loss": 0.3942,
"step": 390
},
{
"epoch": 1.3576388888888888,
"grad_norm": 0.466497809382821,
"learning_rate": 7.304174598339274e-05,
"loss": 0.3948,
"step": 391
},
{
"epoch": 1.3611111111111112,
"grad_norm": 0.4701553681056374,
"learning_rate": 7.298700016436427e-05,
"loss": 0.3993,
"step": 392
},
{
"epoch": 1.3645833333333333,
"grad_norm": 0.5674005815206642,
"learning_rate": 7.293206051026722e-05,
"loss": 0.4068,
"step": 393
},
{
"epoch": 1.3680555555555556,
"grad_norm": 0.7445442589940026,
"learning_rate": 7.287692734393273e-05,
"loss": 0.3935,
"step": 394
},
{
"epoch": 1.3715277777777777,
"grad_norm": 0.986306413661404,
"learning_rate": 7.282160098932906e-05,
"loss": 0.3977,
"step": 395
},
{
"epoch": 1.375,
"grad_norm": 1.1477021999284092,
"learning_rate": 7.276608177155968e-05,
"loss": 0.4049,
"step": 396
},
{
"epoch": 1.3784722222222223,
"grad_norm": 0.6486212053686043,
"learning_rate": 7.271037001686132e-05,
"loss": 0.3968,
"step": 397
},
{
"epoch": 1.3819444444444444,
"grad_norm": 0.3729943544399879,
"learning_rate": 7.265446605260208e-05,
"loss": 0.3968,
"step": 398
},
{
"epoch": 1.3854166666666667,
"grad_norm": 0.6010239194946239,
"learning_rate": 7.259837020727953e-05,
"loss": 0.3949,
"step": 399
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.753191689196547,
"learning_rate": 7.254208281051871e-05,
"loss": 0.3985,
"step": 400
},
{
"epoch": 1.3923611111111112,
"grad_norm": 0.6980861620307499,
"learning_rate": 7.248560419307028e-05,
"loss": 0.3949,
"step": 401
},
{
"epoch": 1.3958333333333333,
"grad_norm": 0.5625805147809977,
"learning_rate": 7.242893468680849e-05,
"loss": 0.3965,
"step": 402
},
{
"epoch": 1.3993055555555556,
"grad_norm": 0.5129296233655678,
"learning_rate": 7.237207462472933e-05,
"loss": 0.3999,
"step": 403
},
{
"epoch": 1.4027777777777777,
"grad_norm": 0.5019690768893361,
"learning_rate": 7.231502434094845e-05,
"loss": 0.3967,
"step": 404
},
{
"epoch": 1.40625,
"grad_norm": 0.5198379539446143,
"learning_rate": 7.225778417069932e-05,
"loss": 0.3932,
"step": 405
},
{
"epoch": 1.4097222222222223,
"grad_norm": 0.5178789165907579,
"learning_rate": 7.220035445033114e-05,
"loss": 0.3943,
"step": 406
},
{
"epoch": 1.4131944444444444,
"grad_norm": 0.37822059845389094,
"learning_rate": 7.2142735517307e-05,
"loss": 0.3906,
"step": 407
},
{
"epoch": 1.4166666666666667,
"grad_norm": 0.44150796530328035,
"learning_rate": 7.208492771020176e-05,
"loss": 0.3944,
"step": 408
},
{
"epoch": 1.4201388888888888,
"grad_norm": 0.5232356469810064,
"learning_rate": 7.202693136870016e-05,
"loss": 0.3865,
"step": 409
},
{
"epoch": 1.4236111111111112,
"grad_norm": 0.46523603602882435,
"learning_rate": 7.196874683359479e-05,
"loss": 0.3989,
"step": 410
},
{
"epoch": 1.4270833333333333,
"grad_norm": 0.30375565178239217,
"learning_rate": 7.191037444678407e-05,
"loss": 0.4039,
"step": 411
},
{
"epoch": 1.4305555555555556,
"grad_norm": 0.4400233462298016,
"learning_rate": 7.185181455127023e-05,
"loss": 0.3908,
"step": 412
},
{
"epoch": 1.4340277777777777,
"grad_norm": 0.5419376249064612,
"learning_rate": 7.179306749115739e-05,
"loss": 0.3961,
"step": 413
},
{
"epoch": 1.4375,
"grad_norm": 0.5049517808866749,
"learning_rate": 7.173413361164941e-05,
"loss": 0.39,
"step": 414
},
{
"epoch": 1.4409722222222223,
"grad_norm": 0.5615540577656827,
"learning_rate": 7.167501325904795e-05,
"loss": 0.3977,
"step": 415
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.677673898490289,
"learning_rate": 7.161570678075038e-05,
"loss": 0.3941,
"step": 416
},
{
"epoch": 1.4479166666666667,
"grad_norm": 0.7457041655072583,
"learning_rate": 7.155621452524779e-05,
"loss": 0.3982,
"step": 417
},
{
"epoch": 1.4513888888888888,
"grad_norm": 0.7907637822261093,
"learning_rate": 7.14965368421229e-05,
"loss": 0.4062,
"step": 418
},
{
"epoch": 1.4548611111111112,
"grad_norm": 0.8775535799179068,
"learning_rate": 7.143667408204803e-05,
"loss": 0.4041,
"step": 419
},
{
"epoch": 1.4583333333333333,
"grad_norm": 0.8830905484351036,
"learning_rate": 7.137662659678303e-05,
"loss": 0.398,
"step": 420
},
{
"epoch": 1.4618055555555556,
"grad_norm": 0.64842475071273,
"learning_rate": 7.131639473917321e-05,
"loss": 0.3998,
"step": 421
},
{
"epoch": 1.4652777777777777,
"grad_norm": 0.44157655714059424,
"learning_rate": 7.12559788631473e-05,
"loss": 0.3943,
"step": 422
},
{
"epoch": 1.46875,
"grad_norm": 0.40931880728742037,
"learning_rate": 7.119537932371527e-05,
"loss": 0.3975,
"step": 423
},
{
"epoch": 1.4722222222222223,
"grad_norm": 0.4820032334826367,
"learning_rate": 7.113459647696641e-05,
"loss": 0.3935,
"step": 424
},
{
"epoch": 1.4756944444444444,
"grad_norm": 0.588832851916906,
"learning_rate": 7.107363068006706e-05,
"loss": 0.3961,
"step": 425
},
{
"epoch": 1.4791666666666667,
"grad_norm": 0.5425066819059217,
"learning_rate": 7.101248229125864e-05,
"loss": 0.398,
"step": 426
},
{
"epoch": 1.4826388888888888,
"grad_norm": 0.3955679513420198,
"learning_rate": 7.09511516698555e-05,
"loss": 0.3954,
"step": 427
},
{
"epoch": 1.4861111111111112,
"grad_norm": 0.3169987014319606,
"learning_rate": 7.088963917624277e-05,
"loss": 0.397,
"step": 428
},
{
"epoch": 1.4895833333333333,
"grad_norm": 0.44741101040643333,
"learning_rate": 7.082794517187432e-05,
"loss": 0.3914,
"step": 429
},
{
"epoch": 1.4930555555555556,
"grad_norm": 0.5227023640698025,
"learning_rate": 7.076607001927061e-05,
"loss": 0.3916,
"step": 430
},
{
"epoch": 1.4965277777777777,
"grad_norm": 0.4360893533255743,
"learning_rate": 7.070401408201647e-05,
"loss": 0.3986,
"step": 431
},
{
"epoch": 1.5,
"grad_norm": 0.3479497802804189,
"learning_rate": 7.064177772475912e-05,
"loss": 0.3958,
"step": 432
},
{
"epoch": 1.5034722222222223,
"grad_norm": 0.5897209351516681,
"learning_rate": 7.057936131320592e-05,
"loss": 0.4036,
"step": 433
},
{
"epoch": 1.5069444444444444,
"grad_norm": 0.7047406344920322,
"learning_rate": 7.051676521412221e-05,
"loss": 0.3949,
"step": 434
},
{
"epoch": 1.5104166666666665,
"grad_norm": 0.5517206030087352,
"learning_rate": 7.045398979532925e-05,
"loss": 0.4033,
"step": 435
},
{
"epoch": 1.5138888888888888,
"grad_norm": 0.36054079949957824,
"learning_rate": 7.039103542570199e-05,
"loss": 0.3958,
"step": 436
},
{
"epoch": 1.5173611111111112,
"grad_norm": 0.29130083311637406,
"learning_rate": 7.032790247516686e-05,
"loss": 0.3968,
"step": 437
},
{
"epoch": 1.5208333333333335,
"grad_norm": 0.39405579534978225,
"learning_rate": 7.026459131469972e-05,
"loss": 0.4093,
"step": 438
},
{
"epoch": 1.5243055555555556,
"grad_norm": 0.46575221665690475,
"learning_rate": 7.020110231632357e-05,
"loss": 0.4012,
"step": 439
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.46461614609662905,
"learning_rate": 7.013743585310642e-05,
"loss": 0.3967,
"step": 440
},
{
"epoch": 1.53125,
"grad_norm": 0.48150122610101587,
"learning_rate": 7.00735922991591e-05,
"loss": 0.3993,
"step": 441
},
{
"epoch": 1.5347222222222223,
"grad_norm": 0.4642986163233927,
"learning_rate": 7.000957202963298e-05,
"loss": 0.3956,
"step": 442
},
{
"epoch": 1.5381944444444444,
"grad_norm": 0.397653915493047,
"learning_rate": 6.99453754207179e-05,
"loss": 0.3986,
"step": 443
},
{
"epoch": 1.5416666666666665,
"grad_norm": 0.4043417877836226,
"learning_rate": 6.988100284963985e-05,
"loss": 0.3941,
"step": 444
},
{
"epoch": 1.5451388888888888,
"grad_norm": 0.34751919690711397,
"learning_rate": 6.981645469465878e-05,
"loss": 0.3957,
"step": 445
},
{
"epoch": 1.5486111111111112,
"grad_norm": 0.45683297371671877,
"learning_rate": 6.975173133506646e-05,
"loss": 0.3937,
"step": 446
},
{
"epoch": 1.5520833333333335,
"grad_norm": 0.5944612746389557,
"learning_rate": 6.968683315118407e-05,
"loss": 0.3961,
"step": 447
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.7165358032510882,
"learning_rate": 6.96217605243602e-05,
"loss": 0.3976,
"step": 448
},
{
"epoch": 1.5590277777777777,
"grad_norm": 0.8996970961809184,
"learning_rate": 6.955651383696836e-05,
"loss": 0.3982,
"step": 449
},
{
"epoch": 1.5625,
"grad_norm": 1.228115875802595,
"learning_rate": 6.949109347240496e-05,
"loss": 0.3969,
"step": 450
},
{
"epoch": 1.5659722222222223,
"grad_norm": 0.7021067773647806,
"learning_rate": 6.942549981508691e-05,
"loss": 0.3948,
"step": 451
},
{
"epoch": 1.5694444444444444,
"grad_norm": 0.4195458938653693,
"learning_rate": 6.935973325044941e-05,
"loss": 0.4029,
"step": 452
},
{
"epoch": 1.5729166666666665,
"grad_norm": 0.7064275771918259,
"learning_rate": 6.929379416494369e-05,
"loss": 0.391,
"step": 453
},
{
"epoch": 1.5763888888888888,
"grad_norm": 1.017357717230314,
"learning_rate": 6.92276829460347e-05,
"loss": 0.4085,
"step": 454
},
{
"epoch": 1.5798611111111112,
"grad_norm": 0.9563799327415947,
"learning_rate": 6.91613999821989e-05,
"loss": 0.3951,
"step": 455
},
{
"epoch": 1.5833333333333335,
"grad_norm": 0.7032227970552669,
"learning_rate": 6.909494566292195e-05,
"loss": 0.3954,
"step": 456
},
{
"epoch": 1.5868055555555556,
"grad_norm": 0.49934071955688775,
"learning_rate": 6.902832037869637e-05,
"loss": 0.3918,
"step": 457
},
{
"epoch": 1.5902777777777777,
"grad_norm": 0.4664001240844466,
"learning_rate": 6.89615245210193e-05,
"loss": 0.3938,
"step": 458
},
{
"epoch": 1.59375,
"grad_norm": 0.6434919721382892,
"learning_rate": 6.889455848239022e-05,
"loss": 0.4072,
"step": 459
},
{
"epoch": 1.5972222222222223,
"grad_norm": 0.7719379494052273,
"learning_rate": 6.882742265630859e-05,
"loss": 0.3938,
"step": 460
},
{
"epoch": 1.6006944444444444,
"grad_norm": 0.8059082567281859,
"learning_rate": 6.876011743727154e-05,
"loss": 0.3995,
"step": 461
},
{
"epoch": 1.6041666666666665,
"grad_norm": 0.6320412549731026,
"learning_rate": 6.869264322077158e-05,
"loss": 0.3908,
"step": 462
},
{
"epoch": 1.6076388888888888,
"grad_norm": 0.4222247764158233,
"learning_rate": 6.86250004032943e-05,
"loss": 0.3929,
"step": 463
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.3699988949394749,
"learning_rate": 6.855718938231597e-05,
"loss": 0.389,
"step": 464
},
{
"epoch": 1.6145833333333335,
"grad_norm": 0.4049406846507113,
"learning_rate": 6.848921055630125e-05,
"loss": 0.3853,
"step": 465
},
{
"epoch": 1.6180555555555556,
"grad_norm": 0.4872135649150802,
"learning_rate": 6.842106432470084e-05,
"loss": 0.3966,
"step": 466
},
{
"epoch": 1.6215277777777777,
"grad_norm": 0.5738461208479633,
"learning_rate": 6.835275108794915e-05,
"loss": 0.4036,
"step": 467
},
{
"epoch": 1.625,
"grad_norm": 0.475084958136699,
"learning_rate": 6.828427124746191e-05,
"loss": 0.3943,
"step": 468
},
{
"epoch": 1.6284722222222223,
"grad_norm": 0.3344824698389433,
"learning_rate": 6.821562520563383e-05,
"loss": 0.3929,
"step": 469
},
{
"epoch": 1.6319444444444444,
"grad_norm": 0.34048825094857005,
"learning_rate": 6.814681336583624e-05,
"loss": 0.3953,
"step": 470
},
{
"epoch": 1.6354166666666665,
"grad_norm": 0.34332483302626543,
"learning_rate": 6.807783613241474e-05,
"loss": 0.3913,
"step": 471
},
{
"epoch": 1.6388888888888888,
"grad_norm": 0.3722370480435148,
"learning_rate": 6.800869391068674e-05,
"loss": 0.3966,
"step": 472
},
{
"epoch": 1.6423611111111112,
"grad_norm": 0.3786915178498432,
"learning_rate": 6.793938710693922e-05,
"loss": 0.3932,
"step": 473
},
{
"epoch": 1.6458333333333335,
"grad_norm": 0.40117165891657924,
"learning_rate": 6.786991612842621e-05,
"loss": 0.3918,
"step": 474
},
{
"epoch": 1.6493055555555556,
"grad_norm": 0.4047825538421503,
"learning_rate": 6.780028138336643e-05,
"loss": 0.3931,
"step": 475
},
{
"epoch": 1.6527777777777777,
"grad_norm": 0.4073725341386445,
"learning_rate": 6.773048328094097e-05,
"loss": 0.3983,
"step": 476
},
{
"epoch": 1.65625,
"grad_norm": 0.4310315011113548,
"learning_rate": 6.766052223129079e-05,
"loss": 0.392,
"step": 477
},
{
"epoch": 1.6597222222222223,
"grad_norm": 0.38768773081292784,
"learning_rate": 6.759039864551431e-05,
"loss": 0.3876,
"step": 478
},
{
"epoch": 1.6631944444444444,
"grad_norm": 0.40706945931775057,
"learning_rate": 6.752011293566511e-05,
"loss": 0.395,
"step": 479
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5687554325547544,
"learning_rate": 6.744966551474936e-05,
"loss": 0.3946,
"step": 480
},
{
"epoch": 1.6701388888888888,
"grad_norm": 0.6755155309069832,
"learning_rate": 6.737905679672347e-05,
"loss": 0.3853,
"step": 481
},
{
"epoch": 1.6736111111111112,
"grad_norm": 0.5807751535885235,
"learning_rate": 6.730828719649171e-05,
"loss": 0.3966,
"step": 482
},
{
"epoch": 1.6770833333333335,
"grad_norm": 0.37771332961010123,
"learning_rate": 6.723735712990362e-05,
"loss": 0.3902,
"step": 483
},
{
"epoch": 1.6805555555555556,
"grad_norm": 0.32353917470642674,
"learning_rate": 6.716626701375174e-05,
"loss": 0.3902,
"step": 484
},
{
"epoch": 1.6840277777777777,
"grad_norm": 0.3992076255136683,
"learning_rate": 6.7095017265769e-05,
"loss": 0.3974,
"step": 485
},
{
"epoch": 1.6875,
"grad_norm": 0.48039669200418916,
"learning_rate": 6.702360830462642e-05,
"loss": 0.3938,
"step": 486
},
{
"epoch": 1.6909722222222223,
"grad_norm": 0.60640709998847,
"learning_rate": 6.695204054993051e-05,
"loss": 0.397,
"step": 487
},
{
"epoch": 1.6944444444444444,
"grad_norm": 0.7118701736955534,
"learning_rate": 6.688031442222091e-05,
"loss": 0.3948,
"step": 488
},
{
"epoch": 1.6979166666666665,
"grad_norm": 0.785511616790005,
"learning_rate": 6.680843034296785e-05,
"loss": 0.3958,
"step": 489
},
{
"epoch": 1.7013888888888888,
"grad_norm": 0.8490039815668733,
"learning_rate": 6.67363887345697e-05,
"loss": 0.3946,
"step": 490
},
{
"epoch": 1.7048611111111112,
"grad_norm": 0.799012212330246,
"learning_rate": 6.666419002035053e-05,
"loss": 0.4004,
"step": 491
},
{
"epoch": 1.7083333333333335,
"grad_norm": 0.6451959607357418,
"learning_rate": 6.659183462455751e-05,
"loss": 0.3934,
"step": 492
},
{
"epoch": 1.7118055555555556,
"grad_norm": 0.4397460593795287,
"learning_rate": 6.651932297235858e-05,
"loss": 0.3968,
"step": 493
},
{
"epoch": 1.7152777777777777,
"grad_norm": 0.4984518575640306,
"learning_rate": 6.644665548983973e-05,
"loss": 0.3838,
"step": 494
},
{
"epoch": 1.71875,
"grad_norm": 0.5624968662346395,
"learning_rate": 6.637383260400276e-05,
"loss": 0.3882,
"step": 495
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.4976012577824521,
"learning_rate": 6.630085474276256e-05,
"loss": 0.3876,
"step": 496
},
{
"epoch": 1.7256944444444444,
"grad_norm": 0.40732966615342625,
"learning_rate": 6.622772233494467e-05,
"loss": 0.3967,
"step": 497
},
{
"epoch": 1.7291666666666665,
"grad_norm": 0.40592882952930137,
"learning_rate": 6.615443581028279e-05,
"loss": 0.396,
"step": 498
},
{
"epoch": 1.7326388888888888,
"grad_norm": 0.47886476411037715,
"learning_rate": 6.608099559941623e-05,
"loss": 0.3892,
"step": 499
},
{
"epoch": 1.7361111111111112,
"grad_norm": 0.41129655248344593,
"learning_rate": 6.600740213388735e-05,
"loss": 0.3837,
"step": 500
},
{
"epoch": 1.7395833333333335,
"grad_norm": 0.3054387826354855,
"learning_rate": 6.593365584613906e-05,
"loss": 0.3946,
"step": 501
},
{
"epoch": 1.7430555555555556,
"grad_norm": 0.43919149776524113,
"learning_rate": 6.585975716951226e-05,
"loss": 0.3931,
"step": 502
},
{
"epoch": 1.7465277777777777,
"grad_norm": 0.44650735659448654,
"learning_rate": 6.578570653824335e-05,
"loss": 0.3967,
"step": 503
},
{
"epoch": 1.75,
"grad_norm": 0.3126097483009025,
"learning_rate": 6.571150438746157e-05,
"loss": 0.3874,
"step": 504
},
{
"epoch": 1.7534722222222223,
"grad_norm": 0.34139547055278535,
"learning_rate": 6.563715115318655e-05,
"loss": 0.3958,
"step": 505
},
{
"epoch": 1.7569444444444444,
"grad_norm": 0.4346890170698485,
"learning_rate": 6.556264727232567e-05,
"loss": 0.3913,
"step": 506
},
{
"epoch": 1.7604166666666665,
"grad_norm": 0.32111684006814456,
"learning_rate": 6.548799318267154e-05,
"loss": 0.3914,
"step": 507
},
{
"epoch": 1.7638888888888888,
"grad_norm": 0.24993037577302774,
"learning_rate": 6.54131893228994e-05,
"loss": 0.3903,
"step": 508
},
{
"epoch": 1.7673611111111112,
"grad_norm": 0.4529309860194363,
"learning_rate": 6.533823613256461e-05,
"loss": 0.3902,
"step": 509
},
{
"epoch": 1.7708333333333335,
"grad_norm": 0.3939925676268099,
"learning_rate": 6.526313405209991e-05,
"loss": 0.3932,
"step": 510
},
{
"epoch": 1.7743055555555556,
"grad_norm": 0.2977509306937723,
"learning_rate": 6.518788352281303e-05,
"loss": 0.3883,
"step": 511
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.3926989264025188,
"learning_rate": 6.511248498688396e-05,
"loss": 0.3993,
"step": 512
},
{
"epoch": 1.78125,
"grad_norm": 0.4121738758470045,
"learning_rate": 6.503693888736238e-05,
"loss": 0.3897,
"step": 513
},
{
"epoch": 1.7847222222222223,
"grad_norm": 0.3360981108450817,
"learning_rate": 6.49612456681651e-05,
"loss": 0.3882,
"step": 514
},
{
"epoch": 1.7881944444444444,
"grad_norm": 0.3938069249933719,
"learning_rate": 6.488540577407337e-05,
"loss": 0.3901,
"step": 515
},
{
"epoch": 1.7916666666666665,
"grad_norm": 0.46994454215492776,
"learning_rate": 6.480941965073041e-05,
"loss": 0.39,
"step": 516
},
{
"epoch": 1.7951388888888888,
"grad_norm": 0.5388397889423108,
"learning_rate": 6.473328774463861e-05,
"loss": 0.3942,
"step": 517
},
{
"epoch": 1.7986111111111112,
"grad_norm": 0.6472064801068331,
"learning_rate": 6.465701050315702e-05,
"loss": 0.3856,
"step": 518
},
{
"epoch": 1.8020833333333335,
"grad_norm": 0.6669761089856858,
"learning_rate": 6.458058837449871e-05,
"loss": 0.3987,
"step": 519
},
{
"epoch": 1.8055555555555556,
"grad_norm": 0.7121336419902075,
"learning_rate": 6.450402180772811e-05,
"loss": 0.3969,
"step": 520
},
{
"epoch": 1.8090277777777777,
"grad_norm": 0.7825620371561279,
"learning_rate": 6.44273112527584e-05,
"loss": 0.3828,
"step": 521
},
{
"epoch": 1.8125,
"grad_norm": 0.7356857985595523,
"learning_rate": 6.435045716034883e-05,
"loss": 0.3908,
"step": 522
},
{
"epoch": 1.8159722222222223,
"grad_norm": 0.6187744972361597,
"learning_rate": 6.427345998210209e-05,
"loss": 0.389,
"step": 523
},
{
"epoch": 1.8194444444444444,
"grad_norm": 0.48962757707999305,
"learning_rate": 6.419632017046167e-05,
"loss": 0.3879,
"step": 524
},
{
"epoch": 1.8229166666666665,
"grad_norm": 0.38371647053249225,
"learning_rate": 6.411903817870919e-05,
"loss": 0.3921,
"step": 525
},
{
"epoch": 1.8263888888888888,
"grad_norm": 0.3913261530262924,
"learning_rate": 6.404161446096172e-05,
"loss": 0.3836,
"step": 526
},
{
"epoch": 1.8298611111111112,
"grad_norm": 0.6113089881845829,
"learning_rate": 6.396404947216915e-05,
"loss": 0.391,
"step": 527
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.7838044752642598,
"learning_rate": 6.388634366811146e-05,
"loss": 0.3936,
"step": 528
},
{
"epoch": 1.8368055555555556,
"grad_norm": 0.7608944646229419,
"learning_rate": 6.38084975053961e-05,
"loss": 0.3904,
"step": 529
},
{
"epoch": 1.8402777777777777,
"grad_norm": 0.5793469586106695,
"learning_rate": 6.37305114414553e-05,
"loss": 0.3903,
"step": 530
},
{
"epoch": 1.84375,
"grad_norm": 0.4597220685898171,
"learning_rate": 6.365238593454331e-05,
"loss": 0.3984,
"step": 531
},
{
"epoch": 1.8472222222222223,
"grad_norm": 0.5211196641640181,
"learning_rate": 6.35741214437338e-05,
"loss": 0.3915,
"step": 532
},
{
"epoch": 1.8506944444444444,
"grad_norm": 0.5511952598639375,
"learning_rate": 6.349571842891713e-05,
"loss": 0.4013,
"step": 533
},
{
"epoch": 1.8541666666666665,
"grad_norm": 0.4890246265904626,
"learning_rate": 6.341717735079763e-05,
"loss": 0.3928,
"step": 534
},
{
"epoch": 1.8576388888888888,
"grad_norm": 0.44583161362467083,
"learning_rate": 6.333849867089089e-05,
"loss": 0.395,
"step": 535
},
{
"epoch": 1.8611111111111112,
"grad_norm": 0.411850148556461,
"learning_rate": 6.325968285152107e-05,
"loss": 0.3887,
"step": 536
},
{
"epoch": 1.8645833333333335,
"grad_norm": 0.36467455928426995,
"learning_rate": 6.318073035581821e-05,
"loss": 0.3927,
"step": 537
},
{
"epoch": 1.8680555555555556,
"grad_norm": 0.4235914855953697,
"learning_rate": 6.31016416477154e-05,
"loss": 0.3829,
"step": 538
},
{
"epoch": 1.8715277777777777,
"grad_norm": 0.45603956391146694,
"learning_rate": 6.302241719194623e-05,
"loss": 0.387,
"step": 539
},
{
"epoch": 1.875,
"grad_norm": 0.4345935351579549,
"learning_rate": 6.294305745404185e-05,
"loss": 0.3921,
"step": 540
},
{
"epoch": 1.8784722222222223,
"grad_norm": 0.43172199957177415,
"learning_rate": 6.286356290032842e-05,
"loss": 0.3865,
"step": 541
},
{
"epoch": 1.8819444444444444,
"grad_norm": 0.37950386852749723,
"learning_rate": 6.278393399792426e-05,
"loss": 0.3924,
"step": 542
},
{
"epoch": 1.8854166666666665,
"grad_norm": 0.2996805079524871,
"learning_rate": 6.270417121473716e-05,
"loss": 0.3868,
"step": 543
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.3427611051054387,
"learning_rate": 6.262427501946155e-05,
"loss": 0.3955,
"step": 544
},
{
"epoch": 1.8923611111111112,
"grad_norm": 0.4265987354144226,
"learning_rate": 6.254424588157587e-05,
"loss": 0.3922,
"step": 545
},
{
"epoch": 1.8958333333333335,
"grad_norm": 0.42877721944052416,
"learning_rate": 6.246408427133972e-05,
"loss": 0.3952,
"step": 546
},
{
"epoch": 1.8993055555555556,
"grad_norm": 0.36928582871837345,
"learning_rate": 6.238379065979111e-05,
"loss": 0.3921,
"step": 547
},
{
"epoch": 1.9027777777777777,
"grad_norm": 0.2820191982443896,
"learning_rate": 6.230336551874372e-05,
"loss": 0.3858,
"step": 548
},
{
"epoch": 1.90625,
"grad_norm": 0.3068943917467818,
"learning_rate": 6.22228093207841e-05,
"loss": 0.3908,
"step": 549
},
{
"epoch": 1.9097222222222223,
"grad_norm": 0.36438451862287263,
"learning_rate": 6.214212253926891e-05,
"loss": 0.3903,
"step": 550
},
{
"epoch": 1.9131944444444444,
"grad_norm": 0.3919389997262451,
"learning_rate": 6.206130564832211e-05,
"loss": 0.3911,
"step": 551
},
{
"epoch": 1.9166666666666665,
"grad_norm": 0.37993753769113087,
"learning_rate": 6.198035912283225e-05,
"loss": 0.3888,
"step": 552
},
{
"epoch": 1.9201388888888888,
"grad_norm": 0.3167766051429095,
"learning_rate": 6.189928343844958e-05,
"loss": 0.3888,
"step": 553
},
{
"epoch": 1.9236111111111112,
"grad_norm": 0.3533856097778807,
"learning_rate": 6.18180790715833e-05,
"loss": 0.3868,
"step": 554
},
{
"epoch": 1.9270833333333335,
"grad_norm": 0.3720825699174947,
"learning_rate": 6.17367464993988e-05,
"loss": 0.3926,
"step": 555
},
{
"epoch": 1.9305555555555556,
"grad_norm": 0.390372909339937,
"learning_rate": 6.165528619981479e-05,
"loss": 0.3895,
"step": 556
},
{
"epoch": 1.9340277777777777,
"grad_norm": 0.43884352775151003,
"learning_rate": 6.157369865150052e-05,
"loss": 0.3932,
"step": 557
},
{
"epoch": 1.9375,
"grad_norm": 0.42288736407700567,
"learning_rate": 6.149198433387297e-05,
"loss": 0.3958,
"step": 558
},
{
"epoch": 1.9409722222222223,
"grad_norm": 0.40451538645376955,
"learning_rate": 6.141014372709402e-05,
"loss": 0.3936,
"step": 559
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.354175463043975,
"learning_rate": 6.132817731206766e-05,
"loss": 0.3904,
"step": 560
},
{
"epoch": 1.9479166666666665,
"grad_norm": 0.3780580927853469,
"learning_rate": 6.124608557043713e-05,
"loss": 0.3936,
"step": 561
},
{
"epoch": 1.9513888888888888,
"grad_norm": 0.4013091767144706,
"learning_rate": 6.116386898458211e-05,
"loss": 0.3908,
"step": 562
},
{
"epoch": 1.9548611111111112,
"grad_norm": 0.3752656113478743,
"learning_rate": 6.108152803761585e-05,
"loss": 0.388,
"step": 563
},
{
"epoch": 1.9583333333333335,
"grad_norm": 0.42849730360980076,
"learning_rate": 6.099906321338241e-05,
"loss": 0.3883,
"step": 564
},
{
"epoch": 1.9618055555555556,
"grad_norm": 0.5136107162433327,
"learning_rate": 6.091647499645373e-05,
"loss": 0.3936,
"step": 565
},
{
"epoch": 1.9652777777777777,
"grad_norm": 0.4401404773608974,
"learning_rate": 6.08337638721268e-05,
"loss": 0.387,
"step": 566
},
{
"epoch": 1.96875,
"grad_norm": 0.3578780937355148,
"learning_rate": 6.075093032642087e-05,
"loss": 0.3798,
"step": 567
},
{
"epoch": 1.9722222222222223,
"grad_norm": 0.34936357235449345,
"learning_rate": 6.0667974846074524e-05,
"loss": 0.3965,
"step": 568
},
{
"epoch": 1.9756944444444444,
"grad_norm": 0.40689796724445154,
"learning_rate": 6.058489791854286e-05,
"loss": 0.3894,
"step": 569
},
{
"epoch": 1.9791666666666665,
"grad_norm": 0.4956233528878605,
"learning_rate": 6.0501700031994613e-05,
"loss": 0.3937,
"step": 570
},
{
"epoch": 1.9826388888888888,
"grad_norm": 0.5833995404348259,
"learning_rate": 6.041838167530927e-05,
"loss": 0.3871,
"step": 571
},
{
"epoch": 1.9861111111111112,
"grad_norm": 0.6022857744397874,
"learning_rate": 6.033494333807422e-05,
"loss": 0.389,
"step": 572
},
{
"epoch": 1.9895833333333335,
"grad_norm": 0.5608017817744333,
"learning_rate": 6.02513855105819e-05,
"loss": 0.3983,
"step": 573
},
{
"epoch": 1.9930555555555556,
"grad_norm": 0.47618308572818047,
"learning_rate": 6.016770868382683e-05,
"loss": 0.3818,
"step": 574
},
{
"epoch": 1.9965277777777777,
"grad_norm": 0.3934864503184335,
"learning_rate": 6.008391334950281e-05,
"loss": 0.385,
"step": 575
},
{
"epoch": 2.0,
"grad_norm": 0.25691954246846876,
"learning_rate": 6.000000000000001e-05,
"loss": 0.3698,
"step": 576
},
{
"epoch": 2.0034722222222223,
"grad_norm": 0.30107102968416166,
"learning_rate": 5.991596912840207e-05,
"loss": 0.3627,
"step": 577
},
{
"epoch": 2.0069444444444446,
"grad_norm": 0.5647121908849111,
"learning_rate": 5.983182122848318e-05,
"loss": 0.3514,
"step": 578
},
{
"epoch": 2.0104166666666665,
"grad_norm": 0.8511792389980791,
"learning_rate": 5.9747556794705213e-05,
"loss": 0.3589,
"step": 579
},
{
"epoch": 2.013888888888889,
"grad_norm": 0.940817594887035,
"learning_rate": 5.9663176322214826e-05,
"loss": 0.3622,
"step": 580
},
{
"epoch": 2.017361111111111,
"grad_norm": 0.7139112695072981,
"learning_rate": 5.957868030684051e-05,
"loss": 0.361,
"step": 581
},
{
"epoch": 2.0208333333333335,
"grad_norm": 0.4910633370182954,
"learning_rate": 5.94940692450897e-05,
"loss": 0.3565,
"step": 582
},
{
"epoch": 2.0243055555555554,
"grad_norm": 0.4269351784997279,
"learning_rate": 5.940934363414586e-05,
"loss": 0.3595,
"step": 583
},
{
"epoch": 2.0277777777777777,
"grad_norm": 0.4872235898220299,
"learning_rate": 5.9324503971865545e-05,
"loss": 0.3587,
"step": 584
},
{
"epoch": 2.03125,
"grad_norm": 0.5995095615790915,
"learning_rate": 5.923955075677551e-05,
"loss": 0.3554,
"step": 585
},
{
"epoch": 2.0347222222222223,
"grad_norm": 0.5648401977971076,
"learning_rate": 5.9154484488069736e-05,
"loss": 0.3581,
"step": 586
},
{
"epoch": 2.0381944444444446,
"grad_norm": 0.4158491724702212,
"learning_rate": 5.9069305665606554e-05,
"loss": 0.3553,
"step": 587
},
{
"epoch": 2.0416666666666665,
"grad_norm": 0.349690330012685,
"learning_rate": 5.8984014789905625e-05,
"loss": 0.3578,
"step": 588
},
{
"epoch": 2.045138888888889,
"grad_norm": 0.4802435080315265,
"learning_rate": 5.8898612362145066e-05,
"loss": 0.3584,
"step": 589
},
{
"epoch": 2.048611111111111,
"grad_norm": 0.48963145307942074,
"learning_rate": 5.8813098884158505e-05,
"loss": 0.3569,
"step": 590
},
{
"epoch": 2.0520833333333335,
"grad_norm": 0.3526405542275553,
"learning_rate": 5.8727474858432085e-05,
"loss": 0.358,
"step": 591
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.33023603489278375,
"learning_rate": 5.8641740788101566e-05,
"loss": 0.3603,
"step": 592
},
{
"epoch": 2.0590277777777777,
"grad_norm": 0.3840854831858298,
"learning_rate": 5.85558971769493e-05,
"loss": 0.3487,
"step": 593
},
{
"epoch": 2.0625,
"grad_norm": 0.4107313120312768,
"learning_rate": 5.846994452940137e-05,
"loss": 0.355,
"step": 594
},
{
"epoch": 2.0659722222222223,
"grad_norm": 0.3145742869476471,
"learning_rate": 5.83838833505245e-05,
"loss": 0.357,
"step": 595
},
{
"epoch": 2.0694444444444446,
"grad_norm": 0.27860334683052107,
"learning_rate": 5.8297714146023236e-05,
"loss": 0.351,
"step": 596
},
{
"epoch": 2.0729166666666665,
"grad_norm": 0.4136824169602067,
"learning_rate": 5.821143742223682e-05,
"loss": 0.3562,
"step": 597
},
{
"epoch": 2.076388888888889,
"grad_norm": 0.3673086857169161,
"learning_rate": 5.812505368613633e-05,
"loss": 0.3495,
"step": 598
},
{
"epoch": 2.079861111111111,
"grad_norm": 0.2862981526340435,
"learning_rate": 5.803856344532166e-05,
"loss": 0.3622,
"step": 599
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.2799086544794607,
"learning_rate": 5.79519672080185e-05,
"loss": 0.3585,
"step": 600
},
{
"epoch": 2.0868055555555554,
"grad_norm": 0.24475419710964016,
"learning_rate": 5.786526548307541e-05,
"loss": 0.3514,
"step": 601
},
{
"epoch": 2.0902777777777777,
"grad_norm": 0.2834139430354975,
"learning_rate": 5.777845877996085e-05,
"loss": 0.3596,
"step": 602
},
{
"epoch": 2.09375,
"grad_norm": 0.32111723120156277,
"learning_rate": 5.7691547608760055e-05,
"loss": 0.3559,
"step": 603
},
{
"epoch": 2.0972222222222223,
"grad_norm": 0.3194256373082478,
"learning_rate": 5.76045324801722e-05,
"loss": 0.3523,
"step": 604
},
{
"epoch": 2.1006944444444446,
"grad_norm": 0.3300710025133727,
"learning_rate": 5.7517413905507286e-05,
"loss": 0.3568,
"step": 605
},
{
"epoch": 2.1041666666666665,
"grad_norm": 0.3503765239910186,
"learning_rate": 5.743019239668318e-05,
"loss": 0.3537,
"step": 606
},
{
"epoch": 2.107638888888889,
"grad_norm": 0.3676525989023615,
"learning_rate": 5.7342868466222616e-05,
"loss": 0.3623,
"step": 607
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.6703499129502645,
"learning_rate": 5.7255442627250146e-05,
"loss": 0.3626,
"step": 608
},
{
"epoch": 2.1145833333333335,
"grad_norm": 0.22185727620726894,
"learning_rate": 5.716791539348917e-05,
"loss": 0.354,
"step": 609
},
{
"epoch": 2.1180555555555554,
"grad_norm": 0.6429072892056448,
"learning_rate": 5.708028727925887e-05,
"loss": 0.3572,
"step": 610
},
{
"epoch": 2.1215277777777777,
"grad_norm": 0.3846890307207904,
"learning_rate": 5.6992558799471226e-05,
"loss": 0.3587,
"step": 611
},
{
"epoch": 2.125,
"grad_norm": 2.143120529808764,
"learning_rate": 5.6904730469627985e-05,
"loss": 0.375,
"step": 612
},
{
"epoch": 2.1284722222222223,
"grad_norm": 0.414767281586357,
"learning_rate": 5.681680280581761e-05,
"loss": 0.3679,
"step": 613
},
{
"epoch": 2.1319444444444446,
"grad_norm": 0.7323559863602489,
"learning_rate": 5.672877632471226e-05,
"loss": 0.3651,
"step": 614
},
{
"epoch": 2.1354166666666665,
"grad_norm": 0.7643817367842332,
"learning_rate": 5.664065154356477e-05,
"loss": 0.3609,
"step": 615
},
{
"epoch": 2.138888888888889,
"grad_norm": 1.9345025495859447,
"learning_rate": 5.6552428980205575e-05,
"loss": 0.372,
"step": 616
},
{
"epoch": 2.142361111111111,
"grad_norm": 208.50573500089143,
"learning_rate": 5.6464109153039695e-05,
"loss": 5.8523,
"step": 617
},
{
"epoch": 2.1458333333333335,
"grad_norm": 24.860622309173138,
"learning_rate": 5.6375692581043705e-05,
"loss": 0.5587,
"step": 618
},
{
"epoch": 2.1493055555555554,
"grad_norm": 24.26248109338951,
"learning_rate": 5.628717978376263e-05,
"loss": 0.7174,
"step": 619
},
{
"epoch": 2.1527777777777777,
"grad_norm": 6.523767538235247,
"learning_rate": 5.619857128130695e-05,
"loss": 0.4476,
"step": 620
},
{
"epoch": 2.15625,
"grad_norm": 6.2813772578997416,
"learning_rate": 5.61098675943495e-05,
"loss": 0.3991,
"step": 621
},
{
"epoch": 2.1597222222222223,
"grad_norm": 0.7292368990201091,
"learning_rate": 5.602106924412243e-05,
"loss": 0.3903,
"step": 622
},
{
"epoch": 2.1631944444444446,
"grad_norm": 1.4353569142671059,
"learning_rate": 5.5932176752414163e-05,
"loss": 0.3951,
"step": 623
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.7524898691540998,
"learning_rate": 5.584319064156628e-05,
"loss": 0.3791,
"step": 624
},
{
"epoch": 2.170138888888889,
"grad_norm": 0.7596150008455383,
"learning_rate": 5.57541114344705e-05,
"loss": 0.3742,
"step": 625
},
{
"epoch": 2.173611111111111,
"grad_norm": 0.6870434643700057,
"learning_rate": 5.566493965456557e-05,
"loss": 0.3786,
"step": 626
},
{
"epoch": 2.1770833333333335,
"grad_norm": 0.4825491333810975,
"learning_rate": 5.5575675825834215e-05,
"loss": 0.3746,
"step": 627
},
{
"epoch": 2.1805555555555554,
"grad_norm": 0.9393542952103341,
"learning_rate": 5.548632047280003e-05,
"loss": 0.3761,
"step": 628
},
{
"epoch": 2.1840277777777777,
"grad_norm": 0.6786528943222451,
"learning_rate": 5.539687412052445e-05,
"loss": 0.3707,
"step": 629
},
{
"epoch": 2.1875,
"grad_norm": 4.3304441633601884,
"learning_rate": 5.5307337294603595e-05,
"loss": 0.3928,
"step": 630
},
{
"epoch": 2.1909722222222223,
"grad_norm": 2.894806364255019,
"learning_rate": 5.521771052116524e-05,
"loss": 0.4186,
"step": 631
},
{
"epoch": 2.1944444444444446,
"grad_norm": 0.8736036582533201,
"learning_rate": 5.5127994326865706e-05,
"loss": 0.3829,
"step": 632
},
{
"epoch": 2.1979166666666665,
"grad_norm": 1.0402302831246584,
"learning_rate": 5.5038189238886724e-05,
"loss": 0.3917,
"step": 633
},
{
"epoch": 2.201388888888889,
"grad_norm": 1.0251763725005574,
"learning_rate": 5.4948295784932425e-05,
"loss": 0.384,
"step": 634
},
{
"epoch": 2.204861111111111,
"grad_norm": 0.8468595986592679,
"learning_rate": 5.485831449322614e-05,
"loss": 0.3717,
"step": 635
},
{
"epoch": 2.2083333333333335,
"grad_norm": 0.9633419107531916,
"learning_rate": 5.476824589250738e-05,
"loss": 0.3841,
"step": 636
},
{
"epoch": 2.2118055555555554,
"grad_norm": 0.6494993837379418,
"learning_rate": 5.467809051202867e-05,
"loss": 0.3765,
"step": 637
},
{
"epoch": 2.2152777777777777,
"grad_norm": 0.6328352776053527,
"learning_rate": 5.458784888155248e-05,
"loss": 0.3715,
"step": 638
},
{
"epoch": 2.21875,
"grad_norm": 0.5569738869215616,
"learning_rate": 5.4497521531348066e-05,
"loss": 0.3727,
"step": 639
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.511170034380733,
"learning_rate": 5.440710899218842e-05,
"loss": 0.3705,
"step": 640
},
{
"epoch": 2.2256944444444446,
"grad_norm": 0.5625035626615582,
"learning_rate": 5.431661179534708e-05,
"loss": 0.3672,
"step": 641
},
{
"epoch": 2.2291666666666665,
"grad_norm": 0.5466352203355245,
"learning_rate": 5.4226030472595075e-05,
"loss": 0.3777,
"step": 642
},
{
"epoch": 2.232638888888889,
"grad_norm": 0.5202495848597224,
"learning_rate": 5.4135365556197715e-05,
"loss": 0.364,
"step": 643
},
{
"epoch": 2.236111111111111,
"grad_norm": 0.48315072083496347,
"learning_rate": 5.404461757891156e-05,
"loss": 0.3621,
"step": 644
},
{
"epoch": 2.2395833333333335,
"grad_norm": 0.43131695889837246,
"learning_rate": 5.3953787073981236e-05,
"loss": 0.3691,
"step": 645
},
{
"epoch": 2.2430555555555554,
"grad_norm": 0.4189369002593043,
"learning_rate": 5.3862874575136304e-05,
"loss": 0.3623,
"step": 646
},
{
"epoch": 2.2465277777777777,
"grad_norm": 0.3745410821868568,
"learning_rate": 5.377188061658814e-05,
"loss": 0.3619,
"step": 647
},
{
"epoch": 2.25,
"grad_norm": 0.3409984577353413,
"learning_rate": 5.368080573302676e-05,
"loss": 0.3711,
"step": 648
},
{
"epoch": 2.2534722222222223,
"grad_norm": 0.37503860120051213,
"learning_rate": 5.358965045961772e-05,
"loss": 0.3616,
"step": 649
},
{
"epoch": 2.2569444444444446,
"grad_norm": 0.3382453595399695,
"learning_rate": 5.3498415331998965e-05,
"loss": 0.3716,
"step": 650
},
{
"epoch": 2.2604166666666665,
"grad_norm": 0.2988789618607428,
"learning_rate": 5.340710088627766e-05,
"loss": 0.3653,
"step": 651
},
{
"epoch": 2.263888888888889,
"grad_norm": 0.3876803657220898,
"learning_rate": 5.331570765902706e-05,
"loss": 0.3646,
"step": 652
},
{
"epoch": 2.267361111111111,
"grad_norm": 0.2482002164430231,
"learning_rate": 5.3224236187283345e-05,
"loss": 0.3588,
"step": 653
},
{
"epoch": 2.2708333333333335,
"grad_norm": 0.3492377558634399,
"learning_rate": 5.3132687008542454e-05,
"loss": 0.3674,
"step": 654
},
{
"epoch": 2.2743055555555554,
"grad_norm": 0.3057479810242644,
"learning_rate": 5.304106066075694e-05,
"loss": 0.3667,
"step": 655
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.28671680767187063,
"learning_rate": 5.294935768233285e-05,
"loss": 0.365,
"step": 656
},
{
"epoch": 2.28125,
"grad_norm": 0.22327813987047312,
"learning_rate": 5.2857578612126466e-05,
"loss": 0.359,
"step": 657
},
{
"epoch": 2.2847222222222223,
"grad_norm": 0.26865980718906646,
"learning_rate": 5.276572398944124e-05,
"loss": 0.3556,
"step": 658
},
{
"epoch": 2.2881944444444446,
"grad_norm": 0.2333779225620715,
"learning_rate": 5.267379435402455e-05,
"loss": 0.3574,
"step": 659
},
{
"epoch": 2.2916666666666665,
"grad_norm": 0.24382283579760292,
"learning_rate": 5.258179024606455e-05,
"loss": 0.3589,
"step": 660
},
{
"epoch": 2.295138888888889,
"grad_norm": 0.261824698068253,
"learning_rate": 5.2489712206187036e-05,
"loss": 0.3642,
"step": 661
},
{
"epoch": 2.298611111111111,
"grad_norm": 0.24569982834386714,
"learning_rate": 5.239756077545221e-05,
"loss": 0.3588,
"step": 662
},
{
"epoch": 2.3020833333333335,
"grad_norm": 0.29187895293715893,
"learning_rate": 5.2305336495351536e-05,
"loss": 0.3602,
"step": 663
},
{
"epoch": 2.3055555555555554,
"grad_norm": 0.2339347191042144,
"learning_rate": 5.2213039907804535e-05,
"loss": 0.3633,
"step": 664
},
{
"epoch": 2.3090277777777777,
"grad_norm": 0.22979503433977172,
"learning_rate": 5.212067155515563e-05,
"loss": 0.3606,
"step": 665
},
{
"epoch": 2.3125,
"grad_norm": 0.2044651546517708,
"learning_rate": 5.202823198017092e-05,
"loss": 0.3642,
"step": 666
},
{
"epoch": 2.3159722222222223,
"grad_norm": 0.21390953062575657,
"learning_rate": 5.1935721726035066e-05,
"loss": 0.3615,
"step": 667
},
{
"epoch": 2.3194444444444446,
"grad_norm": 0.21587882165366537,
"learning_rate": 5.1843141336348e-05,
"loss": 0.3563,
"step": 668
},
{
"epoch": 2.3229166666666665,
"grad_norm": 0.23130846400906935,
"learning_rate": 5.1750491355121776e-05,
"loss": 0.3621,
"step": 669
},
{
"epoch": 2.326388888888889,
"grad_norm": 0.20361212130904563,
"learning_rate": 5.165777232677741e-05,
"loss": 0.3616,
"step": 670
},
{
"epoch": 2.329861111111111,
"grad_norm": 0.21069360029668197,
"learning_rate": 5.15649847961416e-05,
"loss": 0.3593,
"step": 671
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.21641477786022795,
"learning_rate": 5.1472129308443616e-05,
"loss": 0.3577,
"step": 672
},
{
"epoch": 2.3368055555555554,
"grad_norm": 0.20207920681809247,
"learning_rate": 5.137920640931203e-05,
"loss": 0.3624,
"step": 673
},
{
"epoch": 2.3402777777777777,
"grad_norm": 0.2176212869974964,
"learning_rate": 5.1286216644771516e-05,
"loss": 0.3615,
"step": 674
},
{
"epoch": 2.34375,
"grad_norm": 0.22129145443500614,
"learning_rate": 5.1193160561239694e-05,
"loss": 0.3576,
"step": 675
},
{
"epoch": 2.3472222222222223,
"grad_norm": 0.2466102629786633,
"learning_rate": 5.1100038705523834e-05,
"loss": 0.3574,
"step": 676
},
{
"epoch": 2.3506944444444446,
"grad_norm": 0.21074133497030534,
"learning_rate": 5.100685162481774e-05,
"loss": 0.3587,
"step": 677
},
{
"epoch": 2.3541666666666665,
"grad_norm": 0.23866014599102006,
"learning_rate": 5.091359986669845e-05,
"loss": 0.3643,
"step": 678
},
{
"epoch": 2.357638888888889,
"grad_norm": 0.3002294853456305,
"learning_rate": 5.082028397912305e-05,
"loss": 0.3558,
"step": 679
},
{
"epoch": 2.361111111111111,
"grad_norm": 0.30099280117716753,
"learning_rate": 5.07269045104255e-05,
"loss": 0.3547,
"step": 680
},
{
"epoch": 2.3645833333333335,
"grad_norm": 0.2606514046962765,
"learning_rate": 5.0633462009313315e-05,
"loss": 0.3607,
"step": 681
},
{
"epoch": 2.3680555555555554,
"grad_norm": 0.2675783212789683,
"learning_rate": 5.053995702486443e-05,
"loss": 0.3639,
"step": 682
},
{
"epoch": 2.3715277777777777,
"grad_norm": 0.24318875517158728,
"learning_rate": 5.044639010652393e-05,
"loss": 0.359,
"step": 683
},
{
"epoch": 2.375,
"grad_norm": 0.23002381127619823,
"learning_rate": 5.0352761804100835e-05,
"loss": 0.3617,
"step": 684
},
{
"epoch": 2.3784722222222223,
"grad_norm": 0.2118164025124787,
"learning_rate": 5.025907266776484e-05,
"loss": 0.3556,
"step": 685
},
{
"epoch": 2.3819444444444446,
"grad_norm": 0.22285424218251762,
"learning_rate": 5.0165323248043145e-05,
"loss": 0.3538,
"step": 686
},
{
"epoch": 2.3854166666666665,
"grad_norm": 0.2387153365397832,
"learning_rate": 5.007151409581715e-05,
"loss": 0.3592,
"step": 687
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.2301897194015837,
"learning_rate": 4.9977645762319255e-05,
"loss": 0.3563,
"step": 688
},
{
"epoch": 2.392361111111111,
"grad_norm": 0.28292987282319554,
"learning_rate": 4.988371879912964e-05,
"loss": 0.3686,
"step": 689
},
{
"epoch": 2.3958333333333335,
"grad_norm": 0.2924629331701138,
"learning_rate": 4.9789733758172956e-05,
"loss": 0.3659,
"step": 690
},
{
"epoch": 2.3993055555555554,
"grad_norm": 0.22966813193968594,
"learning_rate": 4.9695691191715175e-05,
"loss": 0.3652,
"step": 691
},
{
"epoch": 2.4027777777777777,
"grad_norm": 0.1887146801357064,
"learning_rate": 4.9601591652360244e-05,
"loss": 0.3586,
"step": 692
},
{
"epoch": 2.40625,
"grad_norm": 0.1857337748310565,
"learning_rate": 4.950743569304693e-05,
"loss": 0.3614,
"step": 693
},
{
"epoch": 2.4097222222222223,
"grad_norm": 0.2001722223098041,
"learning_rate": 4.941322386704551e-05,
"loss": 0.3551,
"step": 694
},
{
"epoch": 2.4131944444444446,
"grad_norm": 0.21840341629402213,
"learning_rate": 4.931895672795454e-05,
"loss": 0.3535,
"step": 695
},
{
"epoch": 2.4166666666666665,
"grad_norm": 0.2069911607036155,
"learning_rate": 4.922463482969761e-05,
"loss": 0.3562,
"step": 696
},
{
"epoch": 2.420138888888889,
"grad_norm": 0.16249095809217645,
"learning_rate": 4.913025872652007e-05,
"loss": 0.3632,
"step": 697
},
{
"epoch": 2.423611111111111,
"grad_norm": 0.19258216947616108,
"learning_rate": 4.903582897298579e-05,
"loss": 0.357,
"step": 698
},
{
"epoch": 2.4270833333333335,
"grad_norm": 0.17488328139804288,
"learning_rate": 4.89413461239739e-05,
"loss": 0.3633,
"step": 699
},
{
"epoch": 2.4305555555555554,
"grad_norm": 0.21230424394190295,
"learning_rate": 4.884681073467551e-05,
"loss": 0.3622,
"step": 700
},
{
"epoch": 2.4340277777777777,
"grad_norm": 0.2054127351550878,
"learning_rate": 4.8752223360590484e-05,
"loss": 0.3609,
"step": 701
},
{
"epoch": 2.4375,
"grad_norm": 0.16416468185173924,
"learning_rate": 4.8657584557524116e-05,
"loss": 0.3579,
"step": 702
},
{
"epoch": 2.4409722222222223,
"grad_norm": 0.20905287225635077,
"learning_rate": 4.8562894881583956e-05,
"loss": 0.3629,
"step": 703
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.18925844032734393,
"learning_rate": 4.846815488917644e-05,
"loss": 0.3626,
"step": 704
},
{
"epoch": 2.4479166666666665,
"grad_norm": 0.23302638899554443,
"learning_rate": 4.837336513700369e-05,
"loss": 0.3603,
"step": 705
},
{
"epoch": 2.451388888888889,
"grad_norm": 0.21386687962480064,
"learning_rate": 4.8278526182060225e-05,
"loss": 0.3573,
"step": 706
},
{
"epoch": 2.454861111111111,
"grad_norm": 0.16096788495870992,
"learning_rate": 4.8183638581629676e-05,
"loss": 0.3597,
"step": 707
},
{
"epoch": 2.4583333333333335,
"grad_norm": 0.17374476347622608,
"learning_rate": 4.808870289328153e-05,
"loss": 0.3616,
"step": 708
},
{
"epoch": 2.4618055555555554,
"grad_norm": 0.21830910332147066,
"learning_rate": 4.7993719674867815e-05,
"loss": 0.3558,
"step": 709
},
{
"epoch": 2.4652777777777777,
"grad_norm": 0.18240980841394056,
"learning_rate": 4.789868948451991e-05,
"loss": 0.3591,
"step": 710
},
{
"epoch": 2.46875,
"grad_norm": 0.21816236250652732,
"learning_rate": 4.780361288064514e-05,
"loss": 0.3604,
"step": 711
},
{
"epoch": 2.4722222222222223,
"grad_norm": 0.2728408320055425,
"learning_rate": 4.7708490421923596e-05,
"loss": 0.3586,
"step": 712
},
{
"epoch": 2.4756944444444446,
"grad_norm": 0.24429645460919563,
"learning_rate": 4.761332266730481e-05,
"loss": 0.3523,
"step": 713
},
{
"epoch": 2.4791666666666665,
"grad_norm": 0.19689609824801885,
"learning_rate": 4.751811017600448e-05,
"loss": 0.3606,
"step": 714
},
{
"epoch": 2.482638888888889,
"grad_norm": 0.273566755423662,
"learning_rate": 4.742285350750118e-05,
"loss": 0.3554,
"step": 715
},
{
"epoch": 2.486111111111111,
"grad_norm": 0.32623914313060043,
"learning_rate": 4.7327553221533074e-05,
"loss": 0.357,
"step": 716
},
{
"epoch": 2.4895833333333335,
"grad_norm": 0.2830951878660179,
"learning_rate": 4.723220987809462e-05,
"loss": 0.3578,
"step": 717
},
{
"epoch": 2.4930555555555554,
"grad_norm": 0.2565496381802557,
"learning_rate": 4.713682403743329e-05,
"loss": 0.3604,
"step": 718
},
{
"epoch": 2.4965277777777777,
"grad_norm": 0.21894746366691853,
"learning_rate": 4.7041396260046286e-05,
"loss": 0.3641,
"step": 719
},
{
"epoch": 2.5,
"grad_norm": 0.19901998551130898,
"learning_rate": 4.694592710667723e-05,
"loss": 0.3582,
"step": 720
},
{
"epoch": 2.5034722222222223,
"grad_norm": 0.24837568226290876,
"learning_rate": 4.6850417138312845e-05,
"loss": 0.3505,
"step": 721
},
{
"epoch": 2.5069444444444446,
"grad_norm": 0.3313870249246507,
"learning_rate": 4.6754866916179725e-05,
"loss": 0.3582,
"step": 722
},
{
"epoch": 2.5104166666666665,
"grad_norm": 0.2244873842332084,
"learning_rate": 4.6659277001740984e-05,
"loss": 0.3573,
"step": 723
},
{
"epoch": 2.513888888888889,
"grad_norm": 0.19767791466423057,
"learning_rate": 4.656364795669297e-05,
"loss": 0.36,
"step": 724
},
{
"epoch": 2.517361111111111,
"grad_norm": 0.28843808426003764,
"learning_rate": 4.646798034296197e-05,
"loss": 0.3604,
"step": 725
},
{
"epoch": 2.5208333333333335,
"grad_norm": 0.2796222422579987,
"learning_rate": 4.637227472270091e-05,
"loss": 0.3605,
"step": 726
},
{
"epoch": 2.5243055555555554,
"grad_norm": 0.2367371209993064,
"learning_rate": 4.6276531658286036e-05,
"loss": 0.3589,
"step": 727
},
{
"epoch": 2.5277777777777777,
"grad_norm": 0.20008216456325678,
"learning_rate": 4.618075171231363e-05,
"loss": 0.3571,
"step": 728
},
{
"epoch": 2.53125,
"grad_norm": 0.18250753943724574,
"learning_rate": 4.608493544759667e-05,
"loss": 0.3595,
"step": 729
},
{
"epoch": 2.5347222222222223,
"grad_norm": 0.22848019667076963,
"learning_rate": 4.59890834271616e-05,
"loss": 0.3599,
"step": 730
},
{
"epoch": 2.5381944444444446,
"grad_norm": 0.267718829441734,
"learning_rate": 4.589319621424489e-05,
"loss": 0.3612,
"step": 731
},
{
"epoch": 2.5416666666666665,
"grad_norm": 0.33157306810932696,
"learning_rate": 4.579727437228987e-05,
"loss": 0.3597,
"step": 732
},
{
"epoch": 2.545138888888889,
"grad_norm": 0.3143344523876356,
"learning_rate": 4.570131846494334e-05,
"loss": 0.3571,
"step": 733
},
{
"epoch": 2.548611111111111,
"grad_norm": 0.20354881157325236,
"learning_rate": 4.560532905605225e-05,
"loss": 0.3589,
"step": 734
},
{
"epoch": 2.5520833333333335,
"grad_norm": 0.22541333864731933,
"learning_rate": 4.550930670966043e-05,
"loss": 0.3579,
"step": 735
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.3280950019921769,
"learning_rate": 4.541325199000525e-05,
"loss": 0.3516,
"step": 736
},
{
"epoch": 2.5590277777777777,
"grad_norm": 0.24024319950783074,
"learning_rate": 4.5317165461514295e-05,
"loss": 0.3656,
"step": 737
},
{
"epoch": 2.5625,
"grad_norm": 0.1660705941990726,
"learning_rate": 4.522104768880208e-05,
"loss": 0.3584,
"step": 738
},
{
"epoch": 2.5659722222222223,
"grad_norm": 0.1888722193381791,
"learning_rate": 4.5124899236666694e-05,
"loss": 0.3646,
"step": 739
},
{
"epoch": 2.5694444444444446,
"grad_norm": 0.22749799010625654,
"learning_rate": 4.502872067008652e-05,
"loss": 0.354,
"step": 740
},
{
"epoch": 2.5729166666666665,
"grad_norm": 0.21243399731512363,
"learning_rate": 4.4932512554216886e-05,
"loss": 0.3602,
"step": 741
},
{
"epoch": 2.576388888888889,
"grad_norm": 0.17510977970439304,
"learning_rate": 4.483627545438678e-05,
"loss": 0.3607,
"step": 742
},
{
"epoch": 2.579861111111111,
"grad_norm": 0.20554150424391404,
"learning_rate": 4.4740009936095466e-05,
"loss": 0.3611,
"step": 743
},
{
"epoch": 2.5833333333333335,
"grad_norm": 0.20942087652236643,
"learning_rate": 4.464371656500921e-05,
"loss": 0.362,
"step": 744
},
{
"epoch": 2.5868055555555554,
"grad_norm": 0.22710097125473938,
"learning_rate": 4.4547395906957966e-05,
"loss": 0.3499,
"step": 745
},
{
"epoch": 2.5902777777777777,
"grad_norm": 0.21288271383011229,
"learning_rate": 4.4451048527932e-05,
"loss": 0.3626,
"step": 746
},
{
"epoch": 2.59375,
"grad_norm": 0.24824074926385184,
"learning_rate": 4.4354674994078585e-05,
"loss": 0.3646,
"step": 747
},
{
"epoch": 2.5972222222222223,
"grad_norm": 0.2184701047156981,
"learning_rate": 4.425827587169873e-05,
"loss": 0.3589,
"step": 748
},
{
"epoch": 2.6006944444444446,
"grad_norm": 0.22929717895377194,
"learning_rate": 4.4161851727243766e-05,
"loss": 0.3577,
"step": 749
},
{
"epoch": 2.6041666666666665,
"grad_norm": 0.31278505610599755,
"learning_rate": 4.406540312731208e-05,
"loss": 0.3561,
"step": 750
},
{
"epoch": 2.607638888888889,
"grad_norm": 0.25212944788531505,
"learning_rate": 4.396893063864573e-05,
"loss": 0.3561,
"step": 751
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.17873710074529314,
"learning_rate": 4.387243482812717e-05,
"loss": 0.357,
"step": 752
},
{
"epoch": 2.6145833333333335,
"grad_norm": 0.29077615166300086,
"learning_rate": 4.37759162627759e-05,
"loss": 0.3561,
"step": 753
},
{
"epoch": 2.6180555555555554,
"grad_norm": 0.3467330594972484,
"learning_rate": 4.3679375509745104e-05,
"loss": 0.3676,
"step": 754
},
{
"epoch": 2.6215277777777777,
"grad_norm": 0.2993517680065959,
"learning_rate": 4.358281313631838e-05,
"loss": 0.3537,
"step": 755
},
{
"epoch": 2.625,
"grad_norm": 0.15785110489395995,
"learning_rate": 4.348622970990634e-05,
"loss": 0.3601,
"step": 756
},
{
"epoch": 2.6284722222222223,
"grad_norm": 0.22408309035303686,
"learning_rate": 4.338962579804331e-05,
"loss": 0.3541,
"step": 757
},
{
"epoch": 2.6319444444444446,
"grad_norm": 0.3382351165187617,
"learning_rate": 4.3293001968384e-05,
"loss": 0.3584,
"step": 758
},
{
"epoch": 2.6354166666666665,
"grad_norm": 0.279111362806744,
"learning_rate": 4.3196358788700164e-05,
"loss": 0.3614,
"step": 759
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.17240804345082109,
"learning_rate": 4.309969682687724e-05,
"loss": 0.3535,
"step": 760
},
{
"epoch": 2.642361111111111,
"grad_norm": 0.20131161960623978,
"learning_rate": 4.300301665091105e-05,
"loss": 0.3562,
"step": 761
},
{
"epoch": 2.6458333333333335,
"grad_norm": 0.24162909795940918,
"learning_rate": 4.290631882890443e-05,
"loss": 0.3594,
"step": 762
},
{
"epoch": 2.6493055555555554,
"grad_norm": 0.21997131895223193,
"learning_rate": 4.2809603929063906e-05,
"loss": 0.3571,
"step": 763
},
{
"epoch": 2.6527777777777777,
"grad_norm": 0.19702297458082826,
"learning_rate": 4.271287251969637e-05,
"loss": 0.3612,
"step": 764
},
{
"epoch": 2.65625,
"grad_norm": 0.23837037232926317,
"learning_rate": 4.261612516920573e-05,
"loss": 0.3602,
"step": 765
},
{
"epoch": 2.6597222222222223,
"grad_norm": 0.2312337983450589,
"learning_rate": 4.251936244608953e-05,
"loss": 0.3542,
"step": 766
},
{
"epoch": 2.6631944444444446,
"grad_norm": 0.2538986261708629,
"learning_rate": 4.242258491893567e-05,
"loss": 0.3642,
"step": 767
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.1919542427472609,
"learning_rate": 4.2325793156419035e-05,
"loss": 0.3553,
"step": 768
},
{
"epoch": 2.670138888888889,
"grad_norm": 0.1959693615452782,
"learning_rate": 4.222898772729818e-05,
"loss": 0.3536,
"step": 769
},
{
"epoch": 2.673611111111111,
"grad_norm": 0.21247229460127162,
"learning_rate": 4.213216920041194e-05,
"loss": 0.3563,
"step": 770
},
{
"epoch": 2.6770833333333335,
"grad_norm": 0.22144773539883858,
"learning_rate": 4.203533814467611e-05,
"loss": 0.3636,
"step": 771
},
{
"epoch": 2.6805555555555554,
"grad_norm": 0.22808065589921542,
"learning_rate": 4.193849512908013e-05,
"loss": 0.3584,
"step": 772
},
{
"epoch": 2.6840277777777777,
"grad_norm": 0.20009577392696715,
"learning_rate": 4.1841640722683685e-05,
"loss": 0.3652,
"step": 773
},
{
"epoch": 2.6875,
"grad_norm": 0.2298061729577738,
"learning_rate": 4.174477549461345e-05,
"loss": 0.3608,
"step": 774
},
{
"epoch": 2.6909722222222223,
"grad_norm": 0.21869371124941922,
"learning_rate": 4.164790001405962e-05,
"loss": 0.3574,
"step": 775
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.21370183101499696,
"learning_rate": 4.155101485027268e-05,
"loss": 0.3532,
"step": 776
},
{
"epoch": 2.6979166666666665,
"grad_norm": 0.17168907041052586,
"learning_rate": 4.145412057256e-05,
"loss": 0.3554,
"step": 777
},
{
"epoch": 2.701388888888889,
"grad_norm": 0.20485180075506743,
"learning_rate": 4.1357217750282504e-05,
"loss": 0.362,
"step": 778
},
{
"epoch": 2.704861111111111,
"grad_norm": 0.22401006957798586,
"learning_rate": 4.1260306952851315e-05,
"loss": 0.3632,
"step": 779
},
{
"epoch": 2.7083333333333335,
"grad_norm": 0.2520664541360099,
"learning_rate": 4.116338874972446e-05,
"loss": 0.3616,
"step": 780
},
{
"epoch": 2.7118055555555554,
"grad_norm": 0.17674416781450486,
"learning_rate": 4.106646371040343e-05,
"loss": 0.3563,
"step": 781
},
{
"epoch": 2.7152777777777777,
"grad_norm": 0.24149354151984379,
"learning_rate": 4.096953240442993e-05,
"loss": 0.3596,
"step": 782
},
{
"epoch": 2.71875,
"grad_norm": 0.25292906375096735,
"learning_rate": 4.087259540138245e-05,
"loss": 0.3629,
"step": 783
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.21108199686390794,
"learning_rate": 4.077565327087298e-05,
"loss": 0.3595,
"step": 784
},
{
"epoch": 2.7256944444444446,
"grad_norm": 0.18238322067033086,
"learning_rate": 4.0678706582543634e-05,
"loss": 0.3576,
"step": 785
},
{
"epoch": 2.7291666666666665,
"grad_norm": 0.15893305599604374,
"learning_rate": 4.058175590606332e-05,
"loss": 0.3548,
"step": 786
},
{
"epoch": 2.732638888888889,
"grad_norm": 0.1572378850322172,
"learning_rate": 4.0484801811124346e-05,
"loss": 0.3513,
"step": 787
},
{
"epoch": 2.736111111111111,
"grad_norm": 0.1648972137773385,
"learning_rate": 4.0387844867439143e-05,
"loss": 0.3559,
"step": 788
},
{
"epoch": 2.7395833333333335,
"grad_norm": 0.1492622259544844,
"learning_rate": 4.029088564473688e-05,
"loss": 0.3558,
"step": 789
},
{
"epoch": 2.7430555555555554,
"grad_norm": 0.1749873424220576,
"learning_rate": 4.019392471276008e-05,
"loss": 0.3616,
"step": 790
},
{
"epoch": 2.7465277777777777,
"grad_norm": 0.16420183013763476,
"learning_rate": 4.0096962641261365e-05,
"loss": 0.3555,
"step": 791
},
{
"epoch": 2.75,
"grad_norm": 0.1782955356918841,
"learning_rate": 4e-05,
"loss": 0.3633,
"step": 792
},
{
"epoch": 2.7534722222222223,
"grad_norm": 0.17880962347986656,
"learning_rate": 3.990303735873866e-05,
"loss": 0.3527,
"step": 793
},
{
"epoch": 2.7569444444444446,
"grad_norm": 0.17598621623586508,
"learning_rate": 3.9806075287239935e-05,
"loss": 0.3664,
"step": 794
},
{
"epoch": 2.7604166666666665,
"grad_norm": 0.18620845392745122,
"learning_rate": 3.970911435526314e-05,
"loss": 0.3584,
"step": 795
},
{
"epoch": 2.763888888888889,
"grad_norm": 0.17465336973990567,
"learning_rate": 3.961215513256086e-05,
"loss": 0.357,
"step": 796
},
{
"epoch": 2.767361111111111,
"grad_norm": 0.17917224180289917,
"learning_rate": 3.9515198188875674e-05,
"loss": 0.3589,
"step": 797
},
{
"epoch": 2.7708333333333335,
"grad_norm": 0.1887633306567826,
"learning_rate": 3.9418244093936694e-05,
"loss": 0.3623,
"step": 798
},
{
"epoch": 2.7743055555555554,
"grad_norm": 0.22561488109817832,
"learning_rate": 3.9321293417456387e-05,
"loss": 0.357,
"step": 799
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.16907414954125385,
"learning_rate": 3.9224346729127034e-05,
"loss": 0.353,
"step": 800
},
{
"epoch": 2.78125,
"grad_norm": 0.19666725754174408,
"learning_rate": 3.912740459861756e-05,
"loss": 0.3658,
"step": 801
},
{
"epoch": 2.7847222222222223,
"grad_norm": 0.16730199368543427,
"learning_rate": 3.903046759557007e-05,
"loss": 0.3551,
"step": 802
},
{
"epoch": 2.7881944444444446,
"grad_norm": 0.17896309699401097,
"learning_rate": 3.893353628959658e-05,
"loss": 0.3604,
"step": 803
},
{
"epoch": 2.7916666666666665,
"grad_norm": 0.16510597022859788,
"learning_rate": 3.8836611250275546e-05,
"loss": 0.361,
"step": 804
},
{
"epoch": 2.795138888888889,
"grad_norm": 0.19370398445124015,
"learning_rate": 3.87396930471487e-05,
"loss": 0.3553,
"step": 805
},
{
"epoch": 2.798611111111111,
"grad_norm": 0.18393178304705537,
"learning_rate": 3.8642782249717516e-05,
"loss": 0.358,
"step": 806
},
{
"epoch": 2.8020833333333335,
"grad_norm": 0.24948450843414427,
"learning_rate": 3.854587942744002e-05,
"loss": 0.3638,
"step": 807
},
{
"epoch": 2.8055555555555554,
"grad_norm": 0.2830314635173867,
"learning_rate": 3.844898514972733e-05,
"loss": 0.3594,
"step": 808
},
{
"epoch": 2.8090277777777777,
"grad_norm": 0.2002181438855024,
"learning_rate": 3.835209998594039e-05,
"loss": 0.3624,
"step": 809
},
{
"epoch": 2.8125,
"grad_norm": 0.2511277931849178,
"learning_rate": 3.825522450538657e-05,
"loss": 0.3522,
"step": 810
},
{
"epoch": 2.8159722222222223,
"grad_norm": 0.1967207093366818,
"learning_rate": 3.815835927731632e-05,
"loss": 0.3652,
"step": 811
},
{
"epoch": 2.8194444444444446,
"grad_norm": 0.17960784639555785,
"learning_rate": 3.806150487091989e-05,
"loss": 0.3565,
"step": 812
},
{
"epoch": 2.8229166666666665,
"grad_norm": 0.2233174388393411,
"learning_rate": 3.79646618553239e-05,
"loss": 0.3535,
"step": 813
},
{
"epoch": 2.826388888888889,
"grad_norm": 0.19390324896728148,
"learning_rate": 3.786783079958808e-05,
"loss": 0.3514,
"step": 814
},
{
"epoch": 2.829861111111111,
"grad_norm": 0.18204031624872857,
"learning_rate": 3.777101227270183e-05,
"loss": 0.3603,
"step": 815
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.20344719272452805,
"learning_rate": 3.767420684358097e-05,
"loss": 0.3572,
"step": 816
},
{
"epoch": 2.8368055555555554,
"grad_norm": 0.19184933570521928,
"learning_rate": 3.757741508106434e-05,
"loss": 0.3644,
"step": 817
},
{
"epoch": 2.8402777777777777,
"grad_norm": 0.18424245484201168,
"learning_rate": 3.748063755391049e-05,
"loss": 0.3613,
"step": 818
},
{
"epoch": 2.84375,
"grad_norm": 0.21747633916072062,
"learning_rate": 3.738387483079428e-05,
"loss": 0.3563,
"step": 819
},
{
"epoch": 2.8472222222222223,
"grad_norm": 0.17447330084596435,
"learning_rate": 3.7287127480303634e-05,
"loss": 0.3536,
"step": 820
},
{
"epoch": 2.8506944444444446,
"grad_norm": 0.1922881427816934,
"learning_rate": 3.7190396070936093e-05,
"loss": 0.3557,
"step": 821
},
{
"epoch": 2.8541666666666665,
"grad_norm": 0.20972951109888854,
"learning_rate": 3.709368117109558e-05,
"loss": 0.3578,
"step": 822
},
{
"epoch": 2.857638888888889,
"grad_norm": 0.22468999669900613,
"learning_rate": 3.699698334908895e-05,
"loss": 0.3598,
"step": 823
},
{
"epoch": 2.861111111111111,
"grad_norm": 0.20049022903894825,
"learning_rate": 3.690030317312277e-05,
"loss": 0.3582,
"step": 824
},
{
"epoch": 2.8645833333333335,
"grad_norm": 0.17875399966945452,
"learning_rate": 3.6803641211299856e-05,
"loss": 0.3564,
"step": 825
},
{
"epoch": 2.8680555555555554,
"grad_norm": 0.2428253163358811,
"learning_rate": 3.670699803161601e-05,
"loss": 0.3557,
"step": 826
},
{
"epoch": 2.8715277777777777,
"grad_norm": 0.24065758729640713,
"learning_rate": 3.661037420195671e-05,
"loss": 0.3608,
"step": 827
},
{
"epoch": 2.875,
"grad_norm": 0.19617707994378045,
"learning_rate": 3.6513770290093674e-05,
"loss": 0.3544,
"step": 828
},
{
"epoch": 2.8784722222222223,
"grad_norm": 0.26671779662664247,
"learning_rate": 3.641718686368164e-05,
"loss": 0.3557,
"step": 829
},
{
"epoch": 2.8819444444444446,
"grad_norm": 0.1801343584986345,
"learning_rate": 3.63206244902549e-05,
"loss": 0.3543,
"step": 830
},
{
"epoch": 2.8854166666666665,
"grad_norm": 0.19450147708793394,
"learning_rate": 3.622408373722412e-05,
"loss": 0.3584,
"step": 831
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.17598786584797108,
"learning_rate": 3.612756517187284e-05,
"loss": 0.3632,
"step": 832
},
{
"epoch": 2.892361111111111,
"grad_norm": 0.18182338931383799,
"learning_rate": 3.603106936135429e-05,
"loss": 0.3535,
"step": 833
},
{
"epoch": 2.8958333333333335,
"grad_norm": 0.22364386017448618,
"learning_rate": 3.5934596872687924e-05,
"loss": 0.3575,
"step": 834
},
{
"epoch": 2.8993055555555554,
"grad_norm": 0.1668601423162201,
"learning_rate": 3.583814827275624e-05,
"loss": 0.3569,
"step": 835
},
{
"epoch": 2.9027777777777777,
"grad_norm": 0.19801055806417453,
"learning_rate": 3.574172412830127e-05,
"loss": 0.3625,
"step": 836
},
{
"epoch": 2.90625,
"grad_norm": 0.23583604825625834,
"learning_rate": 3.564532500592143e-05,
"loss": 0.3575,
"step": 837
},
{
"epoch": 2.9097222222222223,
"grad_norm": 0.14520855199053195,
"learning_rate": 3.5548951472068017e-05,
"loss": 0.3497,
"step": 838
},
{
"epoch": 2.9131944444444446,
"grad_norm": 0.19605998381766435,
"learning_rate": 3.545260409304205e-05,
"loss": 0.358,
"step": 839
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.21453935925882056,
"learning_rate": 3.535628343499079e-05,
"loss": 0.3536,
"step": 840
},
{
"epoch": 2.920138888888889,
"grad_norm": 0.19409210510884808,
"learning_rate": 3.525999006390455e-05,
"loss": 0.3706,
"step": 841
},
{
"epoch": 2.923611111111111,
"grad_norm": 0.23198091133286045,
"learning_rate": 3.516372454561324e-05,
"loss": 0.363,
"step": 842
},
{
"epoch": 2.9270833333333335,
"grad_norm": 0.17318947090543216,
"learning_rate": 3.506748744578312e-05,
"loss": 0.3564,
"step": 843
},
{
"epoch": 2.9305555555555554,
"grad_norm": 0.17641845044561835,
"learning_rate": 3.49712793299135e-05,
"loss": 0.3593,
"step": 844
},
{
"epoch": 2.9340277777777777,
"grad_norm": 0.1852604985802232,
"learning_rate": 3.487510076333332e-05,
"loss": 0.3584,
"step": 845
},
{
"epoch": 2.9375,
"grad_norm": 0.17514830249343535,
"learning_rate": 3.477895231119795e-05,
"loss": 0.3634,
"step": 846
},
{
"epoch": 2.9409722222222223,
"grad_norm": 0.16734143827463377,
"learning_rate": 3.468283453848572e-05,
"loss": 0.3499,
"step": 847
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.18068561439735764,
"learning_rate": 3.458674800999477e-05,
"loss": 0.3603,
"step": 848
},
{
"epoch": 2.9479166666666665,
"grad_norm": 0.15836687872645414,
"learning_rate": 3.4490693290339576e-05,
"loss": 0.3566,
"step": 849
},
{
"epoch": 2.951388888888889,
"grad_norm": 0.18682847065633865,
"learning_rate": 3.4394670943947756e-05,
"loss": 0.3557,
"step": 850
},
{
"epoch": 2.954861111111111,
"grad_norm": 0.182860315826738,
"learning_rate": 3.4298681535056664e-05,
"loss": 0.3547,
"step": 851
},
{
"epoch": 2.9583333333333335,
"grad_norm": 0.1692085984740758,
"learning_rate": 3.4202725627710136e-05,
"loss": 0.3565,
"step": 852
},
{
"epoch": 2.9618055555555554,
"grad_norm": 0.16291805438785126,
"learning_rate": 3.410680378575512e-05,
"loss": 0.3578,
"step": 853
},
{
"epoch": 2.9652777777777777,
"grad_norm": 0.19636032650270333,
"learning_rate": 3.401091657283842e-05,
"loss": 0.3553,
"step": 854
},
{
"epoch": 2.96875,
"grad_norm": 0.22258957309784064,
"learning_rate": 3.3915064552403336e-05,
"loss": 0.3518,
"step": 855
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.16965579066125516,
"learning_rate": 3.3819248287686386e-05,
"loss": 0.3511,
"step": 856
},
{
"epoch": 2.9756944444444446,
"grad_norm": 0.1930972448807475,
"learning_rate": 3.3723468341713985e-05,
"loss": 0.3515,
"step": 857
},
{
"epoch": 2.9791666666666665,
"grad_norm": 0.1801137357638696,
"learning_rate": 3.3627725277299103e-05,
"loss": 0.3567,
"step": 858
},
{
"epoch": 2.982638888888889,
"grad_norm": 0.1768882568613071,
"learning_rate": 3.3532019657038045e-05,
"loss": 0.3565,
"step": 859
},
{
"epoch": 2.986111111111111,
"grad_norm": 0.18990156843143038,
"learning_rate": 3.343635204330704e-05,
"loss": 0.3505,
"step": 860
},
{
"epoch": 2.9895833333333335,
"grad_norm": 0.18215961637350045,
"learning_rate": 3.3340722998259036e-05,
"loss": 0.3609,
"step": 861
},
{
"epoch": 2.9930555555555554,
"grad_norm": 0.1850457048123024,
"learning_rate": 3.324513308382029e-05,
"loss": 0.3545,
"step": 862
},
{
"epoch": 2.9965277777777777,
"grad_norm": 0.19097473138907864,
"learning_rate": 3.314958286168718e-05,
"loss": 0.359,
"step": 863
},
{
"epoch": 3.0,
"grad_norm": 0.20390114258764727,
"learning_rate": 3.305407289332279e-05,
"loss": 0.3371,
"step": 864
},
{
"epoch": 3.0034722222222223,
"grad_norm": 0.23184923556068548,
"learning_rate": 3.295860373995373e-05,
"loss": 0.3338,
"step": 865
},
{
"epoch": 3.0069444444444446,
"grad_norm": 0.22531202044446838,
"learning_rate": 3.2863175962566716e-05,
"loss": 0.3278,
"step": 866
},
{
"epoch": 3.0104166666666665,
"grad_norm": 0.21138866062659648,
"learning_rate": 3.2767790121905396e-05,
"loss": 0.3313,
"step": 867
},
{
"epoch": 3.013888888888889,
"grad_norm": 0.19393318904652032,
"learning_rate": 3.267244677846693e-05,
"loss": 0.3287,
"step": 868
},
{
"epoch": 3.017361111111111,
"grad_norm": 0.20297535530392305,
"learning_rate": 3.257714649249883e-05,
"loss": 0.3212,
"step": 869
},
{
"epoch": 3.0208333333333335,
"grad_norm": 0.18715172201692282,
"learning_rate": 3.248188982399553e-05,
"loss": 0.3224,
"step": 870
},
{
"epoch": 3.0243055555555554,
"grad_norm": 0.19690389505029438,
"learning_rate": 3.23866773326952e-05,
"loss": 0.3265,
"step": 871
},
{
"epoch": 3.0277777777777777,
"grad_norm": 0.18180988764739114,
"learning_rate": 3.229150957807641e-05,
"loss": 0.3221,
"step": 872
},
{
"epoch": 3.03125,
"grad_norm": 0.19869573153269798,
"learning_rate": 3.219638711935488e-05,
"loss": 0.3327,
"step": 873
},
{
"epoch": 3.0347222222222223,
"grad_norm": 0.17871850646547546,
"learning_rate": 3.210131051548011e-05,
"loss": 0.3281,
"step": 874
},
{
"epoch": 3.0381944444444446,
"grad_norm": 0.19745444724872563,
"learning_rate": 3.200628032513219e-05,
"loss": 0.3257,
"step": 875
},
{
"epoch": 3.0416666666666665,
"grad_norm": 0.16518241065195463,
"learning_rate": 3.191129710671849e-05,
"loss": 0.3231,
"step": 876
},
{
"epoch": 3.045138888888889,
"grad_norm": 0.15519829533874455,
"learning_rate": 3.181636141837033e-05,
"loss": 0.3325,
"step": 877
},
{
"epoch": 3.048611111111111,
"grad_norm": 0.15572856456527798,
"learning_rate": 3.1721473817939795e-05,
"loss": 0.3326,
"step": 878
},
{
"epoch": 3.0520833333333335,
"grad_norm": 0.16157608012366306,
"learning_rate": 3.162663486299632e-05,
"loss": 0.326,
"step": 879
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.15496294058585847,
"learning_rate": 3.153184511082359e-05,
"loss": 0.3244,
"step": 880
},
{
"epoch": 3.0590277777777777,
"grad_norm": 0.15446224821419566,
"learning_rate": 3.143710511841606e-05,
"loss": 0.3304,
"step": 881
},
{
"epoch": 3.0625,
"grad_norm": 0.15700101028157268,
"learning_rate": 3.134241544247589e-05,
"loss": 0.3286,
"step": 882
},
{
"epoch": 3.0659722222222223,
"grad_norm": 0.15344774586241733,
"learning_rate": 3.124777663940952e-05,
"loss": 0.3251,
"step": 883
},
{
"epoch": 3.0694444444444446,
"grad_norm": 0.16649283778376192,
"learning_rate": 3.1153189265324494e-05,
"loss": 0.3277,
"step": 884
},
{
"epoch": 3.0729166666666665,
"grad_norm": 0.16996931994950168,
"learning_rate": 3.1058653876026105e-05,
"loss": 0.332,
"step": 885
},
{
"epoch": 3.076388888888889,
"grad_norm": 0.15660620934902456,
"learning_rate": 3.0964171027014217e-05,
"loss": 0.3259,
"step": 886
},
{
"epoch": 3.079861111111111,
"grad_norm": 0.17381747937524708,
"learning_rate": 3.0869741273479934e-05,
"loss": 0.3332,
"step": 887
},
{
"epoch": 3.0833333333333335,
"grad_norm": 0.15325114202704937,
"learning_rate": 3.07753651703024e-05,
"loss": 0.3328,
"step": 888
},
{
"epoch": 3.0868055555555554,
"grad_norm": 0.16410160873161936,
"learning_rate": 3.068104327204546e-05,
"loss": 0.3248,
"step": 889
},
{
"epoch": 3.0902777777777777,
"grad_norm": 0.15629114043992434,
"learning_rate": 3.0586776132954504e-05,
"loss": 0.3274,
"step": 890
},
{
"epoch": 3.09375,
"grad_norm": 0.1287547525109488,
"learning_rate": 3.0492564306953083e-05,
"loss": 0.3313,
"step": 891
},
{
"epoch": 3.0972222222222223,
"grad_norm": 0.15918994404783066,
"learning_rate": 3.0398408347639773e-05,
"loss": 0.327,
"step": 892
},
{
"epoch": 3.1006944444444446,
"grad_norm": 0.15708787224166132,
"learning_rate": 3.0304308808284845e-05,
"loss": 0.3285,
"step": 893
},
{
"epoch": 3.1041666666666665,
"grad_norm": 0.14410613175392642,
"learning_rate": 3.0210266241827047e-05,
"loss": 0.3229,
"step": 894
},
{
"epoch": 3.107638888888889,
"grad_norm": 0.1454609051584777,
"learning_rate": 3.0116281200870383e-05,
"loss": 0.3283,
"step": 895
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.15236322898098387,
"learning_rate": 3.0022354237680752e-05,
"loss": 0.3253,
"step": 896
},
{
"epoch": 3.1145833333333335,
"grad_norm": 0.13124398580702817,
"learning_rate": 2.9928485904182865e-05,
"loss": 0.3252,
"step": 897
},
{
"epoch": 3.1180555555555554,
"grad_norm": 0.15033157405355416,
"learning_rate": 2.9834676751956855e-05,
"loss": 0.3259,
"step": 898
},
{
"epoch": 3.1215277777777777,
"grad_norm": 0.12394460931334396,
"learning_rate": 2.9740927332235164e-05,
"loss": 0.326,
"step": 899
},
{
"epoch": 3.125,
"grad_norm": 0.13351975000878838,
"learning_rate": 2.9647238195899168e-05,
"loss": 0.3367,
"step": 900
},
{
"epoch": 3.1284722222222223,
"grad_norm": 0.1325052686323737,
"learning_rate": 2.9553609893476078e-05,
"loss": 0.3264,
"step": 901
},
{
"epoch": 3.1319444444444446,
"grad_norm": 0.13581375405721016,
"learning_rate": 2.9460042975135575e-05,
"loss": 0.3329,
"step": 902
},
{
"epoch": 3.1354166666666665,
"grad_norm": 0.1667562873637715,
"learning_rate": 2.936653799068669e-05,
"loss": 0.3283,
"step": 903
},
{
"epoch": 3.138888888888889,
"grad_norm": 0.15505693782068763,
"learning_rate": 2.9273095489574502e-05,
"loss": 0.3256,
"step": 904
},
{
"epoch": 3.142361111111111,
"grad_norm": 0.1487602254104706,
"learning_rate": 2.917971602087695e-05,
"loss": 0.3257,
"step": 905
},
{
"epoch": 3.1458333333333335,
"grad_norm": 0.1591921240522157,
"learning_rate": 2.9086400133301573e-05,
"loss": 0.3265,
"step": 906
},
{
"epoch": 3.1493055555555554,
"grad_norm": 0.13849832135798662,
"learning_rate": 2.8993148375182273e-05,
"loss": 0.3272,
"step": 907
},
{
"epoch": 3.1527777777777777,
"grad_norm": 0.1507894000952872,
"learning_rate": 2.889996129447618e-05,
"loss": 0.3272,
"step": 908
},
{
"epoch": 3.15625,
"grad_norm": 0.13584423605202275,
"learning_rate": 2.8806839438760322e-05,
"loss": 0.3269,
"step": 909
},
{
"epoch": 3.1597222222222223,
"grad_norm": 0.14817096095782104,
"learning_rate": 2.8713783355228497e-05,
"loss": 0.3257,
"step": 910
},
{
"epoch": 3.1631944444444446,
"grad_norm": 0.134355250776596,
"learning_rate": 2.8620793590687987e-05,
"loss": 0.3251,
"step": 911
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.13707426963646546,
"learning_rate": 2.8527870691556404e-05,
"loss": 0.3272,
"step": 912
},
{
"epoch": 3.170138888888889,
"grad_norm": 0.13959856877548055,
"learning_rate": 2.843501520385841e-05,
"loss": 0.3255,
"step": 913
},
{
"epoch": 3.173611111111111,
"grad_norm": 0.13553678964547694,
"learning_rate": 2.8342227673222608e-05,
"loss": 0.3278,
"step": 914
},
{
"epoch": 3.1770833333333335,
"grad_norm": 0.13659725325340102,
"learning_rate": 2.8249508644878224e-05,
"loss": 0.3225,
"step": 915
},
{
"epoch": 3.1805555555555554,
"grad_norm": 0.14820750705537203,
"learning_rate": 2.8156858663652015e-05,
"loss": 0.3318,
"step": 916
},
{
"epoch": 3.1840277777777777,
"grad_norm": 0.13640897033741206,
"learning_rate": 2.806427827396493e-05,
"loss": 0.3351,
"step": 917
},
{
"epoch": 3.1875,
"grad_norm": 0.12546349430475254,
"learning_rate": 2.7971768019829083e-05,
"loss": 0.3317,
"step": 918
},
{
"epoch": 3.1909722222222223,
"grad_norm": 0.12506137585820623,
"learning_rate": 2.7879328444844386e-05,
"loss": 0.3229,
"step": 919
},
{
"epoch": 3.1944444444444446,
"grad_norm": 0.14489730864288738,
"learning_rate": 2.778696009219548e-05,
"loss": 0.3238,
"step": 920
},
{
"epoch": 3.1979166666666665,
"grad_norm": 0.1314663587842031,
"learning_rate": 2.769466350464847e-05,
"loss": 0.3272,
"step": 921
},
{
"epoch": 3.201388888888889,
"grad_norm": 0.14506952594049383,
"learning_rate": 2.76024392245478e-05,
"loss": 0.3273,
"step": 922
},
{
"epoch": 3.204861111111111,
"grad_norm": 0.13448052337608013,
"learning_rate": 2.751028779381298e-05,
"loss": 0.3284,
"step": 923
},
{
"epoch": 3.2083333333333335,
"grad_norm": 0.14402460060846006,
"learning_rate": 2.7418209753935464e-05,
"loss": 0.3229,
"step": 924
},
{
"epoch": 3.2118055555555554,
"grad_norm": 0.1594688318321725,
"learning_rate": 2.732620564597547e-05,
"loss": 0.331,
"step": 925
},
{
"epoch": 3.2152777777777777,
"grad_norm": 0.16364319049182574,
"learning_rate": 2.7234276010558766e-05,
"loss": 0.3267,
"step": 926
},
{
"epoch": 3.21875,
"grad_norm": 0.15546709679880438,
"learning_rate": 2.7142421387873548e-05,
"loss": 0.3251,
"step": 927
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.15304291076882148,
"learning_rate": 2.7050642317667164e-05,
"loss": 0.3294,
"step": 928
},
{
"epoch": 3.2256944444444446,
"grad_norm": 0.16550996164467935,
"learning_rate": 2.695893933924308e-05,
"loss": 0.3219,
"step": 929
},
{
"epoch": 3.2291666666666665,
"grad_norm": 0.12571383158452695,
"learning_rate": 2.6867312991457563e-05,
"loss": 0.3301,
"step": 930
},
{
"epoch": 3.232638888888889,
"grad_norm": 0.16561727085953437,
"learning_rate": 2.6775763812716665e-05,
"loss": 0.328,
"step": 931
},
{
"epoch": 3.236111111111111,
"grad_norm": 0.12461275003256311,
"learning_rate": 2.6684292340972936e-05,
"loss": 0.3204,
"step": 932
},
{
"epoch": 3.2395833333333335,
"grad_norm": 0.16175602688976579,
"learning_rate": 2.659289911372234e-05,
"loss": 0.3297,
"step": 933
},
{
"epoch": 3.2430555555555554,
"grad_norm": 0.12598154391016275,
"learning_rate": 2.6501584668001038e-05,
"loss": 0.3315,
"step": 934
},
{
"epoch": 3.2465277777777777,
"grad_norm": 0.1412961765348397,
"learning_rate": 2.6410349540382285e-05,
"loss": 0.3283,
"step": 935
},
{
"epoch": 3.25,
"grad_norm": 0.14185747055205886,
"learning_rate": 2.6319194266973256e-05,
"loss": 0.3269,
"step": 936
},
{
"epoch": 3.2534722222222223,
"grad_norm": 0.12878278517933486,
"learning_rate": 2.6228119383411875e-05,
"loss": 0.333,
"step": 937
},
{
"epoch": 3.2569444444444446,
"grad_norm": 0.13440642206471998,
"learning_rate": 2.6137125424863713e-05,
"loss": 0.3254,
"step": 938
},
{
"epoch": 3.2604166666666665,
"grad_norm": 0.14596329626360965,
"learning_rate": 2.6046212926018774e-05,
"loss": 0.3258,
"step": 939
},
{
"epoch": 3.263888888888889,
"grad_norm": 0.1364611951529621,
"learning_rate": 2.5955382421088457e-05,
"loss": 0.3265,
"step": 940
},
{
"epoch": 3.267361111111111,
"grad_norm": 0.17050661806833226,
"learning_rate": 2.58646344438023e-05,
"loss": 0.3314,
"step": 941
},
{
"epoch": 3.2708333333333335,
"grad_norm": 0.1298103989698816,
"learning_rate": 2.577396952740495e-05,
"loss": 0.3323,
"step": 942
},
{
"epoch": 3.2743055555555554,
"grad_norm": 0.15642233563304042,
"learning_rate": 2.568338820465292e-05,
"loss": 0.3261,
"step": 943
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.12611133774369115,
"learning_rate": 2.5592891007811594e-05,
"loss": 0.3231,
"step": 944
},
{
"epoch": 3.28125,
"grad_norm": 0.14937341829368525,
"learning_rate": 2.550247846865194e-05,
"loss": 0.3283,
"step": 945
},
{
"epoch": 3.2847222222222223,
"grad_norm": 0.16357367165064074,
"learning_rate": 2.541215111844753e-05,
"loss": 0.3258,
"step": 946
},
{
"epoch": 3.2881944444444446,
"grad_norm": 0.1451872556880248,
"learning_rate": 2.5321909487971324e-05,
"loss": 0.3292,
"step": 947
},
{
"epoch": 3.2916666666666665,
"grad_norm": 0.155863544721583,
"learning_rate": 2.523175410749263e-05,
"loss": 0.3266,
"step": 948
},
{
"epoch": 3.295138888888889,
"grad_norm": 0.1437954314612826,
"learning_rate": 2.5141685506773862e-05,
"loss": 0.3249,
"step": 949
},
{
"epoch": 3.298611111111111,
"grad_norm": 0.14039277643177653,
"learning_rate": 2.505170421506759e-05,
"loss": 0.332,
"step": 950
},
{
"epoch": 3.3020833333333335,
"grad_norm": 0.14077753058037845,
"learning_rate": 2.4961810761113282e-05,
"loss": 0.3254,
"step": 951
},
{
"epoch": 3.3055555555555554,
"grad_norm": 0.15498427144289387,
"learning_rate": 2.4872005673134307e-05,
"loss": 0.3262,
"step": 952
},
{
"epoch": 3.3090277777777777,
"grad_norm": 0.13209964705917596,
"learning_rate": 2.4782289478834757e-05,
"loss": 0.3359,
"step": 953
},
{
"epoch": 3.3125,
"grad_norm": 0.13621175969696817,
"learning_rate": 2.4692662705396412e-05,
"loss": 0.33,
"step": 954
},
{
"epoch": 3.3159722222222223,
"grad_norm": 0.12829692288727748,
"learning_rate": 2.460312587947557e-05,
"loss": 0.3199,
"step": 955
},
{
"epoch": 3.3194444444444446,
"grad_norm": 0.12514830015509698,
"learning_rate": 2.4513679527199986e-05,
"loss": 0.3277,
"step": 956
},
{
"epoch": 3.3229166666666665,
"grad_norm": 0.14513050874335826,
"learning_rate": 2.4424324174165808e-05,
"loss": 0.332,
"step": 957
},
{
"epoch": 3.326388888888889,
"grad_norm": 0.11014959493081117,
"learning_rate": 2.4335060345434443e-05,
"loss": 0.3254,
"step": 958
},
{
"epoch": 3.329861111111111,
"grad_norm": 0.13668804458246817,
"learning_rate": 2.4245888565529518e-05,
"loss": 0.3256,
"step": 959
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.11888817466530431,
"learning_rate": 2.4156809358433728e-05,
"loss": 0.3313,
"step": 960
},
{
"epoch": 3.3368055555555554,
"grad_norm": 0.12990350418153296,
"learning_rate": 2.4067823247585857e-05,
"loss": 0.3266,
"step": 961
},
{
"epoch": 3.3402777777777777,
"grad_norm": 0.12637757796358817,
"learning_rate": 2.3978930755877583e-05,
"loss": 0.332,
"step": 962
},
{
"epoch": 3.34375,
"grad_norm": 0.1499424946562788,
"learning_rate": 2.389013240565052e-05,
"loss": 0.3257,
"step": 963
},
{
"epoch": 3.3472222222222223,
"grad_norm": 0.13046441675186193,
"learning_rate": 2.3801428718693055e-05,
"loss": 0.3352,
"step": 964
},
{
"epoch": 3.3506944444444446,
"grad_norm": 0.15641789451684035,
"learning_rate": 2.371282021623738e-05,
"loss": 0.3266,
"step": 965
},
{
"epoch": 3.3541666666666665,
"grad_norm": 0.15401041271475968,
"learning_rate": 2.3624307418956298e-05,
"loss": 0.3251,
"step": 966
},
{
"epoch": 3.357638888888889,
"grad_norm": 0.14610541183888143,
"learning_rate": 2.3535890846960318e-05,
"loss": 0.3274,
"step": 967
},
{
"epoch": 3.361111111111111,
"grad_norm": 0.1597491360564764,
"learning_rate": 2.3447571019794438e-05,
"loss": 0.3279,
"step": 968
},
{
"epoch": 3.3645833333333335,
"grad_norm": 0.1391254633674649,
"learning_rate": 2.3359348456435243e-05,
"loss": 0.3223,
"step": 969
},
{
"epoch": 3.3680555555555554,
"grad_norm": 0.16742166965814972,
"learning_rate": 2.327122367528775e-05,
"loss": 0.3213,
"step": 970
},
{
"epoch": 3.3715277777777777,
"grad_norm": 0.13795694128294528,
"learning_rate": 2.3183197194182395e-05,
"loss": 0.3267,
"step": 971
},
{
"epoch": 3.375,
"grad_norm": 0.15651560419922309,
"learning_rate": 2.3095269530372032e-05,
"loss": 0.3277,
"step": 972
},
{
"epoch": 3.3784722222222223,
"grad_norm": 0.1383038389996478,
"learning_rate": 2.300744120052878e-05,
"loss": 0.3233,
"step": 973
},
{
"epoch": 3.3819444444444446,
"grad_norm": 0.13798159811928254,
"learning_rate": 2.291971272074115e-05,
"loss": 0.3308,
"step": 974
},
{
"epoch": 3.3854166666666665,
"grad_norm": 0.1349158326186098,
"learning_rate": 2.2832084606510848e-05,
"loss": 0.3286,
"step": 975
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.13930366060353663,
"learning_rate": 2.274455737274987e-05,
"loss": 0.3329,
"step": 976
},
{
"epoch": 3.392361111111111,
"grad_norm": 0.1310441724409089,
"learning_rate": 2.26571315337774e-05,
"loss": 0.3311,
"step": 977
},
{
"epoch": 3.3958333333333335,
"grad_norm": 0.12728966187369123,
"learning_rate": 2.2569807603316836e-05,
"loss": 0.3229,
"step": 978
},
{
"epoch": 3.3993055555555554,
"grad_norm": 0.14277390088433503,
"learning_rate": 2.2482586094492724e-05,
"loss": 0.328,
"step": 979
},
{
"epoch": 3.4027777777777777,
"grad_norm": 0.14441166578011597,
"learning_rate": 2.239546751982782e-05,
"loss": 0.3398,
"step": 980
},
{
"epoch": 3.40625,
"grad_norm": 0.15113050069038053,
"learning_rate": 2.2308452391239958e-05,
"loss": 0.3298,
"step": 981
},
{
"epoch": 3.4097222222222223,
"grad_norm": 0.1475369362828003,
"learning_rate": 2.2221541220039162e-05,
"loss": 0.327,
"step": 982
},
{
"epoch": 3.4131944444444446,
"grad_norm": 0.14568823290138413,
"learning_rate": 2.2134734516924583e-05,
"loss": 0.3301,
"step": 983
},
{
"epoch": 3.4166666666666665,
"grad_norm": 0.133394078745156,
"learning_rate": 2.2048032791981515e-05,
"loss": 0.3282,
"step": 984
},
{
"epoch": 3.420138888888889,
"grad_norm": 0.13571851430095155,
"learning_rate": 2.196143655467835e-05,
"loss": 0.3289,
"step": 985
},
{
"epoch": 3.423611111111111,
"grad_norm": 0.12212196306543147,
"learning_rate": 2.1874946313863673e-05,
"loss": 0.329,
"step": 986
},
{
"epoch": 3.4270833333333335,
"grad_norm": 0.1400616993150097,
"learning_rate": 2.1788562577763192e-05,
"loss": 0.3251,
"step": 987
},
{
"epoch": 3.4305555555555554,
"grad_norm": 0.12306894172185011,
"learning_rate": 2.1702285853976774e-05,
"loss": 0.3266,
"step": 988
},
{
"epoch": 3.4340277777777777,
"grad_norm": 0.1559157616814954,
"learning_rate": 2.161611664947551e-05,
"loss": 0.3258,
"step": 989
},
{
"epoch": 3.4375,
"grad_norm": 0.12516551486822852,
"learning_rate": 2.1530055470598654e-05,
"loss": 0.3265,
"step": 990
},
{
"epoch": 3.4409722222222223,
"grad_norm": 0.13080425032211002,
"learning_rate": 2.1444102823050706e-05,
"loss": 0.3316,
"step": 991
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.1300922642459424,
"learning_rate": 2.135825921189846e-05,
"loss": 0.3288,
"step": 992
},
{
"epoch": 3.4479166666666665,
"grad_norm": 0.12922486035564673,
"learning_rate": 2.1272525141567925e-05,
"loss": 0.3266,
"step": 993
},
{
"epoch": 3.451388888888889,
"grad_norm": 0.12469798677762536,
"learning_rate": 2.11869011158415e-05,
"loss": 0.3306,
"step": 994
},
{
"epoch": 3.454861111111111,
"grad_norm": 0.13120696522264091,
"learning_rate": 2.1101387637854948e-05,
"loss": 0.3287,
"step": 995
},
{
"epoch": 3.4583333333333335,
"grad_norm": 0.11755454189452605,
"learning_rate": 2.1015985210094385e-05,
"loss": 0.3235,
"step": 996
},
{
"epoch": 3.4618055555555554,
"grad_norm": 0.11981150512361477,
"learning_rate": 2.093069433439346e-05,
"loss": 0.3241,
"step": 997
},
{
"epoch": 3.4652777777777777,
"grad_norm": 0.1163241592558362,
"learning_rate": 2.084551551193026e-05,
"loss": 0.3317,
"step": 998
},
{
"epoch": 3.46875,
"grad_norm": 0.11818942476471016,
"learning_rate": 2.0760449243224504e-05,
"loss": 0.3239,
"step": 999
},
{
"epoch": 3.4722222222222223,
"grad_norm": 0.13391398089188974,
"learning_rate": 2.067549602813446e-05,
"loss": 0.3276,
"step": 1000
},
{
"epoch": 3.4756944444444446,
"grad_norm": 0.11776828786444583,
"learning_rate": 2.059065636585416e-05,
"loss": 0.3284,
"step": 1001
},
{
"epoch": 3.4791666666666665,
"grad_norm": 0.11898978369329122,
"learning_rate": 2.050593075491031e-05,
"loss": 0.3222,
"step": 1002
},
{
"epoch": 3.482638888888889,
"grad_norm": 0.12050085117961935,
"learning_rate": 2.0421319693159488e-05,
"loss": 0.3246,
"step": 1003
},
{
"epoch": 3.486111111111111,
"grad_norm": 0.11420196300243668,
"learning_rate": 2.033682367778518e-05,
"loss": 0.323,
"step": 1004
},
{
"epoch": 3.4895833333333335,
"grad_norm": 0.10906983179786023,
"learning_rate": 2.025244320529479e-05,
"loss": 0.3258,
"step": 1005
},
{
"epoch": 3.4930555555555554,
"grad_norm": 0.11713331459082049,
"learning_rate": 2.0168178771516844e-05,
"loss": 0.3256,
"step": 1006
},
{
"epoch": 3.4965277777777777,
"grad_norm": 0.12452120908228087,
"learning_rate": 2.0084030871597944e-05,
"loss": 0.3292,
"step": 1007
},
{
"epoch": 3.5,
"grad_norm": 0.11383590817143446,
"learning_rate": 2.0000000000000012e-05,
"loss": 0.3312,
"step": 1008
},
{
"epoch": 3.5034722222222223,
"grad_norm": 0.1361420180977826,
"learning_rate": 1.9916086650497206e-05,
"loss": 0.3316,
"step": 1009
},
{
"epoch": 3.5069444444444446,
"grad_norm": 0.1227570110480921,
"learning_rate": 1.9832291316173196e-05,
"loss": 0.3303,
"step": 1010
},
{
"epoch": 3.5104166666666665,
"grad_norm": 0.11318826375174597,
"learning_rate": 1.9748614489418118e-05,
"loss": 0.3233,
"step": 1011
},
{
"epoch": 3.513888888888889,
"grad_norm": 0.11329679043568246,
"learning_rate": 1.966505666192579e-05,
"loss": 0.3335,
"step": 1012
},
{
"epoch": 3.517361111111111,
"grad_norm": 0.12400647990567192,
"learning_rate": 1.9581618324690742e-05,
"loss": 0.3349,
"step": 1013
},
{
"epoch": 3.5208333333333335,
"grad_norm": 0.11126217524731555,
"learning_rate": 1.9498299968005393e-05,
"loss": 0.3226,
"step": 1014
},
{
"epoch": 3.5243055555555554,
"grad_norm": 0.11193919585531145,
"learning_rate": 1.9415102081457138e-05,
"loss": 0.3226,
"step": 1015
},
{
"epoch": 3.5277777777777777,
"grad_norm": 0.11071870031836675,
"learning_rate": 1.9332025153925486e-05,
"loss": 0.3268,
"step": 1016
},
{
"epoch": 3.53125,
"grad_norm": 0.11287024785977574,
"learning_rate": 1.9249069673579136e-05,
"loss": 0.3251,
"step": 1017
},
{
"epoch": 3.5347222222222223,
"grad_norm": 0.11110239502899291,
"learning_rate": 1.9166236127873215e-05,
"loss": 0.3233,
"step": 1018
},
{
"epoch": 3.5381944444444446,
"grad_norm": 0.11584789285191628,
"learning_rate": 1.9083525003546296e-05,
"loss": 0.3282,
"step": 1019
},
{
"epoch": 3.5416666666666665,
"grad_norm": 0.1253078621072049,
"learning_rate": 1.90009367866176e-05,
"loss": 0.332,
"step": 1020
},
{
"epoch": 3.545138888888889,
"grad_norm": 0.1198697407069051,
"learning_rate": 1.8918471962384163e-05,
"loss": 0.331,
"step": 1021
},
{
"epoch": 3.548611111111111,
"grad_norm": 0.12069680597925006,
"learning_rate": 1.8836131015417906e-05,
"loss": 0.3299,
"step": 1022
},
{
"epoch": 3.5520833333333335,
"grad_norm": 0.10755804286298509,
"learning_rate": 1.875391442956289e-05,
"loss": 0.3265,
"step": 1023
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.13204844212409522,
"learning_rate": 1.867182268793236e-05,
"loss": 0.3242,
"step": 1024
},
{
"epoch": 3.5590277777777777,
"grad_norm": 0.10964257930704686,
"learning_rate": 1.8589856272906e-05,
"loss": 0.329,
"step": 1025
},
{
"epoch": 3.5625,
"grad_norm": 0.12148237166024527,
"learning_rate": 1.8508015666127043e-05,
"loss": 0.3248,
"step": 1026
},
{
"epoch": 3.5659722222222223,
"grad_norm": 0.10930688998046321,
"learning_rate": 1.8426301348499495e-05,
"loss": 0.3249,
"step": 1027
},
{
"epoch": 3.5694444444444446,
"grad_norm": 0.12726809440921658,
"learning_rate": 1.8344713800185215e-05,
"loss": 0.3288,
"step": 1028
},
{
"epoch": 3.5729166666666665,
"grad_norm": 0.10648107999527251,
"learning_rate": 1.826325350060121e-05,
"loss": 0.3288,
"step": 1029
},
{
"epoch": 3.576388888888889,
"grad_norm": 0.11912716762210253,
"learning_rate": 1.8181920928416704e-05,
"loss": 0.3204,
"step": 1030
},
{
"epoch": 3.579861111111111,
"grad_norm": 0.12135009526529683,
"learning_rate": 1.810071656155044e-05,
"loss": 0.3247,
"step": 1031
},
{
"epoch": 3.5833333333333335,
"grad_norm": 0.11231174673001908,
"learning_rate": 1.8019640877167763e-05,
"loss": 0.3329,
"step": 1032
},
{
"epoch": 3.5868055555555554,
"grad_norm": 0.11687605127132657,
"learning_rate": 1.7938694351677907e-05,
"loss": 0.3255,
"step": 1033
},
{
"epoch": 3.5902777777777777,
"grad_norm": 0.11239743899073819,
"learning_rate": 1.785787746073111e-05,
"loss": 0.3256,
"step": 1034
},
{
"epoch": 3.59375,
"grad_norm": 0.12259189423086393,
"learning_rate": 1.7777190679215923e-05,
"loss": 0.3243,
"step": 1035
},
{
"epoch": 3.5972222222222223,
"grad_norm": 0.11193043928854648,
"learning_rate": 1.7696634481256293e-05,
"loss": 0.3266,
"step": 1036
},
{
"epoch": 3.6006944444444446,
"grad_norm": 0.12516968216797694,
"learning_rate": 1.761620934020889e-05,
"loss": 0.3269,
"step": 1037
},
{
"epoch": 3.6041666666666665,
"grad_norm": 0.10669706033996294,
"learning_rate": 1.753591572866029e-05,
"loss": 0.3254,
"step": 1038
},
{
"epoch": 3.607638888888889,
"grad_norm": 0.13136099916171398,
"learning_rate": 1.7455754118424134e-05,
"loss": 0.3328,
"step": 1039
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.11840511583143465,
"learning_rate": 1.7375724980538465e-05,
"loss": 0.3324,
"step": 1040
},
{
"epoch": 3.6145833333333335,
"grad_norm": 0.13260287735222887,
"learning_rate": 1.7295828785262857e-05,
"loss": 0.3338,
"step": 1041
},
{
"epoch": 3.6180555555555554,
"grad_norm": 0.1092213984349806,
"learning_rate": 1.721606600207575e-05,
"loss": 0.3264,
"step": 1042
},
{
"epoch": 3.6215277777777777,
"grad_norm": 0.12107825432473537,
"learning_rate": 1.713643709967159e-05,
"loss": 0.3261,
"step": 1043
},
{
"epoch": 3.625,
"grad_norm": 0.11663128574485776,
"learning_rate": 1.7056942545958167e-05,
"loss": 0.3278,
"step": 1044
},
{
"epoch": 3.6284722222222223,
"grad_norm": 0.1162808823074308,
"learning_rate": 1.697758280805379e-05,
"loss": 0.328,
"step": 1045
},
{
"epoch": 3.6319444444444446,
"grad_norm": 0.11969320984733327,
"learning_rate": 1.68983583522846e-05,
"loss": 0.3286,
"step": 1046
},
{
"epoch": 3.6354166666666665,
"grad_norm": 0.12749537876295391,
"learning_rate": 1.68192696441818e-05,
"loss": 0.3282,
"step": 1047
},
{
"epoch": 3.638888888888889,
"grad_norm": 0.12432812932104631,
"learning_rate": 1.6740317148478932e-05,
"loss": 0.3298,
"step": 1048
},
{
"epoch": 3.642361111111111,
"grad_norm": 0.12352471143147438,
"learning_rate": 1.6661501329109118e-05,
"loss": 0.3261,
"step": 1049
},
{
"epoch": 3.6458333333333335,
"grad_norm": 0.12619465833114463,
"learning_rate": 1.6582822649202382e-05,
"loss": 0.3263,
"step": 1050
},
{
"epoch": 3.6493055555555554,
"grad_norm": 0.11466347815059254,
"learning_rate": 1.6504281571082873e-05,
"loss": 0.3194,
"step": 1051
},
{
"epoch": 3.6527777777777777,
"grad_norm": 0.13179675145606548,
"learning_rate": 1.642587855626621e-05,
"loss": 0.3319,
"step": 1052
},
{
"epoch": 3.65625,
"grad_norm": 0.11282192811037618,
"learning_rate": 1.6347614065456715e-05,
"loss": 0.3284,
"step": 1053
},
{
"epoch": 3.6597222222222223,
"grad_norm": 0.11886170256023952,
"learning_rate": 1.6269488558544724e-05,
"loss": 0.3293,
"step": 1054
},
{
"epoch": 3.6631944444444446,
"grad_norm": 0.11799256382181283,
"learning_rate": 1.6191502494603925e-05,
"loss": 0.3266,
"step": 1055
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.11390887631440014,
"learning_rate": 1.6113656331888563e-05,
"loss": 0.3272,
"step": 1056
},
{
"epoch": 3.670138888888889,
"grad_norm": 0.12377782950633084,
"learning_rate": 1.6035950527830868e-05,
"loss": 0.3299,
"step": 1057
},
{
"epoch": 3.673611111111111,
"grad_norm": 0.10228022446219863,
"learning_rate": 1.5958385539038285e-05,
"loss": 0.3311,
"step": 1058
},
{
"epoch": 3.6770833333333335,
"grad_norm": 0.12066991092121124,
"learning_rate": 1.588096182129082e-05,
"loss": 0.3286,
"step": 1059
},
{
"epoch": 3.6805555555555554,
"grad_norm": 0.10509918089098899,
"learning_rate": 1.580367982953833e-05,
"loss": 0.3292,
"step": 1060
},
{
"epoch": 3.6840277777777777,
"grad_norm": 0.10663352182573295,
"learning_rate": 1.572654001789792e-05,
"loss": 0.3334,
"step": 1061
},
{
"epoch": 3.6875,
"grad_norm": 0.11999418073196094,
"learning_rate": 1.5649542839651175e-05,
"loss": 0.3256,
"step": 1062
},
{
"epoch": 3.6909722222222223,
"grad_norm": 0.11145209248435085,
"learning_rate": 1.5572688747241605e-05,
"loss": 0.3269,
"step": 1063
},
{
"epoch": 3.6944444444444446,
"grad_norm": 0.11903495171170056,
"learning_rate": 1.5495978192271887e-05,
"loss": 0.32,
"step": 1064
},
{
"epoch": 3.6979166666666665,
"grad_norm": 0.11631140126765901,
"learning_rate": 1.5419411625501302e-05,
"loss": 0.3255,
"step": 1065
},
{
"epoch": 3.701388888888889,
"grad_norm": 0.11875582201597984,
"learning_rate": 1.534298949684299e-05,
"loss": 0.3273,
"step": 1066
},
{
"epoch": 3.704861111111111,
"grad_norm": 0.12463534502290068,
"learning_rate": 1.5266712255361413e-05,
"loss": 0.3282,
"step": 1067
},
{
"epoch": 3.7083333333333335,
"grad_norm": 0.11873227730380286,
"learning_rate": 1.5190580349269604e-05,
"loss": 0.3287,
"step": 1068
},
{
"epoch": 3.7118055555555554,
"grad_norm": 0.1253968455352852,
"learning_rate": 1.5114594225926631e-05,
"loss": 0.3373,
"step": 1069
},
{
"epoch": 3.7152777777777777,
"grad_norm": 0.11955105986890233,
"learning_rate": 1.503875433183493e-05,
"loss": 0.3309,
"step": 1070
},
{
"epoch": 3.71875,
"grad_norm": 0.11945767131489388,
"learning_rate": 1.4963061112637637e-05,
"loss": 0.3257,
"step": 1071
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.11320297989341972,
"learning_rate": 1.4887515013116067e-05,
"loss": 0.3324,
"step": 1072
},
{
"epoch": 3.7256944444444446,
"grad_norm": 0.10360639278268688,
"learning_rate": 1.481211647718698e-05,
"loss": 0.3214,
"step": 1073
},
{
"epoch": 3.7291666666666665,
"grad_norm": 0.10520008867884603,
"learning_rate": 1.4736865947900106e-05,
"loss": 0.3281,
"step": 1074
},
{
"epoch": 3.732638888888889,
"grad_norm": 0.10779658571104866,
"learning_rate": 1.4661763867435407e-05,
"loss": 0.3259,
"step": 1075
},
{
"epoch": 3.736111111111111,
"grad_norm": 0.10358861836884209,
"learning_rate": 1.4586810677100608e-05,
"loss": 0.3309,
"step": 1076
},
{
"epoch": 3.7395833333333335,
"grad_norm": 0.11055658533942075,
"learning_rate": 1.4512006817328472e-05,
"loss": 0.3268,
"step": 1077
},
{
"epoch": 3.7430555555555554,
"grad_norm": 0.11078061429552334,
"learning_rate": 1.4437352727674335e-05,
"loss": 0.3267,
"step": 1078
},
{
"epoch": 3.7465277777777777,
"grad_norm": 0.10435501091691729,
"learning_rate": 1.4362848846813461e-05,
"loss": 0.3245,
"step": 1079
},
{
"epoch": 3.75,
"grad_norm": 0.11960459787061607,
"learning_rate": 1.4288495612538427e-05,
"loss": 0.3308,
"step": 1080
},
{
"epoch": 3.7534722222222223,
"grad_norm": 0.10183573591431129,
"learning_rate": 1.4214293461756645e-05,
"loss": 0.3228,
"step": 1081
},
{
"epoch": 3.7569444444444446,
"grad_norm": 0.10326319760910473,
"learning_rate": 1.4140242830487743e-05,
"loss": 0.3257,
"step": 1082
},
{
"epoch": 3.7604166666666665,
"grad_norm": 0.10288885616446265,
"learning_rate": 1.406634415386095e-05,
"loss": 0.3312,
"step": 1083
},
{
"epoch": 3.763888888888889,
"grad_norm": 0.10259782848930725,
"learning_rate": 1.3992597866112667e-05,
"loss": 0.3245,
"step": 1084
},
{
"epoch": 3.767361111111111,
"grad_norm": 0.10218365204918832,
"learning_rate": 1.391900440058379e-05,
"loss": 0.3272,
"step": 1085
},
{
"epoch": 3.7708333333333335,
"grad_norm": 0.10627712104738776,
"learning_rate": 1.3845564189717218e-05,
"loss": 0.3275,
"step": 1086
},
{
"epoch": 3.7743055555555554,
"grad_norm": 0.10701771275450184,
"learning_rate": 1.3772277665055351e-05,
"loss": 0.3317,
"step": 1087
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.1095852881244597,
"learning_rate": 1.369914525723746e-05,
"loss": 0.3282,
"step": 1088
},
{
"epoch": 3.78125,
"grad_norm": 0.11023426100788798,
"learning_rate": 1.3626167395997247e-05,
"loss": 0.3282,
"step": 1089
},
{
"epoch": 3.7847222222222223,
"grad_norm": 0.09945960251671251,
"learning_rate": 1.3553344510160268e-05,
"loss": 0.3265,
"step": 1090
},
{
"epoch": 3.7881944444444446,
"grad_norm": 0.12218443061362727,
"learning_rate": 1.3480677027641443e-05,
"loss": 0.33,
"step": 1091
},
{
"epoch": 3.7916666666666665,
"grad_norm": 0.11447703332102428,
"learning_rate": 1.3408165375442486e-05,
"loss": 0.3305,
"step": 1092
},
{
"epoch": 3.795138888888889,
"grad_norm": 0.09651158172556974,
"learning_rate": 1.3335809979649486e-05,
"loss": 0.3204,
"step": 1093
},
{
"epoch": 3.798611111111111,
"grad_norm": 0.11384440993096656,
"learning_rate": 1.3263611265430303e-05,
"loss": 0.3267,
"step": 1094
},
{
"epoch": 3.8020833333333335,
"grad_norm": 0.11304722402701793,
"learning_rate": 1.319156965703217e-05,
"loss": 0.3326,
"step": 1095
},
{
"epoch": 3.8055555555555554,
"grad_norm": 0.10486976163312414,
"learning_rate": 1.3119685577779105e-05,
"loss": 0.3293,
"step": 1096
},
{
"epoch": 3.8090277777777777,
"grad_norm": 0.1271060262298,
"learning_rate": 1.3047959450069505e-05,
"loss": 0.3272,
"step": 1097
},
{
"epoch": 3.8125,
"grad_norm": 0.11143745204437848,
"learning_rate": 1.297639169537359e-05,
"loss": 0.3297,
"step": 1098
},
{
"epoch": 3.8159722222222223,
"grad_norm": 0.10831300071287393,
"learning_rate": 1.290498273423101e-05,
"loss": 0.3254,
"step": 1099
},
{
"epoch": 3.8194444444444446,
"grad_norm": 0.09826593280022544,
"learning_rate": 1.2833732986248277e-05,
"loss": 0.3237,
"step": 1100
},
{
"epoch": 3.8229166666666665,
"grad_norm": 0.1080511751809344,
"learning_rate": 1.2762642870096377e-05,
"loss": 0.3281,
"step": 1101
},
{
"epoch": 3.826388888888889,
"grad_norm": 0.10191099405231756,
"learning_rate": 1.2691712803508307e-05,
"loss": 0.3246,
"step": 1102
},
{
"epoch": 3.829861111111111,
"grad_norm": 0.1182255215246636,
"learning_rate": 1.2620943203276527e-05,
"loss": 0.3297,
"step": 1103
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.11372702532966089,
"learning_rate": 1.2550334485250661e-05,
"loss": 0.321,
"step": 1104
},
{
"epoch": 3.8368055555555554,
"grad_norm": 0.11153225684328712,
"learning_rate": 1.2479887064334904e-05,
"loss": 0.3247,
"step": 1105
},
{
"epoch": 3.8402777777777777,
"grad_norm": 0.10934410198524311,
"learning_rate": 1.24096013544857e-05,
"loss": 0.3179,
"step": 1106
},
{
"epoch": 3.84375,
"grad_norm": 0.11047573559225156,
"learning_rate": 1.233947776870923e-05,
"loss": 0.3237,
"step": 1107
},
{
"epoch": 3.8472222222222223,
"grad_norm": 0.10334708781569232,
"learning_rate": 1.2269516719059041e-05,
"loss": 0.3286,
"step": 1108
},
{
"epoch": 3.8506944444444446,
"grad_norm": 0.0997713836270273,
"learning_rate": 1.2199718616633574e-05,
"loss": 0.3323,
"step": 1109
},
{
"epoch": 3.8541666666666665,
"grad_norm": 0.10598909183310787,
"learning_rate": 1.2130083871573812e-05,
"loss": 0.3294,
"step": 1110
},
{
"epoch": 3.857638888888889,
"grad_norm": 0.1019537717012242,
"learning_rate": 1.2060612893060788e-05,
"loss": 0.3309,
"step": 1111
},
{
"epoch": 3.861111111111111,
"grad_norm": 0.10394292731566264,
"learning_rate": 1.1991306089313261e-05,
"loss": 0.3286,
"step": 1112
},
{
"epoch": 3.8645833333333335,
"grad_norm": 0.10505701208953087,
"learning_rate": 1.1922163867585268e-05,
"loss": 0.3271,
"step": 1113
},
{
"epoch": 3.8680555555555554,
"grad_norm": 0.11019283466687356,
"learning_rate": 1.1853186634163766e-05,
"loss": 0.3203,
"step": 1114
},
{
"epoch": 3.8715277777777777,
"grad_norm": 0.11259235512564435,
"learning_rate": 1.1784374794366177e-05,
"loss": 0.3283,
"step": 1115
},
{
"epoch": 3.875,
"grad_norm": 0.10395515531494519,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.3301,
"step": 1116
},
{
"epoch": 3.8784722222222223,
"grad_norm": 0.09840864009316534,
"learning_rate": 1.1647248912050863e-05,
"loss": 0.3293,
"step": 1117
},
{
"epoch": 3.8819444444444446,
"grad_norm": 0.11150628322480051,
"learning_rate": 1.1578935675299166e-05,
"loss": 0.3218,
"step": 1118
},
{
"epoch": 3.8854166666666665,
"grad_norm": 0.09778565099255097,
"learning_rate": 1.1510789443698772e-05,
"loss": 0.3248,
"step": 1119
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.10078294206883251,
"learning_rate": 1.1442810617684046e-05,
"loss": 0.3232,
"step": 1120
},
{
"epoch": 3.892361111111111,
"grad_norm": 0.10514241635853229,
"learning_rate": 1.1374999596705707e-05,
"loss": 0.3251,
"step": 1121
},
{
"epoch": 3.8958333333333335,
"grad_norm": 0.09516414758527439,
"learning_rate": 1.130735677922842e-05,
"loss": 0.3244,
"step": 1122
},
{
"epoch": 3.8993055555555554,
"grad_norm": 0.09425427526229069,
"learning_rate": 1.1239882562728476e-05,
"loss": 0.3278,
"step": 1123
},
{
"epoch": 3.9027777777777777,
"grad_norm": 0.10027536579030943,
"learning_rate": 1.1172577343691415e-05,
"loss": 0.3229,
"step": 1124
},
{
"epoch": 3.90625,
"grad_norm": 0.09683190273273522,
"learning_rate": 1.110544151760978e-05,
"loss": 0.3298,
"step": 1125
},
{
"epoch": 3.9097222222222223,
"grad_norm": 0.0967610942668413,
"learning_rate": 1.1038475478980697e-05,
"loss": 0.3279,
"step": 1126
},
{
"epoch": 3.9131944444444446,
"grad_norm": 0.10330176910362741,
"learning_rate": 1.0971679621303642e-05,
"loss": 0.3317,
"step": 1127
},
{
"epoch": 3.9166666666666665,
"grad_norm": 0.09289793987187862,
"learning_rate": 1.0905054337078051e-05,
"loss": 0.3227,
"step": 1128
},
{
"epoch": 3.920138888888889,
"grad_norm": 0.09497389173764989,
"learning_rate": 1.08386000178011e-05,
"loss": 0.3226,
"step": 1129
},
{
"epoch": 3.923611111111111,
"grad_norm": 0.0895936439673318,
"learning_rate": 1.0772317053965304e-05,
"loss": 0.3262,
"step": 1130
},
{
"epoch": 3.9270833333333335,
"grad_norm": 0.11102321960186208,
"learning_rate": 1.0706205835056326e-05,
"loss": 0.3263,
"step": 1131
},
{
"epoch": 3.9305555555555554,
"grad_norm": 0.10321047657838925,
"learning_rate": 1.0640266749550593e-05,
"loss": 0.3291,
"step": 1132
},
{
"epoch": 3.9340277777777777,
"grad_norm": 0.09582704106508376,
"learning_rate": 1.0574500184913083e-05,
"loss": 0.3264,
"step": 1133
},
{
"epoch": 3.9375,
"grad_norm": 0.10607451556807133,
"learning_rate": 1.0508906527595042e-05,
"loss": 0.3249,
"step": 1134
},
{
"epoch": 3.9409722222222223,
"grad_norm": 0.09752377721853296,
"learning_rate": 1.0443486163031644e-05,
"loss": 0.322,
"step": 1135
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.08985167666388755,
"learning_rate": 1.0378239475639823e-05,
"loss": 0.3312,
"step": 1136
},
{
"epoch": 3.9479166666666665,
"grad_norm": 0.09861848573864489,
"learning_rate": 1.0313166848815931e-05,
"loss": 0.3283,
"step": 1137
},
{
"epoch": 3.951388888888889,
"grad_norm": 0.09568519398223461,
"learning_rate": 1.0248268664933563e-05,
"loss": 0.3235,
"step": 1138
},
{
"epoch": 3.954861111111111,
"grad_norm": 0.09293210325829616,
"learning_rate": 1.018354530534122e-05,
"loss": 0.3233,
"step": 1139
},
{
"epoch": 3.9583333333333335,
"grad_norm": 0.09919535966598288,
"learning_rate": 1.0118997150360169e-05,
"loss": 0.3248,
"step": 1140
},
{
"epoch": 3.9618055555555554,
"grad_norm": 0.10003396984693862,
"learning_rate": 1.0054624579282107e-05,
"loss": 0.3258,
"step": 1141
},
{
"epoch": 3.9652777777777777,
"grad_norm": 0.09547811645073016,
"learning_rate": 9.990427970367032e-06,
"loss": 0.3248,
"step": 1142
},
{
"epoch": 3.96875,
"grad_norm": 0.09343511294470988,
"learning_rate": 9.92640770084091e-06,
"loss": 0.3228,
"step": 1143
},
{
"epoch": 3.9722222222222223,
"grad_norm": 0.09824876865049713,
"learning_rate": 9.862564146893571e-06,
"loss": 0.3261,
"step": 1144
},
{
"epoch": 3.9756944444444446,
"grad_norm": 0.09020643426664245,
"learning_rate": 9.798897683676425e-06,
"loss": 0.3206,
"step": 1145
},
{
"epoch": 3.9791666666666665,
"grad_norm": 0.09196067134166887,
"learning_rate": 9.735408685300287e-06,
"loss": 0.3287,
"step": 1146
},
{
"epoch": 3.982638888888889,
"grad_norm": 0.09624041193059374,
"learning_rate": 9.672097524833144e-06,
"loss": 0.3234,
"step": 1147
},
{
"epoch": 3.986111111111111,
"grad_norm": 0.09445572196365963,
"learning_rate": 9.60896457429803e-06,
"loss": 0.3269,
"step": 1148
},
{
"epoch": 3.9895833333333335,
"grad_norm": 0.09413435203202192,
"learning_rate": 9.546010204670759e-06,
"loss": 0.3249,
"step": 1149
},
{
"epoch": 3.9930555555555554,
"grad_norm": 0.09849551726546966,
"learning_rate": 9.4832347858778e-06,
"loss": 0.3327,
"step": 1150
},
{
"epoch": 3.9965277777777777,
"grad_norm": 0.10716050657493227,
"learning_rate": 9.420638686794104e-06,
"loss": 0.3332,
"step": 1151
},
{
"epoch": 4.0,
"grad_norm": 0.14340310851517993,
"learning_rate": 9.358222275240884e-06,
"loss": 0.3105,
"step": 1152
},
{
"epoch": 4.003472222222222,
"grad_norm": 0.13131364336545615,
"learning_rate": 9.29598591798353e-06,
"loss": 0.3078,
"step": 1153
},
{
"epoch": 4.006944444444445,
"grad_norm": 0.11508669248540353,
"learning_rate": 9.233929980729406e-06,
"loss": 0.3034,
"step": 1154
},
{
"epoch": 4.010416666666667,
"grad_norm": 0.10786149626912109,
"learning_rate": 9.172054828125678e-06,
"loss": 0.304,
"step": 1155
},
{
"epoch": 4.013888888888889,
"grad_norm": 0.13206545124072844,
"learning_rate": 9.110360823757235e-06,
"loss": 0.3073,
"step": 1156
},
{
"epoch": 4.017361111111111,
"grad_norm": 0.13000704335108745,
"learning_rate": 9.048848330144517e-06,
"loss": 0.2984,
"step": 1157
},
{
"epoch": 4.020833333333333,
"grad_norm": 0.12136424684494163,
"learning_rate": 8.987517708741364e-06,
"loss": 0.3033,
"step": 1158
},
{
"epoch": 4.024305555555555,
"grad_norm": 0.1131951055285726,
"learning_rate": 8.926369319932955e-06,
"loss": 0.3038,
"step": 1159
},
{
"epoch": 4.027777777777778,
"grad_norm": 0.12276551490332963,
"learning_rate": 8.8654035230336e-06,
"loss": 0.3063,
"step": 1160
},
{
"epoch": 4.03125,
"grad_norm": 0.11606442023390179,
"learning_rate": 8.804620676284736e-06,
"loss": 0.3045,
"step": 1161
},
{
"epoch": 4.034722222222222,
"grad_norm": 0.11139314403259774,
"learning_rate": 8.74402113685271e-06,
"loss": 0.3007,
"step": 1162
},
{
"epoch": 4.038194444444445,
"grad_norm": 0.1065337658272395,
"learning_rate": 8.683605260826792e-06,
"loss": 0.3072,
"step": 1163
},
{
"epoch": 4.041666666666667,
"grad_norm": 0.10443063897412685,
"learning_rate": 8.623373403216972e-06,
"loss": 0.3046,
"step": 1164
},
{
"epoch": 4.045138888888889,
"grad_norm": 0.11050778057057117,
"learning_rate": 8.56332591795197e-06,
"loss": 0.3108,
"step": 1165
},
{
"epoch": 4.048611111111111,
"grad_norm": 0.10107821431355542,
"learning_rate": 8.503463157877112e-06,
"loss": 0.3041,
"step": 1166
},
{
"epoch": 4.052083333333333,
"grad_norm": 0.11491987551631903,
"learning_rate": 8.44378547475222e-06,
"loss": 0.3076,
"step": 1167
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.10884189411682854,
"learning_rate": 8.384293219249633e-06,
"loss": 0.3095,
"step": 1168
},
{
"epoch": 4.059027777777778,
"grad_norm": 0.09910051377670472,
"learning_rate": 8.324986740952061e-06,
"loss": 0.3068,
"step": 1169
},
{
"epoch": 4.0625,
"grad_norm": 0.09641277889877216,
"learning_rate": 8.265866388350598e-06,
"loss": 0.305,
"step": 1170
},
{
"epoch": 4.065972222222222,
"grad_norm": 0.09639148768059384,
"learning_rate": 8.206932508842617e-06,
"loss": 0.3078,
"step": 1171
},
{
"epoch": 4.069444444444445,
"grad_norm": 0.09590017094288145,
"learning_rate": 8.148185448729778e-06,
"loss": 0.3048,
"step": 1172
},
{
"epoch": 4.072916666666667,
"grad_norm": 0.1031626172943818,
"learning_rate": 8.089625553215947e-06,
"loss": 0.3072,
"step": 1173
},
{
"epoch": 4.076388888888889,
"grad_norm": 0.09505984395972193,
"learning_rate": 8.031253166405223e-06,
"loss": 0.3067,
"step": 1174
},
{
"epoch": 4.079861111111111,
"grad_norm": 0.09796967816252462,
"learning_rate": 7.973068631299848e-06,
"loss": 0.3049,
"step": 1175
},
{
"epoch": 4.083333333333333,
"grad_norm": 0.10038582605001176,
"learning_rate": 7.915072289798247e-06,
"loss": 0.31,
"step": 1176
},
{
"epoch": 4.086805555555555,
"grad_norm": 0.09456176428791851,
"learning_rate": 7.857264482693007e-06,
"loss": 0.301,
"step": 1177
},
{
"epoch": 4.090277777777778,
"grad_norm": 0.10194949961977534,
"learning_rate": 7.799645549668869e-06,
"loss": 0.3044,
"step": 1178
},
{
"epoch": 4.09375,
"grad_norm": 0.10057820263217915,
"learning_rate": 7.742215829300695e-06,
"loss": 0.306,
"step": 1179
},
{
"epoch": 4.097222222222222,
"grad_norm": 0.09003366079286232,
"learning_rate": 7.684975659051557e-06,
"loss": 0.3068,
"step": 1180
},
{
"epoch": 4.100694444444445,
"grad_norm": 0.10159173660529588,
"learning_rate": 7.627925375270684e-06,
"loss": 0.3079,
"step": 1181
},
{
"epoch": 4.104166666666667,
"grad_norm": 0.09248887797460335,
"learning_rate": 7.5710653131915125e-06,
"loss": 0.3056,
"step": 1182
},
{
"epoch": 4.107638888888889,
"grad_norm": 0.09784437202252298,
"learning_rate": 7.514395806929742e-06,
"loss": 0.3069,
"step": 1183
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.10274066112669682,
"learning_rate": 7.457917189481301e-06,
"loss": 0.3053,
"step": 1184
},
{
"epoch": 4.114583333333333,
"grad_norm": 0.08905635702892338,
"learning_rate": 7.401629792720495e-06,
"loss": 0.3028,
"step": 1185
},
{
"epoch": 4.118055555555555,
"grad_norm": 0.0932267327763702,
"learning_rate": 7.345533947397933e-06,
"loss": 0.3053,
"step": 1186
},
{
"epoch": 4.121527777777778,
"grad_norm": 0.09571394607798979,
"learning_rate": 7.289629983138691e-06,
"loss": 0.305,
"step": 1187
},
{
"epoch": 4.125,
"grad_norm": 0.09484033625260786,
"learning_rate": 7.233918228440324e-06,
"loss": 0.3033,
"step": 1188
},
{
"epoch": 4.128472222222222,
"grad_norm": 0.09866883982672792,
"learning_rate": 7.1783990106709485e-06,
"loss": 0.3043,
"step": 1189
},
{
"epoch": 4.131944444444445,
"grad_norm": 0.09157610827195094,
"learning_rate": 7.123072656067278e-06,
"loss": 0.3022,
"step": 1190
},
{
"epoch": 4.135416666666667,
"grad_norm": 0.08992237518920917,
"learning_rate": 7.067939489732794e-06,
"loss": 0.3056,
"step": 1191
},
{
"epoch": 4.138888888888889,
"grad_norm": 0.08399036043953488,
"learning_rate": 7.0129998356357295e-06,
"loss": 0.2967,
"step": 1192
},
{
"epoch": 4.142361111111111,
"grad_norm": 0.09226206512891923,
"learning_rate": 6.958254016607275e-06,
"loss": 0.3004,
"step": 1193
},
{
"epoch": 4.145833333333333,
"grad_norm": 0.09669707681367447,
"learning_rate": 6.903702354339578e-06,
"loss": 0.3008,
"step": 1194
},
{
"epoch": 4.149305555555555,
"grad_norm": 0.08590239014832185,
"learning_rate": 6.849345169383941e-06,
"loss": 0.3076,
"step": 1195
},
{
"epoch": 4.152777777777778,
"grad_norm": 0.09036361233423511,
"learning_rate": 6.795182781148848e-06,
"loss": 0.3074,
"step": 1196
},
{
"epoch": 4.15625,
"grad_norm": 0.09500461151290321,
"learning_rate": 6.7412155078981865e-06,
"loss": 0.3017,
"step": 1197
},
{
"epoch": 4.159722222222222,
"grad_norm": 0.08791034639868246,
"learning_rate": 6.687443666749316e-06,
"loss": 0.3071,
"step": 1198
},
{
"epoch": 4.163194444444445,
"grad_norm": 0.08752933631983714,
"learning_rate": 6.633867573671185e-06,
"loss": 0.3015,
"step": 1199
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.08896289666476007,
"learning_rate": 6.58048754348255e-06,
"loss": 0.3036,
"step": 1200
},
{
"epoch": 4.170138888888889,
"grad_norm": 0.08717440264167338,
"learning_rate": 6.527303889850038e-06,
"loss": 0.3075,
"step": 1201
},
{
"epoch": 4.173611111111111,
"grad_norm": 0.09311114762510445,
"learning_rate": 6.474316925286391e-06,
"loss": 0.3064,
"step": 1202
},
{
"epoch": 4.177083333333333,
"grad_norm": 0.09054319998608429,
"learning_rate": 6.421526961148545e-06,
"loss": 0.307,
"step": 1203
},
{
"epoch": 4.180555555555555,
"grad_norm": 0.08473005230511284,
"learning_rate": 6.368934307635881e-06,
"loss": 0.3018,
"step": 1204
},
{
"epoch": 4.184027777777778,
"grad_norm": 0.0906747780665984,
"learning_rate": 6.316539273788316e-06,
"loss": 0.3049,
"step": 1205
},
{
"epoch": 4.1875,
"grad_norm": 0.09376500153252898,
"learning_rate": 6.26434216748458e-06,
"loss": 0.309,
"step": 1206
},
{
"epoch": 4.190972222222222,
"grad_norm": 0.09594364360393172,
"learning_rate": 6.2123432954403155e-06,
"loss": 0.3046,
"step": 1207
},
{
"epoch": 4.194444444444445,
"grad_norm": 0.0821759111515786,
"learning_rate": 6.160542963206357e-06,
"loss": 0.2996,
"step": 1208
},
{
"epoch": 4.197916666666667,
"grad_norm": 0.08749906272107089,
"learning_rate": 6.108941475166879e-06,
"loss": 0.3079,
"step": 1209
},
{
"epoch": 4.201388888888889,
"grad_norm": 0.09555774118608272,
"learning_rate": 6.057539134537642e-06,
"loss": 0.3087,
"step": 1210
},
{
"epoch": 4.204861111111111,
"grad_norm": 0.08402200455892112,
"learning_rate": 6.006336243364161e-06,
"loss": 0.3047,
"step": 1211
},
{
"epoch": 4.208333333333333,
"grad_norm": 0.08995895348715435,
"learning_rate": 5.955333102520011e-06,
"loss": 0.3058,
"step": 1212
},
{
"epoch": 4.211805555555555,
"grad_norm": 0.087339945882205,
"learning_rate": 5.904530011704977e-06,
"loss": 0.306,
"step": 1213
},
{
"epoch": 4.215277777777778,
"grad_norm": 0.08520619113415061,
"learning_rate": 5.853927269443351e-06,
"loss": 0.3036,
"step": 1214
},
{
"epoch": 4.21875,
"grad_norm": 0.09260547720974625,
"learning_rate": 5.803525173082145e-06,
"loss": 0.3122,
"step": 1215
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.09653805351478427,
"learning_rate": 5.753324018789346e-06,
"loss": 0.3001,
"step": 1216
},
{
"epoch": 4.225694444444445,
"grad_norm": 0.09089979862793579,
"learning_rate": 5.703324101552215e-06,
"loss": 0.3081,
"step": 1217
},
{
"epoch": 4.229166666666667,
"grad_norm": 0.09407662119818534,
"learning_rate": 5.653525715175483e-06,
"loss": 0.305,
"step": 1218
},
{
"epoch": 4.232638888888889,
"grad_norm": 0.08882380776423075,
"learning_rate": 5.6039291522796925e-06,
"loss": 0.3094,
"step": 1219
},
{
"epoch": 4.236111111111111,
"grad_norm": 0.08654673046171545,
"learning_rate": 5.554534704299448e-06,
"loss": 0.3017,
"step": 1220
},
{
"epoch": 4.239583333333333,
"grad_norm": 0.09564054745602778,
"learning_rate": 5.5053426614817094e-06,
"loss": 0.3064,
"step": 1221
},
{
"epoch": 4.243055555555555,
"grad_norm": 0.08742594086106721,
"learning_rate": 5.456353312884051e-06,
"loss": 0.3054,
"step": 1222
},
{
"epoch": 4.246527777777778,
"grad_norm": 0.08777112302991236,
"learning_rate": 5.407566946373037e-06,
"loss": 0.3033,
"step": 1223
},
{
"epoch": 4.25,
"grad_norm": 0.08507735550646113,
"learning_rate": 5.358983848622452e-06,
"loss": 0.302,
"step": 1224
},
{
"epoch": 4.253472222222222,
"grad_norm": 0.09199965080958919,
"learning_rate": 5.310604305111686e-06,
"loss": 0.3093,
"step": 1225
},
{
"epoch": 4.256944444444445,
"grad_norm": 0.09239775881843512,
"learning_rate": 5.262428600123981e-06,
"loss": 0.3092,
"step": 1226
},
{
"epoch": 4.260416666666667,
"grad_norm": 0.08274177231578059,
"learning_rate": 5.2144570167448475e-06,
"loss": 0.307,
"step": 1227
},
{
"epoch": 4.263888888888889,
"grad_norm": 0.08396101555744864,
"learning_rate": 5.1666898368603195e-06,
"loss": 0.3032,
"step": 1228
},
{
"epoch": 4.267361111111111,
"grad_norm": 0.08953375546952966,
"learning_rate": 5.119127341155365e-06,
"loss": 0.3047,
"step": 1229
},
{
"epoch": 4.270833333333333,
"grad_norm": 0.09007507657663101,
"learning_rate": 5.07176980911217e-06,
"loss": 0.3047,
"step": 1230
},
{
"epoch": 4.274305555555555,
"grad_norm": 0.08417185133114392,
"learning_rate": 5.024617519008574e-06,
"loss": 0.3024,
"step": 1231
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.08609345920977819,
"learning_rate": 4.97767074791637e-06,
"loss": 0.3064,
"step": 1232
},
{
"epoch": 4.28125,
"grad_norm": 0.08472912916639756,
"learning_rate": 4.930929771699693e-06,
"loss": 0.3092,
"step": 1233
},
{
"epoch": 4.284722222222222,
"grad_norm": 0.08804997352238675,
"learning_rate": 4.8843948650134285e-06,
"loss": 0.299,
"step": 1234
},
{
"epoch": 4.288194444444445,
"grad_norm": 0.08479705600440504,
"learning_rate": 4.838066301301547e-06,
"loss": 0.3062,
"step": 1235
},
{
"epoch": 4.291666666666667,
"grad_norm": 0.08081270894823607,
"learning_rate": 4.791944352795561e-06,
"loss": 0.3062,
"step": 1236
},
{
"epoch": 4.295138888888889,
"grad_norm": 0.08287377233415162,
"learning_rate": 4.746029290512852e-06,
"loss": 0.3031,
"step": 1237
},
{
"epoch": 4.298611111111111,
"grad_norm": 0.08775456916989416,
"learning_rate": 4.700321384255158e-06,
"loss": 0.3019,
"step": 1238
},
{
"epoch": 4.302083333333333,
"grad_norm": 0.0872987560615773,
"learning_rate": 4.654820902606898e-06,
"loss": 0.3051,
"step": 1239
},
{
"epoch": 4.305555555555555,
"grad_norm": 0.08121221496704681,
"learning_rate": 4.609528112933688e-06,
"loss": 0.3111,
"step": 1240
},
{
"epoch": 4.309027777777778,
"grad_norm": 0.08383067090092823,
"learning_rate": 4.564443281380708e-06,
"loss": 0.3079,
"step": 1241
},
{
"epoch": 4.3125,
"grad_norm": 0.08586819103118022,
"learning_rate": 4.519566672871132e-06,
"loss": 0.3072,
"step": 1242
},
{
"epoch": 4.315972222222222,
"grad_norm": 0.08230433839566585,
"learning_rate": 4.474898551104625e-06,
"loss": 0.3077,
"step": 1243
},
{
"epoch": 4.319444444444445,
"grad_norm": 0.08210657080061083,
"learning_rate": 4.430439178555759e-06,
"loss": 0.3033,
"step": 1244
},
{
"epoch": 4.322916666666667,
"grad_norm": 0.08729196012855645,
"learning_rate": 4.386188816472441e-06,
"loss": 0.3111,
"step": 1245
},
{
"epoch": 4.326388888888889,
"grad_norm": 0.08240213073346828,
"learning_rate": 4.342147724874459e-06,
"loss": 0.3088,
"step": 1246
},
{
"epoch": 4.329861111111111,
"grad_norm": 0.0794807984870581,
"learning_rate": 4.29831616255187e-06,
"loss": 0.3033,
"step": 1247
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.08829701160660025,
"learning_rate": 4.254694387063514e-06,
"loss": 0.3075,
"step": 1248
},
{
"epoch": 4.336805555555555,
"grad_norm": 0.08044160076356655,
"learning_rate": 4.2112826547355335e-06,
"loss": 0.3064,
"step": 1249
},
{
"epoch": 4.340277777777778,
"grad_norm": 0.08636152108755218,
"learning_rate": 4.168081220659796e-06,
"loss": 0.305,
"step": 1250
},
{
"epoch": 4.34375,
"grad_norm": 0.08091856000413852,
"learning_rate": 4.12509033869247e-06,
"loss": 0.3038,
"step": 1251
},
{
"epoch": 4.347222222222222,
"grad_norm": 0.08354541659416939,
"learning_rate": 4.082310261452471e-06,
"loss": 0.3083,
"step": 1252
},
{
"epoch": 4.350694444444445,
"grad_norm": 0.08171670419800484,
"learning_rate": 4.039741240320028e-06,
"loss": 0.3015,
"step": 1253
},
{
"epoch": 4.354166666666667,
"grad_norm": 0.08148652161118189,
"learning_rate": 3.997383525435154e-06,
"loss": 0.3063,
"step": 1254
},
{
"epoch": 4.357638888888889,
"grad_norm": 0.08161676617579289,
"learning_rate": 3.9552373656962295e-06,
"loss": 0.3052,
"step": 1255
},
{
"epoch": 4.361111111111111,
"grad_norm": 0.07993172768627148,
"learning_rate": 3.913303008758491e-06,
"loss": 0.3058,
"step": 1256
},
{
"epoch": 4.364583333333333,
"grad_norm": 0.08226260042852836,
"learning_rate": 3.871580701032631e-06,
"loss": 0.3048,
"step": 1257
},
{
"epoch": 4.368055555555555,
"grad_norm": 0.07963967554779505,
"learning_rate": 3.830070687683285e-06,
"loss": 0.3063,
"step": 1258
},
{
"epoch": 4.371527777777778,
"grad_norm": 0.08031725225973586,
"learning_rate": 3.78877321262765e-06,
"loss": 0.3072,
"step": 1259
},
{
"epoch": 4.375,
"grad_norm": 0.07809304104102638,
"learning_rate": 3.747688518534003e-06,
"loss": 0.3023,
"step": 1260
},
{
"epoch": 4.378472222222222,
"grad_norm": 0.07830478547816339,
"learning_rate": 3.706816846820327e-06,
"loss": 0.3016,
"step": 1261
},
{
"epoch": 4.381944444444445,
"grad_norm": 0.08141185465094594,
"learning_rate": 3.666158437652829e-06,
"loss": 0.3072,
"step": 1262
},
{
"epoch": 4.385416666666667,
"grad_norm": 0.08225321441942754,
"learning_rate": 3.6257135299445943e-06,
"loss": 0.3141,
"step": 1263
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.0816264618707245,
"learning_rate": 3.585482361354138e-06,
"loss": 0.3058,
"step": 1264
},
{
"epoch": 4.392361111111111,
"grad_norm": 0.08196549798085448,
"learning_rate": 3.545465168284006e-06,
"loss": 0.3055,
"step": 1265
},
{
"epoch": 4.395833333333333,
"grad_norm": 0.07849614984159622,
"learning_rate": 3.5056621858794393e-06,
"loss": 0.3051,
"step": 1266
},
{
"epoch": 4.399305555555555,
"grad_norm": 0.0819544023451366,
"learning_rate": 3.4660736480269084e-06,
"loss": 0.3079,
"step": 1267
},
{
"epoch": 4.402777777777778,
"grad_norm": 0.08201178776199941,
"learning_rate": 3.42669978735283e-06,
"loss": 0.3066,
"step": 1268
},
{
"epoch": 4.40625,
"grad_norm": 0.07898815368040525,
"learning_rate": 3.3875408352221164e-06,
"loss": 0.3015,
"step": 1269
},
{
"epoch": 4.409722222222222,
"grad_norm": 0.08124064277394925,
"learning_rate": 3.348597021736888e-06,
"loss": 0.3112,
"step": 1270
},
{
"epoch": 4.413194444444445,
"grad_norm": 0.0780944772652276,
"learning_rate": 3.309868575735058e-06,
"loss": 0.3081,
"step": 1271
},
{
"epoch": 4.416666666666667,
"grad_norm": 0.07799431800188858,
"learning_rate": 3.2713557247890447e-06,
"loss": 0.3084,
"step": 1272
},
{
"epoch": 4.420138888888889,
"grad_norm": 0.08246204656448664,
"learning_rate": 3.233058695204383e-06,
"loss": 0.3016,
"step": 1273
},
{
"epoch": 4.423611111111111,
"grad_norm": 0.0801454285645143,
"learning_rate": 3.194977712018439e-06,
"loss": 0.3105,
"step": 1274
},
{
"epoch": 4.427083333333333,
"grad_norm": 0.07819837320085002,
"learning_rate": 3.157112998999057e-06,
"loss": 0.3052,
"step": 1275
},
{
"epoch": 4.430555555555555,
"grad_norm": 0.08098405872621355,
"learning_rate": 3.1194647786432663e-06,
"loss": 0.303,
"step": 1276
},
{
"epoch": 4.434027777777778,
"grad_norm": 0.08486769449790087,
"learning_rate": 3.082033272175933e-06,
"loss": 0.3102,
"step": 1277
},
{
"epoch": 4.4375,
"grad_norm": 0.07846939710904592,
"learning_rate": 3.0448186995485307e-06,
"loss": 0.3073,
"step": 1278
},
{
"epoch": 4.440972222222222,
"grad_norm": 0.07724147069385455,
"learning_rate": 3.0078212794377814e-06,
"loss": 0.3071,
"step": 1279
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.07929202020664662,
"learning_rate": 2.9710412292443868e-06,
"loss": 0.3018,
"step": 1280
},
{
"epoch": 4.447916666666667,
"grad_norm": 0.07979841043601492,
"learning_rate": 2.934478765091795e-06,
"loss": 0.3055,
"step": 1281
},
{
"epoch": 4.451388888888889,
"grad_norm": 0.07774246164480418,
"learning_rate": 2.8981341018248587e-06,
"loss": 0.3046,
"step": 1282
},
{
"epoch": 4.454861111111111,
"grad_norm": 0.07712391520906098,
"learning_rate": 2.8620074530086373e-06,
"loss": 0.3064,
"step": 1283
},
{
"epoch": 4.458333333333333,
"grad_norm": 0.07713201817547355,
"learning_rate": 2.8260990309270987e-06,
"loss": 0.3077,
"step": 1284
},
{
"epoch": 4.461805555555555,
"grad_norm": 0.08201941158917679,
"learning_rate": 2.7904090465819036e-06,
"loss": 0.306,
"step": 1285
},
{
"epoch": 4.465277777777778,
"grad_norm": 0.07755953735549277,
"learning_rate": 2.7549377096911213e-06,
"loss": 0.3051,
"step": 1286
},
{
"epoch": 4.46875,
"grad_norm": 0.07859301016230703,
"learning_rate": 2.7196852286880624e-06,
"loss": 0.3009,
"step": 1287
},
{
"epoch": 4.472222222222222,
"grad_norm": 0.07681147137276423,
"learning_rate": 2.6846518107199782e-06,
"loss": 0.3014,
"step": 1288
},
{
"epoch": 4.475694444444445,
"grad_norm": 0.0788469179654231,
"learning_rate": 2.649837661646921e-06,
"loss": 0.3088,
"step": 1289
},
{
"epoch": 4.479166666666667,
"grad_norm": 0.0800162833065285,
"learning_rate": 2.6152429860404647e-06,
"loss": 0.3041,
"step": 1290
},
{
"epoch": 4.482638888888889,
"grad_norm": 0.07945724490526976,
"learning_rate": 2.580867987182556e-06,
"loss": 0.3026,
"step": 1291
},
{
"epoch": 4.486111111111111,
"grad_norm": 0.0793594687341575,
"learning_rate": 2.546712867064276e-06,
"loss": 0.3083,
"step": 1292
},
{
"epoch": 4.489583333333333,
"grad_norm": 0.07985397006865015,
"learning_rate": 2.512777826384709e-06,
"loss": 0.3007,
"step": 1293
},
{
"epoch": 4.493055555555555,
"grad_norm": 0.07759493413593757,
"learning_rate": 2.479063064549689e-06,
"loss": 0.3003,
"step": 1294
},
{
"epoch": 4.496527777777778,
"grad_norm": 0.07904821520349109,
"learning_rate": 2.4455687796706996e-06,
"loss": 0.3037,
"step": 1295
},
{
"epoch": 4.5,
"grad_norm": 0.07959351475882469,
"learning_rate": 2.4122951685636674e-06,
"loss": 0.3105,
"step": 1296
},
{
"epoch": 4.503472222222222,
"grad_norm": 0.08438189556943551,
"learning_rate": 2.3792424267478077e-06,
"loss": 0.3128,
"step": 1297
},
{
"epoch": 4.506944444444445,
"grad_norm": 0.0805267903725845,
"learning_rate": 2.34641074844451e-06,
"loss": 0.3076,
"step": 1298
},
{
"epoch": 4.510416666666667,
"grad_norm": 0.07648828401004298,
"learning_rate": 2.313800326576141e-06,
"loss": 0.3054,
"step": 1299
},
{
"epoch": 4.513888888888889,
"grad_norm": 0.07847170167124491,
"learning_rate": 2.281411352764966e-06,
"loss": 0.3043,
"step": 1300
},
{
"epoch": 4.517361111111111,
"grad_norm": 0.07964758385848214,
"learning_rate": 2.249244017331975e-06,
"loss": 0.3052,
"step": 1301
},
{
"epoch": 4.520833333333333,
"grad_norm": 0.08137734757681728,
"learning_rate": 2.217298509295813e-06,
"loss": 0.3101,
"step": 1302
},
{
"epoch": 4.524305555555555,
"grad_norm": 0.08059904880275212,
"learning_rate": 2.185575016371626e-06,
"loss": 0.3067,
"step": 1303
},
{
"epoch": 4.527777777777778,
"grad_norm": 0.07642771619974303,
"learning_rate": 2.1540737249699893e-06,
"loss": 0.3006,
"step": 1304
},
{
"epoch": 4.53125,
"grad_norm": 0.07450068026649213,
"learning_rate": 2.122794820195777e-06,
"loss": 0.3029,
"step": 1305
},
{
"epoch": 4.534722222222222,
"grad_norm": 0.07699770912588345,
"learning_rate": 2.0917384858471168e-06,
"loss": 0.3073,
"step": 1306
},
{
"epoch": 4.538194444444445,
"grad_norm": 0.08118194206437729,
"learning_rate": 2.0609049044142894e-06,
"loss": 0.3086,
"step": 1307
},
{
"epoch": 4.541666666666667,
"grad_norm": 0.07918192426609688,
"learning_rate": 2.0302942570786446e-06,
"loss": 0.3033,
"step": 1308
},
{
"epoch": 4.545138888888889,
"grad_norm": 0.0818161903856754,
"learning_rate": 1.999906723711549e-06,
"loss": 0.3091,
"step": 1309
},
{
"epoch": 4.548611111111111,
"grad_norm": 0.07712654079943236,
"learning_rate": 1.9697424828733423e-06,
"loss": 0.301,
"step": 1310
},
{
"epoch": 4.552083333333333,
"grad_norm": 0.07937817181604188,
"learning_rate": 1.9398017118122546e-06,
"loss": 0.3008,
"step": 1311
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.07761097719890697,
"learning_rate": 1.9100845864633875e-06,
"loss": 0.3035,
"step": 1312
},
{
"epoch": 4.559027777777778,
"grad_norm": 0.08249170241628408,
"learning_rate": 1.880591281447699e-06,
"loss": 0.3077,
"step": 1313
},
{
"epoch": 4.5625,
"grad_norm": 0.07671567257184514,
"learning_rate": 1.8513219700709272e-06,
"loss": 0.3012,
"step": 1314
},
{
"epoch": 4.565972222222222,
"grad_norm": 0.08240625489775082,
"learning_rate": 1.8222768243226108e-06,
"loss": 0.3051,
"step": 1315
},
{
"epoch": 4.569444444444445,
"grad_norm": 0.08106392892833238,
"learning_rate": 1.793456014875079e-06,
"loss": 0.3027,
"step": 1316
},
{
"epoch": 4.572916666666667,
"grad_norm": 0.080852573793533,
"learning_rate": 1.7648597110824183e-06,
"loss": 0.3075,
"step": 1317
},
{
"epoch": 4.576388888888889,
"grad_norm": 0.07502019845627791,
"learning_rate": 1.7364880809795082e-06,
"loss": 0.3015,
"step": 1318
},
{
"epoch": 4.579861111111111,
"grad_norm": 0.07400493685151668,
"learning_rate": 1.708341291281026e-06,
"loss": 0.3009,
"step": 1319
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.07449432188464176,
"learning_rate": 1.6804195073804442e-06,
"loss": 0.3059,
"step": 1320
},
{
"epoch": 4.586805555555555,
"grad_norm": 0.08268528004577931,
"learning_rate": 1.6527228933491012e-06,
"loss": 0.3076,
"step": 1321
},
{
"epoch": 4.590277777777778,
"grad_norm": 0.07316983791427785,
"learning_rate": 1.6252516119351947e-06,
"loss": 0.3039,
"step": 1322
},
{
"epoch": 4.59375,
"grad_norm": 0.08087200910739684,
"learning_rate": 1.598005824562856e-06,
"loss": 0.3064,
"step": 1323
},
{
"epoch": 4.597222222222222,
"grad_norm": 0.07713339500755885,
"learning_rate": 1.5709856913311795e-06,
"loss": 0.3063,
"step": 1324
},
{
"epoch": 4.600694444444445,
"grad_norm": 0.07639629736437101,
"learning_rate": 1.5441913710133106e-06,
"loss": 0.3113,
"step": 1325
},
{
"epoch": 4.604166666666667,
"grad_norm": 0.07622410815157274,
"learning_rate": 1.5176230210554744e-06,
"loss": 0.3095,
"step": 1326
},
{
"epoch": 4.607638888888889,
"grad_norm": 0.07402239150583342,
"learning_rate": 1.4912807975760734e-06,
"loss": 0.3001,
"step": 1327
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.07586041151318136,
"learning_rate": 1.4651648553647869e-06,
"loss": 0.3049,
"step": 1328
},
{
"epoch": 4.614583333333333,
"grad_norm": 0.07577501190460577,
"learning_rate": 1.4392753478816145e-06,
"loss": 0.3092,
"step": 1329
},
{
"epoch": 4.618055555555555,
"grad_norm": 0.07339670766161828,
"learning_rate": 1.4136124272560259e-06,
"loss": 0.3056,
"step": 1330
},
{
"epoch": 4.621527777777778,
"grad_norm": 0.07347997250949742,
"learning_rate": 1.3881762442860124e-06,
"loss": 0.3063,
"step": 1331
},
{
"epoch": 4.625,
"grad_norm": 0.07682233056778245,
"learning_rate": 1.3629669484372722e-06,
"loss": 0.3087,
"step": 1332
},
{
"epoch": 4.628472222222222,
"grad_norm": 0.07453451880293445,
"learning_rate": 1.3379846878422487e-06,
"loss": 0.3057,
"step": 1333
},
{
"epoch": 4.631944444444445,
"grad_norm": 0.07745996539898557,
"learning_rate": 1.313229609299338e-06,
"loss": 0.3044,
"step": 1334
},
{
"epoch": 4.635416666666667,
"grad_norm": 0.0780732404982061,
"learning_rate": 1.2887018582719634e-06,
"loss": 0.3037,
"step": 1335
},
{
"epoch": 4.638888888888889,
"grad_norm": 0.07389341458338661,
"learning_rate": 1.2644015788877684e-06,
"loss": 0.3011,
"step": 1336
},
{
"epoch": 4.642361111111111,
"grad_norm": 0.07456241935811461,
"learning_rate": 1.2403289139377317e-06,
"loss": 0.3035,
"step": 1337
},
{
"epoch": 4.645833333333333,
"grad_norm": 0.07562411943342495,
"learning_rate": 1.2164840048753602e-06,
"loss": 0.3069,
"step": 1338
},
{
"epoch": 4.649305555555555,
"grad_norm": 0.07804683083088992,
"learning_rate": 1.1928669918158309e-06,
"loss": 0.3061,
"step": 1339
},
{
"epoch": 4.652777777777778,
"grad_norm": 0.07338410128940559,
"learning_rate": 1.1694780135352013e-06,
"loss": 0.3019,
"step": 1340
},
{
"epoch": 4.65625,
"grad_norm": 0.07497239513325749,
"learning_rate": 1.1463172074695428e-06,
"loss": 0.3049,
"step": 1341
},
{
"epoch": 4.659722222222222,
"grad_norm": 0.07652107630464167,
"learning_rate": 1.1233847097141858e-06,
"loss": 0.3009,
"step": 1342
},
{
"epoch": 4.663194444444445,
"grad_norm": 0.07466572845911,
"learning_rate": 1.1006806550228855e-06,
"loss": 0.305,
"step": 1343
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.07949689532916121,
"learning_rate": 1.0782051768070477e-06,
"loss": 0.3106,
"step": 1344
},
{
"epoch": 4.670138888888889,
"grad_norm": 0.07716711613529656,
"learning_rate": 1.0559584071349405e-06,
"loss": 0.3067,
"step": 1345
},
{
"epoch": 4.673611111111111,
"grad_norm": 0.07714967482230316,
"learning_rate": 1.0339404767309014e-06,
"loss": 0.3033,
"step": 1346
},
{
"epoch": 4.677083333333333,
"grad_norm": 0.0747240286372511,
"learning_rate": 1.0121515149746108e-06,
"loss": 0.302,
"step": 1347
},
{
"epoch": 4.680555555555555,
"grad_norm": 0.07699672596500706,
"learning_rate": 9.905916499002787e-07,
"loss": 0.3075,
"step": 1348
},
{
"epoch": 4.684027777777778,
"grad_norm": 0.07462981706073524,
"learning_rate": 9.692610081959342e-07,
"loss": 0.3071,
"step": 1349
},
{
"epoch": 4.6875,
"grad_norm": 0.07440562255699622,
"learning_rate": 9.481597152026656e-07,
"loss": 0.3035,
"step": 1350
},
{
"epoch": 4.690972222222222,
"grad_norm": 0.0747750169854033,
"learning_rate": 9.272878949138798e-07,
"loss": 0.3026,
"step": 1351
},
{
"epoch": 4.694444444444445,
"grad_norm": 0.07517286545410393,
"learning_rate": 9.066456699745774e-07,
"loss": 0.2988,
"step": 1352
},
{
"epoch": 4.697916666666667,
"grad_norm": 0.07777257948685168,
"learning_rate": 8.862331616806385e-07,
"loss": 0.3025,
"step": 1353
},
{
"epoch": 4.701388888888889,
"grad_norm": 0.07679937737402819,
"learning_rate": 8.660504899780986e-07,
"loss": 0.3066,
"step": 1354
},
{
"epoch": 4.704861111111111,
"grad_norm": 0.07607956492556339,
"learning_rate": 8.460977734624509e-07,
"loss": 0.3035,
"step": 1355
},
{
"epoch": 4.708333333333333,
"grad_norm": 0.07534506879573015,
"learning_rate": 8.263751293779409e-07,
"loss": 0.3094,
"step": 1356
},
{
"epoch": 4.711805555555555,
"grad_norm": 0.07663473789080033,
"learning_rate": 8.068826736169e-07,
"loss": 0.3053,
"step": 1357
},
{
"epoch": 4.715277777777778,
"grad_norm": 0.07510530680390314,
"learning_rate": 7.876205207190391e-07,
"loss": 0.3092,
"step": 1358
},
{
"epoch": 4.71875,
"grad_norm": 0.07377768224328009,
"learning_rate": 7.685887838707828e-07,
"loss": 0.3031,
"step": 1359
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.07504818836415791,
"learning_rate": 7.497875749046124e-07,
"loss": 0.3069,
"step": 1360
},
{
"epoch": 4.725694444444445,
"grad_norm": 0.07299334086816671,
"learning_rate": 7.312170042984035e-07,
"loss": 0.3021,
"step": 1361
},
{
"epoch": 4.729166666666667,
"grad_norm": 0.0779726563674373,
"learning_rate": 7.128771811747737e-07,
"loss": 0.3079,
"step": 1362
},
{
"epoch": 4.732638888888889,
"grad_norm": 0.0744531952974455,
"learning_rate": 6.947682133004386e-07,
"loss": 0.3057,
"step": 1363
},
{
"epoch": 4.736111111111111,
"grad_norm": 0.07704018762229559,
"learning_rate": 6.768902070856031e-07,
"loss": 0.3067,
"step": 1364
},
{
"epoch": 4.739583333333333,
"grad_norm": 0.07639788284252715,
"learning_rate": 6.592432675832916e-07,
"loss": 0.3114,
"step": 1365
},
{
"epoch": 4.743055555555555,
"grad_norm": 0.07596548343087427,
"learning_rate": 6.418274984887741e-07,
"loss": 0.299,
"step": 1366
},
{
"epoch": 4.746527777777778,
"grad_norm": 0.07579121693711281,
"learning_rate": 6.24643002138936e-07,
"loss": 0.3096,
"step": 1367
},
{
"epoch": 4.75,
"grad_norm": 0.07585599254548926,
"learning_rate": 6.076898795116792e-07,
"loss": 0.306,
"step": 1368
},
{
"epoch": 4.753472222222222,
"grad_norm": 0.07564652114058751,
"learning_rate": 5.909682302253217e-07,
"loss": 0.3053,
"step": 1369
},
{
"epoch": 4.756944444444445,
"grad_norm": 0.07414673161087322,
"learning_rate": 5.744781525380339e-07,
"loss": 0.3077,
"step": 1370
},
{
"epoch": 4.760416666666667,
"grad_norm": 0.07612522562021062,
"learning_rate": 5.582197433472348e-07,
"loss": 0.3056,
"step": 1371
},
{
"epoch": 4.763888888888889,
"grad_norm": 0.07684508645065607,
"learning_rate": 5.421930981890455e-07,
"loss": 0.3037,
"step": 1372
},
{
"epoch": 4.767361111111111,
"grad_norm": 0.07299978275560645,
"learning_rate": 5.263983112377036e-07,
"loss": 0.3051,
"step": 1373
},
{
"epoch": 4.770833333333333,
"grad_norm": 0.07539354501312999,
"learning_rate": 5.108354753050381e-07,
"loss": 0.3066,
"step": 1374
},
{
"epoch": 4.774305555555555,
"grad_norm": 0.07144155759475616,
"learning_rate": 4.955046818398979e-07,
"loss": 0.3046,
"step": 1375
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.0750194771698725,
"learning_rate": 4.804060209276396e-07,
"loss": 0.3051,
"step": 1376
},
{
"epoch": 4.78125,
"grad_norm": 0.07386907645068111,
"learning_rate": 4.6553958128957355e-07,
"loss": 0.3051,
"step": 1377
},
{
"epoch": 4.784722222222222,
"grad_norm": 0.07300681819406184,
"learning_rate": 4.509054502824528e-07,
"loss": 0.3053,
"step": 1378
},
{
"epoch": 4.788194444444445,
"grad_norm": 0.07209552136082469,
"learning_rate": 4.365037138979622e-07,
"loss": 0.301,
"step": 1379
},
{
"epoch": 4.791666666666667,
"grad_norm": 0.07361568021097385,
"learning_rate": 4.223344567622212e-07,
"loss": 0.3016,
"step": 1380
},
{
"epoch": 4.795138888888889,
"grad_norm": 0.07498679679992676,
"learning_rate": 4.083977621352642e-07,
"loss": 0.3109,
"step": 1381
},
{
"epoch": 4.798611111111111,
"grad_norm": 0.07594221300650468,
"learning_rate": 3.946937119105654e-07,
"loss": 0.2995,
"step": 1382
},
{
"epoch": 4.802083333333333,
"grad_norm": 0.07394072454121102,
"learning_rate": 3.8122238661456814e-07,
"loss": 0.3024,
"step": 1383
},
{
"epoch": 4.805555555555555,
"grad_norm": 0.07353465846849089,
"learning_rate": 3.679838654061874e-07,
"loss": 0.3008,
"step": 1384
},
{
"epoch": 4.809027777777778,
"grad_norm": 0.0720259043424231,
"learning_rate": 3.5497822607636123e-07,
"loss": 0.3048,
"step": 1385
},
{
"epoch": 4.8125,
"grad_norm": 0.07294963832385294,
"learning_rate": 3.4220554504758475e-07,
"loss": 0.3084,
"step": 1386
},
{
"epoch": 4.815972222222222,
"grad_norm": 0.0726666221581321,
"learning_rate": 3.2966589737347457e-07,
"loss": 0.3086,
"step": 1387
},
{
"epoch": 4.819444444444445,
"grad_norm": 0.07483350159915376,
"learning_rate": 3.173593567383071e-07,
"loss": 0.3003,
"step": 1388
},
{
"epoch": 4.822916666666667,
"grad_norm": 0.07322136882905836,
"learning_rate": 3.0528599545661453e-07,
"loss": 0.3039,
"step": 1389
},
{
"epoch": 4.826388888888889,
"grad_norm": 0.07073641232974301,
"learning_rate": 2.9344588447272726e-07,
"loss": 0.3033,
"step": 1390
},
{
"epoch": 4.829861111111111,
"grad_norm": 0.07470320302872045,
"learning_rate": 2.818390933603743e-07,
"loss": 0.3005,
"step": 1391
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.07324094393948057,
"learning_rate": 2.704656903222791e-07,
"loss": 0.3056,
"step": 1392
},
{
"epoch": 4.836805555555555,
"grad_norm": 0.07274475401225772,
"learning_rate": 2.5932574218975104e-07,
"loss": 0.305,
"step": 1393
},
{
"epoch": 4.840277777777778,
"grad_norm": 0.07473214948879901,
"learning_rate": 2.484193144222946e-07,
"loss": 0.3076,
"step": 1394
},
{
"epoch": 4.84375,
"grad_norm": 0.07221235816804243,
"learning_rate": 2.3774647110721415e-07,
"loss": 0.3036,
"step": 1395
},
{
"epoch": 4.847222222222222,
"grad_norm": 0.07367780540336007,
"learning_rate": 2.273072749592631e-07,
"loss": 0.3045,
"step": 1396
},
{
"epoch": 4.850694444444445,
"grad_norm": 0.07325375453284966,
"learning_rate": 2.1710178732024413e-07,
"loss": 0.3049,
"step": 1397
},
{
"epoch": 4.854166666666667,
"grad_norm": 0.07659484091306909,
"learning_rate": 2.0713006815868075e-07,
"loss": 0.3117,
"step": 1398
},
{
"epoch": 4.857638888888889,
"grad_norm": 0.07535899223039774,
"learning_rate": 1.973921760694264e-07,
"loss": 0.3051,
"step": 1399
},
{
"epoch": 4.861111111111111,
"grad_norm": 0.07300398348215961,
"learning_rate": 1.8788816827336686e-07,
"loss": 0.3056,
"step": 1400
},
{
"epoch": 4.864583333333333,
"grad_norm": 0.07316979292661353,
"learning_rate": 1.7861810061704287e-07,
"loss": 0.3057,
"step": 1401
},
{
"epoch": 4.868055555555555,
"grad_norm": 0.07378740384961463,
"learning_rate": 1.6958202757234366e-07,
"loss": 0.3049,
"step": 1402
},
{
"epoch": 4.871527777777778,
"grad_norm": 0.07181291686698073,
"learning_rate": 1.6078000223618272e-07,
"loss": 0.3067,
"step": 1403
},
{
"epoch": 4.875,
"grad_norm": 0.07295819865239954,
"learning_rate": 1.522120763301782e-07,
"loss": 0.3074,
"step": 1404
},
{
"epoch": 4.878472222222222,
"grad_norm": 0.07232277152642255,
"learning_rate": 1.438783002003641e-07,
"loss": 0.3023,
"step": 1405
},
{
"epoch": 4.881944444444445,
"grad_norm": 0.07234458208768678,
"learning_rate": 1.3577872281688388e-07,
"loss": 0.3082,
"step": 1406
},
{
"epoch": 4.885416666666667,
"grad_norm": 0.07276557670020045,
"learning_rate": 1.2791339177369745e-07,
"loss": 0.2999,
"step": 1407
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.0744224740118525,
"learning_rate": 1.2028235328831906e-07,
"loss": 0.3079,
"step": 1408
},
{
"epoch": 4.892361111111111,
"grad_norm": 0.0724196144673555,
"learning_rate": 1.1288565220152426e-07,
"loss": 0.3043,
"step": 1409
},
{
"epoch": 4.895833333333333,
"grad_norm": 0.07355374246045054,
"learning_rate": 1.0572333197711005e-07,
"loss": 0.3028,
"step": 1410
},
{
"epoch": 4.899305555555555,
"grad_norm": 0.07251943734879686,
"learning_rate": 9.879543470161512e-08,
"loss": 0.3015,
"step": 1411
},
{
"epoch": 4.902777777777778,
"grad_norm": 0.0739982426020259,
"learning_rate": 9.21020010840934e-08,
"loss": 0.3049,
"step": 1412
},
{
"epoch": 4.90625,
"grad_norm": 0.07307140402702485,
"learning_rate": 8.564307045586085e-08,
"loss": 0.308,
"step": 1413
},
{
"epoch": 4.909722222222222,
"grad_norm": 0.07273314483178361,
"learning_rate": 7.941868077026905e-08,
"loss": 0.3013,
"step": 1414
},
{
"epoch": 4.913194444444445,
"grad_norm": 0.07179264299655526,
"learning_rate": 7.34288686024831e-08,
"loss": 0.3041,
"step": 1415
},
{
"epoch": 4.916666666666667,
"grad_norm": 0.07399654735570108,
"learning_rate": 6.767366914927298e-08,
"loss": 0.3065,
"step": 1416
},
{
"epoch": 4.920138888888889,
"grad_norm": 0.0728246842339243,
"learning_rate": 6.215311622878695e-08,
"loss": 0.307,
"step": 1417
},
{
"epoch": 4.923611111111111,
"grad_norm": 0.07402560027995006,
"learning_rate": 5.6867242280373994e-08,
"loss": 0.3103,
"step": 1418
},
{
"epoch": 4.927083333333333,
"grad_norm": 0.07383259587315397,
"learning_rate": 5.1816078364383956e-08,
"loss": 0.306,
"step": 1419
},
{
"epoch": 4.930555555555555,
"grad_norm": 0.07298007618541999,
"learning_rate": 4.699965416198549e-08,
"loss": 0.3054,
"step": 1420
},
{
"epoch": 4.934027777777778,
"grad_norm": 0.07188192284714194,
"learning_rate": 4.241799797498836e-08,
"loss": 0.3008,
"step": 1421
},
{
"epoch": 4.9375,
"grad_norm": 0.07255291024636266,
"learning_rate": 3.8071136725688074e-08,
"loss": 0.3054,
"step": 1422
},
{
"epoch": 4.940972222222222,
"grad_norm": 0.07306219408514232,
"learning_rate": 3.3959095956697106e-08,
"loss": 0.3033,
"step": 1423
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.07293410022631318,
"learning_rate": 3.0081899830798345e-08,
"loss": 0.3042,
"step": 1424
},
{
"epoch": 4.947916666666667,
"grad_norm": 0.07153901090491563,
"learning_rate": 2.6439571130798536e-08,
"loss": 0.3071,
"step": 1425
},
{
"epoch": 4.951388888888889,
"grad_norm": 0.07296799735265355,
"learning_rate": 2.3032131259403955e-08,
"loss": 0.2973,
"step": 1426
},
{
"epoch": 4.954861111111111,
"grad_norm": 0.07029056717086786,
"learning_rate": 1.9859600239087175e-08,
"loss": 0.3035,
"step": 1427
},
{
"epoch": 4.958333333333333,
"grad_norm": 0.07458737702442902,
"learning_rate": 1.6921996711976028e-08,
"loss": 0.3096,
"step": 1428
},
{
"epoch": 4.961805555555555,
"grad_norm": 0.07206677082554877,
"learning_rate": 1.4219337939738175e-08,
"loss": 0.306,
"step": 1429
},
{
"epoch": 4.965277777777778,
"grad_norm": 0.07170156509697462,
"learning_rate": 1.175163980347005e-08,
"loss": 0.3027,
"step": 1430
},
{
"epoch": 4.96875,
"grad_norm": 0.07216683994988987,
"learning_rate": 9.518916803634703e-09,
"loss": 0.2986,
"step": 1431
},
{
"epoch": 4.972222222222222,
"grad_norm": 0.07279226964622872,
"learning_rate": 7.521182059946342e-09,
"loss": 0.3057,
"step": 1432
},
{
"epoch": 4.975694444444445,
"grad_norm": 0.07392442432994105,
"learning_rate": 5.758447311294823e-09,
"loss": 0.3071,
"step": 1433
},
{
"epoch": 4.979166666666667,
"grad_norm": 0.07187539149864376,
"learning_rate": 4.230722915701257e-09,
"loss": 0.3029,
"step": 1434
},
{
"epoch": 4.982638888888889,
"grad_norm": 0.07059128687591827,
"learning_rate": 2.93801785022918e-09,
"loss": 0.305,
"step": 1435
},
{
"epoch": 4.986111111111111,
"grad_norm": 0.07343485822486388,
"learning_rate": 1.8803397109534715e-09,
"loss": 0.3038,
"step": 1436
},
{
"epoch": 4.989583333333333,
"grad_norm": 0.07147607345386381,
"learning_rate": 1.057694712902624e-09,
"loss": 0.3063,
"step": 1437
},
{
"epoch": 4.993055555555555,
"grad_norm": 0.07245299095155071,
"learning_rate": 4.700876900187723e-10,
"loss": 0.3022,
"step": 1438
},
{
"epoch": 4.996527777777778,
"grad_norm": 0.07367615840163165,
"learning_rate": 1.175220951488143e-10,
"loss": 0.3066,
"step": 1439
},
{
"epoch": 5.0,
"grad_norm": 0.08989122624390881,
"learning_rate": 0.0,
"loss": 0.298,
"step": 1440
},
{
"epoch": 5.0,
"step": 1440,
"total_flos": 2.415764485177344e+16,
"train_loss": 0.3780418654489848,
"train_runtime": 19797.8662,
"train_samples_per_second": 37.137,
"train_steps_per_second": 0.073
}
],
"logging_steps": 1,
"max_steps": 1440,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.415764485177344e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}