{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996219996219996,
"eval_steps": 500,
"global_step": 1763,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000567000567000567,
"grad_norm": 4.704992771148682,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.9183,
"step": 1
},
{
"epoch": 0.001134001134001134,
"grad_norm": 4.873214244842529,
"learning_rate": 1.0000000000000001e-07,
"loss": 1.9567,
"step": 2
},
{
"epoch": 0.001701001701001701,
"grad_norm": 4.890101432800293,
"learning_rate": 1.5000000000000002e-07,
"loss": 1.8994,
"step": 3
},
{
"epoch": 0.002268002268002268,
"grad_norm": 4.563302516937256,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.8513,
"step": 4
},
{
"epoch": 0.002835002835002835,
"grad_norm": 4.943462371826172,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.8998,
"step": 5
},
{
"epoch": 0.003402003402003402,
"grad_norm": 5.069730281829834,
"learning_rate": 3.0000000000000004e-07,
"loss": 1.9748,
"step": 6
},
{
"epoch": 0.003969003969003969,
"grad_norm": 5.16794490814209,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.0669,
"step": 7
},
{
"epoch": 0.004536004536004536,
"grad_norm": 4.572751998901367,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.9074,
"step": 8
},
{
"epoch": 0.005103005103005103,
"grad_norm": 4.676807403564453,
"learning_rate": 4.5000000000000003e-07,
"loss": 1.9424,
"step": 9
},
{
"epoch": 0.00567000567000567,
"grad_norm": 4.998410701751709,
"learning_rate": 5.000000000000001e-07,
"loss": 1.9398,
"step": 10
},
{
"epoch": 0.006237006237006237,
"grad_norm": 4.833102703094482,
"learning_rate": 5.5e-07,
"loss": 1.9526,
"step": 11
},
{
"epoch": 0.006804006804006804,
"grad_norm": 4.7410078048706055,
"learning_rate": 6.000000000000001e-07,
"loss": 1.9341,
"step": 12
},
{
"epoch": 0.007371007371007371,
"grad_norm": 4.841571807861328,
"learning_rate": 6.5e-07,
"loss": 1.8506,
"step": 13
},
{
"epoch": 0.007938007938007937,
"grad_norm": 4.75044584274292,
"learning_rate": 7.000000000000001e-07,
"loss": 1.9124,
"step": 14
},
{
"epoch": 0.008505008505008505,
"grad_norm": 5.093398094177246,
"learning_rate": 7.5e-07,
"loss": 1.8485,
"step": 15
},
{
"epoch": 0.009072009072009071,
"grad_norm": 4.743251800537109,
"learning_rate": 8.000000000000001e-07,
"loss": 1.8827,
"step": 16
},
{
"epoch": 0.009639009639009639,
"grad_norm": 4.599445343017578,
"learning_rate": 8.500000000000001e-07,
"loss": 1.9064,
"step": 17
},
{
"epoch": 0.010206010206010205,
"grad_norm": 4.685406684875488,
"learning_rate": 9.000000000000001e-07,
"loss": 1.8437,
"step": 18
},
{
"epoch": 0.010773010773010773,
"grad_norm": 5.116965293884277,
"learning_rate": 9.500000000000001e-07,
"loss": 1.8478,
"step": 19
},
{
"epoch": 0.01134001134001134,
"grad_norm": 4.974440574645996,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.9602,
"step": 20
},
{
"epoch": 0.011907011907011907,
"grad_norm": 4.430954933166504,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.814,
"step": 21
},
{
"epoch": 0.012474012474012475,
"grad_norm": 4.7586164474487305,
"learning_rate": 1.1e-06,
"loss": 1.8719,
"step": 22
},
{
"epoch": 0.01304101304101304,
"grad_norm": 4.204355716705322,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.8459,
"step": 23
},
{
"epoch": 0.013608013608013609,
"grad_norm": 4.102180004119873,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.9011,
"step": 24
},
{
"epoch": 0.014175014175014175,
"grad_norm": 3.9540836811065674,
"learning_rate": 1.25e-06,
"loss": 1.7151,
"step": 25
},
{
"epoch": 0.014742014742014743,
"grad_norm": 3.724958658218384,
"learning_rate": 1.3e-06,
"loss": 1.8811,
"step": 26
},
{
"epoch": 0.015309015309015309,
"grad_norm": 3.6260979175567627,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.7493,
"step": 27
},
{
"epoch": 0.015876015876015875,
"grad_norm": 3.549999713897705,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.8808,
"step": 28
},
{
"epoch": 0.016443016443016444,
"grad_norm": 3.366023063659668,
"learning_rate": 1.45e-06,
"loss": 1.6787,
"step": 29
},
{
"epoch": 0.01701001701001701,
"grad_norm": 3.0748300552368164,
"learning_rate": 1.5e-06,
"loss": 1.6742,
"step": 30
},
{
"epoch": 0.017577017577017576,
"grad_norm": 3.056105852127075,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.6747,
"step": 31
},
{
"epoch": 0.018144018144018143,
"grad_norm": 3.277554512023926,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.6949,
"step": 32
},
{
"epoch": 0.018711018711018712,
"grad_norm": 3.0011744499206543,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.681,
"step": 33
},
{
"epoch": 0.019278019278019278,
"grad_norm": 3.01111102104187,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.7185,
"step": 34
},
{
"epoch": 0.019845019845019844,
"grad_norm": 2.8633668422698975,
"learning_rate": 1.75e-06,
"loss": 1.6156,
"step": 35
},
{
"epoch": 0.02041202041202041,
"grad_norm": 2.5703542232513428,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.6805,
"step": 36
},
{
"epoch": 0.02097902097902098,
"grad_norm": 2.5187172889709473,
"learning_rate": 1.85e-06,
"loss": 1.6083,
"step": 37
},
{
"epoch": 0.021546021546021546,
"grad_norm": 2.7661757469177246,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.6907,
"step": 38
},
{
"epoch": 0.022113022113022112,
"grad_norm": 3.207343101501465,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.5878,
"step": 39
},
{
"epoch": 0.02268002268002268,
"grad_norm": 2.7971088886260986,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5075,
"step": 40
},
{
"epoch": 0.023247023247023248,
"grad_norm": 2.50616192817688,
"learning_rate": 2.05e-06,
"loss": 1.6045,
"step": 41
},
{
"epoch": 0.023814023814023814,
"grad_norm": 2.8198935985565186,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.6205,
"step": 42
},
{
"epoch": 0.02438102438102438,
"grad_norm": 2.591521739959717,
"learning_rate": 2.15e-06,
"loss": 1.5531,
"step": 43
},
{
"epoch": 0.02494802494802495,
"grad_norm": 2.4420039653778076,
"learning_rate": 2.2e-06,
"loss": 1.5229,
"step": 44
},
{
"epoch": 0.025515025515025515,
"grad_norm": 2.6265766620635986,
"learning_rate": 2.25e-06,
"loss": 1.4936,
"step": 45
},
{
"epoch": 0.02608202608202608,
"grad_norm": 2.300294876098633,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.5293,
"step": 46
},
{
"epoch": 0.026649026649026648,
"grad_norm": 2.3751509189605713,
"learning_rate": 2.35e-06,
"loss": 1.5413,
"step": 47
},
{
"epoch": 0.027216027216027217,
"grad_norm": 2.4751758575439453,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.5172,
"step": 48
},
{
"epoch": 0.027783027783027783,
"grad_norm": 2.3078811168670654,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.5058,
"step": 49
},
{
"epoch": 0.02835002835002835,
"grad_norm": 2.3470571041107178,
"learning_rate": 2.5e-06,
"loss": 1.5087,
"step": 50
},
{
"epoch": 0.028917028917028915,
"grad_norm": 2.2181596755981445,
"learning_rate": 2.55e-06,
"loss": 1.5913,
"step": 51
},
{
"epoch": 0.029484029484029485,
"grad_norm": 2.228245735168457,
"learning_rate": 2.6e-06,
"loss": 1.491,
"step": 52
},
{
"epoch": 0.03005103005103005,
"grad_norm": 2.0523054599761963,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.3369,
"step": 53
},
{
"epoch": 0.030618030618030617,
"grad_norm": 2.1131646633148193,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.4824,
"step": 54
},
{
"epoch": 0.031185031185031187,
"grad_norm": 2.1240482330322266,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.4588,
"step": 55
},
{
"epoch": 0.03175203175203175,
"grad_norm": 2.329206705093384,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.4934,
"step": 56
},
{
"epoch": 0.03231903231903232,
"grad_norm": 2.2496628761291504,
"learning_rate": 2.85e-06,
"loss": 1.4863,
"step": 57
},
{
"epoch": 0.03288603288603289,
"grad_norm": 2.1327602863311768,
"learning_rate": 2.9e-06,
"loss": 1.493,
"step": 58
},
{
"epoch": 0.03345303345303345,
"grad_norm": 2.0308682918548584,
"learning_rate": 2.95e-06,
"loss": 1.4056,
"step": 59
},
{
"epoch": 0.03402003402003402,
"grad_norm": 2.0320749282836914,
"learning_rate": 3e-06,
"loss": 1.4364,
"step": 60
},
{
"epoch": 0.03458703458703459,
"grad_norm": 2.419875144958496,
"learning_rate": 3.05e-06,
"loss": 1.4899,
"step": 61
},
{
"epoch": 0.03515403515403515,
"grad_norm": 2.623107433319092,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3701,
"step": 62
},
{
"epoch": 0.03572103572103572,
"grad_norm": 2.1976675987243652,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.3846,
"step": 63
},
{
"epoch": 0.036288036288036285,
"grad_norm": 2.262049436569214,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.4136,
"step": 64
},
{
"epoch": 0.036855036855036855,
"grad_norm": 2.027294397354126,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.4009,
"step": 65
},
{
"epoch": 0.037422037422037424,
"grad_norm": 2.174931526184082,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.4644,
"step": 66
},
{
"epoch": 0.03798903798903799,
"grad_norm": 2.3164167404174805,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.3916,
"step": 67
},
{
"epoch": 0.038556038556038556,
"grad_norm": 2.1951541900634766,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.3923,
"step": 68
},
{
"epoch": 0.039123039123039126,
"grad_norm": 2.751126766204834,
"learning_rate": 3.45e-06,
"loss": 1.445,
"step": 69
},
{
"epoch": 0.03969003969003969,
"grad_norm": 2.052192449569702,
"learning_rate": 3.5e-06,
"loss": 1.3873,
"step": 70
},
{
"epoch": 0.04025704025704026,
"grad_norm": 1.9943838119506836,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.4527,
"step": 71
},
{
"epoch": 0.04082404082404082,
"grad_norm": 2.073538303375244,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.4175,
"step": 72
},
{
"epoch": 0.04139104139104139,
"grad_norm": 2.014153480529785,
"learning_rate": 3.65e-06,
"loss": 1.4308,
"step": 73
},
{
"epoch": 0.04195804195804196,
"grad_norm": 2.140015125274658,
"learning_rate": 3.7e-06,
"loss": 1.3748,
"step": 74
},
{
"epoch": 0.04252504252504252,
"grad_norm": 2.0118002891540527,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3709,
"step": 75
},
{
"epoch": 0.04309204309204309,
"grad_norm": 2.0641977787017822,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3158,
"step": 76
},
{
"epoch": 0.04365904365904366,
"grad_norm": 2.119843006134033,
"learning_rate": 3.85e-06,
"loss": 1.4224,
"step": 77
},
{
"epoch": 0.044226044226044224,
"grad_norm": 2.0117244720458984,
"learning_rate": 3.900000000000001e-06,
"loss": 1.2549,
"step": 78
},
{
"epoch": 0.044793044793044794,
"grad_norm": 2.001262664794922,
"learning_rate": 3.95e-06,
"loss": 1.4035,
"step": 79
},
{
"epoch": 0.04536004536004536,
"grad_norm": 2.128577470779419,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3866,
"step": 80
},
{
"epoch": 0.045927045927045926,
"grad_norm": 2.002300977706909,
"learning_rate": 4.05e-06,
"loss": 1.3133,
"step": 81
},
{
"epoch": 0.046494046494046495,
"grad_norm": 2.065413475036621,
"learning_rate": 4.1e-06,
"loss": 1.37,
"step": 82
},
{
"epoch": 0.04706104706104706,
"grad_norm": 2.11797833442688,
"learning_rate": 4.15e-06,
"loss": 1.4127,
"step": 83
},
{
"epoch": 0.04762804762804763,
"grad_norm": 2.0376827716827393,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3478,
"step": 84
},
{
"epoch": 0.0481950481950482,
"grad_norm": 2.141932964324951,
"learning_rate": 4.25e-06,
"loss": 1.4469,
"step": 85
},
{
"epoch": 0.04876204876204876,
"grad_norm": 2.0376832485198975,
"learning_rate": 4.3e-06,
"loss": 1.3035,
"step": 86
},
{
"epoch": 0.04932904932904933,
"grad_norm": 1.9415974617004395,
"learning_rate": 4.350000000000001e-06,
"loss": 1.3057,
"step": 87
},
{
"epoch": 0.0498960498960499,
"grad_norm": 1.972311019897461,
"learning_rate": 4.4e-06,
"loss": 1.3494,
"step": 88
},
{
"epoch": 0.05046305046305046,
"grad_norm": 2.0214779376983643,
"learning_rate": 4.450000000000001e-06,
"loss": 1.348,
"step": 89
},
{
"epoch": 0.05103005103005103,
"grad_norm": 1.9641141891479492,
"learning_rate": 4.5e-06,
"loss": 1.3264,
"step": 90
},
{
"epoch": 0.051597051597051594,
"grad_norm": 2.1060950756073,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.3606,
"step": 91
},
{
"epoch": 0.05216405216405216,
"grad_norm": 2.0103988647460938,
"learning_rate": 4.600000000000001e-06,
"loss": 1.3464,
"step": 92
},
{
"epoch": 0.05273105273105273,
"grad_norm": 1.9835673570632935,
"learning_rate": 4.65e-06,
"loss": 1.353,
"step": 93
},
{
"epoch": 0.053298053298053295,
"grad_norm": 2.0680184364318848,
"learning_rate": 4.7e-06,
"loss": 1.4022,
"step": 94
},
{
"epoch": 0.053865053865053865,
"grad_norm": 2.5879733562469482,
"learning_rate": 4.75e-06,
"loss": 1.3186,
"step": 95
},
{
"epoch": 0.054432054432054434,
"grad_norm": 2.620063066482544,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3101,
"step": 96
},
{
"epoch": 0.054999054999055,
"grad_norm": 2.196293592453003,
"learning_rate": 4.85e-06,
"loss": 1.2989,
"step": 97
},
{
"epoch": 0.05556605556605557,
"grad_norm": 2.123204231262207,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.3334,
"step": 98
},
{
"epoch": 0.056133056133056136,
"grad_norm": 2.1718428134918213,
"learning_rate": 4.95e-06,
"loss": 1.3703,
"step": 99
},
{
"epoch": 0.0567000567000567,
"grad_norm": 2.10593843460083,
"learning_rate": 5e-06,
"loss": 1.3411,
"step": 100
},
{
"epoch": 0.05726705726705727,
"grad_norm": 1.9898821115493774,
"learning_rate": 4.999999887629331e-06,
"loss": 1.3076,
"step": 101
},
{
"epoch": 0.05783405783405783,
"grad_norm": 2.1761505603790283,
"learning_rate": 4.999999550517334e-06,
"loss": 1.3909,
"step": 102
},
{
"epoch": 0.0584010584010584,
"grad_norm": 1.9373730421066284,
"learning_rate": 4.999998988664039e-06,
"loss": 1.2573,
"step": 103
},
{
"epoch": 0.05896805896805897,
"grad_norm": 2.208836078643799,
"learning_rate": 4.999998202069496e-06,
"loss": 1.3415,
"step": 104
},
{
"epoch": 0.05953505953505953,
"grad_norm": 2.1279194355010986,
"learning_rate": 4.999997190733778e-06,
"loss": 1.3049,
"step": 105
},
{
"epoch": 0.0601020601020601,
"grad_norm": 2.0629360675811768,
"learning_rate": 4.999995954656972e-06,
"loss": 1.289,
"step": 106
},
{
"epoch": 0.06066906066906067,
"grad_norm": 2.208876132965088,
"learning_rate": 4.9999944938391935e-06,
"loss": 1.3609,
"step": 107
},
{
"epoch": 0.061236061236061234,
"grad_norm": 2.086843252182007,
"learning_rate": 4.99999280828057e-06,
"loss": 1.3733,
"step": 108
},
{
"epoch": 0.061803061803061804,
"grad_norm": 2.007978677749634,
"learning_rate": 4.999990897981256e-06,
"loss": 1.2892,
"step": 109
},
{
"epoch": 0.062370062370062374,
"grad_norm": 2.0387015342712402,
"learning_rate": 4.999988762941422e-06,
"loss": 1.3037,
"step": 110
},
{
"epoch": 0.06293706293706294,
"grad_norm": 2.8154208660125732,
"learning_rate": 4.99998640316126e-06,
"loss": 1.2534,
"step": 111
},
{
"epoch": 0.0635040635040635,
"grad_norm": 1.996565818786621,
"learning_rate": 4.999983818640981e-06,
"loss": 1.2697,
"step": 112
},
{
"epoch": 0.06407106407106407,
"grad_norm": 1.850510835647583,
"learning_rate": 4.99998100938082e-06,
"loss": 1.295,
"step": 113
},
{
"epoch": 0.06463806463806464,
"grad_norm": 1.994425654411316,
"learning_rate": 4.999977975381027e-06,
"loss": 1.3082,
"step": 114
},
{
"epoch": 0.06520506520506521,
"grad_norm": 2.0517311096191406,
"learning_rate": 4.999974716641875e-06,
"loss": 1.2921,
"step": 115
},
{
"epoch": 0.06577206577206578,
"grad_norm": 2.0769684314727783,
"learning_rate": 4.999971233163658e-06,
"loss": 1.3186,
"step": 116
},
{
"epoch": 0.06633906633906633,
"grad_norm": 2.0265793800354004,
"learning_rate": 4.99996752494669e-06,
"loss": 1.3795,
"step": 117
},
{
"epoch": 0.0669060669060669,
"grad_norm": 1.9943898916244507,
"learning_rate": 4.999963591991302e-06,
"loss": 1.3286,
"step": 118
},
{
"epoch": 0.06747306747306747,
"grad_norm": 2.2208609580993652,
"learning_rate": 4.999959434297849e-06,
"loss": 1.2839,
"step": 119
},
{
"epoch": 0.06804006804006804,
"grad_norm": 2.0283749103546143,
"learning_rate": 4.9999550518667045e-06,
"loss": 1.3112,
"step": 120
},
{
"epoch": 0.06860706860706861,
"grad_norm": 2.0714364051818848,
"learning_rate": 4.999950444698262e-06,
"loss": 1.3426,
"step": 121
},
{
"epoch": 0.06917406917406918,
"grad_norm": 1.9883396625518799,
"learning_rate": 4.999945612792937e-06,
"loss": 1.321,
"step": 122
},
{
"epoch": 0.06974106974106974,
"grad_norm": 2.229637622833252,
"learning_rate": 4.999940556151163e-06,
"loss": 1.3168,
"step": 123
},
{
"epoch": 0.0703080703080703,
"grad_norm": 2.144243001937866,
"learning_rate": 4.999935274773394e-06,
"loss": 1.2621,
"step": 124
},
{
"epoch": 0.07087507087507088,
"grad_norm": 2.2905640602111816,
"learning_rate": 4.999929768660105e-06,
"loss": 1.3178,
"step": 125
},
{
"epoch": 0.07144207144207144,
"grad_norm": 2.149752140045166,
"learning_rate": 4.999924037811792e-06,
"loss": 1.3038,
"step": 126
},
{
"epoch": 0.07200907200907201,
"grad_norm": 2.0315706729888916,
"learning_rate": 4.999918082228969e-06,
"loss": 1.2576,
"step": 127
},
{
"epoch": 0.07257607257607257,
"grad_norm": 2.168241262435913,
"learning_rate": 4.999911901912172e-06,
"loss": 1.2738,
"step": 128
},
{
"epoch": 0.07314307314307314,
"grad_norm": 1.9652851819992065,
"learning_rate": 4.999905496861957e-06,
"loss": 1.2494,
"step": 129
},
{
"epoch": 0.07371007371007371,
"grad_norm": 2.2343976497650146,
"learning_rate": 4.999898867078898e-06,
"loss": 1.336,
"step": 130
},
{
"epoch": 0.07427707427707428,
"grad_norm": 2.411104202270508,
"learning_rate": 4.999892012563593e-06,
"loss": 1.2526,
"step": 131
},
{
"epoch": 0.07484407484407485,
"grad_norm": 2.051623582839966,
"learning_rate": 4.999884933316658e-06,
"loss": 1.3189,
"step": 132
},
{
"epoch": 0.07541107541107542,
"grad_norm": 2.2318692207336426,
"learning_rate": 4.999877629338729e-06,
"loss": 1.3392,
"step": 133
},
{
"epoch": 0.07597807597807597,
"grad_norm": 2.2030036449432373,
"learning_rate": 4.999870100630462e-06,
"loss": 1.2661,
"step": 134
},
{
"epoch": 0.07654507654507654,
"grad_norm": 2.2581799030303955,
"learning_rate": 4.999862347192533e-06,
"loss": 1.3374,
"step": 135
},
{
"epoch": 0.07711207711207711,
"grad_norm": 2.343352794647217,
"learning_rate": 4.9998543690256415e-06,
"loss": 1.2771,
"step": 136
},
{
"epoch": 0.07767907767907768,
"grad_norm": 2.143404483795166,
"learning_rate": 4.999846166130503e-06,
"loss": 1.2934,
"step": 137
},
{
"epoch": 0.07824607824607825,
"grad_norm": 2.125469923019409,
"learning_rate": 4.999837738507856e-06,
"loss": 1.2593,
"step": 138
},
{
"epoch": 0.07881307881307881,
"grad_norm": 1.9987525939941406,
"learning_rate": 4.999829086158458e-06,
"loss": 1.2719,
"step": 139
},
{
"epoch": 0.07938007938007938,
"grad_norm": 2.0173356533050537,
"learning_rate": 4.999820209083085e-06,
"loss": 1.2888,
"step": 140
},
{
"epoch": 0.07994707994707995,
"grad_norm": 2.903157949447632,
"learning_rate": 4.999811107282537e-06,
"loss": 1.2915,
"step": 141
},
{
"epoch": 0.08051408051408052,
"grad_norm": 2.426927328109741,
"learning_rate": 4.999801780757631e-06,
"loss": 1.3207,
"step": 142
},
{
"epoch": 0.08108108108108109,
"grad_norm": 2.19057035446167,
"learning_rate": 4.999792229509207e-06,
"loss": 1.2671,
"step": 143
},
{
"epoch": 0.08164808164808164,
"grad_norm": 1.889320731163025,
"learning_rate": 4.9997824535381215e-06,
"loss": 1.2258,
"step": 144
},
{
"epoch": 0.08221508221508221,
"grad_norm": 2.0674948692321777,
"learning_rate": 4.9997724528452554e-06,
"loss": 1.2624,
"step": 145
},
{
"epoch": 0.08278208278208278,
"grad_norm": 2.3926374912261963,
"learning_rate": 4.999762227431506e-06,
"loss": 1.2322,
"step": 146
},
{
"epoch": 0.08334908334908335,
"grad_norm": 2.16817045211792,
"learning_rate": 4.999751777297794e-06,
"loss": 1.2871,
"step": 147
},
{
"epoch": 0.08391608391608392,
"grad_norm": 2.0507423877716064,
"learning_rate": 4.999741102445057e-06,
"loss": 1.2344,
"step": 148
},
{
"epoch": 0.08448308448308449,
"grad_norm": 2.096215009689331,
"learning_rate": 4.999730202874256e-06,
"loss": 1.2724,
"step": 149
},
{
"epoch": 0.08505008505008504,
"grad_norm": 2.091418504714966,
"learning_rate": 4.999719078586372e-06,
"loss": 1.2381,
"step": 150
},
{
"epoch": 0.08561708561708561,
"grad_norm": 2.1694529056549072,
"learning_rate": 4.999707729582402e-06,
"loss": 1.298,
"step": 151
},
{
"epoch": 0.08618408618408618,
"grad_norm": 2.03376841545105,
"learning_rate": 4.999696155863369e-06,
"loss": 1.2768,
"step": 152
},
{
"epoch": 0.08675108675108675,
"grad_norm": 2.1496047973632812,
"learning_rate": 4.999684357430312e-06,
"loss": 1.2704,
"step": 153
},
{
"epoch": 0.08731808731808732,
"grad_norm": 2.192505359649658,
"learning_rate": 4.999672334284292e-06,
"loss": 1.2537,
"step": 154
},
{
"epoch": 0.08788508788508788,
"grad_norm": 2.290090560913086,
"learning_rate": 4.999660086426389e-06,
"loss": 1.258,
"step": 155
},
{
"epoch": 0.08845208845208845,
"grad_norm": 2.2584447860717773,
"learning_rate": 4.999647613857706e-06,
"loss": 1.2762,
"step": 156
},
{
"epoch": 0.08901908901908902,
"grad_norm": 2.1247639656066895,
"learning_rate": 4.999634916579362e-06,
"loss": 1.2899,
"step": 157
},
{
"epoch": 0.08958608958608959,
"grad_norm": 2.1827831268310547,
"learning_rate": 4.9996219945925e-06,
"loss": 1.2723,
"step": 158
},
{
"epoch": 0.09015309015309016,
"grad_norm": 2.2694759368896484,
"learning_rate": 4.999608847898281e-06,
"loss": 1.3093,
"step": 159
},
{
"epoch": 0.09072009072009073,
"grad_norm": 2.494340658187866,
"learning_rate": 4.9995954764978865e-06,
"loss": 1.3717,
"step": 160
},
{
"epoch": 0.09128709128709128,
"grad_norm": 2.2255382537841797,
"learning_rate": 4.99958188039252e-06,
"loss": 1.2386,
"step": 161
},
{
"epoch": 0.09185409185409185,
"grad_norm": 2.237102746963501,
"learning_rate": 4.999568059583401e-06,
"loss": 1.285,
"step": 162
},
{
"epoch": 0.09242109242109242,
"grad_norm": 2.3099398612976074,
"learning_rate": 4.999554014071776e-06,
"loss": 1.2403,
"step": 163
},
{
"epoch": 0.09298809298809299,
"grad_norm": 2.0923874378204346,
"learning_rate": 4.999539743858904e-06,
"loss": 1.3747,
"step": 164
},
{
"epoch": 0.09355509355509356,
"grad_norm": 2.0679986476898193,
"learning_rate": 4.99952524894607e-06,
"loss": 1.1877,
"step": 165
},
{
"epoch": 0.09412209412209412,
"grad_norm": 2.03501296043396,
"learning_rate": 4.999510529334575e-06,
"loss": 1.2692,
"step": 166
},
{
"epoch": 0.09468909468909469,
"grad_norm": 2.1350958347320557,
"learning_rate": 4.9994955850257444e-06,
"loss": 1.292,
"step": 167
},
{
"epoch": 0.09525609525609526,
"grad_norm": 2.3618247509002686,
"learning_rate": 4.999480416020921e-06,
"loss": 1.255,
"step": 168
},
{
"epoch": 0.09582309582309582,
"grad_norm": 2.1679601669311523,
"learning_rate": 4.9994650223214665e-06,
"loss": 1.2706,
"step": 169
},
{
"epoch": 0.0963900963900964,
"grad_norm": 2.4074959754943848,
"learning_rate": 4.999449403928768e-06,
"loss": 1.2783,
"step": 170
},
{
"epoch": 0.09695709695709696,
"grad_norm": 2.200929880142212,
"learning_rate": 4.999433560844228e-06,
"loss": 1.2846,
"step": 171
},
{
"epoch": 0.09752409752409752,
"grad_norm": 2.061795949935913,
"learning_rate": 4.999417493069269e-06,
"loss": 1.2984,
"step": 172
},
{
"epoch": 0.09809109809109809,
"grad_norm": 2.0811519622802734,
"learning_rate": 4.99940120060534e-06,
"loss": 1.2096,
"step": 173
},
{
"epoch": 0.09865809865809866,
"grad_norm": 2.2621266841888428,
"learning_rate": 4.999384683453901e-06,
"loss": 1.304,
"step": 174
},
{
"epoch": 0.09922509922509923,
"grad_norm": 2.1586225032806396,
"learning_rate": 4.999367941616438e-06,
"loss": 1.2258,
"step": 175
},
{
"epoch": 0.0997920997920998,
"grad_norm": 2.058638334274292,
"learning_rate": 4.9993509750944565e-06,
"loss": 1.2461,
"step": 176
},
{
"epoch": 0.10035910035910035,
"grad_norm": 2.0724360942840576,
"learning_rate": 4.999333783889483e-06,
"loss": 1.2287,
"step": 177
},
{
"epoch": 0.10092610092610092,
"grad_norm": 1.9717096090316772,
"learning_rate": 4.999316368003062e-06,
"loss": 1.2764,
"step": 178
},
{
"epoch": 0.10149310149310149,
"grad_norm": 2.199272871017456,
"learning_rate": 4.999298727436758e-06,
"loss": 1.245,
"step": 179
},
{
"epoch": 0.10206010206010206,
"grad_norm": 2.1129369735717773,
"learning_rate": 4.999280862192158e-06,
"loss": 1.2592,
"step": 180
},
{
"epoch": 0.10262710262710263,
"grad_norm": 2.0408732891082764,
"learning_rate": 4.999262772270867e-06,
"loss": 1.2407,
"step": 181
},
{
"epoch": 0.10319410319410319,
"grad_norm": 2.1376776695251465,
"learning_rate": 4.999244457674514e-06,
"loss": 1.2328,
"step": 182
},
{
"epoch": 0.10376110376110376,
"grad_norm": 2.1702775955200195,
"learning_rate": 4.999225918404741e-06,
"loss": 1.2439,
"step": 183
},
{
"epoch": 0.10432810432810433,
"grad_norm": 2.6905198097229004,
"learning_rate": 4.9992071544632184e-06,
"loss": 1.2983,
"step": 184
},
{
"epoch": 0.1048951048951049,
"grad_norm": 2.080111265182495,
"learning_rate": 4.999188165851632e-06,
"loss": 1.2545,
"step": 185
},
{
"epoch": 0.10546210546210547,
"grad_norm": 2.1639678478240967,
"learning_rate": 4.999168952571687e-06,
"loss": 1.2709,
"step": 186
},
{
"epoch": 0.10602910602910603,
"grad_norm": 2.16153883934021,
"learning_rate": 4.999149514625113e-06,
"loss": 1.2861,
"step": 187
},
{
"epoch": 0.10659610659610659,
"grad_norm": 2.1005361080169678,
"learning_rate": 4.999129852013656e-06,
"loss": 1.2461,
"step": 188
},
{
"epoch": 0.10716310716310716,
"grad_norm": 2.059573173522949,
"learning_rate": 4.999109964739085e-06,
"loss": 1.2245,
"step": 189
},
{
"epoch": 0.10773010773010773,
"grad_norm": 2.1651785373687744,
"learning_rate": 4.999089852803186e-06,
"loss": 1.1909,
"step": 190
},
{
"epoch": 0.1082971082971083,
"grad_norm": 2.1926372051239014,
"learning_rate": 4.999069516207767e-06,
"loss": 1.2346,
"step": 191
},
{
"epoch": 0.10886410886410887,
"grad_norm": 1.9541335105895996,
"learning_rate": 4.999048954954658e-06,
"loss": 1.1564,
"step": 192
},
{
"epoch": 0.10943110943110942,
"grad_norm": 2.194918394088745,
"learning_rate": 4.9990281690457075e-06,
"loss": 1.2866,
"step": 193
},
{
"epoch": 0.10999810999811,
"grad_norm": 2.0585858821868896,
"learning_rate": 4.9990071584827815e-06,
"loss": 1.2152,
"step": 194
},
{
"epoch": 0.11056511056511056,
"grad_norm": 2.174222469329834,
"learning_rate": 4.998985923267771e-06,
"loss": 1.2421,
"step": 195
},
{
"epoch": 0.11113211113211113,
"grad_norm": 2.400076150894165,
"learning_rate": 4.998964463402583e-06,
"loss": 1.2473,
"step": 196
},
{
"epoch": 0.1116991116991117,
"grad_norm": 2.1600160598754883,
"learning_rate": 4.99894277888915e-06,
"loss": 1.2247,
"step": 197
},
{
"epoch": 0.11226611226611227,
"grad_norm": 2.108492851257324,
"learning_rate": 4.998920869729418e-06,
"loss": 1.2678,
"step": 198
},
{
"epoch": 0.11283311283311283,
"grad_norm": 2.3057072162628174,
"learning_rate": 4.998898735925357e-06,
"loss": 1.2647,
"step": 199
},
{
"epoch": 0.1134001134001134,
"grad_norm": 2.171736240386963,
"learning_rate": 4.998876377478959e-06,
"loss": 1.2397,
"step": 200
},
{
"epoch": 0.11396711396711397,
"grad_norm": 2.1163182258605957,
"learning_rate": 4.998853794392233e-06,
"loss": 1.2026,
"step": 201
},
{
"epoch": 0.11453411453411454,
"grad_norm": 2.1151411533355713,
"learning_rate": 4.998830986667207e-06,
"loss": 1.2562,
"step": 202
},
{
"epoch": 0.1151011151011151,
"grad_norm": 2.1629276275634766,
"learning_rate": 4.998807954305933e-06,
"loss": 1.242,
"step": 203
},
{
"epoch": 0.11566811566811566,
"grad_norm": 2.0635461807250977,
"learning_rate": 4.998784697310483e-06,
"loss": 1.2825,
"step": 204
},
{
"epoch": 0.11623511623511623,
"grad_norm": 2.018306255340576,
"learning_rate": 4.998761215682945e-06,
"loss": 1.2942,
"step": 205
},
{
"epoch": 0.1168021168021168,
"grad_norm": 2.0687899589538574,
"learning_rate": 4.998737509425432e-06,
"loss": 1.241,
"step": 206
},
{
"epoch": 0.11736911736911737,
"grad_norm": 2.129807233810425,
"learning_rate": 4.998713578540074e-06,
"loss": 1.2185,
"step": 207
},
{
"epoch": 0.11793611793611794,
"grad_norm": 2.1497106552124023,
"learning_rate": 4.998689423029022e-06,
"loss": 1.2279,
"step": 208
},
{
"epoch": 0.11850311850311851,
"grad_norm": 2.1885874271392822,
"learning_rate": 4.998665042894449e-06,
"loss": 1.2404,
"step": 209
},
{
"epoch": 0.11907011907011907,
"grad_norm": 2.271296739578247,
"learning_rate": 4.998640438138545e-06,
"loss": 1.242,
"step": 210
},
{
"epoch": 0.11963711963711963,
"grad_norm": 2.147714138031006,
"learning_rate": 4.998615608763524e-06,
"loss": 1.1742,
"step": 211
},
{
"epoch": 0.1202041202041202,
"grad_norm": 2.109957695007324,
"learning_rate": 4.998590554771615e-06,
"loss": 1.2519,
"step": 212
},
{
"epoch": 0.12077112077112077,
"grad_norm": 2.291804313659668,
"learning_rate": 4.998565276165073e-06,
"loss": 1.2788,
"step": 213
},
{
"epoch": 0.12133812133812134,
"grad_norm": 2.127683401107788,
"learning_rate": 4.998539772946169e-06,
"loss": 1.2399,
"step": 214
},
{
"epoch": 0.1219051219051219,
"grad_norm": 2.188720226287842,
"learning_rate": 4.998514045117197e-06,
"loss": 1.2295,
"step": 215
},
{
"epoch": 0.12247212247212247,
"grad_norm": 2.2163619995117188,
"learning_rate": 4.998488092680468e-06,
"loss": 1.2594,
"step": 216
},
{
"epoch": 0.12303912303912304,
"grad_norm": 2.2182250022888184,
"learning_rate": 4.998461915638316e-06,
"loss": 1.2382,
"step": 217
},
{
"epoch": 0.12360612360612361,
"grad_norm": 2.0882575511932373,
"learning_rate": 4.998435513993095e-06,
"loss": 1.2426,
"step": 218
},
{
"epoch": 0.12417312417312418,
"grad_norm": 2.2090935707092285,
"learning_rate": 4.998408887747177e-06,
"loss": 1.2609,
"step": 219
},
{
"epoch": 0.12474012474012475,
"grad_norm": 2.1636769771575928,
"learning_rate": 4.9983820369029565e-06,
"loss": 1.2703,
"step": 220
},
{
"epoch": 0.12530712530712532,
"grad_norm": 2.2115790843963623,
"learning_rate": 4.998354961462847e-06,
"loss": 1.242,
"step": 221
},
{
"epoch": 0.1258741258741259,
"grad_norm": 2.766953945159912,
"learning_rate": 4.998327661429282e-06,
"loss": 1.2074,
"step": 222
},
{
"epoch": 0.12644112644112643,
"grad_norm": 2.1348719596862793,
"learning_rate": 4.998300136804717e-06,
"loss": 1.2299,
"step": 223
},
{
"epoch": 0.127008127008127,
"grad_norm": 2.1691837310791016,
"learning_rate": 4.998272387591625e-06,
"loss": 1.272,
"step": 224
},
{
"epoch": 0.12757512757512757,
"grad_norm": 2.123806953430176,
"learning_rate": 4.998244413792501e-06,
"loss": 1.2037,
"step": 225
},
{
"epoch": 0.12814212814212814,
"grad_norm": 2.1003634929656982,
"learning_rate": 4.9982162154098605e-06,
"loss": 1.2979,
"step": 226
},
{
"epoch": 0.1287091287091287,
"grad_norm": 2.2071104049682617,
"learning_rate": 4.998187792446238e-06,
"loss": 1.2586,
"step": 227
},
{
"epoch": 0.12927612927612928,
"grad_norm": 1.992179274559021,
"learning_rate": 4.998159144904188e-06,
"loss": 1.2152,
"step": 228
},
{
"epoch": 0.12984312984312985,
"grad_norm": 2.196314573287964,
"learning_rate": 4.998130272786286e-06,
"loss": 1.2631,
"step": 229
},
{
"epoch": 0.13041013041013041,
"grad_norm": 2.072913885116577,
"learning_rate": 4.998101176095128e-06,
"loss": 1.2451,
"step": 230
},
{
"epoch": 0.13097713097713098,
"grad_norm": 2.045172691345215,
"learning_rate": 4.99807185483333e-06,
"loss": 1.1982,
"step": 231
},
{
"epoch": 0.13154413154413155,
"grad_norm": 2.30246639251709,
"learning_rate": 4.998042309003526e-06,
"loss": 1.3061,
"step": 232
},
{
"epoch": 0.13211113211113212,
"grad_norm": 1.9828840494155884,
"learning_rate": 4.9980125386083744e-06,
"loss": 1.2102,
"step": 233
},
{
"epoch": 0.13267813267813267,
"grad_norm": 2.0770061016082764,
"learning_rate": 4.99798254365055e-06,
"loss": 1.268,
"step": 234
},
{
"epoch": 0.13324513324513323,
"grad_norm": 2.1250219345092773,
"learning_rate": 4.99795232413275e-06,
"loss": 1.2187,
"step": 235
},
{
"epoch": 0.1338121338121338,
"grad_norm": 2.24729323387146,
"learning_rate": 4.99792188005769e-06,
"loss": 1.1909,
"step": 236
},
{
"epoch": 0.13437913437913437,
"grad_norm": 2.048240900039673,
"learning_rate": 4.997891211428109e-06,
"loss": 1.216,
"step": 237
},
{
"epoch": 0.13494613494613494,
"grad_norm": 2.0455849170684814,
"learning_rate": 4.997860318246761e-06,
"loss": 1.2172,
"step": 238
},
{
"epoch": 0.1355131355131355,
"grad_norm": 2.306419849395752,
"learning_rate": 4.997829200516426e-06,
"loss": 1.3109,
"step": 239
},
{
"epoch": 0.13608013608013608,
"grad_norm": 2.0704190731048584,
"learning_rate": 4.997797858239899e-06,
"loss": 1.1954,
"step": 240
},
{
"epoch": 0.13664713664713665,
"grad_norm": 2.1370325088500977,
"learning_rate": 4.997766291419999e-06,
"loss": 1.3341,
"step": 241
},
{
"epoch": 0.13721413721413722,
"grad_norm": 2.311892509460449,
"learning_rate": 4.997734500059564e-06,
"loss": 1.2123,
"step": 242
},
{
"epoch": 0.1377811377811378,
"grad_norm": 2.125614643096924,
"learning_rate": 4.997702484161451e-06,
"loss": 1.2384,
"step": 243
},
{
"epoch": 0.13834813834813836,
"grad_norm": 2.17887806892395,
"learning_rate": 4.997670243728538e-06,
"loss": 1.2155,
"step": 244
},
{
"epoch": 0.1389151389151389,
"grad_norm": 2.132019519805908,
"learning_rate": 4.9976377787637246e-06,
"loss": 1.1651,
"step": 245
},
{
"epoch": 0.13948213948213947,
"grad_norm": 2.275861978530884,
"learning_rate": 4.997605089269928e-06,
"loss": 1.2107,
"step": 246
},
{
"epoch": 0.14004914004914004,
"grad_norm": 2.1661062240600586,
"learning_rate": 4.997572175250087e-06,
"loss": 1.2028,
"step": 247
},
{
"epoch": 0.1406161406161406,
"grad_norm": 2.1466269493103027,
"learning_rate": 4.997539036707162e-06,
"loss": 1.2264,
"step": 248
},
{
"epoch": 0.14118314118314118,
"grad_norm": 2.0136516094207764,
"learning_rate": 4.997505673644129e-06,
"loss": 1.224,
"step": 249
},
{
"epoch": 0.14175014175014175,
"grad_norm": 2.0584123134613037,
"learning_rate": 4.9974720860639906e-06,
"loss": 1.2282,
"step": 250
},
{
"epoch": 0.14231714231714232,
"grad_norm": 2.3332486152648926,
"learning_rate": 4.997438273969764e-06,
"loss": 1.2475,
"step": 251
},
{
"epoch": 0.1428841428841429,
"grad_norm": 2.032762289047241,
"learning_rate": 4.997404237364489e-06,
"loss": 1.2194,
"step": 252
},
{
"epoch": 0.14345114345114346,
"grad_norm": 2.1560542583465576,
"learning_rate": 4.997369976251228e-06,
"loss": 1.21,
"step": 253
},
{
"epoch": 0.14401814401814403,
"grad_norm": 2.2596874237060547,
"learning_rate": 4.9973354906330565e-06,
"loss": 1.2647,
"step": 254
},
{
"epoch": 0.1445851445851446,
"grad_norm": 2.0843327045440674,
"learning_rate": 4.997300780513078e-06,
"loss": 1.1982,
"step": 255
},
{
"epoch": 0.14515214515214514,
"grad_norm": 2.177116870880127,
"learning_rate": 4.997265845894411e-06,
"loss": 1.2727,
"step": 256
},
{
"epoch": 0.1457191457191457,
"grad_norm": 2.1334030628204346,
"learning_rate": 4.997230686780197e-06,
"loss": 1.227,
"step": 257
},
{
"epoch": 0.14628614628614628,
"grad_norm": 2.182185649871826,
"learning_rate": 4.997195303173597e-06,
"loss": 1.2546,
"step": 258
},
{
"epoch": 0.14685314685314685,
"grad_norm": 2.2583160400390625,
"learning_rate": 4.99715969507779e-06,
"loss": 1.2269,
"step": 259
},
{
"epoch": 0.14742014742014742,
"grad_norm": 2.1098732948303223,
"learning_rate": 4.99712386249598e-06,
"loss": 1.2527,
"step": 260
},
{
"epoch": 0.147987147987148,
"grad_norm": 2.074742317199707,
"learning_rate": 4.997087805431385e-06,
"loss": 1.1974,
"step": 261
},
{
"epoch": 0.14855414855414856,
"grad_norm": 2.2374022006988525,
"learning_rate": 4.997051523887249e-06,
"loss": 1.2668,
"step": 262
},
{
"epoch": 0.14912114912114913,
"grad_norm": 2.2284483909606934,
"learning_rate": 4.997015017866832e-06,
"loss": 1.1923,
"step": 263
},
{
"epoch": 0.1496881496881497,
"grad_norm": 2.0552480220794678,
"learning_rate": 4.9969782873734165e-06,
"loss": 1.2624,
"step": 264
},
{
"epoch": 0.15025515025515027,
"grad_norm": 1.9496517181396484,
"learning_rate": 4.996941332410304e-06,
"loss": 1.2363,
"step": 265
},
{
"epoch": 0.15082215082215084,
"grad_norm": 2.10217547416687,
"learning_rate": 4.996904152980817e-06,
"loss": 1.2402,
"step": 266
},
{
"epoch": 0.15138915138915138,
"grad_norm": 2.3567399978637695,
"learning_rate": 4.996866749088298e-06,
"loss": 1.185,
"step": 267
},
{
"epoch": 0.15195615195615195,
"grad_norm": 2.2061548233032227,
"learning_rate": 4.996829120736109e-06,
"loss": 1.199,
"step": 268
},
{
"epoch": 0.15252315252315252,
"grad_norm": 2.2688632011413574,
"learning_rate": 4.996791267927632e-06,
"loss": 1.1538,
"step": 269
},
{
"epoch": 0.15309015309015309,
"grad_norm": 2.0387306213378906,
"learning_rate": 4.996753190666272e-06,
"loss": 1.2068,
"step": 270
},
{
"epoch": 0.15365715365715366,
"grad_norm": 1.9924688339233398,
"learning_rate": 4.9967148889554495e-06,
"loss": 1.1666,
"step": 271
},
{
"epoch": 0.15422415422415423,
"grad_norm": 2.164651393890381,
"learning_rate": 4.99667636279861e-06,
"loss": 1.2135,
"step": 272
},
{
"epoch": 0.1547911547911548,
"grad_norm": 2.251842737197876,
"learning_rate": 4.996637612199215e-06,
"loss": 1.2516,
"step": 273
},
{
"epoch": 0.15535815535815536,
"grad_norm": 2.039698600769043,
"learning_rate": 4.996598637160749e-06,
"loss": 1.1277,
"step": 274
},
{
"epoch": 0.15592515592515593,
"grad_norm": 2.076981544494629,
"learning_rate": 4.996559437686716e-06,
"loss": 1.2322,
"step": 275
},
{
"epoch": 0.1564921564921565,
"grad_norm": 2.067885398864746,
"learning_rate": 4.996520013780638e-06,
"loss": 1.236,
"step": 276
},
{
"epoch": 0.15705915705915707,
"grad_norm": 1.9662197828292847,
"learning_rate": 4.996480365446061e-06,
"loss": 1.1706,
"step": 277
},
{
"epoch": 0.15762615762615761,
"grad_norm": 1.9575865268707275,
"learning_rate": 4.99644049268655e-06,
"loss": 1.1679,
"step": 278
},
{
"epoch": 0.15819315819315818,
"grad_norm": 1.9785221815109253,
"learning_rate": 4.996400395505686e-06,
"loss": 1.2064,
"step": 279
},
{
"epoch": 0.15876015876015875,
"grad_norm": 1.8600579500198364,
"learning_rate": 4.996360073907077e-06,
"loss": 1.1616,
"step": 280
},
{
"epoch": 0.15932715932715932,
"grad_norm": 1.9530023336410522,
"learning_rate": 4.996319527894347e-06,
"loss": 1.1516,
"step": 281
},
{
"epoch": 0.1598941598941599,
"grad_norm": 1.9305200576782227,
"learning_rate": 4.996278757471139e-06,
"loss": 1.1937,
"step": 282
},
{
"epoch": 0.16046116046116046,
"grad_norm": 2.1341655254364014,
"learning_rate": 4.996237762641121e-06,
"loss": 1.2599,
"step": 283
},
{
"epoch": 0.16102816102816103,
"grad_norm": 2.364447593688965,
"learning_rate": 4.996196543407976e-06,
"loss": 1.2051,
"step": 284
},
{
"epoch": 0.1615951615951616,
"grad_norm": 2.126579523086548,
"learning_rate": 4.996155099775411e-06,
"loss": 1.2119,
"step": 285
},
{
"epoch": 0.16216216216216217,
"grad_norm": 2.0040011405944824,
"learning_rate": 4.99611343174715e-06,
"loss": 1.2536,
"step": 286
},
{
"epoch": 0.16272916272916274,
"grad_norm": 2.0829451084136963,
"learning_rate": 4.99607153932694e-06,
"loss": 1.1956,
"step": 287
},
{
"epoch": 0.16329616329616328,
"grad_norm": 2.0177149772644043,
"learning_rate": 4.996029422518547e-06,
"loss": 1.1834,
"step": 288
},
{
"epoch": 0.16386316386316385,
"grad_norm": 2.0412003993988037,
"learning_rate": 4.995987081325757e-06,
"loss": 1.2371,
"step": 289
},
{
"epoch": 0.16443016443016442,
"grad_norm": 2.0813381671905518,
"learning_rate": 4.995944515752377e-06,
"loss": 1.2217,
"step": 290
},
{
"epoch": 0.164997164997165,
"grad_norm": 2.190868377685547,
"learning_rate": 4.995901725802231e-06,
"loss": 1.2081,
"step": 291
},
{
"epoch": 0.16556416556416556,
"grad_norm": 1.9888038635253906,
"learning_rate": 4.995858711479169e-06,
"loss": 1.1693,
"step": 292
},
{
"epoch": 0.16613116613116613,
"grad_norm": 2.158754348754883,
"learning_rate": 4.995815472787055e-06,
"loss": 1.2367,
"step": 293
},
{
"epoch": 0.1666981666981667,
"grad_norm": 2.0454671382904053,
"learning_rate": 4.995772009729778e-06,
"loss": 1.1847,
"step": 294
},
{
"epoch": 0.16726516726516727,
"grad_norm": 2.2318146228790283,
"learning_rate": 4.995728322311244e-06,
"loss": 1.1736,
"step": 295
},
{
"epoch": 0.16783216783216784,
"grad_norm": 2.050093412399292,
"learning_rate": 4.995684410535382e-06,
"loss": 1.2201,
"step": 296
},
{
"epoch": 0.1683991683991684,
"grad_norm": 2.1249499320983887,
"learning_rate": 4.995640274406137e-06,
"loss": 1.2297,
"step": 297
},
{
"epoch": 0.16896616896616898,
"grad_norm": 2.013911008834839,
"learning_rate": 4.995595913927478e-06,
"loss": 1.2292,
"step": 298
},
{
"epoch": 0.16953316953316952,
"grad_norm": 3.098249673843384,
"learning_rate": 4.995551329103393e-06,
"loss": 1.1848,
"step": 299
},
{
"epoch": 0.1701001701001701,
"grad_norm": 2.1460604667663574,
"learning_rate": 4.99550651993789e-06,
"loss": 1.2376,
"step": 300
},
{
"epoch": 0.17066717066717066,
"grad_norm": 2.1938812732696533,
"learning_rate": 4.995461486434997e-06,
"loss": 1.1589,
"step": 301
},
{
"epoch": 0.17123417123417123,
"grad_norm": 2.1145992279052734,
"learning_rate": 4.995416228598763e-06,
"loss": 1.2085,
"step": 302
},
{
"epoch": 0.1718011718011718,
"grad_norm": 2.0934178829193115,
"learning_rate": 4.995370746433256e-06,
"loss": 1.2386,
"step": 303
},
{
"epoch": 0.17236817236817237,
"grad_norm": 2.2182319164276123,
"learning_rate": 4.995325039942563e-06,
"loss": 1.1851,
"step": 304
},
{
"epoch": 0.17293517293517294,
"grad_norm": 2.167012929916382,
"learning_rate": 4.995279109130796e-06,
"loss": 1.1872,
"step": 305
},
{
"epoch": 0.1735021735021735,
"grad_norm": 2.0496084690093994,
"learning_rate": 4.995232954002082e-06,
"loss": 1.1956,
"step": 306
},
{
"epoch": 0.17406917406917408,
"grad_norm": 2.009467124938965,
"learning_rate": 4.9951865745605705e-06,
"loss": 1.1736,
"step": 307
},
{
"epoch": 0.17463617463617465,
"grad_norm": 2.2250547409057617,
"learning_rate": 4.995139970810431e-06,
"loss": 1.2494,
"step": 308
},
{
"epoch": 0.17520317520317522,
"grad_norm": 2.7782530784606934,
"learning_rate": 4.995093142755854e-06,
"loss": 1.1837,
"step": 309
},
{
"epoch": 0.17577017577017576,
"grad_norm": 2.1650445461273193,
"learning_rate": 4.995046090401047e-06,
"loss": 1.1792,
"step": 310
},
{
"epoch": 0.17633717633717633,
"grad_norm": 2.4029552936553955,
"learning_rate": 4.994998813750241e-06,
"loss": 1.2021,
"step": 311
},
{
"epoch": 0.1769041769041769,
"grad_norm": 2.227540969848633,
"learning_rate": 4.994951312807687e-06,
"loss": 1.1454,
"step": 312
},
{
"epoch": 0.17747117747117747,
"grad_norm": 2.060182571411133,
"learning_rate": 4.994903587577653e-06,
"loss": 1.2367,
"step": 313
},
{
"epoch": 0.17803817803817804,
"grad_norm": 2.069013833999634,
"learning_rate": 4.994855638064432e-06,
"loss": 1.2495,
"step": 314
},
{
"epoch": 0.1786051786051786,
"grad_norm": 2.2063608169555664,
"learning_rate": 4.994807464272332e-06,
"loss": 1.1819,
"step": 315
},
{
"epoch": 0.17917217917217917,
"grad_norm": 1.9642980098724365,
"learning_rate": 4.994759066205685e-06,
"loss": 1.1656,
"step": 316
},
{
"epoch": 0.17973917973917974,
"grad_norm": 2.192549228668213,
"learning_rate": 4.994710443868842e-06,
"loss": 1.2461,
"step": 317
},
{
"epoch": 0.1803061803061803,
"grad_norm": 1.8663341999053955,
"learning_rate": 4.9946615972661735e-06,
"loss": 1.1618,
"step": 318
},
{
"epoch": 0.18087318087318088,
"grad_norm": 1.9651904106140137,
"learning_rate": 4.994612526402071e-06,
"loss": 1.2028,
"step": 319
},
{
"epoch": 0.18144018144018145,
"grad_norm": 2.250514030456543,
"learning_rate": 4.9945632312809444e-06,
"loss": 1.2224,
"step": 320
},
{
"epoch": 0.182007182007182,
"grad_norm": 2.1259069442749023,
"learning_rate": 4.994513711907227e-06,
"loss": 1.2486,
"step": 321
},
{
"epoch": 0.18257418257418256,
"grad_norm": 2.530109405517578,
"learning_rate": 4.994463968285369e-06,
"loss": 1.2559,
"step": 322
},
{
"epoch": 0.18314118314118313,
"grad_norm": 2.106372833251953,
"learning_rate": 4.994414000419844e-06,
"loss": 1.1923,
"step": 323
},
{
"epoch": 0.1837081837081837,
"grad_norm": 2.2210237979888916,
"learning_rate": 4.994363808315141e-06,
"loss": 1.2749,
"step": 324
},
{
"epoch": 0.18427518427518427,
"grad_norm": 2.1548731327056885,
"learning_rate": 4.994313391975775e-06,
"loss": 1.2003,
"step": 325
},
{
"epoch": 0.18484218484218484,
"grad_norm": 2.310274124145508,
"learning_rate": 4.994262751406277e-06,
"loss": 1.2314,
"step": 326
},
{
"epoch": 0.1854091854091854,
"grad_norm": 2.3611135482788086,
"learning_rate": 4.9942118866112e-06,
"loss": 1.1846,
"step": 327
},
{
"epoch": 0.18597618597618598,
"grad_norm": 2.2071614265441895,
"learning_rate": 4.994160797595115e-06,
"loss": 1.1793,
"step": 328
},
{
"epoch": 0.18654318654318655,
"grad_norm": 1.9791762828826904,
"learning_rate": 4.994109484362617e-06,
"loss": 1.2402,
"step": 329
},
{
"epoch": 0.18711018711018712,
"grad_norm": 2.1133017539978027,
"learning_rate": 4.9940579469183174e-06,
"loss": 1.1781,
"step": 330
},
{
"epoch": 0.1876771876771877,
"grad_norm": 2.1351208686828613,
"learning_rate": 4.994006185266848e-06,
"loss": 1.1897,
"step": 331
},
{
"epoch": 0.18824418824418823,
"grad_norm": 1.9921027421951294,
"learning_rate": 4.9939541994128646e-06,
"loss": 1.1257,
"step": 332
},
{
"epoch": 0.1888111888111888,
"grad_norm": 2.026926279067993,
"learning_rate": 4.99390198936104e-06,
"loss": 1.1219,
"step": 333
},
{
"epoch": 0.18937818937818937,
"grad_norm": 2.0869030952453613,
"learning_rate": 4.993849555116067e-06,
"loss": 1.1899,
"step": 334
},
{
"epoch": 0.18994518994518994,
"grad_norm": 2.1541025638580322,
"learning_rate": 4.9937968966826595e-06,
"loss": 1.231,
"step": 335
},
{
"epoch": 0.1905121905121905,
"grad_norm": 1.927504062652588,
"learning_rate": 4.993744014065551e-06,
"loss": 1.1633,
"step": 336
},
{
"epoch": 0.19107919107919108,
"grad_norm": 2.2648391723632812,
"learning_rate": 4.993690907269496e-06,
"loss": 1.1915,
"step": 337
},
{
"epoch": 0.19164619164619165,
"grad_norm": 2.340109348297119,
"learning_rate": 4.993637576299268e-06,
"loss": 1.1964,
"step": 338
},
{
"epoch": 0.19221319221319222,
"grad_norm": 2.209946393966675,
"learning_rate": 4.993584021159662e-06,
"loss": 1.2111,
"step": 339
},
{
"epoch": 0.1927801927801928,
"grad_norm": 2.405576705932617,
"learning_rate": 4.993530241855491e-06,
"loss": 1.1626,
"step": 340
},
{
"epoch": 0.19334719334719336,
"grad_norm": 2.1429643630981445,
"learning_rate": 4.993476238391591e-06,
"loss": 1.193,
"step": 341
},
{
"epoch": 0.19391419391419393,
"grad_norm": 2.2143402099609375,
"learning_rate": 4.993422010772817e-06,
"loss": 1.2702,
"step": 342
},
{
"epoch": 0.19448119448119447,
"grad_norm": 2.0782816410064697,
"learning_rate": 4.993367559004043e-06,
"loss": 1.1514,
"step": 343
},
{
"epoch": 0.19504819504819504,
"grad_norm": 2.1788971424102783,
"learning_rate": 4.993312883090164e-06,
"loss": 1.1957,
"step": 344
},
{
"epoch": 0.1956151956151956,
"grad_norm": 2.1041131019592285,
"learning_rate": 4.993257983036095e-06,
"loss": 1.2639,
"step": 345
},
{
"epoch": 0.19618219618219618,
"grad_norm": 2.1448371410369873,
"learning_rate": 4.993202858846773e-06,
"loss": 1.2499,
"step": 346
},
{
"epoch": 0.19674919674919675,
"grad_norm": 2.086442470550537,
"learning_rate": 4.993147510527151e-06,
"loss": 1.2106,
"step": 347
},
{
"epoch": 0.19731619731619732,
"grad_norm": 1.9370099306106567,
"learning_rate": 4.993091938082206e-06,
"loss": 1.145,
"step": 348
},
{
"epoch": 0.1978831978831979,
"grad_norm": 2.2078094482421875,
"learning_rate": 4.993036141516934e-06,
"loss": 1.2206,
"step": 349
},
{
"epoch": 0.19845019845019846,
"grad_norm": 2.2671496868133545,
"learning_rate": 4.99298012083635e-06,
"loss": 1.1099,
"step": 350
},
{
"epoch": 0.19901719901719903,
"grad_norm": 2.2037370204925537,
"learning_rate": 4.9929238760454915e-06,
"loss": 1.2188,
"step": 351
},
{
"epoch": 0.1995841995841996,
"grad_norm": 2.1167116165161133,
"learning_rate": 4.9928674071494125e-06,
"loss": 1.2037,
"step": 352
},
{
"epoch": 0.20015120015120016,
"grad_norm": 2.066723585128784,
"learning_rate": 4.992810714153191e-06,
"loss": 1.1682,
"step": 353
},
{
"epoch": 0.2007182007182007,
"grad_norm": 2.2286489009857178,
"learning_rate": 4.992753797061924e-06,
"loss": 1.1563,
"step": 354
},
{
"epoch": 0.20128520128520128,
"grad_norm": 2.165644645690918,
"learning_rate": 4.992696655880727e-06,
"loss": 1.1885,
"step": 355
},
{
"epoch": 0.20185220185220185,
"grad_norm": 2.1326098442077637,
"learning_rate": 4.992639290614736e-06,
"loss": 1.1434,
"step": 356
},
{
"epoch": 0.20241920241920242,
"grad_norm": 2.0205516815185547,
"learning_rate": 4.99258170126911e-06,
"loss": 1.1146,
"step": 357
},
{
"epoch": 0.20298620298620298,
"grad_norm": 2.1004693508148193,
"learning_rate": 4.992523887849025e-06,
"loss": 1.1704,
"step": 358
},
{
"epoch": 0.20355320355320355,
"grad_norm": 2.3303492069244385,
"learning_rate": 4.992465850359679e-06,
"loss": 1.2289,
"step": 359
},
{
"epoch": 0.20412020412020412,
"grad_norm": 2.013455867767334,
"learning_rate": 4.992407588806287e-06,
"loss": 1.1745,
"step": 360
},
{
"epoch": 0.2046872046872047,
"grad_norm": 2.2908389568328857,
"learning_rate": 4.9923491031940895e-06,
"loss": 1.2258,
"step": 361
},
{
"epoch": 0.20525420525420526,
"grad_norm": 2.0814812183380127,
"learning_rate": 4.9922903935283425e-06,
"loss": 1.1626,
"step": 362
},
{
"epoch": 0.20582120582120583,
"grad_norm": 2.045369863510132,
"learning_rate": 4.992231459814324e-06,
"loss": 1.1758,
"step": 363
},
{
"epoch": 0.20638820638820637,
"grad_norm": 1.986330270767212,
"learning_rate": 4.992172302057332e-06,
"loss": 1.1153,
"step": 364
},
{
"epoch": 0.20695520695520694,
"grad_norm": 3.600193738937378,
"learning_rate": 4.9921129202626856e-06,
"loss": 1.171,
"step": 365
},
{
"epoch": 0.2075222075222075,
"grad_norm": 2.119173288345337,
"learning_rate": 4.992053314435722e-06,
"loss": 1.152,
"step": 366
},
{
"epoch": 0.20808920808920808,
"grad_norm": 2.0884904861450195,
"learning_rate": 4.9919934845817984e-06,
"loss": 1.1959,
"step": 367
},
{
"epoch": 0.20865620865620865,
"grad_norm": 2.014221668243408,
"learning_rate": 4.991933430706296e-06,
"loss": 1.1839,
"step": 368
},
{
"epoch": 0.20922320922320922,
"grad_norm": 1.9567116498947144,
"learning_rate": 4.9918731528146115e-06,
"loss": 1.2124,
"step": 369
},
{
"epoch": 0.2097902097902098,
"grad_norm": 2.1171345710754395,
"learning_rate": 4.991812650912163e-06,
"loss": 1.1805,
"step": 370
},
{
"epoch": 0.21035721035721036,
"grad_norm": 2.2752904891967773,
"learning_rate": 4.991751925004392e-06,
"loss": 1.2005,
"step": 371
},
{
"epoch": 0.21092421092421093,
"grad_norm": 2.025243043899536,
"learning_rate": 4.991690975096756e-06,
"loss": 1.1823,
"step": 372
},
{
"epoch": 0.2114912114912115,
"grad_norm": 2.2367069721221924,
"learning_rate": 4.991629801194734e-06,
"loss": 1.1298,
"step": 373
},
{
"epoch": 0.21205821205821207,
"grad_norm": 2.109471082687378,
"learning_rate": 4.991568403303825e-06,
"loss": 1.2322,
"step": 374
},
{
"epoch": 0.2126252126252126,
"grad_norm": 2.2278378009796143,
"learning_rate": 4.99150678142955e-06,
"loss": 1.1766,
"step": 375
},
{
"epoch": 0.21319221319221318,
"grad_norm": 2.0208706855773926,
"learning_rate": 4.991444935577447e-06,
"loss": 1.1748,
"step": 376
},
{
"epoch": 0.21375921375921375,
"grad_norm": 2.2481842041015625,
"learning_rate": 4.991382865753077e-06,
"loss": 1.2832,
"step": 377
},
{
"epoch": 0.21432621432621432,
"grad_norm": 2.2334558963775635,
"learning_rate": 4.9913205719620195e-06,
"loss": 1.2306,
"step": 378
},
{
"epoch": 0.2148932148932149,
"grad_norm": 2.373222589492798,
"learning_rate": 4.991258054209873e-06,
"loss": 1.1926,
"step": 379
},
{
"epoch": 0.21546021546021546,
"grad_norm": 2.2954397201538086,
"learning_rate": 4.9911953125022606e-06,
"loss": 1.25,
"step": 380
},
{
"epoch": 0.21602721602721603,
"grad_norm": 2.0847930908203125,
"learning_rate": 4.991132346844819e-06,
"loss": 1.1645,
"step": 381
},
{
"epoch": 0.2165942165942166,
"grad_norm": 2.0304839611053467,
"learning_rate": 4.991069157243212e-06,
"loss": 1.1687,
"step": 382
},
{
"epoch": 0.21716121716121717,
"grad_norm": 2.000683546066284,
"learning_rate": 4.991005743703118e-06,
"loss": 1.1637,
"step": 383
},
{
"epoch": 0.21772821772821774,
"grad_norm": 2.1974005699157715,
"learning_rate": 4.990942106230238e-06,
"loss": 1.168,
"step": 384
},
{
"epoch": 0.2182952182952183,
"grad_norm": 2.0585193634033203,
"learning_rate": 4.990878244830294e-06,
"loss": 1.213,
"step": 385
},
{
"epoch": 0.21886221886221885,
"grad_norm": 2.027149200439453,
"learning_rate": 4.990814159509025e-06,
"loss": 1.1494,
"step": 386
},
{
"epoch": 0.21942921942921942,
"grad_norm": 2.1952474117279053,
"learning_rate": 4.990749850272193e-06,
"loss": 1.1986,
"step": 387
},
{
"epoch": 0.21999621999622,
"grad_norm": 1.9279240369796753,
"learning_rate": 4.990685317125579e-06,
"loss": 1.2058,
"step": 388
},
{
"epoch": 0.22056322056322056,
"grad_norm": 1.9203171730041504,
"learning_rate": 4.9906205600749855e-06,
"loss": 1.143,
"step": 389
},
{
"epoch": 0.22113022113022113,
"grad_norm": 2.044790506362915,
"learning_rate": 4.990555579126232e-06,
"loss": 1.2609,
"step": 390
},
{
"epoch": 0.2216972216972217,
"grad_norm": 2.142638921737671,
"learning_rate": 4.99049037428516e-06,
"loss": 1.2095,
"step": 391
},
{
"epoch": 0.22226422226422227,
"grad_norm": 2.184535026550293,
"learning_rate": 4.990424945557635e-06,
"loss": 1.153,
"step": 392
},
{
"epoch": 0.22283122283122284,
"grad_norm": 2.1759684085845947,
"learning_rate": 4.990359292949534e-06,
"loss": 1.1759,
"step": 393
},
{
"epoch": 0.2233982233982234,
"grad_norm": 2.133268356323242,
"learning_rate": 4.990293416466761e-06,
"loss": 1.2163,
"step": 394
},
{
"epoch": 0.22396522396522398,
"grad_norm": 2.0592682361602783,
"learning_rate": 4.9902273161152385e-06,
"loss": 1.1854,
"step": 395
},
{
"epoch": 0.22453222453222454,
"grad_norm": 1.9705684185028076,
"learning_rate": 4.990160991900907e-06,
"loss": 1.2054,
"step": 396
},
{
"epoch": 0.2250992250992251,
"grad_norm": 1.9913262128829956,
"learning_rate": 4.990094443829732e-06,
"loss": 1.1733,
"step": 397
},
{
"epoch": 0.22566622566622566,
"grad_norm": 2.1340818405151367,
"learning_rate": 4.990027671907692e-06,
"loss": 1.147,
"step": 398
},
{
"epoch": 0.22623322623322623,
"grad_norm": 2.1737279891967773,
"learning_rate": 4.989960676140793e-06,
"loss": 1.2293,
"step": 399
},
{
"epoch": 0.2268002268002268,
"grad_norm": 2.1729650497436523,
"learning_rate": 4.989893456535056e-06,
"loss": 1.2801,
"step": 400
},
{
"epoch": 0.22736722736722736,
"grad_norm": 2.1917948722839355,
"learning_rate": 4.989826013096522e-06,
"loss": 1.1536,
"step": 401
},
{
"epoch": 0.22793422793422793,
"grad_norm": 2.1080565452575684,
"learning_rate": 4.989758345831258e-06,
"loss": 1.2141,
"step": 402
},
{
"epoch": 0.2285012285012285,
"grad_norm": 2.110219717025757,
"learning_rate": 4.989690454745345e-06,
"loss": 1.2062,
"step": 403
},
{
"epoch": 0.22906822906822907,
"grad_norm": 2.035661220550537,
"learning_rate": 4.989622339844886e-06,
"loss": 1.1714,
"step": 404
},
{
"epoch": 0.22963522963522964,
"grad_norm": 2.1241447925567627,
"learning_rate": 4.989554001136003e-06,
"loss": 1.1623,
"step": 405
},
{
"epoch": 0.2302022302022302,
"grad_norm": 2.0898942947387695,
"learning_rate": 4.989485438624843e-06,
"loss": 1.1604,
"step": 406
},
{
"epoch": 0.23076923076923078,
"grad_norm": 2.0134730339050293,
"learning_rate": 4.989416652317566e-06,
"loss": 1.225,
"step": 407
},
{
"epoch": 0.23133623133623132,
"grad_norm": 2.0604565143585205,
"learning_rate": 4.989347642220357e-06,
"loss": 1.1959,
"step": 408
},
{
"epoch": 0.2319032319032319,
"grad_norm": 2.229936361312866,
"learning_rate": 4.98927840833942e-06,
"loss": 1.1592,
"step": 409
},
{
"epoch": 0.23247023247023246,
"grad_norm": 2.2757463455200195,
"learning_rate": 4.989208950680979e-06,
"loss": 1.148,
"step": 410
},
{
"epoch": 0.23303723303723303,
"grad_norm": 2.0375144481658936,
"learning_rate": 4.989139269251278e-06,
"loss": 1.2944,
"step": 411
},
{
"epoch": 0.2336042336042336,
"grad_norm": 2.230483293533325,
"learning_rate": 4.98906936405658e-06,
"loss": 1.1945,
"step": 412
},
{
"epoch": 0.23417123417123417,
"grad_norm": 1.9437626600265503,
"learning_rate": 4.988999235103171e-06,
"loss": 1.1952,
"step": 413
},
{
"epoch": 0.23473823473823474,
"grad_norm": 2.0950779914855957,
"learning_rate": 4.9889288823973535e-06,
"loss": 1.2084,
"step": 414
},
{
"epoch": 0.2353052353052353,
"grad_norm": 2.0104267597198486,
"learning_rate": 4.9888583059454536e-06,
"loss": 1.1729,
"step": 415
},
{
"epoch": 0.23587223587223588,
"grad_norm": 2.0139896869659424,
"learning_rate": 4.988787505753815e-06,
"loss": 1.1628,
"step": 416
},
{
"epoch": 0.23643923643923645,
"grad_norm": 2.155890703201294,
"learning_rate": 4.9887164818288016e-06,
"loss": 1.1806,
"step": 417
},
{
"epoch": 0.23700623700623702,
"grad_norm": 2.0404961109161377,
"learning_rate": 4.9886452341768e-06,
"loss": 1.1794,
"step": 418
},
{
"epoch": 0.23757323757323756,
"grad_norm": 2.07344126701355,
"learning_rate": 4.988573762804214e-06,
"loss": 1.1908,
"step": 419
},
{
"epoch": 0.23814023814023813,
"grad_norm": 2.1261799335479736,
"learning_rate": 4.988502067717469e-06,
"loss": 1.2493,
"step": 420
},
{
"epoch": 0.2387072387072387,
"grad_norm": 2.111435651779175,
"learning_rate": 4.98843014892301e-06,
"loss": 1.0969,
"step": 421
},
{
"epoch": 0.23927423927423927,
"grad_norm": 2.3221940994262695,
"learning_rate": 4.988358006427303e-06,
"loss": 1.2238,
"step": 422
},
{
"epoch": 0.23984123984123984,
"grad_norm": 2.3007023334503174,
"learning_rate": 4.988285640236832e-06,
"loss": 1.2167,
"step": 423
},
{
"epoch": 0.2404082404082404,
"grad_norm": 2.247527837753296,
"learning_rate": 4.988213050358103e-06,
"loss": 1.1468,
"step": 424
},
{
"epoch": 0.24097524097524098,
"grad_norm": 2.294705867767334,
"learning_rate": 4.988140236797642e-06,
"loss": 1.1917,
"step": 425
},
{
"epoch": 0.24154224154224155,
"grad_norm": 2.0505738258361816,
"learning_rate": 4.9880671995619935e-06,
"loss": 1.1772,
"step": 426
},
{
"epoch": 0.24210924210924212,
"grad_norm": 2.2414138317108154,
"learning_rate": 4.987993938657725e-06,
"loss": 1.1632,
"step": 427
},
{
"epoch": 0.2426762426762427,
"grad_norm": 2.2510695457458496,
"learning_rate": 4.987920454091422e-06,
"loss": 1.235,
"step": 428
},
{
"epoch": 0.24324324324324326,
"grad_norm": 1.9748800992965698,
"learning_rate": 4.987846745869689e-06,
"loss": 1.1171,
"step": 429
},
{
"epoch": 0.2438102438102438,
"grad_norm": 2.399214506149292,
"learning_rate": 4.987772813999154e-06,
"loss": 1.2321,
"step": 430
},
{
"epoch": 0.24437724437724437,
"grad_norm": 2.1555111408233643,
"learning_rate": 4.987698658486462e-06,
"loss": 1.147,
"step": 431
},
{
"epoch": 0.24494424494424494,
"grad_norm": 2.1852099895477295,
"learning_rate": 4.9876242793382795e-06,
"loss": 1.2108,
"step": 432
},
{
"epoch": 0.2455112455112455,
"grad_norm": 2.2297446727752686,
"learning_rate": 4.9875496765612935e-06,
"loss": 1.2424,
"step": 433
},
{
"epoch": 0.24607824607824608,
"grad_norm": 1.976022481918335,
"learning_rate": 4.98747485016221e-06,
"loss": 1.1248,
"step": 434
},
{
"epoch": 0.24664524664524665,
"grad_norm": 2.116549015045166,
"learning_rate": 4.9873998001477564e-06,
"loss": 1.1704,
"step": 435
},
{
"epoch": 0.24721224721224722,
"grad_norm": 2.195775270462036,
"learning_rate": 4.987324526524678e-06,
"loss": 1.206,
"step": 436
},
{
"epoch": 0.24777924777924779,
"grad_norm": 2.012995719909668,
"learning_rate": 4.987249029299743e-06,
"loss": 1.1893,
"step": 437
},
{
"epoch": 0.24834624834624835,
"grad_norm": 1.9351956844329834,
"learning_rate": 4.987173308479738e-06,
"loss": 1.2489,
"step": 438
},
{
"epoch": 0.24891324891324892,
"grad_norm": 2.075611114501953,
"learning_rate": 4.98709736407147e-06,
"loss": 1.2025,
"step": 439
},
{
"epoch": 0.2494802494802495,
"grad_norm": 2.0834317207336426,
"learning_rate": 4.987021196081766e-06,
"loss": 1.1608,
"step": 440
},
{
"epoch": 0.25004725004725004,
"grad_norm": 1.9521535634994507,
"learning_rate": 4.986944804517473e-06,
"loss": 1.0972,
"step": 441
},
{
"epoch": 0.25061425061425063,
"grad_norm": 2.0758798122406006,
"learning_rate": 4.986868189385459e-06,
"loss": 1.1865,
"step": 442
},
{
"epoch": 0.2511812511812512,
"grad_norm": 2.025371789932251,
"learning_rate": 4.98679135069261e-06,
"loss": 1.1592,
"step": 443
},
{
"epoch": 0.2517482517482518,
"grad_norm": 2.3723526000976562,
"learning_rate": 4.986714288445835e-06,
"loss": 1.2082,
"step": 444
},
{
"epoch": 0.2523152523152523,
"grad_norm": 2.081716299057007,
"learning_rate": 4.986637002652061e-06,
"loss": 1.1611,
"step": 445
},
{
"epoch": 0.25288225288225286,
"grad_norm": 2.23604679107666,
"learning_rate": 4.986559493318237e-06,
"loss": 1.1433,
"step": 446
},
{
"epoch": 0.25344925344925345,
"grad_norm": 2.1360273361206055,
"learning_rate": 4.986481760451329e-06,
"loss": 1.1006,
"step": 447
},
{
"epoch": 0.254016254016254,
"grad_norm": 2.1812918186187744,
"learning_rate": 4.986403804058326e-06,
"loss": 1.153,
"step": 448
},
{
"epoch": 0.2545832545832546,
"grad_norm": 2.2628848552703857,
"learning_rate": 4.986325624146236e-06,
"loss": 1.2299,
"step": 449
},
{
"epoch": 0.25515025515025513,
"grad_norm": 2.1486809253692627,
"learning_rate": 4.986247220722085e-06,
"loss": 1.1961,
"step": 450
},
{
"epoch": 0.25571725571725573,
"grad_norm": 2.16284441947937,
"learning_rate": 4.986168593792924e-06,
"loss": 1.2001,
"step": 451
},
{
"epoch": 0.2562842562842563,
"grad_norm": 2.333041191101074,
"learning_rate": 4.986089743365821e-06,
"loss": 1.1224,
"step": 452
},
{
"epoch": 0.25685125685125687,
"grad_norm": 2.0724565982818604,
"learning_rate": 4.986010669447863e-06,
"loss": 1.1522,
"step": 453
},
{
"epoch": 0.2574182574182574,
"grad_norm": 2.077805519104004,
"learning_rate": 4.985931372046159e-06,
"loss": 1.1658,
"step": 454
},
{
"epoch": 0.257985257985258,
"grad_norm": 2.3617515563964844,
"learning_rate": 4.985851851167838e-06,
"loss": 1.2318,
"step": 455
},
{
"epoch": 0.25855225855225855,
"grad_norm": 2.1975150108337402,
"learning_rate": 4.985772106820048e-06,
"loss": 1.155,
"step": 456
},
{
"epoch": 0.2591192591192591,
"grad_norm": 2.175889015197754,
"learning_rate": 4.985692139009958e-06,
"loss": 1.2338,
"step": 457
},
{
"epoch": 0.2596862596862597,
"grad_norm": 2.2394015789031982,
"learning_rate": 4.9856119477447575e-06,
"loss": 1.1954,
"step": 458
},
{
"epoch": 0.26025326025326023,
"grad_norm": 2.231133460998535,
"learning_rate": 4.985531533031654e-06,
"loss": 1.152,
"step": 459
},
{
"epoch": 0.26082026082026083,
"grad_norm": 2.161984920501709,
"learning_rate": 4.9854508948778776e-06,
"loss": 1.1859,
"step": 460
},
{
"epoch": 0.26138726138726137,
"grad_norm": 2.168325901031494,
"learning_rate": 4.985370033290678e-06,
"loss": 1.153,
"step": 461
},
{
"epoch": 0.26195426195426197,
"grad_norm": 2.126570701599121,
"learning_rate": 4.985288948277322e-06,
"loss": 1.1973,
"step": 462
},
{
"epoch": 0.2625212625212625,
"grad_norm": 2.019768476486206,
"learning_rate": 4.985207639845101e-06,
"loss": 1.2032,
"step": 463
},
{
"epoch": 0.2630882630882631,
"grad_norm": 2.0683631896972656,
"learning_rate": 4.985126108001323e-06,
"loss": 1.1933,
"step": 464
},
{
"epoch": 0.26365526365526365,
"grad_norm": 2.112457036972046,
"learning_rate": 4.9850443527533186e-06,
"loss": 1.1886,
"step": 465
},
{
"epoch": 0.26422226422226425,
"grad_norm": 2.1040475368499756,
"learning_rate": 4.984962374108438e-06,
"loss": 1.1435,
"step": 466
},
{
"epoch": 0.2647892647892648,
"grad_norm": 2.0582830905914307,
"learning_rate": 4.9848801720740484e-06,
"loss": 1.1349,
"step": 467
},
{
"epoch": 0.26535626535626533,
"grad_norm": 2.101658582687378,
"learning_rate": 4.98479774665754e-06,
"loss": 1.2083,
"step": 468
},
{
"epoch": 0.26592326592326593,
"grad_norm": 1.9494727849960327,
"learning_rate": 4.984715097866325e-06,
"loss": 1.1068,
"step": 469
},
{
"epoch": 0.26649026649026647,
"grad_norm": 2.1678626537323,
"learning_rate": 4.984632225707831e-06,
"loss": 1.2217,
"step": 470
},
{
"epoch": 0.26705726705726707,
"grad_norm": 2.1645007133483887,
"learning_rate": 4.984549130189508e-06,
"loss": 1.2297,
"step": 471
},
{
"epoch": 0.2676242676242676,
"grad_norm": 1.9701532125473022,
"learning_rate": 4.984465811318826e-06,
"loss": 1.2184,
"step": 472
},
{
"epoch": 0.2681912681912682,
"grad_norm": 2.028223752975464,
"learning_rate": 4.984382269103276e-06,
"loss": 1.2268,
"step": 473
},
{
"epoch": 0.26875826875826875,
"grad_norm": 2.1951472759246826,
"learning_rate": 4.984298503550367e-06,
"loss": 1.0856,
"step": 474
},
{
"epoch": 0.26932526932526935,
"grad_norm": 2.046638250350952,
"learning_rate": 4.984214514667631e-06,
"loss": 1.1667,
"step": 475
},
{
"epoch": 0.2698922698922699,
"grad_norm": 2.0294766426086426,
"learning_rate": 4.984130302462617e-06,
"loss": 1.1449,
"step": 476
},
{
"epoch": 0.2704592704592705,
"grad_norm": 1.9961752891540527,
"learning_rate": 4.984045866942895e-06,
"loss": 1.1493,
"step": 477
},
{
"epoch": 0.271026271026271,
"grad_norm": 2.1608307361602783,
"learning_rate": 4.983961208116057e-06,
"loss": 1.1564,
"step": 478
},
{
"epoch": 0.27159327159327157,
"grad_norm": 2.082221031188965,
"learning_rate": 4.983876325989712e-06,
"loss": 1.1251,
"step": 479
},
{
"epoch": 0.27216027216027217,
"grad_norm": 2.020977258682251,
"learning_rate": 4.983791220571491e-06,
"loss": 1.1933,
"step": 480
},
{
"epoch": 0.2727272727272727,
"grad_norm": 2.1868746280670166,
"learning_rate": 4.983705891869045e-06,
"loss": 1.0961,
"step": 481
},
{
"epoch": 0.2732942732942733,
"grad_norm": 2.204965829849243,
"learning_rate": 4.983620339890045e-06,
"loss": 1.1872,
"step": 482
},
{
"epoch": 0.27386127386127385,
"grad_norm": 2.166038751602173,
"learning_rate": 4.983534564642181e-06,
"loss": 1.1613,
"step": 483
},
{
"epoch": 0.27442827442827444,
"grad_norm": 2.204023599624634,
"learning_rate": 4.9834485661331635e-06,
"loss": 1.1728,
"step": 484
},
{
"epoch": 0.274995274995275,
"grad_norm": 2.2817134857177734,
"learning_rate": 4.983362344370725e-06,
"loss": 1.1308,
"step": 485
},
{
"epoch": 0.2755622755622756,
"grad_norm": 2.1722960472106934,
"learning_rate": 4.983275899362617e-06,
"loss": 1.1608,
"step": 486
},
{
"epoch": 0.2761292761292761,
"grad_norm": 1.996817946434021,
"learning_rate": 4.983189231116609e-06,
"loss": 1.209,
"step": 487
},
{
"epoch": 0.2766962766962767,
"grad_norm": 2.1152753829956055,
"learning_rate": 4.9831023396404915e-06,
"loss": 1.2043,
"step": 488
},
{
"epoch": 0.27726327726327726,
"grad_norm": 2.458534002304077,
"learning_rate": 4.983015224942077e-06,
"loss": 1.1945,
"step": 489
},
{
"epoch": 0.2778302778302778,
"grad_norm": 2.107175350189209,
"learning_rate": 4.9829278870291975e-06,
"loss": 1.1164,
"step": 490
},
{
"epoch": 0.2783972783972784,
"grad_norm": 2.216923236846924,
"learning_rate": 4.982840325909704e-06,
"loss": 1.1624,
"step": 491
},
{
"epoch": 0.27896427896427894,
"grad_norm": 1.9773650169372559,
"learning_rate": 4.982752541591467e-06,
"loss": 1.077,
"step": 492
},
{
"epoch": 0.27953127953127954,
"grad_norm": 1.9695847034454346,
"learning_rate": 4.982664534082377e-06,
"loss": 1.1566,
"step": 493
},
{
"epoch": 0.2800982800982801,
"grad_norm": 2.343064546585083,
"learning_rate": 4.98257630339035e-06,
"loss": 1.2313,
"step": 494
},
{
"epoch": 0.2806652806652807,
"grad_norm": 1.9904532432556152,
"learning_rate": 4.982487849523312e-06,
"loss": 1.1701,
"step": 495
},
{
"epoch": 0.2812322812322812,
"grad_norm": 1.9797039031982422,
"learning_rate": 4.982399172489219e-06,
"loss": 1.1929,
"step": 496
},
{
"epoch": 0.2817992817992818,
"grad_norm": 1.9941498041152954,
"learning_rate": 4.98231027229604e-06,
"loss": 1.1203,
"step": 497
},
{
"epoch": 0.28236628236628236,
"grad_norm": 2.0945804119110107,
"learning_rate": 4.982221148951769e-06,
"loss": 1.081,
"step": 498
},
{
"epoch": 0.28293328293328296,
"grad_norm": 2.344377040863037,
"learning_rate": 4.982131802464417e-06,
"loss": 1.1263,
"step": 499
},
{
"epoch": 0.2835002835002835,
"grad_norm": 2.075709342956543,
"learning_rate": 4.982042232842015e-06,
"loss": 1.1351,
"step": 500
},
{
"epoch": 0.28406728406728404,
"grad_norm": 2.089801073074341,
"learning_rate": 4.9819524400926165e-06,
"loss": 1.1428,
"step": 501
},
{
"epoch": 0.28463428463428464,
"grad_norm": 2.149322032928467,
"learning_rate": 4.981862424224292e-06,
"loss": 1.2166,
"step": 502
},
{
"epoch": 0.2852012852012852,
"grad_norm": 2.2097578048706055,
"learning_rate": 4.981772185245135e-06,
"loss": 1.1438,
"step": 503
},
{
"epoch": 0.2857682857682858,
"grad_norm": 2.0914175510406494,
"learning_rate": 4.981681723163257e-06,
"loss": 1.1878,
"step": 504
},
{
"epoch": 0.2863352863352863,
"grad_norm": 2.1141879558563232,
"learning_rate": 4.981591037986791e-06,
"loss": 1.1921,
"step": 505
},
{
"epoch": 0.2869022869022869,
"grad_norm": 2.122882127761841,
"learning_rate": 4.981500129723888e-06,
"loss": 1.1832,
"step": 506
},
{
"epoch": 0.28746928746928746,
"grad_norm": 2.0918145179748535,
"learning_rate": 4.981408998382722e-06,
"loss": 1.1446,
"step": 507
},
{
"epoch": 0.28803628803628806,
"grad_norm": 2.1555593013763428,
"learning_rate": 4.981317643971483e-06,
"loss": 1.1637,
"step": 508
},
{
"epoch": 0.2886032886032886,
"grad_norm": 2.057591438293457,
"learning_rate": 4.981226066498386e-06,
"loss": 1.1216,
"step": 509
},
{
"epoch": 0.2891702891702892,
"grad_norm": 2.0887913703918457,
"learning_rate": 4.981134265971661e-06,
"loss": 1.1765,
"step": 510
},
{
"epoch": 0.28973728973728974,
"grad_norm": 2.009054660797119,
"learning_rate": 4.981042242399563e-06,
"loss": 1.1756,
"step": 511
},
{
"epoch": 0.2903042903042903,
"grad_norm": 2.092836380004883,
"learning_rate": 4.980949995790363e-06,
"loss": 1.1748,
"step": 512
},
{
"epoch": 0.2908712908712909,
"grad_norm": 2.105720281600952,
"learning_rate": 4.980857526152354e-06,
"loss": 1.1859,
"step": 513
},
{
"epoch": 0.2914382914382914,
"grad_norm": 2.0198593139648438,
"learning_rate": 4.9807648334938495e-06,
"loss": 1.1511,
"step": 514
},
{
"epoch": 0.292005292005292,
"grad_norm": 2.1741116046905518,
"learning_rate": 4.9806719178231815e-06,
"loss": 1.146,
"step": 515
},
{
"epoch": 0.29257229257229256,
"grad_norm": 2.0599679946899414,
"learning_rate": 4.980578779148702e-06,
"loss": 1.166,
"step": 516
},
{
"epoch": 0.29313929313929316,
"grad_norm": 2.0803048610687256,
"learning_rate": 4.980485417478785e-06,
"loss": 1.2055,
"step": 517
},
{
"epoch": 0.2937062937062937,
"grad_norm": 2.0175561904907227,
"learning_rate": 4.980391832821823e-06,
"loss": 1.1096,
"step": 518
},
{
"epoch": 0.2942732942732943,
"grad_norm": 2.224367380142212,
"learning_rate": 4.98029802518623e-06,
"loss": 1.1226,
"step": 519
},
{
"epoch": 0.29484029484029484,
"grad_norm": 2.0235671997070312,
"learning_rate": 4.980203994580438e-06,
"loss": 1.1662,
"step": 520
},
{
"epoch": 0.29540729540729543,
"grad_norm": 2.0228888988494873,
"learning_rate": 4.980109741012899e-06,
"loss": 1.2074,
"step": 521
},
{
"epoch": 0.295974295974296,
"grad_norm": 2.286905288696289,
"learning_rate": 4.980015264492087e-06,
"loss": 1.1795,
"step": 522
},
{
"epoch": 0.2965412965412965,
"grad_norm": 2.0491106510162354,
"learning_rate": 4.979920565026496e-06,
"loss": 1.1727,
"step": 523
},
{
"epoch": 0.2971082971082971,
"grad_norm": 1.9510176181793213,
"learning_rate": 4.979825642624639e-06,
"loss": 1.1782,
"step": 524
},
{
"epoch": 0.29767529767529766,
"grad_norm": 2.1948649883270264,
"learning_rate": 4.979730497295048e-06,
"loss": 1.1906,
"step": 525
},
{
"epoch": 0.29824229824229825,
"grad_norm": 2.01057505607605,
"learning_rate": 4.979635129046276e-06,
"loss": 1.2282,
"step": 526
},
{
"epoch": 0.2988092988092988,
"grad_norm": 2.133222818374634,
"learning_rate": 4.979539537886899e-06,
"loss": 1.2072,
"step": 527
},
{
"epoch": 0.2993762993762994,
"grad_norm": 1.993911623954773,
"learning_rate": 4.979443723825506e-06,
"loss": 1.1421,
"step": 528
},
{
"epoch": 0.29994329994329993,
"grad_norm": 2.000917673110962,
"learning_rate": 4.979347686870714e-06,
"loss": 1.1226,
"step": 529
},
{
"epoch": 0.30051030051030053,
"grad_norm": 2.2361230850219727,
"learning_rate": 4.9792514270311556e-06,
"loss": 1.2491,
"step": 530
},
{
"epoch": 0.3010773010773011,
"grad_norm": 2.1424977779388428,
"learning_rate": 4.979154944315483e-06,
"loss": 1.1685,
"step": 531
},
{
"epoch": 0.30164430164430167,
"grad_norm": 2.0004067420959473,
"learning_rate": 4.979058238732371e-06,
"loss": 1.2122,
"step": 532
},
{
"epoch": 0.3022113022113022,
"grad_norm": 2.1690309047698975,
"learning_rate": 4.978961310290512e-06,
"loss": 1.155,
"step": 533
},
{
"epoch": 0.30277830277830275,
"grad_norm": 2.143450975418091,
"learning_rate": 4.97886415899862e-06,
"loss": 1.1719,
"step": 534
},
{
"epoch": 0.30334530334530335,
"grad_norm": 2.197096824645996,
"learning_rate": 4.978766784865429e-06,
"loss": 1.1263,
"step": 535
},
{
"epoch": 0.3039123039123039,
"grad_norm": 2.075712203979492,
"learning_rate": 4.9786691878996926e-06,
"loss": 1.1412,
"step": 536
},
{
"epoch": 0.3044793044793045,
"grad_norm": 2.1067452430725098,
"learning_rate": 4.978571368110183e-06,
"loss": 1.231,
"step": 537
},
{
"epoch": 0.30504630504630503,
"grad_norm": 2.2791428565979004,
"learning_rate": 4.978473325505696e-06,
"loss": 1.1304,
"step": 538
},
{
"epoch": 0.30561330561330563,
"grad_norm": 1.856226921081543,
"learning_rate": 4.978375060095044e-06,
"loss": 1.1559,
"step": 539
},
{
"epoch": 0.30618030618030617,
"grad_norm": 2.2657668590545654,
"learning_rate": 4.97827657188706e-06,
"loss": 1.1616,
"step": 540
},
{
"epoch": 0.30674730674730677,
"grad_norm": 2.008127212524414,
"learning_rate": 4.9781778608906e-06,
"loss": 1.1456,
"step": 541
},
{
"epoch": 0.3073143073143073,
"grad_norm": 2.2255024909973145,
"learning_rate": 4.978078927114536e-06,
"loss": 1.162,
"step": 542
},
{
"epoch": 0.3078813078813079,
"grad_norm": 2.2034647464752197,
"learning_rate": 4.977979770567762e-06,
"loss": 1.1093,
"step": 543
},
{
"epoch": 0.30844830844830845,
"grad_norm": 2.1638619899749756,
"learning_rate": 4.977880391259192e-06,
"loss": 1.1953,
"step": 544
},
{
"epoch": 0.309015309015309,
"grad_norm": 2.1344549655914307,
"learning_rate": 4.977780789197761e-06,
"loss": 1.1511,
"step": 545
},
{
"epoch": 0.3095823095823096,
"grad_norm": 2.2296082973480225,
"learning_rate": 4.97768096439242e-06,
"loss": 1.1982,
"step": 546
},
{
"epoch": 0.31014931014931013,
"grad_norm": 2.3037984371185303,
"learning_rate": 4.977580916852146e-06,
"loss": 1.1815,
"step": 547
},
{
"epoch": 0.31071631071631073,
"grad_norm": 2.4563586711883545,
"learning_rate": 4.977480646585931e-06,
"loss": 1.1606,
"step": 548
},
{
"epoch": 0.31128331128331127,
"grad_norm": 1.9984495639801025,
"learning_rate": 4.97738015360279e-06,
"loss": 1.144,
"step": 549
},
{
"epoch": 0.31185031185031187,
"grad_norm": 2.0485143661499023,
"learning_rate": 4.977279437911756e-06,
"loss": 1.1536,
"step": 550
},
{
"epoch": 0.3124173124173124,
"grad_norm": 2.2146592140197754,
"learning_rate": 4.9771784995218845e-06,
"loss": 1.1156,
"step": 551
},
{
"epoch": 0.312984312984313,
"grad_norm": 1.880562424659729,
"learning_rate": 4.9770773384422485e-06,
"loss": 1.1479,
"step": 552
},
{
"epoch": 0.31355131355131355,
"grad_norm": 2.0846965312957764,
"learning_rate": 4.976975954681942e-06,
"loss": 1.1266,
"step": 553
},
{
"epoch": 0.31411831411831415,
"grad_norm": 1.9795643091201782,
"learning_rate": 4.976874348250078e-06,
"loss": 1.1197,
"step": 554
},
{
"epoch": 0.3146853146853147,
"grad_norm": 2.1454899311065674,
"learning_rate": 4.976772519155793e-06,
"loss": 1.1333,
"step": 555
},
{
"epoch": 0.31525231525231523,
"grad_norm": 2.2674753665924072,
"learning_rate": 4.97667046740824e-06,
"loss": 1.1433,
"step": 556
},
{
"epoch": 0.3158193158193158,
"grad_norm": 2.0304932594299316,
"learning_rate": 4.976568193016592e-06,
"loss": 1.1531,
"step": 557
},
{
"epoch": 0.31638631638631637,
"grad_norm": 2.2592039108276367,
"learning_rate": 4.976465695990045e-06,
"loss": 1.21,
"step": 558
},
{
"epoch": 0.31695331695331697,
"grad_norm": 2.1316540241241455,
"learning_rate": 4.976362976337811e-06,
"loss": 1.1602,
"step": 559
},
{
"epoch": 0.3175203175203175,
"grad_norm": 2.0556883811950684,
"learning_rate": 4.976260034069126e-06,
"loss": 1.1739,
"step": 560
},
{
"epoch": 0.3180873180873181,
"grad_norm": 2.580925226211548,
"learning_rate": 4.976156869193243e-06,
"loss": 1.1982,
"step": 561
},
{
"epoch": 0.31865431865431865,
"grad_norm": 2.2084896564483643,
"learning_rate": 4.976053481719437e-06,
"loss": 1.191,
"step": 562
},
{
"epoch": 0.31922131922131924,
"grad_norm": 2.0614700317382812,
"learning_rate": 4.975949871657001e-06,
"loss": 1.1364,
"step": 563
},
{
"epoch": 0.3197883197883198,
"grad_norm": 2.1390912532806396,
"learning_rate": 4.975846039015251e-06,
"loss": 1.1374,
"step": 564
},
{
"epoch": 0.3203553203553203,
"grad_norm": 2.044769763946533,
"learning_rate": 4.97574198380352e-06,
"loss": 1.1242,
"step": 565
},
{
"epoch": 0.3209223209223209,
"grad_norm": 1.8909872770309448,
"learning_rate": 4.975637706031162e-06,
"loss": 1.1693,
"step": 566
},
{
"epoch": 0.32148932148932147,
"grad_norm": 2.2310988903045654,
"learning_rate": 4.975533205707552e-06,
"loss": 1.1736,
"step": 567
},
{
"epoch": 0.32205632205632206,
"grad_norm": 2.057612180709839,
"learning_rate": 4.975428482842083e-06,
"loss": 1.2525,
"step": 568
},
{
"epoch": 0.3226233226233226,
"grad_norm": 2.104064464569092,
"learning_rate": 4.975323537444171e-06,
"loss": 1.108,
"step": 569
},
{
"epoch": 0.3231903231903232,
"grad_norm": 2.0129916667938232,
"learning_rate": 4.975218369523249e-06,
"loss": 1.1598,
"step": 570
},
{
"epoch": 0.32375732375732375,
"grad_norm": 2.080134153366089,
"learning_rate": 4.9751129790887705e-06,
"loss": 1.1726,
"step": 571
},
{
"epoch": 0.32432432432432434,
"grad_norm": 2.1668407917022705,
"learning_rate": 4.975007366150212e-06,
"loss": 1.1063,
"step": 572
},
{
"epoch": 0.3248913248913249,
"grad_norm": 2.0228352546691895,
"learning_rate": 4.974901530717066e-06,
"loss": 1.1907,
"step": 573
},
{
"epoch": 0.3254583254583255,
"grad_norm": 2.016080379486084,
"learning_rate": 4.974795472798847e-06,
"loss": 1.1973,
"step": 574
},
{
"epoch": 0.326025326025326,
"grad_norm": 2.2395012378692627,
"learning_rate": 4.97468919240509e-06,
"loss": 1.2248,
"step": 575
},
{
"epoch": 0.32659232659232657,
"grad_norm": 2.1966094970703125,
"learning_rate": 4.974582689545348e-06,
"loss": 1.1356,
"step": 576
},
{
"epoch": 0.32715932715932716,
"grad_norm": 2.1334877014160156,
"learning_rate": 4.974475964229196e-06,
"loss": 1.1458,
"step": 577
},
{
"epoch": 0.3277263277263277,
"grad_norm": 2.388422727584839,
"learning_rate": 4.9743690164662286e-06,
"loss": 1.1866,
"step": 578
},
{
"epoch": 0.3282933282933283,
"grad_norm": 2.077364683151245,
"learning_rate": 4.974261846266059e-06,
"loss": 1.163,
"step": 579
},
{
"epoch": 0.32886032886032884,
"grad_norm": 1.9961100816726685,
"learning_rate": 4.974154453638323e-06,
"loss": 1.1281,
"step": 580
},
{
"epoch": 0.32942732942732944,
"grad_norm": 2.213353395462036,
"learning_rate": 4.974046838592672e-06,
"loss": 1.2425,
"step": 581
},
{
"epoch": 0.32999432999433,
"grad_norm": 2.022712230682373,
"learning_rate": 4.973939001138783e-06,
"loss": 1.2039,
"step": 582
},
{
"epoch": 0.3305613305613306,
"grad_norm": 2.2076306343078613,
"learning_rate": 4.97383094128635e-06,
"loss": 1.1468,
"step": 583
},
{
"epoch": 0.3311283311283311,
"grad_norm": 1.9615507125854492,
"learning_rate": 4.9737226590450855e-06,
"loss": 1.1382,
"step": 584
},
{
"epoch": 0.3316953316953317,
"grad_norm": 2.1458539962768555,
"learning_rate": 4.973614154424725e-06,
"loss": 1.1042,
"step": 585
},
{
"epoch": 0.33226233226233226,
"grad_norm": 2.045104742050171,
"learning_rate": 4.973505427435023e-06,
"loss": 1.1347,
"step": 586
},
{
"epoch": 0.3328293328293328,
"grad_norm": 2.0573666095733643,
"learning_rate": 4.973396478085752e-06,
"loss": 1.1718,
"step": 587
},
{
"epoch": 0.3333963333963334,
"grad_norm": 2.056199550628662,
"learning_rate": 4.973287306386707e-06,
"loss": 1.1704,
"step": 588
},
{
"epoch": 0.33396333396333394,
"grad_norm": 2.456008195877075,
"learning_rate": 4.973177912347703e-06,
"loss": 1.086,
"step": 589
},
{
"epoch": 0.33453033453033454,
"grad_norm": 2.0783896446228027,
"learning_rate": 4.9730682959785735e-06,
"loss": 1.1944,
"step": 590
},
{
"epoch": 0.3350973350973351,
"grad_norm": 2.096651077270508,
"learning_rate": 4.972958457289173e-06,
"loss": 1.1146,
"step": 591
},
{
"epoch": 0.3356643356643357,
"grad_norm": 2.1893470287323,
"learning_rate": 4.972848396289375e-06,
"loss": 1.1147,
"step": 592
},
{
"epoch": 0.3362313362313362,
"grad_norm": 2.1144778728485107,
"learning_rate": 4.972738112989073e-06,
"loss": 1.2118,
"step": 593
},
{
"epoch": 0.3367983367983368,
"grad_norm": 2.035900354385376,
"learning_rate": 4.972627607398183e-06,
"loss": 1.155,
"step": 594
},
{
"epoch": 0.33736533736533736,
"grad_norm": 2.0177159309387207,
"learning_rate": 4.972516879526638e-06,
"loss": 1.1114,
"step": 595
},
{
"epoch": 0.33793233793233796,
"grad_norm": 1.9777759313583374,
"learning_rate": 4.972405929384391e-06,
"loss": 1.0969,
"step": 596
},
{
"epoch": 0.3384993384993385,
"grad_norm": 1.9437038898468018,
"learning_rate": 4.972294756981419e-06,
"loss": 1.0776,
"step": 597
},
{
"epoch": 0.33906633906633904,
"grad_norm": 2.2023017406463623,
"learning_rate": 4.972183362327712e-06,
"loss": 1.085,
"step": 598
},
{
"epoch": 0.33963333963333964,
"grad_norm": 2.137089490890503,
"learning_rate": 4.972071745433287e-06,
"loss": 1.1618,
"step": 599
},
{
"epoch": 0.3402003402003402,
"grad_norm": 2.292973518371582,
"learning_rate": 4.971959906308177e-06,
"loss": 1.2049,
"step": 600
},
{
"epoch": 0.3407673407673408,
"grad_norm": 2.0405850410461426,
"learning_rate": 4.971847844962436e-06,
"loss": 1.1778,
"step": 601
},
{
"epoch": 0.3413343413343413,
"grad_norm": 2.1747894287109375,
"learning_rate": 4.971735561406138e-06,
"loss": 1.1973,
"step": 602
},
{
"epoch": 0.3419013419013419,
"grad_norm": 2.003805637359619,
"learning_rate": 4.971623055649377e-06,
"loss": 1.1549,
"step": 603
},
{
"epoch": 0.34246834246834246,
"grad_norm": 2.138134002685547,
"learning_rate": 4.971510327702267e-06,
"loss": 1.1143,
"step": 604
},
{
"epoch": 0.34303534303534305,
"grad_norm": 2.085298538208008,
"learning_rate": 4.971397377574941e-06,
"loss": 1.1557,
"step": 605
},
{
"epoch": 0.3436023436023436,
"grad_norm": 2.045970916748047,
"learning_rate": 4.9712842052775536e-06,
"loss": 1.1313,
"step": 606
},
{
"epoch": 0.3441693441693442,
"grad_norm": 2.070230007171631,
"learning_rate": 4.971170810820279e-06,
"loss": 1.1676,
"step": 607
},
{
"epoch": 0.34473634473634474,
"grad_norm": 1.9645379781723022,
"learning_rate": 4.97105719421331e-06,
"loss": 1.1211,
"step": 608
},
{
"epoch": 0.3453033453033453,
"grad_norm": 2.0739731788635254,
"learning_rate": 4.970943355466861e-06,
"loss": 1.2498,
"step": 609
},
{
"epoch": 0.3458703458703459,
"grad_norm": 2.0373897552490234,
"learning_rate": 4.970829294591164e-06,
"loss": 1.1665,
"step": 610
},
{
"epoch": 0.3464373464373464,
"grad_norm": 2.105302333831787,
"learning_rate": 4.9707150115964756e-06,
"loss": 1.2105,
"step": 611
},
{
"epoch": 0.347004347004347,
"grad_norm": 1.993300437927246,
"learning_rate": 4.9706005064930674e-06,
"loss": 1.1395,
"step": 612
},
{
"epoch": 0.34757134757134756,
"grad_norm": 2.1406335830688477,
"learning_rate": 4.970485779291234e-06,
"loss": 1.1353,
"step": 613
},
{
"epoch": 0.34813834813834815,
"grad_norm": 2.1167380809783936,
"learning_rate": 4.970370830001288e-06,
"loss": 1.1699,
"step": 614
},
{
"epoch": 0.3487053487053487,
"grad_norm": 2.1319780349731445,
"learning_rate": 4.970255658633564e-06,
"loss": 1.1761,
"step": 615
},
{
"epoch": 0.3492723492723493,
"grad_norm": 2.0811727046966553,
"learning_rate": 4.970140265198414e-06,
"loss": 1.1692,
"step": 616
},
{
"epoch": 0.34983934983934983,
"grad_norm": 2.139535903930664,
"learning_rate": 4.9700246497062135e-06,
"loss": 1.1949,
"step": 617
},
{
"epoch": 0.35040635040635043,
"grad_norm": 2.1217803955078125,
"learning_rate": 4.969908812167354e-06,
"loss": 1.1141,
"step": 618
},
{
"epoch": 0.350973350973351,
"grad_norm": 2.010310173034668,
"learning_rate": 4.969792752592251e-06,
"loss": 1.1575,
"step": 619
},
{
"epoch": 0.3515403515403515,
"grad_norm": 2.0576040744781494,
"learning_rate": 4.969676470991336e-06,
"loss": 1.1145,
"step": 620
},
{
"epoch": 0.3521073521073521,
"grad_norm": 2.233651638031006,
"learning_rate": 4.969559967375063e-06,
"loss": 1.1472,
"step": 621
},
{
"epoch": 0.35267435267435265,
"grad_norm": 1.9711737632751465,
"learning_rate": 4.969443241753905e-06,
"loss": 1.1164,
"step": 622
},
{
"epoch": 0.35324135324135325,
"grad_norm": 2.076789379119873,
"learning_rate": 4.969326294138355e-06,
"loss": 1.1369,
"step": 623
},
{
"epoch": 0.3538083538083538,
"grad_norm": 2.148998260498047,
"learning_rate": 4.9692091245389275e-06,
"loss": 1.1001,
"step": 624
},
{
"epoch": 0.3543753543753544,
"grad_norm": 1.9037641286849976,
"learning_rate": 4.969091732966155e-06,
"loss": 1.201,
"step": 625
},
{
"epoch": 0.35494235494235493,
"grad_norm": 1.9544503688812256,
"learning_rate": 4.968974119430589e-06,
"loss": 1.1571,
"step": 626
},
{
"epoch": 0.35550935550935553,
"grad_norm": 2.1833956241607666,
"learning_rate": 4.968856283942805e-06,
"loss": 1.1889,
"step": 627
},
{
"epoch": 0.35607635607635607,
"grad_norm": 2.0116825103759766,
"learning_rate": 4.968738226513395e-06,
"loss": 1.1488,
"step": 628
},
{
"epoch": 0.35664335664335667,
"grad_norm": 2.1879611015319824,
"learning_rate": 4.968619947152971e-06,
"loss": 1.1717,
"step": 629
},
{
"epoch": 0.3572103572103572,
"grad_norm": 2.06209135055542,
"learning_rate": 4.968501445872168e-06,
"loss": 1.1698,
"step": 630
},
{
"epoch": 0.35777735777735775,
"grad_norm": 2.126161813735962,
"learning_rate": 4.968382722681637e-06,
"loss": 1.1888,
"step": 631
},
{
"epoch": 0.35834435834435835,
"grad_norm": 2.016767978668213,
"learning_rate": 4.968263777592052e-06,
"loss": 1.1583,
"step": 632
},
{
"epoch": 0.3589113589113589,
"grad_norm": 2.19541335105896,
"learning_rate": 4.968144610614104e-06,
"loss": 1.2259,
"step": 633
},
{
"epoch": 0.3594783594783595,
"grad_norm": 2.040583610534668,
"learning_rate": 4.968025221758508e-06,
"loss": 1.1836,
"step": 634
},
{
"epoch": 0.36004536004536003,
"grad_norm": 1.9836755990982056,
"learning_rate": 4.967905611035994e-06,
"loss": 1.0771,
"step": 635
},
{
"epoch": 0.3606123606123606,
"grad_norm": 2.1664631366729736,
"learning_rate": 4.967785778457318e-06,
"loss": 1.1864,
"step": 636
},
{
"epoch": 0.36117936117936117,
"grad_norm": 1.9839632511138916,
"learning_rate": 4.967665724033249e-06,
"loss": 1.1075,
"step": 637
},
{
"epoch": 0.36174636174636177,
"grad_norm": 2.1936347484588623,
"learning_rate": 4.9675454477745825e-06,
"loss": 1.2054,
"step": 638
},
{
"epoch": 0.3623133623133623,
"grad_norm": 1.9246869087219238,
"learning_rate": 4.967424949692129e-06,
"loss": 1.1856,
"step": 639
},
{
"epoch": 0.3628803628803629,
"grad_norm": 2.0133697986602783,
"learning_rate": 4.967304229796722e-06,
"loss": 1.162,
"step": 640
},
{
"epoch": 0.36344736344736345,
"grad_norm": 2.135317802429199,
"learning_rate": 4.967183288099212e-06,
"loss": 1.1268,
"step": 641
},
{
"epoch": 0.364014364014364,
"grad_norm": 2.1062726974487305,
"learning_rate": 4.967062124610473e-06,
"loss": 1.1205,
"step": 642
},
{
"epoch": 0.3645813645813646,
"grad_norm": 2.252697229385376,
"learning_rate": 4.966940739341397e-06,
"loss": 1.1184,
"step": 643
},
{
"epoch": 0.36514836514836513,
"grad_norm": 2.0622639656066895,
"learning_rate": 4.9668191323028956e-06,
"loss": 1.1073,
"step": 644
},
{
"epoch": 0.3657153657153657,
"grad_norm": 2.094453811645508,
"learning_rate": 4.966697303505901e-06,
"loss": 1.1755,
"step": 645
},
{
"epoch": 0.36628236628236627,
"grad_norm": 2.2338266372680664,
"learning_rate": 4.966575252961365e-06,
"loss": 1.1128,
"step": 646
},
{
"epoch": 0.36684936684936686,
"grad_norm": 2.259481430053711,
"learning_rate": 4.9664529806802605e-06,
"loss": 1.1412,
"step": 647
},
{
"epoch": 0.3674163674163674,
"grad_norm": 2.1204957962036133,
"learning_rate": 4.966330486673578e-06,
"loss": 1.1276,
"step": 648
},
{
"epoch": 0.367983367983368,
"grad_norm": 2.659971237182617,
"learning_rate": 4.966207770952329e-06,
"loss": 1.1212,
"step": 649
},
{
"epoch": 0.36855036855036855,
"grad_norm": 2.29526948928833,
"learning_rate": 4.966084833527547e-06,
"loss": 1.1902,
"step": 650
},
{
"epoch": 0.36911736911736914,
"grad_norm": 2.237398147583008,
"learning_rate": 4.9659616744102825e-06,
"loss": 1.1895,
"step": 651
},
{
"epoch": 0.3696843696843697,
"grad_norm": 2.0594875812530518,
"learning_rate": 4.965838293611608e-06,
"loss": 1.1617,
"step": 652
},
{
"epoch": 0.3702513702513702,
"grad_norm": 2.1710054874420166,
"learning_rate": 4.965714691142614e-06,
"loss": 1.168,
"step": 653
},
{
"epoch": 0.3708183708183708,
"grad_norm": 2.2017593383789062,
"learning_rate": 4.965590867014411e-06,
"loss": 1.1226,
"step": 654
},
{
"epoch": 0.37138537138537137,
"grad_norm": 2.2235498428344727,
"learning_rate": 4.965466821238133e-06,
"loss": 1.1587,
"step": 655
},
{
"epoch": 0.37195237195237196,
"grad_norm": 2.168541431427002,
"learning_rate": 4.965342553824929e-06,
"loss": 1.1707,
"step": 656
},
{
"epoch": 0.3725193725193725,
"grad_norm": 1.9276596307754517,
"learning_rate": 4.965218064785972e-06,
"loss": 1.1589,
"step": 657
},
{
"epoch": 0.3730863730863731,
"grad_norm": 2.0985217094421387,
"learning_rate": 4.965093354132451e-06,
"loss": 1.1817,
"step": 658
},
{
"epoch": 0.37365337365337364,
"grad_norm": 2.0856056213378906,
"learning_rate": 4.964968421875579e-06,
"loss": 1.1571,
"step": 659
},
{
"epoch": 0.37422037422037424,
"grad_norm": 2.0469911098480225,
"learning_rate": 4.964843268026586e-06,
"loss": 1.1054,
"step": 660
},
{
"epoch": 0.3747873747873748,
"grad_norm": 2.1270503997802734,
"learning_rate": 4.964717892596723e-06,
"loss": 1.1567,
"step": 661
},
{
"epoch": 0.3753543753543754,
"grad_norm": 2.1887195110321045,
"learning_rate": 4.964592295597261e-06,
"loss": 1.1441,
"step": 662
},
{
"epoch": 0.3759213759213759,
"grad_norm": 2.317736864089966,
"learning_rate": 4.964466477039492e-06,
"loss": 1.1534,
"step": 663
},
{
"epoch": 0.37648837648837646,
"grad_norm": 2.509260892868042,
"learning_rate": 4.964340436934724e-06,
"loss": 1.1391,
"step": 664
},
{
"epoch": 0.37705537705537706,
"grad_norm": 2.0452466011047363,
"learning_rate": 4.96421417529429e-06,
"loss": 1.1786,
"step": 665
},
{
"epoch": 0.3776223776223776,
"grad_norm": 2.159749746322632,
"learning_rate": 4.964087692129538e-06,
"loss": 1.2412,
"step": 666
},
{
"epoch": 0.3781893781893782,
"grad_norm": 1.99373197555542,
"learning_rate": 4.963960987451841e-06,
"loss": 1.1374,
"step": 667
},
{
"epoch": 0.37875637875637874,
"grad_norm": 2.1809184551239014,
"learning_rate": 4.9638340612725875e-06,
"loss": 1.1689,
"step": 668
},
{
"epoch": 0.37932337932337934,
"grad_norm": 2.27892804145813,
"learning_rate": 4.963706913603188e-06,
"loss": 1.1155,
"step": 669
},
{
"epoch": 0.3798903798903799,
"grad_norm": 2.08236026763916,
"learning_rate": 4.963579544455074e-06,
"loss": 1.1535,
"step": 670
},
{
"epoch": 0.3804573804573805,
"grad_norm": 2.14473557472229,
"learning_rate": 4.963451953839694e-06,
"loss": 1.1698,
"step": 671
},
{
"epoch": 0.381024381024381,
"grad_norm": 2.0856521129608154,
"learning_rate": 4.963324141768519e-06,
"loss": 1.1349,
"step": 672
},
{
"epoch": 0.3815913815913816,
"grad_norm": 2.1324524879455566,
"learning_rate": 4.963196108253037e-06,
"loss": 1.1598,
"step": 673
},
{
"epoch": 0.38215838215838216,
"grad_norm": 2.0533447265625,
"learning_rate": 4.96306785330476e-06,
"loss": 1.1546,
"step": 674
},
{
"epoch": 0.3827253827253827,
"grad_norm": 1.913636326789856,
"learning_rate": 4.962939376935216e-06,
"loss": 1.1914,
"step": 675
},
{
"epoch": 0.3832923832923833,
"grad_norm": 2.0817277431488037,
"learning_rate": 4.962810679155957e-06,
"loss": 1.1146,
"step": 676
},
{
"epoch": 0.38385938385938384,
"grad_norm": 2.0026915073394775,
"learning_rate": 4.96268175997855e-06,
"loss": 1.1487,
"step": 677
},
{
"epoch": 0.38442638442638444,
"grad_norm": 2.05865478515625,
"learning_rate": 4.962552619414584e-06,
"loss": 1.148,
"step": 678
},
{
"epoch": 0.384993384993385,
"grad_norm": 2.0297701358795166,
"learning_rate": 4.962423257475672e-06,
"loss": 1.1555,
"step": 679
},
{
"epoch": 0.3855603855603856,
"grad_norm": 2.2152462005615234,
"learning_rate": 4.962293674173438e-06,
"loss": 1.1278,
"step": 680
},
{
"epoch": 0.3861273861273861,
"grad_norm": 1.9059257507324219,
"learning_rate": 4.962163869519536e-06,
"loss": 1.1193,
"step": 681
},
{
"epoch": 0.3866943866943867,
"grad_norm": 2.1127209663391113,
"learning_rate": 4.962033843525632e-06,
"loss": 1.0973,
"step": 682
},
{
"epoch": 0.38726138726138726,
"grad_norm": 2.3520565032958984,
"learning_rate": 4.961903596203416e-06,
"loss": 1.2412,
"step": 683
},
{
"epoch": 0.38782838782838785,
"grad_norm": 2.072892904281616,
"learning_rate": 4.961773127564596e-06,
"loss": 1.1184,
"step": 684
},
{
"epoch": 0.3883953883953884,
"grad_norm": 2.1626739501953125,
"learning_rate": 4.961642437620901e-06,
"loss": 1.1772,
"step": 685
},
{
"epoch": 0.38896238896238894,
"grad_norm": 2.09814453125,
"learning_rate": 4.96151152638408e-06,
"loss": 1.09,
"step": 686
},
{
"epoch": 0.38952938952938954,
"grad_norm": 1.9176437854766846,
"learning_rate": 4.9613803938659014e-06,
"loss": 1.0821,
"step": 687
},
{
"epoch": 0.3900963900963901,
"grad_norm": 2.1363837718963623,
"learning_rate": 4.961249040078153e-06,
"loss": 1.1564,
"step": 688
},
{
"epoch": 0.3906633906633907,
"grad_norm": 2.1223223209381104,
"learning_rate": 4.961117465032643e-06,
"loss": 1.1089,
"step": 689
},
{
"epoch": 0.3912303912303912,
"grad_norm": 2.165515184402466,
"learning_rate": 4.960985668741201e-06,
"loss": 1.1745,
"step": 690
},
{
"epoch": 0.3917973917973918,
"grad_norm": 2.15566349029541,
"learning_rate": 4.960853651215673e-06,
"loss": 1.1345,
"step": 691
},
{
"epoch": 0.39236439236439236,
"grad_norm": 2.015986919403076,
"learning_rate": 4.960721412467929e-06,
"loss": 1.1247,
"step": 692
},
{
"epoch": 0.39293139293139295,
"grad_norm": 2.1274542808532715,
"learning_rate": 4.960588952509855e-06,
"loss": 1.1688,
"step": 693
},
{
"epoch": 0.3934983934983935,
"grad_norm": 2.058623790740967,
"learning_rate": 4.960456271353359e-06,
"loss": 1.1785,
"step": 694
},
{
"epoch": 0.3940653940653941,
"grad_norm": 2.4499433040618896,
"learning_rate": 4.9603233690103695e-06,
"loss": 1.1351,
"step": 695
},
{
"epoch": 0.39463239463239463,
"grad_norm": 2.1276276111602783,
"learning_rate": 4.960190245492833e-06,
"loss": 1.1368,
"step": 696
},
{
"epoch": 0.3951993951993952,
"grad_norm": 2.095386028289795,
"learning_rate": 4.960056900812717e-06,
"loss": 1.1141,
"step": 697
},
{
"epoch": 0.3957663957663958,
"grad_norm": 2.1107561588287354,
"learning_rate": 4.95992333498201e-06,
"loss": 1.1269,
"step": 698
},
{
"epoch": 0.3963333963333963,
"grad_norm": 2.3567731380462646,
"learning_rate": 4.9597895480127175e-06,
"loss": 1.1367,
"step": 699
},
{
"epoch": 0.3969003969003969,
"grad_norm": 2.1174514293670654,
"learning_rate": 4.959655539916868e-06,
"loss": 1.1891,
"step": 700
},
{
"epoch": 0.39746739746739745,
"grad_norm": 2.0991039276123047,
"learning_rate": 4.959521310706506e-06,
"loss": 1.1735,
"step": 701
},
{
"epoch": 0.39803439803439805,
"grad_norm": 1.934212327003479,
"learning_rate": 4.9593868603937e-06,
"loss": 1.0731,
"step": 702
},
{
"epoch": 0.3986013986013986,
"grad_norm": 2.1108388900756836,
"learning_rate": 4.959252188990536e-06,
"loss": 1.1735,
"step": 703
},
{
"epoch": 0.3991683991683992,
"grad_norm": 2.010054349899292,
"learning_rate": 4.9591172965091224e-06,
"loss": 1.1337,
"step": 704
},
{
"epoch": 0.39973539973539973,
"grad_norm": 2.167483329772949,
"learning_rate": 4.9589821829615825e-06,
"loss": 1.1609,
"step": 705
},
{
"epoch": 0.40030240030240033,
"grad_norm": 2.048516273498535,
"learning_rate": 4.958846848360065e-06,
"loss": 1.098,
"step": 706
},
{
"epoch": 0.40086940086940087,
"grad_norm": 2.2454535961151123,
"learning_rate": 4.958711292716733e-06,
"loss": 1.1277,
"step": 707
},
{
"epoch": 0.4014364014364014,
"grad_norm": 2.0777549743652344,
"learning_rate": 4.958575516043776e-06,
"loss": 1.1743,
"step": 708
},
{
"epoch": 0.402003402003402,
"grad_norm": 2.114323377609253,
"learning_rate": 4.958439518353399e-06,
"loss": 1.1638,
"step": 709
},
{
"epoch": 0.40257040257040255,
"grad_norm": 2.010104179382324,
"learning_rate": 4.958303299657826e-06,
"loss": 1.1594,
"step": 710
},
{
"epoch": 0.40313740313740315,
"grad_norm": 2.0232269763946533,
"learning_rate": 4.958166859969304e-06,
"loss": 1.1402,
"step": 711
},
{
"epoch": 0.4037044037044037,
"grad_norm": 1.9577562808990479,
"learning_rate": 4.9580301993000984e-06,
"loss": 1.2293,
"step": 712
},
{
"epoch": 0.4042714042714043,
"grad_norm": 1.968520164489746,
"learning_rate": 4.957893317662494e-06,
"loss": 1.1197,
"step": 713
},
{
"epoch": 0.40483840483840483,
"grad_norm": 2.126594305038452,
"learning_rate": 4.9577562150687955e-06,
"loss": 1.16,
"step": 714
},
{
"epoch": 0.40540540540540543,
"grad_norm": 2.0474393367767334,
"learning_rate": 4.957618891531329e-06,
"loss": 1.126,
"step": 715
},
{
"epoch": 0.40597240597240597,
"grad_norm": 1.9974445104599,
"learning_rate": 4.95748134706244e-06,
"loss": 1.0981,
"step": 716
},
{
"epoch": 0.40653940653940657,
"grad_norm": 2.0076394081115723,
"learning_rate": 4.957343581674492e-06,
"loss": 1.1171,
"step": 717
},
{
"epoch": 0.4071064071064071,
"grad_norm": 2.1541495323181152,
"learning_rate": 4.9572055953798695e-06,
"loss": 1.1941,
"step": 718
},
{
"epoch": 0.40767340767340765,
"grad_norm": 2.1172938346862793,
"learning_rate": 4.957067388190977e-06,
"loss": 1.1361,
"step": 719
},
{
"epoch": 0.40824040824040825,
"grad_norm": 2.162334442138672,
"learning_rate": 4.9569289601202405e-06,
"loss": 1.0521,
"step": 720
},
{
"epoch": 0.4088074088074088,
"grad_norm": 2.1444907188415527,
"learning_rate": 4.956790311180102e-06,
"loss": 1.1954,
"step": 721
},
{
"epoch": 0.4093744093744094,
"grad_norm": 1.9972236156463623,
"learning_rate": 4.956651441383027e-06,
"loss": 1.0979,
"step": 722
},
{
"epoch": 0.40994140994140993,
"grad_norm": 2.1866066455841064,
"learning_rate": 4.9565123507414994e-06,
"loss": 1.1854,
"step": 723
},
{
"epoch": 0.4105084105084105,
"grad_norm": 2.0343639850616455,
"learning_rate": 4.956373039268022e-06,
"loss": 1.1422,
"step": 724
},
{
"epoch": 0.41107541107541107,
"grad_norm": 2.0357658863067627,
"learning_rate": 4.956233506975119e-06,
"loss": 1.1096,
"step": 725
},
{
"epoch": 0.41164241164241167,
"grad_norm": 2.22104549407959,
"learning_rate": 4.956093753875334e-06,
"loss": 1.1119,
"step": 726
},
{
"epoch": 0.4122094122094122,
"grad_norm": 2.0534398555755615,
"learning_rate": 4.95595377998123e-06,
"loss": 1.2207,
"step": 727
},
{
"epoch": 0.41277641277641275,
"grad_norm": 1.9503989219665527,
"learning_rate": 4.95581358530539e-06,
"loss": 1.1033,
"step": 728
},
{
"epoch": 0.41334341334341335,
"grad_norm": 2.2704718112945557,
"learning_rate": 4.955673169860418e-06,
"loss": 1.1745,
"step": 729
},
{
"epoch": 0.4139104139104139,
"grad_norm": 2.0502419471740723,
"learning_rate": 4.955532533658936e-06,
"loss": 1.1606,
"step": 730
},
{
"epoch": 0.4144774144774145,
"grad_norm": 1.971962809562683,
"learning_rate": 4.955391676713587e-06,
"loss": 1.1077,
"step": 731
},
{
"epoch": 0.415044415044415,
"grad_norm": 2.115936040878296,
"learning_rate": 4.955250599037034e-06,
"loss": 1.123,
"step": 732
},
{
"epoch": 0.4156114156114156,
"grad_norm": 1.9907145500183105,
"learning_rate": 4.9551093006419574e-06,
"loss": 1.1332,
"step": 733
},
{
"epoch": 0.41617841617841617,
"grad_norm": 2.122241497039795,
"learning_rate": 4.954967781541062e-06,
"loss": 1.0907,
"step": 734
},
{
"epoch": 0.41674541674541676,
"grad_norm": 2.1141576766967773,
"learning_rate": 4.954826041747068e-06,
"loss": 1.0933,
"step": 735
},
{
"epoch": 0.4173124173124173,
"grad_norm": 2.121520519256592,
"learning_rate": 4.954684081272719e-06,
"loss": 1.1604,
"step": 736
},
{
"epoch": 0.4178794178794179,
"grad_norm": 2.0803537368774414,
"learning_rate": 4.954541900130775e-06,
"loss": 1.1552,
"step": 737
},
{
"epoch": 0.41844641844641844,
"grad_norm": 2.2571604251861572,
"learning_rate": 4.954399498334019e-06,
"loss": 1.0974,
"step": 738
},
{
"epoch": 0.419013419013419,
"grad_norm": 2.1840076446533203,
"learning_rate": 4.954256875895252e-06,
"loss": 1.1438,
"step": 739
},
{
"epoch": 0.4195804195804196,
"grad_norm": 2.1919736862182617,
"learning_rate": 4.954114032827294e-06,
"loss": 1.1731,
"step": 740
},
{
"epoch": 0.4201474201474201,
"grad_norm": 2.0454063415527344,
"learning_rate": 4.953970969142989e-06,
"loss": 1.1379,
"step": 741
},
{
"epoch": 0.4207144207144207,
"grad_norm": 2.4228060245513916,
"learning_rate": 4.953827684855195e-06,
"loss": 1.1868,
"step": 742
},
{
"epoch": 0.42128142128142126,
"grad_norm": 2.1020874977111816,
"learning_rate": 4.953684179976794e-06,
"loss": 1.1355,
"step": 743
},
{
"epoch": 0.42184842184842186,
"grad_norm": 2.1466715335845947,
"learning_rate": 4.953540454520687e-06,
"loss": 1.1692,
"step": 744
},
{
"epoch": 0.4224154224154224,
"grad_norm": 2.0675599575042725,
"learning_rate": 4.953396508499794e-06,
"loss": 1.1205,
"step": 745
},
{
"epoch": 0.422982422982423,
"grad_norm": 2.4063730239868164,
"learning_rate": 4.953252341927054e-06,
"loss": 1.1227,
"step": 746
},
{
"epoch": 0.42354942354942354,
"grad_norm": 2.484339714050293,
"learning_rate": 4.953107954815429e-06,
"loss": 1.1665,
"step": 747
},
{
"epoch": 0.42411642411642414,
"grad_norm": 2.1248579025268555,
"learning_rate": 4.952963347177898e-06,
"loss": 1.1755,
"step": 748
},
{
"epoch": 0.4246834246834247,
"grad_norm": 2.005690574645996,
"learning_rate": 4.952818519027461e-06,
"loss": 1.1516,
"step": 749
},
{
"epoch": 0.4252504252504252,
"grad_norm": 3.1751925945281982,
"learning_rate": 4.952673470377137e-06,
"loss": 1.1643,
"step": 750
},
{
"epoch": 0.4258174258174258,
"grad_norm": 1.9519903659820557,
"learning_rate": 4.952528201239967e-06,
"loss": 1.1672,
"step": 751
},
{
"epoch": 0.42638442638442636,
"grad_norm": 1.906704306602478,
"learning_rate": 4.952382711629008e-06,
"loss": 1.0814,
"step": 752
},
{
"epoch": 0.42695142695142696,
"grad_norm": 2.2957396507263184,
"learning_rate": 4.9522370015573405e-06,
"loss": 1.2129,
"step": 753
},
{
"epoch": 0.4275184275184275,
"grad_norm": 2.1339492797851562,
"learning_rate": 4.952091071038062e-06,
"loss": 1.1826,
"step": 754
},
{
"epoch": 0.4280854280854281,
"grad_norm": 2.051973819732666,
"learning_rate": 4.951944920084293e-06,
"loss": 1.1258,
"step": 755
},
{
"epoch": 0.42865242865242864,
"grad_norm": 1.99086332321167,
"learning_rate": 4.95179854870917e-06,
"loss": 1.1508,
"step": 756
},
{
"epoch": 0.42921942921942924,
"grad_norm": 1.9002501964569092,
"learning_rate": 4.951651956925853e-06,
"loss": 1.097,
"step": 757
},
{
"epoch": 0.4297864297864298,
"grad_norm": 2.087890625,
"learning_rate": 4.951505144747519e-06,
"loss": 1.1231,
"step": 758
},
{
"epoch": 0.4303534303534304,
"grad_norm": 1.9995880126953125,
"learning_rate": 4.9513581121873665e-06,
"loss": 1.1392,
"step": 759
},
{
"epoch": 0.4309204309204309,
"grad_norm": 2.1408588886260986,
"learning_rate": 4.9512108592586125e-06,
"loss": 1.1433,
"step": 760
},
{
"epoch": 0.43148743148743146,
"grad_norm": 2.1597390174865723,
"learning_rate": 4.951063385974495e-06,
"loss": 1.063,
"step": 761
},
{
"epoch": 0.43205443205443206,
"grad_norm": 2.084463357925415,
"learning_rate": 4.950915692348271e-06,
"loss": 1.1381,
"step": 762
},
{
"epoch": 0.4326214326214326,
"grad_norm": 2.148618698120117,
"learning_rate": 4.95076777839322e-06,
"loss": 1.1597,
"step": 763
},
{
"epoch": 0.4331884331884332,
"grad_norm": 2.15020489692688,
"learning_rate": 4.9506196441226345e-06,
"loss": 1.09,
"step": 764
},
{
"epoch": 0.43375543375543374,
"grad_norm": 2.059931516647339,
"learning_rate": 4.950471289549834e-06,
"loss": 1.1254,
"step": 765
},
{
"epoch": 0.43432243432243434,
"grad_norm": 2.0239474773406982,
"learning_rate": 4.950322714688156e-06,
"loss": 1.1228,
"step": 766
},
{
"epoch": 0.4348894348894349,
"grad_norm": 2.121474027633667,
"learning_rate": 4.950173919550955e-06,
"loss": 1.1657,
"step": 767
},
{
"epoch": 0.4354564354564355,
"grad_norm": 2.2115628719329834,
"learning_rate": 4.950024904151607e-06,
"loss": 1.158,
"step": 768
},
{
"epoch": 0.436023436023436,
"grad_norm": 2.045806646347046,
"learning_rate": 4.9498756685035095e-06,
"loss": 1.1509,
"step": 769
},
{
"epoch": 0.4365904365904366,
"grad_norm": 2.263608932495117,
"learning_rate": 4.949726212620077e-06,
"loss": 1.1491,
"step": 770
},
{
"epoch": 0.43715743715743716,
"grad_norm": 2.2598161697387695,
"learning_rate": 4.949576536514747e-06,
"loss": 1.1495,
"step": 771
},
{
"epoch": 0.4377244377244377,
"grad_norm": 2.0730316638946533,
"learning_rate": 4.949426640200972e-06,
"loss": 1.1024,
"step": 772
},
{
"epoch": 0.4382914382914383,
"grad_norm": 2.0575971603393555,
"learning_rate": 4.949276523692228e-06,
"loss": 1.0622,
"step": 773
},
{
"epoch": 0.43885843885843884,
"grad_norm": 1.9827250242233276,
"learning_rate": 4.949126187002012e-06,
"loss": 1.1452,
"step": 774
},
{
"epoch": 0.43942543942543943,
"grad_norm": 2.11433482170105,
"learning_rate": 4.948975630143837e-06,
"loss": 1.1375,
"step": 775
},
{
"epoch": 0.43999243999244,
"grad_norm": 2.214946985244751,
"learning_rate": 4.948824853131237e-06,
"loss": 1.0639,
"step": 776
},
{
"epoch": 0.4405594405594406,
"grad_norm": 2.0897412300109863,
"learning_rate": 4.948673855977767e-06,
"loss": 1.0991,
"step": 777
},
{
"epoch": 0.4411264411264411,
"grad_norm": 2.051321506500244,
"learning_rate": 4.948522638697002e-06,
"loss": 1.1743,
"step": 778
},
{
"epoch": 0.4416934416934417,
"grad_norm": 2.1338798999786377,
"learning_rate": 4.9483712013025356e-06,
"loss": 1.1451,
"step": 779
},
{
"epoch": 0.44226044226044225,
"grad_norm": 2.172214984893799,
"learning_rate": 4.94821954380798e-06,
"loss": 1.0999,
"step": 780
},
{
"epoch": 0.44282744282744285,
"grad_norm": 2.14731502532959,
"learning_rate": 4.9480676662269704e-06,
"loss": 1.1238,
"step": 781
},
{
"epoch": 0.4433944433944434,
"grad_norm": 2.144775867462158,
"learning_rate": 4.9479155685731595e-06,
"loss": 1.1571,
"step": 782
},
{
"epoch": 0.44396144396144394,
"grad_norm": 2.0654232501983643,
"learning_rate": 4.94776325086022e-06,
"loss": 1.1644,
"step": 783
},
{
"epoch": 0.44452844452844453,
"grad_norm": 2.08166241645813,
"learning_rate": 4.947610713101846e-06,
"loss": 1.1535,
"step": 784
},
{
"epoch": 0.4450954450954451,
"grad_norm": 2.0543806552886963,
"learning_rate": 4.947457955311748e-06,
"loss": 1.1814,
"step": 785
},
{
"epoch": 0.44566244566244567,
"grad_norm": 2.543776512145996,
"learning_rate": 4.94730497750366e-06,
"loss": 1.1607,
"step": 786
},
{
"epoch": 0.4462294462294462,
"grad_norm": 2.1171834468841553,
"learning_rate": 4.9471517796913325e-06,
"loss": 1.1503,
"step": 787
},
{
"epoch": 0.4467964467964468,
"grad_norm": 1.9275233745574951,
"learning_rate": 4.946998361888541e-06,
"loss": 1.1236,
"step": 788
},
{
"epoch": 0.44736344736344735,
"grad_norm": 2.2657222747802734,
"learning_rate": 4.946844724109073e-06,
"loss": 1.171,
"step": 789
},
{
"epoch": 0.44793044793044795,
"grad_norm": 2.3687210083007812,
"learning_rate": 4.9466908663667425e-06,
"loss": 1.1054,
"step": 790
},
{
"epoch": 0.4484974484974485,
"grad_norm": 2.2839155197143555,
"learning_rate": 4.94653678867538e-06,
"loss": 1.1713,
"step": 791
},
{
"epoch": 0.4490644490644491,
"grad_norm": 2.1770317554473877,
"learning_rate": 4.946382491048836e-06,
"loss": 1.187,
"step": 792
},
{
"epoch": 0.44963144963144963,
"grad_norm": 2.0334887504577637,
"learning_rate": 4.9462279735009835e-06,
"loss": 1.1199,
"step": 793
},
{
"epoch": 0.4501984501984502,
"grad_norm": 1.87135910987854,
"learning_rate": 4.946073236045712e-06,
"loss": 1.1181,
"step": 794
},
{
"epoch": 0.45076545076545077,
"grad_norm": 2.0009896755218506,
"learning_rate": 4.945918278696929e-06,
"loss": 1.1139,
"step": 795
},
{
"epoch": 0.4513324513324513,
"grad_norm": 1.9485775232315063,
"learning_rate": 4.945763101468569e-06,
"loss": 1.1215,
"step": 796
},
{
"epoch": 0.4518994518994519,
"grad_norm": 2.148066282272339,
"learning_rate": 4.9456077043745805e-06,
"loss": 1.1767,
"step": 797
},
{
"epoch": 0.45246645246645245,
"grad_norm": 2.1108059883117676,
"learning_rate": 4.945452087428931e-06,
"loss": 1.0699,
"step": 798
},
{
"epoch": 0.45303345303345305,
"grad_norm": 2.168656826019287,
"learning_rate": 4.945296250645613e-06,
"loss": 1.1925,
"step": 799
},
{
"epoch": 0.4536004536004536,
"grad_norm": 2.0252230167388916,
"learning_rate": 4.945140194038633e-06,
"loss": 1.1474,
"step": 800
},
{
"epoch": 0.4541674541674542,
"grad_norm": 2.010436773300171,
"learning_rate": 4.944983917622023e-06,
"loss": 1.0687,
"step": 801
},
{
"epoch": 0.45473445473445473,
"grad_norm": 2.039442539215088,
"learning_rate": 4.944827421409829e-06,
"loss": 1.2066,
"step": 802
},
{
"epoch": 0.4553014553014553,
"grad_norm": 2.1052887439727783,
"learning_rate": 4.94467070541612e-06,
"loss": 1.095,
"step": 803
},
{
"epoch": 0.45586845586845587,
"grad_norm": 2.1052777767181396,
"learning_rate": 4.944513769654985e-06,
"loss": 1.1136,
"step": 804
},
{
"epoch": 0.4564354564354564,
"grad_norm": 2.0587961673736572,
"learning_rate": 4.944356614140532e-06,
"loss": 1.1154,
"step": 805
},
{
"epoch": 0.457002457002457,
"grad_norm": 2.0752010345458984,
"learning_rate": 4.9441992388868876e-06,
"loss": 1.1544,
"step": 806
},
{
"epoch": 0.45756945756945755,
"grad_norm": 2.405973434448242,
"learning_rate": 4.9440416439082006e-06,
"loss": 1.1276,
"step": 807
},
{
"epoch": 0.45813645813645815,
"grad_norm": 2.0728070735931396,
"learning_rate": 4.943883829218638e-06,
"loss": 1.1398,
"step": 808
},
{
"epoch": 0.4587034587034587,
"grad_norm": 2.0885121822357178,
"learning_rate": 4.943725794832386e-06,
"loss": 1.1287,
"step": 809
},
{
"epoch": 0.4592704592704593,
"grad_norm": 1.9414550065994263,
"learning_rate": 4.943567540763651e-06,
"loss": 1.1355,
"step": 810
},
{
"epoch": 0.4598374598374598,
"grad_norm": 2.2840662002563477,
"learning_rate": 4.943409067026662e-06,
"loss": 1.1273,
"step": 811
},
{
"epoch": 0.4604044604044604,
"grad_norm": 2.0449087619781494,
"learning_rate": 4.943250373635663e-06,
"loss": 1.0825,
"step": 812
},
{
"epoch": 0.46097146097146097,
"grad_norm": 2.1993865966796875,
"learning_rate": 4.94309146060492e-06,
"loss": 1.2063,
"step": 813
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.00508189201355,
"learning_rate": 4.942932327948719e-06,
"loss": 1.1049,
"step": 814
},
{
"epoch": 0.4621054621054621,
"grad_norm": 1.9916926622390747,
"learning_rate": 4.942772975681366e-06,
"loss": 1.0996,
"step": 815
},
{
"epoch": 0.46267246267246265,
"grad_norm": 2.2713184356689453,
"learning_rate": 4.942613403817187e-06,
"loss": 1.1215,
"step": 816
},
{
"epoch": 0.46323946323946324,
"grad_norm": 2.0838847160339355,
"learning_rate": 4.942453612370525e-06,
"loss": 1.1203,
"step": 817
},
{
"epoch": 0.4638064638064638,
"grad_norm": 2.103275775909424,
"learning_rate": 4.9422936013557454e-06,
"loss": 1.1335,
"step": 818
},
{
"epoch": 0.4643734643734644,
"grad_norm": 2.257582664489746,
"learning_rate": 4.9421333707872335e-06,
"loss": 1.1705,
"step": 819
},
{
"epoch": 0.4649404649404649,
"grad_norm": 2.2377452850341797,
"learning_rate": 4.941972920679393e-06,
"loss": 1.1128,
"step": 820
},
{
"epoch": 0.4655074655074655,
"grad_norm": 2.1154627799987793,
"learning_rate": 4.941812251046647e-06,
"loss": 1.203,
"step": 821
},
{
"epoch": 0.46607446607446607,
"grad_norm": 2.1149961948394775,
"learning_rate": 4.94165136190344e-06,
"loss": 1.1938,
"step": 822
},
{
"epoch": 0.46664146664146666,
"grad_norm": 2.020078659057617,
"learning_rate": 4.941490253264235e-06,
"loss": 1.1896,
"step": 823
},
{
"epoch": 0.4672084672084672,
"grad_norm": 1.9953190088272095,
"learning_rate": 4.9413289251435156e-06,
"loss": 1.1152,
"step": 824
},
{
"epoch": 0.4677754677754678,
"grad_norm": 2.0713393688201904,
"learning_rate": 4.941167377555785e-06,
"loss": 1.1156,
"step": 825
},
{
"epoch": 0.46834246834246834,
"grad_norm": 1.975783348083496,
"learning_rate": 4.941005610515563e-06,
"loss": 1.0871,
"step": 826
},
{
"epoch": 0.4689094689094689,
"grad_norm": 2.0003039836883545,
"learning_rate": 4.940843624037396e-06,
"loss": 1.0953,
"step": 827
},
{
"epoch": 0.4694764694764695,
"grad_norm": 2.0911831855773926,
"learning_rate": 4.940681418135843e-06,
"loss": 1.1284,
"step": 828
},
{
"epoch": 0.47004347004347,
"grad_norm": 2.032801866531372,
"learning_rate": 4.940518992825487e-06,
"loss": 1.1489,
"step": 829
},
{
"epoch": 0.4706104706104706,
"grad_norm": 2.1229052543640137,
"learning_rate": 4.940356348120929e-06,
"loss": 1.0989,
"step": 830
},
{
"epoch": 0.47117747117747116,
"grad_norm": 1.9044902324676514,
"learning_rate": 4.94019348403679e-06,
"loss": 1.1146,
"step": 831
},
{
"epoch": 0.47174447174447176,
"grad_norm": 2.077392101287842,
"learning_rate": 4.940030400587712e-06,
"loss": 1.1295,
"step": 832
},
{
"epoch": 0.4723114723114723,
"grad_norm": 2.1112782955169678,
"learning_rate": 4.939867097788356e-06,
"loss": 1.1323,
"step": 833
},
{
"epoch": 0.4728784728784729,
"grad_norm": 2.0358989238739014,
"learning_rate": 4.9397035756534e-06,
"loss": 1.0978,
"step": 834
},
{
"epoch": 0.47344547344547344,
"grad_norm": 2.0520427227020264,
"learning_rate": 4.939539834197545e-06,
"loss": 1.152,
"step": 835
},
{
"epoch": 0.47401247401247404,
"grad_norm": 2.0846059322357178,
"learning_rate": 4.939375873435512e-06,
"loss": 1.107,
"step": 836
},
{
"epoch": 0.4745794745794746,
"grad_norm": 2.139598846435547,
"learning_rate": 4.93921169338204e-06,
"loss": 1.0811,
"step": 837
},
{
"epoch": 0.4751464751464751,
"grad_norm": 2.6360387802124023,
"learning_rate": 4.939047294051887e-06,
"loss": 1.1115,
"step": 838
},
{
"epoch": 0.4757134757134757,
"grad_norm": 2.1863367557525635,
"learning_rate": 4.938882675459833e-06,
"loss": 1.0615,
"step": 839
},
{
"epoch": 0.47628047628047626,
"grad_norm": 2.0317673683166504,
"learning_rate": 4.938717837620677e-06,
"loss": 1.1024,
"step": 840
},
{
"epoch": 0.47684747684747686,
"grad_norm": 2.2008063793182373,
"learning_rate": 4.938552780549236e-06,
"loss": 1.149,
"step": 841
},
{
"epoch": 0.4774144774144774,
"grad_norm": 2.037165641784668,
"learning_rate": 4.9383875042603496e-06,
"loss": 1.1239,
"step": 842
},
{
"epoch": 0.477981477981478,
"grad_norm": 1.9858256578445435,
"learning_rate": 4.9382220087688745e-06,
"loss": 1.0986,
"step": 843
},
{
"epoch": 0.47854847854847854,
"grad_norm": 2.032320022583008,
"learning_rate": 4.938056294089689e-06,
"loss": 1.1053,
"step": 844
},
{
"epoch": 0.47911547911547914,
"grad_norm": 2.054124355316162,
"learning_rate": 4.93789036023769e-06,
"loss": 1.1036,
"step": 845
},
{
"epoch": 0.4796824796824797,
"grad_norm": 2.1493234634399414,
"learning_rate": 4.937724207227793e-06,
"loss": 1.1359,
"step": 846
},
{
"epoch": 0.4802494802494803,
"grad_norm": 1.9878216981887817,
"learning_rate": 4.937557835074937e-06,
"loss": 1.143,
"step": 847
},
{
"epoch": 0.4808164808164808,
"grad_norm": 2.1001598834991455,
"learning_rate": 4.9373912437940765e-06,
"loss": 1.1545,
"step": 848
},
{
"epoch": 0.48138348138348136,
"grad_norm": 2.1448514461517334,
"learning_rate": 4.9372244334001874e-06,
"loss": 1.1517,
"step": 849
},
{
"epoch": 0.48195048195048196,
"grad_norm": 2.144200325012207,
"learning_rate": 4.937057403908266e-06,
"loss": 1.1104,
"step": 850
},
{
"epoch": 0.4825174825174825,
"grad_norm": 2.0703322887420654,
"learning_rate": 4.9368901553333296e-06,
"loss": 1.0506,
"step": 851
},
{
"epoch": 0.4830844830844831,
"grad_norm": 2.108504295349121,
"learning_rate": 4.936722687690409e-06,
"loss": 1.1578,
"step": 852
},
{
"epoch": 0.48365148365148364,
"grad_norm": 2.0468504428863525,
"learning_rate": 4.936555000994563e-06,
"loss": 1.1275,
"step": 853
},
{
"epoch": 0.48421848421848424,
"grad_norm": 1.9859349727630615,
"learning_rate": 4.9363870952608634e-06,
"loss": 1.1569,
"step": 854
},
{
"epoch": 0.4847854847854848,
"grad_norm": 2.0685060024261475,
"learning_rate": 4.936218970504406e-06,
"loss": 1.0957,
"step": 855
},
{
"epoch": 0.4853524853524854,
"grad_norm": 1.965552568435669,
"learning_rate": 4.936050626740303e-06,
"loss": 1.2005,
"step": 856
},
{
"epoch": 0.4859194859194859,
"grad_norm": 2.2556333541870117,
"learning_rate": 4.935882063983689e-06,
"loss": 1.1436,
"step": 857
},
{
"epoch": 0.4864864864864865,
"grad_norm": 4.329214096069336,
"learning_rate": 4.935713282249718e-06,
"loss": 1.1138,
"step": 858
},
{
"epoch": 0.48705348705348706,
"grad_norm": 2.037137269973755,
"learning_rate": 4.935544281553561e-06,
"loss": 1.1449,
"step": 859
},
{
"epoch": 0.4876204876204876,
"grad_norm": 1.913141131401062,
"learning_rate": 4.935375061910412e-06,
"loss": 1.1467,
"step": 860
},
{
"epoch": 0.4881874881874882,
"grad_norm": 2.0674262046813965,
"learning_rate": 4.935205623335483e-06,
"loss": 1.099,
"step": 861
},
{
"epoch": 0.48875448875448874,
"grad_norm": 2.456698179244995,
"learning_rate": 4.935035965844005e-06,
"loss": 1.179,
"step": 862
},
{
"epoch": 0.48932148932148933,
"grad_norm": 1.964211106300354,
"learning_rate": 4.93486608945123e-06,
"loss": 1.1761,
"step": 863
},
{
"epoch": 0.4898884898884899,
"grad_norm": 2.1101772785186768,
"learning_rate": 4.9346959941724305e-06,
"loss": 1.1189,
"step": 864
},
{
"epoch": 0.4904554904554905,
"grad_norm": 1.9686706066131592,
"learning_rate": 4.934525680022897e-06,
"loss": 1.1225,
"step": 865
},
{
"epoch": 0.491022491022491,
"grad_norm": 1.93234121799469,
"learning_rate": 4.934355147017939e-06,
"loss": 1.0838,
"step": 866
},
{
"epoch": 0.4915894915894916,
"grad_norm": 2.1068077087402344,
"learning_rate": 4.934184395172888e-06,
"loss": 1.1495,
"step": 867
},
{
"epoch": 0.49215649215649215,
"grad_norm": 1.9966765642166138,
"learning_rate": 4.934013424503094e-06,
"loss": 1.1457,
"step": 868
},
{
"epoch": 0.49272349272349275,
"grad_norm": 2.1224286556243896,
"learning_rate": 4.9338422350239245e-06,
"loss": 1.1205,
"step": 869
},
{
"epoch": 0.4932904932904933,
"grad_norm": 2.2381792068481445,
"learning_rate": 4.9336708267507724e-06,
"loss": 1.177,
"step": 870
},
{
"epoch": 0.49385749385749383,
"grad_norm": 1.9738177061080933,
"learning_rate": 4.933499199699044e-06,
"loss": 1.1008,
"step": 871
},
{
"epoch": 0.49442449442449443,
"grad_norm": 2.031609058380127,
"learning_rate": 4.93332735388417e-06,
"loss": 1.1318,
"step": 872
},
{
"epoch": 0.494991494991495,
"grad_norm": 2.3054850101470947,
"learning_rate": 4.933155289321596e-06,
"loss": 1.1877,
"step": 873
},
{
"epoch": 0.49555849555849557,
"grad_norm": 2.2658283710479736,
"learning_rate": 4.932983006026792e-06,
"loss": 1.1726,
"step": 874
},
{
"epoch": 0.4961254961254961,
"grad_norm": 2.0128214359283447,
"learning_rate": 4.932810504015246e-06,
"loss": 1.1001,
"step": 875
},
{
"epoch": 0.4966924966924967,
"grad_norm": 2.054161787033081,
"learning_rate": 4.932637783302465e-06,
"loss": 1.1191,
"step": 876
},
{
"epoch": 0.49725949725949725,
"grad_norm": 1.9922339916229248,
"learning_rate": 4.932464843903976e-06,
"loss": 1.1631,
"step": 877
},
{
"epoch": 0.49782649782649785,
"grad_norm": 2.0135786533355713,
"learning_rate": 4.932291685835324e-06,
"loss": 1.1587,
"step": 878
},
{
"epoch": 0.4983934983934984,
"grad_norm": 2.167001485824585,
"learning_rate": 4.932118309112077e-06,
"loss": 1.1278,
"step": 879
},
{
"epoch": 0.498960498960499,
"grad_norm": 2.2132434844970703,
"learning_rate": 4.931944713749821e-06,
"loss": 1.1372,
"step": 880
},
{
"epoch": 0.49952749952749953,
"grad_norm": 2.3871099948883057,
"learning_rate": 4.93177089976416e-06,
"loss": 1.1333,
"step": 881
},
{
"epoch": 0.5000945000945001,
"grad_norm": 2.0161664485931396,
"learning_rate": 4.931596867170721e-06,
"loss": 1.1531,
"step": 882
},
{
"epoch": 0.5006615006615006,
"grad_norm": 2.056384325027466,
"learning_rate": 4.93142261598515e-06,
"loss": 1.1791,
"step": 883
},
{
"epoch": 0.5012285012285013,
"grad_norm": 2.0523018836975098,
"learning_rate": 4.931248146223108e-06,
"loss": 1.1363,
"step": 884
},
{
"epoch": 0.5017955017955018,
"grad_norm": 2.1028332710266113,
"learning_rate": 4.9310734579002815e-06,
"loss": 1.1545,
"step": 885
},
{
"epoch": 0.5023625023625024,
"grad_norm": 2.1507184505462646,
"learning_rate": 4.9308985510323745e-06,
"loss": 1.1455,
"step": 886
},
{
"epoch": 0.5029295029295029,
"grad_norm": 2.3825719356536865,
"learning_rate": 4.93072342563511e-06,
"loss": 1.1448,
"step": 887
},
{
"epoch": 0.5034965034965035,
"grad_norm": 2.4547119140625,
"learning_rate": 4.930548081724232e-06,
"loss": 1.1699,
"step": 888
},
{
"epoch": 0.5040635040635041,
"grad_norm": 1.9959300756454468,
"learning_rate": 4.930372519315501e-06,
"loss": 1.1495,
"step": 889
},
{
"epoch": 0.5046305046305046,
"grad_norm": 2.0104527473449707,
"learning_rate": 4.930196738424703e-06,
"loss": 1.1168,
"step": 890
},
{
"epoch": 0.5051975051975052,
"grad_norm": 1.8669036626815796,
"learning_rate": 4.930020739067637e-06,
"loss": 1.0347,
"step": 891
},
{
"epoch": 0.5057645057645057,
"grad_norm": 2.0974411964416504,
"learning_rate": 4.929844521260125e-06,
"loss": 1.0764,
"step": 892
},
{
"epoch": 0.5063315063315064,
"grad_norm": 2.0043864250183105,
"learning_rate": 4.929668085018011e-06,
"loss": 1.1351,
"step": 893
},
{
"epoch": 0.5068985068985069,
"grad_norm": 2.148527145385742,
"learning_rate": 4.929491430357154e-06,
"loss": 1.1796,
"step": 894
},
{
"epoch": 0.5074655074655074,
"grad_norm": 1.9664125442504883,
"learning_rate": 4.929314557293434e-06,
"loss": 1.044,
"step": 895
},
{
"epoch": 0.508032508032508,
"grad_norm": 2.263545513153076,
"learning_rate": 4.929137465842752e-06,
"loss": 1.1961,
"step": 896
},
{
"epoch": 0.5085995085995086,
"grad_norm": 2.004988193511963,
"learning_rate": 4.928960156021029e-06,
"loss": 1.118,
"step": 897
},
{
"epoch": 0.5091665091665092,
"grad_norm": 2.54754900932312,
"learning_rate": 4.928782627844202e-06,
"loss": 1.1106,
"step": 898
},
{
"epoch": 0.5097335097335097,
"grad_norm": 1.9804669618606567,
"learning_rate": 4.928604881328234e-06,
"loss": 1.0989,
"step": 899
},
{
"epoch": 0.5103005103005103,
"grad_norm": 2.1374270915985107,
"learning_rate": 4.9284269164891e-06,
"loss": 1.1583,
"step": 900
},
{
"epoch": 0.5108675108675109,
"grad_norm": 1.9350876808166504,
"learning_rate": 4.9282487333428e-06,
"loss": 1.1411,
"step": 901
},
{
"epoch": 0.5114345114345115,
"grad_norm": 2.1036617755889893,
"learning_rate": 4.928070331905352e-06,
"loss": 1.1273,
"step": 902
},
{
"epoch": 0.512001512001512,
"grad_norm": 2.180227756500244,
"learning_rate": 4.927891712192795e-06,
"loss": 1.0769,
"step": 903
},
{
"epoch": 0.5125685125685125,
"grad_norm": 1.9973223209381104,
"learning_rate": 4.927712874221184e-06,
"loss": 1.0891,
"step": 904
},
{
"epoch": 0.5131355131355131,
"grad_norm": 2.426229953765869,
"learning_rate": 4.927533818006597e-06,
"loss": 1.151,
"step": 905
},
{
"epoch": 0.5137025137025137,
"grad_norm": 2.2276418209075928,
"learning_rate": 4.927354543565131e-06,
"loss": 1.1493,
"step": 906
},
{
"epoch": 0.5142695142695143,
"grad_norm": 2.0039780139923096,
"learning_rate": 4.9271750509129e-06,
"loss": 1.1024,
"step": 907
},
{
"epoch": 0.5148365148365148,
"grad_norm": 2.0033607482910156,
"learning_rate": 4.926995340066043e-06,
"loss": 1.1248,
"step": 908
},
{
"epoch": 0.5154035154035154,
"grad_norm": 2.007786750793457,
"learning_rate": 4.926815411040713e-06,
"loss": 1.0888,
"step": 909
},
{
"epoch": 0.515970515970516,
"grad_norm": 2.130481481552124,
"learning_rate": 4.926635263853086e-06,
"loss": 1.2004,
"step": 910
},
{
"epoch": 0.5165375165375166,
"grad_norm": 1.9995726346969604,
"learning_rate": 4.926454898519356e-06,
"loss": 1.0705,
"step": 911
},
{
"epoch": 0.5171045171045171,
"grad_norm": 2.122728109359741,
"learning_rate": 4.926274315055738e-06,
"loss": 1.1051,
"step": 912
},
{
"epoch": 0.5176715176715176,
"grad_norm": 2.3630666732788086,
"learning_rate": 4.926093513478466e-06,
"loss": 1.1809,
"step": 913
},
{
"epoch": 0.5182385182385182,
"grad_norm": 2.1009976863861084,
"learning_rate": 4.925912493803792e-06,
"loss": 1.1338,
"step": 914
},
{
"epoch": 0.5188055188055188,
"grad_norm": 2.086754560470581,
"learning_rate": 4.9257312560479895e-06,
"loss": 1.1692,
"step": 915
},
{
"epoch": 0.5193725193725194,
"grad_norm": 1.97232186794281,
"learning_rate": 4.925549800227352e-06,
"loss": 1.1131,
"step": 916
},
{
"epoch": 0.5199395199395199,
"grad_norm": 2.0924556255340576,
"learning_rate": 4.925368126358191e-06,
"loss": 1.1051,
"step": 917
},
{
"epoch": 0.5205065205065205,
"grad_norm": 2.1015572547912598,
"learning_rate": 4.925186234456839e-06,
"loss": 1.1694,
"step": 918
},
{
"epoch": 0.5210735210735211,
"grad_norm": 2.045950412750244,
"learning_rate": 4.925004124539648e-06,
"loss": 1.0953,
"step": 919
},
{
"epoch": 0.5216405216405217,
"grad_norm": 2.4197933673858643,
"learning_rate": 4.9248217966229865e-06,
"loss": 1.1401,
"step": 920
},
{
"epoch": 0.5222075222075222,
"grad_norm": 2.0570056438446045,
"learning_rate": 4.924639250723247e-06,
"loss": 1.1512,
"step": 921
},
{
"epoch": 0.5227745227745227,
"grad_norm": 2.0190746784210205,
"learning_rate": 4.92445648685684e-06,
"loss": 1.0759,
"step": 922
},
{
"epoch": 0.5233415233415234,
"grad_norm": 2.069953680038452,
"learning_rate": 4.924273505040195e-06,
"loss": 1.1182,
"step": 923
},
{
"epoch": 0.5239085239085239,
"grad_norm": 2.082301378250122,
"learning_rate": 4.9240903052897605e-06,
"loss": 1.1443,
"step": 924
},
{
"epoch": 0.5244755244755245,
"grad_norm": 2.077989339828491,
"learning_rate": 4.9239068876220064e-06,
"loss": 1.068,
"step": 925
},
{
"epoch": 0.525042525042525,
"grad_norm": 2.0633435249328613,
"learning_rate": 4.9237232520534216e-06,
"loss": 1.108,
"step": 926
},
{
"epoch": 0.5256095256095256,
"grad_norm": 2.0655646324157715,
"learning_rate": 4.9235393986005145e-06,
"loss": 1.1233,
"step": 927
},
{
"epoch": 0.5261765261765262,
"grad_norm": 2.005661725997925,
"learning_rate": 4.923355327279811e-06,
"loss": 1.1472,
"step": 928
},
{
"epoch": 0.5267435267435268,
"grad_norm": 2.087590217590332,
"learning_rate": 4.923171038107861e-06,
"loss": 1.1612,
"step": 929
},
{
"epoch": 0.5273105273105273,
"grad_norm": 1.943999171257019,
"learning_rate": 4.922986531101229e-06,
"loss": 1.1393,
"step": 930
},
{
"epoch": 0.5278775278775278,
"grad_norm": 2.139828681945801,
"learning_rate": 4.922801806276504e-06,
"loss": 1.1677,
"step": 931
},
{
"epoch": 0.5284445284445285,
"grad_norm": 2.0876002311706543,
"learning_rate": 4.92261686365029e-06,
"loss": 1.1315,
"step": 932
},
{
"epoch": 0.529011529011529,
"grad_norm": 2.1113197803497314,
"learning_rate": 4.922431703239214e-06,
"loss": 1.0681,
"step": 933
},
{
"epoch": 0.5295785295785296,
"grad_norm": 1.9526565074920654,
"learning_rate": 4.922246325059922e-06,
"loss": 1.0847,
"step": 934
},
{
"epoch": 0.5301455301455301,
"grad_norm": 2.103508234024048,
"learning_rate": 4.922060729129076e-06,
"loss": 1.1318,
"step": 935
},
{
"epoch": 0.5307125307125307,
"grad_norm": 2.0978636741638184,
"learning_rate": 4.921874915463363e-06,
"loss": 1.1212,
"step": 936
},
{
"epoch": 0.5312795312795313,
"grad_norm": 2.017594575881958,
"learning_rate": 4.921688884079486e-06,
"loss": 1.1121,
"step": 937
},
{
"epoch": 0.5318465318465319,
"grad_norm": 2.0037190914154053,
"learning_rate": 4.921502634994169e-06,
"loss": 1.0777,
"step": 938
},
{
"epoch": 0.5324135324135324,
"grad_norm": 2.0331473350524902,
"learning_rate": 4.9213161682241546e-06,
"loss": 1.1632,
"step": 939
},
{
"epoch": 0.5329805329805329,
"grad_norm": 2.252798080444336,
"learning_rate": 4.9211294837862055e-06,
"loss": 1.1374,
"step": 940
},
{
"epoch": 0.5335475335475336,
"grad_norm": 2.1416819095611572,
"learning_rate": 4.920942581697105e-06,
"loss": 1.1136,
"step": 941
},
{
"epoch": 0.5341145341145341,
"grad_norm": 2.1041433811187744,
"learning_rate": 4.920755461973654e-06,
"loss": 1.133,
"step": 942
},
{
"epoch": 0.5346815346815347,
"grad_norm": 2.319733142852783,
"learning_rate": 4.920568124632674e-06,
"loss": 1.1501,
"step": 943
},
{
"epoch": 0.5352485352485352,
"grad_norm": 2.0705809593200684,
"learning_rate": 4.920380569691007e-06,
"loss": 1.1071,
"step": 944
},
{
"epoch": 0.5358155358155359,
"grad_norm": 2.1050591468811035,
"learning_rate": 4.920192797165511e-06,
"loss": 1.1014,
"step": 945
},
{
"epoch": 0.5363825363825364,
"grad_norm": 1.9876645803451538,
"learning_rate": 4.920004807073069e-06,
"loss": 1.1281,
"step": 946
},
{
"epoch": 0.536949536949537,
"grad_norm": 1.9790834188461304,
"learning_rate": 4.919816599430579e-06,
"loss": 1.1586,
"step": 947
},
{
"epoch": 0.5375165375165375,
"grad_norm": 1.9907217025756836,
"learning_rate": 4.919628174254961e-06,
"loss": 1.0882,
"step": 948
},
{
"epoch": 0.538083538083538,
"grad_norm": 2.121260166168213,
"learning_rate": 4.9194395315631535e-06,
"loss": 1.1715,
"step": 949
},
{
"epoch": 0.5386505386505387,
"grad_norm": 2.3340444564819336,
"learning_rate": 4.919250671372114e-06,
"loss": 1.2029,
"step": 950
},
{
"epoch": 0.5392175392175392,
"grad_norm": 1.9696297645568848,
"learning_rate": 4.919061593698822e-06,
"loss": 1.1487,
"step": 951
},
{
"epoch": 0.5397845397845398,
"grad_norm": 1.9165290594100952,
"learning_rate": 4.918872298560273e-06,
"loss": 1.077,
"step": 952
},
{
"epoch": 0.5403515403515403,
"grad_norm": 2.135812520980835,
"learning_rate": 4.918682785973486e-06,
"loss": 1.0452,
"step": 953
},
{
"epoch": 0.540918540918541,
"grad_norm": 2.068650484085083,
"learning_rate": 4.918493055955497e-06,
"loss": 1.1309,
"step": 954
},
{
"epoch": 0.5414855414855415,
"grad_norm": 1.990132451057434,
"learning_rate": 4.91830310852336e-06,
"loss": 1.1296,
"step": 955
},
{
"epoch": 0.542052542052542,
"grad_norm": 2.1037912368774414,
"learning_rate": 4.918112943694153e-06,
"loss": 1.1017,
"step": 956
},
{
"epoch": 0.5426195426195426,
"grad_norm": 2.4106600284576416,
"learning_rate": 4.917922561484971e-06,
"loss": 1.1424,
"step": 957
},
{
"epoch": 0.5431865431865431,
"grad_norm": 2.0177202224731445,
"learning_rate": 4.917731961912927e-06,
"loss": 1.1401,
"step": 958
},
{
"epoch": 0.5437535437535438,
"grad_norm": 2.0823750495910645,
"learning_rate": 4.917541144995157e-06,
"loss": 1.1077,
"step": 959
},
{
"epoch": 0.5443205443205443,
"grad_norm": 2.0904784202575684,
"learning_rate": 4.917350110748815e-06,
"loss": 1.1433,
"step": 960
},
{
"epoch": 0.5448875448875449,
"grad_norm": 2.1177074909210205,
"learning_rate": 4.917158859191072e-06,
"loss": 1.1711,
"step": 961
},
{
"epoch": 0.5454545454545454,
"grad_norm": 2.118147373199463,
"learning_rate": 4.916967390339123e-06,
"loss": 1.1629,
"step": 962
},
{
"epoch": 0.5460215460215461,
"grad_norm": 1.9789443016052246,
"learning_rate": 4.916775704210179e-06,
"loss": 1.0943,
"step": 963
},
{
"epoch": 0.5465885465885466,
"grad_norm": 1.9533722400665283,
"learning_rate": 4.916583800821474e-06,
"loss": 1.1124,
"step": 964
},
{
"epoch": 0.5471555471555472,
"grad_norm": 2.237156867980957,
"learning_rate": 4.916391680190257e-06,
"loss": 1.0787,
"step": 965
},
{
"epoch": 0.5477225477225477,
"grad_norm": 2.0534963607788086,
"learning_rate": 4.9161993423338e-06,
"loss": 1.1373,
"step": 966
},
{
"epoch": 0.5482895482895482,
"grad_norm": 2.4482421875,
"learning_rate": 4.916006787269394e-06,
"loss": 1.1619,
"step": 967
},
{
"epoch": 0.5488565488565489,
"grad_norm": 2.11044979095459,
"learning_rate": 4.915814015014349e-06,
"loss": 1.1629,
"step": 968
},
{
"epoch": 0.5494235494235494,
"grad_norm": 2.1861917972564697,
"learning_rate": 4.915621025585993e-06,
"loss": 1.1492,
"step": 969
},
{
"epoch": 0.54999054999055,
"grad_norm": 2.104365587234497,
"learning_rate": 4.915427819001676e-06,
"loss": 1.1158,
"step": 970
},
{
"epoch": 0.5505575505575505,
"grad_norm": 2.0724103450775146,
"learning_rate": 4.915234395278768e-06,
"loss": 1.1259,
"step": 971
},
{
"epoch": 0.5511245511245512,
"grad_norm": 1.9905637502670288,
"learning_rate": 4.915040754434655e-06,
"loss": 1.0791,
"step": 972
},
{
"epoch": 0.5516915516915517,
"grad_norm": 2.011446714401245,
"learning_rate": 4.914846896486746e-06,
"loss": 1.136,
"step": 973
},
{
"epoch": 0.5522585522585522,
"grad_norm": 2.1651599407196045,
"learning_rate": 4.914652821452468e-06,
"loss": 1.1253,
"step": 974
},
{
"epoch": 0.5528255528255528,
"grad_norm": 2.071033239364624,
"learning_rate": 4.914458529349267e-06,
"loss": 1.0814,
"step": 975
},
{
"epoch": 0.5533925533925534,
"grad_norm": 1.9809670448303223,
"learning_rate": 4.914264020194609e-06,
"loss": 1.1501,
"step": 976
},
{
"epoch": 0.553959553959554,
"grad_norm": 1.9492532014846802,
"learning_rate": 4.914069294005982e-06,
"loss": 1.0487,
"step": 977
},
{
"epoch": 0.5545265545265545,
"grad_norm": 2.0437419414520264,
"learning_rate": 4.913874350800888e-06,
"loss": 1.0744,
"step": 978
},
{
"epoch": 0.5550935550935551,
"grad_norm": 2.072704553604126,
"learning_rate": 4.913679190596854e-06,
"loss": 1.0803,
"step": 979
},
{
"epoch": 0.5556605556605556,
"grad_norm": 1.9283461570739746,
"learning_rate": 4.913483813411423e-06,
"loss": 1.0459,
"step": 980
},
{
"epoch": 0.5562275562275563,
"grad_norm": 2.1060099601745605,
"learning_rate": 4.913288219262159e-06,
"loss": 1.0704,
"step": 981
},
{
"epoch": 0.5567945567945568,
"grad_norm": 2.0446958541870117,
"learning_rate": 4.913092408166646e-06,
"loss": 1.1412,
"step": 982
},
{
"epoch": 0.5573615573615573,
"grad_norm": 1.904240608215332,
"learning_rate": 4.912896380142486e-06,
"loss": 1.065,
"step": 983
},
{
"epoch": 0.5579285579285579,
"grad_norm": 1.9327867031097412,
"learning_rate": 4.912700135207301e-06,
"loss": 1.1088,
"step": 984
},
{
"epoch": 0.5584955584955585,
"grad_norm": 2.0221242904663086,
"learning_rate": 4.912503673378733e-06,
"loss": 1.1335,
"step": 985
},
{
"epoch": 0.5590625590625591,
"grad_norm": 2.1040868759155273,
"learning_rate": 4.912306994674444e-06,
"loss": 1.1691,
"step": 986
},
{
"epoch": 0.5596295596295596,
"grad_norm": 1.9867613315582275,
"learning_rate": 4.912110099112114e-06,
"loss": 1.0998,
"step": 987
},
{
"epoch": 0.5601965601965602,
"grad_norm": 2.1432271003723145,
"learning_rate": 4.911912986709444e-06,
"loss": 1.1162,
"step": 988
},
{
"epoch": 0.5607635607635607,
"grad_norm": 2.1761672496795654,
"learning_rate": 4.911715657484152e-06,
"loss": 1.1269,
"step": 989
},
{
"epoch": 0.5613305613305614,
"grad_norm": 2.068603038787842,
"learning_rate": 4.911518111453979e-06,
"loss": 1.1357,
"step": 990
},
{
"epoch": 0.5618975618975619,
"grad_norm": 2.151704788208008,
"learning_rate": 4.911320348636682e-06,
"loss": 1.0952,
"step": 991
},
{
"epoch": 0.5624645624645624,
"grad_norm": 2.053663492202759,
"learning_rate": 4.911122369050041e-06,
"loss": 1.0994,
"step": 992
},
{
"epoch": 0.563031563031563,
"grad_norm": 2.0221524238586426,
"learning_rate": 4.910924172711852e-06,
"loss": 1.0787,
"step": 993
},
{
"epoch": 0.5635985635985636,
"grad_norm": 2.0243706703186035,
"learning_rate": 4.910725759639934e-06,
"loss": 1.0871,
"step": 994
},
{
"epoch": 0.5641655641655642,
"grad_norm": 2.173171281814575,
"learning_rate": 4.910527129852122e-06,
"loss": 1.142,
"step": 995
},
{
"epoch": 0.5647325647325647,
"grad_norm": 2.029360771179199,
"learning_rate": 4.910328283366274e-06,
"loss": 1.1157,
"step": 996
},
{
"epoch": 0.5652995652995653,
"grad_norm": 2.067548990249634,
"learning_rate": 4.910129220200263e-06,
"loss": 1.1589,
"step": 997
},
{
"epoch": 0.5658665658665659,
"grad_norm": 1.99434232711792,
"learning_rate": 4.9099299403719855e-06,
"loss": 1.1006,
"step": 998
},
{
"epoch": 0.5664335664335665,
"grad_norm": 2.186133623123169,
"learning_rate": 4.909730443899357e-06,
"loss": 1.1297,
"step": 999
},
{
"epoch": 0.567000567000567,
"grad_norm": 2.068250894546509,
"learning_rate": 4.909530730800309e-06,
"loss": 1.1129,
"step": 1000
},
{
"epoch": 0.5675675675675675,
"grad_norm": 1.9011445045471191,
"learning_rate": 4.909330801092798e-06,
"loss": 1.0905,
"step": 1001
},
{
"epoch": 0.5681345681345681,
"grad_norm": 2.13041090965271,
"learning_rate": 4.909130654794795e-06,
"loss": 1.0878,
"step": 1002
},
{
"epoch": 0.5687015687015687,
"grad_norm": 2.140449047088623,
"learning_rate": 4.908930291924294e-06,
"loss": 1.148,
"step": 1003
},
{
"epoch": 0.5692685692685693,
"grad_norm": 2.160778045654297,
"learning_rate": 4.908729712499305e-06,
"loss": 1.1301,
"step": 1004
},
{
"epoch": 0.5698355698355698,
"grad_norm": 2.007202625274658,
"learning_rate": 4.90852891653786e-06,
"loss": 1.1445,
"step": 1005
},
{
"epoch": 0.5704025704025704,
"grad_norm": 2.148878335952759,
"learning_rate": 4.908327904058011e-06,
"loss": 1.1313,
"step": 1006
},
{
"epoch": 0.570969570969571,
"grad_norm": 2.013780117034912,
"learning_rate": 4.908126675077828e-06,
"loss": 1.0782,
"step": 1007
},
{
"epoch": 0.5715365715365716,
"grad_norm": 2.027580976486206,
"learning_rate": 4.9079252296154e-06,
"loss": 1.0961,
"step": 1008
},
{
"epoch": 0.5721035721035721,
"grad_norm": 2.006068706512451,
"learning_rate": 4.907723567688836e-06,
"loss": 1.1487,
"step": 1009
},
{
"epoch": 0.5726705726705726,
"grad_norm": 2.081812858581543,
"learning_rate": 4.907521689316265e-06,
"loss": 1.1765,
"step": 1010
},
{
"epoch": 0.5732375732375732,
"grad_norm": 1.988435983657837,
"learning_rate": 4.907319594515837e-06,
"loss": 1.0866,
"step": 1011
},
{
"epoch": 0.5738045738045738,
"grad_norm": 1.9649443626403809,
"learning_rate": 4.907117283305717e-06,
"loss": 1.101,
"step": 1012
},
{
"epoch": 0.5743715743715744,
"grad_norm": 1.9727582931518555,
"learning_rate": 4.906914755704094e-06,
"loss": 1.14,
"step": 1013
},
{
"epoch": 0.5749385749385749,
"grad_norm": 2.19130277633667,
"learning_rate": 4.906712011729173e-06,
"loss": 1.114,
"step": 1014
},
{
"epoch": 0.5755055755055755,
"grad_norm": 1.9982514381408691,
"learning_rate": 4.906509051399181e-06,
"loss": 1.1499,
"step": 1015
},
{
"epoch": 0.5760725760725761,
"grad_norm": 1.844041347503662,
"learning_rate": 4.906305874732362e-06,
"loss": 1.0985,
"step": 1016
},
{
"epoch": 0.5766395766395767,
"grad_norm": 2.0490691661834717,
"learning_rate": 4.9061024817469835e-06,
"loss": 1.1272,
"step": 1017
},
{
"epoch": 0.5772065772065772,
"grad_norm": 2.1834962368011475,
"learning_rate": 4.905898872461328e-06,
"loss": 1.1504,
"step": 1018
},
{
"epoch": 0.5777735777735777,
"grad_norm": 2.2521510124206543,
"learning_rate": 4.905695046893699e-06,
"loss": 1.1116,
"step": 1019
},
{
"epoch": 0.5783405783405784,
"grad_norm": 2.023836135864258,
"learning_rate": 4.905491005062421e-06,
"loss": 1.1157,
"step": 1020
},
{
"epoch": 0.5789075789075789,
"grad_norm": 2.420994520187378,
"learning_rate": 4.905286746985836e-06,
"loss": 1.1131,
"step": 1021
},
{
"epoch": 0.5794745794745795,
"grad_norm": 2.1546456813812256,
"learning_rate": 4.905082272682305e-06,
"loss": 1.1565,
"step": 1022
},
{
"epoch": 0.58004158004158,
"grad_norm": 2.156719207763672,
"learning_rate": 4.904877582170212e-06,
"loss": 1.0556,
"step": 1023
},
{
"epoch": 0.5806085806085806,
"grad_norm": 2.073331117630005,
"learning_rate": 4.904672675467956e-06,
"loss": 1.1314,
"step": 1024
},
{
"epoch": 0.5811755811755812,
"grad_norm": 2.097214460372925,
"learning_rate": 4.9044675525939575e-06,
"loss": 1.1041,
"step": 1025
},
{
"epoch": 0.5817425817425818,
"grad_norm": 2.1299214363098145,
"learning_rate": 4.904262213566657e-06,
"loss": 1.1585,
"step": 1026
},
{
"epoch": 0.5823095823095823,
"grad_norm": 2.228649139404297,
"learning_rate": 4.904056658404514e-06,
"loss": 1.1194,
"step": 1027
},
{
"epoch": 0.5828765828765828,
"grad_norm": 2.1192493438720703,
"learning_rate": 4.903850887126006e-06,
"loss": 1.0576,
"step": 1028
},
{
"epoch": 0.5834435834435835,
"grad_norm": 2.076824903488159,
"learning_rate": 4.903644899749632e-06,
"loss": 1.1033,
"step": 1029
},
{
"epoch": 0.584010584010584,
"grad_norm": 2.0137550830841064,
"learning_rate": 4.90343869629391e-06,
"loss": 1.0799,
"step": 1030
},
{
"epoch": 0.5845775845775846,
"grad_norm": 2.003573417663574,
"learning_rate": 4.903232276777376e-06,
"loss": 1.09,
"step": 1031
},
{
"epoch": 0.5851445851445851,
"grad_norm": 2.0129973888397217,
"learning_rate": 4.9030256412185875e-06,
"loss": 1.1007,
"step": 1032
},
{
"epoch": 0.5857115857115857,
"grad_norm": 2.07281494140625,
"learning_rate": 4.9028187896361185e-06,
"loss": 1.1368,
"step": 1033
},
{
"epoch": 0.5862785862785863,
"grad_norm": 2.1313490867614746,
"learning_rate": 4.902611722048566e-06,
"loss": 1.1255,
"step": 1034
},
{
"epoch": 0.5868455868455869,
"grad_norm": 2.160646438598633,
"learning_rate": 4.902404438474544e-06,
"loss": 1.083,
"step": 1035
},
{
"epoch": 0.5874125874125874,
"grad_norm": 2.0317294597625732,
"learning_rate": 4.9021969389326866e-06,
"loss": 1.1161,
"step": 1036
},
{
"epoch": 0.5879795879795879,
"grad_norm": 2.01206374168396,
"learning_rate": 4.901989223441647e-06,
"loss": 1.1186,
"step": 1037
},
{
"epoch": 0.5885465885465886,
"grad_norm": 2.1760642528533936,
"learning_rate": 4.901781292020098e-06,
"loss": 1.1131,
"step": 1038
},
{
"epoch": 0.5891135891135891,
"grad_norm": 2.3249378204345703,
"learning_rate": 4.9015731446867334e-06,
"loss": 1.1342,
"step": 1039
},
{
"epoch": 0.5896805896805897,
"grad_norm": 1.980068325996399,
"learning_rate": 4.901364781460263e-06,
"loss": 1.1192,
"step": 1040
},
{
"epoch": 0.5902475902475902,
"grad_norm": 1.993043303489685,
"learning_rate": 4.90115620235942e-06,
"loss": 1.0759,
"step": 1041
},
{
"epoch": 0.5908145908145909,
"grad_norm": 1.9559072256088257,
"learning_rate": 4.900947407402952e-06,
"loss": 1.1278,
"step": 1042
},
{
"epoch": 0.5913815913815914,
"grad_norm": 2.1052565574645996,
"learning_rate": 4.900738396609631e-06,
"loss": 1.1116,
"step": 1043
},
{
"epoch": 0.591948591948592,
"grad_norm": 2.000621795654297,
"learning_rate": 4.900529169998247e-06,
"loss": 1.1006,
"step": 1044
},
{
"epoch": 0.5925155925155925,
"grad_norm": 2.008673906326294,
"learning_rate": 4.900319727587607e-06,
"loss": 1.0611,
"step": 1045
},
{
"epoch": 0.593082593082593,
"grad_norm": 2.092581272125244,
"learning_rate": 4.90011006939654e-06,
"loss": 1.1278,
"step": 1046
},
{
"epoch": 0.5936495936495937,
"grad_norm": 2.192446708679199,
"learning_rate": 4.899900195443894e-06,
"loss": 1.1296,
"step": 1047
},
{
"epoch": 0.5942165942165942,
"grad_norm": 2.0756747722625732,
"learning_rate": 4.899690105748534e-06,
"loss": 1.1812,
"step": 1048
},
{
"epoch": 0.5947835947835948,
"grad_norm": 1.9756616353988647,
"learning_rate": 4.899479800329348e-06,
"loss": 1.1218,
"step": 1049
},
{
"epoch": 0.5953505953505953,
"grad_norm": 2.0761704444885254,
"learning_rate": 4.899269279205243e-06,
"loss": 1.1347,
"step": 1050
},
{
"epoch": 0.595917595917596,
"grad_norm": 2.0106265544891357,
"learning_rate": 4.899058542395141e-06,
"loss": 1.087,
"step": 1051
},
{
"epoch": 0.5964845964845965,
"grad_norm": 2.1320059299468994,
"learning_rate": 4.898847589917989e-06,
"loss": 1.1559,
"step": 1052
},
{
"epoch": 0.597051597051597,
"grad_norm": 2.087963104248047,
"learning_rate": 4.89863642179275e-06,
"loss": 1.1293,
"step": 1053
},
{
"epoch": 0.5976185976185976,
"grad_norm": 2.0533034801483154,
"learning_rate": 4.898425038038406e-06,
"loss": 1.1516,
"step": 1054
},
{
"epoch": 0.5981855981855981,
"grad_norm": 2.3996708393096924,
"learning_rate": 4.898213438673962e-06,
"loss": 1.0521,
"step": 1055
},
{
"epoch": 0.5987525987525988,
"grad_norm": 2.2481696605682373,
"learning_rate": 4.898001623718439e-06,
"loss": 1.1748,
"step": 1056
},
{
"epoch": 0.5993195993195993,
"grad_norm": 2.2351038455963135,
"learning_rate": 4.897789593190878e-06,
"loss": 1.1295,
"step": 1057
},
{
"epoch": 0.5998865998865999,
"grad_norm": 2.008779764175415,
"learning_rate": 4.897577347110339e-06,
"loss": 1.1014,
"step": 1058
},
{
"epoch": 0.6004536004536004,
"grad_norm": 2.0178744792938232,
"learning_rate": 4.897364885495905e-06,
"loss": 1.0628,
"step": 1059
},
{
"epoch": 0.6010206010206011,
"grad_norm": 1.9279972314834595,
"learning_rate": 4.8971522083666735e-06,
"loss": 1.0835,
"step": 1060
},
{
"epoch": 0.6015876015876016,
"grad_norm": 2.0269901752471924,
"learning_rate": 4.896939315741765e-06,
"loss": 1.097,
"step": 1061
},
{
"epoch": 0.6021546021546021,
"grad_norm": 2.0053329467773438,
"learning_rate": 4.896726207640315e-06,
"loss": 1.1419,
"step": 1062
},
{
"epoch": 0.6027216027216027,
"grad_norm": 2.123281478881836,
"learning_rate": 4.896512884081484e-06,
"loss": 1.1296,
"step": 1063
},
{
"epoch": 0.6032886032886033,
"grad_norm": 2.836108922958374,
"learning_rate": 4.896299345084447e-06,
"loss": 1.0291,
"step": 1064
},
{
"epoch": 0.6038556038556039,
"grad_norm": 2.102825880050659,
"learning_rate": 4.896085590668402e-06,
"loss": 1.1113,
"step": 1065
},
{
"epoch": 0.6044226044226044,
"grad_norm": 2.155285358428955,
"learning_rate": 4.895871620852564e-06,
"loss": 1.1262,
"step": 1066
},
{
"epoch": 0.604989604989605,
"grad_norm": 2.055398464202881,
"learning_rate": 4.895657435656168e-06,
"loss": 1.0959,
"step": 1067
},
{
"epoch": 0.6055566055566055,
"grad_norm": 1.9637835025787354,
"learning_rate": 4.8954430350984685e-06,
"loss": 1.1196,
"step": 1068
},
{
"epoch": 0.6061236061236062,
"grad_norm": 2.1283085346221924,
"learning_rate": 4.895228419198741e-06,
"loss": 1.1331,
"step": 1069
},
{
"epoch": 0.6066906066906067,
"grad_norm": 1.9687258005142212,
"learning_rate": 4.895013587976276e-06,
"loss": 1.0903,
"step": 1070
},
{
"epoch": 0.6072576072576072,
"grad_norm": 1.920413613319397,
"learning_rate": 4.8947985414503876e-06,
"loss": 1.0724,
"step": 1071
},
{
"epoch": 0.6078246078246078,
"grad_norm": 2.2596044540405273,
"learning_rate": 4.894583279640408e-06,
"loss": 1.1474,
"step": 1072
},
{
"epoch": 0.6083916083916084,
"grad_norm": 2.154895544052124,
"learning_rate": 4.894367802565688e-06,
"loss": 1.1321,
"step": 1073
},
{
"epoch": 0.608958608958609,
"grad_norm": 2.055975914001465,
"learning_rate": 4.894152110245599e-06,
"loss": 1.1589,
"step": 1074
},
{
"epoch": 0.6095256095256095,
"grad_norm": 2.0067338943481445,
"learning_rate": 4.8939362026995295e-06,
"loss": 1.0869,
"step": 1075
},
{
"epoch": 0.6100926100926101,
"grad_norm": 1.965383529663086,
"learning_rate": 4.89372007994689e-06,
"loss": 1.0597,
"step": 1076
},
{
"epoch": 0.6106596106596106,
"grad_norm": 2.137803316116333,
"learning_rate": 4.893503742007108e-06,
"loss": 1.1084,
"step": 1077
},
{
"epoch": 0.6112266112266113,
"grad_norm": 1.892305850982666,
"learning_rate": 4.893287188899633e-06,
"loss": 1.0763,
"step": 1078
},
{
"epoch": 0.6117936117936118,
"grad_norm": 2.238926649093628,
"learning_rate": 4.893070420643932e-06,
"loss": 1.1118,
"step": 1079
},
{
"epoch": 0.6123606123606123,
"grad_norm": 2.2139806747436523,
"learning_rate": 4.892853437259491e-06,
"loss": 1.1198,
"step": 1080
},
{
"epoch": 0.6129276129276129,
"grad_norm": 2.139768600463867,
"learning_rate": 4.892636238765817e-06,
"loss": 1.1724,
"step": 1081
},
{
"epoch": 0.6134946134946135,
"grad_norm": 2.024958848953247,
"learning_rate": 4.892418825182435e-06,
"loss": 1.1211,
"step": 1082
},
{
"epoch": 0.6140616140616141,
"grad_norm": 2.0417838096618652,
"learning_rate": 4.892201196528888e-06,
"loss": 1.1068,
"step": 1083
},
{
"epoch": 0.6146286146286146,
"grad_norm": 2.1328155994415283,
"learning_rate": 4.891983352824744e-06,
"loss": 1.0458,
"step": 1084
},
{
"epoch": 0.6151956151956152,
"grad_norm": 2.0734100341796875,
"learning_rate": 4.891765294089583e-06,
"loss": 1.0968,
"step": 1085
},
{
"epoch": 0.6157626157626158,
"grad_norm": 2.066288709640503,
"learning_rate": 4.891547020343009e-06,
"loss": 1.0867,
"step": 1086
},
{
"epoch": 0.6163296163296164,
"grad_norm": 2.0422234535217285,
"learning_rate": 4.891328531604643e-06,
"loss": 1.0878,
"step": 1087
},
{
"epoch": 0.6168966168966169,
"grad_norm": 2.113037347793579,
"learning_rate": 4.891109827894129e-06,
"loss": 1.0847,
"step": 1088
},
{
"epoch": 0.6174636174636174,
"grad_norm": 2.0433382987976074,
"learning_rate": 4.890890909231124e-06,
"loss": 1.0748,
"step": 1089
},
{
"epoch": 0.618030618030618,
"grad_norm": 2.0215182304382324,
"learning_rate": 4.890671775635311e-06,
"loss": 1.1735,
"step": 1090
},
{
"epoch": 0.6185976185976186,
"grad_norm": 1.9553345441818237,
"learning_rate": 4.890452427126389e-06,
"loss": 1.1418,
"step": 1091
},
{
"epoch": 0.6191646191646192,
"grad_norm": 2.063011646270752,
"learning_rate": 4.890232863724075e-06,
"loss": 1.1011,
"step": 1092
},
{
"epoch": 0.6197316197316197,
"grad_norm": 1.9352067708969116,
"learning_rate": 4.890013085448108e-06,
"loss": 1.0389,
"step": 1093
},
{
"epoch": 0.6202986202986203,
"grad_norm": 2.1764583587646484,
"learning_rate": 4.889793092318246e-06,
"loss": 1.1362,
"step": 1094
},
{
"epoch": 0.6208656208656209,
"grad_norm": 1.887978434562683,
"learning_rate": 4.889572884354265e-06,
"loss": 1.0563,
"step": 1095
},
{
"epoch": 0.6214326214326215,
"grad_norm": 2.02877140045166,
"learning_rate": 4.88935246157596e-06,
"loss": 1.1007,
"step": 1096
},
{
"epoch": 0.621999621999622,
"grad_norm": 1.9961140155792236,
"learning_rate": 4.889131824003147e-06,
"loss": 1.0799,
"step": 1097
},
{
"epoch": 0.6225666225666225,
"grad_norm": 2.059858798980713,
"learning_rate": 4.888910971655662e-06,
"loss": 1.112,
"step": 1098
},
{
"epoch": 0.6231336231336231,
"grad_norm": 1.9874125719070435,
"learning_rate": 4.888689904553356e-06,
"loss": 1.1,
"step": 1099
},
{
"epoch": 0.6237006237006237,
"grad_norm": 1.9725184440612793,
"learning_rate": 4.8884686227161034e-06,
"loss": 1.0764,
"step": 1100
},
{
"epoch": 0.6242676242676243,
"grad_norm": 2.049431562423706,
"learning_rate": 4.8882471261637985e-06,
"loss": 1.1104,
"step": 1101
},
{
"epoch": 0.6248346248346248,
"grad_norm": 2.1070902347564697,
"learning_rate": 4.888025414916351e-06,
"loss": 1.0904,
"step": 1102
},
{
"epoch": 0.6254016254016254,
"grad_norm": 2.054180145263672,
"learning_rate": 4.8878034889936924e-06,
"loss": 1.1429,
"step": 1103
},
{
"epoch": 0.625968625968626,
"grad_norm": 2.134850263595581,
"learning_rate": 4.887581348415773e-06,
"loss": 1.1664,
"step": 1104
},
{
"epoch": 0.6265356265356266,
"grad_norm": 2.1475751399993896,
"learning_rate": 4.887358993202563e-06,
"loss": 1.1337,
"step": 1105
},
{
"epoch": 0.6271026271026271,
"grad_norm": 2.002305507659912,
"learning_rate": 4.8871364233740505e-06,
"loss": 1.0891,
"step": 1106
},
{
"epoch": 0.6276696276696276,
"grad_norm": 1.9193668365478516,
"learning_rate": 4.886913638950245e-06,
"loss": 1.0568,
"step": 1107
},
{
"epoch": 0.6282366282366283,
"grad_norm": 2.232956886291504,
"learning_rate": 4.886690639951173e-06,
"loss": 1.1313,
"step": 1108
},
{
"epoch": 0.6288036288036288,
"grad_norm": 2.0268003940582275,
"learning_rate": 4.8864674263968815e-06,
"loss": 1.0933,
"step": 1109
},
{
"epoch": 0.6293706293706294,
"grad_norm": 2.0502026081085205,
"learning_rate": 4.886243998307436e-06,
"loss": 1.101,
"step": 1110
},
{
"epoch": 0.6299376299376299,
"grad_norm": 2.0282256603240967,
"learning_rate": 4.886020355702924e-06,
"loss": 1.088,
"step": 1111
},
{
"epoch": 0.6305046305046305,
"grad_norm": 2.0302987098693848,
"learning_rate": 4.885796498603448e-06,
"loss": 1.1083,
"step": 1112
},
{
"epoch": 0.6310716310716311,
"grad_norm": 2.0896990299224854,
"learning_rate": 4.885572427029133e-06,
"loss": 1.1143,
"step": 1113
},
{
"epoch": 0.6316386316386317,
"grad_norm": 1.9054166078567505,
"learning_rate": 4.8853481410001225e-06,
"loss": 1.0576,
"step": 1114
},
{
"epoch": 0.6322056322056322,
"grad_norm": 2.0707859992980957,
"learning_rate": 4.885123640536579e-06,
"loss": 1.1889,
"step": 1115
},
{
"epoch": 0.6327726327726327,
"grad_norm": 2.1405484676361084,
"learning_rate": 4.884898925658683e-06,
"loss": 1.0916,
"step": 1116
},
{
"epoch": 0.6333396333396334,
"grad_norm": 2.050922155380249,
"learning_rate": 4.884673996386637e-06,
"loss": 1.0856,
"step": 1117
},
{
"epoch": 0.6339066339066339,
"grad_norm": 2.1457486152648926,
"learning_rate": 4.884448852740661e-06,
"loss": 1.0539,
"step": 1118
},
{
"epoch": 0.6344736344736345,
"grad_norm": 2.2316818237304688,
"learning_rate": 4.884223494740994e-06,
"loss": 1.094,
"step": 1119
},
{
"epoch": 0.635040635040635,
"grad_norm": 1.9859856367111206,
"learning_rate": 4.8839979224078955e-06,
"loss": 1.1308,
"step": 1120
},
{
"epoch": 0.6356076356076356,
"grad_norm": 2.208192825317383,
"learning_rate": 4.883772135761644e-06,
"loss": 1.1239,
"step": 1121
},
{
"epoch": 0.6361746361746362,
"grad_norm": 1.978887677192688,
"learning_rate": 4.8835461348225365e-06,
"loss": 1.0661,
"step": 1122
},
{
"epoch": 0.6367416367416368,
"grad_norm": 2.0355641841888428,
"learning_rate": 4.88331991961089e-06,
"loss": 1.0705,
"step": 1123
},
{
"epoch": 0.6373086373086373,
"grad_norm": 2.0058109760284424,
"learning_rate": 4.8830934901470405e-06,
"loss": 1.0777,
"step": 1124
},
{
"epoch": 0.6378756378756378,
"grad_norm": 2.11564564704895,
"learning_rate": 4.882866846451342e-06,
"loss": 1.0747,
"step": 1125
},
{
"epoch": 0.6384426384426385,
"grad_norm": 2.005120277404785,
"learning_rate": 4.88263998854417e-06,
"loss": 1.0923,
"step": 1126
},
{
"epoch": 0.639009639009639,
"grad_norm": 2.0766701698303223,
"learning_rate": 4.882412916445919e-06,
"loss": 1.1058,
"step": 1127
},
{
"epoch": 0.6395766395766396,
"grad_norm": 2.267301082611084,
"learning_rate": 4.8821856301770004e-06,
"loss": 1.0833,
"step": 1128
},
{
"epoch": 0.6401436401436401,
"grad_norm": 2.0669078826904297,
"learning_rate": 4.881958129757848e-06,
"loss": 1.1013,
"step": 1129
},
{
"epoch": 0.6407106407106407,
"grad_norm": 2.0686910152435303,
"learning_rate": 4.8817304152089115e-06,
"loss": 1.0707,
"step": 1130
},
{
"epoch": 0.6412776412776413,
"grad_norm": 2.045891523361206,
"learning_rate": 4.881502486550663e-06,
"loss": 1.0751,
"step": 1131
},
{
"epoch": 0.6418446418446418,
"grad_norm": 2.202099323272705,
"learning_rate": 4.881274343803593e-06,
"loss": 1.1599,
"step": 1132
},
{
"epoch": 0.6424116424116424,
"grad_norm": 2.1953632831573486,
"learning_rate": 4.881045986988209e-06,
"loss": 1.1134,
"step": 1133
},
{
"epoch": 0.6429786429786429,
"grad_norm": 2.0201382637023926,
"learning_rate": 4.88081741612504e-06,
"loss": 1.1127,
"step": 1134
},
{
"epoch": 0.6435456435456436,
"grad_norm": 2.0316286087036133,
"learning_rate": 4.880588631234635e-06,
"loss": 1.1194,
"step": 1135
},
{
"epoch": 0.6441126441126441,
"grad_norm": 1.9625301361083984,
"learning_rate": 4.88035963233756e-06,
"loss": 1.1443,
"step": 1136
},
{
"epoch": 0.6446796446796447,
"grad_norm": 1.9853287935256958,
"learning_rate": 4.8801304194544006e-06,
"loss": 1.0763,
"step": 1137
},
{
"epoch": 0.6452466452466452,
"grad_norm": 2.0005953311920166,
"learning_rate": 4.879900992605764e-06,
"loss": 1.1616,
"step": 1138
},
{
"epoch": 0.6458136458136459,
"grad_norm": 2.028704881668091,
"learning_rate": 4.879671351812273e-06,
"loss": 1.1073,
"step": 1139
},
{
"epoch": 0.6463806463806464,
"grad_norm": 1.998048186302185,
"learning_rate": 4.879441497094572e-06,
"loss": 1.1021,
"step": 1140
},
{
"epoch": 0.646947646947647,
"grad_norm": 2.0080220699310303,
"learning_rate": 4.8792114284733264e-06,
"loss": 1.137,
"step": 1141
},
{
"epoch": 0.6475146475146475,
"grad_norm": 2.1290740966796875,
"learning_rate": 4.878981145969215e-06,
"loss": 1.0943,
"step": 1142
},
{
"epoch": 0.648081648081648,
"grad_norm": 1.9962797164916992,
"learning_rate": 4.8787506496029416e-06,
"loss": 1.0646,
"step": 1143
},
{
"epoch": 0.6486486486486487,
"grad_norm": 2.1631579399108887,
"learning_rate": 4.878519939395225e-06,
"loss": 1.219,
"step": 1144
},
{
"epoch": 0.6492156492156492,
"grad_norm": 2.115211009979248,
"learning_rate": 4.8782890153668085e-06,
"loss": 1.1076,
"step": 1145
},
{
"epoch": 0.6497826497826498,
"grad_norm": 2.169215440750122,
"learning_rate": 4.878057877538449e-06,
"loss": 1.1226,
"step": 1146
},
{
"epoch": 0.6503496503496503,
"grad_norm": 1.9753289222717285,
"learning_rate": 4.877826525930925e-06,
"loss": 1.0658,
"step": 1147
},
{
"epoch": 0.650916650916651,
"grad_norm": 2.0990424156188965,
"learning_rate": 4.877594960565036e-06,
"loss": 1.1459,
"step": 1148
},
{
"epoch": 0.6514836514836515,
"grad_norm": 1.9168016910552979,
"learning_rate": 4.877363181461598e-06,
"loss": 1.0719,
"step": 1149
},
{
"epoch": 0.652050652050652,
"grad_norm": 2.021908760070801,
"learning_rate": 4.877131188641445e-06,
"loss": 1.1096,
"step": 1150
},
{
"epoch": 0.6526176526176526,
"grad_norm": 2.1016788482666016,
"learning_rate": 4.876898982125435e-06,
"loss": 1.1969,
"step": 1151
},
{
"epoch": 0.6531846531846531,
"grad_norm": 2.069887399673462,
"learning_rate": 4.876666561934442e-06,
"loss": 1.1377,
"step": 1152
},
{
"epoch": 0.6537516537516538,
"grad_norm": 2.016388416290283,
"learning_rate": 4.876433928089359e-06,
"loss": 1.0541,
"step": 1153
},
{
"epoch": 0.6543186543186543,
"grad_norm": 2.005340814590454,
"learning_rate": 4.8762010806111e-06,
"loss": 1.102,
"step": 1154
},
{
"epoch": 0.6548856548856549,
"grad_norm": 2.129586696624756,
"learning_rate": 4.875968019520596e-06,
"loss": 1.0463,
"step": 1155
},
{
"epoch": 0.6554526554526554,
"grad_norm": 2.067054510116577,
"learning_rate": 4.8757347448388e-06,
"loss": 1.0828,
"step": 1156
},
{
"epoch": 0.6560196560196561,
"grad_norm": 1.9568535089492798,
"learning_rate": 4.875501256586682e-06,
"loss": 1.1493,
"step": 1157
},
{
"epoch": 0.6565866565866566,
"grad_norm": 2.120835542678833,
"learning_rate": 4.8752675547852304e-06,
"loss": 1.1571,
"step": 1158
},
{
"epoch": 0.6571536571536571,
"grad_norm": 1.8922606706619263,
"learning_rate": 4.875033639455455e-06,
"loss": 1.1587,
"step": 1159
},
{
"epoch": 0.6577206577206577,
"grad_norm": 1.973677635192871,
"learning_rate": 4.874799510618385e-06,
"loss": 1.0663,
"step": 1160
},
{
"epoch": 0.6582876582876583,
"grad_norm": 1.9903017282485962,
"learning_rate": 4.874565168295067e-06,
"loss": 1.0962,
"step": 1161
},
{
"epoch": 0.6588546588546589,
"grad_norm": 2.0548391342163086,
"learning_rate": 4.874330612506567e-06,
"loss": 1.1234,
"step": 1162
},
{
"epoch": 0.6594216594216594,
"grad_norm": 2.2835071086883545,
"learning_rate": 4.874095843273972e-06,
"loss": 1.1057,
"step": 1163
},
{
"epoch": 0.65998865998866,
"grad_norm": 2.045727252960205,
"learning_rate": 4.873860860618386e-06,
"loss": 1.0984,
"step": 1164
},
{
"epoch": 0.6605556605556605,
"grad_norm": 2.0151989459991455,
"learning_rate": 4.8736256645609325e-06,
"loss": 1.0752,
"step": 1165
},
{
"epoch": 0.6611226611226612,
"grad_norm": 2.1338889598846436,
"learning_rate": 4.873390255122756e-06,
"loss": 1.1256,
"step": 1166
},
{
"epoch": 0.6616896616896617,
"grad_norm": 2.144221782684326,
"learning_rate": 4.873154632325019e-06,
"loss": 1.1575,
"step": 1167
},
{
"epoch": 0.6622566622566622,
"grad_norm": 1.9910509586334229,
"learning_rate": 4.872918796188903e-06,
"loss": 1.0589,
"step": 1168
},
{
"epoch": 0.6628236628236628,
"grad_norm": 2.05189847946167,
"learning_rate": 4.872682746735609e-06,
"loss": 1.1265,
"step": 1169
},
{
"epoch": 0.6633906633906634,
"grad_norm": 2.0158932209014893,
"learning_rate": 4.872446483986355e-06,
"loss": 1.0658,
"step": 1170
},
{
"epoch": 0.663957663957664,
"grad_norm": 2.1114933490753174,
"learning_rate": 4.872210007962384e-06,
"loss": 1.129,
"step": 1171
},
{
"epoch": 0.6645246645246645,
"grad_norm": 1.9263200759887695,
"learning_rate": 4.871973318684951e-06,
"loss": 1.0575,
"step": 1172
},
{
"epoch": 0.6650916650916651,
"grad_norm": 2.173271656036377,
"learning_rate": 4.871736416175335e-06,
"loss": 1.1171,
"step": 1173
},
{
"epoch": 0.6656586656586656,
"grad_norm": 1.9181896448135376,
"learning_rate": 4.871499300454832e-06,
"loss": 1.0679,
"step": 1174
},
{
"epoch": 0.6662256662256663,
"grad_norm": 2.257871627807617,
"learning_rate": 4.8712619715447596e-06,
"loss": 1.1906,
"step": 1175
},
{
"epoch": 0.6667926667926668,
"grad_norm": 2.0567474365234375,
"learning_rate": 4.871024429466451e-06,
"loss": 1.1113,
"step": 1176
},
{
"epoch": 0.6673596673596673,
"grad_norm": 2.1029961109161377,
"learning_rate": 4.870786674241262e-06,
"loss": 1.0982,
"step": 1177
},
{
"epoch": 0.6679266679266679,
"grad_norm": 2.1588408946990967,
"learning_rate": 4.870548705890565e-06,
"loss": 1.1198,
"step": 1178
},
{
"epoch": 0.6684936684936685,
"grad_norm": 2.129164695739746,
"learning_rate": 4.8703105244357504e-06,
"loss": 1.1259,
"step": 1179
},
{
"epoch": 0.6690606690606691,
"grad_norm": 2.0273795127868652,
"learning_rate": 4.870072129898235e-06,
"loss": 1.1151,
"step": 1180
},
{
"epoch": 0.6696276696276696,
"grad_norm": 2.060084342956543,
"learning_rate": 4.8698335222994446e-06,
"loss": 1.0883,
"step": 1181
},
{
"epoch": 0.6701946701946702,
"grad_norm": 2.05039119720459,
"learning_rate": 4.869594701660832e-06,
"loss": 1.1613,
"step": 1182
},
{
"epoch": 0.6707616707616708,
"grad_norm": 1.8904789686203003,
"learning_rate": 4.869355668003866e-06,
"loss": 1.0545,
"step": 1183
},
{
"epoch": 0.6713286713286714,
"grad_norm": 2.0806965827941895,
"learning_rate": 4.8691164213500345e-06,
"loss": 1.0701,
"step": 1184
},
{
"epoch": 0.6718956718956719,
"grad_norm": 1.8995743989944458,
"learning_rate": 4.868876961720844e-06,
"loss": 1.117,
"step": 1185
},
{
"epoch": 0.6724626724626724,
"grad_norm": 1.8391963243484497,
"learning_rate": 4.868637289137823e-06,
"loss": 1.0774,
"step": 1186
},
{
"epoch": 0.673029673029673,
"grad_norm": 1.979702115058899,
"learning_rate": 4.8683974036225165e-06,
"loss": 1.1233,
"step": 1187
},
{
"epoch": 0.6735966735966736,
"grad_norm": 1.955850601196289,
"learning_rate": 4.868157305196489e-06,
"loss": 1.1159,
"step": 1188
},
{
"epoch": 0.6741636741636742,
"grad_norm": 2.1158955097198486,
"learning_rate": 4.867916993881324e-06,
"loss": 1.1319,
"step": 1189
},
{
"epoch": 0.6747306747306747,
"grad_norm": 2.1634299755096436,
"learning_rate": 4.867676469698627e-06,
"loss": 1.1027,
"step": 1190
},
{
"epoch": 0.6752976752976753,
"grad_norm": 2.337096691131592,
"learning_rate": 4.867435732670017e-06,
"loss": 1.2458,
"step": 1191
},
{
"epoch": 0.6758646758646759,
"grad_norm": 1.9703235626220703,
"learning_rate": 4.867194782817138e-06,
"loss": 1.0695,
"step": 1192
},
{
"epoch": 0.6764316764316765,
"grad_norm": 2.1863348484039307,
"learning_rate": 4.8669536201616495e-06,
"loss": 1.061,
"step": 1193
},
{
"epoch": 0.676998676998677,
"grad_norm": 2.2519869804382324,
"learning_rate": 4.866712244725232e-06,
"loss": 1.1289,
"step": 1194
},
{
"epoch": 0.6775656775656775,
"grad_norm": 2.0396711826324463,
"learning_rate": 4.866470656529581e-06,
"loss": 1.0968,
"step": 1195
},
{
"epoch": 0.6781326781326781,
"grad_norm": 2.0688552856445312,
"learning_rate": 4.86622885559642e-06,
"loss": 1.1406,
"step": 1196
},
{
"epoch": 0.6786996786996787,
"grad_norm": 2.0836985111236572,
"learning_rate": 4.865986841947482e-06,
"loss": 1.0849,
"step": 1197
},
{
"epoch": 0.6792666792666793,
"grad_norm": 2.044564962387085,
"learning_rate": 4.8657446156045245e-06,
"loss": 1.1471,
"step": 1198
},
{
"epoch": 0.6798336798336798,
"grad_norm": 2.9498212337493896,
"learning_rate": 4.865502176589323e-06,
"loss": 1.1025,
"step": 1199
},
{
"epoch": 0.6804006804006804,
"grad_norm": 2.525343179702759,
"learning_rate": 4.865259524923671e-06,
"loss": 1.1382,
"step": 1200
},
{
"epoch": 0.680967680967681,
"grad_norm": 2.1106417179107666,
"learning_rate": 4.865016660629383e-06,
"loss": 1.1217,
"step": 1201
},
{
"epoch": 0.6815346815346816,
"grad_norm": 1.9746856689453125,
"learning_rate": 4.864773583728291e-06,
"loss": 1.1246,
"step": 1202
},
{
"epoch": 0.6821016821016821,
"grad_norm": 2.0932469367980957,
"learning_rate": 4.864530294242247e-06,
"loss": 1.0897,
"step": 1203
},
{
"epoch": 0.6826686826686826,
"grad_norm": 2.091334342956543,
"learning_rate": 4.864286792193122e-06,
"loss": 1.0791,
"step": 1204
},
{
"epoch": 0.6832356832356833,
"grad_norm": 2.004572629928589,
"learning_rate": 4.864043077602807e-06,
"loss": 1.1148,
"step": 1205
},
{
"epoch": 0.6838026838026838,
"grad_norm": 2.5519580841064453,
"learning_rate": 4.863799150493209e-06,
"loss": 1.0816,
"step": 1206
},
{
"epoch": 0.6843696843696844,
"grad_norm": 2.1835830211639404,
"learning_rate": 4.863555010886257e-06,
"loss": 1.158,
"step": 1207
},
{
"epoch": 0.6849366849366849,
"grad_norm": 2.0027101039886475,
"learning_rate": 4.8633106588038995e-06,
"loss": 1.142,
"step": 1208
},
{
"epoch": 0.6855036855036855,
"grad_norm": 2.0194194316864014,
"learning_rate": 4.8630660942681004e-06,
"loss": 1.0262,
"step": 1209
},
{
"epoch": 0.6860706860706861,
"grad_norm": 1.973973274230957,
"learning_rate": 4.862821317300848e-06,
"loss": 1.0618,
"step": 1210
},
{
"epoch": 0.6866376866376867,
"grad_norm": 2.0087716579437256,
"learning_rate": 4.862576327924145e-06,
"loss": 1.1234,
"step": 1211
},
{
"epoch": 0.6872046872046872,
"grad_norm": 2.050034284591675,
"learning_rate": 4.862331126160017e-06,
"loss": 1.1161,
"step": 1212
},
{
"epoch": 0.6877716877716877,
"grad_norm": 1.87214994430542,
"learning_rate": 4.8620857120305045e-06,
"loss": 1.1049,
"step": 1213
},
{
"epoch": 0.6883386883386884,
"grad_norm": 2.108328104019165,
"learning_rate": 4.861840085557671e-06,
"loss": 1.0142,
"step": 1214
},
{
"epoch": 0.6889056889056889,
"grad_norm": 1.8961143493652344,
"learning_rate": 4.861594246763596e-06,
"loss": 1.0867,
"step": 1215
},
{
"epoch": 0.6894726894726895,
"grad_norm": 2.029620885848999,
"learning_rate": 4.861348195670381e-06,
"loss": 1.122,
"step": 1216
},
{
"epoch": 0.69003969003969,
"grad_norm": 1.9730783700942993,
"learning_rate": 4.861101932300144e-06,
"loss": 1.0595,
"step": 1217
},
{
"epoch": 0.6906066906066906,
"grad_norm": 2.0151636600494385,
"learning_rate": 4.860855456675024e-06,
"loss": 1.1576,
"step": 1218
},
{
"epoch": 0.6911736911736912,
"grad_norm": 1.9657617807388306,
"learning_rate": 4.8606087688171786e-06,
"loss": 1.0515,
"step": 1219
},
{
"epoch": 0.6917406917406917,
"grad_norm": 2.0308215618133545,
"learning_rate": 4.860361868748783e-06,
"loss": 1.0984,
"step": 1220
},
{
"epoch": 0.6923076923076923,
"grad_norm": 2.583967685699463,
"learning_rate": 4.860114756492034e-06,
"loss": 1.1196,
"step": 1221
},
{
"epoch": 0.6928746928746928,
"grad_norm": 1.9201866388320923,
"learning_rate": 4.859867432069145e-06,
"loss": 1.0976,
"step": 1222
},
{
"epoch": 0.6934416934416935,
"grad_norm": 1.9514093399047852,
"learning_rate": 4.859619895502351e-06,
"loss": 1.1566,
"step": 1223
},
{
"epoch": 0.694008694008694,
"grad_norm": 2.0827386379241943,
"learning_rate": 4.859372146813903e-06,
"loss": 1.1449,
"step": 1224
},
{
"epoch": 0.6945756945756946,
"grad_norm": 2.1744086742401123,
"learning_rate": 4.859124186026074e-06,
"loss": 1.147,
"step": 1225
},
{
"epoch": 0.6951426951426951,
"grad_norm": 1.9219239950180054,
"learning_rate": 4.858876013161153e-06,
"loss": 1.0744,
"step": 1226
},
{
"epoch": 0.6957096957096958,
"grad_norm": 2.0406641960144043,
"learning_rate": 4.858627628241453e-06,
"loss": 1.0826,
"step": 1227
},
{
"epoch": 0.6962766962766963,
"grad_norm": 1.9926422834396362,
"learning_rate": 4.8583790312893005e-06,
"loss": 1.0833,
"step": 1228
},
{
"epoch": 0.6968436968436968,
"grad_norm": 2.0115749835968018,
"learning_rate": 4.858130222327044e-06,
"loss": 1.1028,
"step": 1229
},
{
"epoch": 0.6974106974106974,
"grad_norm": 2.016007900238037,
"learning_rate": 4.85788120137705e-06,
"loss": 1.0219,
"step": 1230
},
{
"epoch": 0.6979776979776979,
"grad_norm": 1.9939322471618652,
"learning_rate": 4.8576319684617064e-06,
"loss": 1.1327,
"step": 1231
},
{
"epoch": 0.6985446985446986,
"grad_norm": 2.1431961059570312,
"learning_rate": 4.8573825236034175e-06,
"loss": 1.1653,
"step": 1232
},
{
"epoch": 0.6991116991116991,
"grad_norm": 2.201347589492798,
"learning_rate": 4.857132866824607e-06,
"loss": 1.105,
"step": 1233
},
{
"epoch": 0.6996786996786997,
"grad_norm": 2.0271849632263184,
"learning_rate": 4.856882998147719e-06,
"loss": 1.1261,
"step": 1234
},
{
"epoch": 0.7002457002457002,
"grad_norm": 1.9960558414459229,
"learning_rate": 4.856632917595214e-06,
"loss": 1.137,
"step": 1235
},
{
"epoch": 0.7008127008127009,
"grad_norm": 1.947149634361267,
"learning_rate": 4.856382625189576e-06,
"loss": 1.0938,
"step": 1236
},
{
"epoch": 0.7013797013797014,
"grad_norm": 2.0694658756256104,
"learning_rate": 4.856132120953304e-06,
"loss": 1.1362,
"step": 1237
},
{
"epoch": 0.701946701946702,
"grad_norm": 1.90565025806427,
"learning_rate": 4.8558814049089174e-06,
"loss": 1.0873,
"step": 1238
},
{
"epoch": 0.7025137025137025,
"grad_norm": 1.8799278736114502,
"learning_rate": 4.8556304770789545e-06,
"loss": 1.0283,
"step": 1239
},
{
"epoch": 0.703080703080703,
"grad_norm": 2.064629554748535,
"learning_rate": 4.855379337485973e-06,
"loss": 1.0587,
"step": 1240
},
{
"epoch": 0.7036477036477037,
"grad_norm": 2.037976026535034,
"learning_rate": 4.8551279861525515e-06,
"loss": 1.0816,
"step": 1241
},
{
"epoch": 0.7042147042147042,
"grad_norm": 2.141584873199463,
"learning_rate": 4.854876423101283e-06,
"loss": 1.156,
"step": 1242
},
{
"epoch": 0.7047817047817048,
"grad_norm": 1.953120470046997,
"learning_rate": 4.854624648354782e-06,
"loss": 1.1033,
"step": 1243
},
{
"epoch": 0.7053487053487053,
"grad_norm": 2.0960190296173096,
"learning_rate": 4.8543726619356846e-06,
"loss": 1.1671,
"step": 1244
},
{
"epoch": 0.705915705915706,
"grad_norm": 2.338067054748535,
"learning_rate": 4.854120463866641e-06,
"loss": 1.1358,
"step": 1245
},
{
"epoch": 0.7064827064827065,
"grad_norm": 2.08161997795105,
"learning_rate": 4.8538680541703245e-06,
"loss": 1.1562,
"step": 1246
},
{
"epoch": 0.707049707049707,
"grad_norm": 2.118591785430908,
"learning_rate": 4.853615432869425e-06,
"loss": 1.1358,
"step": 1247
},
{
"epoch": 0.7076167076167076,
"grad_norm": 2.370490074157715,
"learning_rate": 4.853362599986653e-06,
"loss": 1.1824,
"step": 1248
},
{
"epoch": 0.7081837081837082,
"grad_norm": 1.973972201347351,
"learning_rate": 4.853109555544737e-06,
"loss": 1.1056,
"step": 1249
},
{
"epoch": 0.7087507087507088,
"grad_norm": 2.1963696479797363,
"learning_rate": 4.852856299566425e-06,
"loss": 1.1267,
"step": 1250
},
{
"epoch": 0.7093177093177093,
"grad_norm": 2.1779510974884033,
"learning_rate": 4.852602832074483e-06,
"loss": 1.1242,
"step": 1251
},
{
"epoch": 0.7098847098847099,
"grad_norm": 1.9847594499588013,
"learning_rate": 4.852349153091699e-06,
"loss": 1.0993,
"step": 1252
},
{
"epoch": 0.7104517104517104,
"grad_norm": 2.06015944480896,
"learning_rate": 4.852095262640875e-06,
"loss": 1.1279,
"step": 1253
},
{
"epoch": 0.7110187110187111,
"grad_norm": 1.868611454963684,
"learning_rate": 4.851841160744836e-06,
"loss": 1.118,
"step": 1254
},
{
"epoch": 0.7115857115857116,
"grad_norm": 2.069683074951172,
"learning_rate": 4.851586847426426e-06,
"loss": 1.109,
"step": 1255
},
{
"epoch": 0.7121527121527121,
"grad_norm": 2.7425143718719482,
"learning_rate": 4.8513323227085055e-06,
"loss": 1.09,
"step": 1256
},
{
"epoch": 0.7127197127197127,
"grad_norm": 2.0621962547302246,
"learning_rate": 4.8510775866139556e-06,
"loss": 1.1183,
"step": 1257
},
{
"epoch": 0.7132867132867133,
"grad_norm": 2.1513285636901855,
"learning_rate": 4.850822639165676e-06,
"loss": 1.0869,
"step": 1258
},
{
"epoch": 0.7138537138537139,
"grad_norm": 1.9639217853546143,
"learning_rate": 4.850567480386586e-06,
"loss": 1.0739,
"step": 1259
},
{
"epoch": 0.7144207144207144,
"grad_norm": 2.014523983001709,
"learning_rate": 4.850312110299625e-06,
"loss": 1.1124,
"step": 1260
},
{
"epoch": 0.714987714987715,
"grad_norm": 1.9941346645355225,
"learning_rate": 4.850056528927748e-06,
"loss": 1.0457,
"step": 1261
},
{
"epoch": 0.7155547155547155,
"grad_norm": 1.8968991041183472,
"learning_rate": 4.8498007362939304e-06,
"loss": 1.0871,
"step": 1262
},
{
"epoch": 0.7161217161217162,
"grad_norm": 1.9723222255706787,
"learning_rate": 4.8495447324211685e-06,
"loss": 1.0315,
"step": 1263
},
{
"epoch": 0.7166887166887167,
"grad_norm": 2.0764787197113037,
"learning_rate": 4.849288517332476e-06,
"loss": 1.1086,
"step": 1264
},
{
"epoch": 0.7172557172557172,
"grad_norm": 2.0694315433502197,
"learning_rate": 4.849032091050885e-06,
"loss": 1.0452,
"step": 1265
},
{
"epoch": 0.7178227178227178,
"grad_norm": 1.9298354387283325,
"learning_rate": 4.848775453599448e-06,
"loss": 1.0481,
"step": 1266
},
{
"epoch": 0.7183897183897184,
"grad_norm": 2.0368900299072266,
"learning_rate": 4.848518605001235e-06,
"loss": 1.068,
"step": 1267
},
{
"epoch": 0.718956718956719,
"grad_norm": 1.9808545112609863,
"learning_rate": 4.848261545279337e-06,
"loss": 1.0679,
"step": 1268
},
{
"epoch": 0.7195237195237195,
"grad_norm": 2.044835090637207,
"learning_rate": 4.848004274456861e-06,
"loss": 1.0736,
"step": 1269
},
{
"epoch": 0.7200907200907201,
"grad_norm": 2.0716989040374756,
"learning_rate": 4.8477467925569365e-06,
"loss": 1.0793,
"step": 1270
},
{
"epoch": 0.7206577206577207,
"grad_norm": 2.009671688079834,
"learning_rate": 4.84748909960271e-06,
"loss": 1.1016,
"step": 1271
},
{
"epoch": 0.7212247212247213,
"grad_norm": 2.013240098953247,
"learning_rate": 4.847231195617346e-06,
"loss": 1.0615,
"step": 1272
},
{
"epoch": 0.7217917217917218,
"grad_norm": 1.89451265335083,
"learning_rate": 4.8469730806240305e-06,
"loss": 1.0454,
"step": 1273
},
{
"epoch": 0.7223587223587223,
"grad_norm": 2.019988536834717,
"learning_rate": 4.846714754645967e-06,
"loss": 1.0882,
"step": 1274
},
{
"epoch": 0.7229257229257229,
"grad_norm": 2.1510188579559326,
"learning_rate": 4.846456217706376e-06,
"loss": 1.1119,
"step": 1275
},
{
"epoch": 0.7234927234927235,
"grad_norm": 2.1269772052764893,
"learning_rate": 4.846197469828503e-06,
"loss": 1.0481,
"step": 1276
},
{
"epoch": 0.7240597240597241,
"grad_norm": 2.001478433609009,
"learning_rate": 4.845938511035605e-06,
"loss": 1.1756,
"step": 1277
},
{
"epoch": 0.7246267246267246,
"grad_norm": 1.980617880821228,
"learning_rate": 4.845679341350963e-06,
"loss": 1.0841,
"step": 1278
},
{
"epoch": 0.7251937251937252,
"grad_norm": 2.010340452194214,
"learning_rate": 4.845419960797876e-06,
"loss": 1.1233,
"step": 1279
},
{
"epoch": 0.7257607257607258,
"grad_norm": 2.12774395942688,
"learning_rate": 4.84516036939966e-06,
"loss": 1.1496,
"step": 1280
},
{
"epoch": 0.7263277263277264,
"grad_norm": 2.0324392318725586,
"learning_rate": 4.844900567179652e-06,
"loss": 1.1279,
"step": 1281
},
{
"epoch": 0.7268947268947269,
"grad_norm": 2.0038788318634033,
"learning_rate": 4.844640554161209e-06,
"loss": 1.0345,
"step": 1282
},
{
"epoch": 0.7274617274617274,
"grad_norm": 2.1524405479431152,
"learning_rate": 4.844380330367701e-06,
"loss": 1.1229,
"step": 1283
},
{
"epoch": 0.728028728028728,
"grad_norm": 1.9963539838790894,
"learning_rate": 4.8441198958225255e-06,
"loss": 1.1124,
"step": 1284
},
{
"epoch": 0.7285957285957286,
"grad_norm": 2.119422197341919,
"learning_rate": 4.843859250549093e-06,
"loss": 1.117,
"step": 1285
},
{
"epoch": 0.7291627291627292,
"grad_norm": 2.045699119567871,
"learning_rate": 4.8435983945708345e-06,
"loss": 1.1382,
"step": 1286
},
{
"epoch": 0.7297297297297297,
"grad_norm": 1.9394176006317139,
"learning_rate": 4.8433373279112e-06,
"loss": 1.1363,
"step": 1287
},
{
"epoch": 0.7302967302967303,
"grad_norm": 2.0747976303100586,
"learning_rate": 4.8430760505936596e-06,
"loss": 1.0777,
"step": 1288
},
{
"epoch": 0.7308637308637309,
"grad_norm": 1.814608097076416,
"learning_rate": 4.842814562641699e-06,
"loss": 1.0878,
"step": 1289
},
{
"epoch": 0.7314307314307315,
"grad_norm": 1.9501152038574219,
"learning_rate": 4.842552864078827e-06,
"loss": 1.1065,
"step": 1290
},
{
"epoch": 0.731997731997732,
"grad_norm": 1.955440878868103,
"learning_rate": 4.8422909549285686e-06,
"loss": 1.0804,
"step": 1291
},
{
"epoch": 0.7325647325647325,
"grad_norm": 2.325143337249756,
"learning_rate": 4.842028835214469e-06,
"loss": 1.1443,
"step": 1292
},
{
"epoch": 0.7331317331317331,
"grad_norm": 1.977352499961853,
"learning_rate": 4.841766504960091e-06,
"loss": 1.1329,
"step": 1293
},
{
"epoch": 0.7336987336987337,
"grad_norm": 2.104295015335083,
"learning_rate": 4.8415039641890185e-06,
"loss": 1.0813,
"step": 1294
},
{
"epoch": 0.7342657342657343,
"grad_norm": 2.1327080726623535,
"learning_rate": 4.841241212924851e-06,
"loss": 1.0582,
"step": 1295
},
{
"epoch": 0.7348327348327348,
"grad_norm": 2.021190643310547,
"learning_rate": 4.840978251191212e-06,
"loss": 1.123,
"step": 1296
},
{
"epoch": 0.7353997353997354,
"grad_norm": 2.0947611331939697,
"learning_rate": 4.840715079011738e-06,
"loss": 1.0579,
"step": 1297
},
{
"epoch": 0.735966735966736,
"grad_norm": 2.0282726287841797,
"learning_rate": 4.840451696410087e-06,
"loss": 1.1516,
"step": 1298
},
{
"epoch": 0.7365337365337365,
"grad_norm": 2.0446856021881104,
"learning_rate": 4.840188103409939e-06,
"loss": 1.1577,
"step": 1299
},
{
"epoch": 0.7371007371007371,
"grad_norm": 1.9985852241516113,
"learning_rate": 4.839924300034988e-06,
"loss": 0.9967,
"step": 1300
},
{
"epoch": 0.7376677376677376,
"grad_norm": 2.1126623153686523,
"learning_rate": 4.839660286308951e-06,
"loss": 1.1316,
"step": 1301
},
{
"epoch": 0.7382347382347383,
"grad_norm": 2.0512776374816895,
"learning_rate": 4.839396062255558e-06,
"loss": 1.1535,
"step": 1302
},
{
"epoch": 0.7388017388017388,
"grad_norm": 2.0764071941375732,
"learning_rate": 4.839131627898565e-06,
"loss": 1.1503,
"step": 1303
},
{
"epoch": 0.7393687393687394,
"grad_norm": 1.9599411487579346,
"learning_rate": 4.838866983261745e-06,
"loss": 1.1099,
"step": 1304
},
{
"epoch": 0.7399357399357399,
"grad_norm": 2.029916524887085,
"learning_rate": 4.838602128368885e-06,
"loss": 1.0437,
"step": 1305
},
{
"epoch": 0.7405027405027405,
"grad_norm": 2.106447219848633,
"learning_rate": 4.838337063243797e-06,
"loss": 1.0959,
"step": 1306
},
{
"epoch": 0.7410697410697411,
"grad_norm": 2.0462288856506348,
"learning_rate": 4.838071787910308e-06,
"loss": 1.0623,
"step": 1307
},
{
"epoch": 0.7416367416367416,
"grad_norm": 1.8768260478973389,
"learning_rate": 4.837806302392266e-06,
"loss": 1.1131,
"step": 1308
},
{
"epoch": 0.7422037422037422,
"grad_norm": 1.8805971145629883,
"learning_rate": 4.837540606713538e-06,
"loss": 1.1141,
"step": 1309
},
{
"epoch": 0.7427707427707427,
"grad_norm": 2.0501909255981445,
"learning_rate": 4.837274700898007e-06,
"loss": 1.096,
"step": 1310
},
{
"epoch": 0.7433377433377434,
"grad_norm": 2.987117052078247,
"learning_rate": 4.837008584969579e-06,
"loss": 1.182,
"step": 1311
},
{
"epoch": 0.7439047439047439,
"grad_norm": 2.1888184547424316,
"learning_rate": 4.836742258952176e-06,
"loss": 1.0684,
"step": 1312
},
{
"epoch": 0.7444717444717445,
"grad_norm": 2.088069438934326,
"learning_rate": 4.836475722869741e-06,
"loss": 1.1147,
"step": 1313
},
{
"epoch": 0.745038745038745,
"grad_norm": 1.9525327682495117,
"learning_rate": 4.836208976746233e-06,
"loss": 1.0685,
"step": 1314
},
{
"epoch": 0.7456057456057456,
"grad_norm": 1.9905927181243896,
"learning_rate": 4.835942020605633e-06,
"loss": 1.1229,
"step": 1315
},
{
"epoch": 0.7461727461727462,
"grad_norm": 2.027682304382324,
"learning_rate": 4.835674854471938e-06,
"loss": 1.0984,
"step": 1316
},
{
"epoch": 0.7467397467397467,
"grad_norm": 1.9673049449920654,
"learning_rate": 4.835407478369166e-06,
"loss": 1.0853,
"step": 1317
},
{
"epoch": 0.7473067473067473,
"grad_norm": 2.155024290084839,
"learning_rate": 4.835139892321353e-06,
"loss": 1.1211,
"step": 1318
},
{
"epoch": 0.7478737478737478,
"grad_norm": 2.1094534397125244,
"learning_rate": 4.834872096352554e-06,
"loss": 1.1137,
"step": 1319
},
{
"epoch": 0.7484407484407485,
"grad_norm": 2.0188000202178955,
"learning_rate": 4.834604090486844e-06,
"loss": 1.0767,
"step": 1320
},
{
"epoch": 0.749007749007749,
"grad_norm": 2.0796914100646973,
"learning_rate": 4.834335874748315e-06,
"loss": 1.1056,
"step": 1321
},
{
"epoch": 0.7495747495747496,
"grad_norm": 1.9424731731414795,
"learning_rate": 4.8340674491610786e-06,
"loss": 1.0779,
"step": 1322
},
{
"epoch": 0.7501417501417501,
"grad_norm": 2.0110368728637695,
"learning_rate": 4.833798813749265e-06,
"loss": 1.125,
"step": 1323
},
{
"epoch": 0.7507087507087508,
"grad_norm": 2.0801455974578857,
"learning_rate": 4.833529968537024e-06,
"loss": 1.1088,
"step": 1324
},
{
"epoch": 0.7512757512757513,
"grad_norm": 2.0912394523620605,
"learning_rate": 4.833260913548524e-06,
"loss": 1.1316,
"step": 1325
},
{
"epoch": 0.7518427518427518,
"grad_norm": 1.9779878854751587,
"learning_rate": 4.832991648807951e-06,
"loss": 1.0939,
"step": 1326
},
{
"epoch": 0.7524097524097524,
"grad_norm": 2.2234911918640137,
"learning_rate": 4.832722174339513e-06,
"loss": 1.096,
"step": 1327
},
{
"epoch": 0.7529767529767529,
"grad_norm": 2.1075234413146973,
"learning_rate": 4.832452490167433e-06,
"loss": 1.1334,
"step": 1328
},
{
"epoch": 0.7535437535437536,
"grad_norm": 2.2699432373046875,
"learning_rate": 4.832182596315956e-06,
"loss": 1.1511,
"step": 1329
},
{
"epoch": 0.7541107541107541,
"grad_norm": 1.9237340688705444,
"learning_rate": 4.8319124928093445e-06,
"loss": 1.0717,
"step": 1330
},
{
"epoch": 0.7546777546777547,
"grad_norm": 1.976044774055481,
"learning_rate": 4.831642179671878e-06,
"loss": 1.0268,
"step": 1331
},
{
"epoch": 0.7552447552447552,
"grad_norm": 1.9518632888793945,
"learning_rate": 4.831371656927858e-06,
"loss": 1.1481,
"step": 1332
},
{
"epoch": 0.7558117558117559,
"grad_norm": 2.0657694339752197,
"learning_rate": 4.831100924601604e-06,
"loss": 1.0781,
"step": 1333
},
{
"epoch": 0.7563787563787564,
"grad_norm": 2.010305166244507,
"learning_rate": 4.830829982717454e-06,
"loss": 1.0608,
"step": 1334
},
{
"epoch": 0.7569457569457569,
"grad_norm": 1.9423797130584717,
"learning_rate": 4.8305588312997635e-06,
"loss": 1.1217,
"step": 1335
},
{
"epoch": 0.7575127575127575,
"grad_norm": 1.993203043937683,
"learning_rate": 4.830287470372909e-06,
"loss": 1.1038,
"step": 1336
},
{
"epoch": 0.758079758079758,
"grad_norm": 1.8566621541976929,
"learning_rate": 4.830015899961285e-06,
"loss": 1.0882,
"step": 1337
},
{
"epoch": 0.7586467586467587,
"grad_norm": 2.2911787033081055,
"learning_rate": 4.829744120089304e-06,
"loss": 1.1048,
"step": 1338
},
{
"epoch": 0.7592137592137592,
"grad_norm": 2.038271188735962,
"learning_rate": 4.829472130781398e-06,
"loss": 1.0803,
"step": 1339
},
{
"epoch": 0.7597807597807598,
"grad_norm": 2.142582893371582,
"learning_rate": 4.8291999320620185e-06,
"loss": 1.1509,
"step": 1340
},
{
"epoch": 0.7603477603477603,
"grad_norm": 1.9460140466690063,
"learning_rate": 4.828927523955636e-06,
"loss": 1.0876,
"step": 1341
},
{
"epoch": 0.760914760914761,
"grad_norm": 1.8868871927261353,
"learning_rate": 4.828654906486737e-06,
"loss": 1.1114,
"step": 1342
},
{
"epoch": 0.7614817614817615,
"grad_norm": 1.9232128858566284,
"learning_rate": 4.8283820796798305e-06,
"loss": 1.1119,
"step": 1343
},
{
"epoch": 0.762048762048762,
"grad_norm": 2.0091655254364014,
"learning_rate": 4.828109043559443e-06,
"loss": 1.0971,
"step": 1344
},
{
"epoch": 0.7626157626157626,
"grad_norm": 2.1124536991119385,
"learning_rate": 4.827835798150117e-06,
"loss": 1.0622,
"step": 1345
},
{
"epoch": 0.7631827631827632,
"grad_norm": 2.06380295753479,
"learning_rate": 4.827562343476419e-06,
"loss": 1.1451,
"step": 1346
},
{
"epoch": 0.7637497637497638,
"grad_norm": 2.1077919006347656,
"learning_rate": 4.827288679562931e-06,
"loss": 1.0732,
"step": 1347
},
{
"epoch": 0.7643167643167643,
"grad_norm": 2.0187129974365234,
"learning_rate": 4.827014806434254e-06,
"loss": 1.1245,
"step": 1348
},
{
"epoch": 0.7648837648837649,
"grad_norm": 1.9610968828201294,
"learning_rate": 4.826740724115007e-06,
"loss": 1.1366,
"step": 1349
},
{
"epoch": 0.7654507654507654,
"grad_norm": 2.1612725257873535,
"learning_rate": 4.826466432629831e-06,
"loss": 1.1181,
"step": 1350
},
{
"epoch": 0.766017766017766,
"grad_norm": 2.034870147705078,
"learning_rate": 4.826191932003384e-06,
"loss": 1.0504,
"step": 1351
},
{
"epoch": 0.7665847665847666,
"grad_norm": 2.0349764823913574,
"learning_rate": 4.825917222260342e-06,
"loss": 1.0562,
"step": 1352
},
{
"epoch": 0.7671517671517671,
"grad_norm": 2.0796329975128174,
"learning_rate": 4.825642303425399e-06,
"loss": 1.123,
"step": 1353
},
{
"epoch": 0.7677187677187677,
"grad_norm": 1.989540696144104,
"learning_rate": 4.825367175523272e-06,
"loss": 1.0993,
"step": 1354
},
{
"epoch": 0.7682857682857683,
"grad_norm": 1.9621320962905884,
"learning_rate": 4.825091838578691e-06,
"loss": 1.0958,
"step": 1355
},
{
"epoch": 0.7688527688527689,
"grad_norm": 1.8925583362579346,
"learning_rate": 4.8248162926164115e-06,
"loss": 1.1285,
"step": 1356
},
{
"epoch": 0.7694197694197694,
"grad_norm": 2.136880397796631,
"learning_rate": 4.824540537661201e-06,
"loss": 1.0884,
"step": 1357
},
{
"epoch": 0.76998676998677,
"grad_norm": 3.522716760635376,
"learning_rate": 4.824264573737849e-06,
"loss": 1.0651,
"step": 1358
},
{
"epoch": 0.7705537705537705,
"grad_norm": 1.9552546739578247,
"learning_rate": 4.823988400871166e-06,
"loss": 1.0475,
"step": 1359
},
{
"epoch": 0.7711207711207712,
"grad_norm": 2.041712999343872,
"learning_rate": 4.823712019085978e-06,
"loss": 1.091,
"step": 1360
},
{
"epoch": 0.7716877716877717,
"grad_norm": 1.9892549514770508,
"learning_rate": 4.823435428407129e-06,
"loss": 1.0681,
"step": 1361
},
{
"epoch": 0.7722547722547722,
"grad_norm": 2.180232524871826,
"learning_rate": 4.823158628859487e-06,
"loss": 1.0711,
"step": 1362
},
{
"epoch": 0.7728217728217728,
"grad_norm": 2.0575640201568604,
"learning_rate": 4.822881620467932e-06,
"loss": 1.11,
"step": 1363
},
{
"epoch": 0.7733887733887734,
"grad_norm": 2.0994784832000732,
"learning_rate": 4.822604403257367e-06,
"loss": 1.0946,
"step": 1364
},
{
"epoch": 0.773955773955774,
"grad_norm": 1.9343117475509644,
"learning_rate": 4.822326977252714e-06,
"loss": 1.1042,
"step": 1365
},
{
"epoch": 0.7745227745227745,
"grad_norm": 1.8998512029647827,
"learning_rate": 4.822049342478912e-06,
"loss": 1.0746,
"step": 1366
},
{
"epoch": 0.7750897750897751,
"grad_norm": 1.939725637435913,
"learning_rate": 4.821771498960919e-06,
"loss": 1.1468,
"step": 1367
},
{
"epoch": 0.7756567756567757,
"grad_norm": 2.0788631439208984,
"learning_rate": 4.821493446723713e-06,
"loss": 1.1207,
"step": 1368
},
{
"epoch": 0.7762237762237763,
"grad_norm": 1.9664413928985596,
"learning_rate": 4.821215185792288e-06,
"loss": 1.1096,
"step": 1369
},
{
"epoch": 0.7767907767907768,
"grad_norm": 2.1204512119293213,
"learning_rate": 4.820936716191662e-06,
"loss": 1.1106,
"step": 1370
},
{
"epoch": 0.7773577773577773,
"grad_norm": 2.1662580966949463,
"learning_rate": 4.8206580379468655e-06,
"loss": 1.1301,
"step": 1371
},
{
"epoch": 0.7779247779247779,
"grad_norm": 2.049109697341919,
"learning_rate": 4.820379151082952e-06,
"loss": 1.0862,
"step": 1372
},
{
"epoch": 0.7784917784917785,
"grad_norm": 2.087808132171631,
"learning_rate": 4.820100055624992e-06,
"loss": 1.1094,
"step": 1373
},
{
"epoch": 0.7790587790587791,
"grad_norm": 1.9093419313430786,
"learning_rate": 4.819820751598076e-06,
"loss": 1.1155,
"step": 1374
},
{
"epoch": 0.7796257796257796,
"grad_norm": 2.1120524406433105,
"learning_rate": 4.819541239027311e-06,
"loss": 1.0826,
"step": 1375
},
{
"epoch": 0.7801927801927802,
"grad_norm": 2.1566476821899414,
"learning_rate": 4.819261517937826e-06,
"loss": 1.1011,
"step": 1376
},
{
"epoch": 0.7807597807597808,
"grad_norm": 2.032398223876953,
"learning_rate": 4.818981588354767e-06,
"loss": 1.0908,
"step": 1377
},
{
"epoch": 0.7813267813267813,
"grad_norm": 1.8764480352401733,
"learning_rate": 4.8187014503032955e-06,
"loss": 1.0315,
"step": 1378
},
{
"epoch": 0.7818937818937819,
"grad_norm": 2.0319604873657227,
"learning_rate": 4.818421103808599e-06,
"loss": 1.0873,
"step": 1379
},
{
"epoch": 0.7824607824607824,
"grad_norm": 1.8628435134887695,
"learning_rate": 4.818140548895877e-06,
"loss": 1.0493,
"step": 1380
},
{
"epoch": 0.783027783027783,
"grad_norm": 2.18454647064209,
"learning_rate": 4.817859785590352e-06,
"loss": 1.1007,
"step": 1381
},
{
"epoch": 0.7835947835947836,
"grad_norm": 1.9577960968017578,
"learning_rate": 4.817578813917262e-06,
"loss": 1.1401,
"step": 1382
},
{
"epoch": 0.7841617841617842,
"grad_norm": 2.080314874649048,
"learning_rate": 4.817297633901867e-06,
"loss": 1.0414,
"step": 1383
},
{
"epoch": 0.7847287847287847,
"grad_norm": 1.928419828414917,
"learning_rate": 4.8170162455694435e-06,
"loss": 1.151,
"step": 1384
},
{
"epoch": 0.7852957852957853,
"grad_norm": 2.083796739578247,
"learning_rate": 4.816734648945287e-06,
"loss": 1.1054,
"step": 1385
},
{
"epoch": 0.7858627858627859,
"grad_norm": 1.9626907110214233,
"learning_rate": 4.816452844054712e-06,
"loss": 1.1136,
"step": 1386
},
{
"epoch": 0.7864297864297864,
"grad_norm": 2.0195517539978027,
"learning_rate": 4.816170830923053e-06,
"loss": 1.0842,
"step": 1387
},
{
"epoch": 0.786996786996787,
"grad_norm": 1.9990777969360352,
"learning_rate": 4.815888609575661e-06,
"loss": 1.0576,
"step": 1388
},
{
"epoch": 0.7875637875637875,
"grad_norm": 1.8750730752944946,
"learning_rate": 4.815606180037907e-06,
"loss": 1.0409,
"step": 1389
},
{
"epoch": 0.7881307881307882,
"grad_norm": 2.1351377964019775,
"learning_rate": 4.81532354233518e-06,
"loss": 1.0463,
"step": 1390
},
{
"epoch": 0.7886977886977887,
"grad_norm": 1.896248698234558,
"learning_rate": 4.815040696492888e-06,
"loss": 1.1202,
"step": 1391
},
{
"epoch": 0.7892647892647893,
"grad_norm": 2.018113136291504,
"learning_rate": 4.814757642536459e-06,
"loss": 1.1117,
"step": 1392
},
{
"epoch": 0.7898317898317898,
"grad_norm": 1.9583754539489746,
"learning_rate": 4.814474380491338e-06,
"loss": 1.1024,
"step": 1393
},
{
"epoch": 0.7903987903987904,
"grad_norm": 2.1564619541168213,
"learning_rate": 4.814190910382988e-06,
"loss": 1.1485,
"step": 1394
},
{
"epoch": 0.790965790965791,
"grad_norm": 1.9851993322372437,
"learning_rate": 4.813907232236894e-06,
"loss": 1.0982,
"step": 1395
},
{
"epoch": 0.7915327915327915,
"grad_norm": 1.9537993669509888,
"learning_rate": 4.813623346078557e-06,
"loss": 1.0622,
"step": 1396
},
{
"epoch": 0.7920997920997921,
"grad_norm": 2.0332624912261963,
"learning_rate": 4.813339251933497e-06,
"loss": 1.0678,
"step": 1397
},
{
"epoch": 0.7926667926667926,
"grad_norm": 1.9309813976287842,
"learning_rate": 4.8130549498272535e-06,
"loss": 1.0916,
"step": 1398
},
{
"epoch": 0.7932337932337933,
"grad_norm": 2.0063681602478027,
"learning_rate": 4.812770439785383e-06,
"loss": 1.0946,
"step": 1399
},
{
"epoch": 0.7938007938007938,
"grad_norm": 1.9828499555587769,
"learning_rate": 4.812485721833465e-06,
"loss": 1.0701,
"step": 1400
},
{
"epoch": 0.7943677943677944,
"grad_norm": 1.8927239179611206,
"learning_rate": 4.812200795997091e-06,
"loss": 1.075,
"step": 1401
},
{
"epoch": 0.7949347949347949,
"grad_norm": 2.046656370162964,
"learning_rate": 4.811915662301877e-06,
"loss": 1.0895,
"step": 1402
},
{
"epoch": 0.7955017955017955,
"grad_norm": 2.0462124347686768,
"learning_rate": 4.811630320773455e-06,
"loss": 1.0668,
"step": 1403
},
{
"epoch": 0.7960687960687961,
"grad_norm": 1.8638513088226318,
"learning_rate": 4.811344771437476e-06,
"loss": 1.102,
"step": 1404
},
{
"epoch": 0.7966357966357966,
"grad_norm": 2.1097896099090576,
"learning_rate": 4.811059014319611e-06,
"loss": 1.0449,
"step": 1405
},
{
"epoch": 0.7972027972027972,
"grad_norm": 1.946921706199646,
"learning_rate": 4.8107730494455475e-06,
"loss": 1.0702,
"step": 1406
},
{
"epoch": 0.7977697977697977,
"grad_norm": 2.3324482440948486,
"learning_rate": 4.810486876840992e-06,
"loss": 1.0534,
"step": 1407
},
{
"epoch": 0.7983367983367984,
"grad_norm": 2.0709335803985596,
"learning_rate": 4.810200496531673e-06,
"loss": 1.0584,
"step": 1408
},
{
"epoch": 0.7989037989037989,
"grad_norm": 2.0188565254211426,
"learning_rate": 4.809913908543332e-06,
"loss": 1.0851,
"step": 1409
},
{
"epoch": 0.7994707994707995,
"grad_norm": 2.03055739402771,
"learning_rate": 4.809627112901735e-06,
"loss": 1.1214,
"step": 1410
},
{
"epoch": 0.8000378000378,
"grad_norm": 2.0110137462615967,
"learning_rate": 4.809340109632662e-06,
"loss": 1.1203,
"step": 1411
},
{
"epoch": 0.8006048006048007,
"grad_norm": 2.0457754135131836,
"learning_rate": 4.809052898761915e-06,
"loss": 1.0835,
"step": 1412
},
{
"epoch": 0.8011718011718012,
"grad_norm": 2.07012939453125,
"learning_rate": 4.808765480315312e-06,
"loss": 1.1397,
"step": 1413
},
{
"epoch": 0.8017388017388017,
"grad_norm": 1.9364756345748901,
"learning_rate": 4.808477854318691e-06,
"loss": 1.0797,
"step": 1414
},
{
"epoch": 0.8023058023058023,
"grad_norm": 1.9942682981491089,
"learning_rate": 4.80819002079791e-06,
"loss": 1.1026,
"step": 1415
},
{
"epoch": 0.8028728028728028,
"grad_norm": 2.145369529724121,
"learning_rate": 4.807901979778843e-06,
"loss": 1.0976,
"step": 1416
},
{
"epoch": 0.8034398034398035,
"grad_norm": 1.9216880798339844,
"learning_rate": 4.807613731287384e-06,
"loss": 1.1241,
"step": 1417
},
{
"epoch": 0.804006804006804,
"grad_norm": 1.972402811050415,
"learning_rate": 4.807325275349446e-06,
"loss": 1.0899,
"step": 1418
},
{
"epoch": 0.8045738045738046,
"grad_norm": 1.946427583694458,
"learning_rate": 4.80703661199096e-06,
"loss": 1.089,
"step": 1419
},
{
"epoch": 0.8051408051408051,
"grad_norm": 2.1797311305999756,
"learning_rate": 4.806747741237876e-06,
"loss": 1.1445,
"step": 1420
},
{
"epoch": 0.8057078057078058,
"grad_norm": 2.0358588695526123,
"learning_rate": 4.806458663116161e-06,
"loss": 1.1113,
"step": 1421
},
{
"epoch": 0.8062748062748063,
"grad_norm": 2.0892183780670166,
"learning_rate": 4.806169377651805e-06,
"loss": 1.1153,
"step": 1422
},
{
"epoch": 0.8068418068418068,
"grad_norm": 1.9392249584197998,
"learning_rate": 4.805879884870811e-06,
"loss": 1.1191,
"step": 1423
},
{
"epoch": 0.8074088074088074,
"grad_norm": 1.9548989534378052,
"learning_rate": 4.805590184799206e-06,
"loss": 1.0886,
"step": 1424
},
{
"epoch": 0.8079758079758079,
"grad_norm": 2.1020822525024414,
"learning_rate": 4.80530027746303e-06,
"loss": 1.0654,
"step": 1425
},
{
"epoch": 0.8085428085428086,
"grad_norm": 1.968111276626587,
"learning_rate": 4.805010162888347e-06,
"loss": 1.0522,
"step": 1426
},
{
"epoch": 0.8091098091098091,
"grad_norm": 2.0265655517578125,
"learning_rate": 4.804719841101237e-06,
"loss": 1.1122,
"step": 1427
},
{
"epoch": 0.8096768096768097,
"grad_norm": 2.0999677181243896,
"learning_rate": 4.8044293121277975e-06,
"loss": 1.075,
"step": 1428
},
{
"epoch": 0.8102438102438102,
"grad_norm": 2.0023441314697266,
"learning_rate": 4.8041385759941475e-06,
"loss": 1.1009,
"step": 1429
},
{
"epoch": 0.8108108108108109,
"grad_norm": 2.0415596961975098,
"learning_rate": 4.803847632726422e-06,
"loss": 1.0809,
"step": 1430
},
{
"epoch": 0.8113778113778114,
"grad_norm": 1.9322887659072876,
"learning_rate": 4.803556482350777e-06,
"loss": 1.0597,
"step": 1431
},
{
"epoch": 0.8119448119448119,
"grad_norm": 1.927483081817627,
"learning_rate": 4.8032651248933855e-06,
"loss": 1.0894,
"step": 1432
},
{
"epoch": 0.8125118125118125,
"grad_norm": 1.9914480447769165,
"learning_rate": 4.802973560380439e-06,
"loss": 1.0948,
"step": 1433
},
{
"epoch": 0.8130788130788131,
"grad_norm": 1.9616143703460693,
"learning_rate": 4.802681788838149e-06,
"loss": 1.096,
"step": 1434
},
{
"epoch": 0.8136458136458137,
"grad_norm": 1.884313941001892,
"learning_rate": 4.802389810292744e-06,
"loss": 1.0895,
"step": 1435
},
{
"epoch": 0.8142128142128142,
"grad_norm": 1.8811097145080566,
"learning_rate": 4.802097624770472e-06,
"loss": 1.0194,
"step": 1436
},
{
"epoch": 0.8147798147798148,
"grad_norm": 1.9978089332580566,
"learning_rate": 4.8018052322976e-06,
"loss": 1.0993,
"step": 1437
},
{
"epoch": 0.8153468153468153,
"grad_norm": 2.0147740840911865,
"learning_rate": 4.801512632900413e-06,
"loss": 1.0659,
"step": 1438
},
{
"epoch": 0.815913815913816,
"grad_norm": 1.9632623195648193,
"learning_rate": 4.801219826605213e-06,
"loss": 1.079,
"step": 1439
},
{
"epoch": 0.8164808164808165,
"grad_norm": 1.959356665611267,
"learning_rate": 4.800926813438325e-06,
"loss": 1.133,
"step": 1440
},
{
"epoch": 0.817047817047817,
"grad_norm": 2.189089298248291,
"learning_rate": 4.8006335934260885e-06,
"loss": 1.1282,
"step": 1441
},
{
"epoch": 0.8176148176148176,
"grad_norm": 2.0199830532073975,
"learning_rate": 4.800340166594862e-06,
"loss": 1.0958,
"step": 1442
},
{
"epoch": 0.8181818181818182,
"grad_norm": 2.248572587966919,
"learning_rate": 4.800046532971025e-06,
"loss": 1.1197,
"step": 1443
},
{
"epoch": 0.8187488187488188,
"grad_norm": 2.16886043548584,
"learning_rate": 4.799752692580973e-06,
"loss": 1.1203,
"step": 1444
},
{
"epoch": 0.8193158193158193,
"grad_norm": 1.995705008506775,
"learning_rate": 4.799458645451122e-06,
"loss": 1.0579,
"step": 1445
},
{
"epoch": 0.8198828198828199,
"grad_norm": 2.0506200790405273,
"learning_rate": 4.799164391607908e-06,
"loss": 1.138,
"step": 1446
},
{
"epoch": 0.8204498204498204,
"grad_norm": 2.047494411468506,
"learning_rate": 4.798869931077779e-06,
"loss": 1.1272,
"step": 1447
},
{
"epoch": 0.821016821016821,
"grad_norm": 2.0765833854675293,
"learning_rate": 4.798575263887208e-06,
"loss": 1.0581,
"step": 1448
},
{
"epoch": 0.8215838215838216,
"grad_norm": 2.1554183959960938,
"learning_rate": 4.798280390062685e-06,
"loss": 1.1212,
"step": 1449
},
{
"epoch": 0.8221508221508221,
"grad_norm": 2.6534454822540283,
"learning_rate": 4.797985309630718e-06,
"loss": 1.1472,
"step": 1450
},
{
"epoch": 0.8227178227178227,
"grad_norm": 2.0224509239196777,
"learning_rate": 4.797690022617834e-06,
"loss": 1.1336,
"step": 1451
},
{
"epoch": 0.8232848232848233,
"grad_norm": 2.20784068107605,
"learning_rate": 4.797394529050577e-06,
"loss": 1.1146,
"step": 1452
},
{
"epoch": 0.8238518238518239,
"grad_norm": 1.9155614376068115,
"learning_rate": 4.797098828955512e-06,
"loss": 1.0795,
"step": 1453
},
{
"epoch": 0.8244188244188244,
"grad_norm": 2.068676233291626,
"learning_rate": 4.7968029223592205e-06,
"loss": 1.0609,
"step": 1454
},
{
"epoch": 0.824985824985825,
"grad_norm": 1.9581069946289062,
"learning_rate": 4.796506809288305e-06,
"loss": 1.1375,
"step": 1455
},
{
"epoch": 0.8255528255528255,
"grad_norm": 1.9850029945373535,
"learning_rate": 4.796210489769383e-06,
"loss": 1.0904,
"step": 1456
},
{
"epoch": 0.8261198261198262,
"grad_norm": 2.549131155014038,
"learning_rate": 4.7959139638290945e-06,
"loss": 1.0845,
"step": 1457
},
{
"epoch": 0.8266868266868267,
"grad_norm": 2.00335693359375,
"learning_rate": 4.7956172314940945e-06,
"loss": 1.1247,
"step": 1458
},
{
"epoch": 0.8272538272538272,
"grad_norm": 2.0605878829956055,
"learning_rate": 4.795320292791059e-06,
"loss": 1.0974,
"step": 1459
},
{
"epoch": 0.8278208278208278,
"grad_norm": 1.9676733016967773,
"learning_rate": 4.7950231477466825e-06,
"loss": 1.1461,
"step": 1460
},
{
"epoch": 0.8283878283878284,
"grad_norm": 2.107635974884033,
"learning_rate": 4.794725796387677e-06,
"loss": 1.1107,
"step": 1461
},
{
"epoch": 0.828954828954829,
"grad_norm": 2.1952548027038574,
"learning_rate": 4.794428238740771e-06,
"loss": 1.0946,
"step": 1462
},
{
"epoch": 0.8295218295218295,
"grad_norm": 2.205260992050171,
"learning_rate": 4.794130474832718e-06,
"loss": 1.1676,
"step": 1463
},
{
"epoch": 0.83008883008883,
"grad_norm": 2.144150733947754,
"learning_rate": 4.793832504690283e-06,
"loss": 1.1435,
"step": 1464
},
{
"epoch": 0.8306558306558307,
"grad_norm": 1.9926881790161133,
"learning_rate": 4.793534328340253e-06,
"loss": 1.0526,
"step": 1465
},
{
"epoch": 0.8312228312228312,
"grad_norm": 2.093427896499634,
"learning_rate": 4.7932359458094335e-06,
"loss": 1.15,
"step": 1466
},
{
"epoch": 0.8317898317898318,
"grad_norm": 2.014958620071411,
"learning_rate": 4.792937357124647e-06,
"loss": 1.0616,
"step": 1467
},
{
"epoch": 0.8323568323568323,
"grad_norm": 1.9660993814468384,
"learning_rate": 4.792638562312738e-06,
"loss": 1.0524,
"step": 1468
},
{
"epoch": 0.8329238329238329,
"grad_norm": 2.1528244018554688,
"learning_rate": 4.792339561400565e-06,
"loss": 1.0605,
"step": 1469
},
{
"epoch": 0.8334908334908335,
"grad_norm": 1.8813989162445068,
"learning_rate": 4.792040354415008e-06,
"loss": 1.0126,
"step": 1470
},
{
"epoch": 0.8340578340578341,
"grad_norm": 2.1694982051849365,
"learning_rate": 4.791740941382963e-06,
"loss": 1.07,
"step": 1471
},
{
"epoch": 0.8346248346248346,
"grad_norm": 2.0867135524749756,
"learning_rate": 4.7914413223313484e-06,
"loss": 1.0531,
"step": 1472
},
{
"epoch": 0.8351918351918352,
"grad_norm": 2.2400665283203125,
"learning_rate": 4.791141497287098e-06,
"loss": 1.1123,
"step": 1473
},
{
"epoch": 0.8357588357588358,
"grad_norm": 2.0336430072784424,
"learning_rate": 4.7908414662771655e-06,
"loss": 1.0809,
"step": 1474
},
{
"epoch": 0.8363258363258363,
"grad_norm": 2.1923985481262207,
"learning_rate": 4.790541229328522e-06,
"loss": 1.0294,
"step": 1475
},
{
"epoch": 0.8368928368928369,
"grad_norm": 2.0450212955474854,
"learning_rate": 4.790240786468158e-06,
"loss": 1.135,
"step": 1476
},
{
"epoch": 0.8374598374598374,
"grad_norm": 1.9666211605072021,
"learning_rate": 4.789940137723082e-06,
"loss": 1.0146,
"step": 1477
},
{
"epoch": 0.838026838026838,
"grad_norm": 1.9747493267059326,
"learning_rate": 4.789639283120323e-06,
"loss": 1.1008,
"step": 1478
},
{
"epoch": 0.8385938385938386,
"grad_norm": 1.9848383665084839,
"learning_rate": 4.789338222686924e-06,
"loss": 1.1277,
"step": 1479
},
{
"epoch": 0.8391608391608392,
"grad_norm": 2.0215046405792236,
"learning_rate": 4.789036956449951e-06,
"loss": 1.0636,
"step": 1480
},
{
"epoch": 0.8397278397278397,
"grad_norm": 1.9995296001434326,
"learning_rate": 4.788735484436486e-06,
"loss": 1.0614,
"step": 1481
},
{
"epoch": 0.8402948402948403,
"grad_norm": 1.9948177337646484,
"learning_rate": 4.7884338066736315e-06,
"loss": 1.1005,
"step": 1482
},
{
"epoch": 0.8408618408618409,
"grad_norm": 2.206935405731201,
"learning_rate": 4.788131923188506e-06,
"loss": 1.1549,
"step": 1483
},
{
"epoch": 0.8414288414288414,
"grad_norm": 2.1533777713775635,
"learning_rate": 4.787829834008248e-06,
"loss": 1.1103,
"step": 1484
},
{
"epoch": 0.841995841995842,
"grad_norm": 1.962776780128479,
"learning_rate": 4.787527539160016e-06,
"loss": 1.0235,
"step": 1485
},
{
"epoch": 0.8425628425628425,
"grad_norm": 1.902707576751709,
"learning_rate": 4.787225038670983e-06,
"loss": 1.095,
"step": 1486
},
{
"epoch": 0.8431298431298432,
"grad_norm": 1.9712163209915161,
"learning_rate": 4.786922332568343e-06,
"loss": 1.1332,
"step": 1487
},
{
"epoch": 0.8436968436968437,
"grad_norm": 2.0679397583007812,
"learning_rate": 4.786619420879309e-06,
"loss": 1.0823,
"step": 1488
},
{
"epoch": 0.8442638442638443,
"grad_norm": 2.060270071029663,
"learning_rate": 4.786316303631112e-06,
"loss": 1.1254,
"step": 1489
},
{
"epoch": 0.8448308448308448,
"grad_norm": 2.0360071659088135,
"learning_rate": 4.786012980851e-06,
"loss": 1.1093,
"step": 1490
},
{
"epoch": 0.8453978453978453,
"grad_norm": 2.075654983520508,
"learning_rate": 4.785709452566243e-06,
"loss": 1.0913,
"step": 1491
},
{
"epoch": 0.845964845964846,
"grad_norm": 2.0330331325531006,
"learning_rate": 4.785405718804124e-06,
"loss": 1.1261,
"step": 1492
},
{
"epoch": 0.8465318465318465,
"grad_norm": 2.006742238998413,
"learning_rate": 4.78510177959195e-06,
"loss": 1.0898,
"step": 1493
},
{
"epoch": 0.8470988470988471,
"grad_norm": 1.96126127243042,
"learning_rate": 4.784797634957042e-06,
"loss": 1.1011,
"step": 1494
},
{
"epoch": 0.8476658476658476,
"grad_norm": 2.077693462371826,
"learning_rate": 4.784493284926743e-06,
"loss": 1.0646,
"step": 1495
},
{
"epoch": 0.8482328482328483,
"grad_norm": 2.056105375289917,
"learning_rate": 4.784188729528414e-06,
"loss": 1.0702,
"step": 1496
},
{
"epoch": 0.8487998487998488,
"grad_norm": 1.9399914741516113,
"learning_rate": 4.783883968789431e-06,
"loss": 1.0561,
"step": 1497
},
{
"epoch": 0.8493668493668494,
"grad_norm": 2.05819034576416,
"learning_rate": 4.783579002737193e-06,
"loss": 1.0837,
"step": 1498
},
{
"epoch": 0.8499338499338499,
"grad_norm": 2.0962612628936768,
"learning_rate": 4.783273831399114e-06,
"loss": 1.1275,
"step": 1499
},
{
"epoch": 0.8505008505008504,
"grad_norm": 2.1176199913024902,
"learning_rate": 4.782968454802629e-06,
"loss": 1.1522,
"step": 1500
},
{
"epoch": 0.8510678510678511,
"grad_norm": 2.1833150386810303,
"learning_rate": 4.78266287297519e-06,
"loss": 1.0479,
"step": 1501
},
{
"epoch": 0.8516348516348516,
"grad_norm": 1.9732375144958496,
"learning_rate": 4.782357085944267e-06,
"loss": 1.1363,
"step": 1502
},
{
"epoch": 0.8522018522018522,
"grad_norm": 1.987606167793274,
"learning_rate": 4.782051093737349e-06,
"loss": 1.0978,
"step": 1503
},
{
"epoch": 0.8527688527688527,
"grad_norm": 1.9279766082763672,
"learning_rate": 4.781744896381945e-06,
"loss": 1.0906,
"step": 1504
},
{
"epoch": 0.8533358533358534,
"grad_norm": 1.9426352977752686,
"learning_rate": 4.78143849390558e-06,
"loss": 1.1195,
"step": 1505
},
{
"epoch": 0.8539028539028539,
"grad_norm": 1.9374680519104004,
"learning_rate": 4.781131886335799e-06,
"loss": 1.0835,
"step": 1506
},
{
"epoch": 0.8544698544698545,
"grad_norm": 2.1612491607666016,
"learning_rate": 4.780825073700166e-06,
"loss": 1.0892,
"step": 1507
},
{
"epoch": 0.855036855036855,
"grad_norm": 2.1090891361236572,
"learning_rate": 4.78051805602626e-06,
"loss": 1.1032,
"step": 1508
},
{
"epoch": 0.8556038556038557,
"grad_norm": 2.0436275005340576,
"learning_rate": 4.780210833341682e-06,
"loss": 1.1011,
"step": 1509
},
{
"epoch": 0.8561708561708562,
"grad_norm": 1.9176534414291382,
"learning_rate": 4.77990340567405e-06,
"loss": 1.0907,
"step": 1510
},
{
"epoch": 0.8567378567378567,
"grad_norm": 2.086163282394409,
"learning_rate": 4.779595773051002e-06,
"loss": 1.1346,
"step": 1511
},
{
"epoch": 0.8573048573048573,
"grad_norm": 1.9749596118927002,
"learning_rate": 4.779287935500192e-06,
"loss": 1.0875,
"step": 1512
},
{
"epoch": 0.8578718578718578,
"grad_norm": 1.9084590673446655,
"learning_rate": 4.778979893049294e-06,
"loss": 1.0849,
"step": 1513
},
{
"epoch": 0.8584388584388585,
"grad_norm": 1.9401350021362305,
"learning_rate": 4.778671645725999e-06,
"loss": 1.1297,
"step": 1514
},
{
"epoch": 0.859005859005859,
"grad_norm": 2.069439649581909,
"learning_rate": 4.778363193558017e-06,
"loss": 1.0059,
"step": 1515
},
{
"epoch": 0.8595728595728596,
"grad_norm": 2.0672447681427,
"learning_rate": 4.77805453657308e-06,
"loss": 1.1441,
"step": 1516
},
{
"epoch": 0.8601398601398601,
"grad_norm": 1.9461488723754883,
"learning_rate": 4.777745674798931e-06,
"loss": 1.0962,
"step": 1517
},
{
"epoch": 0.8607068607068608,
"grad_norm": 2.1001944541931152,
"learning_rate": 4.777436608263338e-06,
"loss": 1.1358,
"step": 1518
},
{
"epoch": 0.8612738612738613,
"grad_norm": 1.9706493616104126,
"learning_rate": 4.777127336994085e-06,
"loss": 1.101,
"step": 1519
},
{
"epoch": 0.8618408618408618,
"grad_norm": 1.9673707485198975,
"learning_rate": 4.7768178610189744e-06,
"loss": 1.1193,
"step": 1520
},
{
"epoch": 0.8624078624078624,
"grad_norm": 1.9464128017425537,
"learning_rate": 4.776508180365826e-06,
"loss": 1.0663,
"step": 1521
},
{
"epoch": 0.8629748629748629,
"grad_norm": 2.0577609539031982,
"learning_rate": 4.77619829506248e-06,
"loss": 1.0971,
"step": 1522
},
{
"epoch": 0.8635418635418636,
"grad_norm": 2.171048402786255,
"learning_rate": 4.775888205136793e-06,
"loss": 1.104,
"step": 1523
},
{
"epoch": 0.8641088641088641,
"grad_norm": 2.2168309688568115,
"learning_rate": 4.775577910616642e-06,
"loss": 1.0856,
"step": 1524
},
{
"epoch": 0.8646758646758647,
"grad_norm": 1.9648188352584839,
"learning_rate": 4.77526741152992e-06,
"loss": 1.1112,
"step": 1525
},
{
"epoch": 0.8652428652428652,
"grad_norm": 2.0772106647491455,
"learning_rate": 4.774956707904542e-06,
"loss": 1.0805,
"step": 1526
},
{
"epoch": 0.8658098658098659,
"grad_norm": 2.0109941959381104,
"learning_rate": 4.774645799768438e-06,
"loss": 1.1004,
"step": 1527
},
{
"epoch": 0.8663768663768664,
"grad_norm": 2.0720832347869873,
"learning_rate": 4.7743346871495575e-06,
"loss": 1.0671,
"step": 1528
},
{
"epoch": 0.8669438669438669,
"grad_norm": 2.011953830718994,
"learning_rate": 4.774023370075868e-06,
"loss": 1.0671,
"step": 1529
},
{
"epoch": 0.8675108675108675,
"grad_norm": 1.9619215726852417,
"learning_rate": 4.773711848575357e-06,
"loss": 1.1336,
"step": 1530
},
{
"epoch": 0.8680778680778681,
"grad_norm": 1.8378658294677734,
"learning_rate": 4.773400122676028e-06,
"loss": 1.1083,
"step": 1531
},
{
"epoch": 0.8686448686448687,
"grad_norm": 1.9250093698501587,
"learning_rate": 4.7730881924059046e-06,
"loss": 1.1024,
"step": 1532
},
{
"epoch": 0.8692118692118692,
"grad_norm": 2.106729745864868,
"learning_rate": 4.772776057793029e-06,
"loss": 1.0636,
"step": 1533
},
{
"epoch": 0.8697788697788698,
"grad_norm": 1.9255436658859253,
"learning_rate": 4.77246371886546e-06,
"loss": 1.1246,
"step": 1534
},
{
"epoch": 0.8703458703458703,
"grad_norm": 1.9523237943649292,
"learning_rate": 4.772151175651275e-06,
"loss": 1.0753,
"step": 1535
},
{
"epoch": 0.870912870912871,
"grad_norm": 2.121988296508789,
"learning_rate": 4.771838428178574e-06,
"loss": 1.149,
"step": 1536
},
{
"epoch": 0.8714798714798715,
"grad_norm": 2.094503879547119,
"learning_rate": 4.771525476475467e-06,
"loss": 1.0711,
"step": 1537
},
{
"epoch": 0.872046872046872,
"grad_norm": 2.0268783569335938,
"learning_rate": 4.771212320570091e-06,
"loss": 1.0787,
"step": 1538
},
{
"epoch": 0.8726138726138726,
"grad_norm": 2.261963129043579,
"learning_rate": 4.770898960490596e-06,
"loss": 1.0989,
"step": 1539
},
{
"epoch": 0.8731808731808732,
"grad_norm": 2.0570859909057617,
"learning_rate": 4.770585396265153e-06,
"loss": 1.0863,
"step": 1540
},
{
"epoch": 0.8737478737478738,
"grad_norm": 2.00663423538208,
"learning_rate": 4.77027162792195e-06,
"loss": 1.0525,
"step": 1541
},
{
"epoch": 0.8743148743148743,
"grad_norm": 1.9876610040664673,
"learning_rate": 4.769957655489193e-06,
"loss": 1.0734,
"step": 1542
},
{
"epoch": 0.8748818748818749,
"grad_norm": 2.09036922454834,
"learning_rate": 4.7696434789951074e-06,
"loss": 1.0851,
"step": 1543
},
{
"epoch": 0.8754488754488754,
"grad_norm": 1.9771044254302979,
"learning_rate": 4.769329098467937e-06,
"loss": 1.1107,
"step": 1544
},
{
"epoch": 0.876015876015876,
"grad_norm": 1.9424091577529907,
"learning_rate": 4.7690145139359435e-06,
"loss": 1.0558,
"step": 1545
},
{
"epoch": 0.8765828765828766,
"grad_norm": 1.9221347570419312,
"learning_rate": 4.7686997254274056e-06,
"loss": 1.117,
"step": 1546
},
{
"epoch": 0.8771498771498771,
"grad_norm": 2.007035493850708,
"learning_rate": 4.7683847329706236e-06,
"loss": 1.0622,
"step": 1547
},
{
"epoch": 0.8777168777168777,
"grad_norm": 2.046154499053955,
"learning_rate": 4.768069536593913e-06,
"loss": 1.0602,
"step": 1548
},
{
"epoch": 0.8782838782838783,
"grad_norm": 2.043686866760254,
"learning_rate": 4.76775413632561e-06,
"loss": 1.1248,
"step": 1549
},
{
"epoch": 0.8788508788508789,
"grad_norm": 1.8838635683059692,
"learning_rate": 4.767438532194066e-06,
"loss": 1.0887,
"step": 1550
},
{
"epoch": 0.8794178794178794,
"grad_norm": 2.2751243114471436,
"learning_rate": 4.767122724227655e-06,
"loss": 1.0549,
"step": 1551
},
{
"epoch": 0.87998487998488,
"grad_norm": 1.9918965101242065,
"learning_rate": 4.766806712454766e-06,
"loss": 1.0428,
"step": 1552
},
{
"epoch": 0.8805518805518806,
"grad_norm": 2.0295073986053467,
"learning_rate": 4.7664904969038064e-06,
"loss": 1.0678,
"step": 1553
},
{
"epoch": 0.8811188811188811,
"grad_norm": 2.027252435684204,
"learning_rate": 4.766174077603204e-06,
"loss": 1.1304,
"step": 1554
},
{
"epoch": 0.8816858816858817,
"grad_norm": 2.0209717750549316,
"learning_rate": 4.765857454581404e-06,
"loss": 1.0862,
"step": 1555
},
{
"epoch": 0.8822528822528822,
"grad_norm": 2.1175334453582764,
"learning_rate": 4.76554062786687e-06,
"loss": 1.0914,
"step": 1556
},
{
"epoch": 0.8828198828198828,
"grad_norm": 2.0487277507781982,
"learning_rate": 4.765223597488082e-06,
"loss": 1.0725,
"step": 1557
},
{
"epoch": 0.8833868833868834,
"grad_norm": 2.0266189575195312,
"learning_rate": 4.764906363473542e-06,
"loss": 1.1061,
"step": 1558
},
{
"epoch": 0.883953883953884,
"grad_norm": 2.035072088241577,
"learning_rate": 4.764588925851766e-06,
"loss": 1.1178,
"step": 1559
},
{
"epoch": 0.8845208845208845,
"grad_norm": 2.010530948638916,
"learning_rate": 4.764271284651292e-06,
"loss": 1.0839,
"step": 1560
},
{
"epoch": 0.885087885087885,
"grad_norm": 2.035928964614868,
"learning_rate": 4.7639534399006745e-06,
"loss": 1.1263,
"step": 1561
},
{
"epoch": 0.8856548856548857,
"grad_norm": 1.9451258182525635,
"learning_rate": 4.763635391628487e-06,
"loss": 1.0582,
"step": 1562
},
{
"epoch": 0.8862218862218862,
"grad_norm": 1.9632164239883423,
"learning_rate": 4.763317139863321e-06,
"loss": 1.0674,
"step": 1563
},
{
"epoch": 0.8867888867888868,
"grad_norm": 1.9365487098693848,
"learning_rate": 4.762998684633785e-06,
"loss": 1.0518,
"step": 1564
},
{
"epoch": 0.8873558873558873,
"grad_norm": 1.942113995552063,
"learning_rate": 4.762680025968508e-06,
"loss": 1.0884,
"step": 1565
},
{
"epoch": 0.8879228879228879,
"grad_norm": 1.9525227546691895,
"learning_rate": 4.7623611638961365e-06,
"loss": 1.057,
"step": 1566
},
{
"epoch": 0.8884898884898885,
"grad_norm": 1.9437819719314575,
"learning_rate": 4.762042098445334e-06,
"loss": 1.0831,
"step": 1567
},
{
"epoch": 0.8890568890568891,
"grad_norm": 1.907525897026062,
"learning_rate": 4.7617228296447846e-06,
"loss": 1.099,
"step": 1568
},
{
"epoch": 0.8896238896238896,
"grad_norm": 1.9731396436691284,
"learning_rate": 4.76140335752319e-06,
"loss": 1.1293,
"step": 1569
},
{
"epoch": 0.8901908901908901,
"grad_norm": 2.044581174850464,
"learning_rate": 4.761083682109268e-06,
"loss": 1.1112,
"step": 1570
},
{
"epoch": 0.8907578907578908,
"grad_norm": 2.2614455223083496,
"learning_rate": 4.760763803431756e-06,
"loss": 1.1442,
"step": 1571
},
{
"epoch": 0.8913248913248913,
"grad_norm": 1.9900983572006226,
"learning_rate": 4.760443721519412e-06,
"loss": 1.1082,
"step": 1572
},
{
"epoch": 0.8918918918918919,
"grad_norm": 2.0624983310699463,
"learning_rate": 4.760123436401009e-06,
"loss": 1.0991,
"step": 1573
},
{
"epoch": 0.8924588924588924,
"grad_norm": 1.9901314973831177,
"learning_rate": 4.75980294810534e-06,
"loss": 1.0694,
"step": 1574
},
{
"epoch": 0.8930258930258931,
"grad_norm": 2.052838087081909,
"learning_rate": 4.759482256661215e-06,
"loss": 1.1102,
"step": 1575
},
{
"epoch": 0.8935928935928936,
"grad_norm": 1.9329949617385864,
"learning_rate": 4.759161362097463e-06,
"loss": 1.0261,
"step": 1576
},
{
"epoch": 0.8941598941598942,
"grad_norm": 2.1841771602630615,
"learning_rate": 4.7588402644429335e-06,
"loss": 1.0946,
"step": 1577
},
{
"epoch": 0.8947268947268947,
"grad_norm": 1.9439847469329834,
"learning_rate": 4.75851896372649e-06,
"loss": 1.0731,
"step": 1578
},
{
"epoch": 0.8952938952938952,
"grad_norm": 2.0109915733337402,
"learning_rate": 4.758197459977015e-06,
"loss": 1.0552,
"step": 1579
},
{
"epoch": 0.8958608958608959,
"grad_norm": 1.86398184299469,
"learning_rate": 4.7578757532234145e-06,
"loss": 1.0914,
"step": 1580
},
{
"epoch": 0.8964278964278964,
"grad_norm": 1.985925555229187,
"learning_rate": 4.757553843494606e-06,
"loss": 1.0717,
"step": 1581
},
{
"epoch": 0.896994896994897,
"grad_norm": 2.2007815837860107,
"learning_rate": 4.757231730819528e-06,
"loss": 1.1239,
"step": 1582
},
{
"epoch": 0.8975618975618975,
"grad_norm": 2.279360055923462,
"learning_rate": 4.756909415227139e-06,
"loss": 1.1248,
"step": 1583
},
{
"epoch": 0.8981288981288982,
"grad_norm": 2.0951502323150635,
"learning_rate": 4.7565868967464124e-06,
"loss": 1.0919,
"step": 1584
},
{
"epoch": 0.8986958986958987,
"grad_norm": 2.100532054901123,
"learning_rate": 4.756264175406342e-06,
"loss": 1.0742,
"step": 1585
},
{
"epoch": 0.8992628992628993,
"grad_norm": 2.1471176147460938,
"learning_rate": 4.75594125123594e-06,
"loss": 1.0993,
"step": 1586
},
{
"epoch": 0.8998298998298998,
"grad_norm": 1.9409059286117554,
"learning_rate": 4.755618124264236e-06,
"loss": 1.0884,
"step": 1587
},
{
"epoch": 0.9003969003969003,
"grad_norm": 2.0202972888946533,
"learning_rate": 4.755294794520277e-06,
"loss": 1.1407,
"step": 1588
},
{
"epoch": 0.900963900963901,
"grad_norm": 1.8660567998886108,
"learning_rate": 4.75497126203313e-06,
"loss": 1.0864,
"step": 1589
},
{
"epoch": 0.9015309015309015,
"grad_norm": 2.0232956409454346,
"learning_rate": 4.7546475268318795e-06,
"loss": 1.0715,
"step": 1590
},
{
"epoch": 0.9020979020979021,
"grad_norm": 1.869956612586975,
"learning_rate": 4.754323588945628e-06,
"loss": 1.0768,
"step": 1591
},
{
"epoch": 0.9026649026649026,
"grad_norm": 2.032975196838379,
"learning_rate": 4.753999448403497e-06,
"loss": 1.1404,
"step": 1592
},
{
"epoch": 0.9032319032319033,
"grad_norm": 2.062655448913574,
"learning_rate": 4.7536751052346244e-06,
"loss": 1.0803,
"step": 1593
},
{
"epoch": 0.9037989037989038,
"grad_norm": 2.068575143814087,
"learning_rate": 4.753350559468169e-06,
"loss": 1.1214,
"step": 1594
},
{
"epoch": 0.9043659043659044,
"grad_norm": 1.8464081287384033,
"learning_rate": 4.753025811133304e-06,
"loss": 1.0328,
"step": 1595
},
{
"epoch": 0.9049329049329049,
"grad_norm": 2.0097310543060303,
"learning_rate": 4.752700860259225e-06,
"loss": 1.063,
"step": 1596
},
{
"epoch": 0.9054999054999056,
"grad_norm": 2.3630588054656982,
"learning_rate": 4.7523757068751445e-06,
"loss": 1.1093,
"step": 1597
},
{
"epoch": 0.9060669060669061,
"grad_norm": 2.0079240798950195,
"learning_rate": 4.752050351010291e-06,
"loss": 1.0643,
"step": 1598
},
{
"epoch": 0.9066339066339066,
"grad_norm": 2.055734157562256,
"learning_rate": 4.751724792693914e-06,
"loss": 0.9884,
"step": 1599
},
{
"epoch": 0.9072009072009072,
"grad_norm": 2.0036795139312744,
"learning_rate": 4.751399031955279e-06,
"loss": 1.0962,
"step": 1600
},
{
"epoch": 0.9077679077679077,
"grad_norm": 2.157174587249756,
"learning_rate": 4.751073068823673e-06,
"loss": 1.0921,
"step": 1601
},
{
"epoch": 0.9083349083349084,
"grad_norm": 2.27801251411438,
"learning_rate": 4.750746903328396e-06,
"loss": 1.1183,
"step": 1602
},
{
"epoch": 0.9089019089019089,
"grad_norm": 1.9836596250534058,
"learning_rate": 4.750420535498771e-06,
"loss": 1.0127,
"step": 1603
},
{
"epoch": 0.9094689094689095,
"grad_norm": 1.9263114929199219,
"learning_rate": 4.750093965364137e-06,
"loss": 1.0417,
"step": 1604
},
{
"epoch": 0.91003591003591,
"grad_norm": 1.8833136558532715,
"learning_rate": 4.749767192953852e-06,
"loss": 1.0528,
"step": 1605
},
{
"epoch": 0.9106029106029107,
"grad_norm": 2.0248804092407227,
"learning_rate": 4.74944021829729e-06,
"loss": 1.1033,
"step": 1606
},
{
"epoch": 0.9111699111699112,
"grad_norm": 2.1038992404937744,
"learning_rate": 4.749113041423846e-06,
"loss": 1.1072,
"step": 1607
},
{
"epoch": 0.9117369117369117,
"grad_norm": 2.1123745441436768,
"learning_rate": 4.7487856623629325e-06,
"loss": 1.1115,
"step": 1608
},
{
"epoch": 0.9123039123039123,
"grad_norm": 1.9227306842803955,
"learning_rate": 4.74845808114398e-06,
"loss": 1.0282,
"step": 1609
},
{
"epoch": 0.9128709128709128,
"grad_norm": 2.635802745819092,
"learning_rate": 4.748130297796435e-06,
"loss": 1.0809,
"step": 1610
},
{
"epoch": 0.9134379134379135,
"grad_norm": 2.026642084121704,
"learning_rate": 4.747802312349767e-06,
"loss": 1.0672,
"step": 1611
},
{
"epoch": 0.914004914004914,
"grad_norm": 2.060119390487671,
"learning_rate": 4.747474124833456e-06,
"loss": 1.0266,
"step": 1612
},
{
"epoch": 0.9145719145719146,
"grad_norm": 1.9226115942001343,
"learning_rate": 4.747145735277011e-06,
"loss": 1.0446,
"step": 1613
},
{
"epoch": 0.9151389151389151,
"grad_norm": 2.0081734657287598,
"learning_rate": 4.746817143709949e-06,
"loss": 1.1019,
"step": 1614
},
{
"epoch": 0.9157059157059158,
"grad_norm": 1.902655005455017,
"learning_rate": 4.746488350161811e-06,
"loss": 1.0723,
"step": 1615
},
{
"epoch": 0.9162729162729163,
"grad_norm": 1.9542418718338013,
"learning_rate": 4.746159354662153e-06,
"loss": 1.0677,
"step": 1616
},
{
"epoch": 0.9168399168399168,
"grad_norm": 2.057506561279297,
"learning_rate": 4.745830157240551e-06,
"loss": 1.143,
"step": 1617
},
{
"epoch": 0.9174069174069174,
"grad_norm": 2.0979979038238525,
"learning_rate": 4.7455007579266e-06,
"loss": 1.0977,
"step": 1618
},
{
"epoch": 0.9179739179739179,
"grad_norm": 2.1611087322235107,
"learning_rate": 4.74517115674991e-06,
"loss": 1.049,
"step": 1619
},
{
"epoch": 0.9185409185409186,
"grad_norm": 1.9556939601898193,
"learning_rate": 4.744841353740112e-06,
"loss": 1.0329,
"step": 1620
},
{
"epoch": 0.9191079191079191,
"grad_norm": 1.9895910024642944,
"learning_rate": 4.744511348926855e-06,
"loss": 1.0884,
"step": 1621
},
{
"epoch": 0.9196749196749197,
"grad_norm": 2.0237414836883545,
"learning_rate": 4.744181142339803e-06,
"loss": 1.1147,
"step": 1622
},
{
"epoch": 0.9202419202419202,
"grad_norm": 2.115481376647949,
"learning_rate": 4.743850734008643e-06,
"loss": 1.0856,
"step": 1623
},
{
"epoch": 0.9208089208089208,
"grad_norm": 2.0079002380371094,
"learning_rate": 4.743520123963075e-06,
"loss": 1.0896,
"step": 1624
},
{
"epoch": 0.9213759213759214,
"grad_norm": 1.9966343641281128,
"learning_rate": 4.743189312232821e-06,
"loss": 1.1146,
"step": 1625
},
{
"epoch": 0.9219429219429219,
"grad_norm": 2.072922468185425,
"learning_rate": 4.742858298847621e-06,
"loss": 1.1268,
"step": 1626
},
{
"epoch": 0.9225099225099225,
"grad_norm": 1.9800165891647339,
"learning_rate": 4.742527083837229e-06,
"loss": 1.0057,
"step": 1627
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.2015650272369385,
"learning_rate": 4.742195667231424e-06,
"loss": 1.0517,
"step": 1628
},
{
"epoch": 0.9236439236439237,
"grad_norm": 2.021609306335449,
"learning_rate": 4.741864049059995e-06,
"loss": 1.0832,
"step": 1629
},
{
"epoch": 0.9242109242109242,
"grad_norm": 1.9899101257324219,
"learning_rate": 4.741532229352756e-06,
"loss": 1.075,
"step": 1630
},
{
"epoch": 0.9247779247779248,
"grad_norm": 2.0984103679656982,
"learning_rate": 4.741200208139537e-06,
"loss": 1.1141,
"step": 1631
},
{
"epoch": 0.9253449253449253,
"grad_norm": 2.0580196380615234,
"learning_rate": 4.740867985450184e-06,
"loss": 1.0965,
"step": 1632
},
{
"epoch": 0.925911925911926,
"grad_norm": 2.0495893955230713,
"learning_rate": 4.740535561314562e-06,
"loss": 1.0752,
"step": 1633
},
{
"epoch": 0.9264789264789265,
"grad_norm": 1.908860445022583,
"learning_rate": 4.740202935762557e-06,
"loss": 1.072,
"step": 1634
},
{
"epoch": 0.927045927045927,
"grad_norm": 1.9863654375076294,
"learning_rate": 4.739870108824069e-06,
"loss": 1.0827,
"step": 1635
},
{
"epoch": 0.9276129276129276,
"grad_norm": 2.0442192554473877,
"learning_rate": 4.739537080529019e-06,
"loss": 1.1489,
"step": 1636
},
{
"epoch": 0.9281799281799282,
"grad_norm": 1.8727167844772339,
"learning_rate": 4.739203850907345e-06,
"loss": 1.1274,
"step": 1637
},
{
"epoch": 0.9287469287469288,
"grad_norm": 2.0538337230682373,
"learning_rate": 4.7388704199890025e-06,
"loss": 1.1224,
"step": 1638
},
{
"epoch": 0.9293139293139293,
"grad_norm": 1.9616420269012451,
"learning_rate": 4.738536787803967e-06,
"loss": 1.1123,
"step": 1639
},
{
"epoch": 0.9298809298809299,
"grad_norm": 2.009235143661499,
"learning_rate": 4.738202954382228e-06,
"loss": 1.0661,
"step": 1640
},
{
"epoch": 0.9304479304479304,
"grad_norm": 1.9128503799438477,
"learning_rate": 4.7378689197538005e-06,
"loss": 1.0612,
"step": 1641
},
{
"epoch": 0.931014931014931,
"grad_norm": 1.9708175659179688,
"learning_rate": 4.73753468394871e-06,
"loss": 1.122,
"step": 1642
},
{
"epoch": 0.9315819315819316,
"grad_norm": 1.7882028818130493,
"learning_rate": 4.737200246997004e-06,
"loss": 1.0251,
"step": 1643
},
{
"epoch": 0.9321489321489321,
"grad_norm": 1.9868979454040527,
"learning_rate": 4.7368656089287455e-06,
"loss": 1.1074,
"step": 1644
},
{
"epoch": 0.9327159327159327,
"grad_norm": 4.167366981506348,
"learning_rate": 4.73653076977402e-06,
"loss": 1.1029,
"step": 1645
},
{
"epoch": 0.9332829332829333,
"grad_norm": 2.0651445388793945,
"learning_rate": 4.736195729562928e-06,
"loss": 1.1035,
"step": 1646
},
{
"epoch": 0.9338499338499339,
"grad_norm": 1.9564183950424194,
"learning_rate": 4.735860488325586e-06,
"loss": 1.0094,
"step": 1647
},
{
"epoch": 0.9344169344169344,
"grad_norm": 2.0358612537384033,
"learning_rate": 4.7355250460921346e-06,
"loss": 1.123,
"step": 1648
},
{
"epoch": 0.934983934983935,
"grad_norm": 2.1947906017303467,
"learning_rate": 4.735189402892726e-06,
"loss": 1.1135,
"step": 1649
},
{
"epoch": 0.9355509355509356,
"grad_norm": 2.077989101409912,
"learning_rate": 4.734853558757534e-06,
"loss": 1.0527,
"step": 1650
},
{
"epoch": 0.9361179361179361,
"grad_norm": 1.989847183227539,
"learning_rate": 4.73451751371675e-06,
"loss": 1.0593,
"step": 1651
},
{
"epoch": 0.9366849366849367,
"grad_norm": 2.135556936264038,
"learning_rate": 4.734181267800584e-06,
"loss": 1.1115,
"step": 1652
},
{
"epoch": 0.9372519372519372,
"grad_norm": 2.0910916328430176,
"learning_rate": 4.733844821039263e-06,
"loss": 1.1552,
"step": 1653
},
{
"epoch": 0.9378189378189378,
"grad_norm": 2.14231276512146,
"learning_rate": 4.733508173463032e-06,
"loss": 1.1227,
"step": 1654
},
{
"epoch": 0.9383859383859384,
"grad_norm": 1.8958340883255005,
"learning_rate": 4.733171325102154e-06,
"loss": 1.0518,
"step": 1655
},
{
"epoch": 0.938952938952939,
"grad_norm": 2.190434694290161,
"learning_rate": 4.732834275986912e-06,
"loss": 1.0807,
"step": 1656
},
{
"epoch": 0.9395199395199395,
"grad_norm": 2.2391319274902344,
"learning_rate": 4.732497026147605e-06,
"loss": 1.1503,
"step": 1657
},
{
"epoch": 0.94008694008694,
"grad_norm": 2.0487253665924072,
"learning_rate": 4.732159575614549e-06,
"loss": 1.1004,
"step": 1658
},
{
"epoch": 0.9406539406539407,
"grad_norm": 2.263414144515991,
"learning_rate": 4.7318219244180816e-06,
"loss": 1.0249,
"step": 1659
},
{
"epoch": 0.9412209412209412,
"grad_norm": 2.0268044471740723,
"learning_rate": 4.731484072588556e-06,
"loss": 1.0823,
"step": 1660
},
{
"epoch": 0.9417879417879418,
"grad_norm": 2.096123695373535,
"learning_rate": 4.731146020156343e-06,
"loss": 1.0718,
"step": 1661
},
{
"epoch": 0.9423549423549423,
"grad_norm": 2.0111961364746094,
"learning_rate": 4.730807767151834e-06,
"loss": 1.092,
"step": 1662
},
{
"epoch": 0.9429219429219429,
"grad_norm": 2.0895767211914062,
"learning_rate": 4.730469313605435e-06,
"loss": 1.1478,
"step": 1663
},
{
"epoch": 0.9434889434889435,
"grad_norm": 1.9681237936019897,
"learning_rate": 4.730130659547573e-06,
"loss": 1.0627,
"step": 1664
},
{
"epoch": 0.9440559440559441,
"grad_norm": 2.094294786453247,
"learning_rate": 4.729791805008691e-06,
"loss": 1.1258,
"step": 1665
},
{
"epoch": 0.9446229446229446,
"grad_norm": 2.05210542678833,
"learning_rate": 4.729452750019252e-06,
"loss": 1.1241,
"step": 1666
},
{
"epoch": 0.9451899451899451,
"grad_norm": 1.9600073099136353,
"learning_rate": 4.729113494609735e-06,
"loss": 1.0742,
"step": 1667
},
{
"epoch": 0.9457569457569458,
"grad_norm": 1.8465203046798706,
"learning_rate": 4.728774038810638e-06,
"loss": 1.0268,
"step": 1668
},
{
"epoch": 0.9463239463239463,
"grad_norm": 1.9174362421035767,
"learning_rate": 4.728434382652477e-06,
"loss": 1.0674,
"step": 1669
},
{
"epoch": 0.9468909468909469,
"grad_norm": 2.0088138580322266,
"learning_rate": 4.728094526165786e-06,
"loss": 1.0755,
"step": 1670
},
{
"epoch": 0.9474579474579474,
"grad_norm": 1.899976372718811,
"learning_rate": 4.727754469381116e-06,
"loss": 1.0559,
"step": 1671
},
{
"epoch": 0.9480249480249481,
"grad_norm": 2.0253117084503174,
"learning_rate": 4.7274142123290386e-06,
"loss": 1.0736,
"step": 1672
},
{
"epoch": 0.9485919485919486,
"grad_norm": 2.2883076667785645,
"learning_rate": 4.72707375504014e-06,
"loss": 1.0642,
"step": 1673
},
{
"epoch": 0.9491589491589492,
"grad_norm": 2.1243669986724854,
"learning_rate": 4.726733097545028e-06,
"loss": 1.0961,
"step": 1674
},
{
"epoch": 0.9497259497259497,
"grad_norm": 2.0006613731384277,
"learning_rate": 4.726392239874325e-06,
"loss": 1.0739,
"step": 1675
},
{
"epoch": 0.9502929502929502,
"grad_norm": 2.0095293521881104,
"learning_rate": 4.726051182058673e-06,
"loss": 1.0438,
"step": 1676
},
{
"epoch": 0.9508599508599509,
"grad_norm": 2.0329766273498535,
"learning_rate": 4.725709924128733e-06,
"loss": 1.1254,
"step": 1677
},
{
"epoch": 0.9514269514269514,
"grad_norm": 2.016813278198242,
"learning_rate": 4.725368466115182e-06,
"loss": 1.0904,
"step": 1678
},
{
"epoch": 0.951993951993952,
"grad_norm": 1.9678806066513062,
"learning_rate": 4.725026808048716e-06,
"loss": 1.0721,
"step": 1679
},
{
"epoch": 0.9525609525609525,
"grad_norm": 1.994081735610962,
"learning_rate": 4.7246849499600485e-06,
"loss": 1.0723,
"step": 1680
},
{
"epoch": 0.9531279531279532,
"grad_norm": 2.020341396331787,
"learning_rate": 4.724342891879913e-06,
"loss": 1.0537,
"step": 1681
},
{
"epoch": 0.9536949536949537,
"grad_norm": 1.9112085103988647,
"learning_rate": 4.724000633839057e-06,
"loss": 1.0588,
"step": 1682
},
{
"epoch": 0.9542619542619543,
"grad_norm": 1.9430484771728516,
"learning_rate": 4.723658175868251e-06,
"loss": 1.0341,
"step": 1683
},
{
"epoch": 0.9548289548289548,
"grad_norm": 1.9295274019241333,
"learning_rate": 4.723315517998278e-06,
"loss": 1.1012,
"step": 1684
},
{
"epoch": 0.9553959553959553,
"grad_norm": 1.913633108139038,
"learning_rate": 4.722972660259944e-06,
"loss": 1.1075,
"step": 1685
},
{
"epoch": 0.955962955962956,
"grad_norm": 2.0695457458496094,
"learning_rate": 4.722629602684069e-06,
"loss": 1.0851,
"step": 1686
},
{
"epoch": 0.9565299565299565,
"grad_norm": 2.084043502807617,
"learning_rate": 4.722286345301494e-06,
"loss": 1.1,
"step": 1687
},
{
"epoch": 0.9570969570969571,
"grad_norm": 2.147137403488159,
"learning_rate": 4.721942888143076e-06,
"loss": 1.089,
"step": 1688
},
{
"epoch": 0.9576639576639576,
"grad_norm": 2.0710439682006836,
"learning_rate": 4.721599231239691e-06,
"loss": 1.0876,
"step": 1689
},
{
"epoch": 0.9582309582309583,
"grad_norm": 2.159060478210449,
"learning_rate": 4.721255374622231e-06,
"loss": 1.0908,
"step": 1690
},
{
"epoch": 0.9587979587979588,
"grad_norm": 2.000617742538452,
"learning_rate": 4.7209113183216105e-06,
"loss": 1.07,
"step": 1691
},
{
"epoch": 0.9593649593649594,
"grad_norm": 1.9861509799957275,
"learning_rate": 4.720567062368757e-06,
"loss": 1.0692,
"step": 1692
},
{
"epoch": 0.9599319599319599,
"grad_norm": 2.0842881202697754,
"learning_rate": 4.720222606794617e-06,
"loss": 1.1225,
"step": 1693
},
{
"epoch": 0.9604989604989606,
"grad_norm": 2.016493320465088,
"learning_rate": 4.719877951630158e-06,
"loss": 1.126,
"step": 1694
},
{
"epoch": 0.9610659610659611,
"grad_norm": 1.9814172983169556,
"learning_rate": 4.719533096906363e-06,
"loss": 1.0924,
"step": 1695
},
{
"epoch": 0.9616329616329616,
"grad_norm": 1.9399725198745728,
"learning_rate": 4.7191880426542306e-06,
"loss": 1.1072,
"step": 1696
},
{
"epoch": 0.9621999621999622,
"grad_norm": 1.9508792161941528,
"learning_rate": 4.718842788904784e-06,
"loss": 1.0957,
"step": 1697
},
{
"epoch": 0.9627669627669627,
"grad_norm": 1.943466305732727,
"learning_rate": 4.718497335689057e-06,
"loss": 1.1059,
"step": 1698
},
{
"epoch": 0.9633339633339634,
"grad_norm": 2.0598337650299072,
"learning_rate": 4.7181516830381065e-06,
"loss": 1.1545,
"step": 1699
},
{
"epoch": 0.9639009639009639,
"grad_norm": 2.0156242847442627,
"learning_rate": 4.717805830983005e-06,
"loss": 1.1019,
"step": 1700
},
{
"epoch": 0.9644679644679645,
"grad_norm": 1.872069239616394,
"learning_rate": 4.717459779554843e-06,
"loss": 1.1587,
"step": 1701
},
{
"epoch": 0.965034965034965,
"grad_norm": 1.9101303815841675,
"learning_rate": 4.7171135287847295e-06,
"loss": 1.0322,
"step": 1702
},
{
"epoch": 0.9656019656019657,
"grad_norm": 2.0382840633392334,
"learning_rate": 4.716767078703793e-06,
"loss": 1.0802,
"step": 1703
},
{
"epoch": 0.9661689661689662,
"grad_norm": 2.093435049057007,
"learning_rate": 4.716420429343175e-06,
"loss": 1.1195,
"step": 1704
},
{
"epoch": 0.9667359667359667,
"grad_norm": 1.9311339855194092,
"learning_rate": 4.7160735807340395e-06,
"loss": 1.0978,
"step": 1705
},
{
"epoch": 0.9673029673029673,
"grad_norm": 1.9358426332473755,
"learning_rate": 4.7157265329075675e-06,
"loss": 1.0104,
"step": 1706
},
{
"epoch": 0.9678699678699678,
"grad_norm": 2.0609984397888184,
"learning_rate": 4.715379285894957e-06,
"loss": 1.0827,
"step": 1707
},
{
"epoch": 0.9684369684369685,
"grad_norm": 2.046546697616577,
"learning_rate": 4.715031839727424e-06,
"loss": 1.1349,
"step": 1708
},
{
"epoch": 0.969003969003969,
"grad_norm": 1.9720268249511719,
"learning_rate": 4.714684194436204e-06,
"loss": 1.0625,
"step": 1709
},
{
"epoch": 0.9695709695709696,
"grad_norm": 2.13085675239563,
"learning_rate": 4.714336350052547e-06,
"loss": 1.075,
"step": 1710
},
{
"epoch": 0.9701379701379701,
"grad_norm": 2.105146884918213,
"learning_rate": 4.713988306607726e-06,
"loss": 1.12,
"step": 1711
},
{
"epoch": 0.9707049707049707,
"grad_norm": 2.1501526832580566,
"learning_rate": 4.7136400641330245e-06,
"loss": 1.0891,
"step": 1712
},
{
"epoch": 0.9712719712719713,
"grad_norm": 2.1131129264831543,
"learning_rate": 4.713291622659753e-06,
"loss": 1.1084,
"step": 1713
},
{
"epoch": 0.9718389718389718,
"grad_norm": 2.1475110054016113,
"learning_rate": 4.712942982219232e-06,
"loss": 1.1547,
"step": 1714
},
{
"epoch": 0.9724059724059724,
"grad_norm": 2.140531301498413,
"learning_rate": 4.712594142842804e-06,
"loss": 1.1253,
"step": 1715
},
{
"epoch": 0.972972972972973,
"grad_norm": 1.925569772720337,
"learning_rate": 4.712245104561829e-06,
"loss": 1.0605,
"step": 1716
},
{
"epoch": 0.9735399735399736,
"grad_norm": 2.097440242767334,
"learning_rate": 4.711895867407684e-06,
"loss": 1.0883,
"step": 1717
},
{
"epoch": 0.9741069741069741,
"grad_norm": 2.093440055847168,
"learning_rate": 4.711546431411763e-06,
"loss": 1.07,
"step": 1718
},
{
"epoch": 0.9746739746739747,
"grad_norm": 1.9718364477157593,
"learning_rate": 4.711196796605482e-06,
"loss": 1.0653,
"step": 1719
},
{
"epoch": 0.9752409752409752,
"grad_norm": 1.944761872291565,
"learning_rate": 4.710846963020268e-06,
"loss": 0.9752,
"step": 1720
},
{
"epoch": 0.9758079758079758,
"grad_norm": 2.169654130935669,
"learning_rate": 4.710496930687574e-06,
"loss": 1.123,
"step": 1721
},
{
"epoch": 0.9763749763749764,
"grad_norm": 2.033576011657715,
"learning_rate": 4.710146699638864e-06,
"loss": 1.0782,
"step": 1722
},
{
"epoch": 0.9769419769419769,
"grad_norm": 1.983486533164978,
"learning_rate": 4.709796269905622e-06,
"loss": 1.0464,
"step": 1723
},
{
"epoch": 0.9775089775089775,
"grad_norm": 2.1388914585113525,
"learning_rate": 4.709445641519352e-06,
"loss": 1.0805,
"step": 1724
},
{
"epoch": 0.9780759780759781,
"grad_norm": 2.0662407875061035,
"learning_rate": 4.709094814511574e-06,
"loss": 1.0845,
"step": 1725
},
{
"epoch": 0.9786429786429787,
"grad_norm": 5.486571788787842,
"learning_rate": 4.708743788913827e-06,
"loss": 1.0293,
"step": 1726
},
{
"epoch": 0.9792099792099792,
"grad_norm": 2.0278141498565674,
"learning_rate": 4.708392564757665e-06,
"loss": 1.1524,
"step": 1727
},
{
"epoch": 0.9797769797769798,
"grad_norm": 2.124642848968506,
"learning_rate": 4.708041142074664e-06,
"loss": 1.1332,
"step": 1728
},
{
"epoch": 0.9803439803439803,
"grad_norm": 2.006274938583374,
"learning_rate": 4.707689520896413e-06,
"loss": 1.1231,
"step": 1729
},
{
"epoch": 0.980910980910981,
"grad_norm": 1.7690280675888062,
"learning_rate": 4.707337701254524e-06,
"loss": 1.0224,
"step": 1730
},
{
"epoch": 0.9814779814779815,
"grad_norm": 2.001707077026367,
"learning_rate": 4.706985683180624e-06,
"loss": 1.0755,
"step": 1731
},
{
"epoch": 0.982044982044982,
"grad_norm": 1.9623841047286987,
"learning_rate": 4.706633466706356e-06,
"loss": 1.0818,
"step": 1732
},
{
"epoch": 0.9826119826119826,
"grad_norm": 1.9035817384719849,
"learning_rate": 4.706281051863386e-06,
"loss": 1.1194,
"step": 1733
},
{
"epoch": 0.9831789831789832,
"grad_norm": 1.875132441520691,
"learning_rate": 4.705928438683394e-06,
"loss": 1.0762,
"step": 1734
},
{
"epoch": 0.9837459837459838,
"grad_norm": 1.8791394233703613,
"learning_rate": 4.705575627198077e-06,
"loss": 1.0526,
"step": 1735
},
{
"epoch": 0.9843129843129843,
"grad_norm": 1.932252049446106,
"learning_rate": 4.705222617439152e-06,
"loss": 1.0841,
"step": 1736
},
{
"epoch": 0.9848799848799848,
"grad_norm": 1.9309083223342896,
"learning_rate": 4.7048694094383564e-06,
"loss": 1.1053,
"step": 1737
},
{
"epoch": 0.9854469854469855,
"grad_norm": 2.0522756576538086,
"learning_rate": 4.704516003227439e-06,
"loss": 1.0378,
"step": 1738
},
{
"epoch": 0.986013986013986,
"grad_norm": 2.725698232650757,
"learning_rate": 4.70416239883817e-06,
"loss": 1.0885,
"step": 1739
},
{
"epoch": 0.9865809865809866,
"grad_norm": 1.9060684442520142,
"learning_rate": 4.703808596302339e-06,
"loss": 1.0453,
"step": 1740
},
{
"epoch": 0.9871479871479871,
"grad_norm": 2.087420701980591,
"learning_rate": 4.703454595651752e-06,
"loss": 1.0884,
"step": 1741
},
{
"epoch": 0.9877149877149877,
"grad_norm": 1.921978235244751,
"learning_rate": 4.7031003969182295e-06,
"loss": 1.0927,
"step": 1742
},
{
"epoch": 0.9882819882819883,
"grad_norm": 2.0271310806274414,
"learning_rate": 4.702746000133614e-06,
"loss": 1.1026,
"step": 1743
},
{
"epoch": 0.9888489888489889,
"grad_norm": 1.9672328233718872,
"learning_rate": 4.702391405329766e-06,
"loss": 1.018,
"step": 1744
},
{
"epoch": 0.9894159894159894,
"grad_norm": 1.9041637182235718,
"learning_rate": 4.702036612538562e-06,
"loss": 1.078,
"step": 1745
},
{
"epoch": 0.98998298998299,
"grad_norm": 2.149569034576416,
"learning_rate": 4.701681621791895e-06,
"loss": 1.0468,
"step": 1746
},
{
"epoch": 0.9905499905499906,
"grad_norm": 2.0086848735809326,
"learning_rate": 4.701326433121678e-06,
"loss": 1.1154,
"step": 1747
},
{
"epoch": 0.9911169911169911,
"grad_norm": 2.129842519760132,
"learning_rate": 4.700971046559842e-06,
"loss": 1.0542,
"step": 1748
},
{
"epoch": 0.9916839916839917,
"grad_norm": 2.165759801864624,
"learning_rate": 4.700615462138334e-06,
"loss": 1.1032,
"step": 1749
},
{
"epoch": 0.9922509922509922,
"grad_norm": 2.163757801055908,
"learning_rate": 4.700259679889122e-06,
"loss": 1.0914,
"step": 1750
},
{
"epoch": 0.9928179928179928,
"grad_norm": 2.3616926670074463,
"learning_rate": 4.699903699844186e-06,
"loss": 1.0627,
"step": 1751
},
{
"epoch": 0.9933849933849934,
"grad_norm": 1.9337599277496338,
"learning_rate": 4.69954752203553e-06,
"loss": 1.1096,
"step": 1752
},
{
"epoch": 0.993951993951994,
"grad_norm": 1.9729043245315552,
"learning_rate": 4.699191146495174e-06,
"loss": 1.0626,
"step": 1753
},
{
"epoch": 0.9945189945189945,
"grad_norm": 1.9833909273147583,
"learning_rate": 4.698834573255152e-06,
"loss": 1.1127,
"step": 1754
},
{
"epoch": 0.995085995085995,
"grad_norm": 2.1493887901306152,
"learning_rate": 4.69847780234752e-06,
"loss": 1.0547,
"step": 1755
},
{
"epoch": 0.9956529956529957,
"grad_norm": 2.0263702869415283,
"learning_rate": 4.698120833804352e-06,
"loss": 1.0631,
"step": 1756
},
{
"epoch": 0.9962199962199962,
"grad_norm": 1.9223586320877075,
"learning_rate": 4.697763667657737e-06,
"loss": 1.1226,
"step": 1757
},
{
"epoch": 0.9967869967869968,
"grad_norm": 2.044142961502075,
"learning_rate": 4.697406303939781e-06,
"loss": 1.0361,
"step": 1758
},
{
"epoch": 0.9973539973539973,
"grad_norm": 1.9353723526000977,
"learning_rate": 4.697048742682613e-06,
"loss": 1.0546,
"step": 1759
},
{
"epoch": 0.997920997920998,
"grad_norm": 3.121025800704956,
"learning_rate": 4.696690983918375e-06,
"loss": 1.0101,
"step": 1760
},
{
"epoch": 0.9984879984879985,
"grad_norm": 2.016129732131958,
"learning_rate": 4.696333027679229e-06,
"loss": 1.0542,
"step": 1761
},
{
"epoch": 0.9990549990549991,
"grad_norm": 2.0094079971313477,
"learning_rate": 4.695974873997352e-06,
"loss": 1.0867,
"step": 1762
},
{
"epoch": 0.9996219996219996,
"grad_norm": 1.9897685050964355,
"learning_rate": 4.695616522904943e-06,
"loss": 1.1321,
"step": 1763
}
],
"logging_steps": 1,
"max_steps": 10578,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 1763,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.453402696388444e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}