HT-phase_scale-Phi-20k-phase2 / trainer_state.json
ShourenWSR's picture
Upload folder using huggingface_hub
132da5d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 972,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003092783505154639,
"grad_norm": 0.7220329009380105,
"learning_rate": 0.0,
"loss": 0.4422,
"step": 1
},
{
"epoch": 0.006185567010309278,
"grad_norm": 0.7035020283112076,
"learning_rate": 1.0204081632653061e-07,
"loss": 0.4832,
"step": 2
},
{
"epoch": 0.009278350515463918,
"grad_norm": 0.8737722015358887,
"learning_rate": 2.0408163265306121e-07,
"loss": 0.4732,
"step": 3
},
{
"epoch": 0.012371134020618556,
"grad_norm": 0.7779243721957012,
"learning_rate": 3.0612244897959183e-07,
"loss": 0.4737,
"step": 4
},
{
"epoch": 0.015463917525773196,
"grad_norm": 0.7329444808774586,
"learning_rate": 4.0816326530612243e-07,
"loss": 0.4426,
"step": 5
},
{
"epoch": 0.018556701030927835,
"grad_norm": 0.7158028965170649,
"learning_rate": 5.102040816326531e-07,
"loss": 0.4941,
"step": 6
},
{
"epoch": 0.021649484536082474,
"grad_norm": 0.6888075254399494,
"learning_rate": 6.122448979591837e-07,
"loss": 0.4111,
"step": 7
},
{
"epoch": 0.024742268041237112,
"grad_norm": 0.7427508164973907,
"learning_rate": 7.142857142857143e-07,
"loss": 0.5104,
"step": 8
},
{
"epoch": 0.027835051546391754,
"grad_norm": 0.6277972002358595,
"learning_rate": 8.163265306122449e-07,
"loss": 0.4448,
"step": 9
},
{
"epoch": 0.030927835051546393,
"grad_norm": 0.6615474642249266,
"learning_rate": 9.183673469387756e-07,
"loss": 0.4766,
"step": 10
},
{
"epoch": 0.03402061855670103,
"grad_norm": 0.6267344916879954,
"learning_rate": 1.0204081632653063e-06,
"loss": 0.4347,
"step": 11
},
{
"epoch": 0.03711340206185567,
"grad_norm": 1.1775850456355763,
"learning_rate": 1.122448979591837e-06,
"loss": 0.5167,
"step": 12
},
{
"epoch": 0.04020618556701031,
"grad_norm": 0.6941660855089555,
"learning_rate": 1.2244897959183673e-06,
"loss": 0.4782,
"step": 13
},
{
"epoch": 0.04329896907216495,
"grad_norm": 0.7691560992409402,
"learning_rate": 1.3265306122448982e-06,
"loss": 0.4624,
"step": 14
},
{
"epoch": 0.04639175257731959,
"grad_norm": 0.6661388733908383,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.4674,
"step": 15
},
{
"epoch": 0.049484536082474224,
"grad_norm": 0.5788791202729546,
"learning_rate": 1.5306122448979593e-06,
"loss": 0.4704,
"step": 16
},
{
"epoch": 0.05257731958762887,
"grad_norm": 0.5025256108670897,
"learning_rate": 1.6326530612244897e-06,
"loss": 0.4672,
"step": 17
},
{
"epoch": 0.05567010309278351,
"grad_norm": 1.2671386866538892,
"learning_rate": 1.7346938775510206e-06,
"loss": 0.4322,
"step": 18
},
{
"epoch": 0.058762886597938144,
"grad_norm": 0.43553685659977487,
"learning_rate": 1.8367346938775512e-06,
"loss": 0.4041,
"step": 19
},
{
"epoch": 0.061855670103092786,
"grad_norm": 0.5729928987299671,
"learning_rate": 1.938775510204082e-06,
"loss": 0.5054,
"step": 20
},
{
"epoch": 0.06494845360824743,
"grad_norm": 0.6645449546643061,
"learning_rate": 2.0408163265306125e-06,
"loss": 0.438,
"step": 21
},
{
"epoch": 0.06804123711340206,
"grad_norm": 0.5176445382179179,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.4382,
"step": 22
},
{
"epoch": 0.0711340206185567,
"grad_norm": 0.5262967794748546,
"learning_rate": 2.244897959183674e-06,
"loss": 0.4205,
"step": 23
},
{
"epoch": 0.07422680412371134,
"grad_norm": 0.7267621375796322,
"learning_rate": 2.3469387755102044e-06,
"loss": 0.5079,
"step": 24
},
{
"epoch": 0.07731958762886598,
"grad_norm": 0.4252212798888888,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.4396,
"step": 25
},
{
"epoch": 0.08041237113402062,
"grad_norm": 0.4536131271915926,
"learning_rate": 2.5510204081632657e-06,
"loss": 0.4384,
"step": 26
},
{
"epoch": 0.08350515463917525,
"grad_norm": 0.45116014661712117,
"learning_rate": 2.6530612244897964e-06,
"loss": 0.4402,
"step": 27
},
{
"epoch": 0.0865979381443299,
"grad_norm": 0.4568728052221468,
"learning_rate": 2.7551020408163266e-06,
"loss": 0.4648,
"step": 28
},
{
"epoch": 0.08969072164948454,
"grad_norm": 0.43137876910461337,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.4402,
"step": 29
},
{
"epoch": 0.09278350515463918,
"grad_norm": 0.4211788965935997,
"learning_rate": 2.959183673469388e-06,
"loss": 0.4229,
"step": 30
},
{
"epoch": 0.09587628865979382,
"grad_norm": 0.4337448222173674,
"learning_rate": 3.0612244897959185e-06,
"loss": 0.4128,
"step": 31
},
{
"epoch": 0.09896907216494845,
"grad_norm": 0.3729189572852763,
"learning_rate": 3.1632653061224496e-06,
"loss": 0.4159,
"step": 32
},
{
"epoch": 0.10206185567010309,
"grad_norm": 0.4231908442749754,
"learning_rate": 3.2653061224489794e-06,
"loss": 0.431,
"step": 33
},
{
"epoch": 0.10515463917525773,
"grad_norm": 0.409077818435389,
"learning_rate": 3.3673469387755105e-06,
"loss": 0.4525,
"step": 34
},
{
"epoch": 0.10824742268041238,
"grad_norm": 0.34021413683338564,
"learning_rate": 3.469387755102041e-06,
"loss": 0.4116,
"step": 35
},
{
"epoch": 0.11134020618556702,
"grad_norm": 0.3591499421696542,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.4299,
"step": 36
},
{
"epoch": 0.11443298969072165,
"grad_norm": 0.4659218552375075,
"learning_rate": 3.6734693877551024e-06,
"loss": 0.4215,
"step": 37
},
{
"epoch": 0.11752577319587629,
"grad_norm": 0.36553772280054864,
"learning_rate": 3.7755102040816327e-06,
"loss": 0.4367,
"step": 38
},
{
"epoch": 0.12061855670103093,
"grad_norm": 0.4224042486029835,
"learning_rate": 3.877551020408164e-06,
"loss": 0.4438,
"step": 39
},
{
"epoch": 0.12371134020618557,
"grad_norm": 0.3971996014459553,
"learning_rate": 3.979591836734694e-06,
"loss": 0.4487,
"step": 40
},
{
"epoch": 0.1268041237113402,
"grad_norm": 0.35284570413422267,
"learning_rate": 4.081632653061225e-06,
"loss": 0.4039,
"step": 41
},
{
"epoch": 0.12989690721649486,
"grad_norm": 0.4493870470268664,
"learning_rate": 4.183673469387755e-06,
"loss": 0.472,
"step": 42
},
{
"epoch": 0.13298969072164948,
"grad_norm": 0.5342463347549146,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.4162,
"step": 43
},
{
"epoch": 0.1360824742268041,
"grad_norm": 0.4174496136137149,
"learning_rate": 4.3877551020408165e-06,
"loss": 0.4095,
"step": 44
},
{
"epoch": 0.13917525773195877,
"grad_norm": 0.4384141653564562,
"learning_rate": 4.489795918367348e-06,
"loss": 0.4054,
"step": 45
},
{
"epoch": 0.1422680412371134,
"grad_norm": 0.40013936895317725,
"learning_rate": 4.591836734693878e-06,
"loss": 0.4541,
"step": 46
},
{
"epoch": 0.14536082474226805,
"grad_norm": 0.3480277328089883,
"learning_rate": 4.693877551020409e-06,
"loss": 0.4374,
"step": 47
},
{
"epoch": 0.14845360824742268,
"grad_norm": 0.42726168307574297,
"learning_rate": 4.795918367346939e-06,
"loss": 0.4039,
"step": 48
},
{
"epoch": 0.1515463917525773,
"grad_norm": 0.4538463173366489,
"learning_rate": 4.897959183673469e-06,
"loss": 0.4666,
"step": 49
},
{
"epoch": 0.15463917525773196,
"grad_norm": 0.40284353314825155,
"learning_rate": 5e-06,
"loss": 0.4622,
"step": 50
},
{
"epoch": 0.1577319587628866,
"grad_norm": 0.44910131131991515,
"learning_rate": 5.1020408163265315e-06,
"loss": 0.3969,
"step": 51
},
{
"epoch": 0.16082474226804125,
"grad_norm": 0.44134065441696,
"learning_rate": 5.204081632653062e-06,
"loss": 0.4969,
"step": 52
},
{
"epoch": 0.16391752577319588,
"grad_norm": 0.35918306588103044,
"learning_rate": 5.306122448979593e-06,
"loss": 0.424,
"step": 53
},
{
"epoch": 0.1670103092783505,
"grad_norm": 0.33710757989510826,
"learning_rate": 5.408163265306123e-06,
"loss": 0.4266,
"step": 54
},
{
"epoch": 0.17010309278350516,
"grad_norm": 0.434215578187321,
"learning_rate": 5.510204081632653e-06,
"loss": 0.4376,
"step": 55
},
{
"epoch": 0.1731958762886598,
"grad_norm": 0.4260116466909397,
"learning_rate": 5.6122448979591834e-06,
"loss": 0.4191,
"step": 56
},
{
"epoch": 0.17628865979381445,
"grad_norm": 0.559318489706675,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.447,
"step": 57
},
{
"epoch": 0.17938144329896907,
"grad_norm": 0.3906173126178787,
"learning_rate": 5.816326530612246e-06,
"loss": 0.4225,
"step": 58
},
{
"epoch": 0.1824742268041237,
"grad_norm": 0.4742346991841608,
"learning_rate": 5.918367346938776e-06,
"loss": 0.4458,
"step": 59
},
{
"epoch": 0.18556701030927836,
"grad_norm": 0.4123045030138927,
"learning_rate": 6.020408163265307e-06,
"loss": 0.4497,
"step": 60
},
{
"epoch": 0.18865979381443299,
"grad_norm": 0.35048792524854255,
"learning_rate": 6.122448979591837e-06,
"loss": 0.4158,
"step": 61
},
{
"epoch": 0.19175257731958764,
"grad_norm": 0.4169124872120747,
"learning_rate": 6.224489795918368e-06,
"loss": 0.4283,
"step": 62
},
{
"epoch": 0.19484536082474227,
"grad_norm": 0.41029713183541644,
"learning_rate": 6.326530612244899e-06,
"loss": 0.4246,
"step": 63
},
{
"epoch": 0.1979381443298969,
"grad_norm": 0.5337463704313201,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.4433,
"step": 64
},
{
"epoch": 0.20103092783505155,
"grad_norm": 0.3933430581995637,
"learning_rate": 6.530612244897959e-06,
"loss": 0.3966,
"step": 65
},
{
"epoch": 0.20412371134020618,
"grad_norm": 0.4582694378054699,
"learning_rate": 6.63265306122449e-06,
"loss": 0.4432,
"step": 66
},
{
"epoch": 0.20721649484536084,
"grad_norm": 0.48765468978039683,
"learning_rate": 6.734693877551021e-06,
"loss": 0.4393,
"step": 67
},
{
"epoch": 0.21030927835051547,
"grad_norm": 0.39676839862520125,
"learning_rate": 6.836734693877551e-06,
"loss": 0.4623,
"step": 68
},
{
"epoch": 0.2134020618556701,
"grad_norm": 0.4482589398534172,
"learning_rate": 6.938775510204082e-06,
"loss": 0.4101,
"step": 69
},
{
"epoch": 0.21649484536082475,
"grad_norm": 0.3624149691888066,
"learning_rate": 7.0408163265306125e-06,
"loss": 0.4036,
"step": 70
},
{
"epoch": 0.21958762886597938,
"grad_norm": 0.4295048480379611,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.403,
"step": 71
},
{
"epoch": 0.22268041237113403,
"grad_norm": 0.39682111123734054,
"learning_rate": 7.244897959183675e-06,
"loss": 0.4229,
"step": 72
},
{
"epoch": 0.22577319587628866,
"grad_norm": 0.4154676798215689,
"learning_rate": 7.346938775510205e-06,
"loss": 0.4131,
"step": 73
},
{
"epoch": 0.2288659793814433,
"grad_norm": 0.3765403100349772,
"learning_rate": 7.448979591836736e-06,
"loss": 0.4517,
"step": 74
},
{
"epoch": 0.23195876288659795,
"grad_norm": 0.4768545909661036,
"learning_rate": 7.551020408163265e-06,
"loss": 0.417,
"step": 75
},
{
"epoch": 0.23505154639175257,
"grad_norm": 0.37769473717411983,
"learning_rate": 7.653061224489796e-06,
"loss": 0.4568,
"step": 76
},
{
"epoch": 0.2381443298969072,
"grad_norm": 0.38498631713602566,
"learning_rate": 7.755102040816327e-06,
"loss": 0.4334,
"step": 77
},
{
"epoch": 0.24123711340206186,
"grad_norm": 0.3174598983048248,
"learning_rate": 7.857142857142858e-06,
"loss": 0.3881,
"step": 78
},
{
"epoch": 0.2443298969072165,
"grad_norm": 0.36192435295437103,
"learning_rate": 7.959183673469388e-06,
"loss": 0.3982,
"step": 79
},
{
"epoch": 0.24742268041237114,
"grad_norm": 0.38008377420730255,
"learning_rate": 8.06122448979592e-06,
"loss": 0.4129,
"step": 80
},
{
"epoch": 0.25051546391752577,
"grad_norm": 0.37093712326106304,
"learning_rate": 8.16326530612245e-06,
"loss": 0.3955,
"step": 81
},
{
"epoch": 0.2536082474226804,
"grad_norm": 0.3245693253207308,
"learning_rate": 8.26530612244898e-06,
"loss": 0.3779,
"step": 82
},
{
"epoch": 0.256701030927835,
"grad_norm": 0.40484115388567915,
"learning_rate": 8.36734693877551e-06,
"loss": 0.4201,
"step": 83
},
{
"epoch": 0.2597938144329897,
"grad_norm": 0.45349807280259075,
"learning_rate": 8.469387755102042e-06,
"loss": 0.407,
"step": 84
},
{
"epoch": 0.26288659793814434,
"grad_norm": 0.5134134442864399,
"learning_rate": 8.571428571428571e-06,
"loss": 0.4503,
"step": 85
},
{
"epoch": 0.26597938144329897,
"grad_norm": 0.600117050646678,
"learning_rate": 8.673469387755103e-06,
"loss": 0.4274,
"step": 86
},
{
"epoch": 0.2690721649484536,
"grad_norm": 0.4164938359408504,
"learning_rate": 8.775510204081633e-06,
"loss": 0.4473,
"step": 87
},
{
"epoch": 0.2721649484536082,
"grad_norm": 0.3648277119395248,
"learning_rate": 8.877551020408163e-06,
"loss": 0.4435,
"step": 88
},
{
"epoch": 0.2752577319587629,
"grad_norm": 0.42584735229761017,
"learning_rate": 8.979591836734695e-06,
"loss": 0.4637,
"step": 89
},
{
"epoch": 0.27835051546391754,
"grad_norm": 0.4893392978187684,
"learning_rate": 9.081632653061225e-06,
"loss": 0.4608,
"step": 90
},
{
"epoch": 0.28144329896907216,
"grad_norm": 0.40017674990337454,
"learning_rate": 9.183673469387756e-06,
"loss": 0.4463,
"step": 91
},
{
"epoch": 0.2845360824742268,
"grad_norm": 0.3765261542973935,
"learning_rate": 9.285714285714288e-06,
"loss": 0.4219,
"step": 92
},
{
"epoch": 0.2876288659793814,
"grad_norm": 0.37518752775293795,
"learning_rate": 9.387755102040818e-06,
"loss": 0.4244,
"step": 93
},
{
"epoch": 0.2907216494845361,
"grad_norm": 0.4473276537814246,
"learning_rate": 9.489795918367348e-06,
"loss": 0.4572,
"step": 94
},
{
"epoch": 0.29381443298969073,
"grad_norm": 0.5287937607719012,
"learning_rate": 9.591836734693878e-06,
"loss": 0.4772,
"step": 95
},
{
"epoch": 0.29690721649484536,
"grad_norm": 0.43309575827009483,
"learning_rate": 9.693877551020408e-06,
"loss": 0.4366,
"step": 96
},
{
"epoch": 0.3,
"grad_norm": 0.3868727014101142,
"learning_rate": 9.795918367346939e-06,
"loss": 0.4408,
"step": 97
},
{
"epoch": 0.3030927835051546,
"grad_norm": 0.3840116443447955,
"learning_rate": 9.89795918367347e-06,
"loss": 0.3959,
"step": 98
},
{
"epoch": 0.3061855670103093,
"grad_norm": 0.3332314789253363,
"learning_rate": 1e-05,
"loss": 0.4098,
"step": 99
},
{
"epoch": 0.30927835051546393,
"grad_norm": 0.36075607939522497,
"learning_rate": 9.999967698966278e-06,
"loss": 0.4226,
"step": 100
},
{
"epoch": 0.31237113402061856,
"grad_norm": 0.4392150875918316,
"learning_rate": 9.999870796282452e-06,
"loss": 0.4338,
"step": 101
},
{
"epoch": 0.3154639175257732,
"grad_norm": 0.43798219013201406,
"learning_rate": 9.999709293200546e-06,
"loss": 0.4664,
"step": 102
},
{
"epoch": 0.3185567010309278,
"grad_norm": 0.3364109046254725,
"learning_rate": 9.999483191807245e-06,
"loss": 0.405,
"step": 103
},
{
"epoch": 0.3216494845360825,
"grad_norm": 0.4033274426634507,
"learning_rate": 9.999192495023873e-06,
"loss": 0.4612,
"step": 104
},
{
"epoch": 0.3247422680412371,
"grad_norm": 1.264002600498985,
"learning_rate": 9.998837206606355e-06,
"loss": 0.3965,
"step": 105
},
{
"epoch": 0.32783505154639175,
"grad_norm": 0.36417937925451055,
"learning_rate": 9.998417331145161e-06,
"loss": 0.4417,
"step": 106
},
{
"epoch": 0.3309278350515464,
"grad_norm": 0.4023715037416989,
"learning_rate": 9.997932874065259e-06,
"loss": 0.4406,
"step": 107
},
{
"epoch": 0.334020618556701,
"grad_norm": 0.42289969931350985,
"learning_rate": 9.99738384162603e-06,
"loss": 0.4116,
"step": 108
},
{
"epoch": 0.3371134020618557,
"grad_norm": 0.35713977786014356,
"learning_rate": 9.996770240921205e-06,
"loss": 0.4185,
"step": 109
},
{
"epoch": 0.3402061855670103,
"grad_norm": 0.3670739072941028,
"learning_rate": 9.996092079878757e-06,
"loss": 0.4022,
"step": 110
},
{
"epoch": 0.34329896907216495,
"grad_norm": 0.3870582401338966,
"learning_rate": 9.995349367260807e-06,
"loss": 0.4577,
"step": 111
},
{
"epoch": 0.3463917525773196,
"grad_norm": 0.32993992408348866,
"learning_rate": 9.994542112663507e-06,
"loss": 0.3801,
"step": 112
},
{
"epoch": 0.3494845360824742,
"grad_norm": 0.345855562846639,
"learning_rate": 9.993670326516924e-06,
"loss": 0.461,
"step": 113
},
{
"epoch": 0.3525773195876289,
"grad_norm": 0.4104212434960431,
"learning_rate": 9.992734020084892e-06,
"loss": 0.3999,
"step": 114
},
{
"epoch": 0.3556701030927835,
"grad_norm": 0.6634908671468979,
"learning_rate": 9.991733205464882e-06,
"loss": 0.4251,
"step": 115
},
{
"epoch": 0.35876288659793815,
"grad_norm": 0.2947924939452513,
"learning_rate": 9.990667895587827e-06,
"loss": 0.3913,
"step": 116
},
{
"epoch": 0.3618556701030928,
"grad_norm": 0.3478221537413288,
"learning_rate": 9.989538104217975e-06,
"loss": 0.3926,
"step": 117
},
{
"epoch": 0.3649484536082474,
"grad_norm": 0.3265206725118031,
"learning_rate": 9.988343845952697e-06,
"loss": 0.3703,
"step": 118
},
{
"epoch": 0.3680412371134021,
"grad_norm": 0.3720856553833873,
"learning_rate": 9.987085136222302e-06,
"loss": 0.3786,
"step": 119
},
{
"epoch": 0.3711340206185567,
"grad_norm": 0.43621656093852407,
"learning_rate": 9.985761991289841e-06,
"loss": 0.4315,
"step": 120
},
{
"epoch": 0.37422680412371134,
"grad_norm": 0.37899203918553465,
"learning_rate": 9.984374428250894e-06,
"loss": 0.3953,
"step": 121
},
{
"epoch": 0.37731958762886597,
"grad_norm": 0.3609003641884193,
"learning_rate": 9.98292246503335e-06,
"loss": 0.4319,
"step": 122
},
{
"epoch": 0.3804123711340206,
"grad_norm": 0.41971555363580315,
"learning_rate": 9.981406120397172e-06,
"loss": 0.4421,
"step": 123
},
{
"epoch": 0.3835051546391753,
"grad_norm": 0.3977990495669762,
"learning_rate": 9.979825413934162e-06,
"loss": 0.4188,
"step": 124
},
{
"epoch": 0.3865979381443299,
"grad_norm": 0.3987336298164563,
"learning_rate": 9.9781803660677e-06,
"loss": 0.4371,
"step": 125
},
{
"epoch": 0.38969072164948454,
"grad_norm": 0.35568515500000975,
"learning_rate": 9.976470998052484e-06,
"loss": 0.41,
"step": 126
},
{
"epoch": 0.39278350515463917,
"grad_norm": 0.468475729583225,
"learning_rate": 9.974697331974255e-06,
"loss": 0.4802,
"step": 127
},
{
"epoch": 0.3958762886597938,
"grad_norm": 0.442269906201004,
"learning_rate": 9.972859390749516e-06,
"loss": 0.4967,
"step": 128
},
{
"epoch": 0.3989690721649485,
"grad_norm": 0.424830726205854,
"learning_rate": 9.970957198125224e-06,
"loss": 0.4099,
"step": 129
},
{
"epoch": 0.4020618556701031,
"grad_norm": 0.3901734315345603,
"learning_rate": 9.968990778678493e-06,
"loss": 0.405,
"step": 130
},
{
"epoch": 0.40515463917525774,
"grad_norm": 0.38051479001704774,
"learning_rate": 9.966960157816279e-06,
"loss": 0.4329,
"step": 131
},
{
"epoch": 0.40824742268041236,
"grad_norm": 0.38980392969505734,
"learning_rate": 9.964865361775042e-06,
"loss": 0.4273,
"step": 132
},
{
"epoch": 0.411340206185567,
"grad_norm": 0.4082796222777848,
"learning_rate": 9.962706417620413e-06,
"loss": 0.4248,
"step": 133
},
{
"epoch": 0.4144329896907217,
"grad_norm": 0.3455050558847363,
"learning_rate": 9.960483353246843e-06,
"loss": 0.4163,
"step": 134
},
{
"epoch": 0.4175257731958763,
"grad_norm": 0.41108684679376056,
"learning_rate": 9.958196197377242e-06,
"loss": 0.4468,
"step": 135
},
{
"epoch": 0.42061855670103093,
"grad_norm": 0.3848356328809587,
"learning_rate": 9.95584497956261e-06,
"loss": 0.392,
"step": 136
},
{
"epoch": 0.42371134020618556,
"grad_norm": 0.3760869314548746,
"learning_rate": 9.953429730181653e-06,
"loss": 0.4666,
"step": 137
},
{
"epoch": 0.4268041237113402,
"grad_norm": 0.34588003209988005,
"learning_rate": 9.950950480440396e-06,
"loss": 0.4416,
"step": 138
},
{
"epoch": 0.4298969072164949,
"grad_norm": 0.3735210250651018,
"learning_rate": 9.948407262371764e-06,
"loss": 0.3973,
"step": 139
},
{
"epoch": 0.4329896907216495,
"grad_norm": 0.36920833160292676,
"learning_rate": 9.945800108835191e-06,
"loss": 0.4235,
"step": 140
},
{
"epoch": 0.43608247422680413,
"grad_norm": 0.4064849366706592,
"learning_rate": 9.943129053516176e-06,
"loss": 0.4391,
"step": 141
},
{
"epoch": 0.43917525773195876,
"grad_norm": 0.32274388016168765,
"learning_rate": 9.940394130925858e-06,
"loss": 0.401,
"step": 142
},
{
"epoch": 0.4422680412371134,
"grad_norm": 0.3514480510132677,
"learning_rate": 9.93759537640057e-06,
"loss": 0.4147,
"step": 143
},
{
"epoch": 0.44536082474226807,
"grad_norm": 0.3433080243743223,
"learning_rate": 9.934732826101378e-06,
"loss": 0.426,
"step": 144
},
{
"epoch": 0.4484536082474227,
"grad_norm": 0.43791164185777237,
"learning_rate": 9.931806517013612e-06,
"loss": 0.4596,
"step": 145
},
{
"epoch": 0.4515463917525773,
"grad_norm": 0.370180398918842,
"learning_rate": 9.928816486946398e-06,
"loss": 0.3997,
"step": 146
},
{
"epoch": 0.45463917525773195,
"grad_norm": 0.3272727921118669,
"learning_rate": 9.925762774532162e-06,
"loss": 0.4272,
"step": 147
},
{
"epoch": 0.4577319587628866,
"grad_norm": 0.4637056549954698,
"learning_rate": 9.922645419226128e-06,
"loss": 0.4718,
"step": 148
},
{
"epoch": 0.4608247422680412,
"grad_norm": 0.7294401083647661,
"learning_rate": 9.919464461305817e-06,
"loss": 0.416,
"step": 149
},
{
"epoch": 0.4639175257731959,
"grad_norm": 0.4103431625946844,
"learning_rate": 9.916219941870519e-06,
"loss": 0.418,
"step": 150
},
{
"epoch": 0.4670103092783505,
"grad_norm": 0.3519149712001119,
"learning_rate": 9.912911902840771e-06,
"loss": 0.4014,
"step": 151
},
{
"epoch": 0.47010309278350515,
"grad_norm": 0.41537482262548897,
"learning_rate": 9.909540386957801e-06,
"loss": 0.4462,
"step": 152
},
{
"epoch": 0.4731958762886598,
"grad_norm": 0.45138202485027723,
"learning_rate": 9.90610543778299e-06,
"loss": 0.4613,
"step": 153
},
{
"epoch": 0.4762886597938144,
"grad_norm": 0.7949633754886084,
"learning_rate": 9.9026070996973e-06,
"loss": 0.4492,
"step": 154
},
{
"epoch": 0.4793814432989691,
"grad_norm": 0.4084320907683219,
"learning_rate": 9.899045417900709e-06,
"loss": 0.4524,
"step": 155
},
{
"epoch": 0.4824742268041237,
"grad_norm": 0.3097088652783349,
"learning_rate": 9.895420438411616e-06,
"loss": 0.3947,
"step": 156
},
{
"epoch": 0.48556701030927835,
"grad_norm": 0.500006282123083,
"learning_rate": 9.891732208066254e-06,
"loss": 0.4033,
"step": 157
},
{
"epoch": 0.488659793814433,
"grad_norm": 0.3682651386422796,
"learning_rate": 9.887980774518085e-06,
"loss": 0.399,
"step": 158
},
{
"epoch": 0.4917525773195876,
"grad_norm": 0.39684906849089263,
"learning_rate": 9.884166186237185e-06,
"loss": 0.4304,
"step": 159
},
{
"epoch": 0.4948453608247423,
"grad_norm": 0.4519243281046066,
"learning_rate": 9.880288492509606e-06,
"loss": 0.48,
"step": 160
},
{
"epoch": 0.4979381443298969,
"grad_norm": 0.3524909210389552,
"learning_rate": 9.876347743436758e-06,
"loss": 0.3941,
"step": 161
},
{
"epoch": 0.5010309278350515,
"grad_norm": 0.4045327844967625,
"learning_rate": 9.872343989934747e-06,
"loss": 0.4384,
"step": 162
},
{
"epoch": 0.5041237113402062,
"grad_norm": 0.3717528508433992,
"learning_rate": 9.868277283733725e-06,
"loss": 0.4573,
"step": 163
},
{
"epoch": 0.5072164948453608,
"grad_norm": 0.4190213137059769,
"learning_rate": 9.864147677377218e-06,
"loss": 0.4404,
"step": 164
},
{
"epoch": 0.5103092783505154,
"grad_norm": 0.4186463041983309,
"learning_rate": 9.859955224221446e-06,
"loss": 0.4126,
"step": 165
},
{
"epoch": 0.51340206185567,
"grad_norm": 0.38404866212835087,
"learning_rate": 9.855699978434639e-06,
"loss": 0.449,
"step": 166
},
{
"epoch": 0.5164948453608248,
"grad_norm": 0.35653907687833253,
"learning_rate": 9.85138199499633e-06,
"loss": 0.3943,
"step": 167
},
{
"epoch": 0.5195876288659794,
"grad_norm": 0.42289422396642834,
"learning_rate": 9.847001329696653e-06,
"loss": 0.4413,
"step": 168
},
{
"epoch": 0.522680412371134,
"grad_norm": 0.40627594296304126,
"learning_rate": 9.842558039135612e-06,
"loss": 0.4244,
"step": 169
},
{
"epoch": 0.5257731958762887,
"grad_norm": 0.452599947941535,
"learning_rate": 9.838052180722362e-06,
"loss": 0.4264,
"step": 170
},
{
"epoch": 0.5288659793814433,
"grad_norm": 0.3882521773969115,
"learning_rate": 9.833483812674453e-06,
"loss": 0.4086,
"step": 171
},
{
"epoch": 0.5319587628865979,
"grad_norm": 0.31860129778076346,
"learning_rate": 9.828852994017091e-06,
"loss": 0.4264,
"step": 172
},
{
"epoch": 0.5350515463917526,
"grad_norm": 0.372117789676078,
"learning_rate": 9.82415978458237e-06,
"loss": 0.4023,
"step": 173
},
{
"epoch": 0.5381443298969072,
"grad_norm": 0.5036778726839901,
"learning_rate": 9.819404245008492e-06,
"loss": 0.3944,
"step": 174
},
{
"epoch": 0.5412371134020618,
"grad_norm": 0.3674447495317301,
"learning_rate": 9.814586436738998e-06,
"loss": 0.3923,
"step": 175
},
{
"epoch": 0.5443298969072164,
"grad_norm": 0.3731434238979774,
"learning_rate": 9.80970642202196e-06,
"loss": 0.4205,
"step": 176
},
{
"epoch": 0.5474226804123712,
"grad_norm": 0.41627568220258476,
"learning_rate": 9.80476426390919e-06,
"loss": 0.3717,
"step": 177
},
{
"epoch": 0.5505154639175258,
"grad_norm": 1.0211708857535742,
"learning_rate": 9.799760026255412e-06,
"loss": 0.418,
"step": 178
},
{
"epoch": 0.5536082474226804,
"grad_norm": 0.38193863125239336,
"learning_rate": 9.794693773717445e-06,
"loss": 0.4458,
"step": 179
},
{
"epoch": 0.5567010309278351,
"grad_norm": 0.47696480909517025,
"learning_rate": 9.789565571753368e-06,
"loss": 0.4631,
"step": 180
},
{
"epoch": 0.5597938144329897,
"grad_norm": 0.40610575547480393,
"learning_rate": 9.78437548662167e-06,
"loss": 0.4083,
"step": 181
},
{
"epoch": 0.5628865979381443,
"grad_norm": 0.3304252568122808,
"learning_rate": 9.779123585380398e-06,
"loss": 0.439,
"step": 182
},
{
"epoch": 0.565979381443299,
"grad_norm": 0.3272419825098461,
"learning_rate": 9.773809935886287e-06,
"loss": 0.3937,
"step": 183
},
{
"epoch": 0.5690721649484536,
"grad_norm": 0.3587831705215171,
"learning_rate": 9.768434606793884e-06,
"loss": 0.4018,
"step": 184
},
{
"epoch": 0.5721649484536082,
"grad_norm": 0.443615507107455,
"learning_rate": 9.762997667554666e-06,
"loss": 0.4092,
"step": 185
},
{
"epoch": 0.5752577319587628,
"grad_norm": 0.37884710599971766,
"learning_rate": 9.757499188416135e-06,
"loss": 0.3896,
"step": 186
},
{
"epoch": 0.5783505154639176,
"grad_norm": 0.3856243077642201,
"learning_rate": 9.751939240420916e-06,
"loss": 0.4444,
"step": 187
},
{
"epoch": 0.5814432989690722,
"grad_norm": 0.3871002878329792,
"learning_rate": 9.746317895405835e-06,
"loss": 0.4652,
"step": 188
},
{
"epoch": 0.5845360824742268,
"grad_norm": 0.40509810646221905,
"learning_rate": 9.740635226000994e-06,
"loss": 0.4397,
"step": 189
},
{
"epoch": 0.5876288659793815,
"grad_norm": 0.4160183357738651,
"learning_rate": 9.734891305628831e-06,
"loss": 0.4314,
"step": 190
},
{
"epoch": 0.5907216494845361,
"grad_norm": 0.43975201212553466,
"learning_rate": 9.729086208503174e-06,
"loss": 0.4575,
"step": 191
},
{
"epoch": 0.5938144329896907,
"grad_norm": 0.484435973107823,
"learning_rate": 9.723220009628278e-06,
"loss": 0.4178,
"step": 192
},
{
"epoch": 0.5969072164948453,
"grad_norm": 0.34862743794307083,
"learning_rate": 9.717292784797854e-06,
"loss": 0.3981,
"step": 193
},
{
"epoch": 0.6,
"grad_norm": 0.37978393661957066,
"learning_rate": 9.711304610594104e-06,
"loss": 0.4521,
"step": 194
},
{
"epoch": 0.6030927835051546,
"grad_norm": 0.3542607634854072,
"learning_rate": 9.70525556438671e-06,
"loss": 0.434,
"step": 195
},
{
"epoch": 0.6061855670103092,
"grad_norm": 0.3507367379175751,
"learning_rate": 9.699145724331851e-06,
"loss": 0.4462,
"step": 196
},
{
"epoch": 0.6092783505154639,
"grad_norm": 0.38846079113543686,
"learning_rate": 9.692975169371189e-06,
"loss": 0.4112,
"step": 197
},
{
"epoch": 0.6123711340206186,
"grad_norm": 0.3761006338373534,
"learning_rate": 9.686743979230844e-06,
"loss": 0.4542,
"step": 198
},
{
"epoch": 0.6154639175257732,
"grad_norm": 0.37798748643860347,
"learning_rate": 9.68045223442037e-06,
"loss": 0.3916,
"step": 199
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.3762540406851515,
"learning_rate": 9.67410001623171e-06,
"loss": 0.4026,
"step": 200
},
{
"epoch": 0.6216494845360825,
"grad_norm": 0.4064096939338082,
"learning_rate": 9.66768740673815e-06,
"loss": 0.4475,
"step": 201
},
{
"epoch": 0.6247422680412371,
"grad_norm": 0.40087767969653776,
"learning_rate": 9.661214488793257e-06,
"loss": 0.4158,
"step": 202
},
{
"epoch": 0.6278350515463917,
"grad_norm": 0.3542486717999693,
"learning_rate": 9.654681346029809e-06,
"loss": 0.4163,
"step": 203
},
{
"epoch": 0.6309278350515464,
"grad_norm": 0.41164735599886176,
"learning_rate": 9.648088062858707e-06,
"loss": 0.4334,
"step": 204
},
{
"epoch": 0.634020618556701,
"grad_norm": 0.3750735185215337,
"learning_rate": 9.6414347244679e-06,
"loss": 0.3863,
"step": 205
},
{
"epoch": 0.6371134020618556,
"grad_norm": 0.4017716434148873,
"learning_rate": 9.63472141682127e-06,
"loss": 0.4541,
"step": 206
},
{
"epoch": 0.6402061855670103,
"grad_norm": 0.3482809621069407,
"learning_rate": 9.627948226657527e-06,
"loss": 0.4196,
"step": 207
},
{
"epoch": 0.643298969072165,
"grad_norm": 0.41831555099933715,
"learning_rate": 9.62111524148909e-06,
"loss": 0.4261,
"step": 208
},
{
"epoch": 0.6463917525773196,
"grad_norm": 0.3376522288825046,
"learning_rate": 9.61422254960095e-06,
"loss": 0.4191,
"step": 209
},
{
"epoch": 0.6494845360824743,
"grad_norm": 0.4086079442501632,
"learning_rate": 9.60727024004954e-06,
"loss": 0.4254,
"step": 210
},
{
"epoch": 0.6525773195876289,
"grad_norm": 0.3554579587789702,
"learning_rate": 9.60025840266157e-06,
"loss": 0.4514,
"step": 211
},
{
"epoch": 0.6556701030927835,
"grad_norm": 0.5797896520598668,
"learning_rate": 9.593187128032882e-06,
"loss": 0.4812,
"step": 212
},
{
"epoch": 0.6587628865979381,
"grad_norm": 0.3872271636430896,
"learning_rate": 9.586056507527266e-06,
"loss": 0.419,
"step": 213
},
{
"epoch": 0.6618556701030928,
"grad_norm": 0.38821488571945145,
"learning_rate": 9.578866633275289e-06,
"loss": 0.42,
"step": 214
},
{
"epoch": 0.6649484536082474,
"grad_norm": 0.4813854082108463,
"learning_rate": 9.571617598173097e-06,
"loss": 0.4325,
"step": 215
},
{
"epoch": 0.668041237113402,
"grad_norm": 0.3627936880248002,
"learning_rate": 9.564309495881221e-06,
"loss": 0.392,
"step": 216
},
{
"epoch": 0.6711340206185566,
"grad_norm": 0.37734221944263685,
"learning_rate": 9.556942420823368e-06,
"loss": 0.4246,
"step": 217
},
{
"epoch": 0.6742268041237114,
"grad_norm": 0.4004639252719525,
"learning_rate": 9.549516468185191e-06,
"loss": 0.412,
"step": 218
},
{
"epoch": 0.677319587628866,
"grad_norm": 0.4526596384689256,
"learning_rate": 9.542031733913069e-06,
"loss": 0.4792,
"step": 219
},
{
"epoch": 0.6804123711340206,
"grad_norm": 0.4390731617883788,
"learning_rate": 9.534488314712863e-06,
"loss": 0.4421,
"step": 220
},
{
"epoch": 0.6835051546391753,
"grad_norm": 0.4007337258521647,
"learning_rate": 9.52688630804867e-06,
"loss": 0.4404,
"step": 221
},
{
"epoch": 0.6865979381443299,
"grad_norm": 0.3915553551665069,
"learning_rate": 9.519225812141556e-06,
"loss": 0.393,
"step": 222
},
{
"epoch": 0.6896907216494845,
"grad_norm": 0.341094849828294,
"learning_rate": 9.511506925968302e-06,
"loss": 0.4145,
"step": 223
},
{
"epoch": 0.6927835051546392,
"grad_norm": 0.413391987710813,
"learning_rate": 9.503729749260101e-06,
"loss": 0.4449,
"step": 224
},
{
"epoch": 0.6958762886597938,
"grad_norm": 0.3876076479558053,
"learning_rate": 9.4958943825013e-06,
"loss": 0.43,
"step": 225
},
{
"epoch": 0.6989690721649484,
"grad_norm": 0.35907008562608367,
"learning_rate": 9.488000926928071e-06,
"loss": 0.4108,
"step": 226
},
{
"epoch": 0.702061855670103,
"grad_norm": 0.3561364856811235,
"learning_rate": 9.480049484527127e-06,
"loss": 0.4238,
"step": 227
},
{
"epoch": 0.7051546391752578,
"grad_norm": 0.47010210624724486,
"learning_rate": 9.472040158034392e-06,
"loss": 0.4209,
"step": 228
},
{
"epoch": 0.7082474226804124,
"grad_norm": 0.4301123700583335,
"learning_rate": 9.463973050933674e-06,
"loss": 0.3994,
"step": 229
},
{
"epoch": 0.711340206185567,
"grad_norm": 0.3909682894633138,
"learning_rate": 9.455848267455332e-06,
"loss": 0.4215,
"step": 230
},
{
"epoch": 0.7144329896907217,
"grad_norm": 0.34838625934979284,
"learning_rate": 9.44766591257493e-06,
"loss": 0.3846,
"step": 231
},
{
"epoch": 0.7175257731958763,
"grad_norm": 0.3280238808955969,
"learning_rate": 9.439426092011877e-06,
"loss": 0.3931,
"step": 232
},
{
"epoch": 0.7206185567010309,
"grad_norm": 0.35144130802487117,
"learning_rate": 9.43112891222806e-06,
"loss": 0.4096,
"step": 233
},
{
"epoch": 0.7237113402061855,
"grad_norm": 0.41596258564300653,
"learning_rate": 9.422774480426474e-06,
"loss": 0.4025,
"step": 234
},
{
"epoch": 0.7268041237113402,
"grad_norm": 0.3841630129084166,
"learning_rate": 9.414362904549829e-06,
"loss": 0.4532,
"step": 235
},
{
"epoch": 0.7298969072164948,
"grad_norm": 0.3578469578427432,
"learning_rate": 9.405894293279167e-06,
"loss": 0.4503,
"step": 236
},
{
"epoch": 0.7329896907216494,
"grad_norm": 0.38595777247388136,
"learning_rate": 9.397368756032445e-06,
"loss": 0.4066,
"step": 237
},
{
"epoch": 0.7360824742268042,
"grad_norm": 0.4335838878361591,
"learning_rate": 9.388786402963133e-06,
"loss": 0.4495,
"step": 238
},
{
"epoch": 0.7391752577319588,
"grad_norm": 0.33324163736582774,
"learning_rate": 9.380147344958778e-06,
"loss": 0.3915,
"step": 239
},
{
"epoch": 0.7422680412371134,
"grad_norm": 0.36773272717492805,
"learning_rate": 9.371451693639583e-06,
"loss": 0.4307,
"step": 240
},
{
"epoch": 0.7453608247422681,
"grad_norm": 0.40450476128913343,
"learning_rate": 9.362699561356957e-06,
"loss": 0.4256,
"step": 241
},
{
"epoch": 0.7484536082474227,
"grad_norm": 0.3908440757047333,
"learning_rate": 9.35389106119207e-06,
"loss": 0.4153,
"step": 242
},
{
"epoch": 0.7515463917525773,
"grad_norm": 0.3114817033772392,
"learning_rate": 9.345026306954385e-06,
"loss": 0.4125,
"step": 243
},
{
"epoch": 0.7546391752577319,
"grad_norm": 0.29706798256457034,
"learning_rate": 9.336105413180194e-06,
"loss": 0.4226,
"step": 244
},
{
"epoch": 0.7577319587628866,
"grad_norm": 0.4376146395225796,
"learning_rate": 9.32712849513113e-06,
"loss": 0.4331,
"step": 245
},
{
"epoch": 0.7608247422680412,
"grad_norm": 0.3899623219746875,
"learning_rate": 9.31809566879269e-06,
"loss": 0.4383,
"step": 246
},
{
"epoch": 0.7639175257731958,
"grad_norm": 0.34792340705670277,
"learning_rate": 9.309007050872722e-06,
"loss": 0.443,
"step": 247
},
{
"epoch": 0.7670103092783506,
"grad_norm": 0.40032908798049566,
"learning_rate": 9.299862758799929e-06,
"loss": 0.3943,
"step": 248
},
{
"epoch": 0.7701030927835052,
"grad_norm": 0.3646886635478996,
"learning_rate": 9.290662910722346e-06,
"loss": 0.4438,
"step": 249
},
{
"epoch": 0.7731958762886598,
"grad_norm": 0.48708708631155473,
"learning_rate": 9.281407625505813e-06,
"loss": 0.3921,
"step": 250
},
{
"epoch": 0.7762886597938145,
"grad_norm": 0.35131588651040985,
"learning_rate": 9.272097022732444e-06,
"loss": 0.3841,
"step": 251
},
{
"epoch": 0.7793814432989691,
"grad_norm": 0.3366070564480312,
"learning_rate": 9.262731222699073e-06,
"loss": 0.4244,
"step": 252
},
{
"epoch": 0.7824742268041237,
"grad_norm": 0.4059540639934267,
"learning_rate": 9.253310346415714e-06,
"loss": 0.3968,
"step": 253
},
{
"epoch": 0.7855670103092783,
"grad_norm": 0.3319429374771854,
"learning_rate": 9.24383451560398e-06,
"loss": 0.4001,
"step": 254
},
{
"epoch": 0.788659793814433,
"grad_norm": 0.3747214017804394,
"learning_rate": 9.234303852695526e-06,
"loss": 0.4079,
"step": 255
},
{
"epoch": 0.7917525773195876,
"grad_norm": 0.6094521128793228,
"learning_rate": 9.224718480830454e-06,
"loss": 0.4492,
"step": 256
},
{
"epoch": 0.7948453608247422,
"grad_norm": 0.40150812678403713,
"learning_rate": 9.215078523855736e-06,
"loss": 0.3971,
"step": 257
},
{
"epoch": 0.797938144329897,
"grad_norm": 0.33127062639306104,
"learning_rate": 9.205384106323602e-06,
"loss": 0.391,
"step": 258
},
{
"epoch": 0.8010309278350516,
"grad_norm": 0.3804077472064743,
"learning_rate": 9.195635353489932e-06,
"loss": 0.424,
"step": 259
},
{
"epoch": 0.8041237113402062,
"grad_norm": 0.4156186134532376,
"learning_rate": 9.185832391312644e-06,
"loss": 0.4239,
"step": 260
},
{
"epoch": 0.8072164948453608,
"grad_norm": 0.37494860050910456,
"learning_rate": 9.175975346450063e-06,
"loss": 0.4122,
"step": 261
},
{
"epoch": 0.8103092783505155,
"grad_norm": 0.37253235581305144,
"learning_rate": 9.166064346259288e-06,
"loss": 0.4636,
"step": 262
},
{
"epoch": 0.8134020618556701,
"grad_norm": 0.47925682852498785,
"learning_rate": 9.156099518794535e-06,
"loss": 0.4191,
"step": 263
},
{
"epoch": 0.8164948453608247,
"grad_norm": 0.38882262828447034,
"learning_rate": 9.146080992805497e-06,
"loss": 0.4743,
"step": 264
},
{
"epoch": 0.8195876288659794,
"grad_norm": 0.40517243481841286,
"learning_rate": 9.136008897735673e-06,
"loss": 0.4381,
"step": 265
},
{
"epoch": 0.822680412371134,
"grad_norm": 0.33944094329173813,
"learning_rate": 9.125883363720696e-06,
"loss": 0.4218,
"step": 266
},
{
"epoch": 0.8257731958762886,
"grad_norm": 0.3387360961611941,
"learning_rate": 9.11570452158665e-06,
"loss": 0.4187,
"step": 267
},
{
"epoch": 0.8288659793814434,
"grad_norm": 0.32535654522901664,
"learning_rate": 9.105472502848386e-06,
"loss": 0.425,
"step": 268
},
{
"epoch": 0.831958762886598,
"grad_norm": 0.4065234297081288,
"learning_rate": 9.095187439707817e-06,
"loss": 0.4338,
"step": 269
},
{
"epoch": 0.8350515463917526,
"grad_norm": 0.40157205206617597,
"learning_rate": 9.08484946505221e-06,
"loss": 0.4564,
"step": 270
},
{
"epoch": 0.8381443298969072,
"grad_norm": 0.3479048593946284,
"learning_rate": 9.074458712452476e-06,
"loss": 0.4339,
"step": 271
},
{
"epoch": 0.8412371134020619,
"grad_norm": 0.46307296193741704,
"learning_rate": 9.06401531616143e-06,
"loss": 0.4778,
"step": 272
},
{
"epoch": 0.8443298969072165,
"grad_norm": 0.35582479153588514,
"learning_rate": 9.053519411112075e-06,
"loss": 0.3822,
"step": 273
},
{
"epoch": 0.8474226804123711,
"grad_norm": 0.4376659890869369,
"learning_rate": 9.042971132915841e-06,
"loss": 0.3995,
"step": 274
},
{
"epoch": 0.8505154639175257,
"grad_norm": 0.3389338487984306,
"learning_rate": 9.032370617860844e-06,
"loss": 0.4204,
"step": 275
},
{
"epoch": 0.8536082474226804,
"grad_norm": 0.33150814648720384,
"learning_rate": 9.021718002910124e-06,
"loss": 0.396,
"step": 276
},
{
"epoch": 0.856701030927835,
"grad_norm": 0.3834437528783491,
"learning_rate": 9.011013425699868e-06,
"loss": 0.4362,
"step": 277
},
{
"epoch": 0.8597938144329897,
"grad_norm": 0.3942889058329744,
"learning_rate": 9.000257024537641e-06,
"loss": 0.4506,
"step": 278
},
{
"epoch": 0.8628865979381444,
"grad_norm": 0.6702145922259628,
"learning_rate": 8.989448938400596e-06,
"loss": 0.4833,
"step": 279
},
{
"epoch": 0.865979381443299,
"grad_norm": 0.31283468483751053,
"learning_rate": 8.978589306933672e-06,
"loss": 0.4072,
"step": 280
},
{
"epoch": 0.8690721649484536,
"grad_norm": 0.36632655743824466,
"learning_rate": 8.9676782704478e-06,
"loss": 0.4511,
"step": 281
},
{
"epoch": 0.8721649484536083,
"grad_norm": 0.3966023481535047,
"learning_rate": 8.95671596991808e-06,
"loss": 0.4306,
"step": 282
},
{
"epoch": 0.8752577319587629,
"grad_norm": 0.397005850506469,
"learning_rate": 8.94570254698197e-06,
"loss": 0.4489,
"step": 283
},
{
"epoch": 0.8783505154639175,
"grad_norm": 0.48150680327390494,
"learning_rate": 8.934638143937447e-06,
"loss": 0.4036,
"step": 284
},
{
"epoch": 0.8814432989690721,
"grad_norm": 0.42034692202567625,
"learning_rate": 8.923522903741173e-06,
"loss": 0.4246,
"step": 285
},
{
"epoch": 0.8845360824742268,
"grad_norm": 0.34587961383464905,
"learning_rate": 8.91235697000665e-06,
"loss": 0.3852,
"step": 286
},
{
"epoch": 0.8876288659793814,
"grad_norm": 0.4895986957921685,
"learning_rate": 8.901140487002358e-06,
"loss": 0.4635,
"step": 287
},
{
"epoch": 0.8907216494845361,
"grad_norm": 0.35487167407743136,
"learning_rate": 8.889873599649893e-06,
"loss": 0.3973,
"step": 288
},
{
"epoch": 0.8938144329896908,
"grad_norm": 0.37736951311343125,
"learning_rate": 8.8785564535221e-06,
"loss": 0.4081,
"step": 289
},
{
"epoch": 0.8969072164948454,
"grad_norm": 0.3846700038419489,
"learning_rate": 8.867189194841187e-06,
"loss": 0.4881,
"step": 290
},
{
"epoch": 0.9,
"grad_norm": 0.4429103421627471,
"learning_rate": 8.855771970476834e-06,
"loss": 0.4551,
"step": 291
},
{
"epoch": 0.9030927835051547,
"grad_norm": 0.32825849474711527,
"learning_rate": 8.844304927944304e-06,
"loss": 0.3956,
"step": 292
},
{
"epoch": 0.9061855670103093,
"grad_norm": 0.3304508041662356,
"learning_rate": 8.832788215402527e-06,
"loss": 0.3972,
"step": 293
},
{
"epoch": 0.9092783505154639,
"grad_norm": 0.4344446557575567,
"learning_rate": 8.821221981652189e-06,
"loss": 0.4141,
"step": 294
},
{
"epoch": 0.9123711340206185,
"grad_norm": 0.32272845920125925,
"learning_rate": 8.809606376133814e-06,
"loss": 0.4501,
"step": 295
},
{
"epoch": 0.9154639175257732,
"grad_norm": 0.3453561017057089,
"learning_rate": 8.79794154892583e-06,
"loss": 0.4051,
"step": 296
},
{
"epoch": 0.9185567010309278,
"grad_norm": 0.340213017334916,
"learning_rate": 8.786227650742624e-06,
"loss": 0.4144,
"step": 297
},
{
"epoch": 0.9216494845360824,
"grad_norm": 0.32610289859497454,
"learning_rate": 8.774464832932609e-06,
"loss": 0.4105,
"step": 298
},
{
"epoch": 0.9247422680412372,
"grad_norm": 0.3382705441690299,
"learning_rate": 8.762653247476249e-06,
"loss": 0.4359,
"step": 299
},
{
"epoch": 0.9278350515463918,
"grad_norm": 0.3373010065525689,
"learning_rate": 8.750793046984118e-06,
"loss": 0.3884,
"step": 300
},
{
"epoch": 0.9309278350515464,
"grad_norm": 0.4147951347719845,
"learning_rate": 8.738884384694905e-06,
"loss": 0.4298,
"step": 301
},
{
"epoch": 0.934020618556701,
"grad_norm": 0.3312432357311152,
"learning_rate": 8.726927414473457e-06,
"loss": 0.4053,
"step": 302
},
{
"epoch": 0.9371134020618557,
"grad_norm": 0.4627090495345429,
"learning_rate": 8.714922290808766e-06,
"loss": 0.4279,
"step": 303
},
{
"epoch": 0.9402061855670103,
"grad_norm": 0.3896335087463135,
"learning_rate": 8.702869168811999e-06,
"loss": 0.423,
"step": 304
},
{
"epoch": 0.9432989690721649,
"grad_norm": 0.41180922800326647,
"learning_rate": 8.690768204214474e-06,
"loss": 0.4363,
"step": 305
},
{
"epoch": 0.9463917525773196,
"grad_norm": 0.3307854211509245,
"learning_rate": 8.67861955336566e-06,
"loss": 0.413,
"step": 306
},
{
"epoch": 0.9494845360824742,
"grad_norm": 0.3841553041095563,
"learning_rate": 8.666423373231145e-06,
"loss": 0.4125,
"step": 307
},
{
"epoch": 0.9525773195876288,
"grad_norm": 0.3838781748953959,
"learning_rate": 8.65417982139062e-06,
"loss": 0.4439,
"step": 308
},
{
"epoch": 0.9556701030927836,
"grad_norm": 0.31305227186806495,
"learning_rate": 8.641889056035842e-06,
"loss": 0.3905,
"step": 309
},
{
"epoch": 0.9587628865979382,
"grad_norm": 0.3857088956434675,
"learning_rate": 8.629551235968577e-06,
"loss": 0.411,
"step": 310
},
{
"epoch": 0.9618556701030928,
"grad_norm": 0.399404131470791,
"learning_rate": 8.617166520598563e-06,
"loss": 0.4339,
"step": 311
},
{
"epoch": 0.9649484536082474,
"grad_norm": 0.34041988314797883,
"learning_rate": 8.604735069941443e-06,
"loss": 0.4393,
"step": 312
},
{
"epoch": 0.9680412371134021,
"grad_norm": 0.33528564492422497,
"learning_rate": 8.592257044616701e-06,
"loss": 0.4026,
"step": 313
},
{
"epoch": 0.9711340206185567,
"grad_norm": 0.35692516201601204,
"learning_rate": 8.579732605845583e-06,
"loss": 0.4384,
"step": 314
},
{
"epoch": 0.9742268041237113,
"grad_norm": 0.38353193047164685,
"learning_rate": 8.567161915449018e-06,
"loss": 0.4539,
"step": 315
},
{
"epoch": 0.977319587628866,
"grad_norm": 0.45921826745698263,
"learning_rate": 8.554545135845522e-06,
"loss": 0.4103,
"step": 316
},
{
"epoch": 0.9804123711340206,
"grad_norm": 0.39231294331678845,
"learning_rate": 8.541882430049103e-06,
"loss": 0.4193,
"step": 317
},
{
"epoch": 0.9835051546391752,
"grad_norm": 0.3158697601066205,
"learning_rate": 8.529173961667158e-06,
"loss": 0.3764,
"step": 318
},
{
"epoch": 0.98659793814433,
"grad_norm": 0.3617686666769801,
"learning_rate": 8.516419894898356e-06,
"loss": 0.3925,
"step": 319
},
{
"epoch": 0.9896907216494846,
"grad_norm": 0.3494830754623986,
"learning_rate": 8.503620394530507e-06,
"loss": 0.4126,
"step": 320
},
{
"epoch": 0.9927835051546392,
"grad_norm": 0.4042145993787018,
"learning_rate": 8.490775625938452e-06,
"loss": 0.4458,
"step": 321
},
{
"epoch": 0.9958762886597938,
"grad_norm": 0.34415938289188014,
"learning_rate": 8.477885755081913e-06,
"loss": 0.4324,
"step": 322
},
{
"epoch": 0.9989690721649485,
"grad_norm": 0.4559898087472229,
"learning_rate": 8.46495094850335e-06,
"loss": 0.4039,
"step": 323
},
{
"epoch": 1.0,
"grad_norm": 0.4559898087472229,
"learning_rate": 8.451971373325813e-06,
"loss": 0.4044,
"step": 324
},
{
"epoch": 1.0030927835051546,
"grad_norm": 0.6525724020105182,
"learning_rate": 8.43894719725078e-06,
"loss": 0.3491,
"step": 325
},
{
"epoch": 1.0061855670103093,
"grad_norm": 0.3922452835125829,
"learning_rate": 8.42587858855599e-06,
"loss": 0.3665,
"step": 326
},
{
"epoch": 1.0092783505154639,
"grad_norm": 0.3692331200761448,
"learning_rate": 8.412765716093273e-06,
"loss": 0.4013,
"step": 327
},
{
"epoch": 1.0123711340206185,
"grad_norm": 0.32689915021476607,
"learning_rate": 8.39960874928636e-06,
"loss": 0.3473,
"step": 328
},
{
"epoch": 1.0154639175257731,
"grad_norm": 0.35641939466022565,
"learning_rate": 8.386407858128707e-06,
"loss": 0.3484,
"step": 329
},
{
"epoch": 1.0185567010309278,
"grad_norm": 0.4466817808622907,
"learning_rate": 8.373163213181283e-06,
"loss": 0.3214,
"step": 330
},
{
"epoch": 1.0216494845360824,
"grad_norm": 0.3830567903995025,
"learning_rate": 8.359874985570378e-06,
"loss": 0.3644,
"step": 331
},
{
"epoch": 1.024742268041237,
"grad_norm": 0.47720737189360485,
"learning_rate": 8.346543346985388e-06,
"loss": 0.3808,
"step": 332
},
{
"epoch": 1.0278350515463917,
"grad_norm": 0.30458724044567703,
"learning_rate": 8.333168469676595e-06,
"loss": 0.3616,
"step": 333
},
{
"epoch": 1.0309278350515463,
"grad_norm": 0.46053373607867876,
"learning_rate": 8.319750526452945e-06,
"loss": 0.35,
"step": 334
},
{
"epoch": 1.0340206185567011,
"grad_norm": 0.40190768640909413,
"learning_rate": 8.306289690679812e-06,
"loss": 0.3926,
"step": 335
},
{
"epoch": 1.0371134020618558,
"grad_norm": 0.43240340569051033,
"learning_rate": 8.29278613627676e-06,
"loss": 0.2903,
"step": 336
},
{
"epoch": 1.0402061855670104,
"grad_norm": 0.31303678537438284,
"learning_rate": 8.279240037715297e-06,
"loss": 0.3662,
"step": 337
},
{
"epoch": 1.043298969072165,
"grad_norm": 0.36641545734601244,
"learning_rate": 8.265651570016618e-06,
"loss": 0.3553,
"step": 338
},
{
"epoch": 1.0463917525773196,
"grad_norm": 0.4841123603738825,
"learning_rate": 8.252020908749338e-06,
"loss": 0.332,
"step": 339
},
{
"epoch": 1.0494845360824743,
"grad_norm": 0.2956997076869852,
"learning_rate": 8.238348230027245e-06,
"loss": 0.3379,
"step": 340
},
{
"epoch": 1.052577319587629,
"grad_norm": 0.38931134059191863,
"learning_rate": 8.224633710506997e-06,
"loss": 0.3648,
"step": 341
},
{
"epoch": 1.0556701030927835,
"grad_norm": 0.3611149951062155,
"learning_rate": 8.210877527385859e-06,
"loss": 0.328,
"step": 342
},
{
"epoch": 1.0587628865979382,
"grad_norm": 0.4546040068388239,
"learning_rate": 8.197079858399403e-06,
"loss": 0.3335,
"step": 343
},
{
"epoch": 1.0618556701030928,
"grad_norm": 0.3993100472987632,
"learning_rate": 8.18324088181922e-06,
"loss": 0.3621,
"step": 344
},
{
"epoch": 1.0649484536082474,
"grad_norm": 0.3853715287732158,
"learning_rate": 8.169360776450606e-06,
"loss": 0.3497,
"step": 345
},
{
"epoch": 1.068041237113402,
"grad_norm": 0.4538572958536918,
"learning_rate": 8.155439721630265e-06,
"loss": 0.4041,
"step": 346
},
{
"epoch": 1.0711340206185567,
"grad_norm": 0.3819616971274627,
"learning_rate": 8.14147789722398e-06,
"loss": 0.3125,
"step": 347
},
{
"epoch": 1.0742268041237113,
"grad_norm": 0.404047744842174,
"learning_rate": 8.127475483624296e-06,
"loss": 0.376,
"step": 348
},
{
"epoch": 1.077319587628866,
"grad_norm": 0.6412664221693233,
"learning_rate": 8.113432661748187e-06,
"loss": 0.3368,
"step": 349
},
{
"epoch": 1.0804123711340206,
"grad_norm": 0.4079355731584008,
"learning_rate": 8.099349613034715e-06,
"loss": 0.3299,
"step": 350
},
{
"epoch": 1.0835051546391752,
"grad_norm": 0.34948984869697675,
"learning_rate": 8.085226519442697e-06,
"loss": 0.3166,
"step": 351
},
{
"epoch": 1.0865979381443298,
"grad_norm": 0.3702256660226774,
"learning_rate": 8.071063563448341e-06,
"loss": 0.3456,
"step": 352
},
{
"epoch": 1.0896907216494844,
"grad_norm": 0.3591573870798163,
"learning_rate": 8.056860928042892e-06,
"loss": 0.3067,
"step": 353
},
{
"epoch": 1.0927835051546393,
"grad_norm": 0.4211722962881093,
"learning_rate": 8.042618796730272e-06,
"loss": 0.357,
"step": 354
},
{
"epoch": 1.095876288659794,
"grad_norm": 0.2979530511506807,
"learning_rate": 8.028337353524712e-06,
"loss": 0.3434,
"step": 355
},
{
"epoch": 1.0989690721649485,
"grad_norm": 0.7222053664711489,
"learning_rate": 8.014016782948358e-06,
"loss": 0.3513,
"step": 356
},
{
"epoch": 1.1020618556701032,
"grad_norm": 0.4726448804328554,
"learning_rate": 7.999657270028904e-06,
"loss": 0.3424,
"step": 357
},
{
"epoch": 1.1051546391752578,
"grad_norm": 0.384744888765029,
"learning_rate": 7.985259000297196e-06,
"loss": 0.347,
"step": 358
},
{
"epoch": 1.1082474226804124,
"grad_norm": 0.4214120836808585,
"learning_rate": 7.970822159784832e-06,
"loss": 0.3527,
"step": 359
},
{
"epoch": 1.111340206185567,
"grad_norm": 0.4313237210833637,
"learning_rate": 7.956346935021762e-06,
"loss": 0.3338,
"step": 360
},
{
"epoch": 1.1144329896907217,
"grad_norm": 0.41043298999057304,
"learning_rate": 7.941833513033873e-06,
"loss": 0.3257,
"step": 361
},
{
"epoch": 1.1175257731958763,
"grad_norm": 0.45398400909999115,
"learning_rate": 7.92728208134058e-06,
"loss": 0.3533,
"step": 362
},
{
"epoch": 1.120618556701031,
"grad_norm": 0.4121091535911782,
"learning_rate": 7.912692827952395e-06,
"loss": 0.3478,
"step": 363
},
{
"epoch": 1.1237113402061856,
"grad_norm": 0.3799154594744056,
"learning_rate": 7.898065941368507e-06,
"loss": 0.3679,
"step": 364
},
{
"epoch": 1.1268041237113402,
"grad_norm": 0.3231398982291365,
"learning_rate": 7.883401610574338e-06,
"loss": 0.3267,
"step": 365
},
{
"epoch": 1.1298969072164948,
"grad_norm": 0.37731112008573586,
"learning_rate": 7.868700025039102e-06,
"loss": 0.3444,
"step": 366
},
{
"epoch": 1.1329896907216495,
"grad_norm": 0.2979459870840287,
"learning_rate": 7.853961374713367e-06,
"loss": 0.3876,
"step": 367
},
{
"epoch": 1.136082474226804,
"grad_norm": 0.360771137481857,
"learning_rate": 7.839185850026592e-06,
"loss": 0.3663,
"step": 368
},
{
"epoch": 1.1391752577319587,
"grad_norm": 0.3907315619368398,
"learning_rate": 7.82437364188466e-06,
"loss": 0.3472,
"step": 369
},
{
"epoch": 1.1422680412371133,
"grad_norm": 0.3514296826261096,
"learning_rate": 7.809524941667426e-06,
"loss": 0.3583,
"step": 370
},
{
"epoch": 1.145360824742268,
"grad_norm": 0.4471849446328275,
"learning_rate": 7.794639941226238e-06,
"loss": 0.335,
"step": 371
},
{
"epoch": 1.1484536082474226,
"grad_norm": 0.3432232339200618,
"learning_rate": 7.779718832881456e-06,
"loss": 0.3139,
"step": 372
},
{
"epoch": 1.1515463917525772,
"grad_norm": 0.34717136955485867,
"learning_rate": 7.764761809419969e-06,
"loss": 0.348,
"step": 373
},
{
"epoch": 1.1546391752577319,
"grad_norm": 0.5078372437404328,
"learning_rate": 7.749769064092706e-06,
"loss": 0.3642,
"step": 374
},
{
"epoch": 1.1577319587628865,
"grad_norm": 0.4613570762769725,
"learning_rate": 7.734740790612137e-06,
"loss": 0.3346,
"step": 375
},
{
"epoch": 1.1608247422680413,
"grad_norm": 0.36959768373709084,
"learning_rate": 7.719677183149764e-06,
"loss": 0.3575,
"step": 376
},
{
"epoch": 1.163917525773196,
"grad_norm": 0.4779248378014372,
"learning_rate": 7.70457843633363e-06,
"loss": 0.3423,
"step": 377
},
{
"epoch": 1.1670103092783506,
"grad_norm": 0.32643441854444594,
"learning_rate": 7.689444745245782e-06,
"loss": 0.378,
"step": 378
},
{
"epoch": 1.1701030927835052,
"grad_norm": 0.3089704414768058,
"learning_rate": 7.67427630541977e-06,
"loss": 0.331,
"step": 379
},
{
"epoch": 1.1731958762886598,
"grad_norm": 0.3749819889676696,
"learning_rate": 7.65907331283811e-06,
"loss": 0.3184,
"step": 380
},
{
"epoch": 1.1762886597938145,
"grad_norm": 0.4677147598049641,
"learning_rate": 7.643835963929747e-06,
"loss": 0.3818,
"step": 381
},
{
"epoch": 1.179381443298969,
"grad_norm": 0.4828388717102419,
"learning_rate": 7.6285644555675345e-06,
"loss": 0.3484,
"step": 382
},
{
"epoch": 1.1824742268041237,
"grad_norm": 0.26942212683700595,
"learning_rate": 7.613258985065672e-06,
"loss": 0.3623,
"step": 383
},
{
"epoch": 1.1855670103092784,
"grad_norm": 0.4157868728729967,
"learning_rate": 7.597919750177168e-06,
"loss": 0.3401,
"step": 384
},
{
"epoch": 1.188659793814433,
"grad_norm": 0.42605063182184566,
"learning_rate": 7.58254694909128e-06,
"loss": 0.3159,
"step": 385
},
{
"epoch": 1.1917525773195876,
"grad_norm": 0.34314819154922166,
"learning_rate": 7.567140780430956e-06,
"loss": 0.3419,
"step": 386
},
{
"epoch": 1.1948453608247422,
"grad_norm": 0.3289614756235703,
"learning_rate": 7.551701443250263e-06,
"loss": 0.3211,
"step": 387
},
{
"epoch": 1.1979381443298969,
"grad_norm": 0.4057519424982471,
"learning_rate": 7.536229137031822e-06,
"loss": 0.3368,
"step": 388
},
{
"epoch": 1.2010309278350515,
"grad_norm": 0.3744695844688546,
"learning_rate": 7.520724061684227e-06,
"loss": 0.3625,
"step": 389
},
{
"epoch": 1.2041237113402061,
"grad_norm": 0.3504025419656684,
"learning_rate": 7.505186417539465e-06,
"loss": 0.3437,
"step": 390
},
{
"epoch": 1.2072164948453608,
"grad_norm": 0.4279051952223975,
"learning_rate": 7.489616405350319e-06,
"loss": 0.3286,
"step": 391
},
{
"epoch": 1.2103092783505154,
"grad_norm": 0.43577738791346377,
"learning_rate": 7.474014226287786e-06,
"loss": 0.3519,
"step": 392
},
{
"epoch": 1.21340206185567,
"grad_norm": 0.29125474177173316,
"learning_rate": 7.45838008193847e-06,
"loss": 0.3096,
"step": 393
},
{
"epoch": 1.2164948453608249,
"grad_norm": 0.4472479854562308,
"learning_rate": 7.442714174301984e-06,
"loss": 0.3563,
"step": 394
},
{
"epoch": 1.2195876288659795,
"grad_norm": 0.3249198760178035,
"learning_rate": 7.4270167057883295e-06,
"loss": 0.3825,
"step": 395
},
{
"epoch": 1.2226804123711341,
"grad_norm": 0.41662397510911336,
"learning_rate": 7.411287879215291e-06,
"loss": 0.3413,
"step": 396
},
{
"epoch": 1.2257731958762887,
"grad_norm": 0.3850817892627363,
"learning_rate": 7.395527897805812e-06,
"loss": 0.3426,
"step": 397
},
{
"epoch": 1.2288659793814434,
"grad_norm": 0.41909976516465813,
"learning_rate": 7.379736965185369e-06,
"loss": 0.2918,
"step": 398
},
{
"epoch": 1.231958762886598,
"grad_norm": 0.35938630643481323,
"learning_rate": 7.36391528537934e-06,
"loss": 0.3251,
"step": 399
},
{
"epoch": 1.2350515463917526,
"grad_norm": 0.351044135365865,
"learning_rate": 7.348063062810369e-06,
"loss": 0.3227,
"step": 400
},
{
"epoch": 1.2381443298969073,
"grad_norm": 0.43751484222875453,
"learning_rate": 7.332180502295729e-06,
"loss": 0.3734,
"step": 401
},
{
"epoch": 1.2412371134020619,
"grad_norm": 0.36948439372703645,
"learning_rate": 7.316267809044667e-06,
"loss": 0.3195,
"step": 402
},
{
"epoch": 1.2443298969072165,
"grad_norm": 0.29471956115359044,
"learning_rate": 7.300325188655762e-06,
"loss": 0.3466,
"step": 403
},
{
"epoch": 1.2474226804123711,
"grad_norm": 0.5091978979592966,
"learning_rate": 7.284352847114259e-06,
"loss": 0.3743,
"step": 404
},
{
"epoch": 1.2505154639175258,
"grad_norm": 0.3628727698515663,
"learning_rate": 7.268350990789415e-06,
"loss": 0.3523,
"step": 405
},
{
"epoch": 1.2536082474226804,
"grad_norm": 0.3878916069312507,
"learning_rate": 7.252319826431833e-06,
"loss": 0.3589,
"step": 406
},
{
"epoch": 1.256701030927835,
"grad_norm": 0.36280618889349325,
"learning_rate": 7.236259561170783e-06,
"loss": 0.3658,
"step": 407
},
{
"epoch": 1.2597938144329897,
"grad_norm": 0.6373467770204526,
"learning_rate": 7.220170402511534e-06,
"loss": 0.3547,
"step": 408
},
{
"epoch": 1.2628865979381443,
"grad_norm": 0.33274294232763424,
"learning_rate": 7.204052558332668e-06,
"loss": 0.3103,
"step": 409
},
{
"epoch": 1.265979381443299,
"grad_norm": 0.3735084312326953,
"learning_rate": 7.187906236883395e-06,
"loss": 0.3776,
"step": 410
},
{
"epoch": 1.2690721649484535,
"grad_norm": 0.346815947501004,
"learning_rate": 7.171731646780867e-06,
"loss": 0.3539,
"step": 411
},
{
"epoch": 1.2721649484536082,
"grad_norm": 0.35962803496131096,
"learning_rate": 7.155528997007476e-06,
"loss": 0.3321,
"step": 412
},
{
"epoch": 1.2752577319587628,
"grad_norm": 0.310966882010396,
"learning_rate": 7.139298496908155e-06,
"loss": 0.3241,
"step": 413
},
{
"epoch": 1.2783505154639174,
"grad_norm": 0.39105942243010555,
"learning_rate": 7.123040356187676e-06,
"loss": 0.3472,
"step": 414
},
{
"epoch": 1.281443298969072,
"grad_norm": 0.33753426953195,
"learning_rate": 7.106754784907942e-06,
"loss": 0.2907,
"step": 415
},
{
"epoch": 1.2845360824742267,
"grad_norm": 0.40088355762514305,
"learning_rate": 7.090441993485268e-06,
"loss": 0.3508,
"step": 416
},
{
"epoch": 1.2876288659793813,
"grad_norm": 0.40064565306290706,
"learning_rate": 7.07410219268766e-06,
"loss": 0.3348,
"step": 417
},
{
"epoch": 1.2907216494845362,
"grad_norm": 0.5469772825212642,
"learning_rate": 7.057735593632106e-06,
"loss": 0.3911,
"step": 418
},
{
"epoch": 1.2938144329896908,
"grad_norm": 0.3938454332181955,
"learning_rate": 7.04134240778183e-06,
"loss": 0.3275,
"step": 419
},
{
"epoch": 1.2969072164948454,
"grad_norm": 0.3841074843900627,
"learning_rate": 7.024922846943573e-06,
"loss": 0.3324,
"step": 420
},
{
"epoch": 1.3,
"grad_norm": 0.3589284518865989,
"learning_rate": 7.008477123264849e-06,
"loss": 0.3225,
"step": 421
},
{
"epoch": 1.3030927835051547,
"grad_norm": 0.3059033829909143,
"learning_rate": 6.9920054492312086e-06,
"loss": 0.359,
"step": 422
},
{
"epoch": 1.3061855670103093,
"grad_norm": 0.3271771652734719,
"learning_rate": 6.97550803766349e-06,
"loss": 0.3395,
"step": 423
},
{
"epoch": 1.309278350515464,
"grad_norm": 0.40806409758051815,
"learning_rate": 6.958985101715077e-06,
"loss": 0.3615,
"step": 424
},
{
"epoch": 1.3123711340206186,
"grad_norm": 0.4066403373834957,
"learning_rate": 6.942436854869129e-06,
"loss": 0.3689,
"step": 425
},
{
"epoch": 1.3154639175257732,
"grad_norm": 0.370416386524105,
"learning_rate": 6.925863510935839e-06,
"loss": 0.3433,
"step": 426
},
{
"epoch": 1.3185567010309278,
"grad_norm": 0.4418974179010107,
"learning_rate": 6.909265284049664e-06,
"loss": 0.3827,
"step": 427
},
{
"epoch": 1.3216494845360824,
"grad_norm": 0.4227883285206932,
"learning_rate": 6.89264238866656e-06,
"loss": 0.3627,
"step": 428
},
{
"epoch": 1.324742268041237,
"grad_norm": 0.31167092762445603,
"learning_rate": 6.875995039561206e-06,
"loss": 0.359,
"step": 429
},
{
"epoch": 1.3278350515463917,
"grad_norm": 0.3480875177965604,
"learning_rate": 6.859323451824238e-06,
"loss": 0.3444,
"step": 430
},
{
"epoch": 1.3309278350515463,
"grad_norm": 0.33854262777758315,
"learning_rate": 6.842627840859461e-06,
"loss": 0.3664,
"step": 431
},
{
"epoch": 1.334020618556701,
"grad_norm": 0.34767527727955205,
"learning_rate": 6.825908422381074e-06,
"loss": 0.3516,
"step": 432
},
{
"epoch": 1.3371134020618558,
"grad_norm": 0.5865810671246062,
"learning_rate": 6.8091654124108765e-06,
"loss": 0.4047,
"step": 433
},
{
"epoch": 1.3402061855670104,
"grad_norm": 0.45609449114653355,
"learning_rate": 6.792399027275482e-06,
"loss": 0.3319,
"step": 434
},
{
"epoch": 1.343298969072165,
"grad_norm": 0.4400731902548814,
"learning_rate": 6.775609483603516e-06,
"loss": 0.3652,
"step": 435
},
{
"epoch": 1.3463917525773197,
"grad_norm": 0.38007555373261076,
"learning_rate": 6.758796998322825e-06,
"loss": 0.3685,
"step": 436
},
{
"epoch": 1.3494845360824743,
"grad_norm": 0.35178301360682257,
"learning_rate": 6.7419617886576735e-06,
"loss": 0.3555,
"step": 437
},
{
"epoch": 1.352577319587629,
"grad_norm": 0.43610235707265677,
"learning_rate": 6.725104072125931e-06,
"loss": 0.3481,
"step": 438
},
{
"epoch": 1.3556701030927836,
"grad_norm": 0.34661801350291904,
"learning_rate": 6.708224066536263e-06,
"loss": 0.3837,
"step": 439
},
{
"epoch": 1.3587628865979382,
"grad_norm": 0.3989161601450613,
"learning_rate": 6.6913219899853245e-06,
"loss": 0.329,
"step": 440
},
{
"epoch": 1.3618556701030928,
"grad_norm": 0.3744534100982985,
"learning_rate": 6.674398060854931e-06,
"loss": 0.3634,
"step": 441
},
{
"epoch": 1.3649484536082475,
"grad_norm": 0.33702451565402913,
"learning_rate": 6.657452497809247e-06,
"loss": 0.349,
"step": 442
},
{
"epoch": 1.368041237113402,
"grad_norm": 0.5591464114695802,
"learning_rate": 6.640485519791953e-06,
"loss": 0.3789,
"step": 443
},
{
"epoch": 1.3711340206185567,
"grad_norm": 0.3252398001484424,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.3184,
"step": 444
},
{
"epoch": 1.3742268041237113,
"grad_norm": 0.5256539082109882,
"learning_rate": 6.606488195997876e-06,
"loss": 0.3707,
"step": 445
},
{
"epoch": 1.377319587628866,
"grad_norm": 0.41502723892825255,
"learning_rate": 6.589458289480575e-06,
"loss": 0.3424,
"step": 446
},
{
"epoch": 1.3804123711340206,
"grad_norm": 0.34496860905236404,
"learning_rate": 6.57240784650495e-06,
"loss": 0.3553,
"step": 447
},
{
"epoch": 1.3835051546391752,
"grad_norm": 0.3677644556343446,
"learning_rate": 6.555337087369775e-06,
"loss": 0.3133,
"step": 448
},
{
"epoch": 1.3865979381443299,
"grad_norm": 0.43771172700586186,
"learning_rate": 6.538246232636316e-06,
"loss": 0.3844,
"step": 449
},
{
"epoch": 1.3896907216494845,
"grad_norm": 0.3438481741906832,
"learning_rate": 6.521135503125483e-06,
"loss": 0.3283,
"step": 450
},
{
"epoch": 1.3927835051546391,
"grad_norm": 0.625983954705621,
"learning_rate": 6.5040051199149755e-06,
"loss": 0.397,
"step": 451
},
{
"epoch": 1.3958762886597937,
"grad_norm": 0.40915327598292334,
"learning_rate": 6.48685530433643e-06,
"loss": 0.3807,
"step": 452
},
{
"epoch": 1.3989690721649484,
"grad_norm": 0.30288182205857417,
"learning_rate": 6.469686277972556e-06,
"loss": 0.3476,
"step": 453
},
{
"epoch": 1.402061855670103,
"grad_norm": 0.4292888222844284,
"learning_rate": 6.452498262654267e-06,
"loss": 0.3573,
"step": 454
},
{
"epoch": 1.4051546391752576,
"grad_norm": 0.3468873094824028,
"learning_rate": 6.4352914804578345e-06,
"loss": 0.3475,
"step": 455
},
{
"epoch": 1.4082474226804123,
"grad_norm": 0.3611550096422234,
"learning_rate": 6.418066153701997e-06,
"loss": 0.3448,
"step": 456
},
{
"epoch": 1.4113402061855669,
"grad_norm": 0.38685613343922054,
"learning_rate": 6.4008225049450974e-06,
"loss": 0.3516,
"step": 457
},
{
"epoch": 1.4144329896907217,
"grad_norm": 0.3151727833355366,
"learning_rate": 6.38356075698221e-06,
"loss": 0.3493,
"step": 458
},
{
"epoch": 1.4175257731958764,
"grad_norm": 0.3408245800320407,
"learning_rate": 6.366281132842256e-06,
"loss": 0.3642,
"step": 459
},
{
"epoch": 1.420618556701031,
"grad_norm": 0.35947440187809293,
"learning_rate": 6.348983855785122e-06,
"loss": 0.3491,
"step": 460
},
{
"epoch": 1.4237113402061856,
"grad_norm": 0.3697513099146176,
"learning_rate": 6.331669149298781e-06,
"loss": 0.3462,
"step": 461
},
{
"epoch": 1.4268041237113402,
"grad_norm": 0.3440627883456723,
"learning_rate": 6.314337237096401e-06,
"loss": 0.3294,
"step": 462
},
{
"epoch": 1.4298969072164949,
"grad_norm": 0.3424656075586527,
"learning_rate": 6.296988343113453e-06,
"loss": 0.3441,
"step": 463
},
{
"epoch": 1.4329896907216495,
"grad_norm": 0.3410954960548553,
"learning_rate": 6.279622691504821e-06,
"loss": 0.3367,
"step": 464
},
{
"epoch": 1.4360824742268041,
"grad_norm": 0.42875886013339126,
"learning_rate": 6.2622405066419046e-06,
"loss": 0.3659,
"step": 465
},
{
"epoch": 1.4391752577319588,
"grad_norm": 0.34138282265214576,
"learning_rate": 6.24484201310972e-06,
"loss": 0.3187,
"step": 466
},
{
"epoch": 1.4422680412371134,
"grad_norm": 0.3498674361424939,
"learning_rate": 6.227427435703997e-06,
"loss": 0.3249,
"step": 467
},
{
"epoch": 1.445360824742268,
"grad_norm": 0.40405356495212513,
"learning_rate": 6.2099969994282764e-06,
"loss": 0.3288,
"step": 468
},
{
"epoch": 1.4484536082474226,
"grad_norm": 0.3596781700650695,
"learning_rate": 6.192550929491002e-06,
"loss": 0.3578,
"step": 469
},
{
"epoch": 1.4515463917525773,
"grad_norm": 0.3510031934171229,
"learning_rate": 6.175089451302614e-06,
"loss": 0.3546,
"step": 470
},
{
"epoch": 1.454639175257732,
"grad_norm": 0.4142516280132979,
"learning_rate": 6.157612790472626e-06,
"loss": 0.356,
"step": 471
},
{
"epoch": 1.4577319587628865,
"grad_norm": 0.42780646762862284,
"learning_rate": 6.140121172806725e-06,
"loss": 0.3447,
"step": 472
},
{
"epoch": 1.4608247422680412,
"grad_norm": 0.370184077167348,
"learning_rate": 6.122614824303845e-06,
"loss": 0.3162,
"step": 473
},
{
"epoch": 1.463917525773196,
"grad_norm": 0.3817693239463498,
"learning_rate": 6.105093971153246e-06,
"loss": 0.3536,
"step": 474
},
{
"epoch": 1.4670103092783506,
"grad_norm": 0.3917768728174191,
"learning_rate": 6.087558839731594e-06,
"loss": 0.3341,
"step": 475
},
{
"epoch": 1.4701030927835053,
"grad_norm": 0.3428239655666794,
"learning_rate": 6.070009656600039e-06,
"loss": 0.3422,
"step": 476
},
{
"epoch": 1.47319587628866,
"grad_norm": 0.3973161497874271,
"learning_rate": 6.052446648501283e-06,
"loss": 0.3429,
"step": 477
},
{
"epoch": 1.4762886597938145,
"grad_norm": 0.4333165450189888,
"learning_rate": 6.034870042356653e-06,
"loss": 0.3703,
"step": 478
},
{
"epoch": 1.4793814432989691,
"grad_norm": 0.323605469691123,
"learning_rate": 6.0172800652631706e-06,
"loss": 0.3426,
"step": 479
},
{
"epoch": 1.4824742268041238,
"grad_norm": 0.3929125656201943,
"learning_rate": 5.999676944490609e-06,
"loss": 0.3324,
"step": 480
},
{
"epoch": 1.4855670103092784,
"grad_norm": 0.3587189281991536,
"learning_rate": 5.982060907478568e-06,
"loss": 0.3608,
"step": 481
},
{
"epoch": 1.488659793814433,
"grad_norm": 0.4074306888968019,
"learning_rate": 5.964432181833532e-06,
"loss": 0.3386,
"step": 482
},
{
"epoch": 1.4917525773195877,
"grad_norm": 0.4119418936472174,
"learning_rate": 5.946790995325924e-06,
"loss": 0.3351,
"step": 483
},
{
"epoch": 1.4948453608247423,
"grad_norm": 0.5463610190099886,
"learning_rate": 5.929137575887167e-06,
"loss": 0.3486,
"step": 484
},
{
"epoch": 1.497938144329897,
"grad_norm": 0.39394480450005404,
"learning_rate": 5.911472151606743e-06,
"loss": 0.3647,
"step": 485
},
{
"epoch": 1.5010309278350515,
"grad_norm": 0.32096578094355016,
"learning_rate": 5.893794950729237e-06,
"loss": 0.3321,
"step": 486
},
{
"epoch": 1.5041237113402062,
"grad_norm": 0.4262076144835789,
"learning_rate": 5.876106201651392e-06,
"loss": 0.3445,
"step": 487
},
{
"epoch": 1.5072164948453608,
"grad_norm": 0.40275399201166046,
"learning_rate": 5.858406132919162e-06,
"loss": 0.328,
"step": 488
},
{
"epoch": 1.5103092783505154,
"grad_norm": 0.31623557083038034,
"learning_rate": 5.840694973224752e-06,
"loss": 0.3488,
"step": 489
},
{
"epoch": 1.51340206185567,
"grad_norm": 0.41350708263181063,
"learning_rate": 5.82297295140367e-06,
"loss": 0.3198,
"step": 490
},
{
"epoch": 1.5164948453608247,
"grad_norm": 0.32875741521436547,
"learning_rate": 5.805240296431765e-06,
"loss": 0.3453,
"step": 491
},
{
"epoch": 1.5195876288659793,
"grad_norm": 0.4161024892078206,
"learning_rate": 5.787497237422272e-06,
"loss": 0.3537,
"step": 492
},
{
"epoch": 1.522680412371134,
"grad_norm": 0.3350236926596831,
"learning_rate": 5.769744003622852e-06,
"loss": 0.3524,
"step": 493
},
{
"epoch": 1.5257731958762886,
"grad_norm": 0.3718584309646912,
"learning_rate": 5.751980824412622e-06,
"loss": 0.3415,
"step": 494
},
{
"epoch": 1.5288659793814432,
"grad_norm": 0.33405973084354273,
"learning_rate": 5.734207929299206e-06,
"loss": 0.3785,
"step": 495
},
{
"epoch": 1.5319587628865978,
"grad_norm": 0.3491907461943334,
"learning_rate": 5.716425547915756e-06,
"loss": 0.3566,
"step": 496
},
{
"epoch": 1.5350515463917525,
"grad_norm": 0.34785897166443974,
"learning_rate": 5.698633910017993e-06,
"loss": 0.3465,
"step": 497
},
{
"epoch": 1.538144329896907,
"grad_norm": 0.40927118637399906,
"learning_rate": 5.680833245481234e-06,
"loss": 0.3254,
"step": 498
},
{
"epoch": 1.5412371134020617,
"grad_norm": 0.364191447666914,
"learning_rate": 5.663023784297426e-06,
"loss": 0.3581,
"step": 499
},
{
"epoch": 1.5443298969072163,
"grad_norm": 0.49907269025129153,
"learning_rate": 5.6452057565721715e-06,
"loss": 0.3772,
"step": 500
},
{
"epoch": 1.5474226804123712,
"grad_norm": 0.35782849389183596,
"learning_rate": 5.627379392521758e-06,
"loss": 0.3206,
"step": 501
},
{
"epoch": 1.5505154639175258,
"grad_norm": 0.38887433093763457,
"learning_rate": 5.609544922470178e-06,
"loss": 0.2835,
"step": 502
},
{
"epoch": 1.5536082474226804,
"grad_norm": 0.35326584373882364,
"learning_rate": 5.59170257684616e-06,
"loss": 0.3389,
"step": 503
},
{
"epoch": 1.556701030927835,
"grad_norm": 0.40349932678306516,
"learning_rate": 5.573852586180185e-06,
"loss": 0.3889,
"step": 504
},
{
"epoch": 1.5597938144329897,
"grad_norm": 0.3765765021817677,
"learning_rate": 5.555995181101517e-06,
"loss": 0.3528,
"step": 505
},
{
"epoch": 1.5628865979381443,
"grad_norm": 0.3463176384496649,
"learning_rate": 5.53813059233521e-06,
"loss": 0.3459,
"step": 506
},
{
"epoch": 1.565979381443299,
"grad_norm": 0.394296667709075,
"learning_rate": 5.520259050699138e-06,
"loss": 0.3811,
"step": 507
},
{
"epoch": 1.5690721649484536,
"grad_norm": 0.4384971668510495,
"learning_rate": 5.50238078710101e-06,
"loss": 0.2956,
"step": 508
},
{
"epoch": 1.5721649484536082,
"grad_norm": 0.2759163025819777,
"learning_rate": 5.484496032535385e-06,
"loss": 0.3542,
"step": 509
},
{
"epoch": 1.5752577319587628,
"grad_norm": 0.39946717876701127,
"learning_rate": 5.466605018080684e-06,
"loss": 0.3553,
"step": 510
},
{
"epoch": 1.5783505154639177,
"grad_norm": 0.3168571537880368,
"learning_rate": 5.448707974896214e-06,
"loss": 0.3816,
"step": 511
},
{
"epoch": 1.5814432989690723,
"grad_norm": 0.35776094699034044,
"learning_rate": 5.430805134219171e-06,
"loss": 0.3374,
"step": 512
},
{
"epoch": 1.584536082474227,
"grad_norm": 0.3372746052323508,
"learning_rate": 5.412896727361663e-06,
"loss": 0.3289,
"step": 513
},
{
"epoch": 1.5876288659793816,
"grad_norm": 0.3509930509793478,
"learning_rate": 5.3949829857077075e-06,
"loss": 0.3443,
"step": 514
},
{
"epoch": 1.5907216494845362,
"grad_norm": 0.4449590962316539,
"learning_rate": 5.3770641407102554e-06,
"loss": 0.3418,
"step": 515
},
{
"epoch": 1.5938144329896908,
"grad_norm": 0.3218980327532609,
"learning_rate": 5.3591404238881935e-06,
"loss": 0.3143,
"step": 516
},
{
"epoch": 1.5969072164948455,
"grad_norm": 0.43752839784545067,
"learning_rate": 5.341212066823356e-06,
"loss": 0.3479,
"step": 517
},
{
"epoch": 1.6,
"grad_norm": 0.43005166973733555,
"learning_rate": 5.323279301157526e-06,
"loss": 0.3679,
"step": 518
},
{
"epoch": 1.6030927835051547,
"grad_norm": 0.3840246824337281,
"learning_rate": 5.305342358589452e-06,
"loss": 0.3277,
"step": 519
},
{
"epoch": 1.6061855670103093,
"grad_norm": 0.6948499227181512,
"learning_rate": 5.287401470871851e-06,
"loss": 0.4366,
"step": 520
},
{
"epoch": 1.609278350515464,
"grad_norm": 0.4752253370301213,
"learning_rate": 5.2694568698084085e-06,
"loss": 0.3607,
"step": 521
},
{
"epoch": 1.6123711340206186,
"grad_norm": 0.34613731373360773,
"learning_rate": 5.25150878725079e-06,
"loss": 0.3557,
"step": 522
},
{
"epoch": 1.6154639175257732,
"grad_norm": 0.29453002565946584,
"learning_rate": 5.233557455095645e-06,
"loss": 0.3663,
"step": 523
},
{
"epoch": 1.6185567010309279,
"grad_norm": 0.5096116989863569,
"learning_rate": 5.215603105281606e-06,
"loss": 0.4037,
"step": 524
},
{
"epoch": 1.6216494845360825,
"grad_norm": 0.4228973578656383,
"learning_rate": 5.197645969786297e-06,
"loss": 0.3569,
"step": 525
},
{
"epoch": 1.6247422680412371,
"grad_norm": 0.40898122279760163,
"learning_rate": 5.179686280623334e-06,
"loss": 0.3361,
"step": 526
},
{
"epoch": 1.6278350515463917,
"grad_norm": 0.3239619283576348,
"learning_rate": 5.1617242698393265e-06,
"loss": 0.3408,
"step": 527
},
{
"epoch": 1.6309278350515464,
"grad_norm": 0.32898745469773205,
"learning_rate": 5.143760169510882e-06,
"loss": 0.3341,
"step": 528
},
{
"epoch": 1.634020618556701,
"grad_norm": 0.3635754593359022,
"learning_rate": 5.125794211741602e-06,
"loss": 0.3524,
"step": 529
},
{
"epoch": 1.6371134020618556,
"grad_norm": 0.3818846922647241,
"learning_rate": 5.107826628659095e-06,
"loss": 0.3659,
"step": 530
},
{
"epoch": 1.6402061855670103,
"grad_norm": 0.4996575536438249,
"learning_rate": 5.089857652411961e-06,
"loss": 0.3966,
"step": 531
},
{
"epoch": 1.6432989690721649,
"grad_norm": 0.3917831825064119,
"learning_rate": 5.0718875151668005e-06,
"loss": 0.3868,
"step": 532
},
{
"epoch": 1.6463917525773195,
"grad_norm": 0.4033639088959359,
"learning_rate": 5.053916449105219e-06,
"loss": 0.3408,
"step": 533
},
{
"epoch": 1.6494845360824741,
"grad_norm": 0.5601375825737572,
"learning_rate": 5.035944686420823e-06,
"loss": 0.366,
"step": 534
},
{
"epoch": 1.6525773195876288,
"grad_norm": 0.3915181598012574,
"learning_rate": 5.0179724593162146e-06,
"loss": 0.3312,
"step": 535
},
{
"epoch": 1.6556701030927834,
"grad_norm": 0.4643735140339054,
"learning_rate": 5e-06,
"loss": 0.3672,
"step": 536
},
{
"epoch": 1.658762886597938,
"grad_norm": 0.4120202123265614,
"learning_rate": 4.982027540683785e-06,
"loss": 0.3972,
"step": 537
},
{
"epoch": 1.6618556701030927,
"grad_norm": 0.3417335396795078,
"learning_rate": 4.964055313579179e-06,
"loss": 0.3024,
"step": 538
},
{
"epoch": 1.6649484536082473,
"grad_norm": 0.35012876477382987,
"learning_rate": 4.946083550894782e-06,
"loss": 0.333,
"step": 539
},
{
"epoch": 1.668041237113402,
"grad_norm": 0.3636966873267465,
"learning_rate": 4.928112484833201e-06,
"loss": 0.3441,
"step": 540
},
{
"epoch": 1.6711340206185565,
"grad_norm": 0.3111643102986036,
"learning_rate": 4.910142347588041e-06,
"loss": 0.3188,
"step": 541
},
{
"epoch": 1.6742268041237114,
"grad_norm": 0.40428582352299114,
"learning_rate": 4.892173371340907e-06,
"loss": 0.3484,
"step": 542
},
{
"epoch": 1.677319587628866,
"grad_norm": 0.3672336238706189,
"learning_rate": 4.874205788258397e-06,
"loss": 0.3441,
"step": 543
},
{
"epoch": 1.6804123711340206,
"grad_norm": 0.40811813907806466,
"learning_rate": 4.856239830489121e-06,
"loss": 0.3425,
"step": 544
},
{
"epoch": 1.6835051546391753,
"grad_norm": 0.3656893919324694,
"learning_rate": 4.838275730160675e-06,
"loss": 0.3323,
"step": 545
},
{
"epoch": 1.68659793814433,
"grad_norm": 0.37306700029813783,
"learning_rate": 4.8203137193766685e-06,
"loss": 0.3366,
"step": 546
},
{
"epoch": 1.6896907216494845,
"grad_norm": 0.3487607040775841,
"learning_rate": 4.802354030213704e-06,
"loss": 0.3481,
"step": 547
},
{
"epoch": 1.6927835051546392,
"grad_norm": 0.43124099792571774,
"learning_rate": 4.784396894718397e-06,
"loss": 0.3563,
"step": 548
},
{
"epoch": 1.6958762886597938,
"grad_norm": 0.4317586697798039,
"learning_rate": 4.766442544904357e-06,
"loss": 0.3116,
"step": 549
},
{
"epoch": 1.6989690721649484,
"grad_norm": 0.3554072453278775,
"learning_rate": 4.748491212749212e-06,
"loss": 0.3717,
"step": 550
},
{
"epoch": 1.702061855670103,
"grad_norm": 0.38642313014520263,
"learning_rate": 4.730543130191594e-06,
"loss": 0.3133,
"step": 551
},
{
"epoch": 1.705154639175258,
"grad_norm": 0.35718698367747603,
"learning_rate": 4.71259852912815e-06,
"loss": 0.3353,
"step": 552
},
{
"epoch": 1.7082474226804125,
"grad_norm": 0.3768471478704191,
"learning_rate": 4.6946576414105485e-06,
"loss": 0.3762,
"step": 553
},
{
"epoch": 1.7113402061855671,
"grad_norm": 0.3857887148028528,
"learning_rate": 4.676720698842474e-06,
"loss": 0.3529,
"step": 554
},
{
"epoch": 1.7144329896907218,
"grad_norm": 0.4266039767609616,
"learning_rate": 4.6587879331766465e-06,
"loss": 0.3594,
"step": 555
},
{
"epoch": 1.7175257731958764,
"grad_norm": 0.3948922880525116,
"learning_rate": 4.640859576111806e-06,
"loss": 0.3095,
"step": 556
},
{
"epoch": 1.720618556701031,
"grad_norm": 0.30686472583946167,
"learning_rate": 4.622935859289745e-06,
"loss": 0.3217,
"step": 557
},
{
"epoch": 1.7237113402061857,
"grad_norm": 0.33402814027027605,
"learning_rate": 4.605017014292294e-06,
"loss": 0.3162,
"step": 558
},
{
"epoch": 1.7268041237113403,
"grad_norm": 0.39075139358824645,
"learning_rate": 4.587103272638339e-06,
"loss": 0.3532,
"step": 559
},
{
"epoch": 1.729896907216495,
"grad_norm": 0.32617228983096963,
"learning_rate": 4.56919486578083e-06,
"loss": 0.3321,
"step": 560
},
{
"epoch": 1.7329896907216495,
"grad_norm": 0.4767942295459208,
"learning_rate": 4.551292025103789e-06,
"loss": 0.3727,
"step": 561
},
{
"epoch": 1.7360824742268042,
"grad_norm": 0.3987103445762424,
"learning_rate": 4.533394981919318e-06,
"loss": 0.3387,
"step": 562
},
{
"epoch": 1.7391752577319588,
"grad_norm": 0.38195474771000787,
"learning_rate": 4.515503967464619e-06,
"loss": 0.3518,
"step": 563
},
{
"epoch": 1.7422680412371134,
"grad_norm": 0.35405334182231835,
"learning_rate": 4.4976192128989905e-06,
"loss": 0.3058,
"step": 564
},
{
"epoch": 1.745360824742268,
"grad_norm": 0.3992288275189986,
"learning_rate": 4.479740949300864e-06,
"loss": 0.3615,
"step": 565
},
{
"epoch": 1.7484536082474227,
"grad_norm": 0.3332582817837684,
"learning_rate": 4.461869407664791e-06,
"loss": 0.3642,
"step": 566
},
{
"epoch": 1.7515463917525773,
"grad_norm": 0.42033238544083,
"learning_rate": 4.444004818898484e-06,
"loss": 0.3787,
"step": 567
},
{
"epoch": 1.754639175257732,
"grad_norm": 0.4880167336142244,
"learning_rate": 4.426147413819816e-06,
"loss": 0.4005,
"step": 568
},
{
"epoch": 1.7577319587628866,
"grad_norm": 0.3281528013955215,
"learning_rate": 4.408297423153841e-06,
"loss": 0.3268,
"step": 569
},
{
"epoch": 1.7608247422680412,
"grad_norm": 0.3574637034531168,
"learning_rate": 4.3904550775298235e-06,
"loss": 0.3773,
"step": 570
},
{
"epoch": 1.7639175257731958,
"grad_norm": 0.42421289930673906,
"learning_rate": 4.372620607478242e-06,
"loss": 0.3483,
"step": 571
},
{
"epoch": 1.7670103092783505,
"grad_norm": 0.3679170394870709,
"learning_rate": 4.354794243427829e-06,
"loss": 0.3731,
"step": 572
},
{
"epoch": 1.770103092783505,
"grad_norm": 0.38955579533123275,
"learning_rate": 4.336976215702574e-06,
"loss": 0.373,
"step": 573
},
{
"epoch": 1.7731958762886597,
"grad_norm": 0.45173466197382955,
"learning_rate": 4.319166754518768e-06,
"loss": 0.3379,
"step": 574
},
{
"epoch": 1.7762886597938143,
"grad_norm": 0.39227759702665027,
"learning_rate": 4.301366089982009e-06,
"loss": 0.3869,
"step": 575
},
{
"epoch": 1.779381443298969,
"grad_norm": 0.3729461642890205,
"learning_rate": 4.283574452084246e-06,
"loss": 0.3542,
"step": 576
},
{
"epoch": 1.7824742268041236,
"grad_norm": 0.4436097525383527,
"learning_rate": 4.265792070700796e-06,
"loss": 0.3567,
"step": 577
},
{
"epoch": 1.7855670103092782,
"grad_norm": 0.40747362880576216,
"learning_rate": 4.24801917558738e-06,
"loss": 0.3232,
"step": 578
},
{
"epoch": 1.7886597938144329,
"grad_norm": 0.3828534325794096,
"learning_rate": 4.23025599637715e-06,
"loss": 0.3209,
"step": 579
},
{
"epoch": 1.7917525773195875,
"grad_norm": 0.28993427750985645,
"learning_rate": 4.212502762577729e-06,
"loss": 0.3421,
"step": 580
},
{
"epoch": 1.794845360824742,
"grad_norm": 0.39602892726754746,
"learning_rate": 4.1947597035682355e-06,
"loss": 0.3635,
"step": 581
},
{
"epoch": 1.797938144329897,
"grad_norm": 0.3789277210001876,
"learning_rate": 4.17702704859633e-06,
"loss": 0.3234,
"step": 582
},
{
"epoch": 1.8010309278350516,
"grad_norm": 0.39826069405908116,
"learning_rate": 4.159305026775249e-06,
"loss": 0.32,
"step": 583
},
{
"epoch": 1.8041237113402062,
"grad_norm": 0.3932667433563304,
"learning_rate": 4.14159386708084e-06,
"loss": 0.3634,
"step": 584
},
{
"epoch": 1.8072164948453608,
"grad_norm": 0.3506483772059777,
"learning_rate": 4.1238937983486085e-06,
"loss": 0.3733,
"step": 585
},
{
"epoch": 1.8103092783505155,
"grad_norm": 0.4272908390532751,
"learning_rate": 4.106205049270764e-06,
"loss": 0.384,
"step": 586
},
{
"epoch": 1.81340206185567,
"grad_norm": 0.36438615249297024,
"learning_rate": 4.088527848393258e-06,
"loss": 0.3289,
"step": 587
},
{
"epoch": 1.8164948453608247,
"grad_norm": 0.3438225140274201,
"learning_rate": 4.070862424112833e-06,
"loss": 0.3511,
"step": 588
},
{
"epoch": 1.8195876288659794,
"grad_norm": 0.3114587225460769,
"learning_rate": 4.053209004674079e-06,
"loss": 0.3222,
"step": 589
},
{
"epoch": 1.822680412371134,
"grad_norm": 0.29869536683197423,
"learning_rate": 4.035567818166469e-06,
"loss": 0.3006,
"step": 590
},
{
"epoch": 1.8257731958762886,
"grad_norm": 0.3609354722469145,
"learning_rate": 4.017939092521434e-06,
"loss": 0.3662,
"step": 591
},
{
"epoch": 1.8288659793814435,
"grad_norm": 0.47969264614152507,
"learning_rate": 4.000323055509393e-06,
"loss": 0.3669,
"step": 592
},
{
"epoch": 1.831958762886598,
"grad_norm": 0.3050755424330547,
"learning_rate": 3.982719934736832e-06,
"loss": 0.3482,
"step": 593
},
{
"epoch": 1.8350515463917527,
"grad_norm": 0.4386065475892597,
"learning_rate": 3.9651299576433475e-06,
"loss": 0.3382,
"step": 594
},
{
"epoch": 1.8381443298969073,
"grad_norm": 0.3563022429334855,
"learning_rate": 3.947553351498719e-06,
"loss": 0.3758,
"step": 595
},
{
"epoch": 1.841237113402062,
"grad_norm": 0.3462789139485534,
"learning_rate": 3.929990343399963e-06,
"loss": 0.3771,
"step": 596
},
{
"epoch": 1.8443298969072166,
"grad_norm": 0.6529657263861594,
"learning_rate": 3.912441160268407e-06,
"loss": 0.3721,
"step": 597
},
{
"epoch": 1.8474226804123712,
"grad_norm": 0.42113391108788284,
"learning_rate": 3.894906028846757e-06,
"loss": 0.3296,
"step": 598
},
{
"epoch": 1.8505154639175259,
"grad_norm": 0.3248573190461725,
"learning_rate": 3.877385175696156e-06,
"loss": 0.328,
"step": 599
},
{
"epoch": 1.8536082474226805,
"grad_norm": 0.38525111572361326,
"learning_rate": 3.859878827193276e-06,
"loss": 0.3457,
"step": 600
},
{
"epoch": 1.8567010309278351,
"grad_norm": 0.45934475152425075,
"learning_rate": 3.842387209527374e-06,
"loss": 0.3482,
"step": 601
},
{
"epoch": 1.8597938144329897,
"grad_norm": 0.3268894340260456,
"learning_rate": 3.824910548697388e-06,
"loss": 0.319,
"step": 602
},
{
"epoch": 1.8628865979381444,
"grad_norm": 0.3645984463478698,
"learning_rate": 3.8074490705089983e-06,
"loss": 0.3271,
"step": 603
},
{
"epoch": 1.865979381443299,
"grad_norm": 0.29361036400417095,
"learning_rate": 3.790003000571726e-06,
"loss": 0.3453,
"step": 604
},
{
"epoch": 1.8690721649484536,
"grad_norm": 0.42982680335675016,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.3677,
"step": 605
},
{
"epoch": 1.8721649484536083,
"grad_norm": 0.3720418664564901,
"learning_rate": 3.7551579868902828e-06,
"loss": 0.354,
"step": 606
},
{
"epoch": 1.8752577319587629,
"grad_norm": 0.812231457718325,
"learning_rate": 3.7377594933580967e-06,
"loss": 0.3607,
"step": 607
},
{
"epoch": 1.8783505154639175,
"grad_norm": 0.347727286666641,
"learning_rate": 3.7203773084951816e-06,
"loss": 0.3527,
"step": 608
},
{
"epoch": 1.8814432989690721,
"grad_norm": 0.4814060129518193,
"learning_rate": 3.7030116568865486e-06,
"loss": 0.3684,
"step": 609
},
{
"epoch": 1.8845360824742268,
"grad_norm": 0.36169836379494236,
"learning_rate": 3.685662762903601e-06,
"loss": 0.3466,
"step": 610
},
{
"epoch": 1.8876288659793814,
"grad_norm": 0.34318821998983295,
"learning_rate": 3.6683308507012196e-06,
"loss": 0.3299,
"step": 611
},
{
"epoch": 1.890721649484536,
"grad_norm": 0.2995004736405127,
"learning_rate": 3.6510161442148783e-06,
"loss": 0.308,
"step": 612
},
{
"epoch": 1.8938144329896907,
"grad_norm": 0.515389833064657,
"learning_rate": 3.6337188671577463e-06,
"loss": 0.3536,
"step": 613
},
{
"epoch": 1.8969072164948453,
"grad_norm": 0.439637019996902,
"learning_rate": 3.6164392430177898e-06,
"loss": 0.325,
"step": 614
},
{
"epoch": 1.9,
"grad_norm": 0.3374756184294089,
"learning_rate": 3.599177495054903e-06,
"loss": 0.3683,
"step": 615
},
{
"epoch": 1.9030927835051545,
"grad_norm": 0.3948612480996173,
"learning_rate": 3.5819338462980037e-06,
"loss": 0.3625,
"step": 616
},
{
"epoch": 1.9061855670103092,
"grad_norm": 0.33357356595751453,
"learning_rate": 3.5647085195421668e-06,
"loss": 0.316,
"step": 617
},
{
"epoch": 1.9092783505154638,
"grad_norm": 0.45806876778610933,
"learning_rate": 3.5475017373457328e-06,
"loss": 0.356,
"step": 618
},
{
"epoch": 1.9123711340206184,
"grad_norm": 0.5131941090672334,
"learning_rate": 3.5303137220274467e-06,
"loss": 0.367,
"step": 619
},
{
"epoch": 1.915463917525773,
"grad_norm": 0.4186380446116192,
"learning_rate": 3.5131446956635706e-06,
"loss": 0.3809,
"step": 620
},
{
"epoch": 1.9185567010309277,
"grad_norm": 0.43813716192128194,
"learning_rate": 3.4959948800850253e-06,
"loss": 0.334,
"step": 621
},
{
"epoch": 1.9216494845360823,
"grad_norm": 0.3278335871752149,
"learning_rate": 3.478864496874519e-06,
"loss": 0.3531,
"step": 622
},
{
"epoch": 1.9247422680412372,
"grad_norm": 0.43980216398961003,
"learning_rate": 3.461753767363687e-06,
"loss": 0.3638,
"step": 623
},
{
"epoch": 1.9278350515463918,
"grad_norm": 0.39408155093980723,
"learning_rate": 3.4446629126302268e-06,
"loss": 0.3387,
"step": 624
},
{
"epoch": 1.9309278350515464,
"grad_norm": 0.3733877925676547,
"learning_rate": 3.427592153495053e-06,
"loss": 0.3462,
"step": 625
},
{
"epoch": 1.934020618556701,
"grad_norm": 0.4832864361818532,
"learning_rate": 3.410541710519427e-06,
"loss": 0.3567,
"step": 626
},
{
"epoch": 1.9371134020618557,
"grad_norm": 0.3762228487632087,
"learning_rate": 3.3935118040021255e-06,
"loss": 0.3031,
"step": 627
},
{
"epoch": 1.9402061855670103,
"grad_norm": 0.3598117820415253,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.3347,
"step": 628
},
{
"epoch": 1.943298969072165,
"grad_norm": 0.3579045196689011,
"learning_rate": 3.3595144802080493e-06,
"loss": 0.3662,
"step": 629
},
{
"epoch": 1.9463917525773196,
"grad_norm": 0.38779135939714676,
"learning_rate": 3.342547502190754e-06,
"loss": 0.309,
"step": 630
},
{
"epoch": 1.9494845360824742,
"grad_norm": 0.3153073267233112,
"learning_rate": 3.3256019391450696e-06,
"loss": 0.3268,
"step": 631
},
{
"epoch": 1.9525773195876288,
"grad_norm": 0.39382790442601595,
"learning_rate": 3.3086780100146776e-06,
"loss": 0.3551,
"step": 632
},
{
"epoch": 1.9556701030927837,
"grad_norm": 0.3173007725064202,
"learning_rate": 3.2917759334637376e-06,
"loss": 0.3162,
"step": 633
},
{
"epoch": 1.9587628865979383,
"grad_norm": 0.3731996954778807,
"learning_rate": 3.2748959278740714e-06,
"loss": 0.3844,
"step": 634
},
{
"epoch": 1.961855670103093,
"grad_norm": 0.38549990466885287,
"learning_rate": 3.258038211342327e-06,
"loss": 0.3603,
"step": 635
},
{
"epoch": 1.9649484536082475,
"grad_norm": 0.2821598992722239,
"learning_rate": 3.2412030016771768e-06,
"loss": 0.3093,
"step": 636
},
{
"epoch": 1.9680412371134022,
"grad_norm": 0.37085760577720334,
"learning_rate": 3.2243905163964863e-06,
"loss": 0.3517,
"step": 637
},
{
"epoch": 1.9711340206185568,
"grad_norm": 0.369370716299227,
"learning_rate": 3.2076009727245204e-06,
"loss": 0.3113,
"step": 638
},
{
"epoch": 1.9742268041237114,
"grad_norm": 0.44492141505571886,
"learning_rate": 3.1908345875891243e-06,
"loss": 0.3806,
"step": 639
},
{
"epoch": 1.977319587628866,
"grad_norm": 0.4299515138037621,
"learning_rate": 3.1740915776189275e-06,
"loss": 0.3484,
"step": 640
},
{
"epoch": 1.9804123711340207,
"grad_norm": 0.3458870621986509,
"learning_rate": 3.1573721591405405e-06,
"loss": 0.3463,
"step": 641
},
{
"epoch": 1.9835051546391753,
"grad_norm": 0.3522458946378208,
"learning_rate": 3.140676548175763e-06,
"loss": 0.3339,
"step": 642
},
{
"epoch": 1.98659793814433,
"grad_norm": 0.3755282519867609,
"learning_rate": 3.1240049604387955e-06,
"loss": 0.3647,
"step": 643
},
{
"epoch": 1.9896907216494846,
"grad_norm": 0.3836690091047248,
"learning_rate": 3.10735761133344e-06,
"loss": 0.3316,
"step": 644
},
{
"epoch": 1.9927835051546392,
"grad_norm": 0.460009964702149,
"learning_rate": 3.0907347159503364e-06,
"loss": 0.3478,
"step": 645
},
{
"epoch": 1.9958762886597938,
"grad_norm": 0.38159065451537555,
"learning_rate": 3.074136489064161e-06,
"loss": 0.3474,
"step": 646
},
{
"epoch": 1.9989690721649485,
"grad_norm": 0.38052585704250785,
"learning_rate": 3.057563145130873e-06,
"loss": 0.3167,
"step": 647
},
{
"epoch": 2.0,
"grad_norm": 0.38052585704250785,
"learning_rate": 3.0410148982849248e-06,
"loss": 0.346,
"step": 648
},
{
"epoch": 2.0030927835051546,
"grad_norm": 0.7864373970070601,
"learning_rate": 3.024491962336511e-06,
"loss": 0.3058,
"step": 649
},
{
"epoch": 2.0061855670103093,
"grad_norm": 0.4155546032809773,
"learning_rate": 3.007994550768793e-06,
"loss": 0.2993,
"step": 650
},
{
"epoch": 2.009278350515464,
"grad_norm": 0.517838019795173,
"learning_rate": 2.991522876735154e-06,
"loss": 0.2926,
"step": 651
},
{
"epoch": 2.0123711340206185,
"grad_norm": 0.34043841402604463,
"learning_rate": 2.9750771530564295e-06,
"loss": 0.2738,
"step": 652
},
{
"epoch": 2.015463917525773,
"grad_norm": 0.3120194955678573,
"learning_rate": 2.9586575922181724e-06,
"loss": 0.2674,
"step": 653
},
{
"epoch": 2.0185567010309278,
"grad_norm": 0.7794499266529689,
"learning_rate": 2.9422644063678952e-06,
"loss": 0.2644,
"step": 654
},
{
"epoch": 2.0216494845360824,
"grad_norm": 0.38566659693898253,
"learning_rate": 2.9258978073123413e-06,
"loss": 0.2684,
"step": 655
},
{
"epoch": 2.024742268041237,
"grad_norm": 0.49676337527578907,
"learning_rate": 2.909558006514735e-06,
"loss": 0.257,
"step": 656
},
{
"epoch": 2.0278350515463917,
"grad_norm": 0.35809281358226486,
"learning_rate": 2.8932452150920576e-06,
"loss": 0.2881,
"step": 657
},
{
"epoch": 2.0309278350515463,
"grad_norm": 0.343520690507132,
"learning_rate": 2.876959643812325e-06,
"loss": 0.2949,
"step": 658
},
{
"epoch": 2.034020618556701,
"grad_norm": 0.43398985865900497,
"learning_rate": 2.860701503091845e-06,
"loss": 0.3143,
"step": 659
},
{
"epoch": 2.0371134020618555,
"grad_norm": 0.5249462442966696,
"learning_rate": 2.844471002992526e-06,
"loss": 0.2811,
"step": 660
},
{
"epoch": 2.04020618556701,
"grad_norm": 0.38045577210647147,
"learning_rate": 2.8282683532191333e-06,
"loss": 0.2832,
"step": 661
},
{
"epoch": 2.043298969072165,
"grad_norm": 0.41396895046937116,
"learning_rate": 2.8120937631166056e-06,
"loss": 0.283,
"step": 662
},
{
"epoch": 2.0463917525773194,
"grad_norm": 0.3911427806566122,
"learning_rate": 2.795947441667334e-06,
"loss": 0.2797,
"step": 663
},
{
"epoch": 2.049484536082474,
"grad_norm": 0.3515798025425524,
"learning_rate": 2.7798295974884675e-06,
"loss": 0.2801,
"step": 664
},
{
"epoch": 2.0525773195876287,
"grad_norm": 0.3343140501364322,
"learning_rate": 2.7637404388292184e-06,
"loss": 0.3043,
"step": 665
},
{
"epoch": 2.0556701030927833,
"grad_norm": 0.4382243594147191,
"learning_rate": 2.747680173568168e-06,
"loss": 0.2569,
"step": 666
},
{
"epoch": 2.058762886597938,
"grad_norm": 0.5015312640014641,
"learning_rate": 2.7316490092105856e-06,
"loss": 0.2709,
"step": 667
},
{
"epoch": 2.0618556701030926,
"grad_norm": 0.4976105522523717,
"learning_rate": 2.715647152885743e-06,
"loss": 0.3071,
"step": 668
},
{
"epoch": 2.0649484536082476,
"grad_norm": 0.40739613826250587,
"learning_rate": 2.6996748113442397e-06,
"loss": 0.2654,
"step": 669
},
{
"epoch": 2.0680412371134023,
"grad_norm": 0.37038708287005284,
"learning_rate": 2.6837321909553336e-06,
"loss": 0.2736,
"step": 670
},
{
"epoch": 2.071134020618557,
"grad_norm": 0.48730500899488516,
"learning_rate": 2.6678194977042727e-06,
"loss": 0.2616,
"step": 671
},
{
"epoch": 2.0742268041237115,
"grad_norm": 0.4328911289033136,
"learning_rate": 2.651936937189632e-06,
"loss": 0.3325,
"step": 672
},
{
"epoch": 2.077319587628866,
"grad_norm": 0.5426180704517115,
"learning_rate": 2.6360847146206624e-06,
"loss": 0.2887,
"step": 673
},
{
"epoch": 2.0804123711340208,
"grad_norm": 0.6127385789467704,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.27,
"step": 674
},
{
"epoch": 2.0835051546391754,
"grad_norm": 0.3666477703023413,
"learning_rate": 2.6044721021941887e-06,
"loss": 0.2891,
"step": 675
},
{
"epoch": 2.08659793814433,
"grad_norm": 0.37996463656101304,
"learning_rate": 2.5887121207847093e-06,
"loss": 0.3025,
"step": 676
},
{
"epoch": 2.0896907216494847,
"grad_norm": 0.35839291187334554,
"learning_rate": 2.5729832942116705e-06,
"loss": 0.2903,
"step": 677
},
{
"epoch": 2.0927835051546393,
"grad_norm": 0.4004079234535629,
"learning_rate": 2.5572858256980163e-06,
"loss": 0.2635,
"step": 678
},
{
"epoch": 2.095876288659794,
"grad_norm": 0.3487304623438044,
"learning_rate": 2.5416199180615297e-06,
"loss": 0.2711,
"step": 679
},
{
"epoch": 2.0989690721649485,
"grad_norm": 0.4353724025324779,
"learning_rate": 2.525985773712216e-06,
"loss": 0.2755,
"step": 680
},
{
"epoch": 2.102061855670103,
"grad_norm": 0.4276980963577041,
"learning_rate": 2.5103835946496846e-06,
"loss": 0.2917,
"step": 681
},
{
"epoch": 2.105154639175258,
"grad_norm": 0.42988999785831816,
"learning_rate": 2.4948135824605366e-06,
"loss": 0.24,
"step": 682
},
{
"epoch": 2.1082474226804124,
"grad_norm": 0.4082557837039154,
"learning_rate": 2.479275938315775e-06,
"loss": 0.2634,
"step": 683
},
{
"epoch": 2.111340206185567,
"grad_norm": 0.3143558127025377,
"learning_rate": 2.4637708629681786e-06,
"loss": 0.2673,
"step": 684
},
{
"epoch": 2.1144329896907217,
"grad_norm": 0.357931063053651,
"learning_rate": 2.4482985567497395e-06,
"loss": 0.278,
"step": 685
},
{
"epoch": 2.1175257731958763,
"grad_norm": 0.3981027115089952,
"learning_rate": 2.4328592195690444e-06,
"loss": 0.2677,
"step": 686
},
{
"epoch": 2.120618556701031,
"grad_norm": 0.40716571063614465,
"learning_rate": 2.4174530509087193e-06,
"loss": 0.273,
"step": 687
},
{
"epoch": 2.1237113402061856,
"grad_norm": 0.36013914620812215,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.2601,
"step": 688
},
{
"epoch": 2.12680412371134,
"grad_norm": 0.3767013802752783,
"learning_rate": 2.3867410149343284e-06,
"loss": 0.282,
"step": 689
},
{
"epoch": 2.129896907216495,
"grad_norm": 0.49573734401612746,
"learning_rate": 2.3714355444324675e-06,
"loss": 0.2557,
"step": 690
},
{
"epoch": 2.1329896907216495,
"grad_norm": 0.3697812393978906,
"learning_rate": 2.3561640360702525e-06,
"loss": 0.2651,
"step": 691
},
{
"epoch": 2.136082474226804,
"grad_norm": 0.3928302981423731,
"learning_rate": 2.340926687161893e-06,
"loss": 0.2942,
"step": 692
},
{
"epoch": 2.1391752577319587,
"grad_norm": 0.44134848334336135,
"learning_rate": 2.3257236945802292e-06,
"loss": 0.2776,
"step": 693
},
{
"epoch": 2.1422680412371133,
"grad_norm": 0.5528462494959189,
"learning_rate": 2.31055525475422e-06,
"loss": 0.2531,
"step": 694
},
{
"epoch": 2.145360824742268,
"grad_norm": 0.3843449058361117,
"learning_rate": 2.295421563666372e-06,
"loss": 0.2888,
"step": 695
},
{
"epoch": 2.1484536082474226,
"grad_norm": 0.3541017264885976,
"learning_rate": 2.2803228168502383e-06,
"loss": 0.2728,
"step": 696
},
{
"epoch": 2.1515463917525772,
"grad_norm": 0.365972017913051,
"learning_rate": 2.265259209387867e-06,
"loss": 0.2565,
"step": 697
},
{
"epoch": 2.154639175257732,
"grad_norm": 0.3863462492512773,
"learning_rate": 2.2502309359072953e-06,
"loss": 0.2682,
"step": 698
},
{
"epoch": 2.1577319587628865,
"grad_norm": 0.35442097659929384,
"learning_rate": 2.2352381905800325e-06,
"loss": 0.2536,
"step": 699
},
{
"epoch": 2.160824742268041,
"grad_norm": 0.32697612991708297,
"learning_rate": 2.2202811671185458e-06,
"loss": 0.2975,
"step": 700
},
{
"epoch": 2.1639175257731957,
"grad_norm": 0.36614596481715983,
"learning_rate": 2.205360058773764e-06,
"loss": 0.2735,
"step": 701
},
{
"epoch": 2.1670103092783504,
"grad_norm": 0.43091349727443096,
"learning_rate": 2.190475058332574e-06,
"loss": 0.2549,
"step": 702
},
{
"epoch": 2.170103092783505,
"grad_norm": 0.361129805505901,
"learning_rate": 2.1756263581153427e-06,
"loss": 0.3013,
"step": 703
},
{
"epoch": 2.1731958762886596,
"grad_norm": 0.4947360631523454,
"learning_rate": 2.16081414997341e-06,
"loss": 0.2583,
"step": 704
},
{
"epoch": 2.1762886597938143,
"grad_norm": 0.38239489577444014,
"learning_rate": 2.1460386252866327e-06,
"loss": 0.2851,
"step": 705
},
{
"epoch": 2.179381443298969,
"grad_norm": 0.3919201012163893,
"learning_rate": 2.1312999749608987e-06,
"loss": 0.2598,
"step": 706
},
{
"epoch": 2.1824742268041235,
"grad_norm": 0.45850653108362616,
"learning_rate": 2.1165983894256647e-06,
"loss": 0.2748,
"step": 707
},
{
"epoch": 2.1855670103092786,
"grad_norm": 0.414377399643986,
"learning_rate": 2.101934058631495e-06,
"loss": 0.3246,
"step": 708
},
{
"epoch": 2.188659793814433,
"grad_norm": 0.36659216289733115,
"learning_rate": 2.0873071720476067e-06,
"loss": 0.2595,
"step": 709
},
{
"epoch": 2.191752577319588,
"grad_norm": 0.3417432754143865,
"learning_rate": 2.0727179186594224e-06,
"loss": 0.2881,
"step": 710
},
{
"epoch": 2.1948453608247425,
"grad_norm": 0.33748255922934906,
"learning_rate": 2.058166486966128e-06,
"loss": 0.2418,
"step": 711
},
{
"epoch": 2.197938144329897,
"grad_norm": 0.3995433111495412,
"learning_rate": 2.043653064978239e-06,
"loss": 0.2679,
"step": 712
},
{
"epoch": 2.2010309278350517,
"grad_norm": 0.4163499781278848,
"learning_rate": 2.0291778402151685e-06,
"loss": 0.2564,
"step": 713
},
{
"epoch": 2.2041237113402063,
"grad_norm": 0.32369566851848647,
"learning_rate": 2.0147409997028045e-06,
"loss": 0.2842,
"step": 714
},
{
"epoch": 2.207216494845361,
"grad_norm": 0.4445352641934898,
"learning_rate": 2.0003427299710966e-06,
"loss": 0.3008,
"step": 715
},
{
"epoch": 2.2103092783505156,
"grad_norm": 0.37416288101445394,
"learning_rate": 1.9859832170516437e-06,
"loss": 0.2833,
"step": 716
},
{
"epoch": 2.2134020618556702,
"grad_norm": 0.44437495554517636,
"learning_rate": 1.9716626464752896e-06,
"loss": 0.2731,
"step": 717
},
{
"epoch": 2.216494845360825,
"grad_norm": 0.43872445768670315,
"learning_rate": 1.9573812032697277e-06,
"loss": 0.2923,
"step": 718
},
{
"epoch": 2.2195876288659795,
"grad_norm": 0.3242000894012009,
"learning_rate": 1.9431390719571096e-06,
"loss": 0.2392,
"step": 719
},
{
"epoch": 2.222680412371134,
"grad_norm": 0.47261881424241464,
"learning_rate": 1.928936436551661e-06,
"loss": 0.2833,
"step": 720
},
{
"epoch": 2.2257731958762887,
"grad_norm": 0.6426216936806711,
"learning_rate": 1.914773480557304e-06,
"loss": 0.2585,
"step": 721
},
{
"epoch": 2.2288659793814434,
"grad_norm": 0.48708615764527685,
"learning_rate": 1.9006503869652854e-06,
"loss": 0.2923,
"step": 722
},
{
"epoch": 2.231958762886598,
"grad_norm": 0.4066200708502729,
"learning_rate": 1.8865673382518146e-06,
"loss": 0.2868,
"step": 723
},
{
"epoch": 2.2350515463917526,
"grad_norm": 0.4193733496397231,
"learning_rate": 1.872524516375705e-06,
"loss": 0.2813,
"step": 724
},
{
"epoch": 2.2381443298969073,
"grad_norm": 0.37564083363888556,
"learning_rate": 1.8585221027760209e-06,
"loss": 0.2832,
"step": 725
},
{
"epoch": 2.241237113402062,
"grad_norm": 0.3203728508806716,
"learning_rate": 1.8445602783697375e-06,
"loss": 0.2657,
"step": 726
},
{
"epoch": 2.2443298969072165,
"grad_norm": 0.39734444741295993,
"learning_rate": 1.8306392235493946e-06,
"loss": 0.2582,
"step": 727
},
{
"epoch": 2.247422680412371,
"grad_norm": 0.5190391324637584,
"learning_rate": 1.8167591181807836e-06,
"loss": 0.3178,
"step": 728
},
{
"epoch": 2.2505154639175258,
"grad_norm": 0.3638149149971288,
"learning_rate": 1.8029201416005976e-06,
"loss": 0.2378,
"step": 729
},
{
"epoch": 2.2536082474226804,
"grad_norm": 0.3864306165776402,
"learning_rate": 1.789122472614143e-06,
"loss": 0.2643,
"step": 730
},
{
"epoch": 2.256701030927835,
"grad_norm": 0.40392109683979727,
"learning_rate": 1.775366289493003e-06,
"loss": 0.2998,
"step": 731
},
{
"epoch": 2.2597938144329897,
"grad_norm": 0.47275997790914914,
"learning_rate": 1.7616517699727554e-06,
"loss": 0.2748,
"step": 732
},
{
"epoch": 2.2628865979381443,
"grad_norm": 0.408857288094096,
"learning_rate": 1.7479790912506628e-06,
"loss": 0.2554,
"step": 733
},
{
"epoch": 2.265979381443299,
"grad_norm": 0.41080443668702876,
"learning_rate": 1.734348429983384e-06,
"loss": 0.2522,
"step": 734
},
{
"epoch": 2.2690721649484535,
"grad_norm": 0.38970842456935784,
"learning_rate": 1.7207599622847042e-06,
"loss": 0.3015,
"step": 735
},
{
"epoch": 2.272164948453608,
"grad_norm": 0.35769582798595084,
"learning_rate": 1.7072138637232394e-06,
"loss": 0.2763,
"step": 736
},
{
"epoch": 2.275257731958763,
"grad_norm": 0.4250873527600758,
"learning_rate": 1.6937103093201895e-06,
"loss": 0.3007,
"step": 737
},
{
"epoch": 2.2783505154639174,
"grad_norm": 0.361704767065038,
"learning_rate": 1.6802494735470548e-06,
"loss": 0.2558,
"step": 738
},
{
"epoch": 2.281443298969072,
"grad_norm": 0.3686554134526815,
"learning_rate": 1.6668315303234068e-06,
"loss": 0.2619,
"step": 739
},
{
"epoch": 2.2845360824742267,
"grad_norm": 0.44057581127752826,
"learning_rate": 1.6534566530146123e-06,
"loss": 0.2878,
"step": 740
},
{
"epoch": 2.2876288659793813,
"grad_norm": 0.3306113026712118,
"learning_rate": 1.6401250144296239e-06,
"loss": 0.2848,
"step": 741
},
{
"epoch": 2.290721649484536,
"grad_norm": 0.30579912019686456,
"learning_rate": 1.626836786818719e-06,
"loss": 0.3013,
"step": 742
},
{
"epoch": 2.2938144329896906,
"grad_norm": 0.4848429339929378,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.2721,
"step": 743
},
{
"epoch": 2.296907216494845,
"grad_norm": 0.3619522576835984,
"learning_rate": 1.6003912507136422e-06,
"loss": 0.2901,
"step": 744
},
{
"epoch": 2.3,
"grad_norm": 0.3117306212597808,
"learning_rate": 1.5872342839067305e-06,
"loss": 0.2809,
"step": 745
},
{
"epoch": 2.3030927835051545,
"grad_norm": 0.3018814157736744,
"learning_rate": 1.574121411444013e-06,
"loss": 0.2563,
"step": 746
},
{
"epoch": 2.306185567010309,
"grad_norm": 0.39204350000474086,
"learning_rate": 1.561052802749221e-06,
"loss": 0.2949,
"step": 747
},
{
"epoch": 2.3092783505154637,
"grad_norm": 0.469936745505809,
"learning_rate": 1.548028626674189e-06,
"loss": 0.2635,
"step": 748
},
{
"epoch": 2.3123711340206183,
"grad_norm": 0.33508014362099003,
"learning_rate": 1.5350490514966509e-06,
"loss": 0.2604,
"step": 749
},
{
"epoch": 2.315463917525773,
"grad_norm": 0.4125367262529412,
"learning_rate": 1.5221142449180882e-06,
"loss": 0.2784,
"step": 750
},
{
"epoch": 2.3185567010309276,
"grad_norm": 0.40880353780401285,
"learning_rate": 1.5092243740615486e-06,
"loss": 0.2649,
"step": 751
},
{
"epoch": 2.3216494845360827,
"grad_norm": 0.3511252824079501,
"learning_rate": 1.496379605469494e-06,
"loss": 0.2589,
"step": 752
},
{
"epoch": 2.3247422680412373,
"grad_norm": 0.44823739152907094,
"learning_rate": 1.4835801051016463e-06,
"loss": 0.3085,
"step": 753
},
{
"epoch": 2.327835051546392,
"grad_norm": 0.4619415605914144,
"learning_rate": 1.4708260383328422e-06,
"loss": 0.3109,
"step": 754
},
{
"epoch": 2.3309278350515465,
"grad_norm": 0.37302697291531406,
"learning_rate": 1.4581175699508982e-06,
"loss": 0.2895,
"step": 755
},
{
"epoch": 2.334020618556701,
"grad_norm": 0.4600197812984674,
"learning_rate": 1.4454548641544803e-06,
"loss": 0.2506,
"step": 756
},
{
"epoch": 2.337113402061856,
"grad_norm": 0.3379307558333953,
"learning_rate": 1.4328380845509837e-06,
"loss": 0.2783,
"step": 757
},
{
"epoch": 2.3402061855670104,
"grad_norm": 0.4403369320114859,
"learning_rate": 1.4202673941544176e-06,
"loss": 0.2945,
"step": 758
},
{
"epoch": 2.343298969072165,
"grad_norm": 0.41855994301163396,
"learning_rate": 1.4077429553832995e-06,
"loss": 0.2787,
"step": 759
},
{
"epoch": 2.3463917525773197,
"grad_norm": 0.4242311482486076,
"learning_rate": 1.3952649300585574e-06,
"loss": 0.2753,
"step": 760
},
{
"epoch": 2.3494845360824743,
"grad_norm": 0.4912168070549898,
"learning_rate": 1.382833479401438e-06,
"loss": 0.2874,
"step": 761
},
{
"epoch": 2.352577319587629,
"grad_norm": 0.47260058954453504,
"learning_rate": 1.3704487640314257e-06,
"loss": 0.2781,
"step": 762
},
{
"epoch": 2.3556701030927836,
"grad_norm": 0.3356648638014468,
"learning_rate": 1.3581109439641587e-06,
"loss": 0.2429,
"step": 763
},
{
"epoch": 2.358762886597938,
"grad_norm": 0.3406439635161593,
"learning_rate": 1.3458201786093795e-06,
"loss": 0.3005,
"step": 764
},
{
"epoch": 2.361855670103093,
"grad_norm": 0.9027133680522813,
"learning_rate": 1.3335766267688566e-06,
"loss": 0.2512,
"step": 765
},
{
"epoch": 2.3649484536082475,
"grad_norm": 0.4170248415384265,
"learning_rate": 1.321380446634342e-06,
"loss": 0.2716,
"step": 766
},
{
"epoch": 2.368041237113402,
"grad_norm": 0.34736226887673133,
"learning_rate": 1.309231795785526e-06,
"loss": 0.279,
"step": 767
},
{
"epoch": 2.3711340206185567,
"grad_norm": 0.38374462474597076,
"learning_rate": 1.2971308311880015e-06,
"loss": 0.2659,
"step": 768
},
{
"epoch": 2.3742268041237113,
"grad_norm": 0.4087701862728044,
"learning_rate": 1.2850777091912364e-06,
"loss": 0.2615,
"step": 769
},
{
"epoch": 2.377319587628866,
"grad_norm": 0.4354684165110471,
"learning_rate": 1.2730725855265452e-06,
"loss": 0.2617,
"step": 770
},
{
"epoch": 2.3804123711340206,
"grad_norm": 0.4232874812730349,
"learning_rate": 1.2611156153050963e-06,
"loss": 0.2992,
"step": 771
},
{
"epoch": 2.3835051546391752,
"grad_norm": 0.4866904483506314,
"learning_rate": 1.2492069530158829e-06,
"loss": 0.2852,
"step": 772
},
{
"epoch": 2.38659793814433,
"grad_norm": 0.522108829131126,
"learning_rate": 1.237346752523752e-06,
"loss": 0.2949,
"step": 773
},
{
"epoch": 2.3896907216494845,
"grad_norm": 0.36945752306550494,
"learning_rate": 1.225535167067392e-06,
"loss": 0.299,
"step": 774
},
{
"epoch": 2.392783505154639,
"grad_norm": 0.40065674081859776,
"learning_rate": 1.2137723492573766e-06,
"loss": 0.2727,
"step": 775
},
{
"epoch": 2.3958762886597937,
"grad_norm": 0.43968669047979536,
"learning_rate": 1.2020584510741707e-06,
"loss": 0.2764,
"step": 776
},
{
"epoch": 2.3989690721649484,
"grad_norm": 0.3823590728970254,
"learning_rate": 1.1903936238661868e-06,
"loss": 0.2512,
"step": 777
},
{
"epoch": 2.402061855670103,
"grad_norm": 0.3865543216955176,
"learning_rate": 1.1787780183478126e-06,
"loss": 0.2597,
"step": 778
},
{
"epoch": 2.4051546391752576,
"grad_norm": 0.4474565050735651,
"learning_rate": 1.167211784597474e-06,
"loss": 0.2475,
"step": 779
},
{
"epoch": 2.4082474226804123,
"grad_norm": 1.5862590518924486,
"learning_rate": 1.1556950720556976e-06,
"loss": 0.2697,
"step": 780
},
{
"epoch": 2.411340206185567,
"grad_norm": 0.46216684820643666,
"learning_rate": 1.1442280295231656e-06,
"loss": 0.2752,
"step": 781
},
{
"epoch": 2.4144329896907215,
"grad_norm": 0.40916275158083065,
"learning_rate": 1.1328108051588154e-06,
"loss": 0.2357,
"step": 782
},
{
"epoch": 2.417525773195876,
"grad_norm": 0.48248035654922017,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.2558,
"step": 783
},
{
"epoch": 2.4206185567010308,
"grad_norm": 0.36558870979576563,
"learning_rate": 1.1101264003501088e-06,
"loss": 0.2771,
"step": 784
},
{
"epoch": 2.4237113402061854,
"grad_norm": 0.37906519013254625,
"learning_rate": 1.0988595129976444e-06,
"loss": 0.2524,
"step": 785
},
{
"epoch": 2.42680412371134,
"grad_norm": 0.3733058866504784,
"learning_rate": 1.0876430299933516e-06,
"loss": 0.258,
"step": 786
},
{
"epoch": 2.429896907216495,
"grad_norm": 0.3403712357526472,
"learning_rate": 1.0764770962588278e-06,
"loss": 0.2502,
"step": 787
},
{
"epoch": 2.4329896907216497,
"grad_norm": 0.38993238552151593,
"learning_rate": 1.0653618560625556e-06,
"loss": 0.2777,
"step": 788
},
{
"epoch": 2.4360824742268044,
"grad_norm": 0.4792747181734268,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.297,
"step": 789
},
{
"epoch": 2.439175257731959,
"grad_norm": 0.47061224367787174,
"learning_rate": 1.0432840300819224e-06,
"loss": 0.2518,
"step": 790
},
{
"epoch": 2.4422680412371136,
"grad_norm": 0.35277694812297855,
"learning_rate": 1.0323217295522026e-06,
"loss": 0.245,
"step": 791
},
{
"epoch": 2.4453608247422682,
"grad_norm": 0.360935495801492,
"learning_rate": 1.0214106930663293e-06,
"loss": 0.2731,
"step": 792
},
{
"epoch": 2.448453608247423,
"grad_norm": 0.4162882205392147,
"learning_rate": 1.0105510615994051e-06,
"loss": 0.2776,
"step": 793
},
{
"epoch": 2.4515463917525775,
"grad_norm": 0.41547933460758235,
"learning_rate": 9.99742975462359e-07,
"loss": 0.2611,
"step": 794
},
{
"epoch": 2.454639175257732,
"grad_norm": 0.37728046401810167,
"learning_rate": 9.889865743001332e-07,
"loss": 0.2664,
"step": 795
},
{
"epoch": 2.4577319587628867,
"grad_norm": 0.43165759943984655,
"learning_rate": 9.782819970898776e-07,
"loss": 0.2605,
"step": 796
},
{
"epoch": 2.4608247422680414,
"grad_norm": 0.3654529820399213,
"learning_rate": 9.676293821391568e-07,
"loss": 0.2988,
"step": 797
},
{
"epoch": 2.463917525773196,
"grad_norm": 0.43771001110749813,
"learning_rate": 9.570288670841609e-07,
"loss": 0.2555,
"step": 798
},
{
"epoch": 2.4670103092783506,
"grad_norm": 0.338337995644797,
"learning_rate": 9.464805888879264e-07,
"loss": 0.2696,
"step": 799
},
{
"epoch": 2.4701030927835053,
"grad_norm": 0.3598910989343362,
"learning_rate": 9.359846838385706e-07,
"loss": 0.2768,
"step": 800
},
{
"epoch": 2.47319587628866,
"grad_norm": 0.47013544090875536,
"learning_rate": 9.255412875475256e-07,
"loss": 0.2867,
"step": 801
},
{
"epoch": 2.4762886597938145,
"grad_norm": 0.42217158967218676,
"learning_rate": 9.151505349477901e-07,
"loss": 0.2746,
"step": 802
},
{
"epoch": 2.479381443298969,
"grad_norm": 0.35208495582452803,
"learning_rate": 9.048125602921843e-07,
"loss": 0.2563,
"step": 803
},
{
"epoch": 2.4824742268041238,
"grad_norm": 0.3505110513475081,
"learning_rate": 8.945274971516155e-07,
"loss": 0.2536,
"step": 804
},
{
"epoch": 2.4855670103092784,
"grad_norm": 0.4087816084686581,
"learning_rate": 8.842954784133517e-07,
"loss": 0.3051,
"step": 805
},
{
"epoch": 2.488659793814433,
"grad_norm": 0.3960789356267894,
"learning_rate": 8.741166362793057e-07,
"loss": 0.2537,
"step": 806
},
{
"epoch": 2.4917525773195877,
"grad_norm": 0.39069133380864984,
"learning_rate": 8.639911022643288e-07,
"loss": 0.264,
"step": 807
},
{
"epoch": 2.4948453608247423,
"grad_norm": 0.3863554138938276,
"learning_rate": 8.539190071945036e-07,
"loss": 0.272,
"step": 808
},
{
"epoch": 2.497938144329897,
"grad_norm": 0.40862601106103924,
"learning_rate": 8.439004812054658e-07,
"loss": 0.2636,
"step": 809
},
{
"epoch": 2.5010309278350515,
"grad_norm": 0.4222027241318616,
"learning_rate": 8.339356537407129e-07,
"loss": 0.2596,
"step": 810
},
{
"epoch": 2.504123711340206,
"grad_norm": 0.36178986472682245,
"learning_rate": 8.240246535499369e-07,
"loss": 0.2847,
"step": 811
},
{
"epoch": 2.507216494845361,
"grad_norm": 0.38472636391035925,
"learning_rate": 8.141676086873574e-07,
"loss": 0.2774,
"step": 812
},
{
"epoch": 2.5103092783505154,
"grad_norm": 0.4039931071916291,
"learning_rate": 8.043646465100696e-07,
"loss": 0.2434,
"step": 813
},
{
"epoch": 2.51340206185567,
"grad_norm": 0.32765615054736147,
"learning_rate": 7.946158936764003e-07,
"loss": 0.2747,
"step": 814
},
{
"epoch": 2.5164948453608247,
"grad_norm": 0.3772369754517315,
"learning_rate": 7.849214761442637e-07,
"loss": 0.2729,
"step": 815
},
{
"epoch": 2.5195876288659793,
"grad_norm": 0.3986761836353652,
"learning_rate": 7.752815191695462e-07,
"loss": 0.2977,
"step": 816
},
{
"epoch": 2.522680412371134,
"grad_norm": 0.44269655672764346,
"learning_rate": 7.656961473044744e-07,
"loss": 0.2605,
"step": 817
},
{
"epoch": 2.5257731958762886,
"grad_norm": 0.43722142368011174,
"learning_rate": 7.561654843960208e-07,
"loss": 0.2602,
"step": 818
},
{
"epoch": 2.528865979381443,
"grad_norm": 0.3764444430973381,
"learning_rate": 7.466896535842865e-07,
"loss": 0.262,
"step": 819
},
{
"epoch": 2.531958762886598,
"grad_norm": 0.43708092583266656,
"learning_rate": 7.372687773009273e-07,
"loss": 0.2876,
"step": 820
},
{
"epoch": 2.5350515463917525,
"grad_norm": 0.375053913647473,
"learning_rate": 7.279029772675572e-07,
"loss": 0.2637,
"step": 821
},
{
"epoch": 2.538144329896907,
"grad_norm": 5.106978224597531,
"learning_rate": 7.185923744941881e-07,
"loss": 0.248,
"step": 822
},
{
"epoch": 2.5412371134020617,
"grad_norm": 0.40781135132686575,
"learning_rate": 7.093370892776558e-07,
"loss": 0.2786,
"step": 823
},
{
"epoch": 2.5443298969072163,
"grad_norm": 0.3881270194714463,
"learning_rate": 7.001372412000718e-07,
"loss": 0.281,
"step": 824
},
{
"epoch": 2.547422680412371,
"grad_norm": 0.31196018561276995,
"learning_rate": 6.909929491272799e-07,
"loss": 0.2535,
"step": 825
},
{
"epoch": 2.5505154639175256,
"grad_norm": 0.40077079764947493,
"learning_rate": 6.819043312073109e-07,
"loss": 0.2762,
"step": 826
},
{
"epoch": 2.55360824742268,
"grad_norm": 0.5121874348267299,
"learning_rate": 6.728715048688711e-07,
"loss": 0.2696,
"step": 827
},
{
"epoch": 2.556701030927835,
"grad_norm": 0.49662806556832984,
"learning_rate": 6.638945868198071e-07,
"loss": 0.2629,
"step": 828
},
{
"epoch": 2.5597938144329895,
"grad_norm": 0.4261532245947411,
"learning_rate": 6.549736930456163e-07,
"loss": 0.2577,
"step": 829
},
{
"epoch": 2.562886597938144,
"grad_norm": 0.3591850207269949,
"learning_rate": 6.461089388079316e-07,
"loss": 0.2779,
"step": 830
},
{
"epoch": 2.5659793814432987,
"grad_norm": 0.4543277400791709,
"learning_rate": 6.373004386430442e-07,
"loss": 0.2866,
"step": 831
},
{
"epoch": 2.5690721649484534,
"grad_norm": 0.39010995724066533,
"learning_rate": 6.285483063604187e-07,
"loss": 0.2647,
"step": 832
},
{
"epoch": 2.572164948453608,
"grad_norm": 0.4890947193837219,
"learning_rate": 6.198526550412232e-07,
"loss": 0.2426,
"step": 833
},
{
"epoch": 2.5752577319587626,
"grad_norm": 0.39146288102200943,
"learning_rate": 6.112135970368682e-07,
"loss": 0.2516,
"step": 834
},
{
"epoch": 2.5783505154639177,
"grad_norm": 0.35663592908885416,
"learning_rate": 6.026312439675553e-07,
"loss": 0.2481,
"step": 835
},
{
"epoch": 2.5814432989690723,
"grad_norm": 0.3665307794329737,
"learning_rate": 5.941057067208345e-07,
"loss": 0.2902,
"step": 836
},
{
"epoch": 2.584536082474227,
"grad_norm": 0.3711205197724142,
"learning_rate": 5.856370954501722e-07,
"loss": 0.2383,
"step": 837
},
{
"epoch": 2.5876288659793816,
"grad_norm": 0.34065816638638197,
"learning_rate": 5.772255195735287e-07,
"loss": 0.2502,
"step": 838
},
{
"epoch": 2.590721649484536,
"grad_norm": 0.6025679888532969,
"learning_rate": 5.688710877719417e-07,
"loss": 0.2592,
"step": 839
},
{
"epoch": 2.593814432989691,
"grad_norm": 0.3777446364919783,
"learning_rate": 5.60573907988124e-07,
"loss": 0.2496,
"step": 840
},
{
"epoch": 2.5969072164948455,
"grad_norm": 0.3927351584397551,
"learning_rate": 5.523340874250704e-07,
"loss": 0.2932,
"step": 841
},
{
"epoch": 2.6,
"grad_norm": 0.4611904495937945,
"learning_rate": 5.441517325446688e-07,
"loss": 0.2724,
"step": 842
},
{
"epoch": 2.6030927835051547,
"grad_norm": 0.4838396928689296,
"learning_rate": 5.360269490663278e-07,
"loss": 0.2788,
"step": 843
},
{
"epoch": 2.6061855670103093,
"grad_norm": 0.28849031956241356,
"learning_rate": 5.279598419656096e-07,
"loss": 0.2754,
"step": 844
},
{
"epoch": 2.609278350515464,
"grad_norm": 0.4277305524273044,
"learning_rate": 5.199505154728729e-07,
"loss": 0.2812,
"step": 845
},
{
"epoch": 2.6123711340206186,
"grad_norm": 0.431380090314389,
"learning_rate": 5.119990730719287e-07,
"loss": 0.248,
"step": 846
},
{
"epoch": 2.6154639175257732,
"grad_norm": 0.457727370255861,
"learning_rate": 5.041056174987008e-07,
"loss": 0.3045,
"step": 847
},
{
"epoch": 2.618556701030928,
"grad_norm": 0.337047837273392,
"learning_rate": 4.962702507398981e-07,
"loss": 0.2562,
"step": 848
},
{
"epoch": 2.6216494845360825,
"grad_norm": 0.444447361152871,
"learning_rate": 4.88493074031699e-07,
"loss": 0.2774,
"step": 849
},
{
"epoch": 2.624742268041237,
"grad_norm": 0.47058562033160894,
"learning_rate": 4.807741878584444e-07,
"loss": 0.2833,
"step": 850
},
{
"epoch": 2.6278350515463917,
"grad_norm": 0.5103792800918486,
"learning_rate": 4.7311369195133127e-07,
"loss": 0.2742,
"step": 851
},
{
"epoch": 2.6309278350515464,
"grad_norm": 0.3871907852225993,
"learning_rate": 4.6551168528713884e-07,
"loss": 0.2707,
"step": 852
},
{
"epoch": 2.634020618556701,
"grad_norm": 0.3990501931627433,
"learning_rate": 4.5796826608693277e-07,
"loss": 0.2832,
"step": 853
},
{
"epoch": 2.6371134020618556,
"grad_norm": 0.3926262596809124,
"learning_rate": 4.5048353181481043e-07,
"loss": 0.2603,
"step": 854
},
{
"epoch": 2.6402061855670103,
"grad_norm": 0.42016597491134544,
"learning_rate": 4.4305757917663284e-07,
"loss": 0.2423,
"step": 855
},
{
"epoch": 2.643298969072165,
"grad_norm": 0.33762976623227237,
"learning_rate": 4.3569050411877867e-07,
"loss": 0.3094,
"step": 856
},
{
"epoch": 2.6463917525773195,
"grad_norm": 0.4255252595029353,
"learning_rate": 4.283824018269045e-07,
"loss": 0.2543,
"step": 857
},
{
"epoch": 2.649484536082474,
"grad_norm": 0.38422820883982844,
"learning_rate": 4.211333667247125e-07,
"loss": 0.2908,
"step": 858
},
{
"epoch": 2.6525773195876288,
"grad_norm": 0.41409709865845773,
"learning_rate": 4.139434924727359e-07,
"loss": 0.2527,
"step": 859
},
{
"epoch": 2.6556701030927834,
"grad_norm": 0.4891114365667326,
"learning_rate": 4.0681287196711883e-07,
"loss": 0.2578,
"step": 860
},
{
"epoch": 2.658762886597938,
"grad_norm": 0.382607275622272,
"learning_rate": 3.997415973384311e-07,
"loss": 0.2739,
"step": 861
},
{
"epoch": 2.6618556701030927,
"grad_norm": 0.4775250066484936,
"learning_rate": 3.9272975995046146e-07,
"loss": 0.2569,
"step": 862
},
{
"epoch": 2.6649484536082473,
"grad_norm": 0.3349894883367055,
"learning_rate": 3.857774503990513e-07,
"loss": 0.3055,
"step": 863
},
{
"epoch": 2.668041237113402,
"grad_norm": 0.361989359292344,
"learning_rate": 3.7888475851091123e-07,
"loss": 0.2593,
"step": 864
},
{
"epoch": 2.6711340206185565,
"grad_norm": 1.0723316740464135,
"learning_rate": 3.7205177334247445e-07,
"loss": 0.287,
"step": 865
},
{
"epoch": 2.6742268041237116,
"grad_norm": 0.4674936800468732,
"learning_rate": 3.6527858317873146e-07,
"loss": 0.2634,
"step": 866
},
{
"epoch": 2.6773195876288662,
"grad_norm": 0.3896450406311716,
"learning_rate": 3.585652755321012e-07,
"loss": 0.306,
"step": 867
},
{
"epoch": 2.680412371134021,
"grad_norm": 0.36435008009318615,
"learning_rate": 3.519119371412938e-07,
"loss": 0.2608,
"step": 868
},
{
"epoch": 2.6835051546391755,
"grad_norm": 0.3340294872611109,
"learning_rate": 3.453186539701925e-07,
"loss": 0.2463,
"step": 869
},
{
"epoch": 2.68659793814433,
"grad_norm": 0.387560286599799,
"learning_rate": 3.3878551120674343e-07,
"loss": 0.2553,
"step": 870
},
{
"epoch": 2.6896907216494848,
"grad_norm": 0.402851383790354,
"learning_rate": 3.3231259326184983e-07,
"loss": 0.2429,
"step": 871
},
{
"epoch": 2.6927835051546394,
"grad_norm": 0.4149186840791913,
"learning_rate": 3.2589998376829135e-07,
"loss": 0.2619,
"step": 872
},
{
"epoch": 2.695876288659794,
"grad_norm": 0.33280941208455317,
"learning_rate": 3.1954776557963086e-07,
"loss": 0.2696,
"step": 873
},
{
"epoch": 2.6989690721649486,
"grad_norm": 0.342998191453652,
"learning_rate": 3.1325602076915706e-07,
"loss": 0.246,
"step": 874
},
{
"epoch": 2.7020618556701033,
"grad_norm": 0.3793080311309386,
"learning_rate": 3.0702483062881206e-07,
"loss": 0.2769,
"step": 875
},
{
"epoch": 2.705154639175258,
"grad_norm": 0.3772309885276913,
"learning_rate": 3.0085427566814985e-07,
"loss": 0.281,
"step": 876
},
{
"epoch": 2.7082474226804125,
"grad_norm": 0.44008596469496586,
"learning_rate": 2.947444356132917e-07,
"loss": 0.3324,
"step": 877
},
{
"epoch": 2.711340206185567,
"grad_norm": 0.5243569107693952,
"learning_rate": 2.88695389405898e-07,
"loss": 0.3094,
"step": 878
},
{
"epoch": 2.7144329896907218,
"grad_norm": 0.3169183516642598,
"learning_rate": 2.827072152021465e-07,
"loss": 0.2737,
"step": 879
},
{
"epoch": 2.7175257731958764,
"grad_norm": 0.49768488943684985,
"learning_rate": 2.767799903717244e-07,
"loss": 0.2457,
"step": 880
},
{
"epoch": 2.720618556701031,
"grad_norm": 0.41352156394619477,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.2771,
"step": 881
},
{
"epoch": 2.7237113402061857,
"grad_norm": 0.35439240015429474,
"learning_rate": 2.6510869437116946e-07,
"loss": 0.2667,
"step": 882
},
{
"epoch": 2.7268041237113403,
"grad_norm": 0.37933465501415164,
"learning_rate": 2.593647739990068e-07,
"loss": 0.2894,
"step": 883
},
{
"epoch": 2.729896907216495,
"grad_norm": 0.39623780129771313,
"learning_rate": 2.5368210459416565e-07,
"loss": 0.2505,
"step": 884
},
{
"epoch": 2.7329896907216495,
"grad_norm": 0.3981902202273425,
"learning_rate": 2.480607595790846e-07,
"loss": 0.267,
"step": 885
},
{
"epoch": 2.736082474226804,
"grad_norm": 0.47542948996594553,
"learning_rate": 2.425008115838651e-07,
"loss": 0.279,
"step": 886
},
{
"epoch": 2.739175257731959,
"grad_norm": 0.6127141337516678,
"learning_rate": 2.3700233244533412e-07,
"loss": 0.2449,
"step": 887
},
{
"epoch": 2.7422680412371134,
"grad_norm": 0.38973267154824953,
"learning_rate": 2.3156539320611627e-07,
"loss": 0.2935,
"step": 888
},
{
"epoch": 2.745360824742268,
"grad_norm": 0.3155145685377891,
"learning_rate": 2.2619006411371437e-07,
"loss": 0.3411,
"step": 889
},
{
"epoch": 2.7484536082474227,
"grad_norm": 0.4456296619500586,
"learning_rate": 2.2087641461960295e-07,
"loss": 0.2672,
"step": 890
},
{
"epoch": 2.7515463917525773,
"grad_norm": 0.48391156603601054,
"learning_rate": 2.156245133783308e-07,
"loss": 0.2872,
"step": 891
},
{
"epoch": 2.754639175257732,
"grad_norm": 0.38451851250891883,
"learning_rate": 2.1043442824663308e-07,
"loss": 0.2703,
"step": 892
},
{
"epoch": 2.7577319587628866,
"grad_norm": 0.4464145304550885,
"learning_rate": 2.0530622628255613e-07,
"loss": 0.287,
"step": 893
},
{
"epoch": 2.760824742268041,
"grad_norm": 0.3885382227965622,
"learning_rate": 2.0023997374458927e-07,
"loss": 0.2586,
"step": 894
},
{
"epoch": 2.763917525773196,
"grad_norm": 0.3558472125223351,
"learning_rate": 1.9523573609081137e-07,
"loss": 0.2846,
"step": 895
},
{
"epoch": 2.7670103092783505,
"grad_norm": 0.35075461056120477,
"learning_rate": 1.9029357797804017e-07,
"loss": 0.2949,
"step": 896
},
{
"epoch": 2.770103092783505,
"grad_norm": 0.5126676208214347,
"learning_rate": 1.8541356326100436e-07,
"loss": 0.2796,
"step": 897
},
{
"epoch": 2.7731958762886597,
"grad_norm": 0.43519449725676185,
"learning_rate": 1.8059575499150883e-07,
"loss": 0.2585,
"step": 898
},
{
"epoch": 2.7762886597938143,
"grad_norm": 0.34708575228506444,
"learning_rate": 1.758402154176314e-07,
"loss": 0.294,
"step": 899
},
{
"epoch": 2.779381443298969,
"grad_norm": 0.3774280912494685,
"learning_rate": 1.71147005982909e-07,
"loss": 0.2599,
"step": 900
},
{
"epoch": 2.7824742268041236,
"grad_norm": 0.3108267423948177,
"learning_rate": 1.6651618732554774e-07,
"loss": 0.2571,
"step": 901
},
{
"epoch": 2.7855670103092782,
"grad_norm": 0.433653354138328,
"learning_rate": 1.6194781927763913e-07,
"loss": 0.279,
"step": 902
},
{
"epoch": 2.788659793814433,
"grad_norm": 0.37052857742824075,
"learning_rate": 1.5744196086438789e-07,
"loss": 0.2591,
"step": 903
},
{
"epoch": 2.7917525773195875,
"grad_norm": 0.46199204480002765,
"learning_rate": 1.5299867030334815e-07,
"loss": 0.255,
"step": 904
},
{
"epoch": 2.794845360824742,
"grad_norm": 0.580124929875133,
"learning_rate": 1.4861800500367007e-07,
"loss": 0.2875,
"step": 905
},
{
"epoch": 2.7979381443298967,
"grad_norm": 0.5217340942830481,
"learning_rate": 1.4430002156536226e-07,
"loss": 0.2861,
"step": 906
},
{
"epoch": 2.8010309278350514,
"grad_norm": 0.3904931691443872,
"learning_rate": 1.4004477577855392e-07,
"loss": 0.2675,
"step": 907
},
{
"epoch": 2.804123711340206,
"grad_norm": 0.36630360257428185,
"learning_rate": 1.3585232262278258e-07,
"loss": 0.2712,
"step": 908
},
{
"epoch": 2.8072164948453606,
"grad_norm": 0.353945605569572,
"learning_rate": 1.3172271626627486e-07,
"loss": 0.3017,
"step": 909
},
{
"epoch": 2.8103092783505152,
"grad_norm": 0.4188118630836017,
"learning_rate": 1.276560100652535e-07,
"loss": 0.2353,
"step": 910
},
{
"epoch": 2.81340206185567,
"grad_norm": 0.34093312115837016,
"learning_rate": 1.2365225656324308e-07,
"loss": 0.2573,
"step": 911
},
{
"epoch": 2.8164948453608245,
"grad_norm": 0.3665586170666988,
"learning_rate": 1.197115074903954e-07,
"loss": 0.2662,
"step": 912
},
{
"epoch": 2.819587628865979,
"grad_norm": 0.2807017898415233,
"learning_rate": 1.1583381376281733e-07,
"loss": 0.2606,
"step": 913
},
{
"epoch": 2.8226804123711338,
"grad_norm": 0.4234165577007828,
"learning_rate": 1.1201922548191468e-07,
"loss": 0.274,
"step": 914
},
{
"epoch": 2.8257731958762884,
"grad_norm": 0.3720057394231554,
"learning_rate": 1.0826779193374715e-07,
"loss": 0.2876,
"step": 915
},
{
"epoch": 2.8288659793814435,
"grad_norm": 0.3204436821534787,
"learning_rate": 1.0457956158838545e-07,
"loss": 0.28,
"step": 916
},
{
"epoch": 2.831958762886598,
"grad_norm": 0.565914708121846,
"learning_rate": 1.0095458209929243e-07,
"loss": 0.2697,
"step": 917
},
{
"epoch": 2.8350515463917527,
"grad_norm": 0.34518258805906293,
"learning_rate": 9.739290030269965e-08,
"loss": 0.2942,
"step": 918
},
{
"epoch": 2.8381443298969073,
"grad_norm": 0.3636538558427229,
"learning_rate": 9.389456221701121e-08,
"loss": 0.2733,
"step": 919
},
{
"epoch": 2.841237113402062,
"grad_norm": 0.40022954321305015,
"learning_rate": 9.045961304219974e-08,
"loss": 0.2621,
"step": 920
},
{
"epoch": 2.8443298969072166,
"grad_norm": 0.4344415769634195,
"learning_rate": 8.708809715922973e-08,
"loss": 0.2534,
"step": 921
},
{
"epoch": 2.8474226804123712,
"grad_norm": 0.3824356118052112,
"learning_rate": 8.378005812948064e-08,
"loss": 0.2793,
"step": 922
},
{
"epoch": 2.850515463917526,
"grad_norm": 0.35498451884083126,
"learning_rate": 8.053553869418418e-08,
"loss": 0.2764,
"step": 923
},
{
"epoch": 2.8536082474226805,
"grad_norm": 0.4429208680189465,
"learning_rate": 7.735458077387292e-08,
"loss": 0.2563,
"step": 924
},
{
"epoch": 2.856701030927835,
"grad_norm": 0.3742405675124718,
"learning_rate": 7.423722546783918e-08,
"loss": 0.28,
"step": 925
},
{
"epoch": 2.8597938144329897,
"grad_norm": 0.42632928679275617,
"learning_rate": 7.118351305360205e-08,
"loss": 0.3038,
"step": 926
},
{
"epoch": 2.8628865979381444,
"grad_norm": 0.3897225778477736,
"learning_rate": 6.819348298638839e-08,
"loss": 0.2552,
"step": 927
},
{
"epoch": 2.865979381443299,
"grad_norm": 0.2883333432226637,
"learning_rate": 6.526717389862325e-08,
"loss": 0.3023,
"step": 928
},
{
"epoch": 2.8690721649484536,
"grad_norm": 0.5777472796705979,
"learning_rate": 6.240462359942967e-08,
"loss": 0.2608,
"step": 929
},
{
"epoch": 2.8721649484536083,
"grad_norm": 0.40631841925621104,
"learning_rate": 5.960586907414189e-08,
"loss": 0.2779,
"step": 930
},
{
"epoch": 2.875257731958763,
"grad_norm": 0.46092516469259476,
"learning_rate": 5.687094648382518e-08,
"loss": 0.2641,
"step": 931
},
{
"epoch": 2.8783505154639175,
"grad_norm": 0.34915642648853407,
"learning_rate": 5.419989116481061e-08,
"loss": 0.3025,
"step": 932
},
{
"epoch": 2.881443298969072,
"grad_norm": 0.47411839457507193,
"learning_rate": 5.159273762823658e-08,
"loss": 0.3305,
"step": 933
},
{
"epoch": 2.8845360824742268,
"grad_norm": 0.36833349217647166,
"learning_rate": 4.90495195596058e-08,
"loss": 0.2965,
"step": 934
},
{
"epoch": 2.8876288659793814,
"grad_norm": 0.4500567088618251,
"learning_rate": 4.657026981834623e-08,
"loss": 0.2479,
"step": 935
},
{
"epoch": 2.890721649484536,
"grad_norm": 0.4208605131949342,
"learning_rate": 4.415502043739084e-08,
"loss": 0.2799,
"step": 936
},
{
"epoch": 2.8938144329896907,
"grad_norm": 0.31553449780177045,
"learning_rate": 4.180380262275907e-08,
"loss": 0.2921,
"step": 937
},
{
"epoch": 2.8969072164948453,
"grad_norm": 0.3409240679158436,
"learning_rate": 3.9516646753158247e-08,
"loss": 0.2697,
"step": 938
},
{
"epoch": 2.9,
"grad_norm": 0.4208461142015241,
"learning_rate": 3.72935823795878e-08,
"loss": 0.2855,
"step": 939
},
{
"epoch": 2.9030927835051545,
"grad_norm": 0.4435280021877381,
"learning_rate": 3.513463822495844e-08,
"loss": 0.2819,
"step": 940
},
{
"epoch": 2.906185567010309,
"grad_norm": 0.41517671074392126,
"learning_rate": 3.303984218372136e-08,
"loss": 0.2588,
"step": 941
},
{
"epoch": 2.909278350515464,
"grad_norm": 0.3540642341715653,
"learning_rate": 3.100922132150741e-08,
"loss": 0.2296,
"step": 942
},
{
"epoch": 2.9123711340206184,
"grad_norm": 0.3690759806820083,
"learning_rate": 2.9042801874777925e-08,
"loss": 0.2871,
"step": 943
},
{
"epoch": 2.915463917525773,
"grad_norm": 0.4546986119024043,
"learning_rate": 2.714060925048556e-08,
"loss": 0.2569,
"step": 944
},
{
"epoch": 2.9185567010309277,
"grad_norm": 0.32836248625240094,
"learning_rate": 2.53026680257451e-08,
"loss": 0.2511,
"step": 945
},
{
"epoch": 2.9216494845360823,
"grad_norm": 0.4058215648872401,
"learning_rate": 2.352900194751706e-08,
"loss": 0.3142,
"step": 946
},
{
"epoch": 2.9247422680412374,
"grad_norm": 0.40012516007646937,
"learning_rate": 2.1819633932301797e-08,
"loss": 0.2792,
"step": 947
},
{
"epoch": 2.927835051546392,
"grad_norm": 0.4125955989567251,
"learning_rate": 2.0174586065838664e-08,
"loss": 0.2907,
"step": 948
},
{
"epoch": 2.9309278350515466,
"grad_norm": 0.3524425854408069,
"learning_rate": 1.8593879602828434e-08,
"loss": 0.2886,
"step": 949
},
{
"epoch": 2.9340206185567013,
"grad_norm": 0.3170471980249348,
"learning_rate": 1.7077534966650767e-08,
"loss": 0.2784,
"step": 950
},
{
"epoch": 2.937113402061856,
"grad_norm": 0.4411619208014409,
"learning_rate": 1.562557174910606e-08,
"loss": 0.3009,
"step": 951
},
{
"epoch": 2.9402061855670105,
"grad_norm": 0.46198859439075746,
"learning_rate": 1.4238008710159567e-08,
"loss": 0.2446,
"step": 952
},
{
"epoch": 2.943298969072165,
"grad_norm": 0.36122796091522313,
"learning_rate": 1.2914863777698794e-08,
"loss": 0.2738,
"step": 953
},
{
"epoch": 2.94639175257732,
"grad_norm": 0.35861966022079894,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.2798,
"step": 954
},
{
"epoch": 2.9494845360824744,
"grad_norm": 0.3875255977808121,
"learning_rate": 1.0461895782025166e-08,
"loss": 0.2711,
"step": 955
},
{
"epoch": 2.952577319587629,
"grad_norm": 0.35530566543196673,
"learning_rate": 9.332104412173027e-09,
"loss": 0.3048,
"step": 956
},
{
"epoch": 2.9556701030927837,
"grad_norm": 0.361452363482159,
"learning_rate": 8.266794535118915e-09,
"loss": 0.2831,
"step": 957
},
{
"epoch": 2.9587628865979383,
"grad_norm": 0.4420803980691798,
"learning_rate": 7.265979915107024e-09,
"loss": 0.2521,
"step": 958
},
{
"epoch": 2.961855670103093,
"grad_norm": 0.46036523480311,
"learning_rate": 6.329673483076448e-09,
"loss": 0.2704,
"step": 959
},
{
"epoch": 2.9649484536082475,
"grad_norm": 0.6117555345915687,
"learning_rate": 5.4578873364929955e-09,
"loss": 0.2917,
"step": 960
},
{
"epoch": 2.968041237113402,
"grad_norm": 0.42432677847100164,
"learning_rate": 4.650632739194305e-09,
"loss": 0.2692,
"step": 961
},
{
"epoch": 2.971134020618557,
"grad_norm": 0.46221110515624103,
"learning_rate": 3.907920121243858e-09,
"loss": 0.2854,
"step": 962
},
{
"epoch": 2.9742268041237114,
"grad_norm": 0.4255820151298039,
"learning_rate": 3.2297590787955248e-09,
"loss": 0.2589,
"step": 963
},
{
"epoch": 2.977319587628866,
"grad_norm": 0.4146109541033293,
"learning_rate": 2.6161583739703344e-09,
"loss": 0.2617,
"step": 964
},
{
"epoch": 2.9804123711340207,
"grad_norm": 0.3106992898270277,
"learning_rate": 2.067125934742675e-09,
"loss": 0.2612,
"step": 965
},
{
"epoch": 2.9835051546391753,
"grad_norm": 0.308479948088998,
"learning_rate": 1.5826688548398194e-09,
"loss": 0.2896,
"step": 966
},
{
"epoch": 2.98659793814433,
"grad_norm": 0.4301763954612885,
"learning_rate": 1.1627933936464442e-09,
"loss": 0.2707,
"step": 967
},
{
"epoch": 2.9896907216494846,
"grad_norm": 0.4411906019363483,
"learning_rate": 8.075049761274711e-10,
"loss": 0.2932,
"step": 968
},
{
"epoch": 2.992783505154639,
"grad_norm": 0.40563705971599223,
"learning_rate": 5.168081927564572e-10,
"loss": 0.2849,
"step": 969
},
{
"epoch": 2.995876288659794,
"grad_norm": 0.3315762234528888,
"learning_rate": 2.907067994556423e-10,
"loss": 0.2589,
"step": 970
},
{
"epoch": 2.9989690721649485,
"grad_norm": 0.39679025047238414,
"learning_rate": 1.2920371754931994e-10,
"loss": 0.2737,
"step": 971
},
{
"epoch": 3.0,
"grad_norm": 1.2638973106902003,
"learning_rate": 3.230103372275917e-11,
"loss": 0.2755,
"step": 972
},
{
"epoch": 3.0,
"step": 972,
"total_flos": 328086624501760.0,
"train_loss": 0.3500921587179963,
"train_runtime": 40196.0341,
"train_samples_per_second": 1.158,
"train_steps_per_second": 0.024
}
],
"logging_steps": 1,
"max_steps": 972,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 328086624501760.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}