{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.990136570561456,
"eval_steps": 500,
"global_step": 2195,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002276176024279211,
"grad_norm": 5.864941596984863,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.982,
"step": 1
},
{
"epoch": 0.004552352048558422,
"grad_norm": 6.175215244293213,
"learning_rate": 1.0000000000000001e-07,
"loss": 2.0217,
"step": 2
},
{
"epoch": 0.006828528072837633,
"grad_norm": 6.1325860023498535,
"learning_rate": 1.5000000000000002e-07,
"loss": 2.0283,
"step": 3
},
{
"epoch": 0.009104704097116844,
"grad_norm": 6.438838481903076,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.0133,
"step": 4
},
{
"epoch": 0.011380880121396054,
"grad_norm": 6.120014190673828,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.9788,
"step": 5
},
{
"epoch": 0.013657056145675266,
"grad_norm": 6.399510860443115,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.0115,
"step": 6
},
{
"epoch": 0.015933232169954476,
"grad_norm": 6.267389297485352,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.034,
"step": 7
},
{
"epoch": 0.018209408194233688,
"grad_norm": 6.195969581604004,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.0221,
"step": 8
},
{
"epoch": 0.0204855842185129,
"grad_norm": 6.281792163848877,
"learning_rate": 4.5000000000000003e-07,
"loss": 2.034,
"step": 9
},
{
"epoch": 0.02276176024279211,
"grad_norm": 6.259925365447998,
"learning_rate": 5.000000000000001e-07,
"loss": 1.9919,
"step": 10
},
{
"epoch": 0.02503793626707132,
"grad_norm": 6.189306259155273,
"learning_rate": 5.5e-07,
"loss": 1.9989,
"step": 11
},
{
"epoch": 0.027314112291350532,
"grad_norm": 6.382223606109619,
"learning_rate": 6.000000000000001e-07,
"loss": 2.0004,
"step": 12
},
{
"epoch": 0.02959028831562974,
"grad_norm": 6.581198215484619,
"learning_rate": 6.5e-07,
"loss": 1.9606,
"step": 13
},
{
"epoch": 0.03186646433990895,
"grad_norm": 6.698477268218994,
"learning_rate": 7.000000000000001e-07,
"loss": 1.9986,
"step": 14
},
{
"epoch": 0.03414264036418816,
"grad_norm": 6.462113857269287,
"learning_rate": 7.5e-07,
"loss": 1.9435,
"step": 15
},
{
"epoch": 0.036418816388467376,
"grad_norm": 6.667123794555664,
"learning_rate": 8.000000000000001e-07,
"loss": 1.9262,
"step": 16
},
{
"epoch": 0.038694992412746584,
"grad_norm": 6.812009334564209,
"learning_rate": 8.500000000000001e-07,
"loss": 1.9341,
"step": 17
},
{
"epoch": 0.0409711684370258,
"grad_norm": 6.460822582244873,
"learning_rate": 9.000000000000001e-07,
"loss": 1.8857,
"step": 18
},
{
"epoch": 0.04324734446130501,
"grad_norm": 5.623890399932861,
"learning_rate": 9.500000000000001e-07,
"loss": 1.8256,
"step": 19
},
{
"epoch": 0.04552352048558422,
"grad_norm": 4.976780414581299,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.8312,
"step": 20
},
{
"epoch": 0.04779969650986343,
"grad_norm": 4.3025383949279785,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.8263,
"step": 21
},
{
"epoch": 0.05007587253414264,
"grad_norm": 3.7881436347961426,
"learning_rate": 1.1e-06,
"loss": 1.7652,
"step": 22
},
{
"epoch": 0.05235204855842185,
"grad_norm": 3.4925425052642822,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.7603,
"step": 23
},
{
"epoch": 0.054628224582701064,
"grad_norm": 3.0760865211486816,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.7599,
"step": 24
},
{
"epoch": 0.05690440060698027,
"grad_norm": 2.7170724868774414,
"learning_rate": 1.25e-06,
"loss": 1.7725,
"step": 25
},
{
"epoch": 0.05918057663125948,
"grad_norm": 2.0981554985046387,
"learning_rate": 1.3e-06,
"loss": 1.6781,
"step": 26
},
{
"epoch": 0.061456752655538696,
"grad_norm": 1.9057221412658691,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.6897,
"step": 27
},
{
"epoch": 0.0637329286798179,
"grad_norm": 1.678957223892212,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.7124,
"step": 28
},
{
"epoch": 0.06600910470409711,
"grad_norm": 1.594223141670227,
"learning_rate": 1.45e-06,
"loss": 1.6953,
"step": 29
},
{
"epoch": 0.06828528072837632,
"grad_norm": 1.5038321018218994,
"learning_rate": 1.5e-06,
"loss": 1.6392,
"step": 30
},
{
"epoch": 0.07056145675265554,
"grad_norm": 1.5202770233154297,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.6756,
"step": 31
},
{
"epoch": 0.07283763277693475,
"grad_norm": 1.4849720001220703,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.6587,
"step": 32
},
{
"epoch": 0.07511380880121396,
"grad_norm": 1.4973641633987427,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.6222,
"step": 33
},
{
"epoch": 0.07738998482549317,
"grad_norm": 1.4055628776550293,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.6318,
"step": 34
},
{
"epoch": 0.07966616084977238,
"grad_norm": 1.365734338760376,
"learning_rate": 1.75e-06,
"loss": 1.5656,
"step": 35
},
{
"epoch": 0.0819423368740516,
"grad_norm": 1.2574050426483154,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.602,
"step": 36
},
{
"epoch": 0.08421851289833081,
"grad_norm": 1.2459263801574707,
"learning_rate": 1.85e-06,
"loss": 1.571,
"step": 37
},
{
"epoch": 0.08649468892261002,
"grad_norm": 1.1563637256622314,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.5968,
"step": 38
},
{
"epoch": 0.08877086494688922,
"grad_norm": 1.0916545391082764,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.5493,
"step": 39
},
{
"epoch": 0.09104704097116843,
"grad_norm": 1.0802186727523804,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.529,
"step": 40
},
{
"epoch": 0.09332321699544764,
"grad_norm": 1.0635664463043213,
"learning_rate": 2.05e-06,
"loss": 1.4784,
"step": 41
},
{
"epoch": 0.09559939301972686,
"grad_norm": 0.985824465751648,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.5508,
"step": 42
},
{
"epoch": 0.09787556904400607,
"grad_norm": 1.036191701889038,
"learning_rate": 2.15e-06,
"loss": 1.5465,
"step": 43
},
{
"epoch": 0.10015174506828528,
"grad_norm": 1.0564978122711182,
"learning_rate": 2.2e-06,
"loss": 1.503,
"step": 44
},
{
"epoch": 0.10242792109256449,
"grad_norm": 1.1553199291229248,
"learning_rate": 2.25e-06,
"loss": 1.4578,
"step": 45
},
{
"epoch": 0.1047040971168437,
"grad_norm": 1.1265777349472046,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.4497,
"step": 46
},
{
"epoch": 0.10698027314112292,
"grad_norm": 0.9469030499458313,
"learning_rate": 2.35e-06,
"loss": 1.4676,
"step": 47
},
{
"epoch": 0.10925644916540213,
"grad_norm": 0.649141252040863,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.455,
"step": 48
},
{
"epoch": 0.11153262518968134,
"grad_norm": 0.6022727489471436,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.4814,
"step": 49
},
{
"epoch": 0.11380880121396054,
"grad_norm": 0.7700338363647461,
"learning_rate": 2.5e-06,
"loss": 1.4786,
"step": 50
},
{
"epoch": 0.11608497723823975,
"grad_norm": 0.924614429473877,
"learning_rate": 2.55e-06,
"loss": 1.4338,
"step": 51
},
{
"epoch": 0.11836115326251896,
"grad_norm": 0.8892627954483032,
"learning_rate": 2.6e-06,
"loss": 1.441,
"step": 52
},
{
"epoch": 0.12063732928679818,
"grad_norm": 0.7454217076301575,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.4016,
"step": 53
},
{
"epoch": 0.12291350531107739,
"grad_norm": 0.5784000754356384,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.4222,
"step": 54
},
{
"epoch": 0.1251896813353566,
"grad_norm": 0.5783917903900146,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.4087,
"step": 55
},
{
"epoch": 0.1274658573596358,
"grad_norm": 0.5947427153587341,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.4008,
"step": 56
},
{
"epoch": 0.12974203338391502,
"grad_norm": 0.6172689199447632,
"learning_rate": 2.85e-06,
"loss": 1.4292,
"step": 57
},
{
"epoch": 0.13201820940819423,
"grad_norm": 0.6890118718147278,
"learning_rate": 2.9e-06,
"loss": 1.4215,
"step": 58
},
{
"epoch": 0.13429438543247343,
"grad_norm": 0.5748654007911682,
"learning_rate": 2.95e-06,
"loss": 1.4402,
"step": 59
},
{
"epoch": 0.13657056145675264,
"grad_norm": 0.5015429258346558,
"learning_rate": 3e-06,
"loss": 1.4338,
"step": 60
},
{
"epoch": 0.13884673748103188,
"grad_norm": 0.4844941794872284,
"learning_rate": 3.05e-06,
"loss": 1.3846,
"step": 61
},
{
"epoch": 0.1411229135053111,
"grad_norm": 0.48353612422943115,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3864,
"step": 62
},
{
"epoch": 0.1433990895295903,
"grad_norm": 0.47880005836486816,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.3764,
"step": 63
},
{
"epoch": 0.1456752655538695,
"grad_norm": 0.5600204467773438,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.398,
"step": 64
},
{
"epoch": 0.1479514415781487,
"grad_norm": 0.4868157207965851,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.3959,
"step": 65
},
{
"epoch": 0.15022761760242792,
"grad_norm": 0.4253179430961609,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.3695,
"step": 66
},
{
"epoch": 0.15250379362670713,
"grad_norm": 0.4152253270149231,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.428,
"step": 67
},
{
"epoch": 0.15477996965098634,
"grad_norm": 0.43653807044029236,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.4244,
"step": 68
},
{
"epoch": 0.15705614567526555,
"grad_norm": 0.4184909164905548,
"learning_rate": 3.45e-06,
"loss": 1.413,
"step": 69
},
{
"epoch": 0.15933232169954475,
"grad_norm": 0.4401929974555969,
"learning_rate": 3.5e-06,
"loss": 1.3769,
"step": 70
},
{
"epoch": 0.16160849772382396,
"grad_norm": 0.42470934987068176,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.328,
"step": 71
},
{
"epoch": 0.1638846737481032,
"grad_norm": 0.43167445063591003,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3585,
"step": 72
},
{
"epoch": 0.1661608497723824,
"grad_norm": 0.39305731654167175,
"learning_rate": 3.65e-06,
"loss": 1.3635,
"step": 73
},
{
"epoch": 0.16843702579666162,
"grad_norm": 0.3937039077281952,
"learning_rate": 3.7e-06,
"loss": 1.3583,
"step": 74
},
{
"epoch": 0.17071320182094082,
"grad_norm": 0.4098603129386902,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3651,
"step": 75
},
{
"epoch": 0.17298937784522003,
"grad_norm": 0.41061389446258545,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.4184,
"step": 76
},
{
"epoch": 0.17526555386949924,
"grad_norm": 0.3926120698451996,
"learning_rate": 3.85e-06,
"loss": 1.3693,
"step": 77
},
{
"epoch": 0.17754172989377845,
"grad_norm": 0.41317838430404663,
"learning_rate": 3.900000000000001e-06,
"loss": 1.3354,
"step": 78
},
{
"epoch": 0.17981790591805766,
"grad_norm": 0.37922877073287964,
"learning_rate": 3.95e-06,
"loss": 1.364,
"step": 79
},
{
"epoch": 0.18209408194233687,
"grad_norm": 0.3894996643066406,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3495,
"step": 80
},
{
"epoch": 0.18437025796661607,
"grad_norm": 0.4024641513824463,
"learning_rate": 4.05e-06,
"loss": 1.3604,
"step": 81
},
{
"epoch": 0.18664643399089528,
"grad_norm": 0.38427308201789856,
"learning_rate": 4.1e-06,
"loss": 1.3734,
"step": 82
},
{
"epoch": 0.18892261001517452,
"grad_norm": 0.38881292939186096,
"learning_rate": 4.15e-06,
"loss": 1.3235,
"step": 83
},
{
"epoch": 0.19119878603945373,
"grad_norm": 0.4112228453159332,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3714,
"step": 84
},
{
"epoch": 0.19347496206373294,
"grad_norm": 0.3790343999862671,
"learning_rate": 4.25e-06,
"loss": 1.3508,
"step": 85
},
{
"epoch": 0.19575113808801214,
"grad_norm": 0.38511818647384644,
"learning_rate": 4.3e-06,
"loss": 1.3726,
"step": 86
},
{
"epoch": 0.19802731411229135,
"grad_norm": 0.3809172213077545,
"learning_rate": 4.350000000000001e-06,
"loss": 1.3978,
"step": 87
},
{
"epoch": 0.20030349013657056,
"grad_norm": 0.39862319827079773,
"learning_rate": 4.4e-06,
"loss": 1.3402,
"step": 88
},
{
"epoch": 0.20257966616084977,
"grad_norm": 0.3779354989528656,
"learning_rate": 4.450000000000001e-06,
"loss": 1.3585,
"step": 89
},
{
"epoch": 0.20485584218512898,
"grad_norm": 0.3755280375480652,
"learning_rate": 4.5e-06,
"loss": 1.3809,
"step": 90
},
{
"epoch": 0.2071320182094082,
"grad_norm": 0.4072270691394806,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.337,
"step": 91
},
{
"epoch": 0.2094081942336874,
"grad_norm": 0.3852587938308716,
"learning_rate": 4.600000000000001e-06,
"loss": 1.3239,
"step": 92
},
{
"epoch": 0.2116843702579666,
"grad_norm": 0.3857567012310028,
"learning_rate": 4.65e-06,
"loss": 1.3676,
"step": 93
},
{
"epoch": 0.21396054628224584,
"grad_norm": 0.39954471588134766,
"learning_rate": 4.7e-06,
"loss": 1.372,
"step": 94
},
{
"epoch": 0.21623672230652505,
"grad_norm": 0.3801283836364746,
"learning_rate": 4.75e-06,
"loss": 1.3636,
"step": 95
},
{
"epoch": 0.21851289833080426,
"grad_norm": 0.37748953700065613,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3298,
"step": 96
},
{
"epoch": 0.22078907435508346,
"grad_norm": 0.3678078055381775,
"learning_rate": 4.85e-06,
"loss": 1.3267,
"step": 97
},
{
"epoch": 0.22306525037936267,
"grad_norm": 0.3928042948246002,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.3705,
"step": 98
},
{
"epoch": 0.22534142640364188,
"grad_norm": 0.3824443817138672,
"learning_rate": 4.95e-06,
"loss": 1.3536,
"step": 99
},
{
"epoch": 0.2276176024279211,
"grad_norm": 0.38775718212127686,
"learning_rate": 5e-06,
"loss": 1.3366,
"step": 100
},
{
"epoch": 0.2298937784522003,
"grad_norm": 0.39415422081947327,
"learning_rate": 4.999998078694254e-06,
"loss": 1.3369,
"step": 101
},
{
"epoch": 0.2321699544764795,
"grad_norm": 0.3640560507774353,
"learning_rate": 4.999992314779968e-06,
"loss": 1.3548,
"step": 102
},
{
"epoch": 0.23444613050075871,
"grad_norm": 0.38077881932258606,
"learning_rate": 4.999982708266002e-06,
"loss": 1.322,
"step": 103
},
{
"epoch": 0.23672230652503792,
"grad_norm": 0.3910675346851349,
"learning_rate": 4.999969259167121e-06,
"loss": 1.3568,
"step": 104
},
{
"epoch": 0.23899848254931716,
"grad_norm": 0.3724777102470398,
"learning_rate": 4.999951967503998e-06,
"loss": 1.3657,
"step": 105
},
{
"epoch": 0.24127465857359637,
"grad_norm": 0.39835065603256226,
"learning_rate": 4.9999308333032095e-06,
"loss": 1.3728,
"step": 106
},
{
"epoch": 0.24355083459787558,
"grad_norm": 0.3887874186038971,
"learning_rate": 4.999905856597241e-06,
"loss": 1.3269,
"step": 107
},
{
"epoch": 0.24582701062215478,
"grad_norm": 0.37291401624679565,
"learning_rate": 4.999877037424482e-06,
"loss": 1.3522,
"step": 108
},
{
"epoch": 0.248103186646434,
"grad_norm": 0.3793584406375885,
"learning_rate": 4.999844375829229e-06,
"loss": 1.3459,
"step": 109
},
{
"epoch": 0.2503793626707132,
"grad_norm": 0.38437148928642273,
"learning_rate": 4.999807871861686e-06,
"loss": 1.3419,
"step": 110
},
{
"epoch": 0.2526555386949924,
"grad_norm": 0.37772583961486816,
"learning_rate": 4.999767525577958e-06,
"loss": 1.3349,
"step": 111
},
{
"epoch": 0.2549317147192716,
"grad_norm": 0.3829944133758545,
"learning_rate": 4.999723337040062e-06,
"loss": 1.3193,
"step": 112
},
{
"epoch": 0.2572078907435508,
"grad_norm": 0.38355737924575806,
"learning_rate": 4.999675306315917e-06,
"loss": 1.3457,
"step": 113
},
{
"epoch": 0.25948406676783003,
"grad_norm": 0.39071688055992126,
"learning_rate": 4.999623433479346e-06,
"loss": 1.3401,
"step": 114
},
{
"epoch": 0.26176024279210924,
"grad_norm": 0.3796067535877228,
"learning_rate": 4.9995677186100835e-06,
"loss": 1.3593,
"step": 115
},
{
"epoch": 0.26403641881638845,
"grad_norm": 0.3870932459831238,
"learning_rate": 4.9995081617937635e-06,
"loss": 1.3678,
"step": 116
},
{
"epoch": 0.26631259484066766,
"grad_norm": 0.3870759606361389,
"learning_rate": 4.999444763121928e-06,
"loss": 1.331,
"step": 117
},
{
"epoch": 0.26858877086494687,
"grad_norm": 0.37003180384635925,
"learning_rate": 4.999377522692023e-06,
"loss": 1.3242,
"step": 118
},
{
"epoch": 0.2708649468892261,
"grad_norm": 0.3826284408569336,
"learning_rate": 4.999306440607401e-06,
"loss": 1.2921,
"step": 119
},
{
"epoch": 0.2731411229135053,
"grad_norm": 0.3886045515537262,
"learning_rate": 4.999231516977318e-06,
"loss": 1.2971,
"step": 120
},
{
"epoch": 0.27541729893778455,
"grad_norm": 0.3992857336997986,
"learning_rate": 4.999152751916936e-06,
"loss": 1.2872,
"step": 121
},
{
"epoch": 0.27769347496206376,
"grad_norm": 0.4303230941295624,
"learning_rate": 4.999070145547318e-06,
"loss": 1.3562,
"step": 122
},
{
"epoch": 0.27996965098634297,
"grad_norm": 0.40188783407211304,
"learning_rate": 4.998983697995435e-06,
"loss": 1.3251,
"step": 123
},
{
"epoch": 0.2822458270106222,
"grad_norm": 0.41683951020240784,
"learning_rate": 4.998893409394162e-06,
"loss": 1.3279,
"step": 124
},
{
"epoch": 0.2845220030349014,
"grad_norm": 0.4539605379104614,
"learning_rate": 4.9987992798822745e-06,
"loss": 1.3133,
"step": 125
},
{
"epoch": 0.2867981790591806,
"grad_norm": 0.40195104479789734,
"learning_rate": 4.998701309604454e-06,
"loss": 1.3372,
"step": 126
},
{
"epoch": 0.2890743550834598,
"grad_norm": 0.40602678060531616,
"learning_rate": 4.998599498711287e-06,
"loss": 1.3008,
"step": 127
},
{
"epoch": 0.291350531107739,
"grad_norm": 0.37955862283706665,
"learning_rate": 4.99849384735926e-06,
"loss": 1.2919,
"step": 128
},
{
"epoch": 0.2936267071320182,
"grad_norm": 0.38034912943840027,
"learning_rate": 4.9983843557107635e-06,
"loss": 1.3307,
"step": 129
},
{
"epoch": 0.2959028831562974,
"grad_norm": 0.3922058641910553,
"learning_rate": 4.9982710239340915e-06,
"loss": 1.3211,
"step": 130
},
{
"epoch": 0.29817905918057663,
"grad_norm": 0.4012414515018463,
"learning_rate": 4.998153852203441e-06,
"loss": 1.3762,
"step": 131
},
{
"epoch": 0.30045523520485584,
"grad_norm": 0.41045159101486206,
"learning_rate": 4.998032840698909e-06,
"loss": 1.3384,
"step": 132
},
{
"epoch": 0.30273141122913505,
"grad_norm": 0.3880952298641205,
"learning_rate": 4.997907989606495e-06,
"loss": 1.2976,
"step": 133
},
{
"epoch": 0.30500758725341426,
"grad_norm": 0.39358070492744446,
"learning_rate": 4.997779299118102e-06,
"loss": 1.3036,
"step": 134
},
{
"epoch": 0.30728376327769347,
"grad_norm": 0.400647908449173,
"learning_rate": 4.997646769431532e-06,
"loss": 1.3573,
"step": 135
},
{
"epoch": 0.3095599393019727,
"grad_norm": 0.40589869022369385,
"learning_rate": 4.99751040075049e-06,
"loss": 1.3462,
"step": 136
},
{
"epoch": 0.3118361153262519,
"grad_norm": 0.420673131942749,
"learning_rate": 4.997370193284581e-06,
"loss": 1.317,
"step": 137
},
{
"epoch": 0.3141122913505311,
"grad_norm": 0.3844830393791199,
"learning_rate": 4.997226147249309e-06,
"loss": 1.3437,
"step": 138
},
{
"epoch": 0.3163884673748103,
"grad_norm": 0.37681150436401367,
"learning_rate": 4.9970782628660794e-06,
"loss": 1.3216,
"step": 139
},
{
"epoch": 0.3186646433990895,
"grad_norm": 0.40281322598457336,
"learning_rate": 4.996926540362198e-06,
"loss": 1.3578,
"step": 140
},
{
"epoch": 0.3209408194233687,
"grad_norm": 0.3950099050998688,
"learning_rate": 4.9967709799708675e-06,
"loss": 1.3472,
"step": 141
},
{
"epoch": 0.3232169954476479,
"grad_norm": 0.3890508711338043,
"learning_rate": 4.9966115819311926e-06,
"loss": 1.3112,
"step": 142
},
{
"epoch": 0.3254931714719272,
"grad_norm": 0.3960939347743988,
"learning_rate": 4.996448346488175e-06,
"loss": 1.331,
"step": 143
},
{
"epoch": 0.3277693474962064,
"grad_norm": 0.394761323928833,
"learning_rate": 4.9962812738927135e-06,
"loss": 1.3265,
"step": 144
},
{
"epoch": 0.3300455235204856,
"grad_norm": 0.4139835238456726,
"learning_rate": 4.996110364401607e-06,
"loss": 1.3423,
"step": 145
},
{
"epoch": 0.3323216995447648,
"grad_norm": 0.40223428606987,
"learning_rate": 4.9959356182775525e-06,
"loss": 1.3213,
"step": 146
},
{
"epoch": 0.334597875569044,
"grad_norm": 0.41239285469055176,
"learning_rate": 4.9957570357891406e-06,
"loss": 1.3488,
"step": 147
},
{
"epoch": 0.33687405159332323,
"grad_norm": 0.41569817066192627,
"learning_rate": 4.995574617210861e-06,
"loss": 1.3373,
"step": 148
},
{
"epoch": 0.33915022761760244,
"grad_norm": 0.40224048495292664,
"learning_rate": 4.9953883628231e-06,
"loss": 1.3086,
"step": 149
},
{
"epoch": 0.34142640364188165,
"grad_norm": 0.4080573618412018,
"learning_rate": 4.995198272912137e-06,
"loss": 1.3221,
"step": 150
},
{
"epoch": 0.34370257966616086,
"grad_norm": 0.41279059648513794,
"learning_rate": 4.9950043477701505e-06,
"loss": 1.3336,
"step": 151
},
{
"epoch": 0.34597875569044007,
"grad_norm": 0.4138430655002594,
"learning_rate": 4.994806587695212e-06,
"loss": 1.3245,
"step": 152
},
{
"epoch": 0.3482549317147193,
"grad_norm": 0.4141685664653778,
"learning_rate": 4.994604992991287e-06,
"loss": 1.3459,
"step": 153
},
{
"epoch": 0.3505311077389985,
"grad_norm": 0.4655224680900574,
"learning_rate": 4.994399563968235e-06,
"loss": 1.307,
"step": 154
},
{
"epoch": 0.3528072837632777,
"grad_norm": 0.40181776881217957,
"learning_rate": 4.99419030094181e-06,
"loss": 1.2951,
"step": 155
},
{
"epoch": 0.3550834597875569,
"grad_norm": 0.4349536597728729,
"learning_rate": 4.99397720423366e-06,
"loss": 1.3346,
"step": 156
},
{
"epoch": 0.3573596358118361,
"grad_norm": 0.47389090061187744,
"learning_rate": 4.993760274171322e-06,
"loss": 1.2918,
"step": 157
},
{
"epoch": 0.3596358118361153,
"grad_norm": 0.43464231491088867,
"learning_rate": 4.993539511088228e-06,
"loss": 1.3469,
"step": 158
},
{
"epoch": 0.3619119878603945,
"grad_norm": 0.43050721287727356,
"learning_rate": 4.993314915323701e-06,
"loss": 1.2993,
"step": 159
},
{
"epoch": 0.36418816388467373,
"grad_norm": 0.4154967665672302,
"learning_rate": 4.9930864872229555e-06,
"loss": 1.301,
"step": 160
},
{
"epoch": 0.36646433990895294,
"grad_norm": 0.4043583869934082,
"learning_rate": 4.992854227137094e-06,
"loss": 1.3357,
"step": 161
},
{
"epoch": 0.36874051593323215,
"grad_norm": 0.4242326617240906,
"learning_rate": 4.992618135423111e-06,
"loss": 1.3139,
"step": 162
},
{
"epoch": 0.37101669195751136,
"grad_norm": 0.4029645621776581,
"learning_rate": 4.992378212443891e-06,
"loss": 1.2773,
"step": 163
},
{
"epoch": 0.37329286798179057,
"grad_norm": 0.3948841989040375,
"learning_rate": 4.992134458568205e-06,
"loss": 1.3267,
"step": 164
},
{
"epoch": 0.37556904400606983,
"grad_norm": 0.4325512647628784,
"learning_rate": 4.991886874170715e-06,
"loss": 1.2986,
"step": 165
},
{
"epoch": 0.37784522003034904,
"grad_norm": 0.4292261600494385,
"learning_rate": 4.991635459631968e-06,
"loss": 1.3383,
"step": 166
},
{
"epoch": 0.38012139605462825,
"grad_norm": 0.407819539308548,
"learning_rate": 4.991380215338399e-06,
"loss": 1.2798,
"step": 167
},
{
"epoch": 0.38239757207890746,
"grad_norm": 0.41592007875442505,
"learning_rate": 4.991121141682332e-06,
"loss": 1.3161,
"step": 168
},
{
"epoch": 0.38467374810318666,
"grad_norm": 0.4135512411594391,
"learning_rate": 4.990858239061973e-06,
"loss": 1.3221,
"step": 169
},
{
"epoch": 0.38694992412746587,
"grad_norm": 0.4168025851249695,
"learning_rate": 4.990591507881416e-06,
"loss": 1.3094,
"step": 170
},
{
"epoch": 0.3892261001517451,
"grad_norm": 0.42845603823661804,
"learning_rate": 4.990320948550638e-06,
"loss": 1.3086,
"step": 171
},
{
"epoch": 0.3915022761760243,
"grad_norm": 0.4117361009120941,
"learning_rate": 4.9900465614855e-06,
"loss": 1.3074,
"step": 172
},
{
"epoch": 0.3937784522003035,
"grad_norm": 0.40385058522224426,
"learning_rate": 4.989768347107749e-06,
"loss": 1.3015,
"step": 173
},
{
"epoch": 0.3960546282245827,
"grad_norm": 0.42507070302963257,
"learning_rate": 4.989486305845012e-06,
"loss": 1.303,
"step": 174
},
{
"epoch": 0.3983308042488619,
"grad_norm": 0.4167408347129822,
"learning_rate": 4.989200438130799e-06,
"loss": 1.3246,
"step": 175
},
{
"epoch": 0.4006069802731411,
"grad_norm": 0.4459727108478546,
"learning_rate": 4.988910744404501e-06,
"loss": 1.3082,
"step": 176
},
{
"epoch": 0.40288315629742033,
"grad_norm": 0.41572514176368713,
"learning_rate": 4.988617225111392e-06,
"loss": 1.329,
"step": 177
},
{
"epoch": 0.40515933232169954,
"grad_norm": 0.40346917510032654,
"learning_rate": 4.988319880702621e-06,
"loss": 1.3204,
"step": 178
},
{
"epoch": 0.40743550834597875,
"grad_norm": 0.49305301904678345,
"learning_rate": 4.988018711635223e-06,
"loss": 1.3174,
"step": 179
},
{
"epoch": 0.40971168437025796,
"grad_norm": 0.4136899411678314,
"learning_rate": 4.987713718372106e-06,
"loss": 1.3153,
"step": 180
},
{
"epoch": 0.41198786039453716,
"grad_norm": 0.4320002794265747,
"learning_rate": 4.98740490138206e-06,
"loss": 1.3233,
"step": 181
},
{
"epoch": 0.4142640364188164,
"grad_norm": 0.40051817893981934,
"learning_rate": 4.9870922611397484e-06,
"loss": 1.3298,
"step": 182
},
{
"epoch": 0.4165402124430956,
"grad_norm": 0.43490317463874817,
"learning_rate": 4.986775798125715e-06,
"loss": 1.2924,
"step": 183
},
{
"epoch": 0.4188163884673748,
"grad_norm": 0.41733044385910034,
"learning_rate": 4.986455512826377e-06,
"loss": 1.3407,
"step": 184
},
{
"epoch": 0.421092564491654,
"grad_norm": 0.45686185359954834,
"learning_rate": 4.986131405734027e-06,
"loss": 1.3002,
"step": 185
},
{
"epoch": 0.4233687405159332,
"grad_norm": 0.4178033173084259,
"learning_rate": 4.985803477346832e-06,
"loss": 1.2707,
"step": 186
},
{
"epoch": 0.42564491654021247,
"grad_norm": 0.44030341506004333,
"learning_rate": 4.985471728168832e-06,
"loss": 1.3522,
"step": 187
},
{
"epoch": 0.4279210925644917,
"grad_norm": 0.4167434573173523,
"learning_rate": 4.985136158709942e-06,
"loss": 1.2952,
"step": 188
},
{
"epoch": 0.4301972685887709,
"grad_norm": 0.43799030780792236,
"learning_rate": 4.984796769485946e-06,
"loss": 1.3204,
"step": 189
},
{
"epoch": 0.4324734446130501,
"grad_norm": 0.3963024914264679,
"learning_rate": 4.984453561018501e-06,
"loss": 1.2852,
"step": 190
},
{
"epoch": 0.4347496206373293,
"grad_norm": 0.4606306850910187,
"learning_rate": 4.984106533835132e-06,
"loss": 1.3,
"step": 191
},
{
"epoch": 0.4370257966616085,
"grad_norm": 0.43703702092170715,
"learning_rate": 4.9837556884692374e-06,
"loss": 1.2865,
"step": 192
},
{
"epoch": 0.4393019726858877,
"grad_norm": 0.419226735830307,
"learning_rate": 4.9834010254600814e-06,
"loss": 1.3212,
"step": 193
},
{
"epoch": 0.44157814871016693,
"grad_norm": 0.4051378071308136,
"learning_rate": 4.983042545352796e-06,
"loss": 1.3102,
"step": 194
},
{
"epoch": 0.44385432473444614,
"grad_norm": 0.44308584928512573,
"learning_rate": 4.982680248698383e-06,
"loss": 1.2753,
"step": 195
},
{
"epoch": 0.44613050075872535,
"grad_norm": 0.48592913150787354,
"learning_rate": 4.982314136053707e-06,
"loss": 1.3468,
"step": 196
},
{
"epoch": 0.44840667678300455,
"grad_norm": 0.4361239969730377,
"learning_rate": 4.981944207981499e-06,
"loss": 1.2345,
"step": 197
},
{
"epoch": 0.45068285280728376,
"grad_norm": 0.4420235753059387,
"learning_rate": 4.981570465050357e-06,
"loss": 1.308,
"step": 198
},
{
"epoch": 0.45295902883156297,
"grad_norm": 0.4724012315273285,
"learning_rate": 4.98119290783474e-06,
"loss": 1.3451,
"step": 199
},
{
"epoch": 0.4552352048558422,
"grad_norm": 0.4347815215587616,
"learning_rate": 4.980811536914968e-06,
"loss": 1.2926,
"step": 200
},
{
"epoch": 0.4575113808801214,
"grad_norm": 0.4243141710758209,
"learning_rate": 4.980426352877228e-06,
"loss": 1.2863,
"step": 201
},
{
"epoch": 0.4597875569044006,
"grad_norm": 0.41129249334335327,
"learning_rate": 4.980037356313563e-06,
"loss": 1.3017,
"step": 202
},
{
"epoch": 0.4620637329286798,
"grad_norm": 0.4349686801433563,
"learning_rate": 4.979644547821879e-06,
"loss": 1.3655,
"step": 203
},
{
"epoch": 0.464339908952959,
"grad_norm": 0.438151478767395,
"learning_rate": 4.97924792800594e-06,
"loss": 1.304,
"step": 204
},
{
"epoch": 0.4666160849772382,
"grad_norm": 0.46755126118659973,
"learning_rate": 4.978847497475369e-06,
"loss": 1.3282,
"step": 205
},
{
"epoch": 0.46889226100151743,
"grad_norm": 0.42544615268707275,
"learning_rate": 4.9784432568456445e-06,
"loss": 1.3524,
"step": 206
},
{
"epoch": 0.47116843702579664,
"grad_norm": 0.4163425862789154,
"learning_rate": 4.9780352067381024e-06,
"loss": 1.3303,
"step": 207
},
{
"epoch": 0.47344461305007585,
"grad_norm": 0.4662051498889923,
"learning_rate": 4.977623347779935e-06,
"loss": 1.2723,
"step": 208
},
{
"epoch": 0.4757207890743551,
"grad_norm": 0.4841192662715912,
"learning_rate": 4.977207680604187e-06,
"loss": 1.3281,
"step": 209
},
{
"epoch": 0.4779969650986343,
"grad_norm": 0.47023245692253113,
"learning_rate": 4.976788205849758e-06,
"loss": 1.2983,
"step": 210
},
{
"epoch": 0.4802731411229135,
"grad_norm": 0.4251156449317932,
"learning_rate": 4.9763649241613985e-06,
"loss": 1.3215,
"step": 211
},
{
"epoch": 0.48254931714719274,
"grad_norm": 0.436788409948349,
"learning_rate": 4.975937836189712e-06,
"loss": 1.3006,
"step": 212
},
{
"epoch": 0.48482549317147194,
"grad_norm": 0.46025222539901733,
"learning_rate": 4.975506942591152e-06,
"loss": 1.3121,
"step": 213
},
{
"epoch": 0.48710166919575115,
"grad_norm": 0.43663930892944336,
"learning_rate": 4.97507224402802e-06,
"loss": 1.3133,
"step": 214
},
{
"epoch": 0.48937784522003036,
"grad_norm": 0.48787179589271545,
"learning_rate": 4.974633741168469e-06,
"loss": 1.266,
"step": 215
},
{
"epoch": 0.49165402124430957,
"grad_norm": 0.4265913665294647,
"learning_rate": 4.974191434686496e-06,
"loss": 1.3035,
"step": 216
},
{
"epoch": 0.4939301972685888,
"grad_norm": 0.4345017373561859,
"learning_rate": 4.973745325261946e-06,
"loss": 1.2987,
"step": 217
},
{
"epoch": 0.496206373292868,
"grad_norm": 0.47078996896743774,
"learning_rate": 4.973295413580509e-06,
"loss": 1.3176,
"step": 218
},
{
"epoch": 0.4984825493171472,
"grad_norm": 0.4349548816680908,
"learning_rate": 4.97284170033372e-06,
"loss": 1.2829,
"step": 219
},
{
"epoch": 0.5007587253414264,
"grad_norm": 0.4705260694026947,
"learning_rate": 4.9723841862189555e-06,
"loss": 1.2847,
"step": 220
},
{
"epoch": 0.5030349013657056,
"grad_norm": 0.4285137951374054,
"learning_rate": 4.971922871939436e-06,
"loss": 1.2774,
"step": 221
},
{
"epoch": 0.5053110773899848,
"grad_norm": 0.46022048592567444,
"learning_rate": 4.971457758204221e-06,
"loss": 1.3006,
"step": 222
},
{
"epoch": 0.507587253414264,
"grad_norm": 0.4904478192329407,
"learning_rate": 4.970988845728213e-06,
"loss": 1.3032,
"step": 223
},
{
"epoch": 0.5098634294385432,
"grad_norm": 0.4171503484249115,
"learning_rate": 4.9705161352321496e-06,
"loss": 1.3118,
"step": 224
},
{
"epoch": 0.5121396054628224,
"grad_norm": 0.4424084722995758,
"learning_rate": 4.970039627442608e-06,
"loss": 1.2342,
"step": 225
},
{
"epoch": 0.5144157814871017,
"grad_norm": 0.45744988322257996,
"learning_rate": 4.969559323092004e-06,
"loss": 1.2975,
"step": 226
},
{
"epoch": 0.5166919575113809,
"grad_norm": 0.4306228756904602,
"learning_rate": 4.969075222918583e-06,
"loss": 1.2791,
"step": 227
},
{
"epoch": 0.5189681335356601,
"grad_norm": 0.43930479884147644,
"learning_rate": 4.9685873276664324e-06,
"loss": 1.2952,
"step": 228
},
{
"epoch": 0.5212443095599393,
"grad_norm": 0.4268686771392822,
"learning_rate": 4.968095638085467e-06,
"loss": 1.2902,
"step": 229
},
{
"epoch": 0.5235204855842185,
"grad_norm": 0.4320680499076843,
"learning_rate": 4.9676001549314356e-06,
"loss": 1.2941,
"step": 230
},
{
"epoch": 0.5257966616084977,
"grad_norm": 0.4509009122848511,
"learning_rate": 4.967100878965918e-06,
"loss": 1.3353,
"step": 231
},
{
"epoch": 0.5280728376327769,
"grad_norm": 0.4458315670490265,
"learning_rate": 4.966597810956325e-06,
"loss": 1.2918,
"step": 232
},
{
"epoch": 0.5303490136570561,
"grad_norm": 0.4613376259803772,
"learning_rate": 4.966090951675893e-06,
"loss": 1.3085,
"step": 233
},
{
"epoch": 0.5326251896813353,
"grad_norm": 0.4486188590526581,
"learning_rate": 4.9655803019036875e-06,
"loss": 1.2783,
"step": 234
},
{
"epoch": 0.5349013657056145,
"grad_norm": 0.44070056080818176,
"learning_rate": 4.9650658624246e-06,
"loss": 1.2969,
"step": 235
},
{
"epoch": 0.5371775417298937,
"grad_norm": 0.45442667603492737,
"learning_rate": 4.9645476340293474e-06,
"loss": 1.273,
"step": 236
},
{
"epoch": 0.539453717754173,
"grad_norm": 0.4485810697078705,
"learning_rate": 4.96402561751447e-06,
"loss": 1.2524,
"step": 237
},
{
"epoch": 0.5417298937784522,
"grad_norm": 0.43408727645874023,
"learning_rate": 4.96349981368233e-06,
"loss": 1.3,
"step": 238
},
{
"epoch": 0.5440060698027314,
"grad_norm": 0.45317673683166504,
"learning_rate": 4.962970223341112e-06,
"loss": 1.2959,
"step": 239
},
{
"epoch": 0.5462822458270106,
"grad_norm": 0.45147350430488586,
"learning_rate": 4.962436847304818e-06,
"loss": 1.2588,
"step": 240
},
{
"epoch": 0.5485584218512898,
"grad_norm": 0.4372202157974243,
"learning_rate": 4.961899686393273e-06,
"loss": 1.2472,
"step": 241
},
{
"epoch": 0.5508345978755691,
"grad_norm": 0.4300381541252136,
"learning_rate": 4.961358741432116e-06,
"loss": 1.2892,
"step": 242
},
{
"epoch": 0.5531107738998483,
"grad_norm": 0.4326576888561249,
"learning_rate": 4.9608140132528045e-06,
"loss": 1.2873,
"step": 243
},
{
"epoch": 0.5553869499241275,
"grad_norm": 0.42891374230384827,
"learning_rate": 4.960265502692609e-06,
"loss": 1.3159,
"step": 244
},
{
"epoch": 0.5576631259484067,
"grad_norm": 0.44637322425842285,
"learning_rate": 4.959713210594616e-06,
"loss": 1.2964,
"step": 245
},
{
"epoch": 0.5599393019726859,
"grad_norm": 0.4534567892551422,
"learning_rate": 4.959157137807721e-06,
"loss": 1.2811,
"step": 246
},
{
"epoch": 0.5622154779969651,
"grad_norm": 0.4480896294116974,
"learning_rate": 4.958597285186635e-06,
"loss": 1.2887,
"step": 247
},
{
"epoch": 0.5644916540212443,
"grad_norm": 0.42966964840888977,
"learning_rate": 4.958033653591874e-06,
"loss": 1.2927,
"step": 248
},
{
"epoch": 0.5667678300455236,
"grad_norm": 0.4520474076271057,
"learning_rate": 4.9574662438897675e-06,
"loss": 1.334,
"step": 249
},
{
"epoch": 0.5690440060698028,
"grad_norm": 0.4476149082183838,
"learning_rate": 4.956895056952448e-06,
"loss": 1.2813,
"step": 250
},
{
"epoch": 0.571320182094082,
"grad_norm": 0.4495325982570648,
"learning_rate": 4.956320093657855e-06,
"loss": 1.3455,
"step": 251
},
{
"epoch": 0.5735963581183612,
"grad_norm": 0.4634062945842743,
"learning_rate": 4.955741354889734e-06,
"loss": 1.3009,
"step": 252
},
{
"epoch": 0.5758725341426404,
"grad_norm": 0.43844589591026306,
"learning_rate": 4.955158841537632e-06,
"loss": 1.2775,
"step": 253
},
{
"epoch": 0.5781487101669196,
"grad_norm": 0.4297947585582733,
"learning_rate": 4.954572554496897e-06,
"loss": 1.3005,
"step": 254
},
{
"epoch": 0.5804248861911988,
"grad_norm": 0.45026981830596924,
"learning_rate": 4.953982494668679e-06,
"loss": 1.2829,
"step": 255
},
{
"epoch": 0.582701062215478,
"grad_norm": 0.4508177936077118,
"learning_rate": 4.953388662959926e-06,
"loss": 1.3249,
"step": 256
},
{
"epoch": 0.5849772382397572,
"grad_norm": 0.4628501236438751,
"learning_rate": 4.952791060283384e-06,
"loss": 1.2772,
"step": 257
},
{
"epoch": 0.5872534142640364,
"grad_norm": 0.47145721316337585,
"learning_rate": 4.952189687557595e-06,
"loss": 1.2843,
"step": 258
},
{
"epoch": 0.5895295902883156,
"grad_norm": 0.44380298256874084,
"learning_rate": 4.951584545706896e-06,
"loss": 1.3169,
"step": 259
},
{
"epoch": 0.5918057663125948,
"grad_norm": 0.45627689361572266,
"learning_rate": 4.950975635661416e-06,
"loss": 1.2855,
"step": 260
},
{
"epoch": 0.5940819423368741,
"grad_norm": 0.43097957968711853,
"learning_rate": 4.950362958357078e-06,
"loss": 1.2802,
"step": 261
},
{
"epoch": 0.5963581183611533,
"grad_norm": 0.4480797052383423,
"learning_rate": 4.949746514735594e-06,
"loss": 1.2845,
"step": 262
},
{
"epoch": 0.5986342943854325,
"grad_norm": 0.4356028139591217,
"learning_rate": 4.949126305744466e-06,
"loss": 1.2559,
"step": 263
},
{
"epoch": 0.6009104704097117,
"grad_norm": 0.45533114671707153,
"learning_rate": 4.948502332336982e-06,
"loss": 1.333,
"step": 264
},
{
"epoch": 0.6031866464339909,
"grad_norm": 0.43486839532852173,
"learning_rate": 4.947874595472216e-06,
"loss": 1.299,
"step": 265
},
{
"epoch": 0.6054628224582701,
"grad_norm": 0.45472636818885803,
"learning_rate": 4.947243096115028e-06,
"loss": 1.2853,
"step": 266
},
{
"epoch": 0.6077389984825493,
"grad_norm": 0.448030024766922,
"learning_rate": 4.946607835236064e-06,
"loss": 1.2549,
"step": 267
},
{
"epoch": 0.6100151745068285,
"grad_norm": 0.46248579025268555,
"learning_rate": 4.945968813811743e-06,
"loss": 1.2845,
"step": 268
},
{
"epoch": 0.6122913505311077,
"grad_norm": 0.47284016013145447,
"learning_rate": 4.9453260328242735e-06,
"loss": 1.274,
"step": 269
},
{
"epoch": 0.6145675265553869,
"grad_norm": 0.46916916966438293,
"learning_rate": 4.944679493261637e-06,
"loss": 1.272,
"step": 270
},
{
"epoch": 0.6168437025796661,
"grad_norm": 0.4469199776649475,
"learning_rate": 4.944029196117594e-06,
"loss": 1.273,
"step": 271
},
{
"epoch": 0.6191198786039454,
"grad_norm": 0.4460132420063019,
"learning_rate": 4.943375142391679e-06,
"loss": 1.2749,
"step": 272
},
{
"epoch": 0.6213960546282246,
"grad_norm": 0.45281344652175903,
"learning_rate": 4.942717333089204e-06,
"loss": 1.2858,
"step": 273
},
{
"epoch": 0.6236722306525038,
"grad_norm": 0.4766104221343994,
"learning_rate": 4.942055769221249e-06,
"loss": 1.3047,
"step": 274
},
{
"epoch": 0.625948406676783,
"grad_norm": 0.4342869818210602,
"learning_rate": 4.941390451804668e-06,
"loss": 1.258,
"step": 275
},
{
"epoch": 0.6282245827010622,
"grad_norm": 0.44943931698799133,
"learning_rate": 4.940721381862083e-06,
"loss": 1.2714,
"step": 276
},
{
"epoch": 0.6305007587253414,
"grad_norm": 0.4642450213432312,
"learning_rate": 4.940048560421887e-06,
"loss": 1.2883,
"step": 277
},
{
"epoch": 0.6327769347496206,
"grad_norm": 0.530925989151001,
"learning_rate": 4.9393719885182335e-06,
"loss": 1.2869,
"step": 278
},
{
"epoch": 0.6350531107738998,
"grad_norm": 0.44706323742866516,
"learning_rate": 4.938691667191044e-06,
"loss": 1.2912,
"step": 279
},
{
"epoch": 0.637329286798179,
"grad_norm": 0.46952497959136963,
"learning_rate": 4.938007597486005e-06,
"loss": 1.3293,
"step": 280
},
{
"epoch": 0.6396054628224582,
"grad_norm": 0.45387259125709534,
"learning_rate": 4.937319780454559e-06,
"loss": 1.2328,
"step": 281
},
{
"epoch": 0.6418816388467374,
"grad_norm": 0.4683968126773834,
"learning_rate": 4.936628217153914e-06,
"loss": 1.3101,
"step": 282
},
{
"epoch": 0.6441578148710166,
"grad_norm": 0.4984208941459656,
"learning_rate": 4.935932908647033e-06,
"loss": 1.3078,
"step": 283
},
{
"epoch": 0.6464339908952959,
"grad_norm": 0.47393515706062317,
"learning_rate": 4.935233856002635e-06,
"loss": 1.2667,
"step": 284
},
{
"epoch": 0.6487101669195751,
"grad_norm": 0.4559146761894226,
"learning_rate": 4.9345310602951964e-06,
"loss": 1.2816,
"step": 285
},
{
"epoch": 0.6509863429438544,
"grad_norm": 0.4612574279308319,
"learning_rate": 4.933824522604945e-06,
"loss": 1.3009,
"step": 286
},
{
"epoch": 0.6532625189681336,
"grad_norm": 0.4839983880519867,
"learning_rate": 4.933114244017861e-06,
"loss": 1.2762,
"step": 287
},
{
"epoch": 0.6555386949924128,
"grad_norm": 0.47950032353401184,
"learning_rate": 4.932400225625674e-06,
"loss": 1.2639,
"step": 288
},
{
"epoch": 0.657814871016692,
"grad_norm": 0.46797841787338257,
"learning_rate": 4.931682468525863e-06,
"loss": 1.3116,
"step": 289
},
{
"epoch": 0.6600910470409712,
"grad_norm": 0.46507689356803894,
"learning_rate": 4.93096097382165e-06,
"loss": 1.2795,
"step": 290
},
{
"epoch": 0.6623672230652504,
"grad_norm": 0.4672064781188965,
"learning_rate": 4.9302357426220086e-06,
"loss": 1.2769,
"step": 291
},
{
"epoch": 0.6646433990895296,
"grad_norm": 0.469881147146225,
"learning_rate": 4.929506776041648e-06,
"loss": 1.246,
"step": 292
},
{
"epoch": 0.6669195751138088,
"grad_norm": 0.49012723565101624,
"learning_rate": 4.928774075201024e-06,
"loss": 1.3308,
"step": 293
},
{
"epoch": 0.669195751138088,
"grad_norm": 0.47186344861984253,
"learning_rate": 4.9280376412263295e-06,
"loss": 1.2685,
"step": 294
},
{
"epoch": 0.6714719271623673,
"grad_norm": 0.4914249777793884,
"learning_rate": 4.9272974752494974e-06,
"loss": 1.3029,
"step": 295
},
{
"epoch": 0.6737481031866465,
"grad_norm": 0.4709179699420929,
"learning_rate": 4.9265535784081965e-06,
"loss": 1.2459,
"step": 296
},
{
"epoch": 0.6760242792109257,
"grad_norm": 0.46568986773490906,
"learning_rate": 4.925805951845826e-06,
"loss": 1.2713,
"step": 297
},
{
"epoch": 0.6783004552352049,
"grad_norm": 0.46113038063049316,
"learning_rate": 4.925054596711526e-06,
"loss": 1.2787,
"step": 298
},
{
"epoch": 0.6805766312594841,
"grad_norm": 0.49636346101760864,
"learning_rate": 4.92429951416016e-06,
"loss": 1.2787,
"step": 299
},
{
"epoch": 0.6828528072837633,
"grad_norm": 0.4823263883590698,
"learning_rate": 4.9235407053523235e-06,
"loss": 1.3029,
"step": 300
},
{
"epoch": 0.6851289833080425,
"grad_norm": 0.45272234082221985,
"learning_rate": 4.92277817145434e-06,
"loss": 1.3053,
"step": 301
},
{
"epoch": 0.6874051593323217,
"grad_norm": 0.4724232256412506,
"learning_rate": 4.922011913638258e-06,
"loss": 1.2594,
"step": 302
},
{
"epoch": 0.6896813353566009,
"grad_norm": 0.5244677066802979,
"learning_rate": 4.92124193308185e-06,
"loss": 1.305,
"step": 303
},
{
"epoch": 0.6919575113808801,
"grad_norm": 0.4562852382659912,
"learning_rate": 4.92046823096861e-06,
"loss": 1.283,
"step": 304
},
{
"epoch": 0.6942336874051593,
"grad_norm": 0.460565447807312,
"learning_rate": 4.919690808487754e-06,
"loss": 1.3004,
"step": 305
},
{
"epoch": 0.6965098634294385,
"grad_norm": 0.4588528871536255,
"learning_rate": 4.918909666834214e-06,
"loss": 1.2745,
"step": 306
},
{
"epoch": 0.6987860394537178,
"grad_norm": 0.4980691075325012,
"learning_rate": 4.91812480720864e-06,
"loss": 1.2802,
"step": 307
},
{
"epoch": 0.701062215477997,
"grad_norm": 0.5080570578575134,
"learning_rate": 4.917336230817396e-06,
"loss": 1.286,
"step": 308
},
{
"epoch": 0.7033383915022762,
"grad_norm": 0.46659743785858154,
"learning_rate": 4.9165439388725585e-06,
"loss": 1.3093,
"step": 309
},
{
"epoch": 0.7056145675265554,
"grad_norm": 0.4846821129322052,
"learning_rate": 4.915747932591916e-06,
"loss": 1.2904,
"step": 310
},
{
"epoch": 0.7078907435508346,
"grad_norm": 0.4945422112941742,
"learning_rate": 4.914948213198966e-06,
"loss": 1.2592,
"step": 311
},
{
"epoch": 0.7101669195751138,
"grad_norm": 0.49606069922447205,
"learning_rate": 4.9141447819229125e-06,
"loss": 1.2699,
"step": 312
},
{
"epoch": 0.712443095599393,
"grad_norm": 0.48810863494873047,
"learning_rate": 4.913337639998666e-06,
"loss": 1.2993,
"step": 313
},
{
"epoch": 0.7147192716236722,
"grad_norm": 0.4933323562145233,
"learning_rate": 4.912526788666838e-06,
"loss": 1.2514,
"step": 314
},
{
"epoch": 0.7169954476479514,
"grad_norm": 0.4674908220767975,
"learning_rate": 4.911712229173745e-06,
"loss": 1.2602,
"step": 315
},
{
"epoch": 0.7192716236722306,
"grad_norm": 0.5178641676902771,
"learning_rate": 4.9108939627714e-06,
"loss": 1.312,
"step": 316
},
{
"epoch": 0.7215477996965098,
"grad_norm": 0.4949224293231964,
"learning_rate": 4.910071990717516e-06,
"loss": 1.2787,
"step": 317
},
{
"epoch": 0.723823975720789,
"grad_norm": 0.4700353443622589,
"learning_rate": 4.909246314275499e-06,
"loss": 1.251,
"step": 318
},
{
"epoch": 0.7261001517450683,
"grad_norm": 0.4828815758228302,
"learning_rate": 4.908416934714452e-06,
"loss": 1.2967,
"step": 319
},
{
"epoch": 0.7283763277693475,
"grad_norm": 0.47781631350517273,
"learning_rate": 4.907583853309168e-06,
"loss": 1.3108,
"step": 320
},
{
"epoch": 0.7306525037936267,
"grad_norm": 0.4467979073524475,
"learning_rate": 4.90674707134013e-06,
"loss": 1.2332,
"step": 321
},
{
"epoch": 0.7329286798179059,
"grad_norm": 0.4529818892478943,
"learning_rate": 4.90590659009351e-06,
"loss": 1.2958,
"step": 322
},
{
"epoch": 0.7352048558421851,
"grad_norm": 0.4782491624355316,
"learning_rate": 4.905062410861164e-06,
"loss": 1.2754,
"step": 323
},
{
"epoch": 0.7374810318664643,
"grad_norm": 0.4517338275909424,
"learning_rate": 4.9042145349406335e-06,
"loss": 1.3098,
"step": 324
},
{
"epoch": 0.7397572078907435,
"grad_norm": 0.4599636197090149,
"learning_rate": 4.903362963635142e-06,
"loss": 1.2843,
"step": 325
},
{
"epoch": 0.7420333839150227,
"grad_norm": 0.4922712743282318,
"learning_rate": 4.902507698253593e-06,
"loss": 1.2987,
"step": 326
},
{
"epoch": 0.7443095599393019,
"grad_norm": 0.47610870003700256,
"learning_rate": 4.901648740110566e-06,
"loss": 1.2739,
"step": 327
},
{
"epoch": 0.7465857359635811,
"grad_norm": 0.46494367718696594,
"learning_rate": 4.900786090526319e-06,
"loss": 1.2579,
"step": 328
},
{
"epoch": 0.7488619119878603,
"grad_norm": 0.46867313981056213,
"learning_rate": 4.899919750826784e-06,
"loss": 1.2838,
"step": 329
},
{
"epoch": 0.7511380880121397,
"grad_norm": 0.49616602063179016,
"learning_rate": 4.899049722343561e-06,
"loss": 1.3108,
"step": 330
},
{
"epoch": 0.7534142640364189,
"grad_norm": 0.46307483315467834,
"learning_rate": 4.898176006413925e-06,
"loss": 1.3047,
"step": 331
},
{
"epoch": 0.7556904400606981,
"grad_norm": 0.47475141286849976,
"learning_rate": 4.897298604380816e-06,
"loss": 1.2416,
"step": 332
},
{
"epoch": 0.7579666160849773,
"grad_norm": 0.468184232711792,
"learning_rate": 4.896417517592838e-06,
"loss": 1.2904,
"step": 333
},
{
"epoch": 0.7602427921092565,
"grad_norm": 0.47171875834465027,
"learning_rate": 4.895532747404263e-06,
"loss": 1.2641,
"step": 334
},
{
"epoch": 0.7625189681335357,
"grad_norm": 0.45646342635154724,
"learning_rate": 4.8946442951750215e-06,
"loss": 1.285,
"step": 335
},
{
"epoch": 0.7647951441578149,
"grad_norm": 0.48363035917282104,
"learning_rate": 4.893752162270704e-06,
"loss": 1.2507,
"step": 336
},
{
"epoch": 0.7670713201820941,
"grad_norm": 0.4761241674423218,
"learning_rate": 4.892856350062558e-06,
"loss": 1.2628,
"step": 337
},
{
"epoch": 0.7693474962063733,
"grad_norm": 0.47408172488212585,
"learning_rate": 4.891956859927489e-06,
"loss": 1.2919,
"step": 338
},
{
"epoch": 0.7716236722306525,
"grad_norm": 0.48075783252716064,
"learning_rate": 4.89105369324805e-06,
"loss": 1.282,
"step": 339
},
{
"epoch": 0.7738998482549317,
"grad_norm": 0.45937585830688477,
"learning_rate": 4.890146851412452e-06,
"loss": 1.2823,
"step": 340
},
{
"epoch": 0.776176024279211,
"grad_norm": 0.5253570675849915,
"learning_rate": 4.889236335814549e-06,
"loss": 1.2657,
"step": 341
},
{
"epoch": 0.7784522003034902,
"grad_norm": 0.47888922691345215,
"learning_rate": 4.888322147853846e-06,
"loss": 1.3003,
"step": 342
},
{
"epoch": 0.7807283763277694,
"grad_norm": 0.4705219566822052,
"learning_rate": 4.887404288935488e-06,
"loss": 1.2822,
"step": 343
},
{
"epoch": 0.7830045523520486,
"grad_norm": 0.5236004590988159,
"learning_rate": 4.8864827604702675e-06,
"loss": 1.2338,
"step": 344
},
{
"epoch": 0.7852807283763278,
"grad_norm": 0.4856922924518585,
"learning_rate": 4.885557563874614e-06,
"loss": 1.2394,
"step": 345
},
{
"epoch": 0.787556904400607,
"grad_norm": 0.48127493262290955,
"learning_rate": 4.884628700570595e-06,
"loss": 1.2827,
"step": 346
},
{
"epoch": 0.7898330804248862,
"grad_norm": 0.46932077407836914,
"learning_rate": 4.883696171985917e-06,
"loss": 1.2608,
"step": 347
},
{
"epoch": 0.7921092564491654,
"grad_norm": 0.5052128434181213,
"learning_rate": 4.882759979553916e-06,
"loss": 1.2727,
"step": 348
},
{
"epoch": 0.7943854324734446,
"grad_norm": 0.5077352523803711,
"learning_rate": 4.881820124713562e-06,
"loss": 1.2364,
"step": 349
},
{
"epoch": 0.7966616084977238,
"grad_norm": 0.5095151662826538,
"learning_rate": 4.880876608909454e-06,
"loss": 1.2788,
"step": 350
},
{
"epoch": 0.798937784522003,
"grad_norm": 0.4920441806316376,
"learning_rate": 4.8799294335918185e-06,
"loss": 1.2944,
"step": 351
},
{
"epoch": 0.8012139605462822,
"grad_norm": 0.4824545085430145,
"learning_rate": 4.8789786002165055e-06,
"loss": 1.2669,
"step": 352
},
{
"epoch": 0.8034901365705615,
"grad_norm": 0.49492961168289185,
"learning_rate": 4.878024110244988e-06,
"loss": 1.3021,
"step": 353
},
{
"epoch": 0.8057663125948407,
"grad_norm": 0.5213160514831543,
"learning_rate": 4.877065965144361e-06,
"loss": 1.2832,
"step": 354
},
{
"epoch": 0.8080424886191199,
"grad_norm": 0.4782240390777588,
"learning_rate": 4.8761041663873345e-06,
"loss": 1.2812,
"step": 355
},
{
"epoch": 0.8103186646433991,
"grad_norm": 0.4901832938194275,
"learning_rate": 4.875138715452237e-06,
"loss": 1.289,
"step": 356
},
{
"epoch": 0.8125948406676783,
"grad_norm": 0.48875507712364197,
"learning_rate": 4.87416961382301e-06,
"loss": 1.2876,
"step": 357
},
{
"epoch": 0.8148710166919575,
"grad_norm": 0.49773871898651123,
"learning_rate": 4.873196862989205e-06,
"loss": 1.2766,
"step": 358
},
{
"epoch": 0.8171471927162367,
"grad_norm": 0.5069698691368103,
"learning_rate": 4.872220464445983e-06,
"loss": 1.284,
"step": 359
},
{
"epoch": 0.8194233687405159,
"grad_norm": 0.4725041389465332,
"learning_rate": 4.871240419694115e-06,
"loss": 1.2183,
"step": 360
},
{
"epoch": 0.8216995447647951,
"grad_norm": 0.4846250116825104,
"learning_rate": 4.8702567302399705e-06,
"loss": 1.2851,
"step": 361
},
{
"epoch": 0.8239757207890743,
"grad_norm": 0.4825296998023987,
"learning_rate": 4.869269397595525e-06,
"loss": 1.2621,
"step": 362
},
{
"epoch": 0.8262518968133535,
"grad_norm": 0.4880293905735016,
"learning_rate": 4.8682784232783535e-06,
"loss": 1.2684,
"step": 363
},
{
"epoch": 0.8285280728376327,
"grad_norm": 0.4805878698825836,
"learning_rate": 4.867283808811626e-06,
"loss": 1.2604,
"step": 364
},
{
"epoch": 0.830804248861912,
"grad_norm": 0.5031499266624451,
"learning_rate": 4.86628555572411e-06,
"loss": 1.2701,
"step": 365
},
{
"epoch": 0.8330804248861912,
"grad_norm": 0.49856945872306824,
"learning_rate": 4.865283665550167e-06,
"loss": 1.266,
"step": 366
},
{
"epoch": 0.8353566009104704,
"grad_norm": 0.49834373593330383,
"learning_rate": 4.864278139829745e-06,
"loss": 1.254,
"step": 367
},
{
"epoch": 0.8376327769347496,
"grad_norm": 0.47436273097991943,
"learning_rate": 4.863268980108381e-06,
"loss": 1.308,
"step": 368
},
{
"epoch": 0.8399089529590288,
"grad_norm": 0.4866158962249756,
"learning_rate": 4.8622561879372e-06,
"loss": 1.2565,
"step": 369
},
{
"epoch": 0.842185128983308,
"grad_norm": 0.46591049432754517,
"learning_rate": 4.861239764872909e-06,
"loss": 1.2528,
"step": 370
},
{
"epoch": 0.8444613050075872,
"grad_norm": 0.5084807872772217,
"learning_rate": 4.860219712477795e-06,
"loss": 1.2727,
"step": 371
},
{
"epoch": 0.8467374810318664,
"grad_norm": 0.49390751123428345,
"learning_rate": 4.859196032319724e-06,
"loss": 1.2544,
"step": 372
},
{
"epoch": 0.8490136570561456,
"grad_norm": 0.4931376576423645,
"learning_rate": 4.8581687259721375e-06,
"loss": 1.2728,
"step": 373
},
{
"epoch": 0.8512898330804249,
"grad_norm": 0.4991268813610077,
"learning_rate": 4.857137795014051e-06,
"loss": 1.2382,
"step": 374
},
{
"epoch": 0.8535660091047041,
"grad_norm": 0.48629266023635864,
"learning_rate": 4.856103241030054e-06,
"loss": 1.2464,
"step": 375
},
{
"epoch": 0.8558421851289834,
"grad_norm": 0.4945109188556671,
"learning_rate": 4.855065065610298e-06,
"loss": 1.2592,
"step": 376
},
{
"epoch": 0.8581183611532626,
"grad_norm": 0.4683839678764343,
"learning_rate": 4.8540232703505085e-06,
"loss": 1.2795,
"step": 377
},
{
"epoch": 0.8603945371775418,
"grad_norm": 0.4917154610157013,
"learning_rate": 4.8529778568519695e-06,
"loss": 1.297,
"step": 378
},
{
"epoch": 0.862670713201821,
"grad_norm": 0.4950079917907715,
"learning_rate": 4.851928826721528e-06,
"loss": 1.2424,
"step": 379
},
{
"epoch": 0.8649468892261002,
"grad_norm": 0.49165982007980347,
"learning_rate": 4.850876181571592e-06,
"loss": 1.2442,
"step": 380
},
{
"epoch": 0.8672230652503794,
"grad_norm": 0.47863882780075073,
"learning_rate": 4.849819923020121e-06,
"loss": 1.2946,
"step": 381
},
{
"epoch": 0.8694992412746586,
"grad_norm": 0.5066231489181519,
"learning_rate": 4.848760052690635e-06,
"loss": 1.2658,
"step": 382
},
{
"epoch": 0.8717754172989378,
"grad_norm": 0.46788156032562256,
"learning_rate": 4.847696572212199e-06,
"loss": 1.2787,
"step": 383
},
{
"epoch": 0.874051593323217,
"grad_norm": 0.5010194182395935,
"learning_rate": 4.846629483219431e-06,
"loss": 1.2645,
"step": 384
},
{
"epoch": 0.8763277693474962,
"grad_norm": 0.480258584022522,
"learning_rate": 4.845558787352495e-06,
"loss": 1.2535,
"step": 385
},
{
"epoch": 0.8786039453717754,
"grad_norm": 0.5160472393035889,
"learning_rate": 4.844484486257097e-06,
"loss": 1.2838,
"step": 386
},
{
"epoch": 0.8808801213960546,
"grad_norm": 0.5098587870597839,
"learning_rate": 4.843406581584487e-06,
"loss": 1.2834,
"step": 387
},
{
"epoch": 0.8831562974203339,
"grad_norm": 0.5033400058746338,
"learning_rate": 4.8423250749914515e-06,
"loss": 1.2959,
"step": 388
},
{
"epoch": 0.8854324734446131,
"grad_norm": 0.506367564201355,
"learning_rate": 4.841239968140316e-06,
"loss": 1.2757,
"step": 389
},
{
"epoch": 0.8877086494688923,
"grad_norm": 0.47980019450187683,
"learning_rate": 4.8401512626989354e-06,
"loss": 1.2683,
"step": 390
},
{
"epoch": 0.8899848254931715,
"grad_norm": 0.48923107981681824,
"learning_rate": 4.8390589603407005e-06,
"loss": 1.2325,
"step": 391
},
{
"epoch": 0.8922610015174507,
"grad_norm": 0.4891837537288666,
"learning_rate": 4.8379630627445286e-06,
"loss": 1.2508,
"step": 392
},
{
"epoch": 0.8945371775417299,
"grad_norm": 0.4819527566432953,
"learning_rate": 4.836863571594863e-06,
"loss": 1.2655,
"step": 393
},
{
"epoch": 0.8968133535660091,
"grad_norm": 0.5067424178123474,
"learning_rate": 4.83576048858167e-06,
"loss": 1.2477,
"step": 394
},
{
"epoch": 0.8990895295902883,
"grad_norm": 0.5201086401939392,
"learning_rate": 4.8346538154004386e-06,
"loss": 1.249,
"step": 395
},
{
"epoch": 0.9013657056145675,
"grad_norm": 0.5033949017524719,
"learning_rate": 4.833543553752173e-06,
"loss": 1.2882,
"step": 396
},
{
"epoch": 0.9036418816388467,
"grad_norm": 0.4921282231807709,
"learning_rate": 4.8324297053433975e-06,
"loss": 1.2355,
"step": 397
},
{
"epoch": 0.9059180576631259,
"grad_norm": 0.49898359179496765,
"learning_rate": 4.831312271886145e-06,
"loss": 1.24,
"step": 398
},
{
"epoch": 0.9081942336874052,
"grad_norm": 0.4932885468006134,
"learning_rate": 4.83019125509796e-06,
"loss": 1.2651,
"step": 399
},
{
"epoch": 0.9104704097116844,
"grad_norm": 0.5081654191017151,
"learning_rate": 4.829066656701897e-06,
"loss": 1.2846,
"step": 400
},
{
"epoch": 0.9127465857359636,
"grad_norm": 0.4848720133304596,
"learning_rate": 4.8279384784265124e-06,
"loss": 1.2834,
"step": 401
},
{
"epoch": 0.9150227617602428,
"grad_norm": 0.47641217708587646,
"learning_rate": 4.826806722005868e-06,
"loss": 1.2556,
"step": 402
},
{
"epoch": 0.917298937784522,
"grad_norm": 0.5004164576530457,
"learning_rate": 4.825671389179522e-06,
"loss": 1.2852,
"step": 403
},
{
"epoch": 0.9195751138088012,
"grad_norm": 0.5069151520729065,
"learning_rate": 4.824532481692533e-06,
"loss": 1.2468,
"step": 404
},
{
"epoch": 0.9218512898330804,
"grad_norm": 0.5043609738349915,
"learning_rate": 4.823390001295453e-06,
"loss": 1.2602,
"step": 405
},
{
"epoch": 0.9241274658573596,
"grad_norm": 0.47922301292419434,
"learning_rate": 4.822243949744324e-06,
"loss": 1.2909,
"step": 406
},
{
"epoch": 0.9264036418816388,
"grad_norm": 0.5012561082839966,
"learning_rate": 4.821094328800678e-06,
"loss": 1.3058,
"step": 407
},
{
"epoch": 0.928679817905918,
"grad_norm": 0.5232773423194885,
"learning_rate": 4.8199411402315356e-06,
"loss": 1.2689,
"step": 408
},
{
"epoch": 0.9309559939301972,
"grad_norm": 0.5023229718208313,
"learning_rate": 4.8187843858093975e-06,
"loss": 1.2623,
"step": 409
},
{
"epoch": 0.9332321699544764,
"grad_norm": 0.5061272382736206,
"learning_rate": 4.817624067312247e-06,
"loss": 1.2771,
"step": 410
},
{
"epoch": 0.9355083459787557,
"grad_norm": 0.47715064883232117,
"learning_rate": 4.816460186523547e-06,
"loss": 1.266,
"step": 411
},
{
"epoch": 0.9377845220030349,
"grad_norm": 0.5037026405334473,
"learning_rate": 4.815292745232233e-06,
"loss": 1.2812,
"step": 412
},
{
"epoch": 0.9400606980273141,
"grad_norm": 0.47421544790267944,
"learning_rate": 4.814121745232714e-06,
"loss": 1.2349,
"step": 413
},
{
"epoch": 0.9423368740515933,
"grad_norm": 0.5214923620223999,
"learning_rate": 4.812947188324868e-06,
"loss": 1.2986,
"step": 414
},
{
"epoch": 0.9446130500758725,
"grad_norm": 0.5169025659561157,
"learning_rate": 4.811769076314044e-06,
"loss": 1.2687,
"step": 415
},
{
"epoch": 0.9468892261001517,
"grad_norm": 0.5028119087219238,
"learning_rate": 4.8105874110110516e-06,
"loss": 1.2666,
"step": 416
},
{
"epoch": 0.9491654021244309,
"grad_norm": 0.5233621597290039,
"learning_rate": 4.809402194232163e-06,
"loss": 1.2817,
"step": 417
},
{
"epoch": 0.9514415781487102,
"grad_norm": 0.5662165880203247,
"learning_rate": 4.808213427799108e-06,
"loss": 1.212,
"step": 418
},
{
"epoch": 0.9537177541729894,
"grad_norm": 0.5214280486106873,
"learning_rate": 4.807021113539077e-06,
"loss": 1.2659,
"step": 419
},
{
"epoch": 0.9559939301972686,
"grad_norm": 0.5059605240821838,
"learning_rate": 4.805825253284706e-06,
"loss": 1.2417,
"step": 420
},
{
"epoch": 0.9582701062215478,
"grad_norm": 0.48347723484039307,
"learning_rate": 4.804625848874088e-06,
"loss": 1.279,
"step": 421
},
{
"epoch": 0.960546282245827,
"grad_norm": 0.5225522518157959,
"learning_rate": 4.803422902150762e-06,
"loss": 1.2555,
"step": 422
},
{
"epoch": 0.9628224582701063,
"grad_norm": 0.49709466099739075,
"learning_rate": 4.802216414963708e-06,
"loss": 1.2956,
"step": 423
},
{
"epoch": 0.9650986342943855,
"grad_norm": 0.500357985496521,
"learning_rate": 4.801006389167352e-06,
"loss": 1.2748,
"step": 424
},
{
"epoch": 0.9673748103186647,
"grad_norm": 0.504552960395813,
"learning_rate": 4.799792826621559e-06,
"loss": 1.2939,
"step": 425
},
{
"epoch": 0.9696509863429439,
"grad_norm": 0.4881986379623413,
"learning_rate": 4.7985757291916264e-06,
"loss": 1.2827,
"step": 426
},
{
"epoch": 0.9719271623672231,
"grad_norm": 0.517511785030365,
"learning_rate": 4.797355098748289e-06,
"loss": 1.2668,
"step": 427
},
{
"epoch": 0.9742033383915023,
"grad_norm": 0.49534812569618225,
"learning_rate": 4.796130937167709e-06,
"loss": 1.2878,
"step": 428
},
{
"epoch": 0.9764795144157815,
"grad_norm": 0.4725462794303894,
"learning_rate": 4.794903246331477e-06,
"loss": 1.2612,
"step": 429
},
{
"epoch": 0.9787556904400607,
"grad_norm": 0.49760913848876953,
"learning_rate": 4.79367202812661e-06,
"loss": 1.284,
"step": 430
},
{
"epoch": 0.9810318664643399,
"grad_norm": 0.5361410975456238,
"learning_rate": 4.792437284445545e-06,
"loss": 1.2517,
"step": 431
},
{
"epoch": 0.9833080424886191,
"grad_norm": 0.5160269141197205,
"learning_rate": 4.791199017186137e-06,
"loss": 1.2422,
"step": 432
},
{
"epoch": 0.9855842185128983,
"grad_norm": 0.5418286919593811,
"learning_rate": 4.7899572282516596e-06,
"loss": 1.2697,
"step": 433
},
{
"epoch": 0.9878603945371776,
"grad_norm": 0.5236756801605225,
"learning_rate": 4.788711919550796e-06,
"loss": 1.2546,
"step": 434
},
{
"epoch": 0.9901365705614568,
"grad_norm": 0.4919045567512512,
"learning_rate": 4.787463092997643e-06,
"loss": 1.2478,
"step": 435
},
{
"epoch": 0.992412746585736,
"grad_norm": 0.4918051064014435,
"learning_rate": 4.786210750511701e-06,
"loss": 1.2522,
"step": 436
},
{
"epoch": 0.9946889226100152,
"grad_norm": 0.5032536387443542,
"learning_rate": 4.784954894017878e-06,
"loss": 1.2924,
"step": 437
},
{
"epoch": 0.9969650986342944,
"grad_norm": 0.5253746509552002,
"learning_rate": 4.78369552544648e-06,
"loss": 1.258,
"step": 438
},
{
"epoch": 0.9992412746585736,
"grad_norm": 0.5097838044166565,
"learning_rate": 4.782432646733214e-06,
"loss": 1.2479,
"step": 439
},
{
"epoch": 1.0,
"grad_norm": 0.5097838044166565,
"learning_rate": 4.781166259819179e-06,
"loss": 1.2895,
"step": 440
},
{
"epoch": 1.0022761760242793,
"grad_norm": 1.0558606386184692,
"learning_rate": 4.77989636665087e-06,
"loss": 1.2707,
"step": 441
},
{
"epoch": 1.0045523520485584,
"grad_norm": 0.47916215658187866,
"learning_rate": 4.778622969180167e-06,
"loss": 1.2364,
"step": 442
},
{
"epoch": 1.0068285280728377,
"grad_norm": 0.5158357620239258,
"learning_rate": 4.777346069364343e-06,
"loss": 1.2421,
"step": 443
},
{
"epoch": 1.0091047040971168,
"grad_norm": 0.4970231354236603,
"learning_rate": 4.776065669166045e-06,
"loss": 1.2534,
"step": 444
},
{
"epoch": 1.0113808801213962,
"grad_norm": 0.529381513595581,
"learning_rate": 4.774781770553309e-06,
"loss": 1.2429,
"step": 445
},
{
"epoch": 1.0136570561456753,
"grad_norm": 0.5027406811714172,
"learning_rate": 4.773494375499543e-06,
"loss": 1.2427,
"step": 446
},
{
"epoch": 1.0159332321699546,
"grad_norm": 0.5164632797241211,
"learning_rate": 4.772203485983531e-06,
"loss": 1.273,
"step": 447
},
{
"epoch": 1.0182094081942337,
"grad_norm": 0.5203757882118225,
"learning_rate": 4.770909103989426e-06,
"loss": 1.2261,
"step": 448
},
{
"epoch": 1.020485584218513,
"grad_norm": 0.518552839756012,
"learning_rate": 4.769611231506753e-06,
"loss": 1.2404,
"step": 449
},
{
"epoch": 1.022761760242792,
"grad_norm": 0.5020595788955688,
"learning_rate": 4.7683098705303995e-06,
"loss": 1.2722,
"step": 450
},
{
"epoch": 1.0250379362670714,
"grad_norm": 0.508852481842041,
"learning_rate": 4.767005023060615e-06,
"loss": 1.2344,
"step": 451
},
{
"epoch": 1.0273141122913505,
"grad_norm": 0.5240857005119324,
"learning_rate": 4.765696691103008e-06,
"loss": 1.2553,
"step": 452
},
{
"epoch": 1.0295902883156298,
"grad_norm": 0.5548052787780762,
"learning_rate": 4.764384876668542e-06,
"loss": 1.3039,
"step": 453
},
{
"epoch": 1.031866464339909,
"grad_norm": 0.5021058917045593,
"learning_rate": 4.763069581773537e-06,
"loss": 1.2636,
"step": 454
},
{
"epoch": 1.0341426403641882,
"grad_norm": 0.5170218348503113,
"learning_rate": 4.761750808439658e-06,
"loss": 1.2584,
"step": 455
},
{
"epoch": 1.0364188163884673,
"grad_norm": 0.5254265069961548,
"learning_rate": 4.760428558693919e-06,
"loss": 1.2578,
"step": 456
},
{
"epoch": 1.0386949924127467,
"grad_norm": 0.5046964883804321,
"learning_rate": 4.7591028345686765e-06,
"loss": 1.253,
"step": 457
},
{
"epoch": 1.0409711684370258,
"grad_norm": 0.5212562084197998,
"learning_rate": 4.757773638101629e-06,
"loss": 1.2453,
"step": 458
},
{
"epoch": 1.043247344461305,
"grad_norm": 0.5397632718086243,
"learning_rate": 4.7564409713358075e-06,
"loss": 1.2612,
"step": 459
},
{
"epoch": 1.0455235204855842,
"grad_norm": 0.5086544752120972,
"learning_rate": 4.755104836319583e-06,
"loss": 1.27,
"step": 460
},
{
"epoch": 1.0477996965098635,
"grad_norm": 0.4974862337112427,
"learning_rate": 4.7537652351066545e-06,
"loss": 1.1955,
"step": 461
},
{
"epoch": 1.0500758725341426,
"grad_norm": 0.5382196307182312,
"learning_rate": 4.752422169756048e-06,
"loss": 1.2996,
"step": 462
},
{
"epoch": 1.052352048558422,
"grad_norm": 0.5093661546707153,
"learning_rate": 4.751075642332116e-06,
"loss": 1.2671,
"step": 463
},
{
"epoch": 1.054628224582701,
"grad_norm": 0.53044593334198,
"learning_rate": 4.749725654904529e-06,
"loss": 1.2572,
"step": 464
},
{
"epoch": 1.0569044006069803,
"grad_norm": 0.5372816920280457,
"learning_rate": 4.74837220954828e-06,
"loss": 1.2215,
"step": 465
},
{
"epoch": 1.0591805766312594,
"grad_norm": 0.5148317217826843,
"learning_rate": 4.747015308343673e-06,
"loss": 1.2636,
"step": 466
},
{
"epoch": 1.0614567526555387,
"grad_norm": 0.5267722010612488,
"learning_rate": 4.745654953376327e-06,
"loss": 1.2786,
"step": 467
},
{
"epoch": 1.0637329286798178,
"grad_norm": 0.5123690366744995,
"learning_rate": 4.744291146737169e-06,
"loss": 1.2217,
"step": 468
},
{
"epoch": 1.0660091047040972,
"grad_norm": 0.5397908687591553,
"learning_rate": 4.74292389052243e-06,
"loss": 1.2353,
"step": 469
},
{
"epoch": 1.0682852807283763,
"grad_norm": 0.5311163067817688,
"learning_rate": 4.741553186833642e-06,
"loss": 1.2307,
"step": 470
},
{
"epoch": 1.0705614567526556,
"grad_norm": 0.5108172297477722,
"learning_rate": 4.740179037777639e-06,
"loss": 1.2526,
"step": 471
},
{
"epoch": 1.0728376327769347,
"grad_norm": 0.5670639276504517,
"learning_rate": 4.7388014454665495e-06,
"loss": 1.214,
"step": 472
},
{
"epoch": 1.075113808801214,
"grad_norm": 0.5621855854988098,
"learning_rate": 4.737420412017795e-06,
"loss": 1.2202,
"step": 473
},
{
"epoch": 1.077389984825493,
"grad_norm": 0.5175919532775879,
"learning_rate": 4.736035939554084e-06,
"loss": 1.2295,
"step": 474
},
{
"epoch": 1.0796661608497724,
"grad_norm": 0.510009765625,
"learning_rate": 4.7346480302034144e-06,
"loss": 1.2489,
"step": 475
},
{
"epoch": 1.0819423368740515,
"grad_norm": 0.5198955535888672,
"learning_rate": 4.733256686099063e-06,
"loss": 1.2148,
"step": 476
},
{
"epoch": 1.0842185128983308,
"grad_norm": 0.5157918334007263,
"learning_rate": 4.731861909379588e-06,
"loss": 1.2858,
"step": 477
},
{
"epoch": 1.08649468892261,
"grad_norm": 0.5016840100288391,
"learning_rate": 4.730463702188824e-06,
"loss": 1.2137,
"step": 478
},
{
"epoch": 1.0887708649468892,
"grad_norm": 0.5427749156951904,
"learning_rate": 4.729062066675877e-06,
"loss": 1.2616,
"step": 479
},
{
"epoch": 1.0910470409711683,
"grad_norm": 0.5368303656578064,
"learning_rate": 4.727657004995124e-06,
"loss": 1.22,
"step": 480
},
{
"epoch": 1.0933232169954477,
"grad_norm": 0.5127097964286804,
"learning_rate": 4.726248519306208e-06,
"loss": 1.1953,
"step": 481
},
{
"epoch": 1.095599393019727,
"grad_norm": 0.5109656453132629,
"learning_rate": 4.724836611774032e-06,
"loss": 1.2483,
"step": 482
},
{
"epoch": 1.097875569044006,
"grad_norm": 0.5445286631584167,
"learning_rate": 4.723421284568764e-06,
"loss": 1.242,
"step": 483
},
{
"epoch": 1.1001517450682852,
"grad_norm": 0.5462026000022888,
"learning_rate": 4.722002539865823e-06,
"loss": 1.2475,
"step": 484
},
{
"epoch": 1.1024279210925645,
"grad_norm": 0.5589436292648315,
"learning_rate": 4.720580379845884e-06,
"loss": 1.2511,
"step": 485
},
{
"epoch": 1.1047040971168438,
"grad_norm": 0.5450273752212524,
"learning_rate": 4.719154806694869e-06,
"loss": 1.2843,
"step": 486
},
{
"epoch": 1.106980273141123,
"grad_norm": 0.5322884321212769,
"learning_rate": 4.717725822603948e-06,
"loss": 1.2159,
"step": 487
},
{
"epoch": 1.1092564491654022,
"grad_norm": 0.5098543763160706,
"learning_rate": 4.716293429769534e-06,
"loss": 1.2818,
"step": 488
},
{
"epoch": 1.1115326251896813,
"grad_norm": 0.5248117446899414,
"learning_rate": 4.7148576303932784e-06,
"loss": 1.2497,
"step": 489
},
{
"epoch": 1.1138088012139606,
"grad_norm": 0.5317633748054504,
"learning_rate": 4.7134184266820675e-06,
"loss": 1.2174,
"step": 490
},
{
"epoch": 1.1160849772382397,
"grad_norm": 0.5104670524597168,
"learning_rate": 4.711975820848024e-06,
"loss": 1.2492,
"step": 491
},
{
"epoch": 1.118361153262519,
"grad_norm": 0.5210446715354919,
"learning_rate": 4.710529815108496e-06,
"loss": 1.2478,
"step": 492
},
{
"epoch": 1.1206373292867982,
"grad_norm": 0.5357753038406372,
"learning_rate": 4.7090804116860574e-06,
"loss": 1.2533,
"step": 493
},
{
"epoch": 1.1229135053110775,
"grad_norm": 0.5544043779373169,
"learning_rate": 4.707627612808509e-06,
"loss": 1.2315,
"step": 494
},
{
"epoch": 1.1251896813353566,
"grad_norm": 0.5387628674507141,
"learning_rate": 4.706171420708866e-06,
"loss": 1.2492,
"step": 495
},
{
"epoch": 1.127465857359636,
"grad_norm": 0.5289620757102966,
"learning_rate": 4.704711837625361e-06,
"loss": 1.1865,
"step": 496
},
{
"epoch": 1.129742033383915,
"grad_norm": 0.5673317909240723,
"learning_rate": 4.703248865801436e-06,
"loss": 1.1963,
"step": 497
},
{
"epoch": 1.1320182094081943,
"grad_norm": 0.5180116295814514,
"learning_rate": 4.701782507485747e-06,
"loss": 1.2431,
"step": 498
},
{
"epoch": 1.1342943854324734,
"grad_norm": 0.5326710343360901,
"learning_rate": 4.700312764932151e-06,
"loss": 1.2543,
"step": 499
},
{
"epoch": 1.1365705614567527,
"grad_norm": 0.536686360836029,
"learning_rate": 4.698839640399707e-06,
"loss": 1.2664,
"step": 500
},
{
"epoch": 1.1388467374810318,
"grad_norm": 0.5708869695663452,
"learning_rate": 4.6973631361526745e-06,
"loss": 1.2445,
"step": 501
},
{
"epoch": 1.1411229135053111,
"grad_norm": 0.5445765852928162,
"learning_rate": 4.695883254460505e-06,
"loss": 1.2111,
"step": 502
},
{
"epoch": 1.1433990895295902,
"grad_norm": 0.5529754161834717,
"learning_rate": 4.6943999975978445e-06,
"loss": 1.2346,
"step": 503
},
{
"epoch": 1.1456752655538696,
"grad_norm": 0.5409250855445862,
"learning_rate": 4.692913367844523e-06,
"loss": 1.2338,
"step": 504
},
{
"epoch": 1.1479514415781487,
"grad_norm": 0.5459516644477844,
"learning_rate": 4.691423367485558e-06,
"loss": 1.2487,
"step": 505
},
{
"epoch": 1.150227617602428,
"grad_norm": 0.5377400517463684,
"learning_rate": 4.689929998811145e-06,
"loss": 1.2719,
"step": 506
},
{
"epoch": 1.152503793626707,
"grad_norm": 0.5768429636955261,
"learning_rate": 4.68843326411666e-06,
"loss": 1.2106,
"step": 507
},
{
"epoch": 1.1547799696509864,
"grad_norm": 0.5586393475532532,
"learning_rate": 4.686933165702651e-06,
"loss": 1.2469,
"step": 508
},
{
"epoch": 1.1570561456752655,
"grad_norm": 0.5209569334983826,
"learning_rate": 4.685429705874834e-06,
"loss": 1.2453,
"step": 509
},
{
"epoch": 1.1593323216995448,
"grad_norm": 0.5145371556282043,
"learning_rate": 4.6839228869440965e-06,
"loss": 1.2484,
"step": 510
},
{
"epoch": 1.161608497723824,
"grad_norm": 0.5463981032371521,
"learning_rate": 4.682412711226485e-06,
"loss": 1.2691,
"step": 511
},
{
"epoch": 1.1638846737481032,
"grad_norm": 0.5128470659255981,
"learning_rate": 4.680899181043206e-06,
"loss": 1.2579,
"step": 512
},
{
"epoch": 1.1661608497723823,
"grad_norm": 0.5277767777442932,
"learning_rate": 4.679382298720625e-06,
"loss": 1.2247,
"step": 513
},
{
"epoch": 1.1684370257966616,
"grad_norm": 0.5547785758972168,
"learning_rate": 4.6778620665902566e-06,
"loss": 1.2492,
"step": 514
},
{
"epoch": 1.1707132018209407,
"grad_norm": 0.5689957737922668,
"learning_rate": 4.676338486988765e-06,
"loss": 1.2384,
"step": 515
},
{
"epoch": 1.17298937784522,
"grad_norm": 0.5139868259429932,
"learning_rate": 4.674811562257961e-06,
"loss": 1.2562,
"step": 516
},
{
"epoch": 1.1752655538694992,
"grad_norm": 0.5729711055755615,
"learning_rate": 4.673281294744796e-06,
"loss": 1.2833,
"step": 517
},
{
"epoch": 1.1775417298937785,
"grad_norm": 0.5735371708869934,
"learning_rate": 4.671747686801358e-06,
"loss": 1.2481,
"step": 518
},
{
"epoch": 1.1798179059180576,
"grad_norm": 0.5259848833084106,
"learning_rate": 4.670210740784872e-06,
"loss": 1.2496,
"step": 519
},
{
"epoch": 1.182094081942337,
"grad_norm": 0.5374155640602112,
"learning_rate": 4.668670459057693e-06,
"loss": 1.2484,
"step": 520
},
{
"epoch": 1.184370257966616,
"grad_norm": 0.5365428328514099,
"learning_rate": 4.667126843987301e-06,
"loss": 1.2651,
"step": 521
},
{
"epoch": 1.1866464339908953,
"grad_norm": 0.5263276100158691,
"learning_rate": 4.665579897946303e-06,
"loss": 1.19,
"step": 522
},
{
"epoch": 1.1889226100151746,
"grad_norm": 0.5412886142730713,
"learning_rate": 4.664029623312422e-06,
"loss": 1.2551,
"step": 523
},
{
"epoch": 1.1911987860394537,
"grad_norm": 0.5376629829406738,
"learning_rate": 4.662476022468503e-06,
"loss": 1.2541,
"step": 524
},
{
"epoch": 1.1934749620637328,
"grad_norm": 0.5543259382247925,
"learning_rate": 4.660919097802495e-06,
"loss": 1.2745,
"step": 525
},
{
"epoch": 1.1957511380880121,
"grad_norm": 0.5453343987464905,
"learning_rate": 4.659358851707464e-06,
"loss": 1.238,
"step": 526
},
{
"epoch": 1.1980273141122915,
"grad_norm": 0.5588712692260742,
"learning_rate": 4.657795286581576e-06,
"loss": 1.1767,
"step": 527
},
{
"epoch": 1.2003034901365706,
"grad_norm": 0.5432548522949219,
"learning_rate": 4.656228404828102e-06,
"loss": 1.2243,
"step": 528
},
{
"epoch": 1.2025796661608497,
"grad_norm": 0.5616108179092407,
"learning_rate": 4.654658208855408e-06,
"loss": 1.1937,
"step": 529
},
{
"epoch": 1.204855842185129,
"grad_norm": 0.5578548908233643,
"learning_rate": 4.653084701076955e-06,
"loss": 1.2454,
"step": 530
},
{
"epoch": 1.2071320182094083,
"grad_norm": 0.5913681983947754,
"learning_rate": 4.651507883911296e-06,
"loss": 1.2717,
"step": 531
},
{
"epoch": 1.2094081942336874,
"grad_norm": 0.5625573992729187,
"learning_rate": 4.649927759782068e-06,
"loss": 1.2619,
"step": 532
},
{
"epoch": 1.2116843702579665,
"grad_norm": 0.5766717195510864,
"learning_rate": 4.648344331117992e-06,
"loss": 1.2748,
"step": 533
},
{
"epoch": 1.2139605462822458,
"grad_norm": 0.529719889163971,
"learning_rate": 4.64675760035287e-06,
"loss": 1.2443,
"step": 534
},
{
"epoch": 1.2162367223065251,
"grad_norm": 0.5937225222587585,
"learning_rate": 4.645167569925577e-06,
"loss": 1.253,
"step": 535
},
{
"epoch": 1.2185128983308042,
"grad_norm": 0.6403617262840271,
"learning_rate": 4.64357424228006e-06,
"loss": 1.1932,
"step": 536
},
{
"epoch": 1.2207890743550835,
"grad_norm": 0.5702269077301025,
"learning_rate": 4.6419776198653365e-06,
"loss": 1.2498,
"step": 537
},
{
"epoch": 1.2230652503793626,
"grad_norm": 0.5545888543128967,
"learning_rate": 4.640377705135485e-06,
"loss": 1.2517,
"step": 538
},
{
"epoch": 1.225341426403642,
"grad_norm": 0.5598457455635071,
"learning_rate": 4.638774500549645e-06,
"loss": 1.2503,
"step": 539
},
{
"epoch": 1.227617602427921,
"grad_norm": 0.5853296518325806,
"learning_rate": 4.637168008572016e-06,
"loss": 1.2418,
"step": 540
},
{
"epoch": 1.2298937784522004,
"grad_norm": 0.5423877239227295,
"learning_rate": 4.635558231671846e-06,
"loss": 1.2295,
"step": 541
},
{
"epoch": 1.2321699544764795,
"grad_norm": 0.5638657808303833,
"learning_rate": 4.633945172323434e-06,
"loss": 1.2934,
"step": 542
},
{
"epoch": 1.2344461305007588,
"grad_norm": 0.5612449645996094,
"learning_rate": 4.6323288330061244e-06,
"loss": 1.2624,
"step": 543
},
{
"epoch": 1.236722306525038,
"grad_norm": 0.5534572601318359,
"learning_rate": 4.630709216204303e-06,
"loss": 1.2488,
"step": 544
},
{
"epoch": 1.2389984825493172,
"grad_norm": 0.5525970458984375,
"learning_rate": 4.629086324407393e-06,
"loss": 1.231,
"step": 545
},
{
"epoch": 1.2412746585735963,
"grad_norm": 0.5725768804550171,
"learning_rate": 4.6274601601098505e-06,
"loss": 1.2959,
"step": 546
},
{
"epoch": 1.2435508345978756,
"grad_norm": 0.582775354385376,
"learning_rate": 4.625830725811164e-06,
"loss": 1.2554,
"step": 547
},
{
"epoch": 1.2458270106221547,
"grad_norm": 0.5522809028625488,
"learning_rate": 4.624198024015845e-06,
"loss": 1.2487,
"step": 548
},
{
"epoch": 1.248103186646434,
"grad_norm": 0.5601561069488525,
"learning_rate": 4.622562057233431e-06,
"loss": 1.2489,
"step": 549
},
{
"epoch": 1.2503793626707131,
"grad_norm": 0.5581909418106079,
"learning_rate": 4.620922827978475e-06,
"loss": 1.205,
"step": 550
},
{
"epoch": 1.2526555386949925,
"grad_norm": 0.5560769438743591,
"learning_rate": 4.619280338770545e-06,
"loss": 1.2253,
"step": 551
},
{
"epoch": 1.2549317147192716,
"grad_norm": 0.5541017651557922,
"learning_rate": 4.617634592134221e-06,
"loss": 1.2476,
"step": 552
},
{
"epoch": 1.2572078907435509,
"grad_norm": 0.5714686512947083,
"learning_rate": 4.615985590599088e-06,
"loss": 1.2274,
"step": 553
},
{
"epoch": 1.25948406676783,
"grad_norm": 0.5909372568130493,
"learning_rate": 4.6143333366997354e-06,
"loss": 1.2481,
"step": 554
},
{
"epoch": 1.2617602427921093,
"grad_norm": 0.5704237818717957,
"learning_rate": 4.612677832975751e-06,
"loss": 1.2607,
"step": 555
},
{
"epoch": 1.2640364188163884,
"grad_norm": 0.5494899749755859,
"learning_rate": 4.611019081971719e-06,
"loss": 1.2171,
"step": 556
},
{
"epoch": 1.2663125948406677,
"grad_norm": 0.5628857612609863,
"learning_rate": 4.609357086237213e-06,
"loss": 1.2185,
"step": 557
},
{
"epoch": 1.2685887708649468,
"grad_norm": 0.5746468305587769,
"learning_rate": 4.607691848326793e-06,
"loss": 1.2485,
"step": 558
},
{
"epoch": 1.2708649468892261,
"grad_norm": 0.5731273889541626,
"learning_rate": 4.606023370800006e-06,
"loss": 1.2302,
"step": 559
},
{
"epoch": 1.2731411229135052,
"grad_norm": 0.5782604217529297,
"learning_rate": 4.604351656221374e-06,
"loss": 1.2281,
"step": 560
},
{
"epoch": 1.2754172989377845,
"grad_norm": 0.5706422328948975,
"learning_rate": 4.6026767071604e-06,
"loss": 1.2145,
"step": 561
},
{
"epoch": 1.2776934749620636,
"grad_norm": 0.5888031125068665,
"learning_rate": 4.6009985261915536e-06,
"loss": 1.1982,
"step": 562
},
{
"epoch": 1.279969650986343,
"grad_norm": 0.543771505355835,
"learning_rate": 4.599317115894273e-06,
"loss": 1.2439,
"step": 563
},
{
"epoch": 1.2822458270106223,
"grad_norm": 0.5837553143501282,
"learning_rate": 4.597632478852963e-06,
"loss": 1.22,
"step": 564
},
{
"epoch": 1.2845220030349014,
"grad_norm": 0.5469195246696472,
"learning_rate": 4.595944617656984e-06,
"loss": 1.2161,
"step": 565
},
{
"epoch": 1.2867981790591805,
"grad_norm": 0.5544828772544861,
"learning_rate": 4.594253534900656e-06,
"loss": 1.22,
"step": 566
},
{
"epoch": 1.2890743550834598,
"grad_norm": 0.5594440698623657,
"learning_rate": 4.592559233183246e-06,
"loss": 1.2088,
"step": 567
},
{
"epoch": 1.2913505311077391,
"grad_norm": 0.541545569896698,
"learning_rate": 4.590861715108972e-06,
"loss": 1.2185,
"step": 568
},
{
"epoch": 1.2936267071320182,
"grad_norm": 0.5520378947257996,
"learning_rate": 4.5891609832869964e-06,
"loss": 1.2268,
"step": 569
},
{
"epoch": 1.2959028831562973,
"grad_norm": 0.5583465695381165,
"learning_rate": 4.587457040331419e-06,
"loss": 1.2225,
"step": 570
},
{
"epoch": 1.2981790591805766,
"grad_norm": 0.5398393869400024,
"learning_rate": 4.5857498888612755e-06,
"loss": 1.2479,
"step": 571
},
{
"epoch": 1.300455235204856,
"grad_norm": 0.5736100673675537,
"learning_rate": 4.584039531500535e-06,
"loss": 1.2572,
"step": 572
},
{
"epoch": 1.302731411229135,
"grad_norm": 0.5614636540412903,
"learning_rate": 4.582325970878092e-06,
"loss": 1.2221,
"step": 573
},
{
"epoch": 1.3050075872534141,
"grad_norm": 0.5580296516418457,
"learning_rate": 4.580609209627766e-06,
"loss": 1.232,
"step": 574
},
{
"epoch": 1.3072837632776935,
"grad_norm": 0.5606446266174316,
"learning_rate": 4.578889250388296e-06,
"loss": 1.2214,
"step": 575
},
{
"epoch": 1.3095599393019728,
"grad_norm": 0.5508303642272949,
"learning_rate": 4.577166095803336e-06,
"loss": 1.244,
"step": 576
},
{
"epoch": 1.3118361153262519,
"grad_norm": 0.557896614074707,
"learning_rate": 4.5754397485214505e-06,
"loss": 1.2668,
"step": 577
},
{
"epoch": 1.314112291350531,
"grad_norm": 0.5473496317863464,
"learning_rate": 4.573710211196113e-06,
"loss": 1.2265,
"step": 578
},
{
"epoch": 1.3163884673748103,
"grad_norm": 0.5576569437980652,
"learning_rate": 4.5719774864857e-06,
"loss": 1.2626,
"step": 579
},
{
"epoch": 1.3186646433990896,
"grad_norm": 0.5799663662910461,
"learning_rate": 4.570241577053486e-06,
"loss": 1.2573,
"step": 580
},
{
"epoch": 1.3209408194233687,
"grad_norm": 0.555438756942749,
"learning_rate": 4.568502485567641e-06,
"loss": 1.2775,
"step": 581
},
{
"epoch": 1.3232169954476478,
"grad_norm": 0.5486553907394409,
"learning_rate": 4.566760214701227e-06,
"loss": 1.2588,
"step": 582
},
{
"epoch": 1.3254931714719271,
"grad_norm": 0.5853822231292725,
"learning_rate": 4.565014767132191e-06,
"loss": 1.2185,
"step": 583
},
{
"epoch": 1.3277693474962065,
"grad_norm": 0.569977879524231,
"learning_rate": 4.563266145543364e-06,
"loss": 1.2387,
"step": 584
},
{
"epoch": 1.3300455235204856,
"grad_norm": 0.5845345258712769,
"learning_rate": 4.5615143526224555e-06,
"loss": 1.2935,
"step": 585
},
{
"epoch": 1.3323216995447649,
"grad_norm": 0.5513466000556946,
"learning_rate": 4.559759391062051e-06,
"loss": 1.2347,
"step": 586
},
{
"epoch": 1.334597875569044,
"grad_norm": 0.5497938990592957,
"learning_rate": 4.558001263559602e-06,
"loss": 1.2266,
"step": 587
},
{
"epoch": 1.3368740515933233,
"grad_norm": 0.5504549145698547,
"learning_rate": 4.556239972817429e-06,
"loss": 1.2535,
"step": 588
},
{
"epoch": 1.3391502276176024,
"grad_norm": 0.5670903325080872,
"learning_rate": 4.5544755215427175e-06,
"loss": 1.261,
"step": 589
},
{
"epoch": 1.3414264036418817,
"grad_norm": 0.5838532447814941,
"learning_rate": 4.552707912447504e-06,
"loss": 1.2487,
"step": 590
},
{
"epoch": 1.3437025796661608,
"grad_norm": 0.5291898250579834,
"learning_rate": 4.550937148248685e-06,
"loss": 1.2528,
"step": 591
},
{
"epoch": 1.3459787556904401,
"grad_norm": 0.5700204968452454,
"learning_rate": 4.549163231668004e-06,
"loss": 1.2657,
"step": 592
},
{
"epoch": 1.3482549317147192,
"grad_norm": 0.5522517561912537,
"learning_rate": 4.547386165432048e-06,
"loss": 1.2542,
"step": 593
},
{
"epoch": 1.3505311077389985,
"grad_norm": 0.5714395046234131,
"learning_rate": 4.545605952272249e-06,
"loss": 1.2343,
"step": 594
},
{
"epoch": 1.3528072837632776,
"grad_norm": 0.5690736174583435,
"learning_rate": 4.543822594924874e-06,
"loss": 1.2462,
"step": 595
},
{
"epoch": 1.355083459787557,
"grad_norm": 0.5521000027656555,
"learning_rate": 4.54203609613102e-06,
"loss": 1.2512,
"step": 596
},
{
"epoch": 1.357359635811836,
"grad_norm": 0.5685454607009888,
"learning_rate": 4.540246458636619e-06,
"loss": 1.2296,
"step": 597
},
{
"epoch": 1.3596358118361154,
"grad_norm": 0.5521453022956848,
"learning_rate": 4.538453685192421e-06,
"loss": 1.2533,
"step": 598
},
{
"epoch": 1.3619119878603945,
"grad_norm": 0.545840322971344,
"learning_rate": 4.536657778554e-06,
"loss": 1.2456,
"step": 599
},
{
"epoch": 1.3641881638846738,
"grad_norm": 0.5703026056289673,
"learning_rate": 4.534858741481745e-06,
"loss": 1.2293,
"step": 600
},
{
"epoch": 1.3664643399089529,
"grad_norm": 0.5508074760437012,
"learning_rate": 4.5330565767408555e-06,
"loss": 1.2657,
"step": 601
},
{
"epoch": 1.3687405159332322,
"grad_norm": 0.5637306571006775,
"learning_rate": 4.531251287101338e-06,
"loss": 1.2199,
"step": 602
},
{
"epoch": 1.3710166919575113,
"grad_norm": 0.5585516095161438,
"learning_rate": 4.529442875338005e-06,
"loss": 1.2331,
"step": 603
},
{
"epoch": 1.3732928679817906,
"grad_norm": 0.5738129019737244,
"learning_rate": 4.527631344230466e-06,
"loss": 1.215,
"step": 604
},
{
"epoch": 1.37556904400607,
"grad_norm": 0.5905203223228455,
"learning_rate": 4.525816696563123e-06,
"loss": 1.2322,
"step": 605
},
{
"epoch": 1.377845220030349,
"grad_norm": 0.5772601366043091,
"learning_rate": 4.523998935125173e-06,
"loss": 1.2344,
"step": 606
},
{
"epoch": 1.3801213960546281,
"grad_norm": 0.6194104552268982,
"learning_rate": 4.5221780627105945e-06,
"loss": 1.2647,
"step": 607
},
{
"epoch": 1.3823975720789075,
"grad_norm": 0.5779480934143066,
"learning_rate": 4.520354082118151e-06,
"loss": 1.2148,
"step": 608
},
{
"epoch": 1.3846737481031868,
"grad_norm": 0.5630953907966614,
"learning_rate": 4.518526996151381e-06,
"loss": 1.2647,
"step": 609
},
{
"epoch": 1.3869499241274659,
"grad_norm": 0.5726267099380493,
"learning_rate": 4.516696807618598e-06,
"loss": 1.2741,
"step": 610
},
{
"epoch": 1.389226100151745,
"grad_norm": 0.5838750600814819,
"learning_rate": 4.514863519332882e-06,
"loss": 1.1919,
"step": 611
},
{
"epoch": 1.3915022761760243,
"grad_norm": 0.5766186714172363,
"learning_rate": 4.5130271341120805e-06,
"loss": 1.2359,
"step": 612
},
{
"epoch": 1.3937784522003036,
"grad_norm": 0.5568646192550659,
"learning_rate": 4.511187654778798e-06,
"loss": 1.2107,
"step": 613
},
{
"epoch": 1.3960546282245827,
"grad_norm": 0.5602480173110962,
"learning_rate": 4.509345084160397e-06,
"loss": 1.2276,
"step": 614
},
{
"epoch": 1.3983308042488618,
"grad_norm": 0.5605113506317139,
"learning_rate": 4.507499425088991e-06,
"loss": 1.2259,
"step": 615
},
{
"epoch": 1.4006069802731411,
"grad_norm": 0.5589579939842224,
"learning_rate": 4.505650680401441e-06,
"loss": 1.2212,
"step": 616
},
{
"epoch": 1.4028831562974204,
"grad_norm": 0.5683750510215759,
"learning_rate": 4.503798852939347e-06,
"loss": 1.2313,
"step": 617
},
{
"epoch": 1.4051593323216995,
"grad_norm": 0.5655199885368347,
"learning_rate": 4.501943945549054e-06,
"loss": 1.2199,
"step": 618
},
{
"epoch": 1.4074355083459786,
"grad_norm": 0.5633233785629272,
"learning_rate": 4.500085961081635e-06,
"loss": 1.2305,
"step": 619
},
{
"epoch": 1.409711684370258,
"grad_norm": 0.5716864466667175,
"learning_rate": 4.498224902392896e-06,
"loss": 1.2135,
"step": 620
},
{
"epoch": 1.4119878603945373,
"grad_norm": 0.5524502992630005,
"learning_rate": 4.496360772343367e-06,
"loss": 1.221,
"step": 621
},
{
"epoch": 1.4142640364188164,
"grad_norm": 0.5607890486717224,
"learning_rate": 4.494493573798299e-06,
"loss": 1.2243,
"step": 622
},
{
"epoch": 1.4165402124430955,
"grad_norm": 0.5746079683303833,
"learning_rate": 4.49262330962766e-06,
"loss": 1.2064,
"step": 623
},
{
"epoch": 1.4188163884673748,
"grad_norm": 0.5607832670211792,
"learning_rate": 4.490749982706128e-06,
"loss": 1.2248,
"step": 624
},
{
"epoch": 1.421092564491654,
"grad_norm": 0.5688823461532593,
"learning_rate": 4.488873595913092e-06,
"loss": 1.232,
"step": 625
},
{
"epoch": 1.4233687405159332,
"grad_norm": 0.5820784568786621,
"learning_rate": 4.48699415213264e-06,
"loss": 1.2485,
"step": 626
},
{
"epoch": 1.4256449165402125,
"grad_norm": 0.56890869140625,
"learning_rate": 4.4851116542535625e-06,
"loss": 1.2286,
"step": 627
},
{
"epoch": 1.4279210925644916,
"grad_norm": 0.6012819409370422,
"learning_rate": 4.483226105169341e-06,
"loss": 1.2343,
"step": 628
},
{
"epoch": 1.430197268588771,
"grad_norm": 0.570756733417511,
"learning_rate": 4.481337507778151e-06,
"loss": 1.2447,
"step": 629
},
{
"epoch": 1.43247344461305,
"grad_norm": 0.5640760660171509,
"learning_rate": 4.47944586498285e-06,
"loss": 1.2298,
"step": 630
},
{
"epoch": 1.4347496206373294,
"grad_norm": 0.5836703777313232,
"learning_rate": 4.477551179690977e-06,
"loss": 1.2099,
"step": 631
},
{
"epoch": 1.4370257966616085,
"grad_norm": 0.5838893055915833,
"learning_rate": 4.475653454814746e-06,
"loss": 1.2437,
"step": 632
},
{
"epoch": 1.4393019726858878,
"grad_norm": 0.5973705053329468,
"learning_rate": 4.473752693271048e-06,
"loss": 1.2872,
"step": 633
},
{
"epoch": 1.4415781487101669,
"grad_norm": 0.5992927551269531,
"learning_rate": 4.471848897981437e-06,
"loss": 1.2072,
"step": 634
},
{
"epoch": 1.4438543247344462,
"grad_norm": 0.566234827041626,
"learning_rate": 4.46994207187213e-06,
"loss": 1.2181,
"step": 635
},
{
"epoch": 1.4461305007587253,
"grad_norm": 0.5693137645721436,
"learning_rate": 4.4680322178740056e-06,
"loss": 1.1862,
"step": 636
},
{
"epoch": 1.4484066767830046,
"grad_norm": 0.5798976421356201,
"learning_rate": 4.466119338922593e-06,
"loss": 1.2225,
"step": 637
},
{
"epoch": 1.4506828528072837,
"grad_norm": 0.575389564037323,
"learning_rate": 4.464203437958075e-06,
"loss": 1.2257,
"step": 638
},
{
"epoch": 1.452959028831563,
"grad_norm": 0.6053541302680969,
"learning_rate": 4.4622845179252735e-06,
"loss": 1.241,
"step": 639
},
{
"epoch": 1.4552352048558421,
"grad_norm": 0.5716749429702759,
"learning_rate": 4.460362581773656e-06,
"loss": 1.2278,
"step": 640
},
{
"epoch": 1.4575113808801214,
"grad_norm": 0.5863229036331177,
"learning_rate": 4.458437632457325e-06,
"loss": 1.2238,
"step": 641
},
{
"epoch": 1.4597875569044005,
"grad_norm": 0.6117021441459656,
"learning_rate": 4.456509672935011e-06,
"loss": 1.2318,
"step": 642
},
{
"epoch": 1.4620637329286799,
"grad_norm": 0.6031973361968994,
"learning_rate": 4.454578706170075e-06,
"loss": 1.2309,
"step": 643
},
{
"epoch": 1.464339908952959,
"grad_norm": 0.6449349522590637,
"learning_rate": 4.4526447351304995e-06,
"loss": 1.2357,
"step": 644
},
{
"epoch": 1.4666160849772383,
"grad_norm": 0.5698959231376648,
"learning_rate": 4.450707762788884e-06,
"loss": 1.2064,
"step": 645
},
{
"epoch": 1.4688922610015174,
"grad_norm": 0.6145030856132507,
"learning_rate": 4.44876779212244e-06,
"loss": 1.1837,
"step": 646
},
{
"epoch": 1.4711684370257967,
"grad_norm": 0.6202698349952698,
"learning_rate": 4.446824826112992e-06,
"loss": 1.2459,
"step": 647
},
{
"epoch": 1.4734446130500758,
"grad_norm": 0.5868430137634277,
"learning_rate": 4.444878867746962e-06,
"loss": 1.1797,
"step": 648
},
{
"epoch": 1.475720789074355,
"grad_norm": 0.6009106636047363,
"learning_rate": 4.442929920015377e-06,
"loss": 1.2008,
"step": 649
},
{
"epoch": 1.4779969650986344,
"grad_norm": 0.6000754237174988,
"learning_rate": 4.440977985913856e-06,
"loss": 1.199,
"step": 650
},
{
"epoch": 1.4802731411229135,
"grad_norm": 0.5801194310188293,
"learning_rate": 4.439023068442608e-06,
"loss": 1.2806,
"step": 651
},
{
"epoch": 1.4825493171471926,
"grad_norm": 0.6096365451812744,
"learning_rate": 4.43706517060643e-06,
"loss": 1.2434,
"step": 652
},
{
"epoch": 1.484825493171472,
"grad_norm": 0.6116917133331299,
"learning_rate": 4.435104295414697e-06,
"loss": 1.2262,
"step": 653
},
{
"epoch": 1.4871016691957513,
"grad_norm": 0.5588528513908386,
"learning_rate": 4.4331404458813615e-06,
"loss": 1.2373,
"step": 654
},
{
"epoch": 1.4893778452200304,
"grad_norm": 0.5834910869598389,
"learning_rate": 4.431173625024948e-06,
"loss": 1.2766,
"step": 655
},
{
"epoch": 1.4916540212443095,
"grad_norm": 0.623333215713501,
"learning_rate": 4.429203835868549e-06,
"loss": 1.2375,
"step": 656
},
{
"epoch": 1.4939301972685888,
"grad_norm": 0.6033525466918945,
"learning_rate": 4.427231081439817e-06,
"loss": 1.2,
"step": 657
},
{
"epoch": 1.496206373292868,
"grad_norm": 0.5829868912696838,
"learning_rate": 4.4252553647709635e-06,
"loss": 1.2349,
"step": 658
},
{
"epoch": 1.4984825493171472,
"grad_norm": 0.5703787803649902,
"learning_rate": 4.423276688898754e-06,
"loss": 1.2213,
"step": 659
},
{
"epoch": 1.5007587253414263,
"grad_norm": 0.5715304017066956,
"learning_rate": 4.421295056864501e-06,
"loss": 1.2394,
"step": 660
},
{
"epoch": 1.5030349013657056,
"grad_norm": 0.6249496340751648,
"learning_rate": 4.419310471714061e-06,
"loss": 1.2027,
"step": 661
},
{
"epoch": 1.505311077389985,
"grad_norm": 0.5828440189361572,
"learning_rate": 4.417322936497831e-06,
"loss": 1.2442,
"step": 662
},
{
"epoch": 1.507587253414264,
"grad_norm": 0.5692103505134583,
"learning_rate": 4.415332454270741e-06,
"loss": 1.1791,
"step": 663
},
{
"epoch": 1.5098634294385431,
"grad_norm": 0.595786988735199,
"learning_rate": 4.41333902809225e-06,
"loss": 1.231,
"step": 664
},
{
"epoch": 1.5121396054628224,
"grad_norm": 0.5955888032913208,
"learning_rate": 4.411342661026342e-06,
"loss": 1.2206,
"step": 665
},
{
"epoch": 1.5144157814871018,
"grad_norm": 0.582911491394043,
"learning_rate": 4.409343356141525e-06,
"loss": 1.2169,
"step": 666
},
{
"epoch": 1.5166919575113809,
"grad_norm": 0.585781455039978,
"learning_rate": 4.407341116510818e-06,
"loss": 1.2345,
"step": 667
},
{
"epoch": 1.51896813353566,
"grad_norm": 0.5766403675079346,
"learning_rate": 4.405335945211754e-06,
"loss": 1.2307,
"step": 668
},
{
"epoch": 1.5212443095599393,
"grad_norm": 0.5894457101821899,
"learning_rate": 4.4033278453263685e-06,
"loss": 1.2445,
"step": 669
},
{
"epoch": 1.5235204855842186,
"grad_norm": 0.5737869143486023,
"learning_rate": 4.401316819941203e-06,
"loss": 1.2311,
"step": 670
},
{
"epoch": 1.5257966616084977,
"grad_norm": 0.5908883213996887,
"learning_rate": 4.399302872147292e-06,
"loss": 1.2381,
"step": 671
},
{
"epoch": 1.5280728376327768,
"grad_norm": 0.6145277619361877,
"learning_rate": 4.397286005040162e-06,
"loss": 1.2394,
"step": 672
},
{
"epoch": 1.5303490136570561,
"grad_norm": 0.5731965899467468,
"learning_rate": 4.395266221719829e-06,
"loss": 1.2369,
"step": 673
},
{
"epoch": 1.5326251896813354,
"grad_norm": 0.5849004983901978,
"learning_rate": 4.3932435252907914e-06,
"loss": 1.2308,
"step": 674
},
{
"epoch": 1.5349013657056145,
"grad_norm": 0.5686678290367126,
"learning_rate": 4.391217918862021e-06,
"loss": 1.259,
"step": 675
},
{
"epoch": 1.5371775417298936,
"grad_norm": 0.580635666847229,
"learning_rate": 4.389189405546966e-06,
"loss": 1.2359,
"step": 676
},
{
"epoch": 1.539453717754173,
"grad_norm": 0.5722584128379822,
"learning_rate": 4.387157988463544e-06,
"loss": 1.231,
"step": 677
},
{
"epoch": 1.5417298937784523,
"grad_norm": 0.5868629813194275,
"learning_rate": 4.38512367073413e-06,
"loss": 1.2363,
"step": 678
},
{
"epoch": 1.5440060698027314,
"grad_norm": 0.5766255259513855,
"learning_rate": 4.383086455485564e-06,
"loss": 1.2556,
"step": 679
},
{
"epoch": 1.5462822458270105,
"grad_norm": 0.5849782824516296,
"learning_rate": 4.381046345849136e-06,
"loss": 1.2189,
"step": 680
},
{
"epoch": 1.5485584218512898,
"grad_norm": 0.6070932149887085,
"learning_rate": 4.379003344960585e-06,
"loss": 1.2351,
"step": 681
},
{
"epoch": 1.550834597875569,
"grad_norm": 0.6085125803947449,
"learning_rate": 4.376957455960094e-06,
"loss": 1.2218,
"step": 682
},
{
"epoch": 1.5531107738998484,
"grad_norm": 0.5707188844680786,
"learning_rate": 4.374908681992287e-06,
"loss": 1.2501,
"step": 683
},
{
"epoch": 1.5553869499241275,
"grad_norm": 0.6099936366081238,
"learning_rate": 4.37285702620622e-06,
"loss": 1.2436,
"step": 684
},
{
"epoch": 1.5576631259484066,
"grad_norm": 0.603273332118988,
"learning_rate": 4.37080249175538e-06,
"loss": 1.239,
"step": 685
},
{
"epoch": 1.559939301972686,
"grad_norm": 0.5822923183441162,
"learning_rate": 4.368745081797678e-06,
"loss": 1.22,
"step": 686
},
{
"epoch": 1.5622154779969653,
"grad_norm": 0.5922508835792542,
"learning_rate": 4.3666847994954445e-06,
"loss": 1.2138,
"step": 687
},
{
"epoch": 1.5644916540212443,
"grad_norm": 0.585437536239624,
"learning_rate": 4.364621648015426e-06,
"loss": 1.207,
"step": 688
},
{
"epoch": 1.5667678300455234,
"grad_norm": 0.5693568587303162,
"learning_rate": 4.362555630528776e-06,
"loss": 1.2036,
"step": 689
},
{
"epoch": 1.5690440060698028,
"grad_norm": 0.5950521230697632,
"learning_rate": 4.360486750211059e-06,
"loss": 1.2682,
"step": 690
},
{
"epoch": 1.571320182094082,
"grad_norm": 0.5919183492660522,
"learning_rate": 4.358415010242234e-06,
"loss": 1.2082,
"step": 691
},
{
"epoch": 1.5735963581183612,
"grad_norm": 0.6143742203712463,
"learning_rate": 4.356340413806658e-06,
"loss": 1.1925,
"step": 692
},
{
"epoch": 1.5758725341426403,
"grad_norm": 0.6028359532356262,
"learning_rate": 4.354262964093079e-06,
"loss": 1.2196,
"step": 693
},
{
"epoch": 1.5781487101669196,
"grad_norm": 0.6061824560165405,
"learning_rate": 4.35218266429463e-06,
"loss": 1.2266,
"step": 694
},
{
"epoch": 1.580424886191199,
"grad_norm": 0.6007355451583862,
"learning_rate": 4.3500995176088235e-06,
"loss": 1.2104,
"step": 695
},
{
"epoch": 1.582701062215478,
"grad_norm": 0.6342191100120544,
"learning_rate": 4.348013527237549e-06,
"loss": 1.2197,
"step": 696
},
{
"epoch": 1.5849772382397571,
"grad_norm": 0.5949456095695496,
"learning_rate": 4.345924696387067e-06,
"loss": 1.2258,
"step": 697
},
{
"epoch": 1.5872534142640364,
"grad_norm": 0.6161270141601562,
"learning_rate": 4.343833028268004e-06,
"loss": 1.2299,
"step": 698
},
{
"epoch": 1.5895295902883158,
"grad_norm": 0.5942959785461426,
"learning_rate": 4.341738526095348e-06,
"loss": 1.2594,
"step": 699
},
{
"epoch": 1.5918057663125948,
"grad_norm": 0.5933099389076233,
"learning_rate": 4.339641193088439e-06,
"loss": 1.1932,
"step": 700
},
{
"epoch": 1.594081942336874,
"grad_norm": 0.5857350826263428,
"learning_rate": 4.337541032470976e-06,
"loss": 1.3019,
"step": 701
},
{
"epoch": 1.5963581183611533,
"grad_norm": 0.604029655456543,
"learning_rate": 4.335438047470996e-06,
"loss": 1.2227,
"step": 702
},
{
"epoch": 1.5986342943854326,
"grad_norm": 0.5927514433860779,
"learning_rate": 4.333332241320882e-06,
"loss": 1.2742,
"step": 703
},
{
"epoch": 1.6009104704097117,
"grad_norm": 0.5811514854431152,
"learning_rate": 4.331223617257351e-06,
"loss": 1.23,
"step": 704
},
{
"epoch": 1.6031866464339908,
"grad_norm": 0.5948609709739685,
"learning_rate": 4.329112178521454e-06,
"loss": 1.2114,
"step": 705
},
{
"epoch": 1.60546282245827,
"grad_norm": 0.6194981932640076,
"learning_rate": 4.326997928358565e-06,
"loss": 1.2439,
"step": 706
},
{
"epoch": 1.6077389984825494,
"grad_norm": 0.5834797024726868,
"learning_rate": 4.324880870018382e-06,
"loss": 1.2269,
"step": 707
},
{
"epoch": 1.6100151745068285,
"grad_norm": 0.5746902823448181,
"learning_rate": 4.322761006754916e-06,
"loss": 1.2175,
"step": 708
},
{
"epoch": 1.6122913505311076,
"grad_norm": 0.6000075936317444,
"learning_rate": 4.320638341826494e-06,
"loss": 1.2316,
"step": 709
},
{
"epoch": 1.614567526555387,
"grad_norm": 0.588010311126709,
"learning_rate": 4.318512878495745e-06,
"loss": 1.245,
"step": 710
},
{
"epoch": 1.6168437025796663,
"grad_norm": 0.6053698658943176,
"learning_rate": 4.316384620029601e-06,
"loss": 1.228,
"step": 711
},
{
"epoch": 1.6191198786039454,
"grad_norm": 0.5857113599777222,
"learning_rate": 4.314253569699292e-06,
"loss": 1.2511,
"step": 712
},
{
"epoch": 1.6213960546282244,
"grad_norm": 0.5974637866020203,
"learning_rate": 4.312119730780334e-06,
"loss": 1.2377,
"step": 713
},
{
"epoch": 1.6236722306525038,
"grad_norm": 0.5964690446853638,
"learning_rate": 4.309983106552535e-06,
"loss": 1.2307,
"step": 714
},
{
"epoch": 1.625948406676783,
"grad_norm": 0.5781478881835938,
"learning_rate": 4.307843700299982e-06,
"loss": 1.2295,
"step": 715
},
{
"epoch": 1.6282245827010622,
"grad_norm": 0.597053587436676,
"learning_rate": 4.305701515311037e-06,
"loss": 1.2085,
"step": 716
},
{
"epoch": 1.6305007587253413,
"grad_norm": 0.6326000690460205,
"learning_rate": 4.303556554878333e-06,
"loss": 1.238,
"step": 717
},
{
"epoch": 1.6327769347496206,
"grad_norm": 0.6087371706962585,
"learning_rate": 4.3014088222987714e-06,
"loss": 1.2275,
"step": 718
},
{
"epoch": 1.6350531107739,
"grad_norm": 0.5937424898147583,
"learning_rate": 4.299258320873513e-06,
"loss": 1.2144,
"step": 719
},
{
"epoch": 1.637329286798179,
"grad_norm": 0.5922595262527466,
"learning_rate": 4.297105053907973e-06,
"loss": 1.2078,
"step": 720
},
{
"epoch": 1.6396054628224581,
"grad_norm": 0.603537380695343,
"learning_rate": 4.294949024711819e-06,
"loss": 1.2054,
"step": 721
},
{
"epoch": 1.6418816388467374,
"grad_norm": 0.5896364450454712,
"learning_rate": 4.2927902365989645e-06,
"loss": 1.2038,
"step": 722
},
{
"epoch": 1.6441578148710168,
"grad_norm": 0.614658534526825,
"learning_rate": 4.290628692887564e-06,
"loss": 1.2428,
"step": 723
},
{
"epoch": 1.6464339908952959,
"grad_norm": 0.5901724100112915,
"learning_rate": 4.288464396900005e-06,
"loss": 1.2464,
"step": 724
},
{
"epoch": 1.648710166919575,
"grad_norm": 0.6086544394493103,
"learning_rate": 4.286297351962908e-06,
"loss": 1.1895,
"step": 725
},
{
"epoch": 1.6509863429438543,
"grad_norm": 0.5841042399406433,
"learning_rate": 4.284127561407118e-06,
"loss": 1.2222,
"step": 726
},
{
"epoch": 1.6532625189681336,
"grad_norm": 0.5791555643081665,
"learning_rate": 4.281955028567698e-06,
"loss": 1.2489,
"step": 727
},
{
"epoch": 1.655538694992413,
"grad_norm": 0.6219162344932556,
"learning_rate": 4.27977975678393e-06,
"loss": 1.2208,
"step": 728
},
{
"epoch": 1.657814871016692,
"grad_norm": 0.597656786441803,
"learning_rate": 4.277601749399301e-06,
"loss": 1.2049,
"step": 729
},
{
"epoch": 1.660091047040971,
"grad_norm": 0.5991064310073853,
"learning_rate": 4.27542100976151e-06,
"loss": 1.2602,
"step": 730
},
{
"epoch": 1.6623672230652504,
"grad_norm": 0.5922961831092834,
"learning_rate": 4.273237541222447e-06,
"loss": 1.2077,
"step": 731
},
{
"epoch": 1.6646433990895297,
"grad_norm": 0.6028023362159729,
"learning_rate": 4.2710513471382005e-06,
"loss": 1.2092,
"step": 732
},
{
"epoch": 1.6669195751138088,
"grad_norm": 0.581685483455658,
"learning_rate": 4.268862430869052e-06,
"loss": 1.2192,
"step": 733
},
{
"epoch": 1.669195751138088,
"grad_norm": 0.6332095265388489,
"learning_rate": 4.26667079577946e-06,
"loss": 1.2573,
"step": 734
},
{
"epoch": 1.6714719271623673,
"grad_norm": 0.6062667369842529,
"learning_rate": 4.2644764452380675e-06,
"loss": 1.2994,
"step": 735
},
{
"epoch": 1.6737481031866466,
"grad_norm": 0.5829861164093018,
"learning_rate": 4.262279382617687e-06,
"loss": 1.2286,
"step": 736
},
{
"epoch": 1.6760242792109257,
"grad_norm": 0.587378203868866,
"learning_rate": 4.260079611295303e-06,
"loss": 1.182,
"step": 737
},
{
"epoch": 1.6783004552352048,
"grad_norm": 0.6240544319152832,
"learning_rate": 4.257877134652062e-06,
"loss": 1.2543,
"step": 738
},
{
"epoch": 1.680576631259484,
"grad_norm": 0.5865784287452698,
"learning_rate": 4.255671956073269e-06,
"loss": 1.2355,
"step": 739
},
{
"epoch": 1.6828528072837634,
"grad_norm": 0.5847815871238708,
"learning_rate": 4.253464078948382e-06,
"loss": 1.2069,
"step": 740
},
{
"epoch": 1.6851289833080425,
"grad_norm": 0.5941992402076721,
"learning_rate": 4.251253506671006e-06,
"loss": 1.2423,
"step": 741
},
{
"epoch": 1.6874051593323216,
"grad_norm": 0.6245031952857971,
"learning_rate": 4.249040242638889e-06,
"loss": 1.2555,
"step": 742
},
{
"epoch": 1.689681335356601,
"grad_norm": 0.6055291295051575,
"learning_rate": 4.246824290253917e-06,
"loss": 1.2261,
"step": 743
},
{
"epoch": 1.6919575113808802,
"grad_norm": 0.5905616283416748,
"learning_rate": 4.244605652922108e-06,
"loss": 1.2385,
"step": 744
},
{
"epoch": 1.6942336874051593,
"grad_norm": 0.5896965265274048,
"learning_rate": 4.2423843340536066e-06,
"loss": 1.1945,
"step": 745
},
{
"epoch": 1.6965098634294384,
"grad_norm": 0.6129325032234192,
"learning_rate": 4.240160337062678e-06,
"loss": 1.223,
"step": 746
},
{
"epoch": 1.6987860394537178,
"grad_norm": 0.5988030433654785,
"learning_rate": 4.237933665367705e-06,
"loss": 1.2197,
"step": 747
},
{
"epoch": 1.701062215477997,
"grad_norm": 0.599388837814331,
"learning_rate": 4.235704322391181e-06,
"loss": 1.2214,
"step": 748
},
{
"epoch": 1.7033383915022762,
"grad_norm": 0.6087759137153625,
"learning_rate": 4.233472311559708e-06,
"loss": 1.2302,
"step": 749
},
{
"epoch": 1.7056145675265553,
"grad_norm": 0.5895616412162781,
"learning_rate": 4.231237636303982e-06,
"loss": 1.1976,
"step": 750
},
{
"epoch": 1.7078907435508346,
"grad_norm": 0.6117663383483887,
"learning_rate": 4.229000300058802e-06,
"loss": 1.1928,
"step": 751
},
{
"epoch": 1.710166919575114,
"grad_norm": 0.5945206880569458,
"learning_rate": 4.2267603062630526e-06,
"loss": 1.201,
"step": 752
},
{
"epoch": 1.712443095599393,
"grad_norm": 0.6434623599052429,
"learning_rate": 4.224517658359704e-06,
"loss": 1.239,
"step": 753
},
{
"epoch": 1.714719271623672,
"grad_norm": 0.5895166397094727,
"learning_rate": 4.222272359795806e-06,
"loss": 1.2305,
"step": 754
},
{
"epoch": 1.7169954476479514,
"grad_norm": 0.6248841285705566,
"learning_rate": 4.220024414022482e-06,
"loss": 1.2332,
"step": 755
},
{
"epoch": 1.7192716236722307,
"grad_norm": 0.6209638118743896,
"learning_rate": 4.217773824494926e-06,
"loss": 1.2773,
"step": 756
},
{
"epoch": 1.7215477996965098,
"grad_norm": 0.5973532199859619,
"learning_rate": 4.215520594672394e-06,
"loss": 1.1992,
"step": 757
},
{
"epoch": 1.723823975720789,
"grad_norm": 0.5936313271522522,
"learning_rate": 4.2132647280182e-06,
"loss": 1.2412,
"step": 758
},
{
"epoch": 1.7261001517450683,
"grad_norm": 0.6053516268730164,
"learning_rate": 4.211006227999713e-06,
"loss": 1.2129,
"step": 759
},
{
"epoch": 1.7283763277693476,
"grad_norm": 0.6065954566001892,
"learning_rate": 4.208745098088348e-06,
"loss": 1.2395,
"step": 760
},
{
"epoch": 1.7306525037936267,
"grad_norm": 0.6134182214736938,
"learning_rate": 4.206481341759562e-06,
"loss": 1.1969,
"step": 761
},
{
"epoch": 1.7329286798179058,
"grad_norm": 0.6103958487510681,
"learning_rate": 4.204214962492849e-06,
"loss": 1.2583,
"step": 762
},
{
"epoch": 1.735204855842185,
"grad_norm": 0.6010955572128296,
"learning_rate": 4.201945963771736e-06,
"loss": 1.2638,
"step": 763
},
{
"epoch": 1.7374810318664644,
"grad_norm": 0.6201740503311157,
"learning_rate": 4.199674349083776e-06,
"loss": 1.2491,
"step": 764
},
{
"epoch": 1.7397572078907435,
"grad_norm": 0.6140694618225098,
"learning_rate": 4.197400121920539e-06,
"loss": 1.243,
"step": 765
},
{
"epoch": 1.7420333839150226,
"grad_norm": 0.6441624164581299,
"learning_rate": 4.1951232857776164e-06,
"loss": 1.2614,
"step": 766
},
{
"epoch": 1.744309559939302,
"grad_norm": 0.6050844192504883,
"learning_rate": 4.192843844154606e-06,
"loss": 1.1756,
"step": 767
},
{
"epoch": 1.7465857359635812,
"grad_norm": 0.6491802930831909,
"learning_rate": 4.190561800555111e-06,
"loss": 1.2029,
"step": 768
},
{
"epoch": 1.7488619119878603,
"grad_norm": 0.6259174942970276,
"learning_rate": 4.1882771584867345e-06,
"loss": 1.1912,
"step": 769
},
{
"epoch": 1.7511380880121397,
"grad_norm": 0.5955666303634644,
"learning_rate": 4.1859899214610735e-06,
"loss": 1.2701,
"step": 770
},
{
"epoch": 1.7534142640364188,
"grad_norm": 0.6060442924499512,
"learning_rate": 4.183700092993712e-06,
"loss": 1.2269,
"step": 771
},
{
"epoch": 1.755690440060698,
"grad_norm": 0.6210846900939941,
"learning_rate": 4.1814076766042206e-06,
"loss": 1.2679,
"step": 772
},
{
"epoch": 1.7579666160849774,
"grad_norm": 0.5922744870185852,
"learning_rate": 4.179112675816144e-06,
"loss": 1.2171,
"step": 773
},
{
"epoch": 1.7602427921092565,
"grad_norm": 0.6048167943954468,
"learning_rate": 4.176815094157e-06,
"loss": 1.1887,
"step": 774
},
{
"epoch": 1.7625189681335356,
"grad_norm": 0.6661959290504456,
"learning_rate": 4.174514935158277e-06,
"loss": 1.2439,
"step": 775
},
{
"epoch": 1.764795144157815,
"grad_norm": 0.5862908959388733,
"learning_rate": 4.172212202355419e-06,
"loss": 1.2594,
"step": 776
},
{
"epoch": 1.7670713201820942,
"grad_norm": 0.615178644657135,
"learning_rate": 4.16990689928783e-06,
"loss": 1.2137,
"step": 777
},
{
"epoch": 1.7693474962063733,
"grad_norm": 0.6170365810394287,
"learning_rate": 4.167599029498865e-06,
"loss": 1.2278,
"step": 778
},
{
"epoch": 1.7716236722306524,
"grad_norm": 0.6055428385734558,
"learning_rate": 4.165288596535821e-06,
"loss": 1.232,
"step": 779
},
{
"epoch": 1.7738998482549317,
"grad_norm": 0.6081527471542358,
"learning_rate": 4.162975603949937e-06,
"loss": 1.2392,
"step": 780
},
{
"epoch": 1.776176024279211,
"grad_norm": 0.6220976710319519,
"learning_rate": 4.160660055296385e-06,
"loss": 1.2467,
"step": 781
},
{
"epoch": 1.7784522003034902,
"grad_norm": 0.5995768904685974,
"learning_rate": 4.158341954134268e-06,
"loss": 1.2141,
"step": 782
},
{
"epoch": 1.7807283763277693,
"grad_norm": 0.5946653485298157,
"learning_rate": 4.15602130402661e-06,
"loss": 1.255,
"step": 783
},
{
"epoch": 1.7830045523520486,
"grad_norm": 0.6094076633453369,
"learning_rate": 4.1536981085403546e-06,
"loss": 1.243,
"step": 784
},
{
"epoch": 1.785280728376328,
"grad_norm": 0.6584082841873169,
"learning_rate": 4.151372371246356e-06,
"loss": 1.2382,
"step": 785
},
{
"epoch": 1.787556904400607,
"grad_norm": 0.6139714121818542,
"learning_rate": 4.149044095719377e-06,
"loss": 1.2528,
"step": 786
},
{
"epoch": 1.789833080424886,
"grad_norm": 0.6047011017799377,
"learning_rate": 4.14671328553808e-06,
"loss": 1.2034,
"step": 787
},
{
"epoch": 1.7921092564491654,
"grad_norm": 0.6093196868896484,
"learning_rate": 4.144379944285024e-06,
"loss": 1.2669,
"step": 788
},
{
"epoch": 1.7943854324734447,
"grad_norm": 0.6222574710845947,
"learning_rate": 4.142044075546658e-06,
"loss": 1.1817,
"step": 789
},
{
"epoch": 1.7966616084977238,
"grad_norm": 0.6427398920059204,
"learning_rate": 4.13970568291332e-06,
"loss": 1.2165,
"step": 790
},
{
"epoch": 1.798937784522003,
"grad_norm": 0.6227960586547852,
"learning_rate": 4.13736476997922e-06,
"loss": 1.1816,
"step": 791
},
{
"epoch": 1.8012139605462822,
"grad_norm": 0.6001450419425964,
"learning_rate": 4.135021340342446e-06,
"loss": 1.2373,
"step": 792
},
{
"epoch": 1.8034901365705616,
"grad_norm": 0.6028245091438293,
"learning_rate": 4.132675397604956e-06,
"loss": 1.2524,
"step": 793
},
{
"epoch": 1.8057663125948407,
"grad_norm": 0.5959303379058838,
"learning_rate": 4.130326945372567e-06,
"loss": 1.198,
"step": 794
},
{
"epoch": 1.8080424886191198,
"grad_norm": 0.6001620888710022,
"learning_rate": 4.127975987254955e-06,
"loss": 1.2137,
"step": 795
},
{
"epoch": 1.810318664643399,
"grad_norm": 0.5951507091522217,
"learning_rate": 4.125622526865647e-06,
"loss": 1.2285,
"step": 796
},
{
"epoch": 1.8125948406676784,
"grad_norm": 0.614658534526825,
"learning_rate": 4.123266567822017e-06,
"loss": 1.2119,
"step": 797
},
{
"epoch": 1.8148710166919575,
"grad_norm": 0.6394176483154297,
"learning_rate": 4.120908113745281e-06,
"loss": 1.2444,
"step": 798
},
{
"epoch": 1.8171471927162366,
"grad_norm": 0.5989351868629456,
"learning_rate": 4.118547168260485e-06,
"loss": 1.1838,
"step": 799
},
{
"epoch": 1.819423368740516,
"grad_norm": 0.6235303282737732,
"learning_rate": 4.11618373499651e-06,
"loss": 1.2163,
"step": 800
},
{
"epoch": 1.8216995447647952,
"grad_norm": 0.6402750015258789,
"learning_rate": 4.113817817586055e-06,
"loss": 1.2445,
"step": 801
},
{
"epoch": 1.8239757207890743,
"grad_norm": 0.5973191857337952,
"learning_rate": 4.111449419665645e-06,
"loss": 1.2308,
"step": 802
},
{
"epoch": 1.8262518968133534,
"grad_norm": 0.6300286650657654,
"learning_rate": 4.1090785448756096e-06,
"loss": 1.2319,
"step": 803
},
{
"epoch": 1.8285280728376327,
"grad_norm": 0.5970984697341919,
"learning_rate": 4.1067051968600914e-06,
"loss": 1.1944,
"step": 804
},
{
"epoch": 1.830804248861912,
"grad_norm": 0.607427179813385,
"learning_rate": 4.104329379267031e-06,
"loss": 1.2331,
"step": 805
},
{
"epoch": 1.8330804248861912,
"grad_norm": 0.6165644526481628,
"learning_rate": 4.101951095748166e-06,
"loss": 1.2337,
"step": 806
},
{
"epoch": 1.8353566009104703,
"grad_norm": 0.639166533946991,
"learning_rate": 4.099570349959025e-06,
"loss": 1.2263,
"step": 807
},
{
"epoch": 1.8376327769347496,
"grad_norm": 0.6345863342285156,
"learning_rate": 4.097187145558919e-06,
"loss": 1.2397,
"step": 808
},
{
"epoch": 1.839908952959029,
"grad_norm": 0.607635498046875,
"learning_rate": 4.094801486210941e-06,
"loss": 1.1972,
"step": 809
},
{
"epoch": 1.842185128983308,
"grad_norm": 0.6224584579467773,
"learning_rate": 4.092413375581955e-06,
"loss": 1.231,
"step": 810
},
{
"epoch": 1.844461305007587,
"grad_norm": 0.5929398536682129,
"learning_rate": 4.090022817342593e-06,
"loss": 1.2234,
"step": 811
},
{
"epoch": 1.8467374810318664,
"grad_norm": 0.6391967535018921,
"learning_rate": 4.0876298151672525e-06,
"loss": 1.1931,
"step": 812
},
{
"epoch": 1.8490136570561457,
"grad_norm": 0.599383533000946,
"learning_rate": 4.08523437273408e-06,
"loss": 1.2425,
"step": 813
},
{
"epoch": 1.851289833080425,
"grad_norm": 0.5998767614364624,
"learning_rate": 4.082836493724981e-06,
"loss": 1.2188,
"step": 814
},
{
"epoch": 1.8535660091047041,
"grad_norm": 0.5895645618438721,
"learning_rate": 4.080436181825601e-06,
"loss": 1.2286,
"step": 815
},
{
"epoch": 1.8558421851289832,
"grad_norm": 0.6172052621841431,
"learning_rate": 4.078033440725327e-06,
"loss": 1.2007,
"step": 816
},
{
"epoch": 1.8581183611532626,
"grad_norm": 0.613259494304657,
"learning_rate": 4.075628274117279e-06,
"loss": 1.2256,
"step": 817
},
{
"epoch": 1.8603945371775419,
"grad_norm": 0.6026812791824341,
"learning_rate": 4.073220685698304e-06,
"loss": 1.2317,
"step": 818
},
{
"epoch": 1.862670713201821,
"grad_norm": 0.6112560629844666,
"learning_rate": 4.070810679168975e-06,
"loss": 1.2275,
"step": 819
},
{
"epoch": 1.8649468892261,
"grad_norm": 0.6044736504554749,
"learning_rate": 4.068398258233579e-06,
"loss": 1.2515,
"step": 820
},
{
"epoch": 1.8672230652503794,
"grad_norm": 0.6291022896766663,
"learning_rate": 4.065983426600113e-06,
"loss": 1.2137,
"step": 821
},
{
"epoch": 1.8694992412746587,
"grad_norm": 0.6136301755905151,
"learning_rate": 4.063566187980282e-06,
"loss": 1.2144,
"step": 822
},
{
"epoch": 1.8717754172989378,
"grad_norm": 0.6166698932647705,
"learning_rate": 4.06114654608949e-06,
"loss": 1.2434,
"step": 823
},
{
"epoch": 1.874051593323217,
"grad_norm": 0.6023617386817932,
"learning_rate": 4.058724504646834e-06,
"loss": 1.2186,
"step": 824
},
{
"epoch": 1.8763277693474962,
"grad_norm": 0.6259661912918091,
"learning_rate": 4.0563000673751e-06,
"loss": 1.1989,
"step": 825
},
{
"epoch": 1.8786039453717756,
"grad_norm": 0.6420421004295349,
"learning_rate": 4.053873238000756e-06,
"loss": 1.1981,
"step": 826
},
{
"epoch": 1.8808801213960546,
"grad_norm": 0.6250731348991394,
"learning_rate": 4.051444020253947e-06,
"loss": 1.246,
"step": 827
},
{
"epoch": 1.8831562974203337,
"grad_norm": 0.6473506689071655,
"learning_rate": 4.0490124178684884e-06,
"loss": 1.213,
"step": 828
},
{
"epoch": 1.885432473444613,
"grad_norm": 0.6448357701301575,
"learning_rate": 4.046578434581862e-06,
"loss": 1.1696,
"step": 829
},
{
"epoch": 1.8877086494688924,
"grad_norm": 0.6176803112030029,
"learning_rate": 4.044142074135209e-06,
"loss": 1.2453,
"step": 830
},
{
"epoch": 1.8899848254931715,
"grad_norm": 0.6398005485534668,
"learning_rate": 4.0417033402733244e-06,
"loss": 1.2198,
"step": 831
},
{
"epoch": 1.8922610015174506,
"grad_norm": 0.6350208520889282,
"learning_rate": 4.03926223674465e-06,
"loss": 1.2528,
"step": 832
},
{
"epoch": 1.89453717754173,
"grad_norm": 0.5937830209732056,
"learning_rate": 4.03681876730127e-06,
"loss": 1.1594,
"step": 833
},
{
"epoch": 1.8968133535660092,
"grad_norm": 0.6130216121673584,
"learning_rate": 4.034372935698908e-06,
"loss": 1.222,
"step": 834
},
{
"epoch": 1.8990895295902883,
"grad_norm": 0.6638323664665222,
"learning_rate": 4.031924745696916e-06,
"loss": 1.2338,
"step": 835
},
{
"epoch": 1.9013657056145674,
"grad_norm": 0.6491904258728027,
"learning_rate": 4.029474201058269e-06,
"loss": 1.2219,
"step": 836
},
{
"epoch": 1.9036418816388467,
"grad_norm": 0.612301766872406,
"learning_rate": 4.027021305549565e-06,
"loss": 1.2663,
"step": 837
},
{
"epoch": 1.905918057663126,
"grad_norm": 0.6025054454803467,
"learning_rate": 4.024566062941014e-06,
"loss": 1.2264,
"step": 838
},
{
"epoch": 1.9081942336874052,
"grad_norm": 0.6344963312149048,
"learning_rate": 4.022108477006434e-06,
"loss": 1.1948,
"step": 839
},
{
"epoch": 1.9104704097116842,
"grad_norm": 0.6077335476875305,
"learning_rate": 4.019648551523243e-06,
"loss": 1.2394,
"step": 840
},
{
"epoch": 1.9127465857359636,
"grad_norm": 0.6338925361633301,
"learning_rate": 4.017186290272456e-06,
"loss": 1.2136,
"step": 841
},
{
"epoch": 1.9150227617602429,
"grad_norm": 0.6291373372077942,
"learning_rate": 4.014721697038678e-06,
"loss": 1.2374,
"step": 842
},
{
"epoch": 1.917298937784522,
"grad_norm": 0.6118108630180359,
"learning_rate": 4.0122547756101005e-06,
"loss": 1.2045,
"step": 843
},
{
"epoch": 1.919575113808801,
"grad_norm": 0.6250407695770264,
"learning_rate": 4.009785529778489e-06,
"loss": 1.2349,
"step": 844
},
{
"epoch": 1.9218512898330804,
"grad_norm": 0.6737698912620544,
"learning_rate": 4.007313963339188e-06,
"loss": 1.2334,
"step": 845
},
{
"epoch": 1.9241274658573597,
"grad_norm": 0.649118959903717,
"learning_rate": 4.004840080091103e-06,
"loss": 1.1981,
"step": 846
},
{
"epoch": 1.9264036418816388,
"grad_norm": 0.6312914490699768,
"learning_rate": 4.002363883836704e-06,
"loss": 1.2341,
"step": 847
},
{
"epoch": 1.928679817905918,
"grad_norm": 0.6146298050880432,
"learning_rate": 3.999885378382013e-06,
"loss": 1.1925,
"step": 848
},
{
"epoch": 1.9309559939301972,
"grad_norm": 0.6233289241790771,
"learning_rate": 3.997404567536606e-06,
"loss": 1.2407,
"step": 849
},
{
"epoch": 1.9332321699544766,
"grad_norm": 0.6072235107421875,
"learning_rate": 3.994921455113598e-06,
"loss": 1.2033,
"step": 850
},
{
"epoch": 1.9355083459787557,
"grad_norm": 0.6547655463218689,
"learning_rate": 3.992436044929645e-06,
"loss": 1.2368,
"step": 851
},
{
"epoch": 1.9377845220030347,
"grad_norm": 0.6056034564971924,
"learning_rate": 3.989948340804932e-06,
"loss": 1.2212,
"step": 852
},
{
"epoch": 1.940060698027314,
"grad_norm": 0.6160012483596802,
"learning_rate": 3.9874583465631725e-06,
"loss": 1.1944,
"step": 853
},
{
"epoch": 1.9423368740515934,
"grad_norm": 0.641826868057251,
"learning_rate": 3.984966066031598e-06,
"loss": 1.2499,
"step": 854
},
{
"epoch": 1.9446130500758725,
"grad_norm": 0.6412007808685303,
"learning_rate": 3.982471503040954e-06,
"loss": 1.2024,
"step": 855
},
{
"epoch": 1.9468892261001516,
"grad_norm": 0.6296584606170654,
"learning_rate": 3.979974661425497e-06,
"loss": 1.1813,
"step": 856
},
{
"epoch": 1.949165402124431,
"grad_norm": 0.6448803544044495,
"learning_rate": 3.977475545022983e-06,
"loss": 1.2672,
"step": 857
},
{
"epoch": 1.9514415781487102,
"grad_norm": 0.6320902705192566,
"learning_rate": 3.9749741576746645e-06,
"loss": 1.196,
"step": 858
},
{
"epoch": 1.9537177541729895,
"grad_norm": 0.6109302639961243,
"learning_rate": 3.972470503225285e-06,
"loss": 1.2277,
"step": 859
},
{
"epoch": 1.9559939301972686,
"grad_norm": 0.6240274310112,
"learning_rate": 3.969964585523076e-06,
"loss": 1.2625,
"step": 860
},
{
"epoch": 1.9582701062215477,
"grad_norm": 0.5958450436592102,
"learning_rate": 3.967456408419742e-06,
"loss": 1.2133,
"step": 861
},
{
"epoch": 1.960546282245827,
"grad_norm": 0.6262888312339783,
"learning_rate": 3.964945975770464e-06,
"loss": 1.2238,
"step": 862
},
{
"epoch": 1.9628224582701064,
"grad_norm": 0.6366564631462097,
"learning_rate": 3.962433291433889e-06,
"loss": 1.2372,
"step": 863
},
{
"epoch": 1.9650986342943855,
"grad_norm": 0.6750831007957458,
"learning_rate": 3.959918359272125e-06,
"loss": 1.2409,
"step": 864
},
{
"epoch": 1.9673748103186646,
"grad_norm": 0.5879358649253845,
"learning_rate": 3.957401183150734e-06,
"loss": 1.2122,
"step": 865
},
{
"epoch": 1.9696509863429439,
"grad_norm": 0.6384773254394531,
"learning_rate": 3.9548817669387295e-06,
"loss": 1.2046,
"step": 866
},
{
"epoch": 1.9719271623672232,
"grad_norm": 0.6435151100158691,
"learning_rate": 3.952360114508565e-06,
"loss": 1.2545,
"step": 867
},
{
"epoch": 1.9742033383915023,
"grad_norm": 0.6609162092208862,
"learning_rate": 3.949836229736133e-06,
"loss": 1.2548,
"step": 868
},
{
"epoch": 1.9764795144157814,
"grad_norm": 0.6402998566627502,
"learning_rate": 3.947310116500758e-06,
"loss": 1.2369,
"step": 869
},
{
"epoch": 1.9787556904400607,
"grad_norm": 0.6171389222145081,
"learning_rate": 3.944781778685189e-06,
"loss": 1.1803,
"step": 870
},
{
"epoch": 1.98103186646434,
"grad_norm": 0.6790279150009155,
"learning_rate": 3.9422512201755925e-06,
"loss": 1.2349,
"step": 871
},
{
"epoch": 1.9833080424886191,
"grad_norm": 0.636738121509552,
"learning_rate": 3.93971844486155e-06,
"loss": 1.233,
"step": 872
},
{
"epoch": 1.9855842185128982,
"grad_norm": 0.6281400918960571,
"learning_rate": 3.937183456636051e-06,
"loss": 1.1973,
"step": 873
},
{
"epoch": 1.9878603945371776,
"grad_norm": 0.6086034774780273,
"learning_rate": 3.9346462593954845e-06,
"loss": 1.2017,
"step": 874
},
{
"epoch": 1.9901365705614569,
"grad_norm": 0.6195533871650696,
"learning_rate": 3.932106857039637e-06,
"loss": 1.22,
"step": 875
},
{
"epoch": 1.992412746585736,
"grad_norm": 0.6325448155403137,
"learning_rate": 3.929565253471681e-06,
"loss": 1.2081,
"step": 876
},
{
"epoch": 1.994688922610015,
"grad_norm": 0.6466575860977173,
"learning_rate": 3.927021452598177e-06,
"loss": 1.2734,
"step": 877
},
{
"epoch": 1.9969650986342944,
"grad_norm": 0.648371160030365,
"learning_rate": 3.924475458329059e-06,
"loss": 1.2018,
"step": 878
},
{
"epoch": 1.9992412746585737,
"grad_norm": 0.6124558448791504,
"learning_rate": 3.921927274577633e-06,
"loss": 1.2244,
"step": 879
},
{
"epoch": 2.0,
"grad_norm": 0.6124558448791504,
"learning_rate": 3.919376905260575e-06,
"loss": 1.1772,
"step": 880
},
{
"epoch": 2.0022761760242793,
"grad_norm": 1.1429736614227295,
"learning_rate": 3.916824354297911e-06,
"loss": 1.208,
"step": 881
},
{
"epoch": 2.0045523520485586,
"grad_norm": 0.6282771229743958,
"learning_rate": 3.91426962561303e-06,
"loss": 1.196,
"step": 882
},
{
"epoch": 2.0068285280728375,
"grad_norm": 0.6108458042144775,
"learning_rate": 3.911712723132661e-06,
"loss": 1.2091,
"step": 883
},
{
"epoch": 2.009104704097117,
"grad_norm": 0.6176791787147522,
"learning_rate": 3.909153650786878e-06,
"loss": 1.1683,
"step": 884
},
{
"epoch": 2.011380880121396,
"grad_norm": 0.6084854006767273,
"learning_rate": 3.9065924125090905e-06,
"loss": 1.1683,
"step": 885
},
{
"epoch": 2.0136570561456755,
"grad_norm": 0.6014538407325745,
"learning_rate": 3.904029012236033e-06,
"loss": 1.2182,
"step": 886
},
{
"epoch": 2.0159332321699543,
"grad_norm": 0.6618431210517883,
"learning_rate": 3.901463453907771e-06,
"loss": 1.2022,
"step": 887
},
{
"epoch": 2.0182094081942337,
"grad_norm": 0.6439629197120667,
"learning_rate": 3.898895741467678e-06,
"loss": 1.1718,
"step": 888
},
{
"epoch": 2.020485584218513,
"grad_norm": 0.6629829406738281,
"learning_rate": 3.8963258788624425e-06,
"loss": 1.2078,
"step": 889
},
{
"epoch": 2.0227617602427923,
"grad_norm": 0.6440435647964478,
"learning_rate": 3.89375387004206e-06,
"loss": 1.226,
"step": 890
},
{
"epoch": 2.025037936267071,
"grad_norm": 0.6640979647636414,
"learning_rate": 3.891179718959822e-06,
"loss": 1.2087,
"step": 891
},
{
"epoch": 2.0273141122913505,
"grad_norm": 0.6583533883094788,
"learning_rate": 3.888603429572314e-06,
"loss": 1.2205,
"step": 892
},
{
"epoch": 2.02959028831563,
"grad_norm": 0.6385941505432129,
"learning_rate": 3.886025005839406e-06,
"loss": 1.2329,
"step": 893
},
{
"epoch": 2.031866464339909,
"grad_norm": 0.625807523727417,
"learning_rate": 3.883444451724251e-06,
"loss": 1.2047,
"step": 894
},
{
"epoch": 2.034142640364188,
"grad_norm": 0.6311827301979065,
"learning_rate": 3.8808617711932776e-06,
"loss": 1.1932,
"step": 895
},
{
"epoch": 2.0364188163884673,
"grad_norm": 0.6245951652526855,
"learning_rate": 3.878276968216178e-06,
"loss": 1.1699,
"step": 896
},
{
"epoch": 2.0386949924127467,
"grad_norm": 0.6247482895851135,
"learning_rate": 3.875690046765912e-06,
"loss": 1.1894,
"step": 897
},
{
"epoch": 2.040971168437026,
"grad_norm": 0.6321713924407959,
"learning_rate": 3.873101010818692e-06,
"loss": 1.1908,
"step": 898
},
{
"epoch": 2.043247344461305,
"grad_norm": 0.6260454058647156,
"learning_rate": 3.8705098643539825e-06,
"loss": 1.1934,
"step": 899
},
{
"epoch": 2.045523520485584,
"grad_norm": 0.6460039019584656,
"learning_rate": 3.867916611354489e-06,
"loss": 1.1728,
"step": 900
},
{
"epoch": 2.0477996965098635,
"grad_norm": 0.6572577357292175,
"learning_rate": 3.865321255806161e-06,
"loss": 1.196,
"step": 901
},
{
"epoch": 2.050075872534143,
"grad_norm": 0.6264122724533081,
"learning_rate": 3.8627238016981726e-06,
"loss": 1.1786,
"step": 902
},
{
"epoch": 2.0523520485584217,
"grad_norm": 0.6537541747093201,
"learning_rate": 3.860124253022928e-06,
"loss": 1.2392,
"step": 903
},
{
"epoch": 2.054628224582701,
"grad_norm": 0.6670436263084412,
"learning_rate": 3.857522613776048e-06,
"loss": 1.2374,
"step": 904
},
{
"epoch": 2.0569044006069803,
"grad_norm": 0.6385306715965271,
"learning_rate": 3.8549188879563685e-06,
"loss": 1.2107,
"step": 905
},
{
"epoch": 2.0591805766312596,
"grad_norm": 0.6829336881637573,
"learning_rate": 3.852313079565933e-06,
"loss": 1.1949,
"step": 906
},
{
"epoch": 2.0614567526555385,
"grad_norm": 0.6565775871276855,
"learning_rate": 3.849705192609987e-06,
"loss": 1.1741,
"step": 907
},
{
"epoch": 2.063732928679818,
"grad_norm": 0.6490259766578674,
"learning_rate": 3.847095231096965e-06,
"loss": 1.1502,
"step": 908
},
{
"epoch": 2.066009104704097,
"grad_norm": 0.666535496711731,
"learning_rate": 3.844483199038497e-06,
"loss": 1.2038,
"step": 909
},
{
"epoch": 2.0682852807283765,
"grad_norm": 0.6315106153488159,
"learning_rate": 3.841869100449392e-06,
"loss": 1.1853,
"step": 910
},
{
"epoch": 2.0705614567526553,
"grad_norm": 0.6364261507987976,
"learning_rate": 3.839252939347636e-06,
"loss": 1.2354,
"step": 911
},
{
"epoch": 2.0728376327769347,
"grad_norm": 0.6435543298721313,
"learning_rate": 3.836634719754385e-06,
"loss": 1.1969,
"step": 912
},
{
"epoch": 2.075113808801214,
"grad_norm": 0.6444733142852783,
"learning_rate": 3.834014445693961e-06,
"loss": 1.1584,
"step": 913
},
{
"epoch": 2.0773899848254933,
"grad_norm": 0.6528134942054749,
"learning_rate": 3.83139212119384e-06,
"loss": 1.2509,
"step": 914
},
{
"epoch": 2.079666160849772,
"grad_norm": 0.6634727120399475,
"learning_rate": 3.828767750284652e-06,
"loss": 1.1778,
"step": 915
},
{
"epoch": 2.0819423368740515,
"grad_norm": 0.6481142044067383,
"learning_rate": 3.826141337000173e-06,
"loss": 1.2162,
"step": 916
},
{
"epoch": 2.084218512898331,
"grad_norm": 0.6647942066192627,
"learning_rate": 3.8235128853773175e-06,
"loss": 1.2049,
"step": 917
},
{
"epoch": 2.08649468892261,
"grad_norm": 0.6879945397377014,
"learning_rate": 3.820882399456132e-06,
"loss": 1.1974,
"step": 918
},
{
"epoch": 2.088770864946889,
"grad_norm": 0.6415085196495056,
"learning_rate": 3.818249883279791e-06,
"loss": 1.1889,
"step": 919
},
{
"epoch": 2.0910470409711683,
"grad_norm": 0.6825420260429382,
"learning_rate": 3.8156153408945884e-06,
"loss": 1.1798,
"step": 920
},
{
"epoch": 2.0933232169954477,
"grad_norm": 0.672672688961029,
"learning_rate": 3.8129787763499354e-06,
"loss": 1.1754,
"step": 921
},
{
"epoch": 2.095599393019727,
"grad_norm": 0.6526525616645813,
"learning_rate": 3.810340193698348e-06,
"loss": 1.2003,
"step": 922
},
{
"epoch": 2.097875569044006,
"grad_norm": 0.6436169147491455,
"learning_rate": 3.807699596995445e-06,
"loss": 1.1869,
"step": 923
},
{
"epoch": 2.100151745068285,
"grad_norm": 0.6731212139129639,
"learning_rate": 3.805056990299942e-06,
"loss": 1.1897,
"step": 924
},
{
"epoch": 2.1024279210925645,
"grad_norm": 0.6585912108421326,
"learning_rate": 3.8024123776736433e-06,
"loss": 1.2104,
"step": 925
},
{
"epoch": 2.104704097116844,
"grad_norm": 0.6583617329597473,
"learning_rate": 3.7997657631814366e-06,
"loss": 1.2077,
"step": 926
},
{
"epoch": 2.106980273141123,
"grad_norm": 0.6234392523765564,
"learning_rate": 3.797117150891285e-06,
"loss": 1.2304,
"step": 927
},
{
"epoch": 2.109256449165402,
"grad_norm": 0.6528206467628479,
"learning_rate": 3.7944665448742257e-06,
"loss": 1.2274,
"step": 928
},
{
"epoch": 2.1115326251896813,
"grad_norm": 0.6302603483200073,
"learning_rate": 3.7918139492043572e-06,
"loss": 1.2271,
"step": 929
},
{
"epoch": 2.1138088012139606,
"grad_norm": 0.6452434659004211,
"learning_rate": 3.789159367958838e-06,
"loss": 1.1668,
"step": 930
},
{
"epoch": 2.11608497723824,
"grad_norm": 0.6380173563957214,
"learning_rate": 3.786502805217877e-06,
"loss": 1.2107,
"step": 931
},
{
"epoch": 2.118361153262519,
"grad_norm": 0.6548686027526855,
"learning_rate": 3.7838442650647307e-06,
"loss": 1.1961,
"step": 932
},
{
"epoch": 2.120637329286798,
"grad_norm": 0.6462785005569458,
"learning_rate": 3.781183751585693e-06,
"loss": 1.2093,
"step": 933
},
{
"epoch": 2.1229135053110775,
"grad_norm": 0.6643891930580139,
"learning_rate": 3.7785212688700917e-06,
"loss": 1.2116,
"step": 934
},
{
"epoch": 2.125189681335357,
"grad_norm": 0.6929313540458679,
"learning_rate": 3.775856821010282e-06,
"loss": 1.1751,
"step": 935
},
{
"epoch": 2.1274658573596357,
"grad_norm": 0.6516011357307434,
"learning_rate": 3.7731904121016394e-06,
"loss": 1.1917,
"step": 936
},
{
"epoch": 2.129742033383915,
"grad_norm": 0.6524226665496826,
"learning_rate": 3.770522046242552e-06,
"loss": 1.1569,
"step": 937
},
{
"epoch": 2.1320182094081943,
"grad_norm": 0.6462170481681824,
"learning_rate": 3.7678517275344184e-06,
"loss": 1.2077,
"step": 938
},
{
"epoch": 2.1342943854324736,
"grad_norm": 0.6451734900474548,
"learning_rate": 3.765179460081636e-06,
"loss": 1.1946,
"step": 939
},
{
"epoch": 2.1365705614567525,
"grad_norm": 0.6531258225440979,
"learning_rate": 3.762505247991601e-06,
"loss": 1.1757,
"step": 940
},
{
"epoch": 2.138846737481032,
"grad_norm": 0.6745213270187378,
"learning_rate": 3.759829095374697e-06,
"loss": 1.2196,
"step": 941
},
{
"epoch": 2.141122913505311,
"grad_norm": 0.6472035050392151,
"learning_rate": 3.7571510063442873e-06,
"loss": 1.1318,
"step": 942
},
{
"epoch": 2.1433990895295905,
"grad_norm": 0.6743549704551697,
"learning_rate": 3.754470985016716e-06,
"loss": 1.2066,
"step": 943
},
{
"epoch": 2.1456752655538693,
"grad_norm": 0.6646394729614258,
"learning_rate": 3.751789035511294e-06,
"loss": 1.2378,
"step": 944
},
{
"epoch": 2.1479514415781487,
"grad_norm": 0.6676492691040039,
"learning_rate": 3.749105161950299e-06,
"loss": 1.1922,
"step": 945
},
{
"epoch": 2.150227617602428,
"grad_norm": 0.6555543541908264,
"learning_rate": 3.7464193684589637e-06,
"loss": 1.1849,
"step": 946
},
{
"epoch": 2.1525037936267073,
"grad_norm": 0.6590687036514282,
"learning_rate": 3.7437316591654726e-06,
"loss": 1.2332,
"step": 947
},
{
"epoch": 2.154779969650986,
"grad_norm": 0.645908534526825,
"learning_rate": 3.7410420382009544e-06,
"loss": 1.2452,
"step": 948
},
{
"epoch": 2.1570561456752655,
"grad_norm": 0.6459996104240417,
"learning_rate": 3.7383505096994764e-06,
"loss": 1.2464,
"step": 949
},
{
"epoch": 2.159332321699545,
"grad_norm": 0.6533696055412292,
"learning_rate": 3.7356570777980377e-06,
"loss": 1.1695,
"step": 950
},
{
"epoch": 2.161608497723824,
"grad_norm": 0.6279348731040955,
"learning_rate": 3.7329617466365648e-06,
"loss": 1.2133,
"step": 951
},
{
"epoch": 2.163884673748103,
"grad_norm": 0.6500206589698792,
"learning_rate": 3.7302645203579004e-06,
"loss": 1.2656,
"step": 952
},
{
"epoch": 2.1661608497723823,
"grad_norm": 0.6392419338226318,
"learning_rate": 3.727565403107801e-06,
"loss": 1.1728,
"step": 953
},
{
"epoch": 2.1684370257966616,
"grad_norm": 0.6631274819374084,
"learning_rate": 3.724864399034932e-06,
"loss": 1.213,
"step": 954
},
{
"epoch": 2.170713201820941,
"grad_norm": 0.6563039422035217,
"learning_rate": 3.7221615122908566e-06,
"loss": 1.203,
"step": 955
},
{
"epoch": 2.17298937784522,
"grad_norm": 0.6851363778114319,
"learning_rate": 3.719456747030032e-06,
"loss": 1.2404,
"step": 956
},
{
"epoch": 2.175265553869499,
"grad_norm": 0.6711891293525696,
"learning_rate": 3.7167501074098023e-06,
"loss": 1.2512,
"step": 957
},
{
"epoch": 2.1775417298937785,
"grad_norm": 0.6652558445930481,
"learning_rate": 3.714041597590394e-06,
"loss": 1.1676,
"step": 958
},
{
"epoch": 2.179817905918058,
"grad_norm": 0.6488142609596252,
"learning_rate": 3.711331221734908e-06,
"loss": 1.1614,
"step": 959
},
{
"epoch": 2.1820940819423367,
"grad_norm": 0.6525776386260986,
"learning_rate": 3.7086189840093125e-06,
"loss": 1.2314,
"step": 960
},
{
"epoch": 2.184370257966616,
"grad_norm": 0.6627135276794434,
"learning_rate": 3.7059048885824367e-06,
"loss": 1.2194,
"step": 961
},
{
"epoch": 2.1866464339908953,
"grad_norm": 0.6578177213668823,
"learning_rate": 3.703188939625968e-06,
"loss": 1.2638,
"step": 962
},
{
"epoch": 2.1889226100151746,
"grad_norm": 0.6674039363861084,
"learning_rate": 3.7004711413144404e-06,
"loss": 1.2127,
"step": 963
},
{
"epoch": 2.191198786039454,
"grad_norm": 0.6704514026641846,
"learning_rate": 3.697751497825231e-06,
"loss": 1.1941,
"step": 964
},
{
"epoch": 2.193474962063733,
"grad_norm": 0.6485816836357117,
"learning_rate": 3.6950300133385524e-06,
"loss": 1.2305,
"step": 965
},
{
"epoch": 2.195751138088012,
"grad_norm": 0.6731365323066711,
"learning_rate": 3.6923066920374494e-06,
"loss": 1.2207,
"step": 966
},
{
"epoch": 2.1980273141122915,
"grad_norm": 0.6393440961837769,
"learning_rate": 3.6895815381077874e-06,
"loss": 1.2081,
"step": 967
},
{
"epoch": 2.2003034901365703,
"grad_norm": 0.6894279718399048,
"learning_rate": 3.686854555738249e-06,
"loss": 1.1939,
"step": 968
},
{
"epoch": 2.2025796661608497,
"grad_norm": 0.6538242101669312,
"learning_rate": 3.684125749120329e-06,
"loss": 1.2074,
"step": 969
},
{
"epoch": 2.204855842185129,
"grad_norm": 0.6664542555809021,
"learning_rate": 3.6813951224483226e-06,
"loss": 1.1853,
"step": 970
},
{
"epoch": 2.2071320182094083,
"grad_norm": 0.68585205078125,
"learning_rate": 3.678662679919327e-06,
"loss": 1.2169,
"step": 971
},
{
"epoch": 2.2094081942336876,
"grad_norm": 0.6920559406280518,
"learning_rate": 3.675928425733227e-06,
"loss": 1.2321,
"step": 972
},
{
"epoch": 2.2116843702579665,
"grad_norm": 0.7139000296592712,
"learning_rate": 3.6731923640926943e-06,
"loss": 1.2052,
"step": 973
},
{
"epoch": 2.213960546282246,
"grad_norm": 0.6535520553588867,
"learning_rate": 3.6704544992031766e-06,
"loss": 1.2275,
"step": 974
},
{
"epoch": 2.216236722306525,
"grad_norm": 0.6742737889289856,
"learning_rate": 3.6677148352728947e-06,
"loss": 1.2093,
"step": 975
},
{
"epoch": 2.2185128983308045,
"grad_norm": 0.6573116779327393,
"learning_rate": 3.6649733765128344e-06,
"loss": 1.1972,
"step": 976
},
{
"epoch": 2.2207890743550833,
"grad_norm": 0.6521129012107849,
"learning_rate": 3.66223012713674e-06,
"loss": 1.1748,
"step": 977
},
{
"epoch": 2.2230652503793626,
"grad_norm": 0.6640632152557373,
"learning_rate": 3.6594850913611085e-06,
"loss": 1.205,
"step": 978
},
{
"epoch": 2.225341426403642,
"grad_norm": 0.660221517086029,
"learning_rate": 3.6567382734051815e-06,
"loss": 1.2089,
"step": 979
},
{
"epoch": 2.2276176024279213,
"grad_norm": 0.6484681963920593,
"learning_rate": 3.6539896774909405e-06,
"loss": 1.2025,
"step": 980
},
{
"epoch": 2.2298937784522,
"grad_norm": 0.6685246825218201,
"learning_rate": 3.6512393078431013e-06,
"loss": 1.1783,
"step": 981
},
{
"epoch": 2.2321699544764795,
"grad_norm": 0.6591317057609558,
"learning_rate": 3.6484871686891044e-06,
"loss": 1.2257,
"step": 982
},
{
"epoch": 2.234446130500759,
"grad_norm": 0.6399511694908142,
"learning_rate": 3.645733264259109e-06,
"loss": 1.1855,
"step": 983
},
{
"epoch": 2.236722306525038,
"grad_norm": 0.6378699541091919,
"learning_rate": 3.642977598785991e-06,
"loss": 1.1868,
"step": 984
},
{
"epoch": 2.238998482549317,
"grad_norm": 0.6714856624603271,
"learning_rate": 3.6402201765053295e-06,
"loss": 1.2395,
"step": 985
},
{
"epoch": 2.2412746585735963,
"grad_norm": 0.6694827675819397,
"learning_rate": 3.6374610016554068e-06,
"loss": 1.1766,
"step": 986
},
{
"epoch": 2.2435508345978756,
"grad_norm": 0.6676952838897705,
"learning_rate": 3.634700078477197e-06,
"loss": 1.2046,
"step": 987
},
{
"epoch": 2.245827010622155,
"grad_norm": 0.6898479461669922,
"learning_rate": 3.6319374112143618e-06,
"loss": 1.1922,
"step": 988
},
{
"epoch": 2.248103186646434,
"grad_norm": 0.6739006042480469,
"learning_rate": 3.629173004113245e-06,
"loss": 1.2118,
"step": 989
},
{
"epoch": 2.250379362670713,
"grad_norm": 0.6676629185676575,
"learning_rate": 3.6264068614228625e-06,
"loss": 1.2002,
"step": 990
},
{
"epoch": 2.2526555386949925,
"grad_norm": 0.6608707308769226,
"learning_rate": 3.6236389873948995e-06,
"loss": 1.2316,
"step": 991
},
{
"epoch": 2.254931714719272,
"grad_norm": 0.6895280480384827,
"learning_rate": 3.6208693862837023e-06,
"loss": 1.2418,
"step": 992
},
{
"epoch": 2.2572078907435507,
"grad_norm": 0.6558012962341309,
"learning_rate": 3.618098062346271e-06,
"loss": 1.2236,
"step": 993
},
{
"epoch": 2.25948406676783,
"grad_norm": 0.6760514974594116,
"learning_rate": 3.615325019842253e-06,
"loss": 1.1848,
"step": 994
},
{
"epoch": 2.2617602427921093,
"grad_norm": 0.6816234588623047,
"learning_rate": 3.61255026303394e-06,
"loss": 1.1821,
"step": 995
},
{
"epoch": 2.2640364188163886,
"grad_norm": 0.6997912526130676,
"learning_rate": 3.609773796186256e-06,
"loss": 1.1973,
"step": 996
},
{
"epoch": 2.2663125948406675,
"grad_norm": 0.6629183888435364,
"learning_rate": 3.6069956235667547e-06,
"loss": 1.2428,
"step": 997
},
{
"epoch": 2.268588770864947,
"grad_norm": 0.66066575050354,
"learning_rate": 3.604215749445611e-06,
"loss": 1.2153,
"step": 998
},
{
"epoch": 2.270864946889226,
"grad_norm": 0.6838124394416809,
"learning_rate": 3.6014341780956157e-06,
"loss": 1.1784,
"step": 999
},
{
"epoch": 2.2731411229135055,
"grad_norm": 0.6711928248405457,
"learning_rate": 3.5986509137921677e-06,
"loss": 1.183,
"step": 1000
},
{
"epoch": 2.2754172989377848,
"grad_norm": 0.6433883905410767,
"learning_rate": 3.595865960813269e-06,
"loss": 1.2432,
"step": 1001
},
{
"epoch": 2.2776934749620636,
"grad_norm": 0.6601680517196655,
"learning_rate": 3.5930793234395157e-06,
"loss": 1.1752,
"step": 1002
},
{
"epoch": 2.279969650986343,
"grad_norm": 0.6449770927429199,
"learning_rate": 3.590291005954094e-06,
"loss": 1.1987,
"step": 1003
},
{
"epoch": 2.2822458270106223,
"grad_norm": 0.7027525901794434,
"learning_rate": 3.5875010126427733e-06,
"loss": 1.1809,
"step": 1004
},
{
"epoch": 2.284522003034901,
"grad_norm": 0.675951361656189,
"learning_rate": 3.5847093477938955e-06,
"loss": 1.1955,
"step": 1005
},
{
"epoch": 2.2867981790591805,
"grad_norm": 0.656053900718689,
"learning_rate": 3.581916015698376e-06,
"loss": 1.2256,
"step": 1006
},
{
"epoch": 2.28907435508346,
"grad_norm": 0.6874861121177673,
"learning_rate": 3.5791210206496897e-06,
"loss": 1.1917,
"step": 1007
},
{
"epoch": 2.291350531107739,
"grad_norm": 0.6393834948539734,
"learning_rate": 3.5763243669438696e-06,
"loss": 1.1689,
"step": 1008
},
{
"epoch": 2.2936267071320184,
"grad_norm": 0.6576155424118042,
"learning_rate": 3.5735260588794955e-06,
"loss": 1.1979,
"step": 1009
},
{
"epoch": 2.2959028831562973,
"grad_norm": 0.6721700429916382,
"learning_rate": 3.570726100757693e-06,
"loss": 1.1886,
"step": 1010
},
{
"epoch": 2.2981790591805766,
"grad_norm": 0.7089741826057434,
"learning_rate": 3.5679244968821235e-06,
"loss": 1.1678,
"step": 1011
},
{
"epoch": 2.300455235204856,
"grad_norm": 0.6565976142883301,
"learning_rate": 3.565121251558975e-06,
"loss": 1.1886,
"step": 1012
},
{
"epoch": 2.302731411229135,
"grad_norm": 0.6577697992324829,
"learning_rate": 3.562316369096962e-06,
"loss": 1.2083,
"step": 1013
},
{
"epoch": 2.305007587253414,
"grad_norm": 0.6564366817474365,
"learning_rate": 3.559509853807313e-06,
"loss": 1.2012,
"step": 1014
},
{
"epoch": 2.3072837632776935,
"grad_norm": 0.6902772784233093,
"learning_rate": 3.5567017100037683e-06,
"loss": 1.1863,
"step": 1015
},
{
"epoch": 2.309559939301973,
"grad_norm": 0.6742022633552551,
"learning_rate": 3.553891942002569e-06,
"loss": 1.1892,
"step": 1016
},
{
"epoch": 2.311836115326252,
"grad_norm": 0.6750596761703491,
"learning_rate": 3.5510805541224536e-06,
"loss": 1.1811,
"step": 1017
},
{
"epoch": 2.314112291350531,
"grad_norm": 0.695951521396637,
"learning_rate": 3.5482675506846527e-06,
"loss": 1.2489,
"step": 1018
},
{
"epoch": 2.3163884673748103,
"grad_norm": 0.6974602937698364,
"learning_rate": 3.5454529360128763e-06,
"loss": 1.2016,
"step": 1019
},
{
"epoch": 2.3186646433990896,
"grad_norm": 0.7286347150802612,
"learning_rate": 3.542636714433312e-06,
"loss": 1.2065,
"step": 1020
},
{
"epoch": 2.3209408194233685,
"grad_norm": 0.6884462833404541,
"learning_rate": 3.53981889027462e-06,
"loss": 1.2105,
"step": 1021
},
{
"epoch": 2.323216995447648,
"grad_norm": 0.6842535138130188,
"learning_rate": 3.536999467867921e-06,
"loss": 1.1876,
"step": 1022
},
{
"epoch": 2.325493171471927,
"grad_norm": 0.7068074941635132,
"learning_rate": 3.5341784515467926e-06,
"loss": 1.2378,
"step": 1023
},
{
"epoch": 2.3277693474962065,
"grad_norm": 0.6900534629821777,
"learning_rate": 3.5313558456472623e-06,
"loss": 1.2192,
"step": 1024
},
{
"epoch": 2.3300455235204858,
"grad_norm": 0.6720012426376343,
"learning_rate": 3.5285316545078018e-06,
"loss": 1.1843,
"step": 1025
},
{
"epoch": 2.3323216995447646,
"grad_norm": 0.6527561545372009,
"learning_rate": 3.5257058824693197e-06,
"loss": 1.2391,
"step": 1026
},
{
"epoch": 2.334597875569044,
"grad_norm": 0.6607952117919922,
"learning_rate": 3.5228785338751525e-06,
"loss": 1.1932,
"step": 1027
},
{
"epoch": 2.3368740515933233,
"grad_norm": 0.7056044340133667,
"learning_rate": 3.5200496130710606e-06,
"loss": 1.1677,
"step": 1028
},
{
"epoch": 2.3391502276176026,
"grad_norm": 0.6829009056091309,
"learning_rate": 3.517219124405222e-06,
"loss": 1.2209,
"step": 1029
},
{
"epoch": 2.3414264036418815,
"grad_norm": 0.6884670853614807,
"learning_rate": 3.5143870722282257e-06,
"loss": 1.1822,
"step": 1030
},
{
"epoch": 2.343702579666161,
"grad_norm": 0.6838653683662415,
"learning_rate": 3.511553460893059e-06,
"loss": 1.2193,
"step": 1031
},
{
"epoch": 2.34597875569044,
"grad_norm": 0.6836718320846558,
"learning_rate": 3.5087182947551113e-06,
"loss": 1.2061,
"step": 1032
},
{
"epoch": 2.3482549317147194,
"grad_norm": 0.6770094037055969,
"learning_rate": 3.505881578172159e-06,
"loss": 1.2141,
"step": 1033
},
{
"epoch": 2.3505311077389983,
"grad_norm": 0.7328886389732361,
"learning_rate": 3.503043315504361e-06,
"loss": 1.2023,
"step": 1034
},
{
"epoch": 2.3528072837632776,
"grad_norm": 0.7042023539543152,
"learning_rate": 3.5002035111142543e-06,
"loss": 1.2014,
"step": 1035
},
{
"epoch": 2.355083459787557,
"grad_norm": 0.7104170918464661,
"learning_rate": 3.4973621693667446e-06,
"loss": 1.2263,
"step": 1036
},
{
"epoch": 2.3573596358118363,
"grad_norm": 0.6627414226531982,
"learning_rate": 3.4945192946291016e-06,
"loss": 1.2012,
"step": 1037
},
{
"epoch": 2.359635811836115,
"grad_norm": 0.6971149444580078,
"learning_rate": 3.4916748912709506e-06,
"loss": 1.2168,
"step": 1038
},
{
"epoch": 2.3619119878603945,
"grad_norm": 0.6787693500518799,
"learning_rate": 3.4888289636642645e-06,
"loss": 1.2049,
"step": 1039
},
{
"epoch": 2.364188163884674,
"grad_norm": 0.6599656939506531,
"learning_rate": 3.4859815161833636e-06,
"loss": 1.2252,
"step": 1040
},
{
"epoch": 2.366464339908953,
"grad_norm": 0.6828228235244751,
"learning_rate": 3.4831325532049e-06,
"loss": 1.228,
"step": 1041
},
{
"epoch": 2.368740515933232,
"grad_norm": 0.6853594779968262,
"learning_rate": 3.480282079107857e-06,
"loss": 1.1806,
"step": 1042
},
{
"epoch": 2.3710166919575113,
"grad_norm": 0.6828693747520447,
"learning_rate": 3.477430098273541e-06,
"loss": 1.1818,
"step": 1043
},
{
"epoch": 2.3732928679817906,
"grad_norm": 0.6563565731048584,
"learning_rate": 3.4745766150855738e-06,
"loss": 1.2072,
"step": 1044
},
{
"epoch": 2.37556904400607,
"grad_norm": 0.7111157774925232,
"learning_rate": 3.4717216339298852e-06,
"loss": 1.2176,
"step": 1045
},
{
"epoch": 2.3778452200303493,
"grad_norm": 0.6725479364395142,
"learning_rate": 3.46886515919471e-06,
"loss": 1.1899,
"step": 1046
},
{
"epoch": 2.380121396054628,
"grad_norm": 0.7368158102035522,
"learning_rate": 3.4660071952705752e-06,
"loss": 1.1762,
"step": 1047
},
{
"epoch": 2.3823975720789075,
"grad_norm": 0.6941075325012207,
"learning_rate": 3.4631477465503018e-06,
"loss": 1.202,
"step": 1048
},
{
"epoch": 2.3846737481031868,
"grad_norm": 0.6628252863883972,
"learning_rate": 3.460286817428987e-06,
"loss": 1.1803,
"step": 1049
},
{
"epoch": 2.3869499241274656,
"grad_norm": 0.689207911491394,
"learning_rate": 3.4574244123040083e-06,
"loss": 1.2108,
"step": 1050
},
{
"epoch": 2.389226100151745,
"grad_norm": 0.7419899702072144,
"learning_rate": 3.4545605355750096e-06,
"loss": 1.2356,
"step": 1051
},
{
"epoch": 2.3915022761760243,
"grad_norm": 0.7288059592247009,
"learning_rate": 3.4516951916438974e-06,
"loss": 1.2054,
"step": 1052
},
{
"epoch": 2.3937784522003036,
"grad_norm": 0.7065550088882446,
"learning_rate": 3.4488283849148324e-06,
"loss": 1.1689,
"step": 1053
},
{
"epoch": 2.396054628224583,
"grad_norm": 0.7047455906867981,
"learning_rate": 3.445960119794225e-06,
"loss": 1.2038,
"step": 1054
},
{
"epoch": 2.398330804248862,
"grad_norm": 0.6833023428916931,
"learning_rate": 3.443090400690726e-06,
"loss": 1.1766,
"step": 1055
},
{
"epoch": 2.400606980273141,
"grad_norm": 0.6780257821083069,
"learning_rate": 3.440219232015222e-06,
"loss": 1.151,
"step": 1056
},
{
"epoch": 2.4028831562974204,
"grad_norm": 0.6979094743728638,
"learning_rate": 3.4373466181808284e-06,
"loss": 1.2191,
"step": 1057
},
{
"epoch": 2.4051593323216993,
"grad_norm": 0.6790235042572021,
"learning_rate": 3.4344725636028787e-06,
"loss": 1.1576,
"step": 1058
},
{
"epoch": 2.4074355083459786,
"grad_norm": 0.6650685667991638,
"learning_rate": 3.4315970726989244e-06,
"loss": 1.1853,
"step": 1059
},
{
"epoch": 2.409711684370258,
"grad_norm": 0.7322258949279785,
"learning_rate": 3.428720149888723e-06,
"loss": 1.1959,
"step": 1060
},
{
"epoch": 2.4119878603945373,
"grad_norm": 0.7185081243515015,
"learning_rate": 3.425841799594233e-06,
"loss": 1.1897,
"step": 1061
},
{
"epoch": 2.4142640364188166,
"grad_norm": 0.7351638674736023,
"learning_rate": 3.4229620262396063e-06,
"loss": 1.2453,
"step": 1062
},
{
"epoch": 2.4165402124430955,
"grad_norm": 0.6832559108734131,
"learning_rate": 3.4200808342511845e-06,
"loss": 1.1628,
"step": 1063
},
{
"epoch": 2.418816388467375,
"grad_norm": 0.7146325707435608,
"learning_rate": 3.4171982280574877e-06,
"loss": 1.1572,
"step": 1064
},
{
"epoch": 2.421092564491654,
"grad_norm": 0.6700774431228638,
"learning_rate": 3.414314212089209e-06,
"loss": 1.2444,
"step": 1065
},
{
"epoch": 2.423368740515933,
"grad_norm": 0.7039579749107361,
"learning_rate": 3.4114287907792115e-06,
"loss": 1.1903,
"step": 1066
},
{
"epoch": 2.4256449165402123,
"grad_norm": 0.6913965940475464,
"learning_rate": 3.4085419685625153e-06,
"loss": 1.1349,
"step": 1067
},
{
"epoch": 2.4279210925644916,
"grad_norm": 0.6725445985794067,
"learning_rate": 3.4056537498762955e-06,
"loss": 1.1814,
"step": 1068
},
{
"epoch": 2.430197268588771,
"grad_norm": 0.6761681437492371,
"learning_rate": 3.402764139159872e-06,
"loss": 1.1526,
"step": 1069
},
{
"epoch": 2.4324734446130503,
"grad_norm": 0.6802586913108826,
"learning_rate": 3.3998731408547065e-06,
"loss": 1.2111,
"step": 1070
},
{
"epoch": 2.434749620637329,
"grad_norm": 0.7151522040367126,
"learning_rate": 3.3969807594043913e-06,
"loss": 1.1904,
"step": 1071
},
{
"epoch": 2.4370257966616085,
"grad_norm": 0.6683356761932373,
"learning_rate": 3.3940869992546467e-06,
"loss": 1.2203,
"step": 1072
},
{
"epoch": 2.4393019726858878,
"grad_norm": 0.7200412154197693,
"learning_rate": 3.3911918648533094e-06,
"loss": 1.2061,
"step": 1073
},
{
"epoch": 2.441578148710167,
"grad_norm": 0.694735050201416,
"learning_rate": 3.3882953606503323e-06,
"loss": 1.2051,
"step": 1074
},
{
"epoch": 2.443854324734446,
"grad_norm": 0.7197138071060181,
"learning_rate": 3.3853974910977706e-06,
"loss": 1.2354,
"step": 1075
},
{
"epoch": 2.4461305007587253,
"grad_norm": 0.6728242039680481,
"learning_rate": 3.382498260649778e-06,
"loss": 1.1792,
"step": 1076
},
{
"epoch": 2.4484066767830046,
"grad_norm": 0.6801380515098572,
"learning_rate": 3.3795976737626025e-06,
"loss": 1.167,
"step": 1077
},
{
"epoch": 2.450682852807284,
"grad_norm": 0.7056938409805298,
"learning_rate": 3.376695734894575e-06,
"loss": 1.1945,
"step": 1078
},
{
"epoch": 2.452959028831563,
"grad_norm": 0.6955033540725708,
"learning_rate": 3.3737924485061046e-06,
"loss": 1.1787,
"step": 1079
},
{
"epoch": 2.455235204855842,
"grad_norm": 0.6794352531433105,
"learning_rate": 3.3708878190596724e-06,
"loss": 1.1848,
"step": 1080
},
{
"epoch": 2.4575113808801214,
"grad_norm": 0.7221025824546814,
"learning_rate": 3.3679818510198224e-06,
"loss": 1.2013,
"step": 1081
},
{
"epoch": 2.4597875569044008,
"grad_norm": 0.670121431350708,
"learning_rate": 3.3650745488531593e-06,
"loss": 1.1737,
"step": 1082
},
{
"epoch": 2.4620637329286796,
"grad_norm": 0.7007501125335693,
"learning_rate": 3.362165917028334e-06,
"loss": 1.2041,
"step": 1083
},
{
"epoch": 2.464339908952959,
"grad_norm": 0.7078006863594055,
"learning_rate": 3.3592559600160446e-06,
"loss": 1.1799,
"step": 1084
},
{
"epoch": 2.4666160849772383,
"grad_norm": 0.6946449875831604,
"learning_rate": 3.3563446822890246e-06,
"loss": 1.213,
"step": 1085
},
{
"epoch": 2.4688922610015176,
"grad_norm": 0.7082563042640686,
"learning_rate": 3.3534320883220367e-06,
"loss": 1.1749,
"step": 1086
},
{
"epoch": 2.4711684370257965,
"grad_norm": 0.6920600533485413,
"learning_rate": 3.3505181825918685e-06,
"loss": 1.2008,
"step": 1087
},
{
"epoch": 2.473444613050076,
"grad_norm": 0.6847231984138489,
"learning_rate": 3.347602969577323e-06,
"loss": 1.1531,
"step": 1088
},
{
"epoch": 2.475720789074355,
"grad_norm": 0.6741018295288086,
"learning_rate": 3.344686453759213e-06,
"loss": 1.2028,
"step": 1089
},
{
"epoch": 2.4779969650986344,
"grad_norm": 0.6896122694015503,
"learning_rate": 3.341768639620353e-06,
"loss": 1.1821,
"step": 1090
},
{
"epoch": 2.4802731411229137,
"grad_norm": 0.6965914368629456,
"learning_rate": 3.3388495316455525e-06,
"loss": 1.1665,
"step": 1091
},
{
"epoch": 2.4825493171471926,
"grad_norm": 0.6966100335121155,
"learning_rate": 3.3359291343216126e-06,
"loss": 1.2321,
"step": 1092
},
{
"epoch": 2.484825493171472,
"grad_norm": 0.6773708462715149,
"learning_rate": 3.3330074521373134e-06,
"loss": 1.1853,
"step": 1093
},
{
"epoch": 2.4871016691957513,
"grad_norm": 0.7116764187812805,
"learning_rate": 3.33008448958341e-06,
"loss": 1.2027,
"step": 1094
},
{
"epoch": 2.48937784522003,
"grad_norm": 0.6633743643760681,
"learning_rate": 3.327160251152627e-06,
"loss": 1.2257,
"step": 1095
},
{
"epoch": 2.4916540212443095,
"grad_norm": 0.6858770251274109,
"learning_rate": 3.3242347413396488e-06,
"loss": 1.1807,
"step": 1096
},
{
"epoch": 2.4939301972685888,
"grad_norm": 0.680118978023529,
"learning_rate": 3.321307964641115e-06,
"loss": 1.2371,
"step": 1097
},
{
"epoch": 2.496206373292868,
"grad_norm": 0.6915070414543152,
"learning_rate": 3.3183799255556115e-06,
"loss": 1.1813,
"step": 1098
},
{
"epoch": 2.4984825493171474,
"grad_norm": 0.6780185103416443,
"learning_rate": 3.3154506285836645e-06,
"loss": 1.1874,
"step": 1099
},
{
"epoch": 2.5007587253414263,
"grad_norm": 0.6911877393722534,
"learning_rate": 3.3125200782277356e-06,
"loss": 1.2274,
"step": 1100
},
{
"epoch": 2.5030349013657056,
"grad_norm": 0.6855583786964417,
"learning_rate": 3.3095882789922102e-06,
"loss": 1.1905,
"step": 1101
},
{
"epoch": 2.505311077389985,
"grad_norm": 0.7394909858703613,
"learning_rate": 3.306655235383394e-06,
"loss": 1.1557,
"step": 1102
},
{
"epoch": 2.507587253414264,
"grad_norm": 0.6894367933273315,
"learning_rate": 3.3037209519095072e-06,
"loss": 1.2127,
"step": 1103
},
{
"epoch": 2.509863429438543,
"grad_norm": 0.7138856649398804,
"learning_rate": 3.3007854330806733e-06,
"loss": 1.1866,
"step": 1104
},
{
"epoch": 2.5121396054628224,
"grad_norm": 0.7123029232025146,
"learning_rate": 3.2978486834089163e-06,
"loss": 1.1866,
"step": 1105
},
{
"epoch": 2.5144157814871018,
"grad_norm": 0.7103905081748962,
"learning_rate": 3.294910707408151e-06,
"loss": 1.2229,
"step": 1106
},
{
"epoch": 2.516691957511381,
"grad_norm": 0.7092157602310181,
"learning_rate": 3.2919715095941774e-06,
"loss": 1.1981,
"step": 1107
},
{
"epoch": 2.51896813353566,
"grad_norm": 0.6840993762016296,
"learning_rate": 3.289031094484675e-06,
"loss": 1.1945,
"step": 1108
},
{
"epoch": 2.5212443095599393,
"grad_norm": 0.6992108225822449,
"learning_rate": 3.286089466599191e-06,
"loss": 1.1687,
"step": 1109
},
{
"epoch": 2.5235204855842186,
"grad_norm": 0.7055697441101074,
"learning_rate": 3.2831466304591396e-06,
"loss": 1.2066,
"step": 1110
},
{
"epoch": 2.5257966616084975,
"grad_norm": 0.6737038493156433,
"learning_rate": 3.2802025905877916e-06,
"loss": 1.1869,
"step": 1111
},
{
"epoch": 2.528072837632777,
"grad_norm": 0.74284428358078,
"learning_rate": 3.277257351510267e-06,
"loss": 1.2072,
"step": 1112
},
{
"epoch": 2.530349013657056,
"grad_norm": 0.6852964758872986,
"learning_rate": 3.2743109177535292e-06,
"loss": 1.2403,
"step": 1113
},
{
"epoch": 2.5326251896813354,
"grad_norm": 0.712960958480835,
"learning_rate": 3.2713632938463785e-06,
"loss": 1.1688,
"step": 1114
},
{
"epoch": 2.5349013657056148,
"grad_norm": 0.6892920136451721,
"learning_rate": 3.2684144843194453e-06,
"loss": 1.202,
"step": 1115
},
{
"epoch": 2.5371775417298936,
"grad_norm": 0.6760216355323792,
"learning_rate": 3.265464493705181e-06,
"loss": 1.2104,
"step": 1116
},
{
"epoch": 2.539453717754173,
"grad_norm": 0.687100887298584,
"learning_rate": 3.262513326537852e-06,
"loss": 1.1846,
"step": 1117
},
{
"epoch": 2.5417298937784523,
"grad_norm": 0.6958955526351929,
"learning_rate": 3.2595609873535335e-06,
"loss": 1.2094,
"step": 1118
},
{
"epoch": 2.544006069802731,
"grad_norm": 0.6893970966339111,
"learning_rate": 3.256607480690104e-06,
"loss": 1.186,
"step": 1119
},
{
"epoch": 2.5462822458270105,
"grad_norm": 0.6832887530326843,
"learning_rate": 3.253652811087234e-06,
"loss": 1.2228,
"step": 1120
},
{
"epoch": 2.54855842185129,
"grad_norm": 0.6801987886428833,
"learning_rate": 3.2506969830863824e-06,
"loss": 1.1701,
"step": 1121
},
{
"epoch": 2.550834597875569,
"grad_norm": 0.6851920485496521,
"learning_rate": 3.2477400012307885e-06,
"loss": 1.1747,
"step": 1122
},
{
"epoch": 2.5531107738998484,
"grad_norm": 0.701210618019104,
"learning_rate": 3.2447818700654667e-06,
"loss": 1.2172,
"step": 1123
},
{
"epoch": 2.5553869499241273,
"grad_norm": 0.7184703350067139,
"learning_rate": 3.2418225941371957e-06,
"loss": 1.1676,
"step": 1124
},
{
"epoch": 2.5576631259484066,
"grad_norm": 0.6979882717132568,
"learning_rate": 3.2388621779945136e-06,
"loss": 1.204,
"step": 1125
},
{
"epoch": 2.559939301972686,
"grad_norm": 0.7246853113174438,
"learning_rate": 3.235900626187713e-06,
"loss": 1.2352,
"step": 1126
},
{
"epoch": 2.5622154779969653,
"grad_norm": 0.7014902234077454,
"learning_rate": 3.2329379432688314e-06,
"loss": 1.2407,
"step": 1127
},
{
"epoch": 2.5644916540212446,
"grad_norm": 0.6897662281990051,
"learning_rate": 3.229974133791643e-06,
"loss": 1.2321,
"step": 1128
},
{
"epoch": 2.5667678300455234,
"grad_norm": 0.6940773725509644,
"learning_rate": 3.2270092023116564e-06,
"loss": 1.217,
"step": 1129
},
{
"epoch": 2.5690440060698028,
"grad_norm": 0.7043965458869934,
"learning_rate": 3.224043153386104e-06,
"loss": 1.1847,
"step": 1130
},
{
"epoch": 2.571320182094082,
"grad_norm": 0.7059136033058167,
"learning_rate": 3.221075991573935e-06,
"loss": 1.1589,
"step": 1131
},
{
"epoch": 2.573596358118361,
"grad_norm": 0.6794278621673584,
"learning_rate": 3.218107721435808e-06,
"loss": 1.179,
"step": 1132
},
{
"epoch": 2.5758725341426403,
"grad_norm": 0.6917217373847961,
"learning_rate": 3.2151383475340875e-06,
"loss": 1.2174,
"step": 1133
},
{
"epoch": 2.5781487101669196,
"grad_norm": 0.7160817384719849,
"learning_rate": 3.2121678744328343e-06,
"loss": 1.1958,
"step": 1134
},
{
"epoch": 2.580424886191199,
"grad_norm": 0.6953707933425903,
"learning_rate": 3.209196306697798e-06,
"loss": 1.2311,
"step": 1135
},
{
"epoch": 2.5827010622154782,
"grad_norm": 0.6894403100013733,
"learning_rate": 3.206223648896409e-06,
"loss": 1.1963,
"step": 1136
},
{
"epoch": 2.584977238239757,
"grad_norm": 0.7261834144592285,
"learning_rate": 3.203249905597777e-06,
"loss": 1.2048,
"step": 1137
},
{
"epoch": 2.5872534142640364,
"grad_norm": 0.7032581567764282,
"learning_rate": 3.2002750813726774e-06,
"loss": 1.2023,
"step": 1138
},
{
"epoch": 2.5895295902883158,
"grad_norm": 0.6896634697914124,
"learning_rate": 3.1972991807935473e-06,
"loss": 1.2132,
"step": 1139
},
{
"epoch": 2.5918057663125946,
"grad_norm": 0.7110108733177185,
"learning_rate": 3.194322208434478e-06,
"loss": 1.1932,
"step": 1140
},
{
"epoch": 2.594081942336874,
"grad_norm": 0.7088103294372559,
"learning_rate": 3.191344168871211e-06,
"loss": 1.1936,
"step": 1141
},
{
"epoch": 2.5963581183611533,
"grad_norm": 0.6900802254676819,
"learning_rate": 3.1883650666811237e-06,
"loss": 1.1975,
"step": 1142
},
{
"epoch": 2.5986342943854326,
"grad_norm": 0.7168719172477722,
"learning_rate": 3.1853849064432296e-06,
"loss": 1.181,
"step": 1143
},
{
"epoch": 2.600910470409712,
"grad_norm": 0.6837049722671509,
"learning_rate": 3.182403692738168e-06,
"loss": 1.175,
"step": 1144
},
{
"epoch": 2.603186646433991,
"grad_norm": 0.6712286472320557,
"learning_rate": 3.1794214301481978e-06,
"loss": 1.1729,
"step": 1145
},
{
"epoch": 2.60546282245827,
"grad_norm": 0.6919139623641968,
"learning_rate": 3.1764381232571894e-06,
"loss": 1.1975,
"step": 1146
},
{
"epoch": 2.6077389984825494,
"grad_norm": 0.6942136883735657,
"learning_rate": 3.173453776650618e-06,
"loss": 1.2079,
"step": 1147
},
{
"epoch": 2.6100151745068283,
"grad_norm": 0.7067781090736389,
"learning_rate": 3.170468394915558e-06,
"loss": 1.1622,
"step": 1148
},
{
"epoch": 2.6122913505311076,
"grad_norm": 0.6851193308830261,
"learning_rate": 3.1674819826406744e-06,
"loss": 1.1951,
"step": 1149
},
{
"epoch": 2.614567526555387,
"grad_norm": 0.6892246603965759,
"learning_rate": 3.1644945444162155e-06,
"loss": 1.1746,
"step": 1150
},
{
"epoch": 2.6168437025796663,
"grad_norm": 0.7149601578712463,
"learning_rate": 3.1615060848340066e-06,
"loss": 1.1893,
"step": 1151
},
{
"epoch": 2.6191198786039456,
"grad_norm": 0.6909357905387878,
"learning_rate": 3.1585166084874446e-06,
"loss": 1.1756,
"step": 1152
},
{
"epoch": 2.6213960546282244,
"grad_norm": 0.7025142312049866,
"learning_rate": 3.155526119971488e-06,
"loss": 1.226,
"step": 1153
},
{
"epoch": 2.6236722306525038,
"grad_norm": 0.7248009443283081,
"learning_rate": 3.15253462388265e-06,
"loss": 1.1613,
"step": 1154
},
{
"epoch": 2.625948406676783,
"grad_norm": 0.7198726534843445,
"learning_rate": 3.149542124818993e-06,
"loss": 1.2107,
"step": 1155
},
{
"epoch": 2.628224582701062,
"grad_norm": 0.70361328125,
"learning_rate": 3.146548627380124e-06,
"loss": 1.148,
"step": 1156
},
{
"epoch": 2.6305007587253413,
"grad_norm": 0.6757749319076538,
"learning_rate": 3.14355413616718e-06,
"loss": 1.1859,
"step": 1157
},
{
"epoch": 2.6327769347496206,
"grad_norm": 0.7123044729232788,
"learning_rate": 3.1405586557828275e-06,
"loss": 1.2212,
"step": 1158
},
{
"epoch": 2.6350531107739,
"grad_norm": 0.7022401690483093,
"learning_rate": 3.137562190831255e-06,
"loss": 1.1943,
"step": 1159
},
{
"epoch": 2.6373292867981792,
"grad_norm": 0.7225694060325623,
"learning_rate": 3.134564745918161e-06,
"loss": 1.2306,
"step": 1160
},
{
"epoch": 2.639605462822458,
"grad_norm": 0.7538037896156311,
"learning_rate": 3.1315663256507533e-06,
"loss": 1.1613,
"step": 1161
},
{
"epoch": 2.6418816388467374,
"grad_norm": 0.7213900685310364,
"learning_rate": 3.1285669346377363e-06,
"loss": 1.1686,
"step": 1162
},
{
"epoch": 2.6441578148710168,
"grad_norm": 0.6754977107048035,
"learning_rate": 3.1255665774893085e-06,
"loss": 1.2004,
"step": 1163
},
{
"epoch": 2.6464339908952956,
"grad_norm": 0.6743507981300354,
"learning_rate": 3.1225652588171534e-06,
"loss": 1.2097,
"step": 1164
},
{
"epoch": 2.648710166919575,
"grad_norm": 0.7086999416351318,
"learning_rate": 3.119562983234431e-06,
"loss": 1.1811,
"step": 1165
},
{
"epoch": 2.6509863429438543,
"grad_norm": 0.7122178077697754,
"learning_rate": 3.116559755355772e-06,
"loss": 1.1792,
"step": 1166
},
{
"epoch": 2.6532625189681336,
"grad_norm": 0.7318682670593262,
"learning_rate": 3.1135555797972715e-06,
"loss": 1.1885,
"step": 1167
},
{
"epoch": 2.655538694992413,
"grad_norm": 0.7025822997093201,
"learning_rate": 3.110550461176484e-06,
"loss": 1.1924,
"step": 1168
},
{
"epoch": 2.657814871016692,
"grad_norm": 0.683168351650238,
"learning_rate": 3.1075444041124077e-06,
"loss": 1.1656,
"step": 1169
},
{
"epoch": 2.660091047040971,
"grad_norm": 0.7369166016578674,
"learning_rate": 3.1045374132254875e-06,
"loss": 1.2286,
"step": 1170
},
{
"epoch": 2.6623672230652504,
"grad_norm": 0.7125474214553833,
"learning_rate": 3.1015294931376035e-06,
"loss": 1.2138,
"step": 1171
},
{
"epoch": 2.6646433990895297,
"grad_norm": 0.7190746068954468,
"learning_rate": 3.0985206484720616e-06,
"loss": 1.1815,
"step": 1172
},
{
"epoch": 2.666919575113809,
"grad_norm": 0.6940246224403381,
"learning_rate": 3.09551088385359e-06,
"loss": 1.2445,
"step": 1173
},
{
"epoch": 2.669195751138088,
"grad_norm": 0.7135853171348572,
"learning_rate": 3.092500203908332e-06,
"loss": 1.1845,
"step": 1174
},
{
"epoch": 2.6714719271623673,
"grad_norm": 0.6984366774559021,
"learning_rate": 3.0894886132638375e-06,
"loss": 1.186,
"step": 1175
},
{
"epoch": 2.6737481031866466,
"grad_norm": 0.7473638653755188,
"learning_rate": 3.0864761165490546e-06,
"loss": 1.1662,
"step": 1176
},
{
"epoch": 2.6760242792109254,
"grad_norm": 0.6937222480773926,
"learning_rate": 3.0834627183943255e-06,
"loss": 1.1268,
"step": 1177
},
{
"epoch": 2.6783004552352048,
"grad_norm": 0.6648399829864502,
"learning_rate": 3.080448423431377e-06,
"loss": 1.1714,
"step": 1178
},
{
"epoch": 2.680576631259484,
"grad_norm": 0.7059394121170044,
"learning_rate": 3.0774332362933163e-06,
"loss": 1.1788,
"step": 1179
},
{
"epoch": 2.6828528072837634,
"grad_norm": 0.7164271473884583,
"learning_rate": 3.074417161614619e-06,
"loss": 1.1699,
"step": 1180
},
{
"epoch": 2.6851289833080427,
"grad_norm": 0.6942355632781982,
"learning_rate": 3.071400204031127e-06,
"loss": 1.2212,
"step": 1181
},
{
"epoch": 2.6874051593323216,
"grad_norm": 0.7147754430770874,
"learning_rate": 3.0683823681800382e-06,
"loss": 1.1895,
"step": 1182
},
{
"epoch": 2.689681335356601,
"grad_norm": 0.7072470188140869,
"learning_rate": 3.0653636586999025e-06,
"loss": 1.1963,
"step": 1183
},
{
"epoch": 2.6919575113808802,
"grad_norm": 0.7091546654701233,
"learning_rate": 3.0623440802306087e-06,
"loss": 1.167,
"step": 1184
},
{
"epoch": 2.694233687405159,
"grad_norm": 0.698569655418396,
"learning_rate": 3.059323637413385e-06,
"loss": 1.166,
"step": 1185
},
{
"epoch": 2.6965098634294384,
"grad_norm": 0.7070002555847168,
"learning_rate": 3.056302334890786e-06,
"loss": 1.1733,
"step": 1186
},
{
"epoch": 2.6987860394537178,
"grad_norm": 0.7015774250030518,
"learning_rate": 3.05328017730669e-06,
"loss": 1.2012,
"step": 1187
},
{
"epoch": 2.701062215477997,
"grad_norm": 0.6821005344390869,
"learning_rate": 3.0502571693062856e-06,
"loss": 1.1639,
"step": 1188
},
{
"epoch": 2.7033383915022764,
"grad_norm": 0.7214947938919067,
"learning_rate": 3.0472333155360724e-06,
"loss": 1.185,
"step": 1189
},
{
"epoch": 2.7056145675265553,
"grad_norm": 0.703359842300415,
"learning_rate": 3.0442086206438483e-06,
"loss": 1.1825,
"step": 1190
},
{
"epoch": 2.7078907435508346,
"grad_norm": 0.6933659911155701,
"learning_rate": 3.041183089278704e-06,
"loss": 1.1595,
"step": 1191
},
{
"epoch": 2.710166919575114,
"grad_norm": 0.7080999612808228,
"learning_rate": 3.0381567260910166e-06,
"loss": 1.2293,
"step": 1192
},
{
"epoch": 2.712443095599393,
"grad_norm": 0.6948418021202087,
"learning_rate": 3.0351295357324405e-06,
"loss": 1.2046,
"step": 1193
},
{
"epoch": 2.714719271623672,
"grad_norm": 0.7300384044647217,
"learning_rate": 3.0321015228559035e-06,
"loss": 1.2105,
"step": 1194
},
{
"epoch": 2.7169954476479514,
"grad_norm": 0.6972754597663879,
"learning_rate": 3.0290726921155954e-06,
"loss": 1.1983,
"step": 1195
},
{
"epoch": 2.7192716236722307,
"grad_norm": 0.6930494904518127,
"learning_rate": 3.026043048166964e-06,
"loss": 1.1745,
"step": 1196
},
{
"epoch": 2.72154779969651,
"grad_norm": 0.7130087018013,
"learning_rate": 3.023012595666708e-06,
"loss": 1.2083,
"step": 1197
},
{
"epoch": 2.723823975720789,
"grad_norm": 0.6889429688453674,
"learning_rate": 3.019981339272768e-06,
"loss": 1.2154,
"step": 1198
},
{
"epoch": 2.7261001517450683,
"grad_norm": 0.7289022207260132,
"learning_rate": 3.016949283644319e-06,
"loss": 1.2305,
"step": 1199
},
{
"epoch": 2.7283763277693476,
"grad_norm": 0.7369369864463806,
"learning_rate": 3.0139164334417665e-06,
"loss": 1.1922,
"step": 1200
},
{
"epoch": 2.7306525037936265,
"grad_norm": 0.6780570149421692,
"learning_rate": 3.010882793326737e-06,
"loss": 1.2169,
"step": 1201
},
{
"epoch": 2.7329286798179058,
"grad_norm": 0.70427405834198,
"learning_rate": 3.0078483679620706e-06,
"loss": 1.1819,
"step": 1202
},
{
"epoch": 2.735204855842185,
"grad_norm": 0.7048825621604919,
"learning_rate": 3.0048131620118137e-06,
"loss": 1.1639,
"step": 1203
},
{
"epoch": 2.7374810318664644,
"grad_norm": 0.7250698804855347,
"learning_rate": 3.001777180141213e-06,
"loss": 1.2162,
"step": 1204
},
{
"epoch": 2.7397572078907437,
"grad_norm": 0.715922474861145,
"learning_rate": 2.99874042701671e-06,
"loss": 1.209,
"step": 1205
},
{
"epoch": 2.7420333839150226,
"grad_norm": 0.6971619129180908,
"learning_rate": 2.9957029073059276e-06,
"loss": 1.1778,
"step": 1206
},
{
"epoch": 2.744309559939302,
"grad_norm": 0.6936290264129639,
"learning_rate": 2.992664625677669e-06,
"loss": 1.2263,
"step": 1207
},
{
"epoch": 2.7465857359635812,
"grad_norm": 0.7032736539840698,
"learning_rate": 2.9896255868019102e-06,
"loss": 1.2502,
"step": 1208
},
{
"epoch": 2.74886191198786,
"grad_norm": 0.6784445643424988,
"learning_rate": 2.9865857953497896e-06,
"loss": 1.1699,
"step": 1209
},
{
"epoch": 2.75113808801214,
"grad_norm": 0.7215850949287415,
"learning_rate": 2.9835452559935996e-06,
"loss": 1.189,
"step": 1210
},
{
"epoch": 2.7534142640364188,
"grad_norm": 0.7106872797012329,
"learning_rate": 2.9805039734067863e-06,
"loss": 1.2023,
"step": 1211
},
{
"epoch": 2.755690440060698,
"grad_norm": 0.6902872920036316,
"learning_rate": 2.977461952263938e-06,
"loss": 1.1879,
"step": 1212
},
{
"epoch": 2.7579666160849774,
"grad_norm": 0.722634494304657,
"learning_rate": 2.9744191972407754e-06,
"loss": 1.1981,
"step": 1213
},
{
"epoch": 2.7602427921092563,
"grad_norm": 0.7154293656349182,
"learning_rate": 2.9713757130141483e-06,
"loss": 1.2136,
"step": 1214
},
{
"epoch": 2.7625189681335356,
"grad_norm": 0.7049538493156433,
"learning_rate": 2.968331504262028e-06,
"loss": 1.2065,
"step": 1215
},
{
"epoch": 2.764795144157815,
"grad_norm": 0.7349944114685059,
"learning_rate": 2.9652865756635007e-06,
"loss": 1.2482,
"step": 1216
},
{
"epoch": 2.7670713201820942,
"grad_norm": 0.728399932384491,
"learning_rate": 2.9622409318987554e-06,
"loss": 1.1997,
"step": 1217
},
{
"epoch": 2.7693474962063735,
"grad_norm": 0.6748265624046326,
"learning_rate": 2.959194577649083e-06,
"loss": 1.1792,
"step": 1218
},
{
"epoch": 2.7716236722306524,
"grad_norm": 0.7254298329353333,
"learning_rate": 2.9561475175968663e-06,
"loss": 1.1635,
"step": 1219
},
{
"epoch": 2.7738998482549317,
"grad_norm": 0.7145636081695557,
"learning_rate": 2.9530997564255728e-06,
"loss": 1.1879,
"step": 1220
},
{
"epoch": 2.776176024279211,
"grad_norm": 0.7130534648895264,
"learning_rate": 2.950051298819746e-06,
"loss": 1.2206,
"step": 1221
},
{
"epoch": 2.77845220030349,
"grad_norm": 0.724371612071991,
"learning_rate": 2.9470021494650016e-06,
"loss": 1.1766,
"step": 1222
},
{
"epoch": 2.7807283763277693,
"grad_norm": 0.7017263770103455,
"learning_rate": 2.9439523130480185e-06,
"loss": 1.1888,
"step": 1223
},
{
"epoch": 2.7830045523520486,
"grad_norm": 0.715480625629425,
"learning_rate": 2.940901794256533e-06,
"loss": 1.2084,
"step": 1224
},
{
"epoch": 2.785280728376328,
"grad_norm": 0.7027157545089722,
"learning_rate": 2.9378505977793246e-06,
"loss": 1.18,
"step": 1225
},
{
"epoch": 2.787556904400607,
"grad_norm": 0.7351143956184387,
"learning_rate": 2.9347987283062213e-06,
"loss": 1.1632,
"step": 1226
},
{
"epoch": 2.789833080424886,
"grad_norm": 0.710259735584259,
"learning_rate": 2.931746190528082e-06,
"loss": 1.1886,
"step": 1227
},
{
"epoch": 2.7921092564491654,
"grad_norm": 0.7170918583869934,
"learning_rate": 2.9286929891367936e-06,
"loss": 1.1326,
"step": 1228
},
{
"epoch": 2.7943854324734447,
"grad_norm": 0.7224164605140686,
"learning_rate": 2.9256391288252617e-06,
"loss": 1.1565,
"step": 1229
},
{
"epoch": 2.7966616084977236,
"grad_norm": 0.7351495027542114,
"learning_rate": 2.9225846142874064e-06,
"loss": 1.1525,
"step": 1230
},
{
"epoch": 2.798937784522003,
"grad_norm": 0.7327077984809875,
"learning_rate": 2.919529450218154e-06,
"loss": 1.2468,
"step": 1231
},
{
"epoch": 2.8012139605462822,
"grad_norm": 0.756344199180603,
"learning_rate": 2.9164736413134263e-06,
"loss": 1.2027,
"step": 1232
},
{
"epoch": 2.8034901365705616,
"grad_norm": 0.7047960162162781,
"learning_rate": 2.9134171922701383e-06,
"loss": 1.1763,
"step": 1233
},
{
"epoch": 2.805766312594841,
"grad_norm": 0.6881158947944641,
"learning_rate": 2.9103601077861875e-06,
"loss": 1.1651,
"step": 1234
},
{
"epoch": 2.8080424886191198,
"grad_norm": 0.707214891910553,
"learning_rate": 2.907302392560452e-06,
"loss": 1.207,
"step": 1235
},
{
"epoch": 2.810318664643399,
"grad_norm": 0.7242740988731384,
"learning_rate": 2.904244051292774e-06,
"loss": 1.2092,
"step": 1236
},
{
"epoch": 2.8125948406676784,
"grad_norm": 0.7232706546783447,
"learning_rate": 2.9011850886839604e-06,
"loss": 1.154,
"step": 1237
},
{
"epoch": 2.8148710166919573,
"grad_norm": 0.7449567317962646,
"learning_rate": 2.8981255094357742e-06,
"loss": 1.1846,
"step": 1238
},
{
"epoch": 2.8171471927162366,
"grad_norm": 0.6920654773712158,
"learning_rate": 2.8950653182509253e-06,
"loss": 1.1999,
"step": 1239
},
{
"epoch": 2.819423368740516,
"grad_norm": 0.676856517791748,
"learning_rate": 2.892004519833063e-06,
"loss": 1.2061,
"step": 1240
},
{
"epoch": 2.8216995447647952,
"grad_norm": 0.7137316465377808,
"learning_rate": 2.888943118886771e-06,
"loss": 1.2316,
"step": 1241
},
{
"epoch": 2.8239757207890746,
"grad_norm": 0.7022238373756409,
"learning_rate": 2.88588112011756e-06,
"loss": 1.2096,
"step": 1242
},
{
"epoch": 2.8262518968133534,
"grad_norm": 0.7406110167503357,
"learning_rate": 2.8828185282318588e-06,
"loss": 1.2334,
"step": 1243
},
{
"epoch": 2.8285280728376327,
"grad_norm": 0.7122722268104553,
"learning_rate": 2.879755347937006e-06,
"loss": 1.1957,
"step": 1244
},
{
"epoch": 2.830804248861912,
"grad_norm": 0.7112712860107422,
"learning_rate": 2.876691583941248e-06,
"loss": 1.1985,
"step": 1245
},
{
"epoch": 2.833080424886191,
"grad_norm": 0.7014644145965576,
"learning_rate": 2.8736272409537257e-06,
"loss": 1.2052,
"step": 1246
},
{
"epoch": 2.8353566009104703,
"grad_norm": 0.7228798866271973,
"learning_rate": 2.870562323684473e-06,
"loss": 1.1795,
"step": 1247
},
{
"epoch": 2.8376327769347496,
"grad_norm": 0.7157189846038818,
"learning_rate": 2.8674968368444004e-06,
"loss": 1.22,
"step": 1248
},
{
"epoch": 2.839908952959029,
"grad_norm": 0.7256055474281311,
"learning_rate": 2.864430785145301e-06,
"loss": 1.1724,
"step": 1249
},
{
"epoch": 2.842185128983308,
"grad_norm": 0.6792826056480408,
"learning_rate": 2.8613641732998338e-06,
"loss": 1.1944,
"step": 1250
},
{
"epoch": 2.844461305007587,
"grad_norm": 0.743437647819519,
"learning_rate": 2.858297006021515e-06,
"loss": 1.2023,
"step": 1251
},
{
"epoch": 2.8467374810318664,
"grad_norm": 0.6970618367195129,
"learning_rate": 2.855229288024719e-06,
"loss": 1.1884,
"step": 1252
},
{
"epoch": 2.8490136570561457,
"grad_norm": 0.734380304813385,
"learning_rate": 2.8521610240246657e-06,
"loss": 1.1759,
"step": 1253
},
{
"epoch": 2.851289833080425,
"grad_norm": 0.7149617671966553,
"learning_rate": 2.8490922187374132e-06,
"loss": 1.1868,
"step": 1254
},
{
"epoch": 2.8535660091047044,
"grad_norm": 0.6973447799682617,
"learning_rate": 2.8460228768798507e-06,
"loss": 1.2332,
"step": 1255
},
{
"epoch": 2.8558421851289832,
"grad_norm": 0.6929753422737122,
"learning_rate": 2.8429530031696954e-06,
"loss": 1.1955,
"step": 1256
},
{
"epoch": 2.8581183611532626,
"grad_norm": 0.7042129635810852,
"learning_rate": 2.8398826023254804e-06,
"loss": 1.1998,
"step": 1257
},
{
"epoch": 2.860394537177542,
"grad_norm": 0.6975540518760681,
"learning_rate": 2.8368116790665478e-06,
"loss": 1.2024,
"step": 1258
},
{
"epoch": 2.8626707132018208,
"grad_norm": 0.7412719130516052,
"learning_rate": 2.8337402381130426e-06,
"loss": 1.1479,
"step": 1259
},
{
"epoch": 2.8649468892261,
"grad_norm": 0.6957024335861206,
"learning_rate": 2.830668284185908e-06,
"loss": 1.2087,
"step": 1260
},
{
"epoch": 2.8672230652503794,
"grad_norm": 0.6976204514503479,
"learning_rate": 2.827595822006874e-06,
"loss": 1.2259,
"step": 1261
},
{
"epoch": 2.8694992412746587,
"grad_norm": 0.7119215726852417,
"learning_rate": 2.8245228562984518e-06,
"loss": 1.1775,
"step": 1262
},
{
"epoch": 2.871775417298938,
"grad_norm": 0.7110069394111633,
"learning_rate": 2.8214493917839264e-06,
"loss": 1.2107,
"step": 1263
},
{
"epoch": 2.874051593323217,
"grad_norm": 0.6970719695091248,
"learning_rate": 2.81837543318735e-06,
"loss": 1.2019,
"step": 1264
},
{
"epoch": 2.8763277693474962,
"grad_norm": 0.683922529220581,
"learning_rate": 2.815300985233535e-06,
"loss": 1.1741,
"step": 1265
},
{
"epoch": 2.8786039453717756,
"grad_norm": 0.7189832925796509,
"learning_rate": 2.8122260526480433e-06,
"loss": 1.1423,
"step": 1266
},
{
"epoch": 2.8808801213960544,
"grad_norm": 0.7439128160476685,
"learning_rate": 2.8091506401571846e-06,
"loss": 1.183,
"step": 1267
},
{
"epoch": 2.8831562974203337,
"grad_norm": 0.7034988403320312,
"learning_rate": 2.8060747524880045e-06,
"loss": 1.1821,
"step": 1268
},
{
"epoch": 2.885432473444613,
"grad_norm": 0.7155296206474304,
"learning_rate": 2.80299839436828e-06,
"loss": 1.1895,
"step": 1269
},
{
"epoch": 2.8877086494688924,
"grad_norm": 0.7293388247489929,
"learning_rate": 2.7999215705265104e-06,
"loss": 1.167,
"step": 1270
},
{
"epoch": 2.8899848254931717,
"grad_norm": 0.6977200508117676,
"learning_rate": 2.7968442856919116e-06,
"loss": 1.1888,
"step": 1271
},
{
"epoch": 2.8922610015174506,
"grad_norm": 0.6939589381217957,
"learning_rate": 2.7937665445944075e-06,
"loss": 1.2061,
"step": 1272
},
{
"epoch": 2.89453717754173,
"grad_norm": 0.7107787728309631,
"learning_rate": 2.7906883519646227e-06,
"loss": 1.1893,
"step": 1273
},
{
"epoch": 2.896813353566009,
"grad_norm": 0.7114563584327698,
"learning_rate": 2.787609712533877e-06,
"loss": 1.1933,
"step": 1274
},
{
"epoch": 2.899089529590288,
"grad_norm": 0.6855188012123108,
"learning_rate": 2.784530631034176e-06,
"loss": 1.1904,
"step": 1275
},
{
"epoch": 2.9013657056145674,
"grad_norm": 0.7423205971717834,
"learning_rate": 2.781451112198208e-06,
"loss": 1.1694,
"step": 1276
},
{
"epoch": 2.9036418816388467,
"grad_norm": 0.7629468441009521,
"learning_rate": 2.778371160759327e-06,
"loss": 1.1773,
"step": 1277
},
{
"epoch": 2.905918057663126,
"grad_norm": 0.7157300710678101,
"learning_rate": 2.7752907814515573e-06,
"loss": 1.1702,
"step": 1278
},
{
"epoch": 2.9081942336874054,
"grad_norm": 0.688721776008606,
"learning_rate": 2.7722099790095793e-06,
"loss": 1.1629,
"step": 1279
},
{
"epoch": 2.9104704097116842,
"grad_norm": 0.7173575162887573,
"learning_rate": 2.769128758168725e-06,
"loss": 1.1998,
"step": 1280
},
{
"epoch": 2.9127465857359636,
"grad_norm": 0.714175283908844,
"learning_rate": 2.766047123664966e-06,
"loss": 1.2034,
"step": 1281
},
{
"epoch": 2.915022761760243,
"grad_norm": 0.723911702632904,
"learning_rate": 2.7629650802349127e-06,
"loss": 1.1596,
"step": 1282
},
{
"epoch": 2.9172989377845218,
"grad_norm": 0.68398118019104,
"learning_rate": 2.7598826326158045e-06,
"loss": 1.1954,
"step": 1283
},
{
"epoch": 2.919575113808801,
"grad_norm": 0.7057693600654602,
"learning_rate": 2.7567997855454998e-06,
"loss": 1.1715,
"step": 1284
},
{
"epoch": 2.9218512898330804,
"grad_norm": 0.7358039021492004,
"learning_rate": 2.7537165437624715e-06,
"loss": 1.2015,
"step": 1285
},
{
"epoch": 2.9241274658573597,
"grad_norm": 0.7091867923736572,
"learning_rate": 2.750632912005801e-06,
"loss": 1.1549,
"step": 1286
},
{
"epoch": 2.926403641881639,
"grad_norm": 0.743848979473114,
"learning_rate": 2.747548895015167e-06,
"loss": 1.1734,
"step": 1287
},
{
"epoch": 2.928679817905918,
"grad_norm": 0.7099263072013855,
"learning_rate": 2.744464497530842e-06,
"loss": 1.185,
"step": 1288
},
{
"epoch": 2.9309559939301972,
"grad_norm": 0.7112051248550415,
"learning_rate": 2.7413797242936806e-06,
"loss": 1.1913,
"step": 1289
},
{
"epoch": 2.9332321699544766,
"grad_norm": 0.780794084072113,
"learning_rate": 2.738294580045119e-06,
"loss": 1.2336,
"step": 1290
},
{
"epoch": 2.9355083459787554,
"grad_norm": 0.7613317966461182,
"learning_rate": 2.7352090695271614e-06,
"loss": 1.2095,
"step": 1291
},
{
"epoch": 2.9377845220030347,
"grad_norm": 0.7245267033576965,
"learning_rate": 2.7321231974823732e-06,
"loss": 1.188,
"step": 1292
},
{
"epoch": 2.940060698027314,
"grad_norm": 0.7280985116958618,
"learning_rate": 2.729036968653878e-06,
"loss": 1.2282,
"step": 1293
},
{
"epoch": 2.9423368740515934,
"grad_norm": 0.6889516115188599,
"learning_rate": 2.725950387785349e-06,
"loss": 1.1587,
"step": 1294
},
{
"epoch": 2.9446130500758727,
"grad_norm": 0.7361829876899719,
"learning_rate": 2.722863459620997e-06,
"loss": 1.2065,
"step": 1295
},
{
"epoch": 2.9468892261001516,
"grad_norm": 0.7415118217468262,
"learning_rate": 2.7197761889055674e-06,
"loss": 1.1946,
"step": 1296
},
{
"epoch": 2.949165402124431,
"grad_norm": 0.7229265570640564,
"learning_rate": 2.7166885803843347e-06,
"loss": 1.1816,
"step": 1297
},
{
"epoch": 2.95144157814871,
"grad_norm": 0.6884288787841797,
"learning_rate": 2.71360063880309e-06,
"loss": 1.2027,
"step": 1298
},
{
"epoch": 2.9537177541729895,
"grad_norm": 0.7227476239204407,
"learning_rate": 2.710512368908138e-06,
"loss": 1.248,
"step": 1299
},
{
"epoch": 2.955993930197269,
"grad_norm": 0.7278752326965332,
"learning_rate": 2.707423775446286e-06,
"loss": 1.2036,
"step": 1300
},
{
"epoch": 2.9582701062215477,
"grad_norm": 0.7094444632530212,
"learning_rate": 2.7043348631648415e-06,
"loss": 1.2214,
"step": 1301
},
{
"epoch": 2.960546282245827,
"grad_norm": 0.6929619908332825,
"learning_rate": 2.701245636811599e-06,
"loss": 1.1794,
"step": 1302
},
{
"epoch": 2.9628224582701064,
"grad_norm": 0.7046913504600525,
"learning_rate": 2.6981561011348385e-06,
"loss": 1.2069,
"step": 1303
},
{
"epoch": 2.9650986342943852,
"grad_norm": 0.7059223055839539,
"learning_rate": 2.695066260883313e-06,
"loss": 1.2024,
"step": 1304
},
{
"epoch": 2.9673748103186646,
"grad_norm": 0.7080976366996765,
"learning_rate": 2.6919761208062445e-06,
"loss": 1.1625,
"step": 1305
},
{
"epoch": 2.969650986342944,
"grad_norm": 0.7362763285636902,
"learning_rate": 2.688885685653318e-06,
"loss": 1.2217,
"step": 1306
},
{
"epoch": 2.971927162367223,
"grad_norm": 0.7461861371994019,
"learning_rate": 2.6857949601746676e-06,
"loss": 1.2043,
"step": 1307
},
{
"epoch": 2.9742033383915025,
"grad_norm": 0.7257667183876038,
"learning_rate": 2.682703949120878e-06,
"loss": 1.1749,
"step": 1308
},
{
"epoch": 2.9764795144157814,
"grad_norm": 0.7328771352767944,
"learning_rate": 2.6796126572429703e-06,
"loss": 1.2021,
"step": 1309
},
{
"epoch": 2.9787556904400607,
"grad_norm": 0.705711305141449,
"learning_rate": 2.6765210892923986e-06,
"loss": 1.189,
"step": 1310
},
{
"epoch": 2.98103186646434,
"grad_norm": 0.7146801352500916,
"learning_rate": 2.67342925002104e-06,
"loss": 1.1967,
"step": 1311
},
{
"epoch": 2.983308042488619,
"grad_norm": 0.7356604933738708,
"learning_rate": 2.67033714418119e-06,
"loss": 1.1506,
"step": 1312
},
{
"epoch": 2.9855842185128982,
"grad_norm": 0.6894738674163818,
"learning_rate": 2.667244776525553e-06,
"loss": 1.1712,
"step": 1313
},
{
"epoch": 2.9878603945371776,
"grad_norm": 0.7163074612617493,
"learning_rate": 2.6641521518072355e-06,
"loss": 1.2052,
"step": 1314
},
{
"epoch": 2.990136570561457,
"grad_norm": 0.7195286154747009,
"learning_rate": 2.6610592747797397e-06,
"loss": 1.1834,
"step": 1315
},
{
"epoch": 2.992412746585736,
"grad_norm": 0.7429676055908203,
"learning_rate": 2.657966150196956e-06,
"loss": 1.1779,
"step": 1316
},
{
"epoch": 2.994688922610015,
"grad_norm": 0.7009066343307495,
"learning_rate": 2.6548727828131554e-06,
"loss": 1.1745,
"step": 1317
},
{
"epoch": 2.9969650986342944,
"grad_norm": 0.6867721676826477,
"learning_rate": 2.65177917738298e-06,
"loss": 1.1665,
"step": 1318
},
{
"epoch": 2.9992412746585737,
"grad_norm": 0.7170870900154114,
"learning_rate": 2.6486853386614397e-06,
"loss": 1.158,
"step": 1319
},
{
"epoch": 3.0,
"grad_norm": 1.3577916622161865,
"learning_rate": 2.6455912714039033e-06,
"loss": 1.1783,
"step": 1320
},
{
"epoch": 3.0022761760242793,
"grad_norm": 0.7354686856269836,
"learning_rate": 2.6424969803660903e-06,
"loss": 1.169,
"step": 1321
},
{
"epoch": 3.0045523520485586,
"grad_norm": 0.7197363972663879,
"learning_rate": 2.639402470304063e-06,
"loss": 1.1474,
"step": 1322
},
{
"epoch": 3.0068285280728375,
"grad_norm": 0.7077065110206604,
"learning_rate": 2.6363077459742214e-06,
"loss": 1.1614,
"step": 1323
},
{
"epoch": 3.009104704097117,
"grad_norm": 0.7212318778038025,
"learning_rate": 2.6332128121332967e-06,
"loss": 1.1648,
"step": 1324
},
{
"epoch": 3.011380880121396,
"grad_norm": 0.7356379628181458,
"learning_rate": 2.6301176735383382e-06,
"loss": 1.1547,
"step": 1325
},
{
"epoch": 3.0136570561456755,
"grad_norm": 0.7490338087081909,
"learning_rate": 2.627022334946712e-06,
"loss": 1.1572,
"step": 1326
},
{
"epoch": 3.0159332321699543,
"grad_norm": 0.7195541262626648,
"learning_rate": 2.6239268011160923e-06,
"loss": 1.1909,
"step": 1327
},
{
"epoch": 3.0182094081942337,
"grad_norm": 0.7337531447410583,
"learning_rate": 2.620831076804453e-06,
"loss": 1.1573,
"step": 1328
},
{
"epoch": 3.020485584218513,
"grad_norm": 0.7338323593139648,
"learning_rate": 2.61773516677006e-06,
"loss": 1.141,
"step": 1329
},
{
"epoch": 3.0227617602427923,
"grad_norm": 0.7600909471511841,
"learning_rate": 2.614639075771465e-06,
"loss": 1.1509,
"step": 1330
},
{
"epoch": 3.025037936267071,
"grad_norm": 0.7509938478469849,
"learning_rate": 2.611542808567497e-06,
"loss": 1.1715,
"step": 1331
},
{
"epoch": 3.0273141122913505,
"grad_norm": 0.7426080703735352,
"learning_rate": 2.6084463699172594e-06,
"loss": 1.1395,
"step": 1332
},
{
"epoch": 3.02959028831563,
"grad_norm": 0.712645947933197,
"learning_rate": 2.6053497645801133e-06,
"loss": 1.1786,
"step": 1333
},
{
"epoch": 3.031866464339909,
"grad_norm": 0.7545322775840759,
"learning_rate": 2.6022529973156813e-06,
"loss": 1.1562,
"step": 1334
},
{
"epoch": 3.034142640364188,
"grad_norm": 0.7290534973144531,
"learning_rate": 2.5991560728838326e-06,
"loss": 1.1683,
"step": 1335
},
{
"epoch": 3.0364188163884673,
"grad_norm": 0.7173789143562317,
"learning_rate": 2.596058996044678e-06,
"loss": 1.1608,
"step": 1336
},
{
"epoch": 3.0386949924127467,
"grad_norm": 0.7625917792320251,
"learning_rate": 2.5929617715585614e-06,
"loss": 1.1349,
"step": 1337
},
{
"epoch": 3.040971168437026,
"grad_norm": 0.7284402251243591,
"learning_rate": 2.5898644041860567e-06,
"loss": 1.1264,
"step": 1338
},
{
"epoch": 3.043247344461305,
"grad_norm": 0.7259999513626099,
"learning_rate": 2.586766898687955e-06,
"loss": 1.1817,
"step": 1339
},
{
"epoch": 3.045523520485584,
"grad_norm": 0.8065728545188904,
"learning_rate": 2.583669259825261e-06,
"loss": 1.2324,
"step": 1340
},
{
"epoch": 3.0477996965098635,
"grad_norm": 0.7757611870765686,
"learning_rate": 2.580571492359183e-06,
"loss": 1.181,
"step": 1341
},
{
"epoch": 3.050075872534143,
"grad_norm": 0.6941875219345093,
"learning_rate": 2.5774736010511275e-06,
"loss": 1.1928,
"step": 1342
},
{
"epoch": 3.0523520485584217,
"grad_norm": 0.7500492334365845,
"learning_rate": 2.5743755906626928e-06,
"loss": 1.2046,
"step": 1343
},
{
"epoch": 3.054628224582701,
"grad_norm": 0.7713794112205505,
"learning_rate": 2.571277465955658e-06,
"loss": 1.1514,
"step": 1344
},
{
"epoch": 3.0569044006069803,
"grad_norm": 0.7205464243888855,
"learning_rate": 2.5681792316919785e-06,
"loss": 1.1583,
"step": 1345
},
{
"epoch": 3.0591805766312596,
"grad_norm": 0.7222645878791809,
"learning_rate": 2.56508089263378e-06,
"loss": 1.143,
"step": 1346
},
{
"epoch": 3.0614567526555385,
"grad_norm": 0.7436219453811646,
"learning_rate": 2.561982453543348e-06,
"loss": 1.1691,
"step": 1347
},
{
"epoch": 3.063732928679818,
"grad_norm": 0.7247343063354492,
"learning_rate": 2.5588839191831196e-06,
"loss": 1.1691,
"step": 1348
},
{
"epoch": 3.066009104704097,
"grad_norm": 0.7624401450157166,
"learning_rate": 2.5557852943156807e-06,
"loss": 1.1982,
"step": 1349
},
{
"epoch": 3.0682852807283765,
"grad_norm": 0.7353934645652771,
"learning_rate": 2.552686583703758e-06,
"loss": 1.1609,
"step": 1350
},
{
"epoch": 3.0705614567526553,
"grad_norm": 0.7236819863319397,
"learning_rate": 2.5495877921102056e-06,
"loss": 1.1766,
"step": 1351
},
{
"epoch": 3.0728376327769347,
"grad_norm": 0.7330083250999451,
"learning_rate": 2.546488924298006e-06,
"loss": 1.1572,
"step": 1352
},
{
"epoch": 3.075113808801214,
"grad_norm": 0.72292160987854,
"learning_rate": 2.5433899850302552e-06,
"loss": 1.194,
"step": 1353
},
{
"epoch": 3.0773899848254933,
"grad_norm": 0.7504494190216064,
"learning_rate": 2.5402909790701636e-06,
"loss": 1.1868,
"step": 1354
},
{
"epoch": 3.079666160849772,
"grad_norm": 0.7421271800994873,
"learning_rate": 2.53719191118104e-06,
"loss": 1.1708,
"step": 1355
},
{
"epoch": 3.0819423368740515,
"grad_norm": 0.7539661526679993,
"learning_rate": 2.53409278612629e-06,
"loss": 1.1786,
"step": 1356
},
{
"epoch": 3.084218512898331,
"grad_norm": 0.7343708872795105,
"learning_rate": 2.530993608669407e-06,
"loss": 1.191,
"step": 1357
},
{
"epoch": 3.08649468892261,
"grad_norm": 0.7485260367393494,
"learning_rate": 2.5278943835739656e-06,
"loss": 1.1513,
"step": 1358
},
{
"epoch": 3.088770864946889,
"grad_norm": 0.7451958656311035,
"learning_rate": 2.524795115603613e-06,
"loss": 1.1514,
"step": 1359
},
{
"epoch": 3.0910470409711683,
"grad_norm": 0.726077139377594,
"learning_rate": 2.521695809522061e-06,
"loss": 1.1405,
"step": 1360
},
{
"epoch": 3.0933232169954477,
"grad_norm": 0.7406827807426453,
"learning_rate": 2.518596470093083e-06,
"loss": 1.197,
"step": 1361
},
{
"epoch": 3.095599393019727,
"grad_norm": 0.7313615679740906,
"learning_rate": 2.5154971020805018e-06,
"loss": 1.191,
"step": 1362
},
{
"epoch": 3.097875569044006,
"grad_norm": 0.7504832744598389,
"learning_rate": 2.512397710248182e-06,
"loss": 1.2038,
"step": 1363
},
{
"epoch": 3.100151745068285,
"grad_norm": 0.7463207244873047,
"learning_rate": 2.5092982993600294e-06,
"loss": 1.1504,
"step": 1364
},
{
"epoch": 3.1024279210925645,
"grad_norm": 0.723032534122467,
"learning_rate": 2.506198874179976e-06,
"loss": 1.1612,
"step": 1365
},
{
"epoch": 3.104704097116844,
"grad_norm": 0.7396848797798157,
"learning_rate": 2.503099439471977e-06,
"loss": 1.1658,
"step": 1366
},
{
"epoch": 3.106980273141123,
"grad_norm": 0.7143369913101196,
"learning_rate": 2.5e-06,
"loss": 1.1587,
"step": 1367
},
{
"epoch": 3.109256449165402,
"grad_norm": 0.7214608192443848,
"learning_rate": 2.4969005605280243e-06,
"loss": 1.1627,
"step": 1368
},
{
"epoch": 3.1115326251896813,
"grad_norm": 0.7474159598350525,
"learning_rate": 2.4938011258200244e-06,
"loss": 1.1744,
"step": 1369
},
{
"epoch": 3.1138088012139606,
"grad_norm": 0.74878990650177,
"learning_rate": 2.4907017006399715e-06,
"loss": 1.1715,
"step": 1370
},
{
"epoch": 3.11608497723824,
"grad_norm": 0.7635120749473572,
"learning_rate": 2.487602289751819e-06,
"loss": 1.1706,
"step": 1371
},
{
"epoch": 3.118361153262519,
"grad_norm": 0.7503812313079834,
"learning_rate": 2.484502897919499e-06,
"loss": 1.1673,
"step": 1372
},
{
"epoch": 3.120637329286798,
"grad_norm": 0.7519100904464722,
"learning_rate": 2.481403529906918e-06,
"loss": 1.1862,
"step": 1373
},
{
"epoch": 3.1229135053110775,
"grad_norm": 0.7507646679878235,
"learning_rate": 2.4783041904779386e-06,
"loss": 1.1761,
"step": 1374
},
{
"epoch": 3.125189681335357,
"grad_norm": 0.7176265120506287,
"learning_rate": 2.4752048843963877e-06,
"loss": 1.1601,
"step": 1375
},
{
"epoch": 3.1274658573596357,
"grad_norm": 0.7137569785118103,
"learning_rate": 2.4721056164260348e-06,
"loss": 1.1542,
"step": 1376
},
{
"epoch": 3.129742033383915,
"grad_norm": 0.745905876159668,
"learning_rate": 2.4690063913305936e-06,
"loss": 1.1524,
"step": 1377
},
{
"epoch": 3.1320182094081943,
"grad_norm": 0.7238267660140991,
"learning_rate": 2.465907213873711e-06,
"loss": 1.1493,
"step": 1378
},
{
"epoch": 3.1342943854324736,
"grad_norm": 0.7267714142799377,
"learning_rate": 2.462808088818961e-06,
"loss": 1.1969,
"step": 1379
},
{
"epoch": 3.1365705614567525,
"grad_norm": 0.7494551539421082,
"learning_rate": 2.4597090209298372e-06,
"loss": 1.1708,
"step": 1380
},
{
"epoch": 3.138846737481032,
"grad_norm": 0.7473982572555542,
"learning_rate": 2.4566100149697456e-06,
"loss": 1.1915,
"step": 1381
},
{
"epoch": 3.141122913505311,
"grad_norm": 0.7496033906936646,
"learning_rate": 2.453511075701996e-06,
"loss": 1.1942,
"step": 1382
},
{
"epoch": 3.1433990895295905,
"grad_norm": 0.7699615359306335,
"learning_rate": 2.4504122078897948e-06,
"loss": 1.214,
"step": 1383
},
{
"epoch": 3.1456752655538693,
"grad_norm": 0.7535260915756226,
"learning_rate": 2.447313416296243e-06,
"loss": 1.1413,
"step": 1384
},
{
"epoch": 3.1479514415781487,
"grad_norm": 0.7432587146759033,
"learning_rate": 2.4442147056843193e-06,
"loss": 1.2077,
"step": 1385
},
{
"epoch": 3.150227617602428,
"grad_norm": 0.7599760293960571,
"learning_rate": 2.4411160808168817e-06,
"loss": 1.2043,
"step": 1386
},
{
"epoch": 3.1525037936267073,
"grad_norm": 0.7298524379730225,
"learning_rate": 2.4380175464566534e-06,
"loss": 1.2061,
"step": 1387
},
{
"epoch": 3.154779969650986,
"grad_norm": 0.743593156337738,
"learning_rate": 2.4349191073662203e-06,
"loss": 1.1708,
"step": 1388
},
{
"epoch": 3.1570561456752655,
"grad_norm": 0.7407417893409729,
"learning_rate": 2.431820768308022e-06,
"loss": 1.1469,
"step": 1389
},
{
"epoch": 3.159332321699545,
"grad_norm": 0.7478795051574707,
"learning_rate": 2.4287225340443434e-06,
"loss": 1.172,
"step": 1390
},
{
"epoch": 3.161608497723824,
"grad_norm": 0.7392578721046448,
"learning_rate": 2.425624409337308e-06,
"loss": 1.1722,
"step": 1391
},
{
"epoch": 3.163884673748103,
"grad_norm": 0.7476488947868347,
"learning_rate": 2.4225263989488733e-06,
"loss": 1.1907,
"step": 1392
},
{
"epoch": 3.1661608497723823,
"grad_norm": 0.7702009677886963,
"learning_rate": 2.4194285076408175e-06,
"loss": 1.1757,
"step": 1393
},
{
"epoch": 3.1684370257966616,
"grad_norm": 0.7480501532554626,
"learning_rate": 2.4163307401747393e-06,
"loss": 1.1914,
"step": 1394
},
{
"epoch": 3.170713201820941,
"grad_norm": 0.7919908165931702,
"learning_rate": 2.4132331013120454e-06,
"loss": 1.1775,
"step": 1395
},
{
"epoch": 3.17298937784522,
"grad_norm": 0.7680845260620117,
"learning_rate": 2.4101355958139437e-06,
"loss": 1.1885,
"step": 1396
},
{
"epoch": 3.175265553869499,
"grad_norm": 0.7591391205787659,
"learning_rate": 2.407038228441439e-06,
"loss": 1.1877,
"step": 1397
},
{
"epoch": 3.1775417298937785,
"grad_norm": 0.762646496295929,
"learning_rate": 2.4039410039553233e-06,
"loss": 1.2105,
"step": 1398
},
{
"epoch": 3.179817905918058,
"grad_norm": 0.7501146197319031,
"learning_rate": 2.4008439271161678e-06,
"loss": 1.1484,
"step": 1399
},
{
"epoch": 3.1820940819423367,
"grad_norm": 0.7478851675987244,
"learning_rate": 2.3977470026843196e-06,
"loss": 1.187,
"step": 1400
},
{
"epoch": 3.184370257966616,
"grad_norm": 0.7486398220062256,
"learning_rate": 2.3946502354198875e-06,
"loss": 1.2158,
"step": 1401
},
{
"epoch": 3.1866464339908953,
"grad_norm": 0.7453305721282959,
"learning_rate": 2.3915536300827414e-06,
"loss": 1.2463,
"step": 1402
},
{
"epoch": 3.1889226100151746,
"grad_norm": 0.8247939944267273,
"learning_rate": 2.3884571914325034e-06,
"loss": 1.1362,
"step": 1403
},
{
"epoch": 3.191198786039454,
"grad_norm": 0.7734161019325256,
"learning_rate": 2.3853609242285356e-06,
"loss": 1.1811,
"step": 1404
},
{
"epoch": 3.193474962063733,
"grad_norm": 0.7732148170471191,
"learning_rate": 2.3822648332299405e-06,
"loss": 1.1969,
"step": 1405
},
{
"epoch": 3.195751138088012,
"grad_norm": 0.7798835039138794,
"learning_rate": 2.3791689231955473e-06,
"loss": 1.167,
"step": 1406
},
{
"epoch": 3.1980273141122915,
"grad_norm": 0.7879831790924072,
"learning_rate": 2.3760731988839077e-06,
"loss": 1.1513,
"step": 1407
},
{
"epoch": 3.2003034901365703,
"grad_norm": 0.7619938254356384,
"learning_rate": 2.3729776650532887e-06,
"loss": 1.1692,
"step": 1408
},
{
"epoch": 3.2025796661608497,
"grad_norm": 0.7818293571472168,
"learning_rate": 2.3698823264616635e-06,
"loss": 1.1752,
"step": 1409
},
{
"epoch": 3.204855842185129,
"grad_norm": 0.7367649078369141,
"learning_rate": 2.366787187866704e-06,
"loss": 1.1639,
"step": 1410
},
{
"epoch": 3.2071320182094083,
"grad_norm": 0.7616274356842041,
"learning_rate": 2.363692254025779e-06,
"loss": 1.1844,
"step": 1411
},
{
"epoch": 3.2094081942336876,
"grad_norm": 0.7888926863670349,
"learning_rate": 2.360597529695938e-06,
"loss": 1.1797,
"step": 1412
},
{
"epoch": 3.2116843702579665,
"grad_norm": 0.7439415454864502,
"learning_rate": 2.35750301963391e-06,
"loss": 1.1933,
"step": 1413
},
{
"epoch": 3.213960546282246,
"grad_norm": 0.7649375200271606,
"learning_rate": 2.3544087285960975e-06,
"loss": 1.1848,
"step": 1414
},
{
"epoch": 3.216236722306525,
"grad_norm": 0.7349463701248169,
"learning_rate": 2.3513146613385603e-06,
"loss": 1.1557,
"step": 1415
},
{
"epoch": 3.2185128983308045,
"grad_norm": 0.756278395652771,
"learning_rate": 2.348220822617021e-06,
"loss": 1.2136,
"step": 1416
},
{
"epoch": 3.2207890743550833,
"grad_norm": 0.7582489252090454,
"learning_rate": 2.345127217186846e-06,
"loss": 1.1606,
"step": 1417
},
{
"epoch": 3.2230652503793626,
"grad_norm": 0.7773423194885254,
"learning_rate": 2.3420338498030445e-06,
"loss": 1.1465,
"step": 1418
},
{
"epoch": 3.225341426403642,
"grad_norm": 0.7887737154960632,
"learning_rate": 2.3389407252202607e-06,
"loss": 1.1791,
"step": 1419
},
{
"epoch": 3.2276176024279213,
"grad_norm": 0.7461211681365967,
"learning_rate": 2.3358478481927657e-06,
"loss": 1.1888,
"step": 1420
},
{
"epoch": 3.2298937784522,
"grad_norm": 0.7704999446868896,
"learning_rate": 2.332755223474448e-06,
"loss": 1.1494,
"step": 1421
},
{
"epoch": 3.2321699544764795,
"grad_norm": 0.7470236420631409,
"learning_rate": 2.329662855818811e-06,
"loss": 1.1556,
"step": 1422
},
{
"epoch": 3.234446130500759,
"grad_norm": 0.7861848473548889,
"learning_rate": 2.32657074997896e-06,
"loss": 1.1904,
"step": 1423
},
{
"epoch": 3.236722306525038,
"grad_norm": 0.7543413043022156,
"learning_rate": 2.323478910707602e-06,
"loss": 1.1813,
"step": 1424
},
{
"epoch": 3.238998482549317,
"grad_norm": 0.7611058950424194,
"learning_rate": 2.3203873427570305e-06,
"loss": 1.1658,
"step": 1425
},
{
"epoch": 3.2412746585735963,
"grad_norm": 0.7355452179908752,
"learning_rate": 2.3172960508791225e-06,
"loss": 1.1899,
"step": 1426
},
{
"epoch": 3.2435508345978756,
"grad_norm": 0.7388221025466919,
"learning_rate": 2.314205039825333e-06,
"loss": 1.1881,
"step": 1427
},
{
"epoch": 3.245827010622155,
"grad_norm": 0.7748705744743347,
"learning_rate": 2.3111143143466836e-06,
"loss": 1.2042,
"step": 1428
},
{
"epoch": 3.248103186646434,
"grad_norm": 0.7724549770355225,
"learning_rate": 2.308023879193756e-06,
"loss": 1.183,
"step": 1429
},
{
"epoch": 3.250379362670713,
"grad_norm": 0.7296189665794373,
"learning_rate": 2.3049337391166884e-06,
"loss": 1.1326,
"step": 1430
},
{
"epoch": 3.2526555386949925,
"grad_norm": 0.7788570523262024,
"learning_rate": 2.3018438988651628e-06,
"loss": 1.1718,
"step": 1431
},
{
"epoch": 3.254931714719272,
"grad_norm": 0.7668931484222412,
"learning_rate": 2.2987543631884014e-06,
"loss": 1.1742,
"step": 1432
},
{
"epoch": 3.2572078907435507,
"grad_norm": 0.7767664790153503,
"learning_rate": 2.2956651368351597e-06,
"loss": 1.2012,
"step": 1433
},
{
"epoch": 3.25948406676783,
"grad_norm": 0.7626579999923706,
"learning_rate": 2.2925762245537135e-06,
"loss": 1.1495,
"step": 1434
},
{
"epoch": 3.2617602427921093,
"grad_norm": 0.7605862021446228,
"learning_rate": 2.289487631091863e-06,
"loss": 1.1791,
"step": 1435
},
{
"epoch": 3.2640364188163886,
"grad_norm": 0.7632426023483276,
"learning_rate": 2.2863993611969105e-06,
"loss": 1.1704,
"step": 1436
},
{
"epoch": 3.2663125948406675,
"grad_norm": 0.741607129573822,
"learning_rate": 2.2833114196156657e-06,
"loss": 1.162,
"step": 1437
},
{
"epoch": 3.268588770864947,
"grad_norm": 0.7359287142753601,
"learning_rate": 2.2802238110944335e-06,
"loss": 1.1825,
"step": 1438
},
{
"epoch": 3.270864946889226,
"grad_norm": 0.798477053642273,
"learning_rate": 2.2771365403790046e-06,
"loss": 1.1829,
"step": 1439
},
{
"epoch": 3.2731411229135055,
"grad_norm": 0.777813196182251,
"learning_rate": 2.274049612214652e-06,
"loss": 1.1611,
"step": 1440
},
{
"epoch": 3.2754172989377848,
"grad_norm": 0.7524014115333557,
"learning_rate": 2.2709630313461224e-06,
"loss": 1.1774,
"step": 1441
},
{
"epoch": 3.2776934749620636,
"grad_norm": 0.7729030847549438,
"learning_rate": 2.267876802517628e-06,
"loss": 1.1656,
"step": 1442
},
{
"epoch": 3.279969650986343,
"grad_norm": 0.7829767465591431,
"learning_rate": 2.2647909304728394e-06,
"loss": 1.1561,
"step": 1443
},
{
"epoch": 3.2822458270106223,
"grad_norm": 0.7700682282447815,
"learning_rate": 2.261705419954882e-06,
"loss": 1.17,
"step": 1444
},
{
"epoch": 3.284522003034901,
"grad_norm": 0.7718088626861572,
"learning_rate": 2.258620275706319e-06,
"loss": 1.1421,
"step": 1445
},
{
"epoch": 3.2867981790591805,
"grad_norm": 0.775188148021698,
"learning_rate": 2.255535502469159e-06,
"loss": 1.2197,
"step": 1446
},
{
"epoch": 3.28907435508346,
"grad_norm": 0.7680513858795166,
"learning_rate": 2.2524511049848335e-06,
"loss": 1.2133,
"step": 1447
},
{
"epoch": 3.291350531107739,
"grad_norm": 0.7592117786407471,
"learning_rate": 2.2493670879941996e-06,
"loss": 1.1552,
"step": 1448
},
{
"epoch": 3.2936267071320184,
"grad_norm": 0.748960554599762,
"learning_rate": 2.246283456237529e-06,
"loss": 1.1759,
"step": 1449
},
{
"epoch": 3.2959028831562973,
"grad_norm": 0.7754799723625183,
"learning_rate": 2.2432002144545015e-06,
"loss": 1.1703,
"step": 1450
},
{
"epoch": 3.2981790591805766,
"grad_norm": 0.757481038570404,
"learning_rate": 2.2401173673841963e-06,
"loss": 1.1892,
"step": 1451
},
{
"epoch": 3.300455235204856,
"grad_norm": 0.7509388327598572,
"learning_rate": 2.2370349197650877e-06,
"loss": 1.2018,
"step": 1452
},
{
"epoch": 3.302731411229135,
"grad_norm": 0.7432618141174316,
"learning_rate": 2.2339528763350353e-06,
"loss": 1.1659,
"step": 1453
},
{
"epoch": 3.305007587253414,
"grad_norm": 0.7313361763954163,
"learning_rate": 2.230871241831276e-06,
"loss": 1.1701,
"step": 1454
},
{
"epoch": 3.3072837632776935,
"grad_norm": 0.7643156051635742,
"learning_rate": 2.2277900209904215e-06,
"loss": 1.145,
"step": 1455
},
{
"epoch": 3.309559939301973,
"grad_norm": 0.7757098078727722,
"learning_rate": 2.224709218548443e-06,
"loss": 1.1609,
"step": 1456
},
{
"epoch": 3.311836115326252,
"grad_norm": 0.7871479392051697,
"learning_rate": 2.221628839240674e-06,
"loss": 1.1922,
"step": 1457
},
{
"epoch": 3.314112291350531,
"grad_norm": 0.774569034576416,
"learning_rate": 2.2185488878017934e-06,
"loss": 1.1608,
"step": 1458
},
{
"epoch": 3.3163884673748103,
"grad_norm": 0.7552246451377869,
"learning_rate": 2.215469368965824e-06,
"loss": 1.1586,
"step": 1459
},
{
"epoch": 3.3186646433990896,
"grad_norm": 0.7549350261688232,
"learning_rate": 2.2123902874661237e-06,
"loss": 1.2161,
"step": 1460
},
{
"epoch": 3.3209408194233685,
"grad_norm": 0.7435033917427063,
"learning_rate": 2.2093116480353785e-06,
"loss": 1.1543,
"step": 1461
},
{
"epoch": 3.323216995447648,
"grad_norm": 0.7678859233856201,
"learning_rate": 2.2062334554055937e-06,
"loss": 1.1509,
"step": 1462
},
{
"epoch": 3.325493171471927,
"grad_norm": 0.7754087448120117,
"learning_rate": 2.2031557143080896e-06,
"loss": 1.1821,
"step": 1463
},
{
"epoch": 3.3277693474962065,
"grad_norm": 0.7418244481086731,
"learning_rate": 2.2000784294734896e-06,
"loss": 1.1803,
"step": 1464
},
{
"epoch": 3.3300455235204858,
"grad_norm": 0.7413983345031738,
"learning_rate": 2.1970016056317202e-06,
"loss": 1.1761,
"step": 1465
},
{
"epoch": 3.3323216995447646,
"grad_norm": 0.7306979894638062,
"learning_rate": 2.193925247511996e-06,
"loss": 1.1418,
"step": 1466
},
{
"epoch": 3.334597875569044,
"grad_norm": 0.7656619548797607,
"learning_rate": 2.190849359842816e-06,
"loss": 1.1635,
"step": 1467
},
{
"epoch": 3.3368740515933233,
"grad_norm": 0.7424888610839844,
"learning_rate": 2.1877739473519575e-06,
"loss": 1.1551,
"step": 1468
},
{
"epoch": 3.3391502276176026,
"grad_norm": 0.7459004521369934,
"learning_rate": 2.184699014766466e-06,
"loss": 1.1799,
"step": 1469
},
{
"epoch": 3.3414264036418815,
"grad_norm": 0.7800491452217102,
"learning_rate": 2.1816245668126506e-06,
"loss": 1.1304,
"step": 1470
},
{
"epoch": 3.343702579666161,
"grad_norm": 0.7625274658203125,
"learning_rate": 2.1785506082160745e-06,
"loss": 1.1869,
"step": 1471
},
{
"epoch": 3.34597875569044,
"grad_norm": 0.7717932462692261,
"learning_rate": 2.1754771437015495e-06,
"loss": 1.1893,
"step": 1472
},
{
"epoch": 3.3482549317147194,
"grad_norm": 0.7545213103294373,
"learning_rate": 2.1724041779931266e-06,
"loss": 1.1929,
"step": 1473
},
{
"epoch": 3.3505311077389983,
"grad_norm": 0.7686711549758911,
"learning_rate": 2.169331715814093e-06,
"loss": 1.1781,
"step": 1474
},
{
"epoch": 3.3528072837632776,
"grad_norm": 0.7366244792938232,
"learning_rate": 2.1662597618869574e-06,
"loss": 1.1401,
"step": 1475
},
{
"epoch": 3.355083459787557,
"grad_norm": 0.7632220387458801,
"learning_rate": 2.163188320933453e-06,
"loss": 1.1431,
"step": 1476
},
{
"epoch": 3.3573596358118363,
"grad_norm": 0.7904044389724731,
"learning_rate": 2.1601173976745205e-06,
"loss": 1.2071,
"step": 1477
},
{
"epoch": 3.359635811836115,
"grad_norm": 0.7487012147903442,
"learning_rate": 2.157046996830304e-06,
"loss": 1.1827,
"step": 1478
},
{
"epoch": 3.3619119878603945,
"grad_norm": 0.7721722722053528,
"learning_rate": 2.1539771231201497e-06,
"loss": 1.1984,
"step": 1479
},
{
"epoch": 3.364188163884674,
"grad_norm": 0.7438533902168274,
"learning_rate": 2.1509077812625885e-06,
"loss": 1.161,
"step": 1480
},
{
"epoch": 3.366464339908953,
"grad_norm": 0.7563179731369019,
"learning_rate": 2.147838975975335e-06,
"loss": 1.182,
"step": 1481
},
{
"epoch": 3.368740515933232,
"grad_norm": 0.7858923673629761,
"learning_rate": 2.1447707119752817e-06,
"loss": 1.2036,
"step": 1482
},
{
"epoch": 3.3710166919575113,
"grad_norm": 0.7806487083435059,
"learning_rate": 2.141702993978486e-06,
"loss": 1.1444,
"step": 1483
},
{
"epoch": 3.3732928679817906,
"grad_norm": 0.7645002603530884,
"learning_rate": 2.138635826700167e-06,
"loss": 1.1818,
"step": 1484
},
{
"epoch": 3.37556904400607,
"grad_norm": 0.7865437865257263,
"learning_rate": 2.1355692148546993e-06,
"loss": 1.1859,
"step": 1485
},
{
"epoch": 3.3778452200303493,
"grad_norm": 0.7360503673553467,
"learning_rate": 2.1325031631555996e-06,
"loss": 1.175,
"step": 1486
},
{
"epoch": 3.380121396054628,
"grad_norm": 0.7604530453681946,
"learning_rate": 2.1294376763155284e-06,
"loss": 1.1844,
"step": 1487
},
{
"epoch": 3.3823975720789075,
"grad_norm": 0.7645239233970642,
"learning_rate": 2.1263727590462747e-06,
"loss": 1.1488,
"step": 1488
},
{
"epoch": 3.3846737481031868,
"grad_norm": 0.7591574788093567,
"learning_rate": 2.1233084160587524e-06,
"loss": 1.1975,
"step": 1489
},
{
"epoch": 3.3869499241274656,
"grad_norm": 0.7940670847892761,
"learning_rate": 2.1202446520629945e-06,
"loss": 1.1756,
"step": 1490
},
{
"epoch": 3.389226100151745,
"grad_norm": 0.76637864112854,
"learning_rate": 2.117181471768143e-06,
"loss": 1.1533,
"step": 1491
},
{
"epoch": 3.3915022761760243,
"grad_norm": 0.8150556683540344,
"learning_rate": 2.1141188798824404e-06,
"loss": 1.2104,
"step": 1492
},
{
"epoch": 3.3937784522003036,
"grad_norm": 0.7532956600189209,
"learning_rate": 2.11105688111323e-06,
"loss": 1.1345,
"step": 1493
},
{
"epoch": 3.396054628224583,
"grad_norm": 0.7805771827697754,
"learning_rate": 2.107995480166937e-06,
"loss": 1.1963,
"step": 1494
},
{
"epoch": 3.398330804248862,
"grad_norm": 0.8066619634628296,
"learning_rate": 2.1049346817490756e-06,
"loss": 1.171,
"step": 1495
},
{
"epoch": 3.400606980273141,
"grad_norm": 0.7702916264533997,
"learning_rate": 2.101874490564227e-06,
"loss": 1.191,
"step": 1496
},
{
"epoch": 3.4028831562974204,
"grad_norm": 0.7325629591941833,
"learning_rate": 2.0988149113160395e-06,
"loss": 1.1501,
"step": 1497
},
{
"epoch": 3.4051593323216993,
"grad_norm": 0.7644321918487549,
"learning_rate": 2.095755948707227e-06,
"loss": 1.1714,
"step": 1498
},
{
"epoch": 3.4074355083459786,
"grad_norm": 0.7521936893463135,
"learning_rate": 2.092697607439549e-06,
"loss": 1.1995,
"step": 1499
},
{
"epoch": 3.409711684370258,
"grad_norm": 0.7865051627159119,
"learning_rate": 2.0896398922138124e-06,
"loss": 1.1151,
"step": 1500
},
{
"epoch": 3.4119878603945373,
"grad_norm": 0.74745774269104,
"learning_rate": 2.086582807729863e-06,
"loss": 1.1869,
"step": 1501
},
{
"epoch": 3.4142640364188166,
"grad_norm": 0.7475553750991821,
"learning_rate": 2.083526358686575e-06,
"loss": 1.147,
"step": 1502
},
{
"epoch": 3.4165402124430955,
"grad_norm": 0.7867413759231567,
"learning_rate": 2.0804705497818466e-06,
"loss": 1.1804,
"step": 1503
},
{
"epoch": 3.418816388467375,
"grad_norm": 0.7480242252349854,
"learning_rate": 2.077415385712594e-06,
"loss": 1.1545,
"step": 1504
},
{
"epoch": 3.421092564491654,
"grad_norm": 0.7600266933441162,
"learning_rate": 2.0743608711747383e-06,
"loss": 1.1668,
"step": 1505
},
{
"epoch": 3.423368740515933,
"grad_norm": 0.7508987188339233,
"learning_rate": 2.0713070108632072e-06,
"loss": 1.1793,
"step": 1506
},
{
"epoch": 3.4256449165402123,
"grad_norm": 0.7640750408172607,
"learning_rate": 2.0682538094719183e-06,
"loss": 1.1797,
"step": 1507
},
{
"epoch": 3.4279210925644916,
"grad_norm": 0.7698054313659668,
"learning_rate": 2.065201271693779e-06,
"loss": 1.1753,
"step": 1508
},
{
"epoch": 3.430197268588771,
"grad_norm": 0.7779369354248047,
"learning_rate": 2.0621494022206758e-06,
"loss": 1.1866,
"step": 1509
},
{
"epoch": 3.4324734446130503,
"grad_norm": 0.7479546666145325,
"learning_rate": 2.0590982057434684e-06,
"loss": 1.1501,
"step": 1510
},
{
"epoch": 3.434749620637329,
"grad_norm": 0.7484161853790283,
"learning_rate": 2.0560476869519815e-06,
"loss": 1.172,
"step": 1511
},
{
"epoch": 3.4370257966616085,
"grad_norm": 0.7611399292945862,
"learning_rate": 2.052997850534999e-06,
"loss": 1.1624,
"step": 1512
},
{
"epoch": 3.4393019726858878,
"grad_norm": 0.7756854295730591,
"learning_rate": 2.0499487011802554e-06,
"loss": 1.1852,
"step": 1513
},
{
"epoch": 3.441578148710167,
"grad_norm": 0.783087968826294,
"learning_rate": 2.0469002435744285e-06,
"loss": 1.1806,
"step": 1514
},
{
"epoch": 3.443854324734446,
"grad_norm": 0.8397075533866882,
"learning_rate": 2.0438524824031346e-06,
"loss": 1.1927,
"step": 1515
},
{
"epoch": 3.4461305007587253,
"grad_norm": 0.7895216941833496,
"learning_rate": 2.0408054223509173e-06,
"loss": 1.158,
"step": 1516
},
{
"epoch": 3.4484066767830046,
"grad_norm": 0.7796897292137146,
"learning_rate": 2.0377590681012454e-06,
"loss": 1.1838,
"step": 1517
},
{
"epoch": 3.450682852807284,
"grad_norm": 0.7745197415351868,
"learning_rate": 2.0347134243365e-06,
"loss": 1.1581,
"step": 1518
},
{
"epoch": 3.452959028831563,
"grad_norm": 0.7631146907806396,
"learning_rate": 2.031668495737972e-06,
"loss": 1.2247,
"step": 1519
},
{
"epoch": 3.455235204855842,
"grad_norm": 0.7887859344482422,
"learning_rate": 2.0286242869858525e-06,
"loss": 1.1955,
"step": 1520
},
{
"epoch": 3.4575113808801214,
"grad_norm": 0.7753341794013977,
"learning_rate": 2.0255808027592263e-06,
"loss": 1.2184,
"step": 1521
},
{
"epoch": 3.4597875569044008,
"grad_norm": 0.8212582468986511,
"learning_rate": 2.022538047736063e-06,
"loss": 1.1923,
"step": 1522
},
{
"epoch": 3.4620637329286796,
"grad_norm": 0.7870779037475586,
"learning_rate": 2.019496026593214e-06,
"loss": 1.2015,
"step": 1523
},
{
"epoch": 3.464339908952959,
"grad_norm": 0.7596839070320129,
"learning_rate": 2.0164547440064017e-06,
"loss": 1.1863,
"step": 1524
},
{
"epoch": 3.4666160849772383,
"grad_norm": 0.7594588994979858,
"learning_rate": 2.0134142046502112e-06,
"loss": 1.1413,
"step": 1525
},
{
"epoch": 3.4688922610015176,
"grad_norm": 0.7960179448127747,
"learning_rate": 2.0103744131980906e-06,
"loss": 1.1916,
"step": 1526
},
{
"epoch": 3.4711684370257965,
"grad_norm": 0.7636160850524902,
"learning_rate": 2.007335374322331e-06,
"loss": 1.1859,
"step": 1527
},
{
"epoch": 3.473444613050076,
"grad_norm": 0.8009867072105408,
"learning_rate": 2.004297092694073e-06,
"loss": 1.1821,
"step": 1528
},
{
"epoch": 3.475720789074355,
"grad_norm": 0.7843029499053955,
"learning_rate": 2.001259572983291e-06,
"loss": 1.1519,
"step": 1529
},
{
"epoch": 3.4779969650986344,
"grad_norm": 0.7983155250549316,
"learning_rate": 1.998222819858787e-06,
"loss": 1.1777,
"step": 1530
},
{
"epoch": 3.4802731411229137,
"grad_norm": 0.7752939462661743,
"learning_rate": 1.995186837988187e-06,
"loss": 1.176,
"step": 1531
},
{
"epoch": 3.4825493171471926,
"grad_norm": 0.7518580555915833,
"learning_rate": 1.9921516320379306e-06,
"loss": 1.1867,
"step": 1532
},
{
"epoch": 3.484825493171472,
"grad_norm": 0.7821219563484192,
"learning_rate": 1.989117206673264e-06,
"loss": 1.1544,
"step": 1533
},
{
"epoch": 3.4871016691957513,
"grad_norm": 0.7886951565742493,
"learning_rate": 1.9860835665582343e-06,
"loss": 1.1812,
"step": 1534
},
{
"epoch": 3.48937784522003,
"grad_norm": 0.7971673607826233,
"learning_rate": 1.9830507163556815e-06,
"loss": 1.1644,
"step": 1535
},
{
"epoch": 3.4916540212443095,
"grad_norm": 0.7841876149177551,
"learning_rate": 1.9800186607272333e-06,
"loss": 1.1841,
"step": 1536
},
{
"epoch": 3.4939301972685888,
"grad_norm": 0.760624349117279,
"learning_rate": 1.9769874043332934e-06,
"loss": 1.1542,
"step": 1537
},
{
"epoch": 3.496206373292868,
"grad_norm": 0.8207852244377136,
"learning_rate": 1.9739569518330364e-06,
"loss": 1.2127,
"step": 1538
},
{
"epoch": 3.4984825493171474,
"grad_norm": 0.7697487473487854,
"learning_rate": 1.9709273078844054e-06,
"loss": 1.1335,
"step": 1539
},
{
"epoch": 3.5007587253414263,
"grad_norm": 0.7819302082061768,
"learning_rate": 1.9678984771440974e-06,
"loss": 1.1826,
"step": 1540
},
{
"epoch": 3.5030349013657056,
"grad_norm": 0.818231999874115,
"learning_rate": 1.96487046426756e-06,
"loss": 1.1659,
"step": 1541
},
{
"epoch": 3.505311077389985,
"grad_norm": 0.7885006070137024,
"learning_rate": 1.9618432739089843e-06,
"loss": 1.1239,
"step": 1542
},
{
"epoch": 3.507587253414264,
"grad_norm": 0.7705325484275818,
"learning_rate": 1.9588169107212968e-06,
"loss": 1.1393,
"step": 1543
},
{
"epoch": 3.509863429438543,
"grad_norm": 0.7711304426193237,
"learning_rate": 1.955791379356152e-06,
"loss": 1.1612,
"step": 1544
},
{
"epoch": 3.5121396054628224,
"grad_norm": 0.7557392716407776,
"learning_rate": 1.952766684463929e-06,
"loss": 1.184,
"step": 1545
},
{
"epoch": 3.5144157814871018,
"grad_norm": 0.7863343358039856,
"learning_rate": 1.9497428306937148e-06,
"loss": 1.1757,
"step": 1546
},
{
"epoch": 3.516691957511381,
"grad_norm": 0.7670086622238159,
"learning_rate": 1.946719822693311e-06,
"loss": 1.192,
"step": 1547
},
{
"epoch": 3.51896813353566,
"grad_norm": 0.7611085176467896,
"learning_rate": 1.9436976651092143e-06,
"loss": 1.1697,
"step": 1548
},
{
"epoch": 3.5212443095599393,
"grad_norm": 0.7767881155014038,
"learning_rate": 1.9406763625866155e-06,
"loss": 1.1844,
"step": 1549
},
{
"epoch": 3.5235204855842186,
"grad_norm": 0.792441189289093,
"learning_rate": 1.937655919769392e-06,
"loss": 1.1898,
"step": 1550
},
{
"epoch": 3.5257966616084975,
"grad_norm": 0.7552328705787659,
"learning_rate": 1.9346363413000988e-06,
"loss": 1.1162,
"step": 1551
},
{
"epoch": 3.528072837632777,
"grad_norm": 0.7915894389152527,
"learning_rate": 1.931617631819962e-06,
"loss": 1.1508,
"step": 1552
},
{
"epoch": 3.530349013657056,
"grad_norm": 0.7816686034202576,
"learning_rate": 1.9285997959688742e-06,
"loss": 1.1886,
"step": 1553
},
{
"epoch": 3.5326251896813354,
"grad_norm": 0.7791935205459595,
"learning_rate": 1.9255828383853822e-06,
"loss": 1.1651,
"step": 1554
},
{
"epoch": 3.5349013657056148,
"grad_norm": 0.7524689435958862,
"learning_rate": 1.9225667637066845e-06,
"loss": 1.1455,
"step": 1555
},
{
"epoch": 3.5371775417298936,
"grad_norm": 0.7957069873809814,
"learning_rate": 1.9195515765686237e-06,
"loss": 1.1811,
"step": 1556
},
{
"epoch": 3.539453717754173,
"grad_norm": 0.7892540693283081,
"learning_rate": 1.916537281605675e-06,
"loss": 1.1673,
"step": 1557
},
{
"epoch": 3.5417298937784523,
"grad_norm": 0.7897363305091858,
"learning_rate": 1.913523883450946e-06,
"loss": 1.2002,
"step": 1558
},
{
"epoch": 3.544006069802731,
"grad_norm": 0.7709572315216064,
"learning_rate": 1.9105113867361633e-06,
"loss": 1.1636,
"step": 1559
},
{
"epoch": 3.5462822458270105,
"grad_norm": 0.8005422353744507,
"learning_rate": 1.907499796091668e-06,
"loss": 1.1738,
"step": 1560
},
{
"epoch": 3.54855842185129,
"grad_norm": 0.803570032119751,
"learning_rate": 1.9044891161464108e-06,
"loss": 1.1728,
"step": 1561
},
{
"epoch": 3.550834597875569,
"grad_norm": 0.8024869561195374,
"learning_rate": 1.90147935152794e-06,
"loss": 1.1666,
"step": 1562
},
{
"epoch": 3.5531107738998484,
"grad_norm": 0.7676922082901001,
"learning_rate": 1.8984705068623976e-06,
"loss": 1.1665,
"step": 1563
},
{
"epoch": 3.5553869499241273,
"grad_norm": 0.7918079495429993,
"learning_rate": 1.895462586774513e-06,
"loss": 1.162,
"step": 1564
},
{
"epoch": 3.5576631259484066,
"grad_norm": 0.7807730436325073,
"learning_rate": 1.8924555958875923e-06,
"loss": 1.1971,
"step": 1565
},
{
"epoch": 3.559939301972686,
"grad_norm": 0.7498663067817688,
"learning_rate": 1.8894495388235165e-06,
"loss": 1.1726,
"step": 1566
},
{
"epoch": 3.5622154779969653,
"grad_norm": 0.7693403959274292,
"learning_rate": 1.8864444202027287e-06,
"loss": 1.1912,
"step": 1567
},
{
"epoch": 3.5644916540212446,
"grad_norm": 0.7785525918006897,
"learning_rate": 1.8834402446442284e-06,
"loss": 1.1865,
"step": 1568
},
{
"epoch": 3.5667678300455234,
"grad_norm": 0.7578516006469727,
"learning_rate": 1.88043701676557e-06,
"loss": 1.1451,
"step": 1569
},
{
"epoch": 3.5690440060698028,
"grad_norm": 0.7615971565246582,
"learning_rate": 1.8774347411828472e-06,
"loss": 1.2075,
"step": 1570
},
{
"epoch": 3.571320182094082,
"grad_norm": 0.7744317054748535,
"learning_rate": 1.8744334225106917e-06,
"loss": 1.2109,
"step": 1571
},
{
"epoch": 3.573596358118361,
"grad_norm": 0.749718427658081,
"learning_rate": 1.8714330653622645e-06,
"loss": 1.163,
"step": 1572
},
{
"epoch": 3.5758725341426403,
"grad_norm": 0.7745640873908997,
"learning_rate": 1.8684336743492481e-06,
"loss": 1.1909,
"step": 1573
},
{
"epoch": 3.5781487101669196,
"grad_norm": 0.7815781235694885,
"learning_rate": 1.8654352540818398e-06,
"loss": 1.225,
"step": 1574
},
{
"epoch": 3.580424886191199,
"grad_norm": 0.7583136558532715,
"learning_rate": 1.862437809168746e-06,
"loss": 1.1589,
"step": 1575
},
{
"epoch": 3.5827010622154782,
"grad_norm": 0.7747395038604736,
"learning_rate": 1.8594413442171722e-06,
"loss": 1.1941,
"step": 1576
},
{
"epoch": 3.584977238239757,
"grad_norm": 0.7689574956893921,
"learning_rate": 1.8564458638328203e-06,
"loss": 1.1382,
"step": 1577
},
{
"epoch": 3.5872534142640364,
"grad_norm": 0.7516262531280518,
"learning_rate": 1.8534513726198773e-06,
"loss": 1.1587,
"step": 1578
},
{
"epoch": 3.5895295902883158,
"grad_norm": 0.7683000564575195,
"learning_rate": 1.8504578751810066e-06,
"loss": 1.211,
"step": 1579
},
{
"epoch": 3.5918057663125946,
"grad_norm": 0.7480950951576233,
"learning_rate": 1.8474653761173506e-06,
"loss": 1.1418,
"step": 1580
},
{
"epoch": 3.594081942336874,
"grad_norm": 0.7797414064407349,
"learning_rate": 1.8444738800285128e-06,
"loss": 1.1673,
"step": 1581
},
{
"epoch": 3.5963581183611533,
"grad_norm": 0.7567870020866394,
"learning_rate": 1.8414833915125554e-06,
"loss": 1.1997,
"step": 1582
},
{
"epoch": 3.5986342943854326,
"grad_norm": 0.7724438905715942,
"learning_rate": 1.8384939151659936e-06,
"loss": 1.2075,
"step": 1583
},
{
"epoch": 3.600910470409712,
"grad_norm": 0.7550652623176575,
"learning_rate": 1.835505455583786e-06,
"loss": 1.1633,
"step": 1584
},
{
"epoch": 3.603186646433991,
"grad_norm": 0.7587825059890747,
"learning_rate": 1.8325180173593265e-06,
"loss": 1.1306,
"step": 1585
},
{
"epoch": 3.60546282245827,
"grad_norm": 0.76627117395401,
"learning_rate": 1.8295316050844428e-06,
"loss": 1.164,
"step": 1586
},
{
"epoch": 3.6077389984825494,
"grad_norm": 0.7493066191673279,
"learning_rate": 1.8265462233493819e-06,
"loss": 1.1746,
"step": 1587
},
{
"epoch": 3.6100151745068283,
"grad_norm": 0.7724924683570862,
"learning_rate": 1.823561876742811e-06,
"loss": 1.1918,
"step": 1588
},
{
"epoch": 3.6122913505311076,
"grad_norm": 0.7840549945831299,
"learning_rate": 1.8205785698518024e-06,
"loss": 1.1779,
"step": 1589
},
{
"epoch": 3.614567526555387,
"grad_norm": 0.7741526961326599,
"learning_rate": 1.817596307261832e-06,
"loss": 1.1731,
"step": 1590
},
{
"epoch": 3.6168437025796663,
"grad_norm": 0.7666813135147095,
"learning_rate": 1.8146150935567712e-06,
"loss": 1.2023,
"step": 1591
},
{
"epoch": 3.6191198786039456,
"grad_norm": 0.7450817227363586,
"learning_rate": 1.8116349333188775e-06,
"loss": 1.1614,
"step": 1592
},
{
"epoch": 3.6213960546282244,
"grad_norm": 0.7609484195709229,
"learning_rate": 1.80865583112879e-06,
"loss": 1.1923,
"step": 1593
},
{
"epoch": 3.6236722306525038,
"grad_norm": 0.7363880276679993,
"learning_rate": 1.8056777915655223e-06,
"loss": 1.1198,
"step": 1594
},
{
"epoch": 3.625948406676783,
"grad_norm": 0.7470148801803589,
"learning_rate": 1.8027008192064537e-06,
"loss": 1.1398,
"step": 1595
},
{
"epoch": 3.628224582701062,
"grad_norm": 0.7653167247772217,
"learning_rate": 1.7997249186273233e-06,
"loss": 1.1567,
"step": 1596
},
{
"epoch": 3.6305007587253413,
"grad_norm": 0.7713541984558105,
"learning_rate": 1.7967500944022237e-06,
"loss": 1.1461,
"step": 1597
},
{
"epoch": 3.6327769347496206,
"grad_norm": 0.7798824310302734,
"learning_rate": 1.7937763511035904e-06,
"loss": 1.1613,
"step": 1598
},
{
"epoch": 3.6350531107739,
"grad_norm": 0.7728201746940613,
"learning_rate": 1.7908036933022027e-06,
"loss": 1.1569,
"step": 1599
},
{
"epoch": 3.6373292867981792,
"grad_norm": 0.7966005802154541,
"learning_rate": 1.787832125567166e-06,
"loss": 1.1902,
"step": 1600
},
{
"epoch": 3.639605462822458,
"grad_norm": 0.7702760696411133,
"learning_rate": 1.7848616524659125e-06,
"loss": 1.1803,
"step": 1601
},
{
"epoch": 3.6418816388467374,
"grad_norm": 0.750785768032074,
"learning_rate": 1.781892278564193e-06,
"loss": 1.1602,
"step": 1602
},
{
"epoch": 3.6441578148710168,
"grad_norm": 0.777713418006897,
"learning_rate": 1.7789240084260668e-06,
"loss": 1.164,
"step": 1603
},
{
"epoch": 3.6464339908952956,
"grad_norm": 0.7839711904525757,
"learning_rate": 1.7759568466138966e-06,
"loss": 1.1698,
"step": 1604
},
{
"epoch": 3.648710166919575,
"grad_norm": 0.7893310189247131,
"learning_rate": 1.7729907976883443e-06,
"loss": 1.1634,
"step": 1605
},
{
"epoch": 3.6509863429438543,
"grad_norm": 0.7450976967811584,
"learning_rate": 1.7700258662083574e-06,
"loss": 1.1598,
"step": 1606
},
{
"epoch": 3.6532625189681336,
"grad_norm": 0.7937871217727661,
"learning_rate": 1.7670620567311696e-06,
"loss": 1.1632,
"step": 1607
},
{
"epoch": 3.655538694992413,
"grad_norm": 0.7774436473846436,
"learning_rate": 1.7640993738122886e-06,
"loss": 1.1634,
"step": 1608
},
{
"epoch": 3.657814871016692,
"grad_norm": 0.7838451266288757,
"learning_rate": 1.761137822005487e-06,
"loss": 1.1642,
"step": 1609
},
{
"epoch": 3.660091047040971,
"grad_norm": 0.7670826315879822,
"learning_rate": 1.7581774058628054e-06,
"loss": 1.1602,
"step": 1610
},
{
"epoch": 3.6623672230652504,
"grad_norm": 0.7767178416252136,
"learning_rate": 1.755218129934534e-06,
"loss": 1.154,
"step": 1611
},
{
"epoch": 3.6646433990895297,
"grad_norm": 0.8149465322494507,
"learning_rate": 1.7522599987692113e-06,
"loss": 1.2108,
"step": 1612
},
{
"epoch": 3.666919575113809,
"grad_norm": 0.780608594417572,
"learning_rate": 1.7493030169136183e-06,
"loss": 1.1816,
"step": 1613
},
{
"epoch": 3.669195751138088,
"grad_norm": 0.7886972427368164,
"learning_rate": 1.7463471889127673e-06,
"loss": 1.177,
"step": 1614
},
{
"epoch": 3.6714719271623673,
"grad_norm": 0.8002546429634094,
"learning_rate": 1.743392519309897e-06,
"loss": 1.2167,
"step": 1615
},
{
"epoch": 3.6737481031866466,
"grad_norm": 0.8180500864982605,
"learning_rate": 1.7404390126464676e-06,
"loss": 1.1524,
"step": 1616
},
{
"epoch": 3.6760242792109254,
"grad_norm": 0.7811232209205627,
"learning_rate": 1.7374866734621487e-06,
"loss": 1.1734,
"step": 1617
},
{
"epoch": 3.6783004552352048,
"grad_norm": 0.7839295268058777,
"learning_rate": 1.73453550629482e-06,
"loss": 1.2271,
"step": 1618
},
{
"epoch": 3.680576631259484,
"grad_norm": 0.7847591042518616,
"learning_rate": 1.7315855156805558e-06,
"loss": 1.1878,
"step": 1619
},
{
"epoch": 3.6828528072837634,
"grad_norm": 0.77289217710495,
"learning_rate": 1.7286367061536215e-06,
"loss": 1.1785,
"step": 1620
},
{
"epoch": 3.6851289833080427,
"grad_norm": 0.7843948602676392,
"learning_rate": 1.7256890822464716e-06,
"loss": 1.1834,
"step": 1621
},
{
"epoch": 3.6874051593323216,
"grad_norm": 0.8314946889877319,
"learning_rate": 1.7227426484897345e-06,
"loss": 1.2082,
"step": 1622
},
{
"epoch": 3.689681335356601,
"grad_norm": 0.7974013686180115,
"learning_rate": 1.7197974094122095e-06,
"loss": 1.139,
"step": 1623
},
{
"epoch": 3.6919575113808802,
"grad_norm": 0.7534367442131042,
"learning_rate": 1.7168533695408612e-06,
"loss": 1.1726,
"step": 1624
},
{
"epoch": 3.694233687405159,
"grad_norm": 0.7677541971206665,
"learning_rate": 1.7139105334008099e-06,
"loss": 1.1262,
"step": 1625
},
{
"epoch": 3.6965098634294384,
"grad_norm": 0.7810897827148438,
"learning_rate": 1.7109689055153261e-06,
"loss": 1.1605,
"step": 1626
},
{
"epoch": 3.6987860394537178,
"grad_norm": 0.8533305525779724,
"learning_rate": 1.708028490405823e-06,
"loss": 1.1913,
"step": 1627
},
{
"epoch": 3.701062215477997,
"grad_norm": 0.80948805809021,
"learning_rate": 1.7050892925918491e-06,
"loss": 1.1917,
"step": 1628
},
{
"epoch": 3.7033383915022764,
"grad_norm": 0.7783576846122742,
"learning_rate": 1.7021513165910841e-06,
"loss": 1.215,
"step": 1629
},
{
"epoch": 3.7056145675265553,
"grad_norm": 0.7968083024024963,
"learning_rate": 1.699214566919327e-06,
"loss": 1.2006,
"step": 1630
},
{
"epoch": 3.7078907435508346,
"grad_norm": 0.7826852798461914,
"learning_rate": 1.6962790480904934e-06,
"loss": 1.1686,
"step": 1631
},
{
"epoch": 3.710166919575114,
"grad_norm": 0.7661287784576416,
"learning_rate": 1.6933447646166069e-06,
"loss": 1.149,
"step": 1632
},
{
"epoch": 3.712443095599393,
"grad_norm": 0.8017462491989136,
"learning_rate": 1.690411721007791e-06,
"loss": 1.1997,
"step": 1633
},
{
"epoch": 3.714719271623672,
"grad_norm": 0.78822261095047,
"learning_rate": 1.6874799217722652e-06,
"loss": 1.1678,
"step": 1634
},
{
"epoch": 3.7169954476479514,
"grad_norm": 0.8100883364677429,
"learning_rate": 1.6845493714163361e-06,
"loss": 1.1477,
"step": 1635
},
{
"epoch": 3.7192716236722307,
"grad_norm": 0.7745562195777893,
"learning_rate": 1.681620074444389e-06,
"loss": 1.1496,
"step": 1636
},
{
"epoch": 3.72154779969651,
"grad_norm": 0.7901548743247986,
"learning_rate": 1.6786920353588859e-06,
"loss": 1.1845,
"step": 1637
},
{
"epoch": 3.723823975720789,
"grad_norm": 0.8251738548278809,
"learning_rate": 1.6757652586603523e-06,
"loss": 1.1844,
"step": 1638
},
{
"epoch": 3.7261001517450683,
"grad_norm": 0.7896043062210083,
"learning_rate": 1.6728397488473733e-06,
"loss": 1.2202,
"step": 1639
},
{
"epoch": 3.7283763277693476,
"grad_norm": 0.7613170742988586,
"learning_rate": 1.6699155104165903e-06,
"loss": 1.2186,
"step": 1640
},
{
"epoch": 3.7306525037936265,
"grad_norm": 0.7568488717079163,
"learning_rate": 1.6669925478626874e-06,
"loss": 1.18,
"step": 1641
},
{
"epoch": 3.7329286798179058,
"grad_norm": 0.7813270688056946,
"learning_rate": 1.6640708656783878e-06,
"loss": 1.1845,
"step": 1642
},
{
"epoch": 3.735204855842185,
"grad_norm": 0.800815761089325,
"learning_rate": 1.6611504683544477e-06,
"loss": 1.131,
"step": 1643
},
{
"epoch": 3.7374810318664644,
"grad_norm": 0.748050332069397,
"learning_rate": 1.6582313603796485e-06,
"loss": 1.1603,
"step": 1644
},
{
"epoch": 3.7397572078907437,
"grad_norm": 0.7783641219139099,
"learning_rate": 1.6553135462407876e-06,
"loss": 1.1516,
"step": 1645
},
{
"epoch": 3.7420333839150226,
"grad_norm": 0.7908198237419128,
"learning_rate": 1.6523970304226778e-06,
"loss": 1.2066,
"step": 1646
},
{
"epoch": 3.744309559939302,
"grad_norm": 0.7437866926193237,
"learning_rate": 1.6494818174081317e-06,
"loss": 1.1779,
"step": 1647
},
{
"epoch": 3.7465857359635812,
"grad_norm": 0.7539526224136353,
"learning_rate": 1.646567911677964e-06,
"loss": 1.2012,
"step": 1648
},
{
"epoch": 3.74886191198786,
"grad_norm": 0.7456071972846985,
"learning_rate": 1.643655317710977e-06,
"loss": 1.1731,
"step": 1649
},
{
"epoch": 3.75113808801214,
"grad_norm": 0.771332323551178,
"learning_rate": 1.6407440399839558e-06,
"loss": 1.1825,
"step": 1650
},
{
"epoch": 3.7534142640364188,
"grad_norm": 0.8010808825492859,
"learning_rate": 1.6378340829716662e-06,
"loss": 1.2054,
"step": 1651
},
{
"epoch": 3.755690440060698,
"grad_norm": 0.7861669063568115,
"learning_rate": 1.6349254511468415e-06,
"loss": 1.159,
"step": 1652
},
{
"epoch": 3.7579666160849774,
"grad_norm": 0.777998149394989,
"learning_rate": 1.6320181489801774e-06,
"loss": 1.2102,
"step": 1653
},
{
"epoch": 3.7602427921092563,
"grad_norm": 0.7587355375289917,
"learning_rate": 1.6291121809403287e-06,
"loss": 1.1602,
"step": 1654
},
{
"epoch": 3.7625189681335356,
"grad_norm": 0.8161150217056274,
"learning_rate": 1.6262075514938966e-06,
"loss": 1.1934,
"step": 1655
},
{
"epoch": 3.764795144157815,
"grad_norm": 0.7970815896987915,
"learning_rate": 1.6233042651054258e-06,
"loss": 1.1995,
"step": 1656
},
{
"epoch": 3.7670713201820942,
"grad_norm": 0.7849256992340088,
"learning_rate": 1.6204023262373985e-06,
"loss": 1.1368,
"step": 1657
},
{
"epoch": 3.7693474962063735,
"grad_norm": 0.7707874774932861,
"learning_rate": 1.6175017393502223e-06,
"loss": 1.1544,
"step": 1658
},
{
"epoch": 3.7716236722306524,
"grad_norm": 0.7858372330665588,
"learning_rate": 1.6146025089022304e-06,
"loss": 1.2052,
"step": 1659
},
{
"epoch": 3.7738998482549317,
"grad_norm": 0.7694425582885742,
"learning_rate": 1.6117046393496685e-06,
"loss": 1.189,
"step": 1660
},
{
"epoch": 3.776176024279211,
"grad_norm": 0.7819948196411133,
"learning_rate": 1.6088081351466908e-06,
"loss": 1.1476,
"step": 1661
},
{
"epoch": 3.77845220030349,
"grad_norm": 0.7882603406906128,
"learning_rate": 1.6059130007453544e-06,
"loss": 1.165,
"step": 1662
},
{
"epoch": 3.7807283763277693,
"grad_norm": 0.7949514389038086,
"learning_rate": 1.6030192405956097e-06,
"loss": 1.2037,
"step": 1663
},
{
"epoch": 3.7830045523520486,
"grad_norm": 0.7628163695335388,
"learning_rate": 1.6001268591452946e-06,
"loss": 1.1531,
"step": 1664
},
{
"epoch": 3.785280728376328,
"grad_norm": 0.7776125073432922,
"learning_rate": 1.5972358608401286e-06,
"loss": 1.145,
"step": 1665
},
{
"epoch": 3.787556904400607,
"grad_norm": 0.7946699857711792,
"learning_rate": 1.5943462501237055e-06,
"loss": 1.1877,
"step": 1666
},
{
"epoch": 3.789833080424886,
"grad_norm": 0.7983624339103699,
"learning_rate": 1.591458031437485e-06,
"loss": 1.1365,
"step": 1667
},
{
"epoch": 3.7921092564491654,
"grad_norm": 0.7676346302032471,
"learning_rate": 1.588571209220789e-06,
"loss": 1.1635,
"step": 1668
},
{
"epoch": 3.7943854324734447,
"grad_norm": 0.7794129848480225,
"learning_rate": 1.5856857879107907e-06,
"loss": 1.172,
"step": 1669
},
{
"epoch": 3.7966616084977236,
"grad_norm": 0.7796185612678528,
"learning_rate": 1.5828017719425131e-06,
"loss": 1.1559,
"step": 1670
},
{
"epoch": 3.798937784522003,
"grad_norm": 0.7712554931640625,
"learning_rate": 1.5799191657488162e-06,
"loss": 1.1982,
"step": 1671
},
{
"epoch": 3.8012139605462822,
"grad_norm": 0.7830713987350464,
"learning_rate": 1.577037973760394e-06,
"loss": 1.1714,
"step": 1672
},
{
"epoch": 3.8034901365705616,
"grad_norm": 0.7980160713195801,
"learning_rate": 1.5741582004057683e-06,
"loss": 1.154,
"step": 1673
},
{
"epoch": 3.805766312594841,
"grad_norm": 0.7877330183982849,
"learning_rate": 1.571279850111278e-06,
"loss": 1.1599,
"step": 1674
},
{
"epoch": 3.8080424886191198,
"grad_norm": 0.7914711833000183,
"learning_rate": 1.5684029273010762e-06,
"loss": 1.1495,
"step": 1675
},
{
"epoch": 3.810318664643399,
"grad_norm": 0.7610024213790894,
"learning_rate": 1.5655274363971222e-06,
"loss": 1.1686,
"step": 1676
},
{
"epoch": 3.8125948406676784,
"grad_norm": 0.7735294103622437,
"learning_rate": 1.562653381819172e-06,
"loss": 1.1742,
"step": 1677
},
{
"epoch": 3.8148710166919573,
"grad_norm": 0.7893885374069214,
"learning_rate": 1.5597807679847782e-06,
"loss": 1.1584,
"step": 1678
},
{
"epoch": 3.8171471927162366,
"grad_norm": 0.7622188925743103,
"learning_rate": 1.5569095993092747e-06,
"loss": 1.1389,
"step": 1679
},
{
"epoch": 3.819423368740516,
"grad_norm": 0.80954509973526,
"learning_rate": 1.5540398802057755e-06,
"loss": 1.1898,
"step": 1680
},
{
"epoch": 3.8216995447647952,
"grad_norm": 0.7971329689025879,
"learning_rate": 1.551171615085168e-06,
"loss": 1.1567,
"step": 1681
},
{
"epoch": 3.8239757207890746,
"grad_norm": 0.8103090524673462,
"learning_rate": 1.5483048083561036e-06,
"loss": 1.1744,
"step": 1682
},
{
"epoch": 3.8262518968133534,
"grad_norm": 0.7729601860046387,
"learning_rate": 1.545439464424991e-06,
"loss": 1.1338,
"step": 1683
},
{
"epoch": 3.8285280728376327,
"grad_norm": 0.8032084107398987,
"learning_rate": 1.5425755876959925e-06,
"loss": 1.1474,
"step": 1684
},
{
"epoch": 3.830804248861912,
"grad_norm": 0.776904284954071,
"learning_rate": 1.5397131825710137e-06,
"loss": 1.1574,
"step": 1685
},
{
"epoch": 3.833080424886191,
"grad_norm": 0.8134787678718567,
"learning_rate": 1.5368522534496993e-06,
"loss": 1.2379,
"step": 1686
},
{
"epoch": 3.8353566009104703,
"grad_norm": 0.7977766394615173,
"learning_rate": 1.5339928047294256e-06,
"loss": 1.1946,
"step": 1687
},
{
"epoch": 3.8376327769347496,
"grad_norm": 0.7849652767181396,
"learning_rate": 1.5311348408052905e-06,
"loss": 1.154,
"step": 1688
},
{
"epoch": 3.839908952959029,
"grad_norm": 0.8153653740882874,
"learning_rate": 1.5282783660701154e-06,
"loss": 1.1685,
"step": 1689
},
{
"epoch": 3.842185128983308,
"grad_norm": 0.794487714767456,
"learning_rate": 1.525423384914428e-06,
"loss": 1.1452,
"step": 1690
},
{
"epoch": 3.844461305007587,
"grad_norm": 0.7951854467391968,
"learning_rate": 1.522569901726459e-06,
"loss": 1.1637,
"step": 1691
},
{
"epoch": 3.8467374810318664,
"grad_norm": 0.8048427104949951,
"learning_rate": 1.5197179208921434e-06,
"loss": 1.1898,
"step": 1692
},
{
"epoch": 3.8490136570561457,
"grad_norm": 0.8001375794410706,
"learning_rate": 1.5168674467951008e-06,
"loss": 1.1973,
"step": 1693
},
{
"epoch": 3.851289833080425,
"grad_norm": 0.7969352006912231,
"learning_rate": 1.5140184838166368e-06,
"loss": 1.1641,
"step": 1694
},
{
"epoch": 3.8535660091047044,
"grad_norm": 0.7886945605278015,
"learning_rate": 1.5111710363357357e-06,
"loss": 1.1504,
"step": 1695
},
{
"epoch": 3.8558421851289832,
"grad_norm": 0.8164499402046204,
"learning_rate": 1.5083251087290506e-06,
"loss": 1.1686,
"step": 1696
},
{
"epoch": 3.8581183611532626,
"grad_norm": 0.7746224999427795,
"learning_rate": 1.5054807053708988e-06,
"loss": 1.1458,
"step": 1697
},
{
"epoch": 3.860394537177542,
"grad_norm": 0.8148301243782043,
"learning_rate": 1.5026378306332563e-06,
"loss": 1.2068,
"step": 1698
},
{
"epoch": 3.8626707132018208,
"grad_norm": 0.8069217801094055,
"learning_rate": 1.4997964888857457e-06,
"loss": 1.1743,
"step": 1699
},
{
"epoch": 3.8649468892261,
"grad_norm": 0.7751657962799072,
"learning_rate": 1.4969566844956397e-06,
"loss": 1.1802,
"step": 1700
},
{
"epoch": 3.8672230652503794,
"grad_norm": 0.797619104385376,
"learning_rate": 1.494118421827842e-06,
"loss": 1.1807,
"step": 1701
},
{
"epoch": 3.8694992412746587,
"grad_norm": 0.784611165523529,
"learning_rate": 1.4912817052448891e-06,
"loss": 1.1702,
"step": 1702
},
{
"epoch": 3.871775417298938,
"grad_norm": 0.784424901008606,
"learning_rate": 1.4884465391069415e-06,
"loss": 1.1321,
"step": 1703
},
{
"epoch": 3.874051593323217,
"grad_norm": 0.7754137516021729,
"learning_rate": 1.4856129277717758e-06,
"loss": 1.1553,
"step": 1704
},
{
"epoch": 3.8763277693474962,
"grad_norm": 0.7742170691490173,
"learning_rate": 1.482780875594778e-06,
"loss": 1.1627,
"step": 1705
},
{
"epoch": 3.8786039453717756,
"grad_norm": 0.7944441437721252,
"learning_rate": 1.4799503869289402e-06,
"loss": 1.1726,
"step": 1706
},
{
"epoch": 3.8808801213960544,
"grad_norm": 0.7838897109031677,
"learning_rate": 1.477121466124848e-06,
"loss": 1.1731,
"step": 1707
},
{
"epoch": 3.8831562974203337,
"grad_norm": 0.7788136005401611,
"learning_rate": 1.474294117530681e-06,
"loss": 1.1877,
"step": 1708
},
{
"epoch": 3.885432473444613,
"grad_norm": 0.7596397399902344,
"learning_rate": 1.4714683454921986e-06,
"loss": 1.1627,
"step": 1709
},
{
"epoch": 3.8877086494688924,
"grad_norm": 0.7933751940727234,
"learning_rate": 1.4686441543527374e-06,
"loss": 1.1785,
"step": 1710
},
{
"epoch": 3.8899848254931717,
"grad_norm": 0.7714102268218994,
"learning_rate": 1.465821548453208e-06,
"loss": 1.1882,
"step": 1711
},
{
"epoch": 3.8922610015174506,
"grad_norm": 0.7759813666343689,
"learning_rate": 1.4630005321320796e-06,
"loss": 1.1538,
"step": 1712
},
{
"epoch": 3.89453717754173,
"grad_norm": 0.757717490196228,
"learning_rate": 1.46018110972538e-06,
"loss": 1.1462,
"step": 1713
},
{
"epoch": 3.896813353566009,
"grad_norm": 0.7566017508506775,
"learning_rate": 1.4573632855666887e-06,
"loss": 1.1943,
"step": 1714
},
{
"epoch": 3.899089529590288,
"grad_norm": 0.784542441368103,
"learning_rate": 1.4545470639871256e-06,
"loss": 1.1897,
"step": 1715
},
{
"epoch": 3.9013657056145674,
"grad_norm": 0.7824509143829346,
"learning_rate": 1.4517324493153481e-06,
"loss": 1.1691,
"step": 1716
},
{
"epoch": 3.9036418816388467,
"grad_norm": 0.7722765207290649,
"learning_rate": 1.4489194458775468e-06,
"loss": 1.1754,
"step": 1717
},
{
"epoch": 3.905918057663126,
"grad_norm": 0.7892184853553772,
"learning_rate": 1.4461080579974316e-06,
"loss": 1.1507,
"step": 1718
},
{
"epoch": 3.9081942336874054,
"grad_norm": 0.7828143239021301,
"learning_rate": 1.4432982899962326e-06,
"loss": 1.1833,
"step": 1719
},
{
"epoch": 3.9104704097116842,
"grad_norm": 0.7912802696228027,
"learning_rate": 1.4404901461926873e-06,
"loss": 1.1668,
"step": 1720
},
{
"epoch": 3.9127465857359636,
"grad_norm": 0.8008295297622681,
"learning_rate": 1.437683630903039e-06,
"loss": 1.1395,
"step": 1721
},
{
"epoch": 3.915022761760243,
"grad_norm": 0.8199208974838257,
"learning_rate": 1.434878748441026e-06,
"loss": 1.1546,
"step": 1722
},
{
"epoch": 3.9172989377845218,
"grad_norm": 0.7762032151222229,
"learning_rate": 1.432075503117878e-06,
"loss": 1.158,
"step": 1723
},
{
"epoch": 3.919575113808801,
"grad_norm": 0.8160228133201599,
"learning_rate": 1.4292738992423066e-06,
"loss": 1.2023,
"step": 1724
},
{
"epoch": 3.9218512898330804,
"grad_norm": 0.7772383689880371,
"learning_rate": 1.4264739411205047e-06,
"loss": 1.1636,
"step": 1725
},
{
"epoch": 3.9241274658573597,
"grad_norm": 0.8001096248626709,
"learning_rate": 1.4236756330561319e-06,
"loss": 1.1817,
"step": 1726
},
{
"epoch": 3.926403641881639,
"grad_norm": 0.7925685048103333,
"learning_rate": 1.4208789793503103e-06,
"loss": 1.1283,
"step": 1727
},
{
"epoch": 3.928679817905918,
"grad_norm": 0.783243715763092,
"learning_rate": 1.4180839843016246e-06,
"loss": 1.1621,
"step": 1728
},
{
"epoch": 3.9309559939301972,
"grad_norm": 0.8283969759941101,
"learning_rate": 1.415290652206105e-06,
"loss": 1.2235,
"step": 1729
},
{
"epoch": 3.9332321699544766,
"grad_norm": 0.7659640312194824,
"learning_rate": 1.4124989873572282e-06,
"loss": 1.146,
"step": 1730
},
{
"epoch": 3.9355083459787554,
"grad_norm": 0.7988297343254089,
"learning_rate": 1.409708994045907e-06,
"loss": 1.1957,
"step": 1731
},
{
"epoch": 3.9377845220030347,
"grad_norm": 0.7699180245399475,
"learning_rate": 1.4069206765604845e-06,
"loss": 1.1627,
"step": 1732
},
{
"epoch": 3.940060698027314,
"grad_norm": 0.7823548913002014,
"learning_rate": 1.4041340391867313e-06,
"loss": 1.2001,
"step": 1733
},
{
"epoch": 3.9423368740515934,
"grad_norm": 0.7833247780799866,
"learning_rate": 1.4013490862078327e-06,
"loss": 1.1744,
"step": 1734
},
{
"epoch": 3.9446130500758727,
"grad_norm": 0.7722859978675842,
"learning_rate": 1.3985658219043843e-06,
"loss": 1.1858,
"step": 1735
},
{
"epoch": 3.9468892261001516,
"grad_norm": 0.8108896613121033,
"learning_rate": 1.3957842505543893e-06,
"loss": 1.1314,
"step": 1736
},
{
"epoch": 3.949165402124431,
"grad_norm": 0.7728601694107056,
"learning_rate": 1.3930043764332457e-06,
"loss": 1.1643,
"step": 1737
},
{
"epoch": 3.95144157814871,
"grad_norm": 0.7797267436981201,
"learning_rate": 1.3902262038137449e-06,
"loss": 1.1802,
"step": 1738
},
{
"epoch": 3.9537177541729895,
"grad_norm": 0.7659248113632202,
"learning_rate": 1.387449736966061e-06,
"loss": 1.1851,
"step": 1739
},
{
"epoch": 3.955993930197269,
"grad_norm": 0.7742710709571838,
"learning_rate": 1.384674980157747e-06,
"loss": 1.1694,
"step": 1740
},
{
"epoch": 3.9582701062215477,
"grad_norm": 0.7799730896949768,
"learning_rate": 1.3819019376537299e-06,
"loss": 1.1809,
"step": 1741
},
{
"epoch": 3.960546282245827,
"grad_norm": 0.792921781539917,
"learning_rate": 1.3791306137162985e-06,
"loss": 1.1651,
"step": 1742
},
{
"epoch": 3.9628224582701064,
"grad_norm": 0.7535181641578674,
"learning_rate": 1.3763610126051014e-06,
"loss": 1.1209,
"step": 1743
},
{
"epoch": 3.9650986342943852,
"grad_norm": 0.7926993370056152,
"learning_rate": 1.3735931385771386e-06,
"loss": 1.1498,
"step": 1744
},
{
"epoch": 3.9673748103186646,
"grad_norm": 0.7826522588729858,
"learning_rate": 1.3708269958867565e-06,
"loss": 1.1504,
"step": 1745
},
{
"epoch": 3.969650986342944,
"grad_norm": 0.7804858088493347,
"learning_rate": 1.3680625887856386e-06,
"loss": 1.1618,
"step": 1746
},
{
"epoch": 3.971927162367223,
"grad_norm": 0.7874334454536438,
"learning_rate": 1.365299921522804e-06,
"loss": 1.1396,
"step": 1747
},
{
"epoch": 3.9742033383915025,
"grad_norm": 0.7938604950904846,
"learning_rate": 1.3625389983445932e-06,
"loss": 1.1497,
"step": 1748
},
{
"epoch": 3.9764795144157814,
"grad_norm": 0.8329913020133972,
"learning_rate": 1.3597798234946705e-06,
"loss": 1.1946,
"step": 1749
},
{
"epoch": 3.9787556904400607,
"grad_norm": 0.7830440998077393,
"learning_rate": 1.3570224012140096e-06,
"loss": 1.1734,
"step": 1750
},
{
"epoch": 3.98103186646434,
"grad_norm": 0.8128320574760437,
"learning_rate": 1.3542667357408915e-06,
"loss": 1.1875,
"step": 1751
},
{
"epoch": 3.983308042488619,
"grad_norm": 0.8153894543647766,
"learning_rate": 1.3515128313108966e-06,
"loss": 1.1496,
"step": 1752
},
{
"epoch": 3.9855842185128982,
"grad_norm": 0.7969558238983154,
"learning_rate": 1.3487606921568995e-06,
"loss": 1.1872,
"step": 1753
},
{
"epoch": 3.9878603945371776,
"grad_norm": 0.7873245477676392,
"learning_rate": 1.3460103225090599e-06,
"loss": 1.1803,
"step": 1754
},
{
"epoch": 3.990136570561457,
"grad_norm": 0.7957016825675964,
"learning_rate": 1.3432617265948196e-06,
"loss": 1.1586,
"step": 1755
},
{
"epoch": 3.992412746585736,
"grad_norm": 0.7677023410797119,
"learning_rate": 1.3405149086388928e-06,
"loss": 1.1406,
"step": 1756
},
{
"epoch": 3.994688922610015,
"grad_norm": 0.8339362144470215,
"learning_rate": 1.3377698728632599e-06,
"loss": 1.1411,
"step": 1757
},
{
"epoch": 3.9969650986342944,
"grad_norm": 0.7654924392700195,
"learning_rate": 1.335026623487166e-06,
"loss": 1.1947,
"step": 1758
},
{
"epoch": 3.9992412746585737,
"grad_norm": 0.8090951442718506,
"learning_rate": 1.3322851647271057e-06,
"loss": 1.1684,
"step": 1759
},
{
"epoch": 4.0,
"grad_norm": 0.8090951442718506,
"learning_rate": 1.3295455007968245e-06,
"loss": 1.1183,
"step": 1760
},
{
"epoch": 4.002276176024279,
"grad_norm": 1.8324748277664185,
"learning_rate": 1.3268076359073068e-06,
"loss": 1.1734,
"step": 1761
},
{
"epoch": 4.004552352048559,
"grad_norm": 0.7935696840286255,
"learning_rate": 1.3240715742667732e-06,
"loss": 1.1447,
"step": 1762
},
{
"epoch": 4.0068285280728375,
"grad_norm": 0.8030862212181091,
"learning_rate": 1.3213373200806738e-06,
"loss": 1.1581,
"step": 1763
},
{
"epoch": 4.009104704097117,
"grad_norm": 0.7820909023284912,
"learning_rate": 1.3186048775516782e-06,
"loss": 1.1474,
"step": 1764
},
{
"epoch": 4.011380880121396,
"grad_norm": 0.7807154059410095,
"learning_rate": 1.3158742508796718e-06,
"loss": 1.1618,
"step": 1765
},
{
"epoch": 4.013657056145675,
"grad_norm": 0.7716015577316284,
"learning_rate": 1.3131454442617523e-06,
"loss": 1.1246,
"step": 1766
},
{
"epoch": 4.015933232169955,
"grad_norm": 0.7621894478797913,
"learning_rate": 1.3104184618922142e-06,
"loss": 1.1691,
"step": 1767
},
{
"epoch": 4.018209408194234,
"grad_norm": 0.8327509164810181,
"learning_rate": 1.3076933079625508e-06,
"loss": 1.1942,
"step": 1768
},
{
"epoch": 4.0204855842185125,
"grad_norm": 0.7734883427619934,
"learning_rate": 1.304969986661448e-06,
"loss": 1.1896,
"step": 1769
},
{
"epoch": 4.022761760242792,
"grad_norm": 0.83384108543396,
"learning_rate": 1.3022485021747693e-06,
"loss": 1.1164,
"step": 1770
},
{
"epoch": 4.025037936267071,
"grad_norm": 0.805117130279541,
"learning_rate": 1.29952885868556e-06,
"loss": 1.1337,
"step": 1771
},
{
"epoch": 4.027314112291351,
"grad_norm": 0.7874476909637451,
"learning_rate": 1.2968110603740325e-06,
"loss": 1.1443,
"step": 1772
},
{
"epoch": 4.02959028831563,
"grad_norm": 0.8174652457237244,
"learning_rate": 1.2940951114175637e-06,
"loss": 1.1477,
"step": 1773
},
{
"epoch": 4.031866464339909,
"grad_norm": 0.7917712330818176,
"learning_rate": 1.291381015990689e-06,
"loss": 1.1864,
"step": 1774
},
{
"epoch": 4.0341426403641885,
"grad_norm": 0.7918281555175781,
"learning_rate": 1.288668778265093e-06,
"loss": 1.1196,
"step": 1775
},
{
"epoch": 4.036418816388467,
"grad_norm": 0.8245083093643188,
"learning_rate": 1.2859584024096062e-06,
"loss": 1.143,
"step": 1776
},
{
"epoch": 4.038694992412746,
"grad_norm": 0.8343380689620972,
"learning_rate": 1.2832498925901984e-06,
"loss": 1.2433,
"step": 1777
},
{
"epoch": 4.040971168437026,
"grad_norm": 0.7949922680854797,
"learning_rate": 1.2805432529699686e-06,
"loss": 1.1572,
"step": 1778
},
{
"epoch": 4.043247344461305,
"grad_norm": 0.8003636598587036,
"learning_rate": 1.2778384877091438e-06,
"loss": 1.1255,
"step": 1779
},
{
"epoch": 4.045523520485585,
"grad_norm": 0.8091865181922913,
"learning_rate": 1.275135600965068e-06,
"loss": 1.154,
"step": 1780
},
{
"epoch": 4.0477996965098635,
"grad_norm": 0.8288428783416748,
"learning_rate": 1.272434596892199e-06,
"loss": 1.1757,
"step": 1781
},
{
"epoch": 4.050075872534142,
"grad_norm": 0.8075243830680847,
"learning_rate": 1.2697354796421007e-06,
"loss": 1.1537,
"step": 1782
},
{
"epoch": 4.052352048558422,
"grad_norm": 0.8341973423957825,
"learning_rate": 1.2670382533634365e-06,
"loss": 1.1628,
"step": 1783
},
{
"epoch": 4.054628224582701,
"grad_norm": 0.8466330766677856,
"learning_rate": 1.2643429222019623e-06,
"loss": 1.1386,
"step": 1784
},
{
"epoch": 4.05690440060698,
"grad_norm": 0.782442033290863,
"learning_rate": 1.2616494903005244e-06,
"loss": 1.1272,
"step": 1785
},
{
"epoch": 4.05918057663126,
"grad_norm": 0.7994256615638733,
"learning_rate": 1.2589579617990466e-06,
"loss": 1.17,
"step": 1786
},
{
"epoch": 4.0614567526555385,
"grad_norm": 0.7817173004150391,
"learning_rate": 1.2562683408345279e-06,
"loss": 1.142,
"step": 1787
},
{
"epoch": 4.063732928679818,
"grad_norm": 0.8269613981246948,
"learning_rate": 1.2535806315410365e-06,
"loss": 1.1204,
"step": 1788
},
{
"epoch": 4.066009104704097,
"grad_norm": 0.8326630592346191,
"learning_rate": 1.2508948380497012e-06,
"loss": 1.1796,
"step": 1789
},
{
"epoch": 4.068285280728376,
"grad_norm": 0.8466435074806213,
"learning_rate": 1.2482109644887064e-06,
"loss": 1.0959,
"step": 1790
},
{
"epoch": 4.070561456752656,
"grad_norm": 0.794165313243866,
"learning_rate": 1.2455290149832856e-06,
"loss": 1.1375,
"step": 1791
},
{
"epoch": 4.072837632776935,
"grad_norm": 0.7998282313346863,
"learning_rate": 1.2428489936557131e-06,
"loss": 1.2033,
"step": 1792
},
{
"epoch": 4.0751138088012135,
"grad_norm": 0.7995575666427612,
"learning_rate": 1.2401709046253038e-06,
"loss": 1.1629,
"step": 1793
},
{
"epoch": 4.077389984825493,
"grad_norm": 0.8074122071266174,
"learning_rate": 1.237494752008399e-06,
"loss": 1.1649,
"step": 1794
},
{
"epoch": 4.079666160849772,
"grad_norm": 0.807697057723999,
"learning_rate": 1.2348205399183632e-06,
"loss": 1.1257,
"step": 1795
},
{
"epoch": 4.081942336874052,
"grad_norm": 0.8166428208351135,
"learning_rate": 1.232148272465583e-06,
"loss": 1.153,
"step": 1796
},
{
"epoch": 4.084218512898331,
"grad_norm": 0.8070620894432068,
"learning_rate": 1.2294779537574495e-06,
"loss": 1.1732,
"step": 1797
},
{
"epoch": 4.08649468892261,
"grad_norm": 0.8349140286445618,
"learning_rate": 1.2268095878983617e-06,
"loss": 1.1604,
"step": 1798
},
{
"epoch": 4.0887708649468895,
"grad_norm": 0.8050034642219543,
"learning_rate": 1.2241431789897188e-06,
"loss": 1.1658,
"step": 1799
},
{
"epoch": 4.091047040971168,
"grad_norm": 0.8552670478820801,
"learning_rate": 1.2214787311299085e-06,
"loss": 1.1373,
"step": 1800
},
{
"epoch": 4.093323216995447,
"grad_norm": 0.7918221950531006,
"learning_rate": 1.2188162484143077e-06,
"loss": 1.1648,
"step": 1801
},
{
"epoch": 4.095599393019727,
"grad_norm": 0.818851113319397,
"learning_rate": 1.21615573493527e-06,
"loss": 1.1468,
"step": 1802
},
{
"epoch": 4.097875569044006,
"grad_norm": 0.8211784362792969,
"learning_rate": 1.2134971947821236e-06,
"loss": 1.1566,
"step": 1803
},
{
"epoch": 4.100151745068286,
"grad_norm": 0.8089801073074341,
"learning_rate": 1.2108406320411632e-06,
"loss": 1.1235,
"step": 1804
},
{
"epoch": 4.1024279210925645,
"grad_norm": 0.7928494811058044,
"learning_rate": 1.2081860507956438e-06,
"loss": 1.1476,
"step": 1805
},
{
"epoch": 4.104704097116843,
"grad_norm": 0.8036773204803467,
"learning_rate": 1.2055334551257747e-06,
"loss": 1.1873,
"step": 1806
},
{
"epoch": 4.106980273141123,
"grad_norm": 0.7972615957260132,
"learning_rate": 1.2028828491087155e-06,
"loss": 1.1559,
"step": 1807
},
{
"epoch": 4.109256449165402,
"grad_norm": 0.8029186725616455,
"learning_rate": 1.2002342368185638e-06,
"loss": 1.1704,
"step": 1808
},
{
"epoch": 4.111532625189682,
"grad_norm": 0.8103779554367065,
"learning_rate": 1.1975876223263569e-06,
"loss": 1.185,
"step": 1809
},
{
"epoch": 4.113808801213961,
"grad_norm": 0.8010536432266235,
"learning_rate": 1.1949430097000584e-06,
"loss": 1.1596,
"step": 1810
},
{
"epoch": 4.1160849772382395,
"grad_norm": 0.7956396341323853,
"learning_rate": 1.1923004030045556e-06,
"loss": 1.1719,
"step": 1811
},
{
"epoch": 4.118361153262519,
"grad_norm": 0.8688467144966125,
"learning_rate": 1.1896598063016531e-06,
"loss": 1.1714,
"step": 1812
},
{
"epoch": 4.120637329286798,
"grad_norm": 0.7763445973396301,
"learning_rate": 1.1870212236500659e-06,
"loss": 1.1822,
"step": 1813
},
{
"epoch": 4.122913505311077,
"grad_norm": 0.8073698282241821,
"learning_rate": 1.1843846591054117e-06,
"loss": 1.2203,
"step": 1814
},
{
"epoch": 4.125189681335357,
"grad_norm": 0.7985076308250427,
"learning_rate": 1.18175011672021e-06,
"loss": 1.1586,
"step": 1815
},
{
"epoch": 4.127465857359636,
"grad_norm": 0.7932565212249756,
"learning_rate": 1.1791176005438692e-06,
"loss": 1.1562,
"step": 1816
},
{
"epoch": 4.129742033383915,
"grad_norm": 0.8024340271949768,
"learning_rate": 1.176487114622683e-06,
"loss": 1.1752,
"step": 1817
},
{
"epoch": 4.132018209408194,
"grad_norm": 0.8179346323013306,
"learning_rate": 1.1738586629998272e-06,
"loss": 1.1726,
"step": 1818
},
{
"epoch": 4.134294385432473,
"grad_norm": 0.8161062598228455,
"learning_rate": 1.1712322497153486e-06,
"loss": 1.1127,
"step": 1819
},
{
"epoch": 4.136570561456753,
"grad_norm": 0.7926722168922424,
"learning_rate": 1.1686078788061612e-06,
"loss": 1.1157,
"step": 1820
},
{
"epoch": 4.138846737481032,
"grad_norm": 0.8068000674247742,
"learning_rate": 1.1659855543060405e-06,
"loss": 1.1799,
"step": 1821
},
{
"epoch": 4.141122913505311,
"grad_norm": 0.7692276835441589,
"learning_rate": 1.163365280245615e-06,
"loss": 1.1622,
"step": 1822
},
{
"epoch": 4.1433990895295905,
"grad_norm": 0.8077060580253601,
"learning_rate": 1.1607470606523646e-06,
"loss": 1.1528,
"step": 1823
},
{
"epoch": 4.145675265553869,
"grad_norm": 0.8008897304534912,
"learning_rate": 1.1581308995506088e-06,
"loss": 1.149,
"step": 1824
},
{
"epoch": 4.147951441578149,
"grad_norm": 0.8196450471878052,
"learning_rate": 1.1555168009615039e-06,
"loss": 1.1427,
"step": 1825
},
{
"epoch": 4.150227617602428,
"grad_norm": 0.8355448246002197,
"learning_rate": 1.152904768903036e-06,
"loss": 1.1397,
"step": 1826
},
{
"epoch": 4.152503793626707,
"grad_norm": 0.8095211386680603,
"learning_rate": 1.1502948073900148e-06,
"loss": 1.1932,
"step": 1827
},
{
"epoch": 4.154779969650987,
"grad_norm": 0.8357805609703064,
"learning_rate": 1.1476869204340665e-06,
"loss": 1.1586,
"step": 1828
},
{
"epoch": 4.1570561456752655,
"grad_norm": 0.827434778213501,
"learning_rate": 1.1450811120436319e-06,
"loss": 1.1342,
"step": 1829
},
{
"epoch": 4.159332321699544,
"grad_norm": 0.8159314393997192,
"learning_rate": 1.1424773862239527e-06,
"loss": 1.1705,
"step": 1830
},
{
"epoch": 4.161608497723824,
"grad_norm": 0.8122511506080627,
"learning_rate": 1.1398757469770732e-06,
"loss": 1.1408,
"step": 1831
},
{
"epoch": 4.163884673748103,
"grad_norm": 0.8244626522064209,
"learning_rate": 1.1372761983018283e-06,
"loss": 1.1666,
"step": 1832
},
{
"epoch": 4.166160849772383,
"grad_norm": 0.8183591365814209,
"learning_rate": 1.1346787441938398e-06,
"loss": 1.1652,
"step": 1833
},
{
"epoch": 4.168437025796662,
"grad_norm": 0.8229405283927917,
"learning_rate": 1.132083388645511e-06,
"loss": 1.1964,
"step": 1834
},
{
"epoch": 4.1707132018209405,
"grad_norm": 0.8457160592079163,
"learning_rate": 1.1294901356460192e-06,
"loss": 1.1677,
"step": 1835
},
{
"epoch": 4.17298937784522,
"grad_norm": 0.8162142634391785,
"learning_rate": 1.1268989891813085e-06,
"loss": 1.142,
"step": 1836
},
{
"epoch": 4.175265553869499,
"grad_norm": 0.7983715534210205,
"learning_rate": 1.1243099532340888e-06,
"loss": 1.151,
"step": 1837
},
{
"epoch": 4.177541729893778,
"grad_norm": 0.8044275641441345,
"learning_rate": 1.1217230317838227e-06,
"loss": 1.1799,
"step": 1838
},
{
"epoch": 4.179817905918058,
"grad_norm": 0.8157429695129395,
"learning_rate": 1.1191382288067228e-06,
"loss": 1.1363,
"step": 1839
},
{
"epoch": 4.182094081942337,
"grad_norm": 0.8047868013381958,
"learning_rate": 1.116555548275749e-06,
"loss": 1.1847,
"step": 1840
},
{
"epoch": 4.184370257966616,
"grad_norm": 0.8166713118553162,
"learning_rate": 1.1139749941605949e-06,
"loss": 1.1487,
"step": 1841
},
{
"epoch": 4.186646433990895,
"grad_norm": 0.8015767931938171,
"learning_rate": 1.1113965704276874e-06,
"loss": 1.1356,
"step": 1842
},
{
"epoch": 4.188922610015174,
"grad_norm": 0.7973054051399231,
"learning_rate": 1.1088202810401789e-06,
"loss": 1.1751,
"step": 1843
},
{
"epoch": 4.191198786039454,
"grad_norm": 0.794691264629364,
"learning_rate": 1.1062461299579399e-06,
"loss": 1.1655,
"step": 1844
},
{
"epoch": 4.193474962063733,
"grad_norm": 0.8075692653656006,
"learning_rate": 1.1036741211375577e-06,
"loss": 1.1609,
"step": 1845
},
{
"epoch": 4.195751138088012,
"grad_norm": 0.8212078213691711,
"learning_rate": 1.1011042585323235e-06,
"loss": 1.1359,
"step": 1846
},
{
"epoch": 4.1980273141122915,
"grad_norm": 0.811058759689331,
"learning_rate": 1.0985365460922293e-06,
"loss": 1.1255,
"step": 1847
},
{
"epoch": 4.20030349013657,
"grad_norm": 0.7934993505477905,
"learning_rate": 1.095970987763967e-06,
"loss": 1.1596,
"step": 1848
},
{
"epoch": 4.20257966616085,
"grad_norm": 0.8093920350074768,
"learning_rate": 1.0934075874909103e-06,
"loss": 1.1553,
"step": 1849
},
{
"epoch": 4.204855842185129,
"grad_norm": 0.8409412503242493,
"learning_rate": 1.0908463492131227e-06,
"loss": 1.1393,
"step": 1850
},
{
"epoch": 4.207132018209408,
"grad_norm": 0.8130190968513489,
"learning_rate": 1.0882872768673402e-06,
"loss": 1.1094,
"step": 1851
},
{
"epoch": 4.209408194233688,
"grad_norm": 0.8121690154075623,
"learning_rate": 1.0857303743869707e-06,
"loss": 1.1595,
"step": 1852
},
{
"epoch": 4.2116843702579665,
"grad_norm": 0.806825578212738,
"learning_rate": 1.083175645702089e-06,
"loss": 1.2038,
"step": 1853
},
{
"epoch": 4.213960546282246,
"grad_norm": 0.798464834690094,
"learning_rate": 1.080623094739426e-06,
"loss": 1.1796,
"step": 1854
},
{
"epoch": 4.216236722306525,
"grad_norm": 0.8041526079177856,
"learning_rate": 1.0780727254223666e-06,
"loss": 1.1309,
"step": 1855
},
{
"epoch": 4.218512898330804,
"grad_norm": 0.8249539136886597,
"learning_rate": 1.075524541670942e-06,
"loss": 1.1777,
"step": 1856
},
{
"epoch": 4.220789074355084,
"grad_norm": 0.8255074620246887,
"learning_rate": 1.0729785474018243e-06,
"loss": 1.1509,
"step": 1857
},
{
"epoch": 4.223065250379363,
"grad_norm": 0.828778862953186,
"learning_rate": 1.0704347465283194e-06,
"loss": 1.1356,
"step": 1858
},
{
"epoch": 4.2253414264036415,
"grad_norm": 0.8041991591453552,
"learning_rate": 1.0678931429603641e-06,
"loss": 1.1588,
"step": 1859
},
{
"epoch": 4.227617602427921,
"grad_norm": 0.8260260820388794,
"learning_rate": 1.0653537406045157e-06,
"loss": 1.1428,
"step": 1860
},
{
"epoch": 4.2298937784522,
"grad_norm": 0.8194622993469238,
"learning_rate": 1.0628165433639493e-06,
"loss": 1.1496,
"step": 1861
},
{
"epoch": 4.23216995447648,
"grad_norm": 0.8133224844932556,
"learning_rate": 1.0602815551384502e-06,
"loss": 1.1684,
"step": 1862
},
{
"epoch": 4.234446130500759,
"grad_norm": 0.8095599412918091,
"learning_rate": 1.0577487798244081e-06,
"loss": 1.186,
"step": 1863
},
{
"epoch": 4.236722306525038,
"grad_norm": 0.8215416669845581,
"learning_rate": 1.0552182213148119e-06,
"loss": 1.1679,
"step": 1864
},
{
"epoch": 4.238998482549317,
"grad_norm": 0.8311387896537781,
"learning_rate": 1.0526898834992422e-06,
"loss": 1.1679,
"step": 1865
},
{
"epoch": 4.241274658573596,
"grad_norm": 0.8266900181770325,
"learning_rate": 1.0501637702638666e-06,
"loss": 1.1585,
"step": 1866
},
{
"epoch": 4.243550834597875,
"grad_norm": 0.8136650323867798,
"learning_rate": 1.0476398854914355e-06,
"loss": 1.1466,
"step": 1867
},
{
"epoch": 4.245827010622155,
"grad_norm": 0.8111891150474548,
"learning_rate": 1.0451182330612715e-06,
"loss": 1.1611,
"step": 1868
},
{
"epoch": 4.248103186646434,
"grad_norm": 0.7687380313873291,
"learning_rate": 1.0425988168492659e-06,
"loss": 1.1466,
"step": 1869
},
{
"epoch": 4.250379362670714,
"grad_norm": 0.7977138161659241,
"learning_rate": 1.0400816407278754e-06,
"loss": 1.1897,
"step": 1870
},
{
"epoch": 4.2526555386949925,
"grad_norm": 0.8051496744155884,
"learning_rate": 1.0375667085661115e-06,
"loss": 1.1285,
"step": 1871
},
{
"epoch": 4.254931714719271,
"grad_norm": 0.7781476974487305,
"learning_rate": 1.0350540242295367e-06,
"loss": 1.1211,
"step": 1872
},
{
"epoch": 4.257207890743551,
"grad_norm": 0.838594913482666,
"learning_rate": 1.032543591580259e-06,
"loss": 1.2148,
"step": 1873
},
{
"epoch": 4.25948406676783,
"grad_norm": 0.8080545663833618,
"learning_rate": 1.0300354144769245e-06,
"loss": 1.0911,
"step": 1874
},
{
"epoch": 4.261760242792109,
"grad_norm": 0.7992098331451416,
"learning_rate": 1.027529496774715e-06,
"loss": 1.1572,
"step": 1875
},
{
"epoch": 4.264036418816389,
"grad_norm": 0.7982013821601868,
"learning_rate": 1.0250258423253367e-06,
"loss": 1.1533,
"step": 1876
},
{
"epoch": 4.2663125948406675,
"grad_norm": 0.7892776131629944,
"learning_rate": 1.0225244549770175e-06,
"loss": 1.1394,
"step": 1877
},
{
"epoch": 4.268588770864947,
"grad_norm": 0.8064398169517517,
"learning_rate": 1.020025338574504e-06,
"loss": 1.1703,
"step": 1878
},
{
"epoch": 4.270864946889226,
"grad_norm": 0.8267671465873718,
"learning_rate": 1.0175284969590457e-06,
"loss": 1.1699,
"step": 1879
},
{
"epoch": 4.273141122913505,
"grad_norm": 0.8314613103866577,
"learning_rate": 1.0150339339684026e-06,
"loss": 1.1349,
"step": 1880
},
{
"epoch": 4.275417298937785,
"grad_norm": 0.7835724353790283,
"learning_rate": 1.0125416534368279e-06,
"loss": 1.1296,
"step": 1881
},
{
"epoch": 4.277693474962064,
"grad_norm": 0.826766312122345,
"learning_rate": 1.0100516591950676e-06,
"loss": 1.1924,
"step": 1882
},
{
"epoch": 4.279969650986343,
"grad_norm": 0.8165386915206909,
"learning_rate": 1.0075639550703553e-06,
"loss": 1.124,
"step": 1883
},
{
"epoch": 4.282245827010622,
"grad_norm": 0.807698130607605,
"learning_rate": 1.0050785448864022e-06,
"loss": 1.1557,
"step": 1884
},
{
"epoch": 4.284522003034901,
"grad_norm": 0.8152635097503662,
"learning_rate": 1.0025954324633949e-06,
"loss": 1.1825,
"step": 1885
},
{
"epoch": 4.286798179059181,
"grad_norm": 0.8133754134178162,
"learning_rate": 1.000114621617988e-06,
"loss": 1.147,
"step": 1886
},
{
"epoch": 4.28907435508346,
"grad_norm": 0.803455650806427,
"learning_rate": 9.976361161632977e-07,
"loss": 1.2258,
"step": 1887
},
{
"epoch": 4.291350531107739,
"grad_norm": 0.8496401309967041,
"learning_rate": 9.951599199088977e-07,
"loss": 1.1188,
"step": 1888
},
{
"epoch": 4.293626707132018,
"grad_norm": 0.8170112371444702,
"learning_rate": 9.926860366608128e-07,
"loss": 1.162,
"step": 1889
},
{
"epoch": 4.295902883156297,
"grad_norm": 0.8310915231704712,
"learning_rate": 9.902144702215102e-07,
"loss": 1.1522,
"step": 1890
},
{
"epoch": 4.298179059180576,
"grad_norm": 0.7971112728118896,
"learning_rate": 9.877452243899003e-07,
"loss": 1.1522,
"step": 1891
},
{
"epoch": 4.300455235204856,
"grad_norm": 0.8279590010643005,
"learning_rate": 9.852783029613224e-07,
"loss": 1.1391,
"step": 1892
},
{
"epoch": 4.302731411229135,
"grad_norm": 0.842805802822113,
"learning_rate": 9.828137097275454e-07,
"loss": 1.1617,
"step": 1893
},
{
"epoch": 4.305007587253415,
"grad_norm": 0.8263971209526062,
"learning_rate": 9.803514484767582e-07,
"loss": 1.1846,
"step": 1894
},
{
"epoch": 4.3072837632776935,
"grad_norm": 0.8175419569015503,
"learning_rate": 9.77891522993567e-07,
"loss": 1.1615,
"step": 1895
},
{
"epoch": 4.309559939301972,
"grad_norm": 0.8182225823402405,
"learning_rate": 9.754339370589854e-07,
"loss": 1.1318,
"step": 1896
},
{
"epoch": 4.311836115326252,
"grad_norm": 0.7963380217552185,
"learning_rate": 9.72978694450435e-07,
"loss": 1.1567,
"step": 1897
},
{
"epoch": 4.314112291350531,
"grad_norm": 0.8066350817680359,
"learning_rate": 9.705257989417315e-07,
"loss": 1.1575,
"step": 1898
},
{
"epoch": 4.316388467374811,
"grad_norm": 0.8392632603645325,
"learning_rate": 9.680752543030844e-07,
"loss": 1.1697,
"step": 1899
},
{
"epoch": 4.31866464339909,
"grad_norm": 0.8114210367202759,
"learning_rate": 9.656270643010917e-07,
"loss": 1.1911,
"step": 1900
},
{
"epoch": 4.3209408194233685,
"grad_norm": 0.805618166923523,
"learning_rate": 9.6318123269873e-07,
"loss": 1.1543,
"step": 1901
},
{
"epoch": 4.323216995447648,
"grad_norm": 0.7983014583587646,
"learning_rate": 9.60737763255351e-07,
"loss": 1.1322,
"step": 1902
},
{
"epoch": 4.325493171471927,
"grad_norm": 0.8233836889266968,
"learning_rate": 9.582966597266768e-07,
"loss": 1.1761,
"step": 1903
},
{
"epoch": 4.327769347496206,
"grad_norm": 0.8023352026939392,
"learning_rate": 9.55857925864791e-07,
"loss": 1.1663,
"step": 1904
},
{
"epoch": 4.330045523520486,
"grad_norm": 0.8328466415405273,
"learning_rate": 9.534215654181384e-07,
"loss": 1.1736,
"step": 1905
},
{
"epoch": 4.332321699544765,
"grad_norm": 0.809187650680542,
"learning_rate": 9.509875821315126e-07,
"loss": 1.1839,
"step": 1906
},
{
"epoch": 4.334597875569044,
"grad_norm": 0.7974687218666077,
"learning_rate": 9.485559797460544e-07,
"loss": 1.1693,
"step": 1907
},
{
"epoch": 4.336874051593323,
"grad_norm": 0.8204778432846069,
"learning_rate": 9.461267619992453e-07,
"loss": 1.1418,
"step": 1908
},
{
"epoch": 4.339150227617602,
"grad_norm": 0.8286815285682678,
"learning_rate": 9.436999326249013e-07,
"loss": 1.1902,
"step": 1909
},
{
"epoch": 4.341426403641882,
"grad_norm": 0.824350893497467,
"learning_rate": 9.412754953531664e-07,
"loss": 1.1553,
"step": 1910
},
{
"epoch": 4.343702579666161,
"grad_norm": 0.7882816791534424,
"learning_rate": 9.388534539105107e-07,
"loss": 1.1776,
"step": 1911
},
{
"epoch": 4.34597875569044,
"grad_norm": 0.7909602522850037,
"learning_rate": 9.364338120197181e-07,
"loss": 1.1548,
"step": 1912
},
{
"epoch": 4.348254931714719,
"grad_norm": 0.8286248445510864,
"learning_rate": 9.340165733998877e-07,
"loss": 1.1333,
"step": 1913
},
{
"epoch": 4.350531107738998,
"grad_norm": 0.8236697912216187,
"learning_rate": 9.316017417664222e-07,
"loss": 1.1992,
"step": 1914
},
{
"epoch": 4.352807283763278,
"grad_norm": 0.8361827731132507,
"learning_rate": 9.291893208310257e-07,
"loss": 1.119,
"step": 1915
},
{
"epoch": 4.355083459787557,
"grad_norm": 0.8130219578742981,
"learning_rate": 9.267793143016967e-07,
"loss": 1.1503,
"step": 1916
},
{
"epoch": 4.357359635811836,
"grad_norm": 0.8324246406555176,
"learning_rate": 9.243717258827228e-07,
"loss": 1.1888,
"step": 1917
},
{
"epoch": 4.359635811836116,
"grad_norm": 0.8405057787895203,
"learning_rate": 9.219665592746738e-07,
"loss": 1.1459,
"step": 1918
},
{
"epoch": 4.3619119878603945,
"grad_norm": 0.8185500502586365,
"learning_rate": 9.195638181743996e-07,
"loss": 1.1404,
"step": 1919
},
{
"epoch": 4.364188163884673,
"grad_norm": 0.8271884918212891,
"learning_rate": 9.171635062750189e-07,
"loss": 1.1827,
"step": 1920
},
{
"epoch": 4.366464339908953,
"grad_norm": 0.8363154530525208,
"learning_rate": 9.147656272659197e-07,
"loss": 1.1247,
"step": 1921
},
{
"epoch": 4.368740515933232,
"grad_norm": 0.8215076923370361,
"learning_rate": 9.123701848327485e-07,
"loss": 1.1362,
"step": 1922
},
{
"epoch": 4.371016691957512,
"grad_norm": 0.7907038331031799,
"learning_rate": 9.099771826574069e-07,
"loss": 1.1445,
"step": 1923
},
{
"epoch": 4.373292867981791,
"grad_norm": 0.8025128841400146,
"learning_rate": 9.075866244180459e-07,
"loss": 1.108,
"step": 1924
},
{
"epoch": 4.3755690440060695,
"grad_norm": 0.8598397374153137,
"learning_rate": 9.051985137890601e-07,
"loss": 1.1311,
"step": 1925
},
{
"epoch": 4.377845220030349,
"grad_norm": 0.8464901447296143,
"learning_rate": 9.028128544410814e-07,
"loss": 1.134,
"step": 1926
},
{
"epoch": 4.380121396054628,
"grad_norm": 0.8181291222572327,
"learning_rate": 9.004296500409759e-07,
"loss": 1.1774,
"step": 1927
},
{
"epoch": 4.382397572078908,
"grad_norm": 0.7884581685066223,
"learning_rate": 8.980489042518348e-07,
"loss": 1.1325,
"step": 1928
},
{
"epoch": 4.384673748103187,
"grad_norm": 0.79710453748703,
"learning_rate": 8.956706207329694e-07,
"loss": 1.1751,
"step": 1929
},
{
"epoch": 4.386949924127466,
"grad_norm": 0.8176055550575256,
"learning_rate": 8.932948031399099e-07,
"loss": 1.1749,
"step": 1930
},
{
"epoch": 4.389226100151745,
"grad_norm": 0.8096023797988892,
"learning_rate": 8.909214551243908e-07,
"loss": 1.1616,
"step": 1931
},
{
"epoch": 4.391502276176024,
"grad_norm": 0.8391863107681274,
"learning_rate": 8.885505803343561e-07,
"loss": 1.1719,
"step": 1932
},
{
"epoch": 4.393778452200303,
"grad_norm": 0.8197864294052124,
"learning_rate": 8.861821824139455e-07,
"loss": 1.1678,
"step": 1933
},
{
"epoch": 4.396054628224583,
"grad_norm": 0.8241091370582581,
"learning_rate": 8.838162650034912e-07,
"loss": 1.1282,
"step": 1934
},
{
"epoch": 4.398330804248862,
"grad_norm": 0.7828463315963745,
"learning_rate": 8.814528317395155e-07,
"loss": 1.1597,
"step": 1935
},
{
"epoch": 4.400606980273141,
"grad_norm": 0.8351078629493713,
"learning_rate": 8.790918862547201e-07,
"loss": 1.1592,
"step": 1936
},
{
"epoch": 4.40288315629742,
"grad_norm": 0.8341789841651917,
"learning_rate": 8.767334321779831e-07,
"loss": 1.1755,
"step": 1937
},
{
"epoch": 4.405159332321699,
"grad_norm": 0.8040433526039124,
"learning_rate": 8.743774731343541e-07,
"loss": 1.1688,
"step": 1938
},
{
"epoch": 4.407435508345979,
"grad_norm": 0.8132420778274536,
"learning_rate": 8.720240127450466e-07,
"loss": 1.1287,
"step": 1939
},
{
"epoch": 4.409711684370258,
"grad_norm": 0.8121916055679321,
"learning_rate": 8.69673054627434e-07,
"loss": 1.1637,
"step": 1940
},
{
"epoch": 4.411987860394537,
"grad_norm": 0.8064197301864624,
"learning_rate": 8.673246023950449e-07,
"loss": 1.1867,
"step": 1941
},
{
"epoch": 4.414264036418817,
"grad_norm": 0.8088646531105042,
"learning_rate": 8.649786596575538e-07,
"loss": 1.1627,
"step": 1942
},
{
"epoch": 4.4165402124430955,
"grad_norm": 0.8088123202323914,
"learning_rate": 8.626352300207808e-07,
"loss": 1.155,
"step": 1943
},
{
"epoch": 4.418816388467375,
"grad_norm": 0.8343124389648438,
"learning_rate": 8.602943170866809e-07,
"loss": 1.1284,
"step": 1944
},
{
"epoch": 4.421092564491654,
"grad_norm": 0.8085336089134216,
"learning_rate": 8.579559244533416e-07,
"loss": 1.1536,
"step": 1945
},
{
"epoch": 4.423368740515933,
"grad_norm": 0.8021194338798523,
"learning_rate": 8.556200557149771e-07,
"loss": 1.1656,
"step": 1946
},
{
"epoch": 4.425644916540213,
"grad_norm": 0.8365726470947266,
"learning_rate": 8.532867144619217e-07,
"loss": 1.1704,
"step": 1947
},
{
"epoch": 4.427921092564492,
"grad_norm": 0.8232294321060181,
"learning_rate": 8.509559042806237e-07,
"loss": 1.1315,
"step": 1948
},
{
"epoch": 4.4301972685887705,
"grad_norm": 0.83378005027771,
"learning_rate": 8.486276287536444e-07,
"loss": 1.1661,
"step": 1949
},
{
"epoch": 4.43247344461305,
"grad_norm": 0.7996213436126709,
"learning_rate": 8.463018914596449e-07,
"loss": 1.16,
"step": 1950
},
{
"epoch": 4.434749620637329,
"grad_norm": 0.8141563534736633,
"learning_rate": 8.439786959733895e-07,
"loss": 1.1678,
"step": 1951
},
{
"epoch": 4.437025796661609,
"grad_norm": 0.8117381930351257,
"learning_rate": 8.416580458657322e-07,
"loss": 1.1405,
"step": 1952
},
{
"epoch": 4.439301972685888,
"grad_norm": 0.8416996598243713,
"learning_rate": 8.393399447036155e-07,
"loss": 1.1442,
"step": 1953
},
{
"epoch": 4.441578148710167,
"grad_norm": 0.8117838501930237,
"learning_rate": 8.370243960500646e-07,
"loss": 1.1743,
"step": 1954
},
{
"epoch": 4.443854324734446,
"grad_norm": 0.8272352814674377,
"learning_rate": 8.347114034641807e-07,
"loss": 1.1774,
"step": 1955
},
{
"epoch": 4.446130500758725,
"grad_norm": 0.8017353415489197,
"learning_rate": 8.324009705011357e-07,
"loss": 1.1253,
"step": 1956
},
{
"epoch": 4.448406676783004,
"grad_norm": 0.8281505107879639,
"learning_rate": 8.300931007121701e-07,
"loss": 1.1341,
"step": 1957
},
{
"epoch": 4.450682852807284,
"grad_norm": 0.7994737029075623,
"learning_rate": 8.277877976445819e-07,
"loss": 1.1655,
"step": 1958
},
{
"epoch": 4.452959028831563,
"grad_norm": 0.8165961503982544,
"learning_rate": 8.254850648417234e-07,
"loss": 1.147,
"step": 1959
},
{
"epoch": 4.455235204855843,
"grad_norm": 0.8197671175003052,
"learning_rate": 8.231849058430005e-07,
"loss": 1.1588,
"step": 1960
},
{
"epoch": 4.457511380880121,
"grad_norm": 0.8069389462471008,
"learning_rate": 8.208873241838569e-07,
"loss": 1.1662,
"step": 1961
},
{
"epoch": 4.4597875569044,
"grad_norm": 0.801243782043457,
"learning_rate": 8.185923233957802e-07,
"loss": 1.1433,
"step": 1962
},
{
"epoch": 4.46206373292868,
"grad_norm": 0.8067401051521301,
"learning_rate": 8.162999070062885e-07,
"loss": 1.1697,
"step": 1963
},
{
"epoch": 4.464339908952959,
"grad_norm": 0.8027750849723816,
"learning_rate": 8.140100785389271e-07,
"loss": 1.1336,
"step": 1964
},
{
"epoch": 4.466616084977238,
"grad_norm": 0.8215749859809875,
"learning_rate": 8.117228415132658e-07,
"loss": 1.1875,
"step": 1965
},
{
"epoch": 4.468892261001518,
"grad_norm": 0.8175052404403687,
"learning_rate": 8.094381994448897e-07,
"loss": 1.1357,
"step": 1966
},
{
"epoch": 4.4711684370257965,
"grad_norm": 0.8070038557052612,
"learning_rate": 8.07156155845395e-07,
"loss": 1.1345,
"step": 1967
},
{
"epoch": 4.473444613050076,
"grad_norm": 0.8089066743850708,
"learning_rate": 8.048767142223845e-07,
"loss": 1.1524,
"step": 1968
},
{
"epoch": 4.475720789074355,
"grad_norm": 0.817398190498352,
"learning_rate": 8.025998780794622e-07,
"loss": 1.1694,
"step": 1969
},
{
"epoch": 4.477996965098634,
"grad_norm": 0.8068968653678894,
"learning_rate": 8.003256509162252e-07,
"loss": 1.1189,
"step": 1970
},
{
"epoch": 4.480273141122914,
"grad_norm": 0.8235507607460022,
"learning_rate": 7.980540362282643e-07,
"loss": 1.193,
"step": 1971
},
{
"epoch": 4.482549317147193,
"grad_norm": 0.823232114315033,
"learning_rate": 7.95785037507151e-07,
"loss": 1.1441,
"step": 1972
},
{
"epoch": 4.484825493171472,
"grad_norm": 0.7972595691680908,
"learning_rate": 7.935186582404386e-07,
"loss": 1.1691,
"step": 1973
},
{
"epoch": 4.487101669195751,
"grad_norm": 0.7991148233413696,
"learning_rate": 7.912549019116528e-07,
"loss": 1.1403,
"step": 1974
},
{
"epoch": 4.48937784522003,
"grad_norm": 0.8262163400650024,
"learning_rate": 7.889937720002874e-07,
"loss": 1.1792,
"step": 1975
},
{
"epoch": 4.49165402124431,
"grad_norm": 0.8145712614059448,
"learning_rate": 7.867352719818008e-07,
"loss": 1.1707,
"step": 1976
},
{
"epoch": 4.493930197268589,
"grad_norm": 0.8145231008529663,
"learning_rate": 7.844794053276076e-07,
"loss": 1.1955,
"step": 1977
},
{
"epoch": 4.496206373292868,
"grad_norm": 0.8229146599769592,
"learning_rate": 7.82226175505075e-07,
"loss": 1.128,
"step": 1978
},
{
"epoch": 4.498482549317147,
"grad_norm": 0.8210102319717407,
"learning_rate": 7.79975585977519e-07,
"loss": 1.1757,
"step": 1979
},
{
"epoch": 4.500758725341426,
"grad_norm": 0.7983988523483276,
"learning_rate": 7.777276402041956e-07,
"loss": 1.1589,
"step": 1980
},
{
"epoch": 4.503034901365705,
"grad_norm": 0.8079198002815247,
"learning_rate": 7.754823416402965e-07,
"loss": 1.1685,
"step": 1981
},
{
"epoch": 4.505311077389985,
"grad_norm": 0.831045925617218,
"learning_rate": 7.732396937369479e-07,
"loss": 1.1415,
"step": 1982
},
{
"epoch": 4.507587253414264,
"grad_norm": 0.7844316363334656,
"learning_rate": 7.709996999411984e-07,
"loss": 1.1581,
"step": 1983
},
{
"epoch": 4.509863429438544,
"grad_norm": 0.8027727007865906,
"learning_rate": 7.687623636960184e-07,
"loss": 1.1683,
"step": 1984
},
{
"epoch": 4.5121396054628224,
"grad_norm": 0.8118910789489746,
"learning_rate": 7.665276884402936e-07,
"loss": 1.1924,
"step": 1985
},
{
"epoch": 4.514415781487101,
"grad_norm": 0.8252329230308533,
"learning_rate": 7.642956776088187e-07,
"loss": 1.1428,
"step": 1986
},
{
"epoch": 4.516691957511381,
"grad_norm": 0.8044348359107971,
"learning_rate": 7.620663346322956e-07,
"loss": 1.1572,
"step": 1987
},
{
"epoch": 4.51896813353566,
"grad_norm": 0.8026670813560486,
"learning_rate": 7.598396629373228e-07,
"loss": 1.1693,
"step": 1988
},
{
"epoch": 4.52124430955994,
"grad_norm": 0.7939416170120239,
"learning_rate": 7.576156659463943e-07,
"loss": 1.1549,
"step": 1989
},
{
"epoch": 4.523520485584219,
"grad_norm": 0.779471218585968,
"learning_rate": 7.553943470778927e-07,
"loss": 1.117,
"step": 1990
},
{
"epoch": 4.5257966616084975,
"grad_norm": 0.8062757849693298,
"learning_rate": 7.531757097460828e-07,
"loss": 1.2131,
"step": 1991
},
{
"epoch": 4.528072837632777,
"grad_norm": 0.8512879610061646,
"learning_rate": 7.509597573611113e-07,
"loss": 1.1859,
"step": 1992
},
{
"epoch": 4.530349013657056,
"grad_norm": 0.8231443762779236,
"learning_rate": 7.487464933289948e-07,
"loss": 1.1504,
"step": 1993
},
{
"epoch": 4.532625189681335,
"grad_norm": 0.8112626075744629,
"learning_rate": 7.465359210516182e-07,
"loss": 1.1982,
"step": 1994
},
{
"epoch": 4.534901365705615,
"grad_norm": 0.8114172220230103,
"learning_rate": 7.443280439267311e-07,
"loss": 1.1768,
"step": 1995
},
{
"epoch": 4.537177541729894,
"grad_norm": 0.809261679649353,
"learning_rate": 7.421228653479385e-07,
"loss": 1.1516,
"step": 1996
},
{
"epoch": 4.5394537177541725,
"grad_norm": 0.8252702355384827,
"learning_rate": 7.399203887046977e-07,
"loss": 1.1944,
"step": 1997
},
{
"epoch": 4.541729893778452,
"grad_norm": 0.8462184071540833,
"learning_rate": 7.377206173823142e-07,
"loss": 1.179,
"step": 1998
},
{
"epoch": 4.544006069802731,
"grad_norm": 0.8015170097351074,
"learning_rate": 7.355235547619341e-07,
"loss": 1.1374,
"step": 1999
},
{
"epoch": 4.546282245827011,
"grad_norm": 0.8240376710891724,
"learning_rate": 7.333292042205404e-07,
"loss": 1.1751,
"step": 2000
},
{
"epoch": 4.54855842185129,
"grad_norm": 0.8022201061248779,
"learning_rate": 7.311375691309488e-07,
"loss": 1.1666,
"step": 2001
},
{
"epoch": 4.5508345978755695,
"grad_norm": 0.8332545161247253,
"learning_rate": 7.289486528617986e-07,
"loss": 1.1528,
"step": 2002
},
{
"epoch": 4.553110773899848,
"grad_norm": 0.8448001742362976,
"learning_rate": 7.267624587775537e-07,
"loss": 1.1488,
"step": 2003
},
{
"epoch": 4.555386949924127,
"grad_norm": 0.8457192182540894,
"learning_rate": 7.245789902384908e-07,
"loss": 1.1254,
"step": 2004
},
{
"epoch": 4.557663125948407,
"grad_norm": 0.8168480396270752,
"learning_rate": 7.223982506006988e-07,
"loss": 1.1782,
"step": 2005
},
{
"epoch": 4.559939301972686,
"grad_norm": 0.8271265029907227,
"learning_rate": 7.202202432160713e-07,
"loss": 1.168,
"step": 2006
},
{
"epoch": 4.562215477996965,
"grad_norm": 0.8210901618003845,
"learning_rate": 7.180449714323032e-07,
"loss": 1.168,
"step": 2007
},
{
"epoch": 4.564491654021245,
"grad_norm": 0.83175128698349,
"learning_rate": 7.158724385928828e-07,
"loss": 1.1645,
"step": 2008
},
{
"epoch": 4.5667678300455234,
"grad_norm": 0.7995832562446594,
"learning_rate": 7.137026480370923e-07,
"loss": 1.1604,
"step": 2009
},
{
"epoch": 4.569044006069802,
"grad_norm": 0.7985032796859741,
"learning_rate": 7.115356030999954e-07,
"loss": 1.1848,
"step": 2010
},
{
"epoch": 4.571320182094082,
"grad_norm": 0.8107689619064331,
"learning_rate": 7.093713071124361e-07,
"loss": 1.1495,
"step": 2011
},
{
"epoch": 4.573596358118361,
"grad_norm": 0.8028638958930969,
"learning_rate": 7.072097634010353e-07,
"loss": 1.1259,
"step": 2012
},
{
"epoch": 4.575872534142641,
"grad_norm": 0.8166947960853577,
"learning_rate": 7.050509752881815e-07,
"loss": 1.1866,
"step": 2013
},
{
"epoch": 4.57814871016692,
"grad_norm": 0.8298448920249939,
"learning_rate": 7.028949460920282e-07,
"loss": 1.1801,
"step": 2014
},
{
"epoch": 4.5804248861911985,
"grad_norm": 0.7964473366737366,
"learning_rate": 7.007416791264882e-07,
"loss": 1.1624,
"step": 2015
},
{
"epoch": 4.582701062215478,
"grad_norm": 0.805364727973938,
"learning_rate": 6.985911777012286e-07,
"loss": 1.1575,
"step": 2016
},
{
"epoch": 4.584977238239757,
"grad_norm": 0.8281643986701965,
"learning_rate": 6.96443445121667e-07,
"loss": 1.1739,
"step": 2017
},
{
"epoch": 4.587253414264037,
"grad_norm": 0.78781658411026,
"learning_rate": 6.942984846889639e-07,
"loss": 1.1249,
"step": 2018
},
{
"epoch": 4.589529590288316,
"grad_norm": 0.841881275177002,
"learning_rate": 6.921562997000186e-07,
"loss": 1.1361,
"step": 2019
},
{
"epoch": 4.591805766312595,
"grad_norm": 0.8270856738090515,
"learning_rate": 6.900168934474655e-07,
"loss": 1.1668,
"step": 2020
},
{
"epoch": 4.594081942336874,
"grad_norm": 0.8418042659759521,
"learning_rate": 6.878802692196663e-07,
"loss": 1.1731,
"step": 2021
},
{
"epoch": 4.596358118361153,
"grad_norm": 0.8001435995101929,
"learning_rate": 6.857464303007091e-07,
"loss": 1.1323,
"step": 2022
},
{
"epoch": 4.598634294385432,
"grad_norm": 0.8071247339248657,
"learning_rate": 6.836153799703993e-07,
"loss": 1.1322,
"step": 2023
},
{
"epoch": 4.600910470409712,
"grad_norm": 0.8100282549858093,
"learning_rate": 6.814871215042552e-07,
"loss": 1.1978,
"step": 2024
},
{
"epoch": 4.603186646433991,
"grad_norm": 0.8448064923286438,
"learning_rate": 6.793616581735063e-07,
"loss": 1.1592,
"step": 2025
},
{
"epoch": 4.60546282245827,
"grad_norm": 0.8261284232139587,
"learning_rate": 6.772389932450841e-07,
"loss": 1.1742,
"step": 2026
},
{
"epoch": 4.607738998482549,
"grad_norm": 0.8271704316139221,
"learning_rate": 6.751191299816192e-07,
"loss": 1.1841,
"step": 2027
},
{
"epoch": 4.610015174506828,
"grad_norm": 0.8234429955482483,
"learning_rate": 6.730020716414357e-07,
"loss": 1.1664,
"step": 2028
},
{
"epoch": 4.612291350531108,
"grad_norm": 0.8064398169517517,
"learning_rate": 6.708878214785472e-07,
"loss": 1.1777,
"step": 2029
},
{
"epoch": 4.614567526555387,
"grad_norm": 0.8364176154136658,
"learning_rate": 6.687763827426491e-07,
"loss": 1.1359,
"step": 2030
},
{
"epoch": 4.616843702579666,
"grad_norm": 0.8281029462814331,
"learning_rate": 6.66667758679119e-07,
"loss": 1.126,
"step": 2031
},
{
"epoch": 4.619119878603946,
"grad_norm": 0.8018792867660522,
"learning_rate": 6.645619525290043e-07,
"loss": 1.1828,
"step": 2032
},
{
"epoch": 4.6213960546282244,
"grad_norm": 0.8104063272476196,
"learning_rate": 6.624589675290244e-07,
"loss": 1.1446,
"step": 2033
},
{
"epoch": 4.623672230652504,
"grad_norm": 0.8424516916275024,
"learning_rate": 6.603588069115605e-07,
"loss": 1.1931,
"step": 2034
},
{
"epoch": 4.625948406676783,
"grad_norm": 0.8288046717643738,
"learning_rate": 6.582614739046528e-07,
"loss": 1.1313,
"step": 2035
},
{
"epoch": 4.628224582701062,
"grad_norm": 0.8373975157737732,
"learning_rate": 6.561669717319962e-07,
"loss": 1.1495,
"step": 2036
},
{
"epoch": 4.630500758725342,
"grad_norm": 0.8243685364723206,
"learning_rate": 6.540753036129336e-07,
"loss": 1.1612,
"step": 2037
},
{
"epoch": 4.632776934749621,
"grad_norm": 0.8179749846458435,
"learning_rate": 6.519864727624514e-07,
"loss": 1.1444,
"step": 2038
},
{
"epoch": 4.6350531107738995,
"grad_norm": 0.8195221424102783,
"learning_rate": 6.499004823911772e-07,
"loss": 1.169,
"step": 2039
},
{
"epoch": 4.637329286798179,
"grad_norm": 0.8341556787490845,
"learning_rate": 6.47817335705371e-07,
"loss": 1.144,
"step": 2040
},
{
"epoch": 4.639605462822458,
"grad_norm": 0.8077003359794617,
"learning_rate": 6.457370359069209e-07,
"loss": 1.2334,
"step": 2041
},
{
"epoch": 4.641881638846737,
"grad_norm": 0.8267129063606262,
"learning_rate": 6.436595861933428e-07,
"loss": 1.19,
"step": 2042
},
{
"epoch": 4.644157814871017,
"grad_norm": 0.8191685676574707,
"learning_rate": 6.415849897577667e-07,
"loss": 1.1607,
"step": 2043
},
{
"epoch": 4.646433990895296,
"grad_norm": 0.8216390013694763,
"learning_rate": 6.39513249788942e-07,
"loss": 1.1454,
"step": 2044
},
{
"epoch": 4.648710166919575,
"grad_norm": 0.8405131101608276,
"learning_rate": 6.374443694712246e-07,
"loss": 1.1722,
"step": 2045
},
{
"epoch": 4.650986342943854,
"grad_norm": 0.8507837057113647,
"learning_rate": 6.353783519845752e-07,
"loss": 1.1185,
"step": 2046
},
{
"epoch": 4.653262518968134,
"grad_norm": 0.8254572153091431,
"learning_rate": 6.333152005045562e-07,
"loss": 1.1494,
"step": 2047
},
{
"epoch": 4.655538694992413,
"grad_norm": 0.8171166181564331,
"learning_rate": 6.312549182023229e-07,
"loss": 1.1168,
"step": 2048
},
{
"epoch": 4.657814871016692,
"grad_norm": 0.8100786805152893,
"learning_rate": 6.291975082446206e-07,
"loss": 1.1426,
"step": 2049
},
{
"epoch": 4.6600910470409715,
"grad_norm": 0.8109079003334045,
"learning_rate": 6.271429737937806e-07,
"loss": 1.1661,
"step": 2050
},
{
"epoch": 4.66236722306525,
"grad_norm": 0.7933968305587769,
"learning_rate": 6.250913180077139e-07,
"loss": 1.1421,
"step": 2051
},
{
"epoch": 4.664643399089529,
"grad_norm": 0.8203511834144592,
"learning_rate": 6.230425440399065e-07,
"loss": 1.1702,
"step": 2052
},
{
"epoch": 4.666919575113809,
"grad_norm": 0.8336530923843384,
"learning_rate": 6.209966550394162e-07,
"loss": 1.1474,
"step": 2053
},
{
"epoch": 4.669195751138088,
"grad_norm": 0.8284228444099426,
"learning_rate": 6.189536541508645e-07,
"loss": 1.1556,
"step": 2054
},
{
"epoch": 4.671471927162367,
"grad_norm": 0.8003067374229431,
"learning_rate": 6.169135445144364e-07,
"loss": 1.1865,
"step": 2055
},
{
"epoch": 4.673748103186647,
"grad_norm": 0.8248746991157532,
"learning_rate": 6.148763292658704e-07,
"loss": 1.2081,
"step": 2056
},
{
"epoch": 4.6760242792109254,
"grad_norm": 0.8294516205787659,
"learning_rate": 6.128420115364575e-07,
"loss": 1.1637,
"step": 2057
},
{
"epoch": 4.678300455235205,
"grad_norm": 0.8218663334846497,
"learning_rate": 6.108105944530346e-07,
"loss": 1.1297,
"step": 2058
},
{
"epoch": 4.680576631259484,
"grad_norm": 0.8338181376457214,
"learning_rate": 6.087820811379802e-07,
"loss": 1.1064,
"step": 2059
},
{
"epoch": 4.682852807283763,
"grad_norm": 0.821843147277832,
"learning_rate": 6.067564747092095e-07,
"loss": 1.1483,
"step": 2060
},
{
"epoch": 4.685128983308043,
"grad_norm": 0.8181595802307129,
"learning_rate": 6.04733778280171e-07,
"loss": 1.1717,
"step": 2061
},
{
"epoch": 4.687405159332322,
"grad_norm": 0.7944086194038391,
"learning_rate": 6.02713994959838e-07,
"loss": 1.1617,
"step": 2062
},
{
"epoch": 4.689681335356601,
"grad_norm": 0.8191626667976379,
"learning_rate": 6.006971278527085e-07,
"loss": 1.1005,
"step": 2063
},
{
"epoch": 4.69195751138088,
"grad_norm": 0.8066505789756775,
"learning_rate": 5.986831800587972e-07,
"loss": 1.1382,
"step": 2064
},
{
"epoch": 4.694233687405159,
"grad_norm": 0.831630527973175,
"learning_rate": 5.966721546736315e-07,
"loss": 1.1558,
"step": 2065
},
{
"epoch": 4.696509863429439,
"grad_norm": 0.8153077960014343,
"learning_rate": 5.946640547882468e-07,
"loss": 1.1598,
"step": 2066
},
{
"epoch": 4.698786039453718,
"grad_norm": 0.8225074410438538,
"learning_rate": 5.926588834891823e-07,
"loss": 1.1484,
"step": 2067
},
{
"epoch": 4.701062215477997,
"grad_norm": 0.818878173828125,
"learning_rate": 5.906566438584752e-07,
"loss": 1.1618,
"step": 2068
},
{
"epoch": 4.703338391502276,
"grad_norm": 0.8222529888153076,
"learning_rate": 5.88657338973658e-07,
"loss": 1.1248,
"step": 2069
},
{
"epoch": 4.705614567526555,
"grad_norm": 0.8568587303161621,
"learning_rate": 5.866609719077515e-07,
"loss": 1.1797,
"step": 2070
},
{
"epoch": 4.707890743550834,
"grad_norm": 0.8610350489616394,
"learning_rate": 5.846675457292597e-07,
"loss": 1.1441,
"step": 2071
},
{
"epoch": 4.710166919575114,
"grad_norm": 0.8169013857841492,
"learning_rate": 5.826770635021697e-07,
"loss": 1.1417,
"step": 2072
},
{
"epoch": 4.712443095599393,
"grad_norm": 0.8121698498725891,
"learning_rate": 5.80689528285939e-07,
"loss": 1.2031,
"step": 2073
},
{
"epoch": 4.7147192716236725,
"grad_norm": 0.8544483780860901,
"learning_rate": 5.787049431354996e-07,
"loss": 1.15,
"step": 2074
},
{
"epoch": 4.716995447647951,
"grad_norm": 0.8276944756507874,
"learning_rate": 5.767233111012466e-07,
"loss": 1.1589,
"step": 2075
},
{
"epoch": 4.71927162367223,
"grad_norm": 0.8428331017494202,
"learning_rate": 5.747446352290364e-07,
"loss": 1.1786,
"step": 2076
},
{
"epoch": 4.72154779969651,
"grad_norm": 0.8325369954109192,
"learning_rate": 5.727689185601834e-07,
"loss": 1.1864,
"step": 2077
},
{
"epoch": 4.723823975720789,
"grad_norm": 0.8243165016174316,
"learning_rate": 5.707961641314516e-07,
"loss": 1.1329,
"step": 2078
},
{
"epoch": 4.726100151745069,
"grad_norm": 0.8249309659004211,
"learning_rate": 5.688263749750523e-07,
"loss": 1.1502,
"step": 2079
},
{
"epoch": 4.728376327769348,
"grad_norm": 0.8111382126808167,
"learning_rate": 5.668595541186395e-07,
"loss": 1.1593,
"step": 2080
},
{
"epoch": 4.7306525037936265,
"grad_norm": 0.8352332711219788,
"learning_rate": 5.648957045853043e-07,
"loss": 1.1917,
"step": 2081
},
{
"epoch": 4.732928679817906,
"grad_norm": 0.8187192678451538,
"learning_rate": 5.629348293935704e-07,
"loss": 1.1352,
"step": 2082
},
{
"epoch": 4.735204855842185,
"grad_norm": 0.8193315267562866,
"learning_rate": 5.609769315573921e-07,
"loss": 1.1494,
"step": 2083
},
{
"epoch": 4.737481031866464,
"grad_norm": 0.8024821281433105,
"learning_rate": 5.590220140861441e-07,
"loss": 1.1278,
"step": 2084
},
{
"epoch": 4.739757207890744,
"grad_norm": 0.8180125951766968,
"learning_rate": 5.570700799846232e-07,
"loss": 1.1684,
"step": 2085
},
{
"epoch": 4.742033383915023,
"grad_norm": 0.8202872276306152,
"learning_rate": 5.551211322530381e-07,
"loss": 1.1732,
"step": 2086
},
{
"epoch": 4.7443095599393015,
"grad_norm": 0.8189080357551575,
"learning_rate": 5.531751738870089e-07,
"loss": 1.1718,
"step": 2087
},
{
"epoch": 4.746585735963581,
"grad_norm": 0.8162835836410522,
"learning_rate": 5.512322078775603e-07,
"loss": 1.1112,
"step": 2088
},
{
"epoch": 4.74886191198786,
"grad_norm": 0.824478268623352,
"learning_rate": 5.492922372111173e-07,
"loss": 1.1374,
"step": 2089
},
{
"epoch": 4.75113808801214,
"grad_norm": 0.8006694316864014,
"learning_rate": 5.47355264869501e-07,
"loss": 1.1511,
"step": 2090
},
{
"epoch": 4.753414264036419,
"grad_norm": 0.8223046064376831,
"learning_rate": 5.454212938299256e-07,
"loss": 1.1636,
"step": 2091
},
{
"epoch": 4.7556904400606985,
"grad_norm": 0.7983687520027161,
"learning_rate": 5.434903270649894e-07,
"loss": 1.1965,
"step": 2092
},
{
"epoch": 4.757966616084977,
"grad_norm": 0.8100834488868713,
"learning_rate": 5.415623675426759e-07,
"loss": 1.1471,
"step": 2093
},
{
"epoch": 4.760242792109256,
"grad_norm": 0.8282079696655273,
"learning_rate": 5.396374182263442e-07,
"loss": 1.1684,
"step": 2094
},
{
"epoch": 4.762518968133536,
"grad_norm": 0.8106188178062439,
"learning_rate": 5.377154820747271e-07,
"loss": 1.1781,
"step": 2095
},
{
"epoch": 4.764795144157815,
"grad_norm": 0.8157429099082947,
"learning_rate": 5.357965620419262e-07,
"loss": 1.1867,
"step": 2096
},
{
"epoch": 4.767071320182094,
"grad_norm": 0.811551570892334,
"learning_rate": 5.338806610774072e-07,
"loss": 1.1776,
"step": 2097
},
{
"epoch": 4.7693474962063735,
"grad_norm": 0.8151900172233582,
"learning_rate": 5.319677821259947e-07,
"loss": 1.1505,
"step": 2098
},
{
"epoch": 4.771623672230652,
"grad_norm": 0.816935658454895,
"learning_rate": 5.300579281278703e-07,
"loss": 1.1596,
"step": 2099
},
{
"epoch": 4.773899848254931,
"grad_norm": 0.8230782747268677,
"learning_rate": 5.281511020185639e-07,
"loss": 1.1585,
"step": 2100
},
{
"epoch": 4.776176024279211,
"grad_norm": 0.8262774348258972,
"learning_rate": 5.262473067289528e-07,
"loss": 1.1281,
"step": 2101
},
{
"epoch": 4.77845220030349,
"grad_norm": 0.8364963531494141,
"learning_rate": 5.243465451852548e-07,
"loss": 1.1742,
"step": 2102
},
{
"epoch": 4.78072837632777,
"grad_norm": 0.8024651408195496,
"learning_rate": 5.224488203090241e-07,
"loss": 1.1709,
"step": 2103
},
{
"epoch": 4.783004552352049,
"grad_norm": 0.8232298493385315,
"learning_rate": 5.205541350171508e-07,
"loss": 1.1824,
"step": 2104
},
{
"epoch": 4.7852807283763275,
"grad_norm": 0.8233107328414917,
"learning_rate": 5.186624922218495e-07,
"loss": 1.1335,
"step": 2105
},
{
"epoch": 4.787556904400607,
"grad_norm": 0.8403355479240417,
"learning_rate": 5.167738948306586e-07,
"loss": 1.1386,
"step": 2106
},
{
"epoch": 4.789833080424886,
"grad_norm": 0.8523696660995483,
"learning_rate": 5.148883457464385e-07,
"loss": 1.2001,
"step": 2107
},
{
"epoch": 4.792109256449166,
"grad_norm": 0.825985848903656,
"learning_rate": 5.130058478673608e-07,
"loss": 1.1253,
"step": 2108
},
{
"epoch": 4.794385432473445,
"grad_norm": 0.8368105292320251,
"learning_rate": 5.111264040869093e-07,
"loss": 1.1424,
"step": 2109
},
{
"epoch": 4.796661608497724,
"grad_norm": 0.8407347798347473,
"learning_rate": 5.092500172938728e-07,
"loss": 1.2224,
"step": 2110
},
{
"epoch": 4.798937784522003,
"grad_norm": 0.8046110272407532,
"learning_rate": 5.073766903723415e-07,
"loss": 1.1643,
"step": 2111
},
{
"epoch": 4.801213960546282,
"grad_norm": 0.8466883301734924,
"learning_rate": 5.055064262017012e-07,
"loss": 1.1745,
"step": 2112
},
{
"epoch": 4.803490136570561,
"grad_norm": 0.8486934304237366,
"learning_rate": 5.036392276566335e-07,
"loss": 1.1093,
"step": 2113
},
{
"epoch": 4.805766312594841,
"grad_norm": 0.8254931569099426,
"learning_rate": 5.01775097607104e-07,
"loss": 1.1839,
"step": 2114
},
{
"epoch": 4.80804248861912,
"grad_norm": 0.8326417803764343,
"learning_rate": 4.999140389183652e-07,
"loss": 1.1827,
"step": 2115
},
{
"epoch": 4.810318664643399,
"grad_norm": 0.8163275122642517,
"learning_rate": 4.980560544509467e-07,
"loss": 1.1775,
"step": 2116
},
{
"epoch": 4.812594840667678,
"grad_norm": 0.8416717648506165,
"learning_rate": 4.962011470606531e-07,
"loss": 1.1539,
"step": 2117
},
{
"epoch": 4.814871016691957,
"grad_norm": 0.785467803478241,
"learning_rate": 4.943493195985604e-07,
"loss": 1.1366,
"step": 2118
},
{
"epoch": 4.817147192716237,
"grad_norm": 0.8316680192947388,
"learning_rate": 4.925005749110096e-07,
"loss": 1.1258,
"step": 2119
},
{
"epoch": 4.819423368740516,
"grad_norm": 0.8190419673919678,
"learning_rate": 4.906549158396029e-07,
"loss": 1.1238,
"step": 2120
},
{
"epoch": 4.821699544764795,
"grad_norm": 0.8137006759643555,
"learning_rate": 4.888123452212023e-07,
"loss": 1.1634,
"step": 2121
},
{
"epoch": 4.8239757207890746,
"grad_norm": 0.8443368077278137,
"learning_rate": 4.869728658879205e-07,
"loss": 1.1868,
"step": 2122
},
{
"epoch": 4.826251896813353,
"grad_norm": 0.8072926998138428,
"learning_rate": 4.85136480667118e-07,
"loss": 1.1586,
"step": 2123
},
{
"epoch": 4.828528072837633,
"grad_norm": 0.8307923674583435,
"learning_rate": 4.833031923814033e-07,
"loss": 1.1681,
"step": 2124
},
{
"epoch": 4.830804248861912,
"grad_norm": 0.7928266525268555,
"learning_rate": 4.814730038486193e-07,
"loss": 1.1716,
"step": 2125
},
{
"epoch": 4.833080424886191,
"grad_norm": 0.8232721090316772,
"learning_rate": 4.796459178818496e-07,
"loss": 1.1974,
"step": 2126
},
{
"epoch": 4.835356600910471,
"grad_norm": 0.8364123106002808,
"learning_rate": 4.77821937289406e-07,
"loss": 1.1508,
"step": 2127
},
{
"epoch": 4.83763277693475,
"grad_norm": 0.8027380108833313,
"learning_rate": 4.760010648748273e-07,
"loss": 1.1578,
"step": 2128
},
{
"epoch": 4.8399089529590285,
"grad_norm": 0.8701675534248352,
"learning_rate": 4.7418330343687703e-07,
"loss": 1.1634,
"step": 2129
},
{
"epoch": 4.842185128983308,
"grad_norm": 0.8248427510261536,
"learning_rate": 4.723686557695351e-07,
"loss": 1.1305,
"step": 2130
},
{
"epoch": 4.844461305007587,
"grad_norm": 0.8026407957077026,
"learning_rate": 4.705571246619955e-07,
"loss": 1.182,
"step": 2131
},
{
"epoch": 4.846737481031866,
"grad_norm": 0.8148173689842224,
"learning_rate": 4.687487128986629e-07,
"loss": 1.1769,
"step": 2132
},
{
"epoch": 4.849013657056146,
"grad_norm": 0.8292121291160583,
"learning_rate": 4.669434232591455e-07,
"loss": 1.1573,
"step": 2133
},
{
"epoch": 4.851289833080425,
"grad_norm": 0.8071368932723999,
"learning_rate": 4.6514125851825574e-07,
"loss": 1.1858,
"step": 2134
},
{
"epoch": 4.853566009104704,
"grad_norm": 0.8253504633903503,
"learning_rate": 4.633422214460004e-07,
"loss": 1.1667,
"step": 2135
},
{
"epoch": 4.855842185128983,
"grad_norm": 0.8513967990875244,
"learning_rate": 4.6154631480757913e-07,
"loss": 1.1429,
"step": 2136
},
{
"epoch": 4.858118361153263,
"grad_norm": 0.8523240089416504,
"learning_rate": 4.5975354136338164e-07,
"loss": 1.168,
"step": 2137
},
{
"epoch": 4.860394537177542,
"grad_norm": 0.8168431520462036,
"learning_rate": 4.579639038689804e-07,
"loss": 1.0975,
"step": 2138
},
{
"epoch": 4.862670713201821,
"grad_norm": 0.8387389183044434,
"learning_rate": 4.561774050751275e-07,
"loss": 1.1652,
"step": 2139
},
{
"epoch": 4.8649468892261005,
"grad_norm": 0.8411140441894531,
"learning_rate": 4.543940477277517e-07,
"loss": 1.1272,
"step": 2140
},
{
"epoch": 4.867223065250379,
"grad_norm": 0.8062079548835754,
"learning_rate": 4.526138345679526e-07,
"loss": 1.128,
"step": 2141
},
{
"epoch": 4.869499241274658,
"grad_norm": 0.8193447589874268,
"learning_rate": 4.508367683319967e-07,
"loss": 1.1541,
"step": 2142
},
{
"epoch": 4.871775417298938,
"grad_norm": 0.8450594544410706,
"learning_rate": 4.4906285175131515e-07,
"loss": 1.162,
"step": 2143
},
{
"epoch": 4.874051593323217,
"grad_norm": 0.8277652263641357,
"learning_rate": 4.472920875524958e-07,
"loss": 1.1554,
"step": 2144
},
{
"epoch": 4.876327769347496,
"grad_norm": 0.833281397819519,
"learning_rate": 4.455244784572832e-07,
"loss": 1.1684,
"step": 2145
},
{
"epoch": 4.8786039453717756,
"grad_norm": 0.8380672931671143,
"learning_rate": 4.4376002718257095e-07,
"loss": 1.2199,
"step": 2146
},
{
"epoch": 4.880880121396054,
"grad_norm": 0.8138743042945862,
"learning_rate": 4.419987364403991e-07,
"loss": 1.1682,
"step": 2147
},
{
"epoch": 4.883156297420334,
"grad_norm": 0.8843202590942383,
"learning_rate": 4.402406089379502e-07,
"loss": 1.1937,
"step": 2148
},
{
"epoch": 4.885432473444613,
"grad_norm": 0.8394472002983093,
"learning_rate": 4.384856473775448e-07,
"loss": 1.1256,
"step": 2149
},
{
"epoch": 4.887708649468892,
"grad_norm": 0.8245413899421692,
"learning_rate": 4.367338544566363e-07,
"loss": 1.1426,
"step": 2150
},
{
"epoch": 4.889984825493172,
"grad_norm": 0.8412423729896545,
"learning_rate": 4.3498523286780973e-07,
"loss": 1.1481,
"step": 2151
},
{
"epoch": 4.892261001517451,
"grad_norm": 0.8549144268035889,
"learning_rate": 4.332397852987741e-07,
"loss": 1.1438,
"step": 2152
},
{
"epoch": 4.89453717754173,
"grad_norm": 0.8178640604019165,
"learning_rate": 4.314975144323591e-07,
"loss": 1.1949,
"step": 2153
},
{
"epoch": 4.896813353566009,
"grad_norm": 0.8267374038696289,
"learning_rate": 4.297584229465149e-07,
"loss": 1.1239,
"step": 2154
},
{
"epoch": 4.899089529590288,
"grad_norm": 0.8270822763442993,
"learning_rate": 4.280225135143004e-07,
"loss": 1.1741,
"step": 2155
},
{
"epoch": 4.901365705614568,
"grad_norm": 0.8518311381340027,
"learning_rate": 4.262897888038872e-07,
"loss": 1.1664,
"step": 2156
},
{
"epoch": 4.903641881638847,
"grad_norm": 0.8309064507484436,
"learning_rate": 4.2456025147855016e-07,
"loss": 1.1916,
"step": 2157
},
{
"epoch": 4.905918057663126,
"grad_norm": 0.8361365795135498,
"learning_rate": 4.228339041966645e-07,
"loss": 1.1917,
"step": 2158
},
{
"epoch": 4.908194233687405,
"grad_norm": 0.8316016793251038,
"learning_rate": 4.211107496117042e-07,
"loss": 1.1765,
"step": 2159
},
{
"epoch": 4.910470409711684,
"grad_norm": 0.8184367418289185,
"learning_rate": 4.193907903722344e-07,
"loss": 1.1862,
"step": 2160
},
{
"epoch": 4.912746585735963,
"grad_norm": 0.8196964859962463,
"learning_rate": 4.176740291219089e-07,
"loss": 1.156,
"step": 2161
},
{
"epoch": 4.915022761760243,
"grad_norm": 0.8255534172058105,
"learning_rate": 4.1596046849946614e-07,
"loss": 1.1498,
"step": 2162
},
{
"epoch": 4.917298937784522,
"grad_norm": 0.8433260321617126,
"learning_rate": 4.142501111387251e-07,
"loss": 1.1963,
"step": 2163
},
{
"epoch": 4.9195751138088015,
"grad_norm": 0.8118224740028381,
"learning_rate": 4.1254295966858206e-07,
"loss": 1.1702,
"step": 2164
},
{
"epoch": 4.92185128983308,
"grad_norm": 0.8457343578338623,
"learning_rate": 4.108390167130044e-07,
"loss": 1.1766,
"step": 2165
},
{
"epoch": 4.924127465857359,
"grad_norm": 0.8077833652496338,
"learning_rate": 4.0913828489102804e-07,
"loss": 1.1485,
"step": 2166
},
{
"epoch": 4.926403641881639,
"grad_norm": 0.8197788000106812,
"learning_rate": 4.074407668167549e-07,
"loss": 1.1737,
"step": 2167
},
{
"epoch": 4.928679817905918,
"grad_norm": 0.833842933177948,
"learning_rate": 4.057464650993451e-07,
"loss": 1.117,
"step": 2168
},
{
"epoch": 4.930955993930198,
"grad_norm": 0.8162358403205872,
"learning_rate": 4.0405538234301627e-07,
"loss": 1.1513,
"step": 2169
},
{
"epoch": 4.9332321699544766,
"grad_norm": 0.8365995287895203,
"learning_rate": 4.0236752114703764e-07,
"loss": 1.1059,
"step": 2170
},
{
"epoch": 4.935508345978755,
"grad_norm": 0.8199490308761597,
"learning_rate": 4.006828841057273e-07,
"loss": 1.1429,
"step": 2171
},
{
"epoch": 4.937784522003035,
"grad_norm": 0.8299211263656616,
"learning_rate": 3.9900147380844716e-07,
"loss": 1.1555,
"step": 2172
},
{
"epoch": 4.940060698027314,
"grad_norm": 0.8137705326080322,
"learning_rate": 3.9732329283960065e-07,
"loss": 1.1494,
"step": 2173
},
{
"epoch": 4.942336874051593,
"grad_norm": 0.8200099468231201,
"learning_rate": 3.956483437786257e-07,
"loss": 1.1554,
"step": 2174
},
{
"epoch": 4.944613050075873,
"grad_norm": 0.8227297067642212,
"learning_rate": 3.9397662919999495e-07,
"loss": 1.171,
"step": 2175
},
{
"epoch": 4.946889226100152,
"grad_norm": 0.8356049656867981,
"learning_rate": 3.923081516732077e-07,
"loss": 1.1456,
"step": 2176
},
{
"epoch": 4.9491654021244305,
"grad_norm": 0.8176268339157104,
"learning_rate": 3.906429137627882e-07,
"loss": 1.123,
"step": 2177
},
{
"epoch": 4.95144157814871,
"grad_norm": 0.8163107633590698,
"learning_rate": 3.8898091802828135e-07,
"loss": 1.1503,
"step": 2178
},
{
"epoch": 4.953717754172989,
"grad_norm": 0.7986423969268799,
"learning_rate": 3.8732216702424915e-07,
"loss": 1.145,
"step": 2179
},
{
"epoch": 4.955993930197269,
"grad_norm": 0.8431774973869324,
"learning_rate": 3.856666633002648e-07,
"loss": 1.1953,
"step": 2180
},
{
"epoch": 4.958270106221548,
"grad_norm": 0.840099036693573,
"learning_rate": 3.840144094009124e-07,
"loss": 1.1333,
"step": 2181
},
{
"epoch": 4.9605462822458275,
"grad_norm": 0.8205875754356384,
"learning_rate": 3.8236540786577987e-07,
"loss": 1.1537,
"step": 2182
},
{
"epoch": 4.962822458270106,
"grad_norm": 0.8261964917182922,
"learning_rate": 3.8071966122945585e-07,
"loss": 1.1524,
"step": 2183
},
{
"epoch": 4.965098634294385,
"grad_norm": 0.8418870568275452,
"learning_rate": 3.790771720215261e-07,
"loss": 1.1054,
"step": 2184
},
{
"epoch": 4.967374810318665,
"grad_norm": 0.8424736857414246,
"learning_rate": 3.774379427665695e-07,
"loss": 1.1525,
"step": 2185
},
{
"epoch": 4.969650986342944,
"grad_norm": 0.8084613680839539,
"learning_rate": 3.7580197598415523e-07,
"loss": 1.1821,
"step": 2186
},
{
"epoch": 4.971927162367223,
"grad_norm": 0.8364375233650208,
"learning_rate": 3.7416927418883724e-07,
"loss": 1.1547,
"step": 2187
},
{
"epoch": 4.9742033383915025,
"grad_norm": 0.8641977906227112,
"learning_rate": 3.7253983989015e-07,
"loss": 1.1181,
"step": 2188
},
{
"epoch": 4.976479514415781,
"grad_norm": 0.7975889444351196,
"learning_rate": 3.709136755926082e-07,
"loss": 1.1443,
"step": 2189
},
{
"epoch": 4.97875569044006,
"grad_norm": 0.8529811501502991,
"learning_rate": 3.69290783795698e-07,
"loss": 1.1151,
"step": 2190
},
{
"epoch": 4.98103186646434,
"grad_norm": 0.8240101933479309,
"learning_rate": 3.676711669938765e-07,
"loss": 1.1451,
"step": 2191
},
{
"epoch": 4.983308042488619,
"grad_norm": 0.8223132491111755,
"learning_rate": 3.6605482767656684e-07,
"loss": 1.1656,
"step": 2192
},
{
"epoch": 4.985584218512899,
"grad_norm": 0.8219751715660095,
"learning_rate": 3.644417683281551e-07,
"loss": 1.1648,
"step": 2193
},
{
"epoch": 4.9878603945371776,
"grad_norm": 0.8212680816650391,
"learning_rate": 3.628319914279843e-07,
"loss": 1.1567,
"step": 2194
},
{
"epoch": 4.990136570561456,
"grad_norm": 0.8115838170051575,
"learning_rate": 3.61225499450355e-07,
"loss": 1.1528,
"step": 2195
}
],
"logging_steps": 1,
"max_steps": 2634,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 439,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.910766821998592e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}