{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9969650986342944,
"eval_steps": 500,
"global_step": 878,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002276176024279211,
"grad_norm": 5.864941596984863,
"learning_rate": 5.0000000000000004e-08,
"loss": 1.982,
"step": 1
},
{
"epoch": 0.004552352048558422,
"grad_norm": 6.175215244293213,
"learning_rate": 1.0000000000000001e-07,
"loss": 2.0217,
"step": 2
},
{
"epoch": 0.006828528072837633,
"grad_norm": 6.1325860023498535,
"learning_rate": 1.5000000000000002e-07,
"loss": 2.0283,
"step": 3
},
{
"epoch": 0.009104704097116844,
"grad_norm": 6.438838481903076,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.0133,
"step": 4
},
{
"epoch": 0.011380880121396054,
"grad_norm": 6.120014190673828,
"learning_rate": 2.5000000000000004e-07,
"loss": 1.9788,
"step": 5
},
{
"epoch": 0.013657056145675266,
"grad_norm": 6.399510860443115,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.0115,
"step": 6
},
{
"epoch": 0.015933232169954476,
"grad_norm": 6.267389297485352,
"learning_rate": 3.5000000000000004e-07,
"loss": 2.034,
"step": 7
},
{
"epoch": 0.018209408194233688,
"grad_norm": 6.195969581604004,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.0221,
"step": 8
},
{
"epoch": 0.0204855842185129,
"grad_norm": 6.281792163848877,
"learning_rate": 4.5000000000000003e-07,
"loss": 2.034,
"step": 9
},
{
"epoch": 0.02276176024279211,
"grad_norm": 6.259925365447998,
"learning_rate": 5.000000000000001e-07,
"loss": 1.9919,
"step": 10
},
{
"epoch": 0.02503793626707132,
"grad_norm": 6.189306259155273,
"learning_rate": 5.5e-07,
"loss": 1.9989,
"step": 11
},
{
"epoch": 0.027314112291350532,
"grad_norm": 6.382223606109619,
"learning_rate": 6.000000000000001e-07,
"loss": 2.0004,
"step": 12
},
{
"epoch": 0.02959028831562974,
"grad_norm": 6.581198215484619,
"learning_rate": 6.5e-07,
"loss": 1.9606,
"step": 13
},
{
"epoch": 0.03186646433990895,
"grad_norm": 6.698477268218994,
"learning_rate": 7.000000000000001e-07,
"loss": 1.9986,
"step": 14
},
{
"epoch": 0.03414264036418816,
"grad_norm": 6.462113857269287,
"learning_rate": 7.5e-07,
"loss": 1.9435,
"step": 15
},
{
"epoch": 0.036418816388467376,
"grad_norm": 6.667123794555664,
"learning_rate": 8.000000000000001e-07,
"loss": 1.9262,
"step": 16
},
{
"epoch": 0.038694992412746584,
"grad_norm": 6.812009334564209,
"learning_rate": 8.500000000000001e-07,
"loss": 1.9341,
"step": 17
},
{
"epoch": 0.0409711684370258,
"grad_norm": 6.460822582244873,
"learning_rate": 9.000000000000001e-07,
"loss": 1.8857,
"step": 18
},
{
"epoch": 0.04324734446130501,
"grad_norm": 5.623890399932861,
"learning_rate": 9.500000000000001e-07,
"loss": 1.8256,
"step": 19
},
{
"epoch": 0.04552352048558422,
"grad_norm": 4.976780414581299,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.8312,
"step": 20
},
{
"epoch": 0.04779969650986343,
"grad_norm": 4.3025383949279785,
"learning_rate": 1.0500000000000001e-06,
"loss": 1.8263,
"step": 21
},
{
"epoch": 0.05007587253414264,
"grad_norm": 3.7881436347961426,
"learning_rate": 1.1e-06,
"loss": 1.7652,
"step": 22
},
{
"epoch": 0.05235204855842185,
"grad_norm": 3.4925425052642822,
"learning_rate": 1.1500000000000002e-06,
"loss": 1.7603,
"step": 23
},
{
"epoch": 0.054628224582701064,
"grad_norm": 3.0760865211486816,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.7599,
"step": 24
},
{
"epoch": 0.05690440060698027,
"grad_norm": 2.7170724868774414,
"learning_rate": 1.25e-06,
"loss": 1.7725,
"step": 25
},
{
"epoch": 0.05918057663125948,
"grad_norm": 2.0981554985046387,
"learning_rate": 1.3e-06,
"loss": 1.6781,
"step": 26
},
{
"epoch": 0.061456752655538696,
"grad_norm": 1.9057221412658691,
"learning_rate": 1.3500000000000002e-06,
"loss": 1.6897,
"step": 27
},
{
"epoch": 0.0637329286798179,
"grad_norm": 1.678957223892212,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.7124,
"step": 28
},
{
"epoch": 0.06600910470409711,
"grad_norm": 1.594223141670227,
"learning_rate": 1.45e-06,
"loss": 1.6953,
"step": 29
},
{
"epoch": 0.06828528072837632,
"grad_norm": 1.5038321018218994,
"learning_rate": 1.5e-06,
"loss": 1.6392,
"step": 30
},
{
"epoch": 0.07056145675265554,
"grad_norm": 1.5202770233154297,
"learning_rate": 1.5500000000000002e-06,
"loss": 1.6756,
"step": 31
},
{
"epoch": 0.07283763277693475,
"grad_norm": 1.4849720001220703,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.6587,
"step": 32
},
{
"epoch": 0.07511380880121396,
"grad_norm": 1.4973641633987427,
"learning_rate": 1.6500000000000003e-06,
"loss": 1.6222,
"step": 33
},
{
"epoch": 0.07738998482549317,
"grad_norm": 1.4055628776550293,
"learning_rate": 1.7000000000000002e-06,
"loss": 1.6318,
"step": 34
},
{
"epoch": 0.07966616084977238,
"grad_norm": 1.365734338760376,
"learning_rate": 1.75e-06,
"loss": 1.5656,
"step": 35
},
{
"epoch": 0.0819423368740516,
"grad_norm": 1.2574050426483154,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.602,
"step": 36
},
{
"epoch": 0.08421851289833081,
"grad_norm": 1.2459263801574707,
"learning_rate": 1.85e-06,
"loss": 1.571,
"step": 37
},
{
"epoch": 0.08649468892261002,
"grad_norm": 1.1563637256622314,
"learning_rate": 1.9000000000000002e-06,
"loss": 1.5968,
"step": 38
},
{
"epoch": 0.08877086494688922,
"grad_norm": 1.0916545391082764,
"learning_rate": 1.9500000000000004e-06,
"loss": 1.5493,
"step": 39
},
{
"epoch": 0.09104704097116843,
"grad_norm": 1.0802186727523804,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.529,
"step": 40
},
{
"epoch": 0.09332321699544764,
"grad_norm": 1.0635664463043213,
"learning_rate": 2.05e-06,
"loss": 1.4784,
"step": 41
},
{
"epoch": 0.09559939301972686,
"grad_norm": 0.985824465751648,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.5508,
"step": 42
},
{
"epoch": 0.09787556904400607,
"grad_norm": 1.036191701889038,
"learning_rate": 2.15e-06,
"loss": 1.5465,
"step": 43
},
{
"epoch": 0.10015174506828528,
"grad_norm": 1.0564978122711182,
"learning_rate": 2.2e-06,
"loss": 1.503,
"step": 44
},
{
"epoch": 0.10242792109256449,
"grad_norm": 1.1553199291229248,
"learning_rate": 2.25e-06,
"loss": 1.4578,
"step": 45
},
{
"epoch": 0.1047040971168437,
"grad_norm": 1.1265777349472046,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.4497,
"step": 46
},
{
"epoch": 0.10698027314112292,
"grad_norm": 0.9469030499458313,
"learning_rate": 2.35e-06,
"loss": 1.4676,
"step": 47
},
{
"epoch": 0.10925644916540213,
"grad_norm": 0.649141252040863,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.455,
"step": 48
},
{
"epoch": 0.11153262518968134,
"grad_norm": 0.6022727489471436,
"learning_rate": 2.4500000000000003e-06,
"loss": 1.4814,
"step": 49
},
{
"epoch": 0.11380880121396054,
"grad_norm": 0.7700338363647461,
"learning_rate": 2.5e-06,
"loss": 1.4786,
"step": 50
},
{
"epoch": 0.11608497723823975,
"grad_norm": 0.924614429473877,
"learning_rate": 2.55e-06,
"loss": 1.4338,
"step": 51
},
{
"epoch": 0.11836115326251896,
"grad_norm": 0.8892627954483032,
"learning_rate": 2.6e-06,
"loss": 1.441,
"step": 52
},
{
"epoch": 0.12063732928679818,
"grad_norm": 0.7454217076301575,
"learning_rate": 2.6500000000000005e-06,
"loss": 1.4016,
"step": 53
},
{
"epoch": 0.12291350531107739,
"grad_norm": 0.5784000754356384,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.4222,
"step": 54
},
{
"epoch": 0.1251896813353566,
"grad_norm": 0.5783917903900146,
"learning_rate": 2.7500000000000004e-06,
"loss": 1.4087,
"step": 55
},
{
"epoch": 0.1274658573596358,
"grad_norm": 0.5947427153587341,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.4008,
"step": 56
},
{
"epoch": 0.12974203338391502,
"grad_norm": 0.6172689199447632,
"learning_rate": 2.85e-06,
"loss": 1.4292,
"step": 57
},
{
"epoch": 0.13201820940819423,
"grad_norm": 0.6890118718147278,
"learning_rate": 2.9e-06,
"loss": 1.4215,
"step": 58
},
{
"epoch": 0.13429438543247343,
"grad_norm": 0.5748654007911682,
"learning_rate": 2.95e-06,
"loss": 1.4402,
"step": 59
},
{
"epoch": 0.13657056145675264,
"grad_norm": 0.5015429258346558,
"learning_rate": 3e-06,
"loss": 1.4338,
"step": 60
},
{
"epoch": 0.13884673748103188,
"grad_norm": 0.4844941794872284,
"learning_rate": 3.05e-06,
"loss": 1.3846,
"step": 61
},
{
"epoch": 0.1411229135053111,
"grad_norm": 0.48353612422943115,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.3864,
"step": 62
},
{
"epoch": 0.1433990895295903,
"grad_norm": 0.47880005836486816,
"learning_rate": 3.1500000000000003e-06,
"loss": 1.3764,
"step": 63
},
{
"epoch": 0.1456752655538695,
"grad_norm": 0.5600204467773438,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.398,
"step": 64
},
{
"epoch": 0.1479514415781487,
"grad_norm": 0.4868157207965851,
"learning_rate": 3.2500000000000002e-06,
"loss": 1.3959,
"step": 65
},
{
"epoch": 0.15022761760242792,
"grad_norm": 0.4253179430961609,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.3695,
"step": 66
},
{
"epoch": 0.15250379362670713,
"grad_norm": 0.4152253270149231,
"learning_rate": 3.3500000000000005e-06,
"loss": 1.428,
"step": 67
},
{
"epoch": 0.15477996965098634,
"grad_norm": 0.43653807044029236,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.4244,
"step": 68
},
{
"epoch": 0.15705614567526555,
"grad_norm": 0.4184909164905548,
"learning_rate": 3.45e-06,
"loss": 1.413,
"step": 69
},
{
"epoch": 0.15933232169954475,
"grad_norm": 0.4401929974555969,
"learning_rate": 3.5e-06,
"loss": 1.3769,
"step": 70
},
{
"epoch": 0.16160849772382396,
"grad_norm": 0.42470934987068176,
"learning_rate": 3.5500000000000003e-06,
"loss": 1.328,
"step": 71
},
{
"epoch": 0.1638846737481032,
"grad_norm": 0.43167445063591003,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.3585,
"step": 72
},
{
"epoch": 0.1661608497723824,
"grad_norm": 0.39305731654167175,
"learning_rate": 3.65e-06,
"loss": 1.3635,
"step": 73
},
{
"epoch": 0.16843702579666162,
"grad_norm": 0.3937039077281952,
"learning_rate": 3.7e-06,
"loss": 1.3583,
"step": 74
},
{
"epoch": 0.17071320182094082,
"grad_norm": 0.4098603129386902,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.3651,
"step": 75
},
{
"epoch": 0.17298937784522003,
"grad_norm": 0.41061389446258545,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.4184,
"step": 76
},
{
"epoch": 0.17526555386949924,
"grad_norm": 0.3926120698451996,
"learning_rate": 3.85e-06,
"loss": 1.3693,
"step": 77
},
{
"epoch": 0.17754172989377845,
"grad_norm": 0.41317838430404663,
"learning_rate": 3.900000000000001e-06,
"loss": 1.3354,
"step": 78
},
{
"epoch": 0.17981790591805766,
"grad_norm": 0.37922877073287964,
"learning_rate": 3.95e-06,
"loss": 1.364,
"step": 79
},
{
"epoch": 0.18209408194233687,
"grad_norm": 0.3894996643066406,
"learning_rate": 4.000000000000001e-06,
"loss": 1.3495,
"step": 80
},
{
"epoch": 0.18437025796661607,
"grad_norm": 0.4024641513824463,
"learning_rate": 4.05e-06,
"loss": 1.3604,
"step": 81
},
{
"epoch": 0.18664643399089528,
"grad_norm": 0.38427308201789856,
"learning_rate": 4.1e-06,
"loss": 1.3734,
"step": 82
},
{
"epoch": 0.18892261001517452,
"grad_norm": 0.38881292939186096,
"learning_rate": 4.15e-06,
"loss": 1.3235,
"step": 83
},
{
"epoch": 0.19119878603945373,
"grad_norm": 0.4112228453159332,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3714,
"step": 84
},
{
"epoch": 0.19347496206373294,
"grad_norm": 0.3790343999862671,
"learning_rate": 4.25e-06,
"loss": 1.3508,
"step": 85
},
{
"epoch": 0.19575113808801214,
"grad_norm": 0.38511818647384644,
"learning_rate": 4.3e-06,
"loss": 1.3726,
"step": 86
},
{
"epoch": 0.19802731411229135,
"grad_norm": 0.3809172213077545,
"learning_rate": 4.350000000000001e-06,
"loss": 1.3978,
"step": 87
},
{
"epoch": 0.20030349013657056,
"grad_norm": 0.39862319827079773,
"learning_rate": 4.4e-06,
"loss": 1.3402,
"step": 88
},
{
"epoch": 0.20257966616084977,
"grad_norm": 0.3779354989528656,
"learning_rate": 4.450000000000001e-06,
"loss": 1.3585,
"step": 89
},
{
"epoch": 0.20485584218512898,
"grad_norm": 0.3755280375480652,
"learning_rate": 4.5e-06,
"loss": 1.3809,
"step": 90
},
{
"epoch": 0.2071320182094082,
"grad_norm": 0.4072270691394806,
"learning_rate": 4.5500000000000005e-06,
"loss": 1.337,
"step": 91
},
{
"epoch": 0.2094081942336874,
"grad_norm": 0.3852587938308716,
"learning_rate": 4.600000000000001e-06,
"loss": 1.3239,
"step": 92
},
{
"epoch": 0.2116843702579666,
"grad_norm": 0.3857567012310028,
"learning_rate": 4.65e-06,
"loss": 1.3676,
"step": 93
},
{
"epoch": 0.21396054628224584,
"grad_norm": 0.39954471588134766,
"learning_rate": 4.7e-06,
"loss": 1.372,
"step": 94
},
{
"epoch": 0.21623672230652505,
"grad_norm": 0.3801283836364746,
"learning_rate": 4.75e-06,
"loss": 1.3636,
"step": 95
},
{
"epoch": 0.21851289833080426,
"grad_norm": 0.37748953700065613,
"learning_rate": 4.800000000000001e-06,
"loss": 1.3298,
"step": 96
},
{
"epoch": 0.22078907435508346,
"grad_norm": 0.3678078055381775,
"learning_rate": 4.85e-06,
"loss": 1.3267,
"step": 97
},
{
"epoch": 0.22306525037936267,
"grad_norm": 0.3928042948246002,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.3705,
"step": 98
},
{
"epoch": 0.22534142640364188,
"grad_norm": 0.3824443817138672,
"learning_rate": 4.95e-06,
"loss": 1.3536,
"step": 99
},
{
"epoch": 0.2276176024279211,
"grad_norm": 0.38775718212127686,
"learning_rate": 5e-06,
"loss": 1.3366,
"step": 100
},
{
"epoch": 0.2298937784522003,
"grad_norm": 0.39415422081947327,
"learning_rate": 4.999998078694254e-06,
"loss": 1.3369,
"step": 101
},
{
"epoch": 0.2321699544764795,
"grad_norm": 0.3640560507774353,
"learning_rate": 4.999992314779968e-06,
"loss": 1.3548,
"step": 102
},
{
"epoch": 0.23444613050075871,
"grad_norm": 0.38077881932258606,
"learning_rate": 4.999982708266002e-06,
"loss": 1.322,
"step": 103
},
{
"epoch": 0.23672230652503792,
"grad_norm": 0.3910675346851349,
"learning_rate": 4.999969259167121e-06,
"loss": 1.3568,
"step": 104
},
{
"epoch": 0.23899848254931716,
"grad_norm": 0.3724777102470398,
"learning_rate": 4.999951967503998e-06,
"loss": 1.3657,
"step": 105
},
{
"epoch": 0.24127465857359637,
"grad_norm": 0.39835065603256226,
"learning_rate": 4.9999308333032095e-06,
"loss": 1.3728,
"step": 106
},
{
"epoch": 0.24355083459787558,
"grad_norm": 0.3887874186038971,
"learning_rate": 4.999905856597241e-06,
"loss": 1.3269,
"step": 107
},
{
"epoch": 0.24582701062215478,
"grad_norm": 0.37291401624679565,
"learning_rate": 4.999877037424482e-06,
"loss": 1.3522,
"step": 108
},
{
"epoch": 0.248103186646434,
"grad_norm": 0.3793584406375885,
"learning_rate": 4.999844375829229e-06,
"loss": 1.3459,
"step": 109
},
{
"epoch": 0.2503793626707132,
"grad_norm": 0.38437148928642273,
"learning_rate": 4.999807871861686e-06,
"loss": 1.3419,
"step": 110
},
{
"epoch": 0.2526555386949924,
"grad_norm": 0.37772583961486816,
"learning_rate": 4.999767525577958e-06,
"loss": 1.3349,
"step": 111
},
{
"epoch": 0.2549317147192716,
"grad_norm": 0.3829944133758545,
"learning_rate": 4.999723337040062e-06,
"loss": 1.3193,
"step": 112
},
{
"epoch": 0.2572078907435508,
"grad_norm": 0.38355737924575806,
"learning_rate": 4.999675306315917e-06,
"loss": 1.3457,
"step": 113
},
{
"epoch": 0.25948406676783003,
"grad_norm": 0.39071688055992126,
"learning_rate": 4.999623433479346e-06,
"loss": 1.3401,
"step": 114
},
{
"epoch": 0.26176024279210924,
"grad_norm": 0.3796067535877228,
"learning_rate": 4.9995677186100835e-06,
"loss": 1.3593,
"step": 115
},
{
"epoch": 0.26403641881638845,
"grad_norm": 0.3870932459831238,
"learning_rate": 4.9995081617937635e-06,
"loss": 1.3678,
"step": 116
},
{
"epoch": 0.26631259484066766,
"grad_norm": 0.3870759606361389,
"learning_rate": 4.999444763121928e-06,
"loss": 1.331,
"step": 117
},
{
"epoch": 0.26858877086494687,
"grad_norm": 0.37003180384635925,
"learning_rate": 4.999377522692023e-06,
"loss": 1.3242,
"step": 118
},
{
"epoch": 0.2708649468892261,
"grad_norm": 0.3826284408569336,
"learning_rate": 4.999306440607401e-06,
"loss": 1.2921,
"step": 119
},
{
"epoch": 0.2731411229135053,
"grad_norm": 0.3886045515537262,
"learning_rate": 4.999231516977318e-06,
"loss": 1.2971,
"step": 120
},
{
"epoch": 0.27541729893778455,
"grad_norm": 0.3992857336997986,
"learning_rate": 4.999152751916936e-06,
"loss": 1.2872,
"step": 121
},
{
"epoch": 0.27769347496206376,
"grad_norm": 0.4303230941295624,
"learning_rate": 4.999070145547318e-06,
"loss": 1.3562,
"step": 122
},
{
"epoch": 0.27996965098634297,
"grad_norm": 0.40188783407211304,
"learning_rate": 4.998983697995435e-06,
"loss": 1.3251,
"step": 123
},
{
"epoch": 0.2822458270106222,
"grad_norm": 0.41683951020240784,
"learning_rate": 4.998893409394162e-06,
"loss": 1.3279,
"step": 124
},
{
"epoch": 0.2845220030349014,
"grad_norm": 0.4539605379104614,
"learning_rate": 4.9987992798822745e-06,
"loss": 1.3133,
"step": 125
},
{
"epoch": 0.2867981790591806,
"grad_norm": 0.40195104479789734,
"learning_rate": 4.998701309604454e-06,
"loss": 1.3372,
"step": 126
},
{
"epoch": 0.2890743550834598,
"grad_norm": 0.40602678060531616,
"learning_rate": 4.998599498711287e-06,
"loss": 1.3008,
"step": 127
},
{
"epoch": 0.291350531107739,
"grad_norm": 0.37955862283706665,
"learning_rate": 4.99849384735926e-06,
"loss": 1.2919,
"step": 128
},
{
"epoch": 0.2936267071320182,
"grad_norm": 0.38034912943840027,
"learning_rate": 4.9983843557107635e-06,
"loss": 1.3307,
"step": 129
},
{
"epoch": 0.2959028831562974,
"grad_norm": 0.3922058641910553,
"learning_rate": 4.9982710239340915e-06,
"loss": 1.3211,
"step": 130
},
{
"epoch": 0.29817905918057663,
"grad_norm": 0.4012414515018463,
"learning_rate": 4.998153852203441e-06,
"loss": 1.3762,
"step": 131
},
{
"epoch": 0.30045523520485584,
"grad_norm": 0.41045159101486206,
"learning_rate": 4.998032840698909e-06,
"loss": 1.3384,
"step": 132
},
{
"epoch": 0.30273141122913505,
"grad_norm": 0.3880952298641205,
"learning_rate": 4.997907989606495e-06,
"loss": 1.2976,
"step": 133
},
{
"epoch": 0.30500758725341426,
"grad_norm": 0.39358070492744446,
"learning_rate": 4.997779299118102e-06,
"loss": 1.3036,
"step": 134
},
{
"epoch": 0.30728376327769347,
"grad_norm": 0.400647908449173,
"learning_rate": 4.997646769431532e-06,
"loss": 1.3573,
"step": 135
},
{
"epoch": 0.3095599393019727,
"grad_norm": 0.40589869022369385,
"learning_rate": 4.99751040075049e-06,
"loss": 1.3462,
"step": 136
},
{
"epoch": 0.3118361153262519,
"grad_norm": 0.420673131942749,
"learning_rate": 4.997370193284581e-06,
"loss": 1.317,
"step": 137
},
{
"epoch": 0.3141122913505311,
"grad_norm": 0.3844830393791199,
"learning_rate": 4.997226147249309e-06,
"loss": 1.3437,
"step": 138
},
{
"epoch": 0.3163884673748103,
"grad_norm": 0.37681150436401367,
"learning_rate": 4.9970782628660794e-06,
"loss": 1.3216,
"step": 139
},
{
"epoch": 0.3186646433990895,
"grad_norm": 0.40281322598457336,
"learning_rate": 4.996926540362198e-06,
"loss": 1.3578,
"step": 140
},
{
"epoch": 0.3209408194233687,
"grad_norm": 0.3950099050998688,
"learning_rate": 4.9967709799708675e-06,
"loss": 1.3472,
"step": 141
},
{
"epoch": 0.3232169954476479,
"grad_norm": 0.3890508711338043,
"learning_rate": 4.9966115819311926e-06,
"loss": 1.3112,
"step": 142
},
{
"epoch": 0.3254931714719272,
"grad_norm": 0.3960939347743988,
"learning_rate": 4.996448346488175e-06,
"loss": 1.331,
"step": 143
},
{
"epoch": 0.3277693474962064,
"grad_norm": 0.394761323928833,
"learning_rate": 4.9962812738927135e-06,
"loss": 1.3265,
"step": 144
},
{
"epoch": 0.3300455235204856,
"grad_norm": 0.4139835238456726,
"learning_rate": 4.996110364401607e-06,
"loss": 1.3423,
"step": 145
},
{
"epoch": 0.3323216995447648,
"grad_norm": 0.40223428606987,
"learning_rate": 4.9959356182775525e-06,
"loss": 1.3213,
"step": 146
},
{
"epoch": 0.334597875569044,
"grad_norm": 0.41239285469055176,
"learning_rate": 4.9957570357891406e-06,
"loss": 1.3488,
"step": 147
},
{
"epoch": 0.33687405159332323,
"grad_norm": 0.41569817066192627,
"learning_rate": 4.995574617210861e-06,
"loss": 1.3373,
"step": 148
},
{
"epoch": 0.33915022761760244,
"grad_norm": 0.40224048495292664,
"learning_rate": 4.9953883628231e-06,
"loss": 1.3086,
"step": 149
},
{
"epoch": 0.34142640364188165,
"grad_norm": 0.4080573618412018,
"learning_rate": 4.995198272912137e-06,
"loss": 1.3221,
"step": 150
},
{
"epoch": 0.34370257966616086,
"grad_norm": 0.41279059648513794,
"learning_rate": 4.9950043477701505e-06,
"loss": 1.3336,
"step": 151
},
{
"epoch": 0.34597875569044007,
"grad_norm": 0.4138430655002594,
"learning_rate": 4.994806587695212e-06,
"loss": 1.3245,
"step": 152
},
{
"epoch": 0.3482549317147193,
"grad_norm": 0.4141685664653778,
"learning_rate": 4.994604992991287e-06,
"loss": 1.3459,
"step": 153
},
{
"epoch": 0.3505311077389985,
"grad_norm": 0.4655224680900574,
"learning_rate": 4.994399563968235e-06,
"loss": 1.307,
"step": 154
},
{
"epoch": 0.3528072837632777,
"grad_norm": 0.40181776881217957,
"learning_rate": 4.99419030094181e-06,
"loss": 1.2951,
"step": 155
},
{
"epoch": 0.3550834597875569,
"grad_norm": 0.4349536597728729,
"learning_rate": 4.99397720423366e-06,
"loss": 1.3346,
"step": 156
},
{
"epoch": 0.3573596358118361,
"grad_norm": 0.47389090061187744,
"learning_rate": 4.993760274171322e-06,
"loss": 1.2918,
"step": 157
},
{
"epoch": 0.3596358118361153,
"grad_norm": 0.43464231491088867,
"learning_rate": 4.993539511088228e-06,
"loss": 1.3469,
"step": 158
},
{
"epoch": 0.3619119878603945,
"grad_norm": 0.43050721287727356,
"learning_rate": 4.993314915323701e-06,
"loss": 1.2993,
"step": 159
},
{
"epoch": 0.36418816388467373,
"grad_norm": 0.4154967665672302,
"learning_rate": 4.9930864872229555e-06,
"loss": 1.301,
"step": 160
},
{
"epoch": 0.36646433990895294,
"grad_norm": 0.4043583869934082,
"learning_rate": 4.992854227137094e-06,
"loss": 1.3357,
"step": 161
},
{
"epoch": 0.36874051593323215,
"grad_norm": 0.4242326617240906,
"learning_rate": 4.992618135423111e-06,
"loss": 1.3139,
"step": 162
},
{
"epoch": 0.37101669195751136,
"grad_norm": 0.4029645621776581,
"learning_rate": 4.992378212443891e-06,
"loss": 1.2773,
"step": 163
},
{
"epoch": 0.37329286798179057,
"grad_norm": 0.3948841989040375,
"learning_rate": 4.992134458568205e-06,
"loss": 1.3267,
"step": 164
},
{
"epoch": 0.37556904400606983,
"grad_norm": 0.4325512647628784,
"learning_rate": 4.991886874170715e-06,
"loss": 1.2986,
"step": 165
},
{
"epoch": 0.37784522003034904,
"grad_norm": 0.4292261600494385,
"learning_rate": 4.991635459631968e-06,
"loss": 1.3383,
"step": 166
},
{
"epoch": 0.38012139605462825,
"grad_norm": 0.407819539308548,
"learning_rate": 4.991380215338399e-06,
"loss": 1.2798,
"step": 167
},
{
"epoch": 0.38239757207890746,
"grad_norm": 0.41592007875442505,
"learning_rate": 4.991121141682332e-06,
"loss": 1.3161,
"step": 168
},
{
"epoch": 0.38467374810318666,
"grad_norm": 0.4135512411594391,
"learning_rate": 4.990858239061973e-06,
"loss": 1.3221,
"step": 169
},
{
"epoch": 0.38694992412746587,
"grad_norm": 0.4168025851249695,
"learning_rate": 4.990591507881416e-06,
"loss": 1.3094,
"step": 170
},
{
"epoch": 0.3892261001517451,
"grad_norm": 0.42845603823661804,
"learning_rate": 4.990320948550638e-06,
"loss": 1.3086,
"step": 171
},
{
"epoch": 0.3915022761760243,
"grad_norm": 0.4117361009120941,
"learning_rate": 4.9900465614855e-06,
"loss": 1.3074,
"step": 172
},
{
"epoch": 0.3937784522003035,
"grad_norm": 0.40385058522224426,
"learning_rate": 4.989768347107749e-06,
"loss": 1.3015,
"step": 173
},
{
"epoch": 0.3960546282245827,
"grad_norm": 0.42507070302963257,
"learning_rate": 4.989486305845012e-06,
"loss": 1.303,
"step": 174
},
{
"epoch": 0.3983308042488619,
"grad_norm": 0.4167408347129822,
"learning_rate": 4.989200438130799e-06,
"loss": 1.3246,
"step": 175
},
{
"epoch": 0.4006069802731411,
"grad_norm": 0.4459727108478546,
"learning_rate": 4.988910744404501e-06,
"loss": 1.3082,
"step": 176
},
{
"epoch": 0.40288315629742033,
"grad_norm": 0.41572514176368713,
"learning_rate": 4.988617225111392e-06,
"loss": 1.329,
"step": 177
},
{
"epoch": 0.40515933232169954,
"grad_norm": 0.40346917510032654,
"learning_rate": 4.988319880702621e-06,
"loss": 1.3204,
"step": 178
},
{
"epoch": 0.40743550834597875,
"grad_norm": 0.49305301904678345,
"learning_rate": 4.988018711635223e-06,
"loss": 1.3174,
"step": 179
},
{
"epoch": 0.40971168437025796,
"grad_norm": 0.4136899411678314,
"learning_rate": 4.987713718372106e-06,
"loss": 1.3153,
"step": 180
},
{
"epoch": 0.41198786039453716,
"grad_norm": 0.4320002794265747,
"learning_rate": 4.98740490138206e-06,
"loss": 1.3233,
"step": 181
},
{
"epoch": 0.4142640364188164,
"grad_norm": 0.40051817893981934,
"learning_rate": 4.9870922611397484e-06,
"loss": 1.3298,
"step": 182
},
{
"epoch": 0.4165402124430956,
"grad_norm": 0.43490317463874817,
"learning_rate": 4.986775798125715e-06,
"loss": 1.2924,
"step": 183
},
{
"epoch": 0.4188163884673748,
"grad_norm": 0.41733044385910034,
"learning_rate": 4.986455512826377e-06,
"loss": 1.3407,
"step": 184
},
{
"epoch": 0.421092564491654,
"grad_norm": 0.45686185359954834,
"learning_rate": 4.986131405734027e-06,
"loss": 1.3002,
"step": 185
},
{
"epoch": 0.4233687405159332,
"grad_norm": 0.4178033173084259,
"learning_rate": 4.985803477346832e-06,
"loss": 1.2707,
"step": 186
},
{
"epoch": 0.42564491654021247,
"grad_norm": 0.44030341506004333,
"learning_rate": 4.985471728168832e-06,
"loss": 1.3522,
"step": 187
},
{
"epoch": 0.4279210925644917,
"grad_norm": 0.4167434573173523,
"learning_rate": 4.985136158709942e-06,
"loss": 1.2952,
"step": 188
},
{
"epoch": 0.4301972685887709,
"grad_norm": 0.43799030780792236,
"learning_rate": 4.984796769485946e-06,
"loss": 1.3204,
"step": 189
},
{
"epoch": 0.4324734446130501,
"grad_norm": 0.3963024914264679,
"learning_rate": 4.984453561018501e-06,
"loss": 1.2852,
"step": 190
},
{
"epoch": 0.4347496206373293,
"grad_norm": 0.4606306850910187,
"learning_rate": 4.984106533835132e-06,
"loss": 1.3,
"step": 191
},
{
"epoch": 0.4370257966616085,
"grad_norm": 0.43703702092170715,
"learning_rate": 4.9837556884692374e-06,
"loss": 1.2865,
"step": 192
},
{
"epoch": 0.4393019726858877,
"grad_norm": 0.419226735830307,
"learning_rate": 4.9834010254600814e-06,
"loss": 1.3212,
"step": 193
},
{
"epoch": 0.44157814871016693,
"grad_norm": 0.4051378071308136,
"learning_rate": 4.983042545352796e-06,
"loss": 1.3102,
"step": 194
},
{
"epoch": 0.44385432473444614,
"grad_norm": 0.44308584928512573,
"learning_rate": 4.982680248698383e-06,
"loss": 1.2753,
"step": 195
},
{
"epoch": 0.44613050075872535,
"grad_norm": 0.48592913150787354,
"learning_rate": 4.982314136053707e-06,
"loss": 1.3468,
"step": 196
},
{
"epoch": 0.44840667678300455,
"grad_norm": 0.4361239969730377,
"learning_rate": 4.981944207981499e-06,
"loss": 1.2345,
"step": 197
},
{
"epoch": 0.45068285280728376,
"grad_norm": 0.4420235753059387,
"learning_rate": 4.981570465050357e-06,
"loss": 1.308,
"step": 198
},
{
"epoch": 0.45295902883156297,
"grad_norm": 0.4724012315273285,
"learning_rate": 4.98119290783474e-06,
"loss": 1.3451,
"step": 199
},
{
"epoch": 0.4552352048558422,
"grad_norm": 0.4347815215587616,
"learning_rate": 4.980811536914968e-06,
"loss": 1.2926,
"step": 200
},
{
"epoch": 0.4575113808801214,
"grad_norm": 0.4243141710758209,
"learning_rate": 4.980426352877228e-06,
"loss": 1.2863,
"step": 201
},
{
"epoch": 0.4597875569044006,
"grad_norm": 0.41129249334335327,
"learning_rate": 4.980037356313563e-06,
"loss": 1.3017,
"step": 202
},
{
"epoch": 0.4620637329286798,
"grad_norm": 0.4349686801433563,
"learning_rate": 4.979644547821879e-06,
"loss": 1.3655,
"step": 203
},
{
"epoch": 0.464339908952959,
"grad_norm": 0.438151478767395,
"learning_rate": 4.97924792800594e-06,
"loss": 1.304,
"step": 204
},
{
"epoch": 0.4666160849772382,
"grad_norm": 0.46755126118659973,
"learning_rate": 4.978847497475369e-06,
"loss": 1.3282,
"step": 205
},
{
"epoch": 0.46889226100151743,
"grad_norm": 0.42544615268707275,
"learning_rate": 4.9784432568456445e-06,
"loss": 1.3524,
"step": 206
},
{
"epoch": 0.47116843702579664,
"grad_norm": 0.4163425862789154,
"learning_rate": 4.9780352067381024e-06,
"loss": 1.3303,
"step": 207
},
{
"epoch": 0.47344461305007585,
"grad_norm": 0.4662051498889923,
"learning_rate": 4.977623347779935e-06,
"loss": 1.2723,
"step": 208
},
{
"epoch": 0.4757207890743551,
"grad_norm": 0.4841192662715912,
"learning_rate": 4.977207680604187e-06,
"loss": 1.3281,
"step": 209
},
{
"epoch": 0.4779969650986343,
"grad_norm": 0.47023245692253113,
"learning_rate": 4.976788205849758e-06,
"loss": 1.2983,
"step": 210
},
{
"epoch": 0.4802731411229135,
"grad_norm": 0.4251156449317932,
"learning_rate": 4.9763649241613985e-06,
"loss": 1.3215,
"step": 211
},
{
"epoch": 0.48254931714719274,
"grad_norm": 0.436788409948349,
"learning_rate": 4.975937836189712e-06,
"loss": 1.3006,
"step": 212
},
{
"epoch": 0.48482549317147194,
"grad_norm": 0.46025222539901733,
"learning_rate": 4.975506942591152e-06,
"loss": 1.3121,
"step": 213
},
{
"epoch": 0.48710166919575115,
"grad_norm": 0.43663930892944336,
"learning_rate": 4.97507224402802e-06,
"loss": 1.3133,
"step": 214
},
{
"epoch": 0.48937784522003036,
"grad_norm": 0.48787179589271545,
"learning_rate": 4.974633741168469e-06,
"loss": 1.266,
"step": 215
},
{
"epoch": 0.49165402124430957,
"grad_norm": 0.4265913665294647,
"learning_rate": 4.974191434686496e-06,
"loss": 1.3035,
"step": 216
},
{
"epoch": 0.4939301972685888,
"grad_norm": 0.4345017373561859,
"learning_rate": 4.973745325261946e-06,
"loss": 1.2987,
"step": 217
},
{
"epoch": 0.496206373292868,
"grad_norm": 0.47078996896743774,
"learning_rate": 4.973295413580509e-06,
"loss": 1.3176,
"step": 218
},
{
"epoch": 0.4984825493171472,
"grad_norm": 0.4349548816680908,
"learning_rate": 4.97284170033372e-06,
"loss": 1.2829,
"step": 219
},
{
"epoch": 0.5007587253414264,
"grad_norm": 0.4705260694026947,
"learning_rate": 4.9723841862189555e-06,
"loss": 1.2847,
"step": 220
},
{
"epoch": 0.5030349013657056,
"grad_norm": 0.4285137951374054,
"learning_rate": 4.971922871939436e-06,
"loss": 1.2774,
"step": 221
},
{
"epoch": 0.5053110773899848,
"grad_norm": 0.46022048592567444,
"learning_rate": 4.971457758204221e-06,
"loss": 1.3006,
"step": 222
},
{
"epoch": 0.507587253414264,
"grad_norm": 0.4904478192329407,
"learning_rate": 4.970988845728213e-06,
"loss": 1.3032,
"step": 223
},
{
"epoch": 0.5098634294385432,
"grad_norm": 0.4171503484249115,
"learning_rate": 4.9705161352321496e-06,
"loss": 1.3118,
"step": 224
},
{
"epoch": 0.5121396054628224,
"grad_norm": 0.4424084722995758,
"learning_rate": 4.970039627442608e-06,
"loss": 1.2342,
"step": 225
},
{
"epoch": 0.5144157814871017,
"grad_norm": 0.45744988322257996,
"learning_rate": 4.969559323092004e-06,
"loss": 1.2975,
"step": 226
},
{
"epoch": 0.5166919575113809,
"grad_norm": 0.4306228756904602,
"learning_rate": 4.969075222918583e-06,
"loss": 1.2791,
"step": 227
},
{
"epoch": 0.5189681335356601,
"grad_norm": 0.43930479884147644,
"learning_rate": 4.9685873276664324e-06,
"loss": 1.2952,
"step": 228
},
{
"epoch": 0.5212443095599393,
"grad_norm": 0.4268686771392822,
"learning_rate": 4.968095638085467e-06,
"loss": 1.2902,
"step": 229
},
{
"epoch": 0.5235204855842185,
"grad_norm": 0.4320680499076843,
"learning_rate": 4.9676001549314356e-06,
"loss": 1.2941,
"step": 230
},
{
"epoch": 0.5257966616084977,
"grad_norm": 0.4509009122848511,
"learning_rate": 4.967100878965918e-06,
"loss": 1.3353,
"step": 231
},
{
"epoch": 0.5280728376327769,
"grad_norm": 0.4458315670490265,
"learning_rate": 4.966597810956325e-06,
"loss": 1.2918,
"step": 232
},
{
"epoch": 0.5303490136570561,
"grad_norm": 0.4613376259803772,
"learning_rate": 4.966090951675893e-06,
"loss": 1.3085,
"step": 233
},
{
"epoch": 0.5326251896813353,
"grad_norm": 0.4486188590526581,
"learning_rate": 4.9655803019036875e-06,
"loss": 1.2783,
"step": 234
},
{
"epoch": 0.5349013657056145,
"grad_norm": 0.44070056080818176,
"learning_rate": 4.9650658624246e-06,
"loss": 1.2969,
"step": 235
},
{
"epoch": 0.5371775417298937,
"grad_norm": 0.45442667603492737,
"learning_rate": 4.9645476340293474e-06,
"loss": 1.273,
"step": 236
},
{
"epoch": 0.539453717754173,
"grad_norm": 0.4485810697078705,
"learning_rate": 4.96402561751447e-06,
"loss": 1.2524,
"step": 237
},
{
"epoch": 0.5417298937784522,
"grad_norm": 0.43408727645874023,
"learning_rate": 4.96349981368233e-06,
"loss": 1.3,
"step": 238
},
{
"epoch": 0.5440060698027314,
"grad_norm": 0.45317673683166504,
"learning_rate": 4.962970223341112e-06,
"loss": 1.2959,
"step": 239
},
{
"epoch": 0.5462822458270106,
"grad_norm": 0.45147350430488586,
"learning_rate": 4.962436847304818e-06,
"loss": 1.2588,
"step": 240
},
{
"epoch": 0.5485584218512898,
"grad_norm": 0.4372202157974243,
"learning_rate": 4.961899686393273e-06,
"loss": 1.2472,
"step": 241
},
{
"epoch": 0.5508345978755691,
"grad_norm": 0.4300381541252136,
"learning_rate": 4.961358741432116e-06,
"loss": 1.2892,
"step": 242
},
{
"epoch": 0.5531107738998483,
"grad_norm": 0.4326576888561249,
"learning_rate": 4.9608140132528045e-06,
"loss": 1.2873,
"step": 243
},
{
"epoch": 0.5553869499241275,
"grad_norm": 0.42891374230384827,
"learning_rate": 4.960265502692609e-06,
"loss": 1.3159,
"step": 244
},
{
"epoch": 0.5576631259484067,
"grad_norm": 0.44637322425842285,
"learning_rate": 4.959713210594616e-06,
"loss": 1.2964,
"step": 245
},
{
"epoch": 0.5599393019726859,
"grad_norm": 0.4534567892551422,
"learning_rate": 4.959157137807721e-06,
"loss": 1.2811,
"step": 246
},
{
"epoch": 0.5622154779969651,
"grad_norm": 0.4480896294116974,
"learning_rate": 4.958597285186635e-06,
"loss": 1.2887,
"step": 247
},
{
"epoch": 0.5644916540212443,
"grad_norm": 0.42966964840888977,
"learning_rate": 4.958033653591874e-06,
"loss": 1.2927,
"step": 248
},
{
"epoch": 0.5667678300455236,
"grad_norm": 0.4520474076271057,
"learning_rate": 4.9574662438897675e-06,
"loss": 1.334,
"step": 249
},
{
"epoch": 0.5690440060698028,
"grad_norm": 0.4476149082183838,
"learning_rate": 4.956895056952448e-06,
"loss": 1.2813,
"step": 250
},
{
"epoch": 0.571320182094082,
"grad_norm": 0.4495325982570648,
"learning_rate": 4.956320093657855e-06,
"loss": 1.3455,
"step": 251
},
{
"epoch": 0.5735963581183612,
"grad_norm": 0.4634062945842743,
"learning_rate": 4.955741354889734e-06,
"loss": 1.3009,
"step": 252
},
{
"epoch": 0.5758725341426404,
"grad_norm": 0.43844589591026306,
"learning_rate": 4.955158841537632e-06,
"loss": 1.2775,
"step": 253
},
{
"epoch": 0.5781487101669196,
"grad_norm": 0.4297947585582733,
"learning_rate": 4.954572554496897e-06,
"loss": 1.3005,
"step": 254
},
{
"epoch": 0.5804248861911988,
"grad_norm": 0.45026981830596924,
"learning_rate": 4.953982494668679e-06,
"loss": 1.2829,
"step": 255
},
{
"epoch": 0.582701062215478,
"grad_norm": 0.4508177936077118,
"learning_rate": 4.953388662959926e-06,
"loss": 1.3249,
"step": 256
},
{
"epoch": 0.5849772382397572,
"grad_norm": 0.4628501236438751,
"learning_rate": 4.952791060283384e-06,
"loss": 1.2772,
"step": 257
},
{
"epoch": 0.5872534142640364,
"grad_norm": 0.47145721316337585,
"learning_rate": 4.952189687557595e-06,
"loss": 1.2843,
"step": 258
},
{
"epoch": 0.5895295902883156,
"grad_norm": 0.44380298256874084,
"learning_rate": 4.951584545706896e-06,
"loss": 1.3169,
"step": 259
},
{
"epoch": 0.5918057663125948,
"grad_norm": 0.45627689361572266,
"learning_rate": 4.950975635661416e-06,
"loss": 1.2855,
"step": 260
},
{
"epoch": 0.5940819423368741,
"grad_norm": 0.43097957968711853,
"learning_rate": 4.950362958357078e-06,
"loss": 1.2802,
"step": 261
},
{
"epoch": 0.5963581183611533,
"grad_norm": 0.4480797052383423,
"learning_rate": 4.949746514735594e-06,
"loss": 1.2845,
"step": 262
},
{
"epoch": 0.5986342943854325,
"grad_norm": 0.4356028139591217,
"learning_rate": 4.949126305744466e-06,
"loss": 1.2559,
"step": 263
},
{
"epoch": 0.6009104704097117,
"grad_norm": 0.45533114671707153,
"learning_rate": 4.948502332336982e-06,
"loss": 1.333,
"step": 264
},
{
"epoch": 0.6031866464339909,
"grad_norm": 0.43486839532852173,
"learning_rate": 4.947874595472216e-06,
"loss": 1.299,
"step": 265
},
{
"epoch": 0.6054628224582701,
"grad_norm": 0.45472636818885803,
"learning_rate": 4.947243096115028e-06,
"loss": 1.2853,
"step": 266
},
{
"epoch": 0.6077389984825493,
"grad_norm": 0.448030024766922,
"learning_rate": 4.946607835236064e-06,
"loss": 1.2549,
"step": 267
},
{
"epoch": 0.6100151745068285,
"grad_norm": 0.46248579025268555,
"learning_rate": 4.945968813811743e-06,
"loss": 1.2845,
"step": 268
},
{
"epoch": 0.6122913505311077,
"grad_norm": 0.47284016013145447,
"learning_rate": 4.9453260328242735e-06,
"loss": 1.274,
"step": 269
},
{
"epoch": 0.6145675265553869,
"grad_norm": 0.46916916966438293,
"learning_rate": 4.944679493261637e-06,
"loss": 1.272,
"step": 270
},
{
"epoch": 0.6168437025796661,
"grad_norm": 0.4469199776649475,
"learning_rate": 4.944029196117594e-06,
"loss": 1.273,
"step": 271
},
{
"epoch": 0.6191198786039454,
"grad_norm": 0.4460132420063019,
"learning_rate": 4.943375142391679e-06,
"loss": 1.2749,
"step": 272
},
{
"epoch": 0.6213960546282246,
"grad_norm": 0.45281344652175903,
"learning_rate": 4.942717333089204e-06,
"loss": 1.2858,
"step": 273
},
{
"epoch": 0.6236722306525038,
"grad_norm": 0.4766104221343994,
"learning_rate": 4.942055769221249e-06,
"loss": 1.3047,
"step": 274
},
{
"epoch": 0.625948406676783,
"grad_norm": 0.4342869818210602,
"learning_rate": 4.941390451804668e-06,
"loss": 1.258,
"step": 275
},
{
"epoch": 0.6282245827010622,
"grad_norm": 0.44943931698799133,
"learning_rate": 4.940721381862083e-06,
"loss": 1.2714,
"step": 276
},
{
"epoch": 0.6305007587253414,
"grad_norm": 0.4642450213432312,
"learning_rate": 4.940048560421887e-06,
"loss": 1.2883,
"step": 277
},
{
"epoch": 0.6327769347496206,
"grad_norm": 0.530925989151001,
"learning_rate": 4.9393719885182335e-06,
"loss": 1.2869,
"step": 278
},
{
"epoch": 0.6350531107738998,
"grad_norm": 0.44706323742866516,
"learning_rate": 4.938691667191044e-06,
"loss": 1.2912,
"step": 279
},
{
"epoch": 0.637329286798179,
"grad_norm": 0.46952497959136963,
"learning_rate": 4.938007597486005e-06,
"loss": 1.3293,
"step": 280
},
{
"epoch": 0.6396054628224582,
"grad_norm": 0.45387259125709534,
"learning_rate": 4.937319780454559e-06,
"loss": 1.2328,
"step": 281
},
{
"epoch": 0.6418816388467374,
"grad_norm": 0.4683968126773834,
"learning_rate": 4.936628217153914e-06,
"loss": 1.3101,
"step": 282
},
{
"epoch": 0.6441578148710166,
"grad_norm": 0.4984208941459656,
"learning_rate": 4.935932908647033e-06,
"loss": 1.3078,
"step": 283
},
{
"epoch": 0.6464339908952959,
"grad_norm": 0.47393515706062317,
"learning_rate": 4.935233856002635e-06,
"loss": 1.2667,
"step": 284
},
{
"epoch": 0.6487101669195751,
"grad_norm": 0.4559146761894226,
"learning_rate": 4.9345310602951964e-06,
"loss": 1.2816,
"step": 285
},
{
"epoch": 0.6509863429438544,
"grad_norm": 0.4612574279308319,
"learning_rate": 4.933824522604945e-06,
"loss": 1.3009,
"step": 286
},
{
"epoch": 0.6532625189681336,
"grad_norm": 0.4839983880519867,
"learning_rate": 4.933114244017861e-06,
"loss": 1.2762,
"step": 287
},
{
"epoch": 0.6555386949924128,
"grad_norm": 0.47950032353401184,
"learning_rate": 4.932400225625674e-06,
"loss": 1.2639,
"step": 288
},
{
"epoch": 0.657814871016692,
"grad_norm": 0.46797841787338257,
"learning_rate": 4.931682468525863e-06,
"loss": 1.3116,
"step": 289
},
{
"epoch": 0.6600910470409712,
"grad_norm": 0.46507689356803894,
"learning_rate": 4.93096097382165e-06,
"loss": 1.2795,
"step": 290
},
{
"epoch": 0.6623672230652504,
"grad_norm": 0.4672064781188965,
"learning_rate": 4.9302357426220086e-06,
"loss": 1.2769,
"step": 291
},
{
"epoch": 0.6646433990895296,
"grad_norm": 0.469881147146225,
"learning_rate": 4.929506776041648e-06,
"loss": 1.246,
"step": 292
},
{
"epoch": 0.6669195751138088,
"grad_norm": 0.49012723565101624,
"learning_rate": 4.928774075201024e-06,
"loss": 1.3308,
"step": 293
},
{
"epoch": 0.669195751138088,
"grad_norm": 0.47186344861984253,
"learning_rate": 4.9280376412263295e-06,
"loss": 1.2685,
"step": 294
},
{
"epoch": 0.6714719271623673,
"grad_norm": 0.4914249777793884,
"learning_rate": 4.9272974752494974e-06,
"loss": 1.3029,
"step": 295
},
{
"epoch": 0.6737481031866465,
"grad_norm": 0.4709179699420929,
"learning_rate": 4.9265535784081965e-06,
"loss": 1.2459,
"step": 296
},
{
"epoch": 0.6760242792109257,
"grad_norm": 0.46568986773490906,
"learning_rate": 4.925805951845826e-06,
"loss": 1.2713,
"step": 297
},
{
"epoch": 0.6783004552352049,
"grad_norm": 0.46113038063049316,
"learning_rate": 4.925054596711526e-06,
"loss": 1.2787,
"step": 298
},
{
"epoch": 0.6805766312594841,
"grad_norm": 0.49636346101760864,
"learning_rate": 4.92429951416016e-06,
"loss": 1.2787,
"step": 299
},
{
"epoch": 0.6828528072837633,
"grad_norm": 0.4823263883590698,
"learning_rate": 4.9235407053523235e-06,
"loss": 1.3029,
"step": 300
},
{
"epoch": 0.6851289833080425,
"grad_norm": 0.45272234082221985,
"learning_rate": 4.92277817145434e-06,
"loss": 1.3053,
"step": 301
},
{
"epoch": 0.6874051593323217,
"grad_norm": 0.4724232256412506,
"learning_rate": 4.922011913638258e-06,
"loss": 1.2594,
"step": 302
},
{
"epoch": 0.6896813353566009,
"grad_norm": 0.5244677066802979,
"learning_rate": 4.92124193308185e-06,
"loss": 1.305,
"step": 303
},
{
"epoch": 0.6919575113808801,
"grad_norm": 0.4562852382659912,
"learning_rate": 4.92046823096861e-06,
"loss": 1.283,
"step": 304
},
{
"epoch": 0.6942336874051593,
"grad_norm": 0.460565447807312,
"learning_rate": 4.919690808487754e-06,
"loss": 1.3004,
"step": 305
},
{
"epoch": 0.6965098634294385,
"grad_norm": 0.4588528871536255,
"learning_rate": 4.918909666834214e-06,
"loss": 1.2745,
"step": 306
},
{
"epoch": 0.6987860394537178,
"grad_norm": 0.4980691075325012,
"learning_rate": 4.91812480720864e-06,
"loss": 1.2802,
"step": 307
},
{
"epoch": 0.701062215477997,
"grad_norm": 0.5080570578575134,
"learning_rate": 4.917336230817396e-06,
"loss": 1.286,
"step": 308
},
{
"epoch": 0.7033383915022762,
"grad_norm": 0.46659743785858154,
"learning_rate": 4.9165439388725585e-06,
"loss": 1.3093,
"step": 309
},
{
"epoch": 0.7056145675265554,
"grad_norm": 0.4846821129322052,
"learning_rate": 4.915747932591916e-06,
"loss": 1.2904,
"step": 310
},
{
"epoch": 0.7078907435508346,
"grad_norm": 0.4945422112941742,
"learning_rate": 4.914948213198966e-06,
"loss": 1.2592,
"step": 311
},
{
"epoch": 0.7101669195751138,
"grad_norm": 0.49606069922447205,
"learning_rate": 4.9141447819229125e-06,
"loss": 1.2699,
"step": 312
},
{
"epoch": 0.712443095599393,
"grad_norm": 0.48810863494873047,
"learning_rate": 4.913337639998666e-06,
"loss": 1.2993,
"step": 313
},
{
"epoch": 0.7147192716236722,
"grad_norm": 0.4933323562145233,
"learning_rate": 4.912526788666838e-06,
"loss": 1.2514,
"step": 314
},
{
"epoch": 0.7169954476479514,
"grad_norm": 0.4674908220767975,
"learning_rate": 4.911712229173745e-06,
"loss": 1.2602,
"step": 315
},
{
"epoch": 0.7192716236722306,
"grad_norm": 0.5178641676902771,
"learning_rate": 4.9108939627714e-06,
"loss": 1.312,
"step": 316
},
{
"epoch": 0.7215477996965098,
"grad_norm": 0.4949224293231964,
"learning_rate": 4.910071990717516e-06,
"loss": 1.2787,
"step": 317
},
{
"epoch": 0.723823975720789,
"grad_norm": 0.4700353443622589,
"learning_rate": 4.909246314275499e-06,
"loss": 1.251,
"step": 318
},
{
"epoch": 0.7261001517450683,
"grad_norm": 0.4828815758228302,
"learning_rate": 4.908416934714452e-06,
"loss": 1.2967,
"step": 319
},
{
"epoch": 0.7283763277693475,
"grad_norm": 0.47781631350517273,
"learning_rate": 4.907583853309168e-06,
"loss": 1.3108,
"step": 320
},
{
"epoch": 0.7306525037936267,
"grad_norm": 0.4467979073524475,
"learning_rate": 4.90674707134013e-06,
"loss": 1.2332,
"step": 321
},
{
"epoch": 0.7329286798179059,
"grad_norm": 0.4529818892478943,
"learning_rate": 4.90590659009351e-06,
"loss": 1.2958,
"step": 322
},
{
"epoch": 0.7352048558421851,
"grad_norm": 0.4782491624355316,
"learning_rate": 4.905062410861164e-06,
"loss": 1.2754,
"step": 323
},
{
"epoch": 0.7374810318664643,
"grad_norm": 0.4517338275909424,
"learning_rate": 4.9042145349406335e-06,
"loss": 1.3098,
"step": 324
},
{
"epoch": 0.7397572078907435,
"grad_norm": 0.4599636197090149,
"learning_rate": 4.903362963635142e-06,
"loss": 1.2843,
"step": 325
},
{
"epoch": 0.7420333839150227,
"grad_norm": 0.4922712743282318,
"learning_rate": 4.902507698253593e-06,
"loss": 1.2987,
"step": 326
},
{
"epoch": 0.7443095599393019,
"grad_norm": 0.47610870003700256,
"learning_rate": 4.901648740110566e-06,
"loss": 1.2739,
"step": 327
},
{
"epoch": 0.7465857359635811,
"grad_norm": 0.46494367718696594,
"learning_rate": 4.900786090526319e-06,
"loss": 1.2579,
"step": 328
},
{
"epoch": 0.7488619119878603,
"grad_norm": 0.46867313981056213,
"learning_rate": 4.899919750826784e-06,
"loss": 1.2838,
"step": 329
},
{
"epoch": 0.7511380880121397,
"grad_norm": 0.49616602063179016,
"learning_rate": 4.899049722343561e-06,
"loss": 1.3108,
"step": 330
},
{
"epoch": 0.7534142640364189,
"grad_norm": 0.46307483315467834,
"learning_rate": 4.898176006413925e-06,
"loss": 1.3047,
"step": 331
},
{
"epoch": 0.7556904400606981,
"grad_norm": 0.47475141286849976,
"learning_rate": 4.897298604380816e-06,
"loss": 1.2416,
"step": 332
},
{
"epoch": 0.7579666160849773,
"grad_norm": 0.468184232711792,
"learning_rate": 4.896417517592838e-06,
"loss": 1.2904,
"step": 333
},
{
"epoch": 0.7602427921092565,
"grad_norm": 0.47171875834465027,
"learning_rate": 4.895532747404263e-06,
"loss": 1.2641,
"step": 334
},
{
"epoch": 0.7625189681335357,
"grad_norm": 0.45646342635154724,
"learning_rate": 4.8946442951750215e-06,
"loss": 1.285,
"step": 335
},
{
"epoch": 0.7647951441578149,
"grad_norm": 0.48363035917282104,
"learning_rate": 4.893752162270704e-06,
"loss": 1.2507,
"step": 336
},
{
"epoch": 0.7670713201820941,
"grad_norm": 0.4761241674423218,
"learning_rate": 4.892856350062558e-06,
"loss": 1.2628,
"step": 337
},
{
"epoch": 0.7693474962063733,
"grad_norm": 0.47408172488212585,
"learning_rate": 4.891956859927489e-06,
"loss": 1.2919,
"step": 338
},
{
"epoch": 0.7716236722306525,
"grad_norm": 0.48075783252716064,
"learning_rate": 4.89105369324805e-06,
"loss": 1.282,
"step": 339
},
{
"epoch": 0.7738998482549317,
"grad_norm": 0.45937585830688477,
"learning_rate": 4.890146851412452e-06,
"loss": 1.2823,
"step": 340
},
{
"epoch": 0.776176024279211,
"grad_norm": 0.5253570675849915,
"learning_rate": 4.889236335814549e-06,
"loss": 1.2657,
"step": 341
},
{
"epoch": 0.7784522003034902,
"grad_norm": 0.47888922691345215,
"learning_rate": 4.888322147853846e-06,
"loss": 1.3003,
"step": 342
},
{
"epoch": 0.7807283763277694,
"grad_norm": 0.4705219566822052,
"learning_rate": 4.887404288935488e-06,
"loss": 1.2822,
"step": 343
},
{
"epoch": 0.7830045523520486,
"grad_norm": 0.5236004590988159,
"learning_rate": 4.8864827604702675e-06,
"loss": 1.2338,
"step": 344
},
{
"epoch": 0.7852807283763278,
"grad_norm": 0.4856922924518585,
"learning_rate": 4.885557563874614e-06,
"loss": 1.2394,
"step": 345
},
{
"epoch": 0.787556904400607,
"grad_norm": 0.48127493262290955,
"learning_rate": 4.884628700570595e-06,
"loss": 1.2827,
"step": 346
},
{
"epoch": 0.7898330804248862,
"grad_norm": 0.46932077407836914,
"learning_rate": 4.883696171985917e-06,
"loss": 1.2608,
"step": 347
},
{
"epoch": 0.7921092564491654,
"grad_norm": 0.5052128434181213,
"learning_rate": 4.882759979553916e-06,
"loss": 1.2727,
"step": 348
},
{
"epoch": 0.7943854324734446,
"grad_norm": 0.5077352523803711,
"learning_rate": 4.881820124713562e-06,
"loss": 1.2364,
"step": 349
},
{
"epoch": 0.7966616084977238,
"grad_norm": 0.5095151662826538,
"learning_rate": 4.880876608909454e-06,
"loss": 1.2788,
"step": 350
},
{
"epoch": 0.798937784522003,
"grad_norm": 0.4920441806316376,
"learning_rate": 4.8799294335918185e-06,
"loss": 1.2944,
"step": 351
},
{
"epoch": 0.8012139605462822,
"grad_norm": 0.4824545085430145,
"learning_rate": 4.8789786002165055e-06,
"loss": 1.2669,
"step": 352
},
{
"epoch": 0.8034901365705615,
"grad_norm": 0.49492961168289185,
"learning_rate": 4.878024110244988e-06,
"loss": 1.3021,
"step": 353
},
{
"epoch": 0.8057663125948407,
"grad_norm": 0.5213160514831543,
"learning_rate": 4.877065965144361e-06,
"loss": 1.2832,
"step": 354
},
{
"epoch": 0.8080424886191199,
"grad_norm": 0.4782240390777588,
"learning_rate": 4.8761041663873345e-06,
"loss": 1.2812,
"step": 355
},
{
"epoch": 0.8103186646433991,
"grad_norm": 0.4901832938194275,
"learning_rate": 4.875138715452237e-06,
"loss": 1.289,
"step": 356
},
{
"epoch": 0.8125948406676783,
"grad_norm": 0.48875507712364197,
"learning_rate": 4.87416961382301e-06,
"loss": 1.2876,
"step": 357
},
{
"epoch": 0.8148710166919575,
"grad_norm": 0.49773871898651123,
"learning_rate": 4.873196862989205e-06,
"loss": 1.2766,
"step": 358
},
{
"epoch": 0.8171471927162367,
"grad_norm": 0.5069698691368103,
"learning_rate": 4.872220464445983e-06,
"loss": 1.284,
"step": 359
},
{
"epoch": 0.8194233687405159,
"grad_norm": 0.4725041389465332,
"learning_rate": 4.871240419694115e-06,
"loss": 1.2183,
"step": 360
},
{
"epoch": 0.8216995447647951,
"grad_norm": 0.4846250116825104,
"learning_rate": 4.8702567302399705e-06,
"loss": 1.2851,
"step": 361
},
{
"epoch": 0.8239757207890743,
"grad_norm": 0.4825296998023987,
"learning_rate": 4.869269397595525e-06,
"loss": 1.2621,
"step": 362
},
{
"epoch": 0.8262518968133535,
"grad_norm": 0.4880293905735016,
"learning_rate": 4.8682784232783535e-06,
"loss": 1.2684,
"step": 363
},
{
"epoch": 0.8285280728376327,
"grad_norm": 0.4805878698825836,
"learning_rate": 4.867283808811626e-06,
"loss": 1.2604,
"step": 364
},
{
"epoch": 0.830804248861912,
"grad_norm": 0.5031499266624451,
"learning_rate": 4.86628555572411e-06,
"loss": 1.2701,
"step": 365
},
{
"epoch": 0.8330804248861912,
"grad_norm": 0.49856945872306824,
"learning_rate": 4.865283665550167e-06,
"loss": 1.266,
"step": 366
},
{
"epoch": 0.8353566009104704,
"grad_norm": 0.49834373593330383,
"learning_rate": 4.864278139829745e-06,
"loss": 1.254,
"step": 367
},
{
"epoch": 0.8376327769347496,
"grad_norm": 0.47436273097991943,
"learning_rate": 4.863268980108381e-06,
"loss": 1.308,
"step": 368
},
{
"epoch": 0.8399089529590288,
"grad_norm": 0.4866158962249756,
"learning_rate": 4.8622561879372e-06,
"loss": 1.2565,
"step": 369
},
{
"epoch": 0.842185128983308,
"grad_norm": 0.46591049432754517,
"learning_rate": 4.861239764872909e-06,
"loss": 1.2528,
"step": 370
},
{
"epoch": 0.8444613050075872,
"grad_norm": 0.5084807872772217,
"learning_rate": 4.860219712477795e-06,
"loss": 1.2727,
"step": 371
},
{
"epoch": 0.8467374810318664,
"grad_norm": 0.49390751123428345,
"learning_rate": 4.859196032319724e-06,
"loss": 1.2544,
"step": 372
},
{
"epoch": 0.8490136570561456,
"grad_norm": 0.4931376576423645,
"learning_rate": 4.8581687259721375e-06,
"loss": 1.2728,
"step": 373
},
{
"epoch": 0.8512898330804249,
"grad_norm": 0.4991268813610077,
"learning_rate": 4.857137795014051e-06,
"loss": 1.2382,
"step": 374
},
{
"epoch": 0.8535660091047041,
"grad_norm": 0.48629266023635864,
"learning_rate": 4.856103241030054e-06,
"loss": 1.2464,
"step": 375
},
{
"epoch": 0.8558421851289834,
"grad_norm": 0.4945109188556671,
"learning_rate": 4.855065065610298e-06,
"loss": 1.2592,
"step": 376
},
{
"epoch": 0.8581183611532626,
"grad_norm": 0.4683839678764343,
"learning_rate": 4.8540232703505085e-06,
"loss": 1.2795,
"step": 377
},
{
"epoch": 0.8603945371775418,
"grad_norm": 0.4917154610157013,
"learning_rate": 4.8529778568519695e-06,
"loss": 1.297,
"step": 378
},
{
"epoch": 0.862670713201821,
"grad_norm": 0.4950079917907715,
"learning_rate": 4.851928826721528e-06,
"loss": 1.2424,
"step": 379
},
{
"epoch": 0.8649468892261002,
"grad_norm": 0.49165982007980347,
"learning_rate": 4.850876181571592e-06,
"loss": 1.2442,
"step": 380
},
{
"epoch": 0.8672230652503794,
"grad_norm": 0.47863882780075073,
"learning_rate": 4.849819923020121e-06,
"loss": 1.2946,
"step": 381
},
{
"epoch": 0.8694992412746586,
"grad_norm": 0.5066231489181519,
"learning_rate": 4.848760052690635e-06,
"loss": 1.2658,
"step": 382
},
{
"epoch": 0.8717754172989378,
"grad_norm": 0.46788156032562256,
"learning_rate": 4.847696572212199e-06,
"loss": 1.2787,
"step": 383
},
{
"epoch": 0.874051593323217,
"grad_norm": 0.5010194182395935,
"learning_rate": 4.846629483219431e-06,
"loss": 1.2645,
"step": 384
},
{
"epoch": 0.8763277693474962,
"grad_norm": 0.480258584022522,
"learning_rate": 4.845558787352495e-06,
"loss": 1.2535,
"step": 385
},
{
"epoch": 0.8786039453717754,
"grad_norm": 0.5160472393035889,
"learning_rate": 4.844484486257097e-06,
"loss": 1.2838,
"step": 386
},
{
"epoch": 0.8808801213960546,
"grad_norm": 0.5098587870597839,
"learning_rate": 4.843406581584487e-06,
"loss": 1.2834,
"step": 387
},
{
"epoch": 0.8831562974203339,
"grad_norm": 0.5033400058746338,
"learning_rate": 4.8423250749914515e-06,
"loss": 1.2959,
"step": 388
},
{
"epoch": 0.8854324734446131,
"grad_norm": 0.506367564201355,
"learning_rate": 4.841239968140316e-06,
"loss": 1.2757,
"step": 389
},
{
"epoch": 0.8877086494688923,
"grad_norm": 0.47980019450187683,
"learning_rate": 4.8401512626989354e-06,
"loss": 1.2683,
"step": 390
},
{
"epoch": 0.8899848254931715,
"grad_norm": 0.48923107981681824,
"learning_rate": 4.8390589603407005e-06,
"loss": 1.2325,
"step": 391
},
{
"epoch": 0.8922610015174507,
"grad_norm": 0.4891837537288666,
"learning_rate": 4.8379630627445286e-06,
"loss": 1.2508,
"step": 392
},
{
"epoch": 0.8945371775417299,
"grad_norm": 0.4819527566432953,
"learning_rate": 4.836863571594863e-06,
"loss": 1.2655,
"step": 393
},
{
"epoch": 0.8968133535660091,
"grad_norm": 0.5067424178123474,
"learning_rate": 4.83576048858167e-06,
"loss": 1.2477,
"step": 394
},
{
"epoch": 0.8990895295902883,
"grad_norm": 0.5201086401939392,
"learning_rate": 4.8346538154004386e-06,
"loss": 1.249,
"step": 395
},
{
"epoch": 0.9013657056145675,
"grad_norm": 0.5033949017524719,
"learning_rate": 4.833543553752173e-06,
"loss": 1.2882,
"step": 396
},
{
"epoch": 0.9036418816388467,
"grad_norm": 0.4921282231807709,
"learning_rate": 4.8324297053433975e-06,
"loss": 1.2355,
"step": 397
},
{
"epoch": 0.9059180576631259,
"grad_norm": 0.49898359179496765,
"learning_rate": 4.831312271886145e-06,
"loss": 1.24,
"step": 398
},
{
"epoch": 0.9081942336874052,
"grad_norm": 0.4932885468006134,
"learning_rate": 4.83019125509796e-06,
"loss": 1.2651,
"step": 399
},
{
"epoch": 0.9104704097116844,
"grad_norm": 0.5081654191017151,
"learning_rate": 4.829066656701897e-06,
"loss": 1.2846,
"step": 400
},
{
"epoch": 0.9127465857359636,
"grad_norm": 0.4848720133304596,
"learning_rate": 4.8279384784265124e-06,
"loss": 1.2834,
"step": 401
},
{
"epoch": 0.9150227617602428,
"grad_norm": 0.47641217708587646,
"learning_rate": 4.826806722005868e-06,
"loss": 1.2556,
"step": 402
},
{
"epoch": 0.917298937784522,
"grad_norm": 0.5004164576530457,
"learning_rate": 4.825671389179522e-06,
"loss": 1.2852,
"step": 403
},
{
"epoch": 0.9195751138088012,
"grad_norm": 0.5069151520729065,
"learning_rate": 4.824532481692533e-06,
"loss": 1.2468,
"step": 404
},
{
"epoch": 0.9218512898330804,
"grad_norm": 0.5043609738349915,
"learning_rate": 4.823390001295453e-06,
"loss": 1.2602,
"step": 405
},
{
"epoch": 0.9241274658573596,
"grad_norm": 0.47922301292419434,
"learning_rate": 4.822243949744324e-06,
"loss": 1.2909,
"step": 406
},
{
"epoch": 0.9264036418816388,
"grad_norm": 0.5012561082839966,
"learning_rate": 4.821094328800678e-06,
"loss": 1.3058,
"step": 407
},
{
"epoch": 0.928679817905918,
"grad_norm": 0.5232773423194885,
"learning_rate": 4.8199411402315356e-06,
"loss": 1.2689,
"step": 408
},
{
"epoch": 0.9309559939301972,
"grad_norm": 0.5023229718208313,
"learning_rate": 4.8187843858093975e-06,
"loss": 1.2623,
"step": 409
},
{
"epoch": 0.9332321699544764,
"grad_norm": 0.5061272382736206,
"learning_rate": 4.817624067312247e-06,
"loss": 1.2771,
"step": 410
},
{
"epoch": 0.9355083459787557,
"grad_norm": 0.47715064883232117,
"learning_rate": 4.816460186523547e-06,
"loss": 1.266,
"step": 411
},
{
"epoch": 0.9377845220030349,
"grad_norm": 0.5037026405334473,
"learning_rate": 4.815292745232233e-06,
"loss": 1.2812,
"step": 412
},
{
"epoch": 0.9400606980273141,
"grad_norm": 0.47421544790267944,
"learning_rate": 4.814121745232714e-06,
"loss": 1.2349,
"step": 413
},
{
"epoch": 0.9423368740515933,
"grad_norm": 0.5214923620223999,
"learning_rate": 4.812947188324868e-06,
"loss": 1.2986,
"step": 414
},
{
"epoch": 0.9446130500758725,
"grad_norm": 0.5169025659561157,
"learning_rate": 4.811769076314044e-06,
"loss": 1.2687,
"step": 415
},
{
"epoch": 0.9468892261001517,
"grad_norm": 0.5028119087219238,
"learning_rate": 4.8105874110110516e-06,
"loss": 1.2666,
"step": 416
},
{
"epoch": 0.9491654021244309,
"grad_norm": 0.5233621597290039,
"learning_rate": 4.809402194232163e-06,
"loss": 1.2817,
"step": 417
},
{
"epoch": 0.9514415781487102,
"grad_norm": 0.5662165880203247,
"learning_rate": 4.808213427799108e-06,
"loss": 1.212,
"step": 418
},
{
"epoch": 0.9537177541729894,
"grad_norm": 0.5214280486106873,
"learning_rate": 4.807021113539077e-06,
"loss": 1.2659,
"step": 419
},
{
"epoch": 0.9559939301972686,
"grad_norm": 0.5059605240821838,
"learning_rate": 4.805825253284706e-06,
"loss": 1.2417,
"step": 420
},
{
"epoch": 0.9582701062215478,
"grad_norm": 0.48347723484039307,
"learning_rate": 4.804625848874088e-06,
"loss": 1.279,
"step": 421
},
{
"epoch": 0.960546282245827,
"grad_norm": 0.5225522518157959,
"learning_rate": 4.803422902150762e-06,
"loss": 1.2555,
"step": 422
},
{
"epoch": 0.9628224582701063,
"grad_norm": 0.49709466099739075,
"learning_rate": 4.802216414963708e-06,
"loss": 1.2956,
"step": 423
},
{
"epoch": 0.9650986342943855,
"grad_norm": 0.500357985496521,
"learning_rate": 4.801006389167352e-06,
"loss": 1.2748,
"step": 424
},
{
"epoch": 0.9673748103186647,
"grad_norm": 0.504552960395813,
"learning_rate": 4.799792826621559e-06,
"loss": 1.2939,
"step": 425
},
{
"epoch": 0.9696509863429439,
"grad_norm": 0.4881986379623413,
"learning_rate": 4.7985757291916264e-06,
"loss": 1.2827,
"step": 426
},
{
"epoch": 0.9719271623672231,
"grad_norm": 0.517511785030365,
"learning_rate": 4.797355098748289e-06,
"loss": 1.2668,
"step": 427
},
{
"epoch": 0.9742033383915023,
"grad_norm": 0.49534812569618225,
"learning_rate": 4.796130937167709e-06,
"loss": 1.2878,
"step": 428
},
{
"epoch": 0.9764795144157815,
"grad_norm": 0.4725462794303894,
"learning_rate": 4.794903246331477e-06,
"loss": 1.2612,
"step": 429
},
{
"epoch": 0.9787556904400607,
"grad_norm": 0.49760913848876953,
"learning_rate": 4.79367202812661e-06,
"loss": 1.284,
"step": 430
},
{
"epoch": 0.9810318664643399,
"grad_norm": 0.5361410975456238,
"learning_rate": 4.792437284445545e-06,
"loss": 1.2517,
"step": 431
},
{
"epoch": 0.9833080424886191,
"grad_norm": 0.5160269141197205,
"learning_rate": 4.791199017186137e-06,
"loss": 1.2422,
"step": 432
},
{
"epoch": 0.9855842185128983,
"grad_norm": 0.5418286919593811,
"learning_rate": 4.7899572282516596e-06,
"loss": 1.2697,
"step": 433
},
{
"epoch": 0.9878603945371776,
"grad_norm": 0.5236756801605225,
"learning_rate": 4.788711919550796e-06,
"loss": 1.2546,
"step": 434
},
{
"epoch": 0.9901365705614568,
"grad_norm": 0.4919045567512512,
"learning_rate": 4.787463092997643e-06,
"loss": 1.2478,
"step": 435
},
{
"epoch": 0.992412746585736,
"grad_norm": 0.4918051064014435,
"learning_rate": 4.786210750511701e-06,
"loss": 1.2522,
"step": 436
},
{
"epoch": 0.9946889226100152,
"grad_norm": 0.5032536387443542,
"learning_rate": 4.784954894017878e-06,
"loss": 1.2924,
"step": 437
},
{
"epoch": 0.9969650986342944,
"grad_norm": 0.5253746509552002,
"learning_rate": 4.78369552544648e-06,
"loss": 1.258,
"step": 438
},
{
"epoch": 0.9992412746585736,
"grad_norm": 0.5097838044166565,
"learning_rate": 4.782432646733214e-06,
"loss": 1.2479,
"step": 439
},
{
"epoch": 1.0,
"grad_norm": 0.5097838044166565,
"learning_rate": 4.781166259819179e-06,
"loss": 1.2895,
"step": 440
},
{
"epoch": 1.0022761760242793,
"grad_norm": 1.0558606386184692,
"learning_rate": 4.77989636665087e-06,
"loss": 1.2707,
"step": 441
},
{
"epoch": 1.0045523520485584,
"grad_norm": 0.47916215658187866,
"learning_rate": 4.778622969180167e-06,
"loss": 1.2364,
"step": 442
},
{
"epoch": 1.0068285280728377,
"grad_norm": 0.5158357620239258,
"learning_rate": 4.777346069364343e-06,
"loss": 1.2421,
"step": 443
},
{
"epoch": 1.0091047040971168,
"grad_norm": 0.4970231354236603,
"learning_rate": 4.776065669166045e-06,
"loss": 1.2534,
"step": 444
},
{
"epoch": 1.0113808801213962,
"grad_norm": 0.529381513595581,
"learning_rate": 4.774781770553309e-06,
"loss": 1.2429,
"step": 445
},
{
"epoch": 1.0136570561456753,
"grad_norm": 0.5027406811714172,
"learning_rate": 4.773494375499543e-06,
"loss": 1.2427,
"step": 446
},
{
"epoch": 1.0159332321699546,
"grad_norm": 0.5164632797241211,
"learning_rate": 4.772203485983531e-06,
"loss": 1.273,
"step": 447
},
{
"epoch": 1.0182094081942337,
"grad_norm": 0.5203757882118225,
"learning_rate": 4.770909103989426e-06,
"loss": 1.2261,
"step": 448
},
{
"epoch": 1.020485584218513,
"grad_norm": 0.518552839756012,
"learning_rate": 4.769611231506753e-06,
"loss": 1.2404,
"step": 449
},
{
"epoch": 1.022761760242792,
"grad_norm": 0.5020595788955688,
"learning_rate": 4.7683098705303995e-06,
"loss": 1.2722,
"step": 450
},
{
"epoch": 1.0250379362670714,
"grad_norm": 0.508852481842041,
"learning_rate": 4.767005023060615e-06,
"loss": 1.2344,
"step": 451
},
{
"epoch": 1.0273141122913505,
"grad_norm": 0.5240857005119324,
"learning_rate": 4.765696691103008e-06,
"loss": 1.2553,
"step": 452
},
{
"epoch": 1.0295902883156298,
"grad_norm": 0.5548052787780762,
"learning_rate": 4.764384876668542e-06,
"loss": 1.3039,
"step": 453
},
{
"epoch": 1.031866464339909,
"grad_norm": 0.5021058917045593,
"learning_rate": 4.763069581773537e-06,
"loss": 1.2636,
"step": 454
},
{
"epoch": 1.0341426403641882,
"grad_norm": 0.5170218348503113,
"learning_rate": 4.761750808439658e-06,
"loss": 1.2584,
"step": 455
},
{
"epoch": 1.0364188163884673,
"grad_norm": 0.5254265069961548,
"learning_rate": 4.760428558693919e-06,
"loss": 1.2578,
"step": 456
},
{
"epoch": 1.0386949924127467,
"grad_norm": 0.5046964883804321,
"learning_rate": 4.7591028345686765e-06,
"loss": 1.253,
"step": 457
},
{
"epoch": 1.0409711684370258,
"grad_norm": 0.5212562084197998,
"learning_rate": 4.757773638101629e-06,
"loss": 1.2453,
"step": 458
},
{
"epoch": 1.043247344461305,
"grad_norm": 0.5397632718086243,
"learning_rate": 4.7564409713358075e-06,
"loss": 1.2612,
"step": 459
},
{
"epoch": 1.0455235204855842,
"grad_norm": 0.5086544752120972,
"learning_rate": 4.755104836319583e-06,
"loss": 1.27,
"step": 460
},
{
"epoch": 1.0477996965098635,
"grad_norm": 0.4974862337112427,
"learning_rate": 4.7537652351066545e-06,
"loss": 1.1955,
"step": 461
},
{
"epoch": 1.0500758725341426,
"grad_norm": 0.5382196307182312,
"learning_rate": 4.752422169756048e-06,
"loss": 1.2996,
"step": 462
},
{
"epoch": 1.052352048558422,
"grad_norm": 0.5093661546707153,
"learning_rate": 4.751075642332116e-06,
"loss": 1.2671,
"step": 463
},
{
"epoch": 1.054628224582701,
"grad_norm": 0.53044593334198,
"learning_rate": 4.749725654904529e-06,
"loss": 1.2572,
"step": 464
},
{
"epoch": 1.0569044006069803,
"grad_norm": 0.5372816920280457,
"learning_rate": 4.74837220954828e-06,
"loss": 1.2215,
"step": 465
},
{
"epoch": 1.0591805766312594,
"grad_norm": 0.5148317217826843,
"learning_rate": 4.747015308343673e-06,
"loss": 1.2636,
"step": 466
},
{
"epoch": 1.0614567526555387,
"grad_norm": 0.5267722010612488,
"learning_rate": 4.745654953376327e-06,
"loss": 1.2786,
"step": 467
},
{
"epoch": 1.0637329286798178,
"grad_norm": 0.5123690366744995,
"learning_rate": 4.744291146737169e-06,
"loss": 1.2217,
"step": 468
},
{
"epoch": 1.0660091047040972,
"grad_norm": 0.5397908687591553,
"learning_rate": 4.74292389052243e-06,
"loss": 1.2353,
"step": 469
},
{
"epoch": 1.0682852807283763,
"grad_norm": 0.5311163067817688,
"learning_rate": 4.741553186833642e-06,
"loss": 1.2307,
"step": 470
},
{
"epoch": 1.0705614567526556,
"grad_norm": 0.5108172297477722,
"learning_rate": 4.740179037777639e-06,
"loss": 1.2526,
"step": 471
},
{
"epoch": 1.0728376327769347,
"grad_norm": 0.5670639276504517,
"learning_rate": 4.7388014454665495e-06,
"loss": 1.214,
"step": 472
},
{
"epoch": 1.075113808801214,
"grad_norm": 0.5621855854988098,
"learning_rate": 4.737420412017795e-06,
"loss": 1.2202,
"step": 473
},
{
"epoch": 1.077389984825493,
"grad_norm": 0.5175919532775879,
"learning_rate": 4.736035939554084e-06,
"loss": 1.2295,
"step": 474
},
{
"epoch": 1.0796661608497724,
"grad_norm": 0.510009765625,
"learning_rate": 4.7346480302034144e-06,
"loss": 1.2489,
"step": 475
},
{
"epoch": 1.0819423368740515,
"grad_norm": 0.5198955535888672,
"learning_rate": 4.733256686099063e-06,
"loss": 1.2148,
"step": 476
},
{
"epoch": 1.0842185128983308,
"grad_norm": 0.5157918334007263,
"learning_rate": 4.731861909379588e-06,
"loss": 1.2858,
"step": 477
},
{
"epoch": 1.08649468892261,
"grad_norm": 0.5016840100288391,
"learning_rate": 4.730463702188824e-06,
"loss": 1.2137,
"step": 478
},
{
"epoch": 1.0887708649468892,
"grad_norm": 0.5427749156951904,
"learning_rate": 4.729062066675877e-06,
"loss": 1.2616,
"step": 479
},
{
"epoch": 1.0910470409711683,
"grad_norm": 0.5368303656578064,
"learning_rate": 4.727657004995124e-06,
"loss": 1.22,
"step": 480
},
{
"epoch": 1.0933232169954477,
"grad_norm": 0.5127097964286804,
"learning_rate": 4.726248519306208e-06,
"loss": 1.1953,
"step": 481
},
{
"epoch": 1.095599393019727,
"grad_norm": 0.5109656453132629,
"learning_rate": 4.724836611774032e-06,
"loss": 1.2483,
"step": 482
},
{
"epoch": 1.097875569044006,
"grad_norm": 0.5445286631584167,
"learning_rate": 4.723421284568764e-06,
"loss": 1.242,
"step": 483
},
{
"epoch": 1.1001517450682852,
"grad_norm": 0.5462026000022888,
"learning_rate": 4.722002539865823e-06,
"loss": 1.2475,
"step": 484
},
{
"epoch": 1.1024279210925645,
"grad_norm": 0.5589436292648315,
"learning_rate": 4.720580379845884e-06,
"loss": 1.2511,
"step": 485
},
{
"epoch": 1.1047040971168438,
"grad_norm": 0.5450273752212524,
"learning_rate": 4.719154806694869e-06,
"loss": 1.2843,
"step": 486
},
{
"epoch": 1.106980273141123,
"grad_norm": 0.5322884321212769,
"learning_rate": 4.717725822603948e-06,
"loss": 1.2159,
"step": 487
},
{
"epoch": 1.1092564491654022,
"grad_norm": 0.5098543763160706,
"learning_rate": 4.716293429769534e-06,
"loss": 1.2818,
"step": 488
},
{
"epoch": 1.1115326251896813,
"grad_norm": 0.5248117446899414,
"learning_rate": 4.7148576303932784e-06,
"loss": 1.2497,
"step": 489
},
{
"epoch": 1.1138088012139606,
"grad_norm": 0.5317633748054504,
"learning_rate": 4.7134184266820675e-06,
"loss": 1.2174,
"step": 490
},
{
"epoch": 1.1160849772382397,
"grad_norm": 0.5104670524597168,
"learning_rate": 4.711975820848024e-06,
"loss": 1.2492,
"step": 491
},
{
"epoch": 1.118361153262519,
"grad_norm": 0.5210446715354919,
"learning_rate": 4.710529815108496e-06,
"loss": 1.2478,
"step": 492
},
{
"epoch": 1.1206373292867982,
"grad_norm": 0.5357753038406372,
"learning_rate": 4.7090804116860574e-06,
"loss": 1.2533,
"step": 493
},
{
"epoch": 1.1229135053110775,
"grad_norm": 0.5544043779373169,
"learning_rate": 4.707627612808509e-06,
"loss": 1.2315,
"step": 494
},
{
"epoch": 1.1251896813353566,
"grad_norm": 0.5387628674507141,
"learning_rate": 4.706171420708866e-06,
"loss": 1.2492,
"step": 495
},
{
"epoch": 1.127465857359636,
"grad_norm": 0.5289620757102966,
"learning_rate": 4.704711837625361e-06,
"loss": 1.1865,
"step": 496
},
{
"epoch": 1.129742033383915,
"grad_norm": 0.5673317909240723,
"learning_rate": 4.703248865801436e-06,
"loss": 1.1963,
"step": 497
},
{
"epoch": 1.1320182094081943,
"grad_norm": 0.5180116295814514,
"learning_rate": 4.701782507485747e-06,
"loss": 1.2431,
"step": 498
},
{
"epoch": 1.1342943854324734,
"grad_norm": 0.5326710343360901,
"learning_rate": 4.700312764932151e-06,
"loss": 1.2543,
"step": 499
},
{
"epoch": 1.1365705614567527,
"grad_norm": 0.536686360836029,
"learning_rate": 4.698839640399707e-06,
"loss": 1.2664,
"step": 500
},
{
"epoch": 1.1388467374810318,
"grad_norm": 0.5708869695663452,
"learning_rate": 4.6973631361526745e-06,
"loss": 1.2445,
"step": 501
},
{
"epoch": 1.1411229135053111,
"grad_norm": 0.5445765852928162,
"learning_rate": 4.695883254460505e-06,
"loss": 1.2111,
"step": 502
},
{
"epoch": 1.1433990895295902,
"grad_norm": 0.5529754161834717,
"learning_rate": 4.6943999975978445e-06,
"loss": 1.2346,
"step": 503
},
{
"epoch": 1.1456752655538696,
"grad_norm": 0.5409250855445862,
"learning_rate": 4.692913367844523e-06,
"loss": 1.2338,
"step": 504
},
{
"epoch": 1.1479514415781487,
"grad_norm": 0.5459516644477844,
"learning_rate": 4.691423367485558e-06,
"loss": 1.2487,
"step": 505
},
{
"epoch": 1.150227617602428,
"grad_norm": 0.5377400517463684,
"learning_rate": 4.689929998811145e-06,
"loss": 1.2719,
"step": 506
},
{
"epoch": 1.152503793626707,
"grad_norm": 0.5768429636955261,
"learning_rate": 4.68843326411666e-06,
"loss": 1.2106,
"step": 507
},
{
"epoch": 1.1547799696509864,
"grad_norm": 0.5586393475532532,
"learning_rate": 4.686933165702651e-06,
"loss": 1.2469,
"step": 508
},
{
"epoch": 1.1570561456752655,
"grad_norm": 0.5209569334983826,
"learning_rate": 4.685429705874834e-06,
"loss": 1.2453,
"step": 509
},
{
"epoch": 1.1593323216995448,
"grad_norm": 0.5145371556282043,
"learning_rate": 4.6839228869440965e-06,
"loss": 1.2484,
"step": 510
},
{
"epoch": 1.161608497723824,
"grad_norm": 0.5463981032371521,
"learning_rate": 4.682412711226485e-06,
"loss": 1.2691,
"step": 511
},
{
"epoch": 1.1638846737481032,
"grad_norm": 0.5128470659255981,
"learning_rate": 4.680899181043206e-06,
"loss": 1.2579,
"step": 512
},
{
"epoch": 1.1661608497723823,
"grad_norm": 0.5277767777442932,
"learning_rate": 4.679382298720625e-06,
"loss": 1.2247,
"step": 513
},
{
"epoch": 1.1684370257966616,
"grad_norm": 0.5547785758972168,
"learning_rate": 4.6778620665902566e-06,
"loss": 1.2492,
"step": 514
},
{
"epoch": 1.1707132018209407,
"grad_norm": 0.5689957737922668,
"learning_rate": 4.676338486988765e-06,
"loss": 1.2384,
"step": 515
},
{
"epoch": 1.17298937784522,
"grad_norm": 0.5139868259429932,
"learning_rate": 4.674811562257961e-06,
"loss": 1.2562,
"step": 516
},
{
"epoch": 1.1752655538694992,
"grad_norm": 0.5729711055755615,
"learning_rate": 4.673281294744796e-06,
"loss": 1.2833,
"step": 517
},
{
"epoch": 1.1775417298937785,
"grad_norm": 0.5735371708869934,
"learning_rate": 4.671747686801358e-06,
"loss": 1.2481,
"step": 518
},
{
"epoch": 1.1798179059180576,
"grad_norm": 0.5259848833084106,
"learning_rate": 4.670210740784872e-06,
"loss": 1.2496,
"step": 519
},
{
"epoch": 1.182094081942337,
"grad_norm": 0.5374155640602112,
"learning_rate": 4.668670459057693e-06,
"loss": 1.2484,
"step": 520
},
{
"epoch": 1.184370257966616,
"grad_norm": 0.5365428328514099,
"learning_rate": 4.667126843987301e-06,
"loss": 1.2651,
"step": 521
},
{
"epoch": 1.1866464339908953,
"grad_norm": 0.5263276100158691,
"learning_rate": 4.665579897946303e-06,
"loss": 1.19,
"step": 522
},
{
"epoch": 1.1889226100151746,
"grad_norm": 0.5412886142730713,
"learning_rate": 4.664029623312422e-06,
"loss": 1.2551,
"step": 523
},
{
"epoch": 1.1911987860394537,
"grad_norm": 0.5376629829406738,
"learning_rate": 4.662476022468503e-06,
"loss": 1.2541,
"step": 524
},
{
"epoch": 1.1934749620637328,
"grad_norm": 0.5543259382247925,
"learning_rate": 4.660919097802495e-06,
"loss": 1.2745,
"step": 525
},
{
"epoch": 1.1957511380880121,
"grad_norm": 0.5453343987464905,
"learning_rate": 4.659358851707464e-06,
"loss": 1.238,
"step": 526
},
{
"epoch": 1.1980273141122915,
"grad_norm": 0.5588712692260742,
"learning_rate": 4.657795286581576e-06,
"loss": 1.1767,
"step": 527
},
{
"epoch": 1.2003034901365706,
"grad_norm": 0.5432548522949219,
"learning_rate": 4.656228404828102e-06,
"loss": 1.2243,
"step": 528
},
{
"epoch": 1.2025796661608497,
"grad_norm": 0.5616108179092407,
"learning_rate": 4.654658208855408e-06,
"loss": 1.1937,
"step": 529
},
{
"epoch": 1.204855842185129,
"grad_norm": 0.5578548908233643,
"learning_rate": 4.653084701076955e-06,
"loss": 1.2454,
"step": 530
},
{
"epoch": 1.2071320182094083,
"grad_norm": 0.5913681983947754,
"learning_rate": 4.651507883911296e-06,
"loss": 1.2717,
"step": 531
},
{
"epoch": 1.2094081942336874,
"grad_norm": 0.5625573992729187,
"learning_rate": 4.649927759782068e-06,
"loss": 1.2619,
"step": 532
},
{
"epoch": 1.2116843702579665,
"grad_norm": 0.5766717195510864,
"learning_rate": 4.648344331117992e-06,
"loss": 1.2748,
"step": 533
},
{
"epoch": 1.2139605462822458,
"grad_norm": 0.529719889163971,
"learning_rate": 4.64675760035287e-06,
"loss": 1.2443,
"step": 534
},
{
"epoch": 1.2162367223065251,
"grad_norm": 0.5937225222587585,
"learning_rate": 4.645167569925577e-06,
"loss": 1.253,
"step": 535
},
{
"epoch": 1.2185128983308042,
"grad_norm": 0.6403617262840271,
"learning_rate": 4.64357424228006e-06,
"loss": 1.1932,
"step": 536
},
{
"epoch": 1.2207890743550835,
"grad_norm": 0.5702269077301025,
"learning_rate": 4.6419776198653365e-06,
"loss": 1.2498,
"step": 537
},
{
"epoch": 1.2230652503793626,
"grad_norm": 0.5545888543128967,
"learning_rate": 4.640377705135485e-06,
"loss": 1.2517,
"step": 538
},
{
"epoch": 1.225341426403642,
"grad_norm": 0.5598457455635071,
"learning_rate": 4.638774500549645e-06,
"loss": 1.2503,
"step": 539
},
{
"epoch": 1.227617602427921,
"grad_norm": 0.5853296518325806,
"learning_rate": 4.637168008572016e-06,
"loss": 1.2418,
"step": 540
},
{
"epoch": 1.2298937784522004,
"grad_norm": 0.5423877239227295,
"learning_rate": 4.635558231671846e-06,
"loss": 1.2295,
"step": 541
},
{
"epoch": 1.2321699544764795,
"grad_norm": 0.5638657808303833,
"learning_rate": 4.633945172323434e-06,
"loss": 1.2934,
"step": 542
},
{
"epoch": 1.2344461305007588,
"grad_norm": 0.5612449645996094,
"learning_rate": 4.6323288330061244e-06,
"loss": 1.2624,
"step": 543
},
{
"epoch": 1.236722306525038,
"grad_norm": 0.5534572601318359,
"learning_rate": 4.630709216204303e-06,
"loss": 1.2488,
"step": 544
},
{
"epoch": 1.2389984825493172,
"grad_norm": 0.5525970458984375,
"learning_rate": 4.629086324407393e-06,
"loss": 1.231,
"step": 545
},
{
"epoch": 1.2412746585735963,
"grad_norm": 0.5725768804550171,
"learning_rate": 4.6274601601098505e-06,
"loss": 1.2959,
"step": 546
},
{
"epoch": 1.2435508345978756,
"grad_norm": 0.582775354385376,
"learning_rate": 4.625830725811164e-06,
"loss": 1.2554,
"step": 547
},
{
"epoch": 1.2458270106221547,
"grad_norm": 0.5522809028625488,
"learning_rate": 4.624198024015845e-06,
"loss": 1.2487,
"step": 548
},
{
"epoch": 1.248103186646434,
"grad_norm": 0.5601561069488525,
"learning_rate": 4.622562057233431e-06,
"loss": 1.2489,
"step": 549
},
{
"epoch": 1.2503793626707131,
"grad_norm": 0.5581909418106079,
"learning_rate": 4.620922827978475e-06,
"loss": 1.205,
"step": 550
},
{
"epoch": 1.2526555386949925,
"grad_norm": 0.5560769438743591,
"learning_rate": 4.619280338770545e-06,
"loss": 1.2253,
"step": 551
},
{
"epoch": 1.2549317147192716,
"grad_norm": 0.5541017651557922,
"learning_rate": 4.617634592134221e-06,
"loss": 1.2476,
"step": 552
},
{
"epoch": 1.2572078907435509,
"grad_norm": 0.5714686512947083,
"learning_rate": 4.615985590599088e-06,
"loss": 1.2274,
"step": 553
},
{
"epoch": 1.25948406676783,
"grad_norm": 0.5909372568130493,
"learning_rate": 4.6143333366997354e-06,
"loss": 1.2481,
"step": 554
},
{
"epoch": 1.2617602427921093,
"grad_norm": 0.5704237818717957,
"learning_rate": 4.612677832975751e-06,
"loss": 1.2607,
"step": 555
},
{
"epoch": 1.2640364188163884,
"grad_norm": 0.5494899749755859,
"learning_rate": 4.611019081971719e-06,
"loss": 1.2171,
"step": 556
},
{
"epoch": 1.2663125948406677,
"grad_norm": 0.5628857612609863,
"learning_rate": 4.609357086237213e-06,
"loss": 1.2185,
"step": 557
},
{
"epoch": 1.2685887708649468,
"grad_norm": 0.5746468305587769,
"learning_rate": 4.607691848326793e-06,
"loss": 1.2485,
"step": 558
},
{
"epoch": 1.2708649468892261,
"grad_norm": 0.5731273889541626,
"learning_rate": 4.606023370800006e-06,
"loss": 1.2302,
"step": 559
},
{
"epoch": 1.2731411229135052,
"grad_norm": 0.5782604217529297,
"learning_rate": 4.604351656221374e-06,
"loss": 1.2281,
"step": 560
},
{
"epoch": 1.2754172989377845,
"grad_norm": 0.5706422328948975,
"learning_rate": 4.6026767071604e-06,
"loss": 1.2145,
"step": 561
},
{
"epoch": 1.2776934749620636,
"grad_norm": 0.5888031125068665,
"learning_rate": 4.6009985261915536e-06,
"loss": 1.1982,
"step": 562
},
{
"epoch": 1.279969650986343,
"grad_norm": 0.543771505355835,
"learning_rate": 4.599317115894273e-06,
"loss": 1.2439,
"step": 563
},
{
"epoch": 1.2822458270106223,
"grad_norm": 0.5837553143501282,
"learning_rate": 4.597632478852963e-06,
"loss": 1.22,
"step": 564
},
{
"epoch": 1.2845220030349014,
"grad_norm": 0.5469195246696472,
"learning_rate": 4.595944617656984e-06,
"loss": 1.2161,
"step": 565
},
{
"epoch": 1.2867981790591805,
"grad_norm": 0.5544828772544861,
"learning_rate": 4.594253534900656e-06,
"loss": 1.22,
"step": 566
},
{
"epoch": 1.2890743550834598,
"grad_norm": 0.5594440698623657,
"learning_rate": 4.592559233183246e-06,
"loss": 1.2088,
"step": 567
},
{
"epoch": 1.2913505311077391,
"grad_norm": 0.541545569896698,
"learning_rate": 4.590861715108972e-06,
"loss": 1.2185,
"step": 568
},
{
"epoch": 1.2936267071320182,
"grad_norm": 0.5520378947257996,
"learning_rate": 4.5891609832869964e-06,
"loss": 1.2268,
"step": 569
},
{
"epoch": 1.2959028831562973,
"grad_norm": 0.5583465695381165,
"learning_rate": 4.587457040331419e-06,
"loss": 1.2225,
"step": 570
},
{
"epoch": 1.2981790591805766,
"grad_norm": 0.5398393869400024,
"learning_rate": 4.5857498888612755e-06,
"loss": 1.2479,
"step": 571
},
{
"epoch": 1.300455235204856,
"grad_norm": 0.5736100673675537,
"learning_rate": 4.584039531500535e-06,
"loss": 1.2572,
"step": 572
},
{
"epoch": 1.302731411229135,
"grad_norm": 0.5614636540412903,
"learning_rate": 4.582325970878092e-06,
"loss": 1.2221,
"step": 573
},
{
"epoch": 1.3050075872534141,
"grad_norm": 0.5580296516418457,
"learning_rate": 4.580609209627766e-06,
"loss": 1.232,
"step": 574
},
{
"epoch": 1.3072837632776935,
"grad_norm": 0.5606446266174316,
"learning_rate": 4.578889250388296e-06,
"loss": 1.2214,
"step": 575
},
{
"epoch": 1.3095599393019728,
"grad_norm": 0.5508303642272949,
"learning_rate": 4.577166095803336e-06,
"loss": 1.244,
"step": 576
},
{
"epoch": 1.3118361153262519,
"grad_norm": 0.557896614074707,
"learning_rate": 4.5754397485214505e-06,
"loss": 1.2668,
"step": 577
},
{
"epoch": 1.314112291350531,
"grad_norm": 0.5473496317863464,
"learning_rate": 4.573710211196113e-06,
"loss": 1.2265,
"step": 578
},
{
"epoch": 1.3163884673748103,
"grad_norm": 0.5576569437980652,
"learning_rate": 4.5719774864857e-06,
"loss": 1.2626,
"step": 579
},
{
"epoch": 1.3186646433990896,
"grad_norm": 0.5799663662910461,
"learning_rate": 4.570241577053486e-06,
"loss": 1.2573,
"step": 580
},
{
"epoch": 1.3209408194233687,
"grad_norm": 0.555438756942749,
"learning_rate": 4.568502485567641e-06,
"loss": 1.2775,
"step": 581
},
{
"epoch": 1.3232169954476478,
"grad_norm": 0.5486553907394409,
"learning_rate": 4.566760214701227e-06,
"loss": 1.2588,
"step": 582
},
{
"epoch": 1.3254931714719271,
"grad_norm": 0.5853822231292725,
"learning_rate": 4.565014767132191e-06,
"loss": 1.2185,
"step": 583
},
{
"epoch": 1.3277693474962065,
"grad_norm": 0.569977879524231,
"learning_rate": 4.563266145543364e-06,
"loss": 1.2387,
"step": 584
},
{
"epoch": 1.3300455235204856,
"grad_norm": 0.5845345258712769,
"learning_rate": 4.5615143526224555e-06,
"loss": 1.2935,
"step": 585
},
{
"epoch": 1.3323216995447649,
"grad_norm": 0.5513466000556946,
"learning_rate": 4.559759391062051e-06,
"loss": 1.2347,
"step": 586
},
{
"epoch": 1.334597875569044,
"grad_norm": 0.5497938990592957,
"learning_rate": 4.558001263559602e-06,
"loss": 1.2266,
"step": 587
},
{
"epoch": 1.3368740515933233,
"grad_norm": 0.5504549145698547,
"learning_rate": 4.556239972817429e-06,
"loss": 1.2535,
"step": 588
},
{
"epoch": 1.3391502276176024,
"grad_norm": 0.5670903325080872,
"learning_rate": 4.5544755215427175e-06,
"loss": 1.261,
"step": 589
},
{
"epoch": 1.3414264036418817,
"grad_norm": 0.5838532447814941,
"learning_rate": 4.552707912447504e-06,
"loss": 1.2487,
"step": 590
},
{
"epoch": 1.3437025796661608,
"grad_norm": 0.5291898250579834,
"learning_rate": 4.550937148248685e-06,
"loss": 1.2528,
"step": 591
},
{
"epoch": 1.3459787556904401,
"grad_norm": 0.5700204968452454,
"learning_rate": 4.549163231668004e-06,
"loss": 1.2657,
"step": 592
},
{
"epoch": 1.3482549317147192,
"grad_norm": 0.5522517561912537,
"learning_rate": 4.547386165432048e-06,
"loss": 1.2542,
"step": 593
},
{
"epoch": 1.3505311077389985,
"grad_norm": 0.5714395046234131,
"learning_rate": 4.545605952272249e-06,
"loss": 1.2343,
"step": 594
},
{
"epoch": 1.3528072837632776,
"grad_norm": 0.5690736174583435,
"learning_rate": 4.543822594924874e-06,
"loss": 1.2462,
"step": 595
},
{
"epoch": 1.355083459787557,
"grad_norm": 0.5521000027656555,
"learning_rate": 4.54203609613102e-06,
"loss": 1.2512,
"step": 596
},
{
"epoch": 1.357359635811836,
"grad_norm": 0.5685454607009888,
"learning_rate": 4.540246458636619e-06,
"loss": 1.2296,
"step": 597
},
{
"epoch": 1.3596358118361154,
"grad_norm": 0.5521453022956848,
"learning_rate": 4.538453685192421e-06,
"loss": 1.2533,
"step": 598
},
{
"epoch": 1.3619119878603945,
"grad_norm": 0.545840322971344,
"learning_rate": 4.536657778554e-06,
"loss": 1.2456,
"step": 599
},
{
"epoch": 1.3641881638846738,
"grad_norm": 0.5703026056289673,
"learning_rate": 4.534858741481745e-06,
"loss": 1.2293,
"step": 600
},
{
"epoch": 1.3664643399089529,
"grad_norm": 0.5508074760437012,
"learning_rate": 4.5330565767408555e-06,
"loss": 1.2657,
"step": 601
},
{
"epoch": 1.3687405159332322,
"grad_norm": 0.5637306571006775,
"learning_rate": 4.531251287101338e-06,
"loss": 1.2199,
"step": 602
},
{
"epoch": 1.3710166919575113,
"grad_norm": 0.5585516095161438,
"learning_rate": 4.529442875338005e-06,
"loss": 1.2331,
"step": 603
},
{
"epoch": 1.3732928679817906,
"grad_norm": 0.5738129019737244,
"learning_rate": 4.527631344230466e-06,
"loss": 1.215,
"step": 604
},
{
"epoch": 1.37556904400607,
"grad_norm": 0.5905203223228455,
"learning_rate": 4.525816696563123e-06,
"loss": 1.2322,
"step": 605
},
{
"epoch": 1.377845220030349,
"grad_norm": 0.5772601366043091,
"learning_rate": 4.523998935125173e-06,
"loss": 1.2344,
"step": 606
},
{
"epoch": 1.3801213960546281,
"grad_norm": 0.6194104552268982,
"learning_rate": 4.5221780627105945e-06,
"loss": 1.2647,
"step": 607
},
{
"epoch": 1.3823975720789075,
"grad_norm": 0.5779480934143066,
"learning_rate": 4.520354082118151e-06,
"loss": 1.2148,
"step": 608
},
{
"epoch": 1.3846737481031868,
"grad_norm": 0.5630953907966614,
"learning_rate": 4.518526996151381e-06,
"loss": 1.2647,
"step": 609
},
{
"epoch": 1.3869499241274659,
"grad_norm": 0.5726267099380493,
"learning_rate": 4.516696807618598e-06,
"loss": 1.2741,
"step": 610
},
{
"epoch": 1.389226100151745,
"grad_norm": 0.5838750600814819,
"learning_rate": 4.514863519332882e-06,
"loss": 1.1919,
"step": 611
},
{
"epoch": 1.3915022761760243,
"grad_norm": 0.5766186714172363,
"learning_rate": 4.5130271341120805e-06,
"loss": 1.2359,
"step": 612
},
{
"epoch": 1.3937784522003036,
"grad_norm": 0.5568646192550659,
"learning_rate": 4.511187654778798e-06,
"loss": 1.2107,
"step": 613
},
{
"epoch": 1.3960546282245827,
"grad_norm": 0.5602480173110962,
"learning_rate": 4.509345084160397e-06,
"loss": 1.2276,
"step": 614
},
{
"epoch": 1.3983308042488618,
"grad_norm": 0.5605113506317139,
"learning_rate": 4.507499425088991e-06,
"loss": 1.2259,
"step": 615
},
{
"epoch": 1.4006069802731411,
"grad_norm": 0.5589579939842224,
"learning_rate": 4.505650680401441e-06,
"loss": 1.2212,
"step": 616
},
{
"epoch": 1.4028831562974204,
"grad_norm": 0.5683750510215759,
"learning_rate": 4.503798852939347e-06,
"loss": 1.2313,
"step": 617
},
{
"epoch": 1.4051593323216995,
"grad_norm": 0.5655199885368347,
"learning_rate": 4.501943945549054e-06,
"loss": 1.2199,
"step": 618
},
{
"epoch": 1.4074355083459786,
"grad_norm": 0.5633233785629272,
"learning_rate": 4.500085961081635e-06,
"loss": 1.2305,
"step": 619
},
{
"epoch": 1.409711684370258,
"grad_norm": 0.5716864466667175,
"learning_rate": 4.498224902392896e-06,
"loss": 1.2135,
"step": 620
},
{
"epoch": 1.4119878603945373,
"grad_norm": 0.5524502992630005,
"learning_rate": 4.496360772343367e-06,
"loss": 1.221,
"step": 621
},
{
"epoch": 1.4142640364188164,
"grad_norm": 0.5607890486717224,
"learning_rate": 4.494493573798299e-06,
"loss": 1.2243,
"step": 622
},
{
"epoch": 1.4165402124430955,
"grad_norm": 0.5746079683303833,
"learning_rate": 4.49262330962766e-06,
"loss": 1.2064,
"step": 623
},
{
"epoch": 1.4188163884673748,
"grad_norm": 0.5607832670211792,
"learning_rate": 4.490749982706128e-06,
"loss": 1.2248,
"step": 624
},
{
"epoch": 1.421092564491654,
"grad_norm": 0.5688823461532593,
"learning_rate": 4.488873595913092e-06,
"loss": 1.232,
"step": 625
},
{
"epoch": 1.4233687405159332,
"grad_norm": 0.5820784568786621,
"learning_rate": 4.48699415213264e-06,
"loss": 1.2485,
"step": 626
},
{
"epoch": 1.4256449165402125,
"grad_norm": 0.56890869140625,
"learning_rate": 4.4851116542535625e-06,
"loss": 1.2286,
"step": 627
},
{
"epoch": 1.4279210925644916,
"grad_norm": 0.6012819409370422,
"learning_rate": 4.483226105169341e-06,
"loss": 1.2343,
"step": 628
},
{
"epoch": 1.430197268588771,
"grad_norm": 0.570756733417511,
"learning_rate": 4.481337507778151e-06,
"loss": 1.2447,
"step": 629
},
{
"epoch": 1.43247344461305,
"grad_norm": 0.5640760660171509,
"learning_rate": 4.47944586498285e-06,
"loss": 1.2298,
"step": 630
},
{
"epoch": 1.4347496206373294,
"grad_norm": 0.5836703777313232,
"learning_rate": 4.477551179690977e-06,
"loss": 1.2099,
"step": 631
},
{
"epoch": 1.4370257966616085,
"grad_norm": 0.5838893055915833,
"learning_rate": 4.475653454814746e-06,
"loss": 1.2437,
"step": 632
},
{
"epoch": 1.4393019726858878,
"grad_norm": 0.5973705053329468,
"learning_rate": 4.473752693271048e-06,
"loss": 1.2872,
"step": 633
},
{
"epoch": 1.4415781487101669,
"grad_norm": 0.5992927551269531,
"learning_rate": 4.471848897981437e-06,
"loss": 1.2072,
"step": 634
},
{
"epoch": 1.4438543247344462,
"grad_norm": 0.566234827041626,
"learning_rate": 4.46994207187213e-06,
"loss": 1.2181,
"step": 635
},
{
"epoch": 1.4461305007587253,
"grad_norm": 0.5693137645721436,
"learning_rate": 4.4680322178740056e-06,
"loss": 1.1862,
"step": 636
},
{
"epoch": 1.4484066767830046,
"grad_norm": 0.5798976421356201,
"learning_rate": 4.466119338922593e-06,
"loss": 1.2225,
"step": 637
},
{
"epoch": 1.4506828528072837,
"grad_norm": 0.575389564037323,
"learning_rate": 4.464203437958075e-06,
"loss": 1.2257,
"step": 638
},
{
"epoch": 1.452959028831563,
"grad_norm": 0.6053541302680969,
"learning_rate": 4.4622845179252735e-06,
"loss": 1.241,
"step": 639
},
{
"epoch": 1.4552352048558421,
"grad_norm": 0.5716749429702759,
"learning_rate": 4.460362581773656e-06,
"loss": 1.2278,
"step": 640
},
{
"epoch": 1.4575113808801214,
"grad_norm": 0.5863229036331177,
"learning_rate": 4.458437632457325e-06,
"loss": 1.2238,
"step": 641
},
{
"epoch": 1.4597875569044005,
"grad_norm": 0.6117021441459656,
"learning_rate": 4.456509672935011e-06,
"loss": 1.2318,
"step": 642
},
{
"epoch": 1.4620637329286799,
"grad_norm": 0.6031973361968994,
"learning_rate": 4.454578706170075e-06,
"loss": 1.2309,
"step": 643
},
{
"epoch": 1.464339908952959,
"grad_norm": 0.6449349522590637,
"learning_rate": 4.4526447351304995e-06,
"loss": 1.2357,
"step": 644
},
{
"epoch": 1.4666160849772383,
"grad_norm": 0.5698959231376648,
"learning_rate": 4.450707762788884e-06,
"loss": 1.2064,
"step": 645
},
{
"epoch": 1.4688922610015174,
"grad_norm": 0.6145030856132507,
"learning_rate": 4.44876779212244e-06,
"loss": 1.1837,
"step": 646
},
{
"epoch": 1.4711684370257967,
"grad_norm": 0.6202698349952698,
"learning_rate": 4.446824826112992e-06,
"loss": 1.2459,
"step": 647
},
{
"epoch": 1.4734446130500758,
"grad_norm": 0.5868430137634277,
"learning_rate": 4.444878867746962e-06,
"loss": 1.1797,
"step": 648
},
{
"epoch": 1.475720789074355,
"grad_norm": 0.6009106636047363,
"learning_rate": 4.442929920015377e-06,
"loss": 1.2008,
"step": 649
},
{
"epoch": 1.4779969650986344,
"grad_norm": 0.6000754237174988,
"learning_rate": 4.440977985913856e-06,
"loss": 1.199,
"step": 650
},
{
"epoch": 1.4802731411229135,
"grad_norm": 0.5801194310188293,
"learning_rate": 4.439023068442608e-06,
"loss": 1.2806,
"step": 651
},
{
"epoch": 1.4825493171471926,
"grad_norm": 0.6096365451812744,
"learning_rate": 4.43706517060643e-06,
"loss": 1.2434,
"step": 652
},
{
"epoch": 1.484825493171472,
"grad_norm": 0.6116917133331299,
"learning_rate": 4.435104295414697e-06,
"loss": 1.2262,
"step": 653
},
{
"epoch": 1.4871016691957513,
"grad_norm": 0.5588528513908386,
"learning_rate": 4.4331404458813615e-06,
"loss": 1.2373,
"step": 654
},
{
"epoch": 1.4893778452200304,
"grad_norm": 0.5834910869598389,
"learning_rate": 4.431173625024948e-06,
"loss": 1.2766,
"step": 655
},
{
"epoch": 1.4916540212443095,
"grad_norm": 0.623333215713501,
"learning_rate": 4.429203835868549e-06,
"loss": 1.2375,
"step": 656
},
{
"epoch": 1.4939301972685888,
"grad_norm": 0.6033525466918945,
"learning_rate": 4.427231081439817e-06,
"loss": 1.2,
"step": 657
},
{
"epoch": 1.496206373292868,
"grad_norm": 0.5829868912696838,
"learning_rate": 4.4252553647709635e-06,
"loss": 1.2349,
"step": 658
},
{
"epoch": 1.4984825493171472,
"grad_norm": 0.5703787803649902,
"learning_rate": 4.423276688898754e-06,
"loss": 1.2213,
"step": 659
},
{
"epoch": 1.5007587253414263,
"grad_norm": 0.5715304017066956,
"learning_rate": 4.421295056864501e-06,
"loss": 1.2394,
"step": 660
},
{
"epoch": 1.5030349013657056,
"grad_norm": 0.6249496340751648,
"learning_rate": 4.419310471714061e-06,
"loss": 1.2027,
"step": 661
},
{
"epoch": 1.505311077389985,
"grad_norm": 0.5828440189361572,
"learning_rate": 4.417322936497831e-06,
"loss": 1.2442,
"step": 662
},
{
"epoch": 1.507587253414264,
"grad_norm": 0.5692103505134583,
"learning_rate": 4.415332454270741e-06,
"loss": 1.1791,
"step": 663
},
{
"epoch": 1.5098634294385431,
"grad_norm": 0.595786988735199,
"learning_rate": 4.41333902809225e-06,
"loss": 1.231,
"step": 664
},
{
"epoch": 1.5121396054628224,
"grad_norm": 0.5955888032913208,
"learning_rate": 4.411342661026342e-06,
"loss": 1.2206,
"step": 665
},
{
"epoch": 1.5144157814871018,
"grad_norm": 0.582911491394043,
"learning_rate": 4.409343356141525e-06,
"loss": 1.2169,
"step": 666
},
{
"epoch": 1.5166919575113809,
"grad_norm": 0.585781455039978,
"learning_rate": 4.407341116510818e-06,
"loss": 1.2345,
"step": 667
},
{
"epoch": 1.51896813353566,
"grad_norm": 0.5766403675079346,
"learning_rate": 4.405335945211754e-06,
"loss": 1.2307,
"step": 668
},
{
"epoch": 1.5212443095599393,
"grad_norm": 0.5894457101821899,
"learning_rate": 4.4033278453263685e-06,
"loss": 1.2445,
"step": 669
},
{
"epoch": 1.5235204855842186,
"grad_norm": 0.5737869143486023,
"learning_rate": 4.401316819941203e-06,
"loss": 1.2311,
"step": 670
},
{
"epoch": 1.5257966616084977,
"grad_norm": 0.5908883213996887,
"learning_rate": 4.399302872147292e-06,
"loss": 1.2381,
"step": 671
},
{
"epoch": 1.5280728376327768,
"grad_norm": 0.6145277619361877,
"learning_rate": 4.397286005040162e-06,
"loss": 1.2394,
"step": 672
},
{
"epoch": 1.5303490136570561,
"grad_norm": 0.5731965899467468,
"learning_rate": 4.395266221719829e-06,
"loss": 1.2369,
"step": 673
},
{
"epoch": 1.5326251896813354,
"grad_norm": 0.5849004983901978,
"learning_rate": 4.3932435252907914e-06,
"loss": 1.2308,
"step": 674
},
{
"epoch": 1.5349013657056145,
"grad_norm": 0.5686678290367126,
"learning_rate": 4.391217918862021e-06,
"loss": 1.259,
"step": 675
},
{
"epoch": 1.5371775417298936,
"grad_norm": 0.580635666847229,
"learning_rate": 4.389189405546966e-06,
"loss": 1.2359,
"step": 676
},
{
"epoch": 1.539453717754173,
"grad_norm": 0.5722584128379822,
"learning_rate": 4.387157988463544e-06,
"loss": 1.231,
"step": 677
},
{
"epoch": 1.5417298937784523,
"grad_norm": 0.5868629813194275,
"learning_rate": 4.38512367073413e-06,
"loss": 1.2363,
"step": 678
},
{
"epoch": 1.5440060698027314,
"grad_norm": 0.5766255259513855,
"learning_rate": 4.383086455485564e-06,
"loss": 1.2556,
"step": 679
},
{
"epoch": 1.5462822458270105,
"grad_norm": 0.5849782824516296,
"learning_rate": 4.381046345849136e-06,
"loss": 1.2189,
"step": 680
},
{
"epoch": 1.5485584218512898,
"grad_norm": 0.6070932149887085,
"learning_rate": 4.379003344960585e-06,
"loss": 1.2351,
"step": 681
},
{
"epoch": 1.550834597875569,
"grad_norm": 0.6085125803947449,
"learning_rate": 4.376957455960094e-06,
"loss": 1.2218,
"step": 682
},
{
"epoch": 1.5531107738998484,
"grad_norm": 0.5707188844680786,
"learning_rate": 4.374908681992287e-06,
"loss": 1.2501,
"step": 683
},
{
"epoch": 1.5553869499241275,
"grad_norm": 0.6099936366081238,
"learning_rate": 4.37285702620622e-06,
"loss": 1.2436,
"step": 684
},
{
"epoch": 1.5576631259484066,
"grad_norm": 0.603273332118988,
"learning_rate": 4.37080249175538e-06,
"loss": 1.239,
"step": 685
},
{
"epoch": 1.559939301972686,
"grad_norm": 0.5822923183441162,
"learning_rate": 4.368745081797678e-06,
"loss": 1.22,
"step": 686
},
{
"epoch": 1.5622154779969653,
"grad_norm": 0.5922508835792542,
"learning_rate": 4.3666847994954445e-06,
"loss": 1.2138,
"step": 687
},
{
"epoch": 1.5644916540212443,
"grad_norm": 0.585437536239624,
"learning_rate": 4.364621648015426e-06,
"loss": 1.207,
"step": 688
},
{
"epoch": 1.5667678300455234,
"grad_norm": 0.5693568587303162,
"learning_rate": 4.362555630528776e-06,
"loss": 1.2036,
"step": 689
},
{
"epoch": 1.5690440060698028,
"grad_norm": 0.5950521230697632,
"learning_rate": 4.360486750211059e-06,
"loss": 1.2682,
"step": 690
},
{
"epoch": 1.571320182094082,
"grad_norm": 0.5919183492660522,
"learning_rate": 4.358415010242234e-06,
"loss": 1.2082,
"step": 691
},
{
"epoch": 1.5735963581183612,
"grad_norm": 0.6143742203712463,
"learning_rate": 4.356340413806658e-06,
"loss": 1.1925,
"step": 692
},
{
"epoch": 1.5758725341426403,
"grad_norm": 0.6028359532356262,
"learning_rate": 4.354262964093079e-06,
"loss": 1.2196,
"step": 693
},
{
"epoch": 1.5781487101669196,
"grad_norm": 0.6061824560165405,
"learning_rate": 4.35218266429463e-06,
"loss": 1.2266,
"step": 694
},
{
"epoch": 1.580424886191199,
"grad_norm": 0.6007355451583862,
"learning_rate": 4.3500995176088235e-06,
"loss": 1.2104,
"step": 695
},
{
"epoch": 1.582701062215478,
"grad_norm": 0.6342191100120544,
"learning_rate": 4.348013527237549e-06,
"loss": 1.2197,
"step": 696
},
{
"epoch": 1.5849772382397571,
"grad_norm": 0.5949456095695496,
"learning_rate": 4.345924696387067e-06,
"loss": 1.2258,
"step": 697
},
{
"epoch": 1.5872534142640364,
"grad_norm": 0.6161270141601562,
"learning_rate": 4.343833028268004e-06,
"loss": 1.2299,
"step": 698
},
{
"epoch": 1.5895295902883158,
"grad_norm": 0.5942959785461426,
"learning_rate": 4.341738526095348e-06,
"loss": 1.2594,
"step": 699
},
{
"epoch": 1.5918057663125948,
"grad_norm": 0.5933099389076233,
"learning_rate": 4.339641193088439e-06,
"loss": 1.1932,
"step": 700
},
{
"epoch": 1.594081942336874,
"grad_norm": 0.5857350826263428,
"learning_rate": 4.337541032470976e-06,
"loss": 1.3019,
"step": 701
},
{
"epoch": 1.5963581183611533,
"grad_norm": 0.604029655456543,
"learning_rate": 4.335438047470996e-06,
"loss": 1.2227,
"step": 702
},
{
"epoch": 1.5986342943854326,
"grad_norm": 0.5927514433860779,
"learning_rate": 4.333332241320882e-06,
"loss": 1.2742,
"step": 703
},
{
"epoch": 1.6009104704097117,
"grad_norm": 0.5811514854431152,
"learning_rate": 4.331223617257351e-06,
"loss": 1.23,
"step": 704
},
{
"epoch": 1.6031866464339908,
"grad_norm": 0.5948609709739685,
"learning_rate": 4.329112178521454e-06,
"loss": 1.2114,
"step": 705
},
{
"epoch": 1.60546282245827,
"grad_norm": 0.6194981932640076,
"learning_rate": 4.326997928358565e-06,
"loss": 1.2439,
"step": 706
},
{
"epoch": 1.6077389984825494,
"grad_norm": 0.5834797024726868,
"learning_rate": 4.324880870018382e-06,
"loss": 1.2269,
"step": 707
},
{
"epoch": 1.6100151745068285,
"grad_norm": 0.5746902823448181,
"learning_rate": 4.322761006754916e-06,
"loss": 1.2175,
"step": 708
},
{
"epoch": 1.6122913505311076,
"grad_norm": 0.6000075936317444,
"learning_rate": 4.320638341826494e-06,
"loss": 1.2316,
"step": 709
},
{
"epoch": 1.614567526555387,
"grad_norm": 0.588010311126709,
"learning_rate": 4.318512878495745e-06,
"loss": 1.245,
"step": 710
},
{
"epoch": 1.6168437025796663,
"grad_norm": 0.6053698658943176,
"learning_rate": 4.316384620029601e-06,
"loss": 1.228,
"step": 711
},
{
"epoch": 1.6191198786039454,
"grad_norm": 0.5857113599777222,
"learning_rate": 4.314253569699292e-06,
"loss": 1.2511,
"step": 712
},
{
"epoch": 1.6213960546282244,
"grad_norm": 0.5974637866020203,
"learning_rate": 4.312119730780334e-06,
"loss": 1.2377,
"step": 713
},
{
"epoch": 1.6236722306525038,
"grad_norm": 0.5964690446853638,
"learning_rate": 4.309983106552535e-06,
"loss": 1.2307,
"step": 714
},
{
"epoch": 1.625948406676783,
"grad_norm": 0.5781478881835938,
"learning_rate": 4.307843700299982e-06,
"loss": 1.2295,
"step": 715
},
{
"epoch": 1.6282245827010622,
"grad_norm": 0.597053587436676,
"learning_rate": 4.305701515311037e-06,
"loss": 1.2085,
"step": 716
},
{
"epoch": 1.6305007587253413,
"grad_norm": 0.6326000690460205,
"learning_rate": 4.303556554878333e-06,
"loss": 1.238,
"step": 717
},
{
"epoch": 1.6327769347496206,
"grad_norm": 0.6087371706962585,
"learning_rate": 4.3014088222987714e-06,
"loss": 1.2275,
"step": 718
},
{
"epoch": 1.6350531107739,
"grad_norm": 0.5937424898147583,
"learning_rate": 4.299258320873513e-06,
"loss": 1.2144,
"step": 719
},
{
"epoch": 1.637329286798179,
"grad_norm": 0.5922595262527466,
"learning_rate": 4.297105053907973e-06,
"loss": 1.2078,
"step": 720
},
{
"epoch": 1.6396054628224581,
"grad_norm": 0.603537380695343,
"learning_rate": 4.294949024711819e-06,
"loss": 1.2054,
"step": 721
},
{
"epoch": 1.6418816388467374,
"grad_norm": 0.5896364450454712,
"learning_rate": 4.2927902365989645e-06,
"loss": 1.2038,
"step": 722
},
{
"epoch": 1.6441578148710168,
"grad_norm": 0.614658534526825,
"learning_rate": 4.290628692887564e-06,
"loss": 1.2428,
"step": 723
},
{
"epoch": 1.6464339908952959,
"grad_norm": 0.5901724100112915,
"learning_rate": 4.288464396900005e-06,
"loss": 1.2464,
"step": 724
},
{
"epoch": 1.648710166919575,
"grad_norm": 0.6086544394493103,
"learning_rate": 4.286297351962908e-06,
"loss": 1.1895,
"step": 725
},
{
"epoch": 1.6509863429438543,
"grad_norm": 0.5841042399406433,
"learning_rate": 4.284127561407118e-06,
"loss": 1.2222,
"step": 726
},
{
"epoch": 1.6532625189681336,
"grad_norm": 0.5791555643081665,
"learning_rate": 4.281955028567698e-06,
"loss": 1.2489,
"step": 727
},
{
"epoch": 1.655538694992413,
"grad_norm": 0.6219162344932556,
"learning_rate": 4.27977975678393e-06,
"loss": 1.2208,
"step": 728
},
{
"epoch": 1.657814871016692,
"grad_norm": 0.597656786441803,
"learning_rate": 4.277601749399301e-06,
"loss": 1.2049,
"step": 729
},
{
"epoch": 1.660091047040971,
"grad_norm": 0.5991064310073853,
"learning_rate": 4.27542100976151e-06,
"loss": 1.2602,
"step": 730
},
{
"epoch": 1.6623672230652504,
"grad_norm": 0.5922961831092834,
"learning_rate": 4.273237541222447e-06,
"loss": 1.2077,
"step": 731
},
{
"epoch": 1.6646433990895297,
"grad_norm": 0.6028023362159729,
"learning_rate": 4.2710513471382005e-06,
"loss": 1.2092,
"step": 732
},
{
"epoch": 1.6669195751138088,
"grad_norm": 0.581685483455658,
"learning_rate": 4.268862430869052e-06,
"loss": 1.2192,
"step": 733
},
{
"epoch": 1.669195751138088,
"grad_norm": 0.6332095265388489,
"learning_rate": 4.26667079577946e-06,
"loss": 1.2573,
"step": 734
},
{
"epoch": 1.6714719271623673,
"grad_norm": 0.6062667369842529,
"learning_rate": 4.2644764452380675e-06,
"loss": 1.2994,
"step": 735
},
{
"epoch": 1.6737481031866466,
"grad_norm": 0.5829861164093018,
"learning_rate": 4.262279382617687e-06,
"loss": 1.2286,
"step": 736
},
{
"epoch": 1.6760242792109257,
"grad_norm": 0.587378203868866,
"learning_rate": 4.260079611295303e-06,
"loss": 1.182,
"step": 737
},
{
"epoch": 1.6783004552352048,
"grad_norm": 0.6240544319152832,
"learning_rate": 4.257877134652062e-06,
"loss": 1.2543,
"step": 738
},
{
"epoch": 1.680576631259484,
"grad_norm": 0.5865784287452698,
"learning_rate": 4.255671956073269e-06,
"loss": 1.2355,
"step": 739
},
{
"epoch": 1.6828528072837634,
"grad_norm": 0.5847815871238708,
"learning_rate": 4.253464078948382e-06,
"loss": 1.2069,
"step": 740
},
{
"epoch": 1.6851289833080425,
"grad_norm": 0.5941992402076721,
"learning_rate": 4.251253506671006e-06,
"loss": 1.2423,
"step": 741
},
{
"epoch": 1.6874051593323216,
"grad_norm": 0.6245031952857971,
"learning_rate": 4.249040242638889e-06,
"loss": 1.2555,
"step": 742
},
{
"epoch": 1.689681335356601,
"grad_norm": 0.6055291295051575,
"learning_rate": 4.246824290253917e-06,
"loss": 1.2261,
"step": 743
},
{
"epoch": 1.6919575113808802,
"grad_norm": 0.5905616283416748,
"learning_rate": 4.244605652922108e-06,
"loss": 1.2385,
"step": 744
},
{
"epoch": 1.6942336874051593,
"grad_norm": 0.5896965265274048,
"learning_rate": 4.2423843340536066e-06,
"loss": 1.1945,
"step": 745
},
{
"epoch": 1.6965098634294384,
"grad_norm": 0.6129325032234192,
"learning_rate": 4.240160337062678e-06,
"loss": 1.223,
"step": 746
},
{
"epoch": 1.6987860394537178,
"grad_norm": 0.5988030433654785,
"learning_rate": 4.237933665367705e-06,
"loss": 1.2197,
"step": 747
},
{
"epoch": 1.701062215477997,
"grad_norm": 0.599388837814331,
"learning_rate": 4.235704322391181e-06,
"loss": 1.2214,
"step": 748
},
{
"epoch": 1.7033383915022762,
"grad_norm": 0.6087759137153625,
"learning_rate": 4.233472311559708e-06,
"loss": 1.2302,
"step": 749
},
{
"epoch": 1.7056145675265553,
"grad_norm": 0.5895616412162781,
"learning_rate": 4.231237636303982e-06,
"loss": 1.1976,
"step": 750
},
{
"epoch": 1.7078907435508346,
"grad_norm": 0.6117663383483887,
"learning_rate": 4.229000300058802e-06,
"loss": 1.1928,
"step": 751
},
{
"epoch": 1.710166919575114,
"grad_norm": 0.5945206880569458,
"learning_rate": 4.2267603062630526e-06,
"loss": 1.201,
"step": 752
},
{
"epoch": 1.712443095599393,
"grad_norm": 0.6434623599052429,
"learning_rate": 4.224517658359704e-06,
"loss": 1.239,
"step": 753
},
{
"epoch": 1.714719271623672,
"grad_norm": 0.5895166397094727,
"learning_rate": 4.222272359795806e-06,
"loss": 1.2305,
"step": 754
},
{
"epoch": 1.7169954476479514,
"grad_norm": 0.6248841285705566,
"learning_rate": 4.220024414022482e-06,
"loss": 1.2332,
"step": 755
},
{
"epoch": 1.7192716236722307,
"grad_norm": 0.6209638118743896,
"learning_rate": 4.217773824494926e-06,
"loss": 1.2773,
"step": 756
},
{
"epoch": 1.7215477996965098,
"grad_norm": 0.5973532199859619,
"learning_rate": 4.215520594672394e-06,
"loss": 1.1992,
"step": 757
},
{
"epoch": 1.723823975720789,
"grad_norm": 0.5936313271522522,
"learning_rate": 4.2132647280182e-06,
"loss": 1.2412,
"step": 758
},
{
"epoch": 1.7261001517450683,
"grad_norm": 0.6053516268730164,
"learning_rate": 4.211006227999713e-06,
"loss": 1.2129,
"step": 759
},
{
"epoch": 1.7283763277693476,
"grad_norm": 0.6065954566001892,
"learning_rate": 4.208745098088348e-06,
"loss": 1.2395,
"step": 760
},
{
"epoch": 1.7306525037936267,
"grad_norm": 0.6134182214736938,
"learning_rate": 4.206481341759562e-06,
"loss": 1.1969,
"step": 761
},
{
"epoch": 1.7329286798179058,
"grad_norm": 0.6103958487510681,
"learning_rate": 4.204214962492849e-06,
"loss": 1.2583,
"step": 762
},
{
"epoch": 1.735204855842185,
"grad_norm": 0.6010955572128296,
"learning_rate": 4.201945963771736e-06,
"loss": 1.2638,
"step": 763
},
{
"epoch": 1.7374810318664644,
"grad_norm": 0.6201740503311157,
"learning_rate": 4.199674349083776e-06,
"loss": 1.2491,
"step": 764
},
{
"epoch": 1.7397572078907435,
"grad_norm": 0.6140694618225098,
"learning_rate": 4.197400121920539e-06,
"loss": 1.243,
"step": 765
},
{
"epoch": 1.7420333839150226,
"grad_norm": 0.6441624164581299,
"learning_rate": 4.1951232857776164e-06,
"loss": 1.2614,
"step": 766
},
{
"epoch": 1.744309559939302,
"grad_norm": 0.6050844192504883,
"learning_rate": 4.192843844154606e-06,
"loss": 1.1756,
"step": 767
},
{
"epoch": 1.7465857359635812,
"grad_norm": 0.6491802930831909,
"learning_rate": 4.190561800555111e-06,
"loss": 1.2029,
"step": 768
},
{
"epoch": 1.7488619119878603,
"grad_norm": 0.6259174942970276,
"learning_rate": 4.1882771584867345e-06,
"loss": 1.1912,
"step": 769
},
{
"epoch": 1.7511380880121397,
"grad_norm": 0.5955666303634644,
"learning_rate": 4.1859899214610735e-06,
"loss": 1.2701,
"step": 770
},
{
"epoch": 1.7534142640364188,
"grad_norm": 0.6060442924499512,
"learning_rate": 4.183700092993712e-06,
"loss": 1.2269,
"step": 771
},
{
"epoch": 1.755690440060698,
"grad_norm": 0.6210846900939941,
"learning_rate": 4.1814076766042206e-06,
"loss": 1.2679,
"step": 772
},
{
"epoch": 1.7579666160849774,
"grad_norm": 0.5922744870185852,
"learning_rate": 4.179112675816144e-06,
"loss": 1.2171,
"step": 773
},
{
"epoch": 1.7602427921092565,
"grad_norm": 0.6048167943954468,
"learning_rate": 4.176815094157e-06,
"loss": 1.1887,
"step": 774
},
{
"epoch": 1.7625189681335356,
"grad_norm": 0.6661959290504456,
"learning_rate": 4.174514935158277e-06,
"loss": 1.2439,
"step": 775
},
{
"epoch": 1.764795144157815,
"grad_norm": 0.5862908959388733,
"learning_rate": 4.172212202355419e-06,
"loss": 1.2594,
"step": 776
},
{
"epoch": 1.7670713201820942,
"grad_norm": 0.615178644657135,
"learning_rate": 4.16990689928783e-06,
"loss": 1.2137,
"step": 777
},
{
"epoch": 1.7693474962063733,
"grad_norm": 0.6170365810394287,
"learning_rate": 4.167599029498865e-06,
"loss": 1.2278,
"step": 778
},
{
"epoch": 1.7716236722306524,
"grad_norm": 0.6055428385734558,
"learning_rate": 4.165288596535821e-06,
"loss": 1.232,
"step": 779
},
{
"epoch": 1.7738998482549317,
"grad_norm": 0.6081527471542358,
"learning_rate": 4.162975603949937e-06,
"loss": 1.2392,
"step": 780
},
{
"epoch": 1.776176024279211,
"grad_norm": 0.6220976710319519,
"learning_rate": 4.160660055296385e-06,
"loss": 1.2467,
"step": 781
},
{
"epoch": 1.7784522003034902,
"grad_norm": 0.5995768904685974,
"learning_rate": 4.158341954134268e-06,
"loss": 1.2141,
"step": 782
},
{
"epoch": 1.7807283763277693,
"grad_norm": 0.5946653485298157,
"learning_rate": 4.15602130402661e-06,
"loss": 1.255,
"step": 783
},
{
"epoch": 1.7830045523520486,
"grad_norm": 0.6094076633453369,
"learning_rate": 4.1536981085403546e-06,
"loss": 1.243,
"step": 784
},
{
"epoch": 1.785280728376328,
"grad_norm": 0.6584082841873169,
"learning_rate": 4.151372371246356e-06,
"loss": 1.2382,
"step": 785
},
{
"epoch": 1.787556904400607,
"grad_norm": 0.6139714121818542,
"learning_rate": 4.149044095719377e-06,
"loss": 1.2528,
"step": 786
},
{
"epoch": 1.789833080424886,
"grad_norm": 0.6047011017799377,
"learning_rate": 4.14671328553808e-06,
"loss": 1.2034,
"step": 787
},
{
"epoch": 1.7921092564491654,
"grad_norm": 0.6093196868896484,
"learning_rate": 4.144379944285024e-06,
"loss": 1.2669,
"step": 788
},
{
"epoch": 1.7943854324734447,
"grad_norm": 0.6222574710845947,
"learning_rate": 4.142044075546658e-06,
"loss": 1.1817,
"step": 789
},
{
"epoch": 1.7966616084977238,
"grad_norm": 0.6427398920059204,
"learning_rate": 4.13970568291332e-06,
"loss": 1.2165,
"step": 790
},
{
"epoch": 1.798937784522003,
"grad_norm": 0.6227960586547852,
"learning_rate": 4.13736476997922e-06,
"loss": 1.1816,
"step": 791
},
{
"epoch": 1.8012139605462822,
"grad_norm": 0.6001450419425964,
"learning_rate": 4.135021340342446e-06,
"loss": 1.2373,
"step": 792
},
{
"epoch": 1.8034901365705616,
"grad_norm": 0.6028245091438293,
"learning_rate": 4.132675397604956e-06,
"loss": 1.2524,
"step": 793
},
{
"epoch": 1.8057663125948407,
"grad_norm": 0.5959303379058838,
"learning_rate": 4.130326945372567e-06,
"loss": 1.198,
"step": 794
},
{
"epoch": 1.8080424886191198,
"grad_norm": 0.6001620888710022,
"learning_rate": 4.127975987254955e-06,
"loss": 1.2137,
"step": 795
},
{
"epoch": 1.810318664643399,
"grad_norm": 0.5951507091522217,
"learning_rate": 4.125622526865647e-06,
"loss": 1.2285,
"step": 796
},
{
"epoch": 1.8125948406676784,
"grad_norm": 0.614658534526825,
"learning_rate": 4.123266567822017e-06,
"loss": 1.2119,
"step": 797
},
{
"epoch": 1.8148710166919575,
"grad_norm": 0.6394176483154297,
"learning_rate": 4.120908113745281e-06,
"loss": 1.2444,
"step": 798
},
{
"epoch": 1.8171471927162366,
"grad_norm": 0.5989351868629456,
"learning_rate": 4.118547168260485e-06,
"loss": 1.1838,
"step": 799
},
{
"epoch": 1.819423368740516,
"grad_norm": 0.6235303282737732,
"learning_rate": 4.11618373499651e-06,
"loss": 1.2163,
"step": 800
},
{
"epoch": 1.8216995447647952,
"grad_norm": 0.6402750015258789,
"learning_rate": 4.113817817586055e-06,
"loss": 1.2445,
"step": 801
},
{
"epoch": 1.8239757207890743,
"grad_norm": 0.5973191857337952,
"learning_rate": 4.111449419665645e-06,
"loss": 1.2308,
"step": 802
},
{
"epoch": 1.8262518968133534,
"grad_norm": 0.6300286650657654,
"learning_rate": 4.1090785448756096e-06,
"loss": 1.2319,
"step": 803
},
{
"epoch": 1.8285280728376327,
"grad_norm": 0.5970984697341919,
"learning_rate": 4.1067051968600914e-06,
"loss": 1.1944,
"step": 804
},
{
"epoch": 1.830804248861912,
"grad_norm": 0.607427179813385,
"learning_rate": 4.104329379267031e-06,
"loss": 1.2331,
"step": 805
},
{
"epoch": 1.8330804248861912,
"grad_norm": 0.6165644526481628,
"learning_rate": 4.101951095748166e-06,
"loss": 1.2337,
"step": 806
},
{
"epoch": 1.8353566009104703,
"grad_norm": 0.639166533946991,
"learning_rate": 4.099570349959025e-06,
"loss": 1.2263,
"step": 807
},
{
"epoch": 1.8376327769347496,
"grad_norm": 0.6345863342285156,
"learning_rate": 4.097187145558919e-06,
"loss": 1.2397,
"step": 808
},
{
"epoch": 1.839908952959029,
"grad_norm": 0.607635498046875,
"learning_rate": 4.094801486210941e-06,
"loss": 1.1972,
"step": 809
},
{
"epoch": 1.842185128983308,
"grad_norm": 0.6224584579467773,
"learning_rate": 4.092413375581955e-06,
"loss": 1.231,
"step": 810
},
{
"epoch": 1.844461305007587,
"grad_norm": 0.5929398536682129,
"learning_rate": 4.090022817342593e-06,
"loss": 1.2234,
"step": 811
},
{
"epoch": 1.8467374810318664,
"grad_norm": 0.6391967535018921,
"learning_rate": 4.0876298151672525e-06,
"loss": 1.1931,
"step": 812
},
{
"epoch": 1.8490136570561457,
"grad_norm": 0.599383533000946,
"learning_rate": 4.08523437273408e-06,
"loss": 1.2425,
"step": 813
},
{
"epoch": 1.851289833080425,
"grad_norm": 0.5998767614364624,
"learning_rate": 4.082836493724981e-06,
"loss": 1.2188,
"step": 814
},
{
"epoch": 1.8535660091047041,
"grad_norm": 0.5895645618438721,
"learning_rate": 4.080436181825601e-06,
"loss": 1.2286,
"step": 815
},
{
"epoch": 1.8558421851289832,
"grad_norm": 0.6172052621841431,
"learning_rate": 4.078033440725327e-06,
"loss": 1.2007,
"step": 816
},
{
"epoch": 1.8581183611532626,
"grad_norm": 0.613259494304657,
"learning_rate": 4.075628274117279e-06,
"loss": 1.2256,
"step": 817
},
{
"epoch": 1.8603945371775419,
"grad_norm": 0.6026812791824341,
"learning_rate": 4.073220685698304e-06,
"loss": 1.2317,
"step": 818
},
{
"epoch": 1.862670713201821,
"grad_norm": 0.6112560629844666,
"learning_rate": 4.070810679168975e-06,
"loss": 1.2275,
"step": 819
},
{
"epoch": 1.8649468892261,
"grad_norm": 0.6044736504554749,
"learning_rate": 4.068398258233579e-06,
"loss": 1.2515,
"step": 820
},
{
"epoch": 1.8672230652503794,
"grad_norm": 0.6291022896766663,
"learning_rate": 4.065983426600113e-06,
"loss": 1.2137,
"step": 821
},
{
"epoch": 1.8694992412746587,
"grad_norm": 0.6136301755905151,
"learning_rate": 4.063566187980282e-06,
"loss": 1.2144,
"step": 822
},
{
"epoch": 1.8717754172989378,
"grad_norm": 0.6166698932647705,
"learning_rate": 4.06114654608949e-06,
"loss": 1.2434,
"step": 823
},
{
"epoch": 1.874051593323217,
"grad_norm": 0.6023617386817932,
"learning_rate": 4.058724504646834e-06,
"loss": 1.2186,
"step": 824
},
{
"epoch": 1.8763277693474962,
"grad_norm": 0.6259661912918091,
"learning_rate": 4.0563000673751e-06,
"loss": 1.1989,
"step": 825
},
{
"epoch": 1.8786039453717756,
"grad_norm": 0.6420421004295349,
"learning_rate": 4.053873238000756e-06,
"loss": 1.1981,
"step": 826
},
{
"epoch": 1.8808801213960546,
"grad_norm": 0.6250731348991394,
"learning_rate": 4.051444020253947e-06,
"loss": 1.246,
"step": 827
},
{
"epoch": 1.8831562974203337,
"grad_norm": 0.6473506689071655,
"learning_rate": 4.0490124178684884e-06,
"loss": 1.213,
"step": 828
},
{
"epoch": 1.885432473444613,
"grad_norm": 0.6448357701301575,
"learning_rate": 4.046578434581862e-06,
"loss": 1.1696,
"step": 829
},
{
"epoch": 1.8877086494688924,
"grad_norm": 0.6176803112030029,
"learning_rate": 4.044142074135209e-06,
"loss": 1.2453,
"step": 830
},
{
"epoch": 1.8899848254931715,
"grad_norm": 0.6398005485534668,
"learning_rate": 4.0417033402733244e-06,
"loss": 1.2198,
"step": 831
},
{
"epoch": 1.8922610015174506,
"grad_norm": 0.6350208520889282,
"learning_rate": 4.03926223674465e-06,
"loss": 1.2528,
"step": 832
},
{
"epoch": 1.89453717754173,
"grad_norm": 0.5937830209732056,
"learning_rate": 4.03681876730127e-06,
"loss": 1.1594,
"step": 833
},
{
"epoch": 1.8968133535660092,
"grad_norm": 0.6130216121673584,
"learning_rate": 4.034372935698908e-06,
"loss": 1.222,
"step": 834
},
{
"epoch": 1.8990895295902883,
"grad_norm": 0.6638323664665222,
"learning_rate": 4.031924745696916e-06,
"loss": 1.2338,
"step": 835
},
{
"epoch": 1.9013657056145674,
"grad_norm": 0.6491904258728027,
"learning_rate": 4.029474201058269e-06,
"loss": 1.2219,
"step": 836
},
{
"epoch": 1.9036418816388467,
"grad_norm": 0.612301766872406,
"learning_rate": 4.027021305549565e-06,
"loss": 1.2663,
"step": 837
},
{
"epoch": 1.905918057663126,
"grad_norm": 0.6025054454803467,
"learning_rate": 4.024566062941014e-06,
"loss": 1.2264,
"step": 838
},
{
"epoch": 1.9081942336874052,
"grad_norm": 0.6344963312149048,
"learning_rate": 4.022108477006434e-06,
"loss": 1.1948,
"step": 839
},
{
"epoch": 1.9104704097116842,
"grad_norm": 0.6077335476875305,
"learning_rate": 4.019648551523243e-06,
"loss": 1.2394,
"step": 840
},
{
"epoch": 1.9127465857359636,
"grad_norm": 0.6338925361633301,
"learning_rate": 4.017186290272456e-06,
"loss": 1.2136,
"step": 841
},
{
"epoch": 1.9150227617602429,
"grad_norm": 0.6291373372077942,
"learning_rate": 4.014721697038678e-06,
"loss": 1.2374,
"step": 842
},
{
"epoch": 1.917298937784522,
"grad_norm": 0.6118108630180359,
"learning_rate": 4.0122547756101005e-06,
"loss": 1.2045,
"step": 843
},
{
"epoch": 1.919575113808801,
"grad_norm": 0.6250407695770264,
"learning_rate": 4.009785529778489e-06,
"loss": 1.2349,
"step": 844
},
{
"epoch": 1.9218512898330804,
"grad_norm": 0.6737698912620544,
"learning_rate": 4.007313963339188e-06,
"loss": 1.2334,
"step": 845
},
{
"epoch": 1.9241274658573597,
"grad_norm": 0.649118959903717,
"learning_rate": 4.004840080091103e-06,
"loss": 1.1981,
"step": 846
},
{
"epoch": 1.9264036418816388,
"grad_norm": 0.6312914490699768,
"learning_rate": 4.002363883836704e-06,
"loss": 1.2341,
"step": 847
},
{
"epoch": 1.928679817905918,
"grad_norm": 0.6146298050880432,
"learning_rate": 3.999885378382013e-06,
"loss": 1.1925,
"step": 848
},
{
"epoch": 1.9309559939301972,
"grad_norm": 0.6233289241790771,
"learning_rate": 3.997404567536606e-06,
"loss": 1.2407,
"step": 849
},
{
"epoch": 1.9332321699544766,
"grad_norm": 0.6072235107421875,
"learning_rate": 3.994921455113598e-06,
"loss": 1.2033,
"step": 850
},
{
"epoch": 1.9355083459787557,
"grad_norm": 0.6547655463218689,
"learning_rate": 3.992436044929645e-06,
"loss": 1.2368,
"step": 851
},
{
"epoch": 1.9377845220030347,
"grad_norm": 0.6056034564971924,
"learning_rate": 3.989948340804932e-06,
"loss": 1.2212,
"step": 852
},
{
"epoch": 1.940060698027314,
"grad_norm": 0.6160012483596802,
"learning_rate": 3.9874583465631725e-06,
"loss": 1.1944,
"step": 853
},
{
"epoch": 1.9423368740515934,
"grad_norm": 0.641826868057251,
"learning_rate": 3.984966066031598e-06,
"loss": 1.2499,
"step": 854
},
{
"epoch": 1.9446130500758725,
"grad_norm": 0.6412007808685303,
"learning_rate": 3.982471503040954e-06,
"loss": 1.2024,
"step": 855
},
{
"epoch": 1.9468892261001516,
"grad_norm": 0.6296584606170654,
"learning_rate": 3.979974661425497e-06,
"loss": 1.1813,
"step": 856
},
{
"epoch": 1.949165402124431,
"grad_norm": 0.6448803544044495,
"learning_rate": 3.977475545022983e-06,
"loss": 1.2672,
"step": 857
},
{
"epoch": 1.9514415781487102,
"grad_norm": 0.6320902705192566,
"learning_rate": 3.9749741576746645e-06,
"loss": 1.196,
"step": 858
},
{
"epoch": 1.9537177541729895,
"grad_norm": 0.6109302639961243,
"learning_rate": 3.972470503225285e-06,
"loss": 1.2277,
"step": 859
},
{
"epoch": 1.9559939301972686,
"grad_norm": 0.6240274310112,
"learning_rate": 3.969964585523076e-06,
"loss": 1.2625,
"step": 860
},
{
"epoch": 1.9582701062215477,
"grad_norm": 0.5958450436592102,
"learning_rate": 3.967456408419742e-06,
"loss": 1.2133,
"step": 861
},
{
"epoch": 1.960546282245827,
"grad_norm": 0.6262888312339783,
"learning_rate": 3.964945975770464e-06,
"loss": 1.2238,
"step": 862
},
{
"epoch": 1.9628224582701064,
"grad_norm": 0.6366564631462097,
"learning_rate": 3.962433291433889e-06,
"loss": 1.2372,
"step": 863
},
{
"epoch": 1.9650986342943855,
"grad_norm": 0.6750831007957458,
"learning_rate": 3.959918359272125e-06,
"loss": 1.2409,
"step": 864
},
{
"epoch": 1.9673748103186646,
"grad_norm": 0.5879358649253845,
"learning_rate": 3.957401183150734e-06,
"loss": 1.2122,
"step": 865
},
{
"epoch": 1.9696509863429439,
"grad_norm": 0.6384773254394531,
"learning_rate": 3.9548817669387295e-06,
"loss": 1.2046,
"step": 866
},
{
"epoch": 1.9719271623672232,
"grad_norm": 0.6435151100158691,
"learning_rate": 3.952360114508565e-06,
"loss": 1.2545,
"step": 867
},
{
"epoch": 1.9742033383915023,
"grad_norm": 0.6609162092208862,
"learning_rate": 3.949836229736133e-06,
"loss": 1.2548,
"step": 868
},
{
"epoch": 1.9764795144157814,
"grad_norm": 0.6402998566627502,
"learning_rate": 3.947310116500758e-06,
"loss": 1.2369,
"step": 869
},
{
"epoch": 1.9787556904400607,
"grad_norm": 0.6171389222145081,
"learning_rate": 3.944781778685189e-06,
"loss": 1.1803,
"step": 870
},
{
"epoch": 1.98103186646434,
"grad_norm": 0.6790279150009155,
"learning_rate": 3.9422512201755925e-06,
"loss": 1.2349,
"step": 871
},
{
"epoch": 1.9833080424886191,
"grad_norm": 0.636738121509552,
"learning_rate": 3.93971844486155e-06,
"loss": 1.233,
"step": 872
},
{
"epoch": 1.9855842185128982,
"grad_norm": 0.6281400918960571,
"learning_rate": 3.937183456636051e-06,
"loss": 1.1973,
"step": 873
},
{
"epoch": 1.9878603945371776,
"grad_norm": 0.6086034774780273,
"learning_rate": 3.9346462593954845e-06,
"loss": 1.2017,
"step": 874
},
{
"epoch": 1.9901365705614569,
"grad_norm": 0.6195533871650696,
"learning_rate": 3.932106857039637e-06,
"loss": 1.22,
"step": 875
},
{
"epoch": 1.992412746585736,
"grad_norm": 0.6325448155403137,
"learning_rate": 3.929565253471681e-06,
"loss": 1.2081,
"step": 876
},
{
"epoch": 1.994688922610015,
"grad_norm": 0.6466575860977173,
"learning_rate": 3.927021452598177e-06,
"loss": 1.2734,
"step": 877
},
{
"epoch": 1.9969650986342944,
"grad_norm": 0.648371160030365,
"learning_rate": 3.924475458329059e-06,
"loss": 1.2018,
"step": 878
}
],
"logging_steps": 1,
"max_steps": 2634,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 439,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.965165450266411e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}