MMR1-3B-SFT / trainer_state.json
Sicong's picture
Add files using upload-large-folder tool
71b17e3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.99952614120992,
"eval_steps": 500,
"global_step": 7910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00631811720107408,
"grad_norm": 2.410219669342041,
"learning_rate": 1.1378002528445008e-07,
"loss": 0.6234,
"step": 10
},
{
"epoch": 0.01263623440214816,
"grad_norm": 2.261991500854492,
"learning_rate": 2.4020227560050574e-07,
"loss": 0.6184,
"step": 20
},
{
"epoch": 0.01895435160322224,
"grad_norm": 2.1056859493255615,
"learning_rate": 3.6662452591656137e-07,
"loss": 0.6112,
"step": 30
},
{
"epoch": 0.02527246880429632,
"grad_norm": 1.712091326713562,
"learning_rate": 4.93046776232617e-07,
"loss": 0.6003,
"step": 40
},
{
"epoch": 0.0315905860053704,
"grad_norm": 1.321094274520874,
"learning_rate": 6.194690265486726e-07,
"loss": 0.575,
"step": 50
},
{
"epoch": 0.03790870320644448,
"grad_norm": 0.8089994192123413,
"learning_rate": 7.458912768647282e-07,
"loss": 0.5377,
"step": 60
},
{
"epoch": 0.04422682040751856,
"grad_norm": 0.544200599193573,
"learning_rate": 8.72313527180784e-07,
"loss": 0.512,
"step": 70
},
{
"epoch": 0.05054493760859264,
"grad_norm": 0.44749483466148376,
"learning_rate": 9.987357774968396e-07,
"loss": 0.4917,
"step": 80
},
{
"epoch": 0.05686305480966672,
"grad_norm": 0.30434444546699524,
"learning_rate": 1.1251580278128951e-06,
"loss": 0.4749,
"step": 90
},
{
"epoch": 0.0631811720107408,
"grad_norm": 0.24813058972358704,
"learning_rate": 1.2515802781289506e-06,
"loss": 0.4607,
"step": 100
},
{
"epoch": 0.06949928921181488,
"grad_norm": 0.21706120669841766,
"learning_rate": 1.3780025284450064e-06,
"loss": 0.448,
"step": 110
},
{
"epoch": 0.07581740641288896,
"grad_norm": 0.2046414017677307,
"learning_rate": 1.5044247787610621e-06,
"loss": 0.4406,
"step": 120
},
{
"epoch": 0.08213552361396304,
"grad_norm": 0.1882794201374054,
"learning_rate": 1.6308470290771178e-06,
"loss": 0.4367,
"step": 130
},
{
"epoch": 0.08845364081503712,
"grad_norm": 0.19263681769371033,
"learning_rate": 1.7572692793931734e-06,
"loss": 0.4266,
"step": 140
},
{
"epoch": 0.0947717580161112,
"grad_norm": 0.1742035299539566,
"learning_rate": 1.8836915297092289e-06,
"loss": 0.4198,
"step": 150
},
{
"epoch": 0.10108987521718528,
"grad_norm": 0.1775059998035431,
"learning_rate": 2.0101137800252844e-06,
"loss": 0.4164,
"step": 160
},
{
"epoch": 0.10740799241825937,
"grad_norm": 0.18586362898349762,
"learning_rate": 2.13653603034134e-06,
"loss": 0.4101,
"step": 170
},
{
"epoch": 0.11372610961933344,
"grad_norm": 0.17294418811798096,
"learning_rate": 2.262958280657396e-06,
"loss": 0.4083,
"step": 180
},
{
"epoch": 0.12004422682040752,
"grad_norm": 0.1728675216436386,
"learning_rate": 2.3893805309734516e-06,
"loss": 0.4029,
"step": 190
},
{
"epoch": 0.1263623440214816,
"grad_norm": 0.1797151267528534,
"learning_rate": 2.515802781289507e-06,
"loss": 0.4007,
"step": 200
},
{
"epoch": 0.13268046122255567,
"grad_norm": 0.187180295586586,
"learning_rate": 2.6422250316055626e-06,
"loss": 0.3938,
"step": 210
},
{
"epoch": 0.13899857842362975,
"grad_norm": 0.17990782856941223,
"learning_rate": 2.768647281921619e-06,
"loss": 0.3902,
"step": 220
},
{
"epoch": 0.14531669562470384,
"grad_norm": 0.19836974143981934,
"learning_rate": 2.895069532237674e-06,
"loss": 0.3891,
"step": 230
},
{
"epoch": 0.15163481282577793,
"grad_norm": 0.17586922645568848,
"learning_rate": 3.02149178255373e-06,
"loss": 0.3876,
"step": 240
},
{
"epoch": 0.15795293002685198,
"grad_norm": 0.19539974629878998,
"learning_rate": 3.1479140328697856e-06,
"loss": 0.3819,
"step": 250
},
{
"epoch": 0.16427104722792607,
"grad_norm": 0.18709833920001984,
"learning_rate": 3.274336283185841e-06,
"loss": 0.3789,
"step": 260
},
{
"epoch": 0.17058916442900016,
"grad_norm": 0.18259377777576447,
"learning_rate": 3.4007585335018966e-06,
"loss": 0.3771,
"step": 270
},
{
"epoch": 0.17690728163007424,
"grad_norm": 0.1889650523662567,
"learning_rate": 3.5271807838179523e-06,
"loss": 0.3757,
"step": 280
},
{
"epoch": 0.18322539883114833,
"grad_norm": 0.17683972418308258,
"learning_rate": 3.6536030341340076e-06,
"loss": 0.378,
"step": 290
},
{
"epoch": 0.1895435160322224,
"grad_norm": 0.19599057734012604,
"learning_rate": 3.7800252844500634e-06,
"loss": 0.3683,
"step": 300
},
{
"epoch": 0.19586163323329647,
"grad_norm": 0.19569683074951172,
"learning_rate": 3.906447534766119e-06,
"loss": 0.37,
"step": 310
},
{
"epoch": 0.20217975043437056,
"grad_norm": 0.2033437043428421,
"learning_rate": 4.032869785082175e-06,
"loss": 0.3648,
"step": 320
},
{
"epoch": 0.20849786763544464,
"grad_norm": 0.1874990016222,
"learning_rate": 4.15929203539823e-06,
"loss": 0.3636,
"step": 330
},
{
"epoch": 0.21481598483651873,
"grad_norm": 0.1825045645236969,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.363,
"step": 340
},
{
"epoch": 0.2211341020375928,
"grad_norm": 0.19459910690784454,
"learning_rate": 4.412136536030342e-06,
"loss": 0.362,
"step": 350
},
{
"epoch": 0.22745221923866688,
"grad_norm": 0.1864989548921585,
"learning_rate": 4.538558786346398e-06,
"loss": 0.357,
"step": 360
},
{
"epoch": 0.23377033643974096,
"grad_norm": 0.1745409220457077,
"learning_rate": 4.664981036662453e-06,
"loss": 0.3564,
"step": 370
},
{
"epoch": 0.24008845364081505,
"grad_norm": 0.18947263062000275,
"learning_rate": 4.791403286978508e-06,
"loss": 0.3537,
"step": 380
},
{
"epoch": 0.2464065708418891,
"grad_norm": 0.1780448704957962,
"learning_rate": 4.9178255372945645e-06,
"loss": 0.3538,
"step": 390
},
{
"epoch": 0.2527246880429632,
"grad_norm": 0.21806994080543518,
"learning_rate": 5.04424778761062e-06,
"loss": 0.3517,
"step": 400
},
{
"epoch": 0.2590428052440373,
"grad_norm": 0.19830353558063507,
"learning_rate": 5.170670037926675e-06,
"loss": 0.3504,
"step": 410
},
{
"epoch": 0.26536092244511134,
"grad_norm": 0.1746763288974762,
"learning_rate": 5.297092288242731e-06,
"loss": 0.3456,
"step": 420
},
{
"epoch": 0.27167903964618545,
"grad_norm": 0.18027839064598083,
"learning_rate": 5.4235145385587875e-06,
"loss": 0.3476,
"step": 430
},
{
"epoch": 0.2779971568472595,
"grad_norm": 0.18963277339935303,
"learning_rate": 5.549936788874842e-06,
"loss": 0.3454,
"step": 440
},
{
"epoch": 0.2843152740483336,
"grad_norm": 0.1782628297805786,
"learning_rate": 5.676359039190898e-06,
"loss": 0.344,
"step": 450
},
{
"epoch": 0.2906333912494077,
"grad_norm": 0.21438680589199066,
"learning_rate": 5.802781289506953e-06,
"loss": 0.3441,
"step": 460
},
{
"epoch": 0.29695150845048174,
"grad_norm": 0.20768363773822784,
"learning_rate": 5.9292035398230096e-06,
"loss": 0.343,
"step": 470
},
{
"epoch": 0.30326962565155585,
"grad_norm": 0.1901923269033432,
"learning_rate": 6.055625790139065e-06,
"loss": 0.3405,
"step": 480
},
{
"epoch": 0.3095877428526299,
"grad_norm": 0.19777809083461761,
"learning_rate": 6.182048040455121e-06,
"loss": 0.3403,
"step": 490
},
{
"epoch": 0.31590586005370397,
"grad_norm": 0.1863890141248703,
"learning_rate": 6.3084702907711755e-06,
"loss": 0.337,
"step": 500
},
{
"epoch": 0.3222239772547781,
"grad_norm": 0.18657594919204712,
"learning_rate": 6.434892541087232e-06,
"loss": 0.3334,
"step": 510
},
{
"epoch": 0.32854209445585214,
"grad_norm": 0.2064000368118286,
"learning_rate": 6.561314791403287e-06,
"loss": 0.3328,
"step": 520
},
{
"epoch": 0.33486021165692625,
"grad_norm": 0.1871696412563324,
"learning_rate": 6.687737041719343e-06,
"loss": 0.3393,
"step": 530
},
{
"epoch": 0.3411783288580003,
"grad_norm": 0.20120146870613098,
"learning_rate": 6.814159292035398e-06,
"loss": 0.3334,
"step": 540
},
{
"epoch": 0.34749644605907437,
"grad_norm": 0.19019120931625366,
"learning_rate": 6.9405815423514546e-06,
"loss": 0.3366,
"step": 550
},
{
"epoch": 0.3538145632601485,
"grad_norm": 0.19137969613075256,
"learning_rate": 7.067003792667511e-06,
"loss": 0.3346,
"step": 560
},
{
"epoch": 0.36013268046122254,
"grad_norm": 0.20125152170658112,
"learning_rate": 7.193426042983566e-06,
"loss": 0.3289,
"step": 570
},
{
"epoch": 0.36645079766229666,
"grad_norm": 0.17702394723892212,
"learning_rate": 7.319848293299622e-06,
"loss": 0.3309,
"step": 580
},
{
"epoch": 0.3727689148633707,
"grad_norm": 0.1984817534685135,
"learning_rate": 7.446270543615677e-06,
"loss": 0.3316,
"step": 590
},
{
"epoch": 0.3790870320644448,
"grad_norm": 0.1926579773426056,
"learning_rate": 7.572692793931733e-06,
"loss": 0.3289,
"step": 600
},
{
"epoch": 0.3854051492655189,
"grad_norm": 0.21035262942314148,
"learning_rate": 7.699115044247788e-06,
"loss": 0.3282,
"step": 610
},
{
"epoch": 0.39172326646659295,
"grad_norm": 0.18808738887310028,
"learning_rate": 7.825537294563843e-06,
"loss": 0.3272,
"step": 620
},
{
"epoch": 0.39804138366766706,
"grad_norm": 0.19714747369289398,
"learning_rate": 7.951959544879899e-06,
"loss": 0.3248,
"step": 630
},
{
"epoch": 0.4043595008687411,
"grad_norm": 0.1970880627632141,
"learning_rate": 8.078381795195956e-06,
"loss": 0.3242,
"step": 640
},
{
"epoch": 0.4106776180698152,
"grad_norm": 0.18770354986190796,
"learning_rate": 8.204804045512011e-06,
"loss": 0.3215,
"step": 650
},
{
"epoch": 0.4169957352708893,
"grad_norm": 0.2027129977941513,
"learning_rate": 8.331226295828066e-06,
"loss": 0.3247,
"step": 660
},
{
"epoch": 0.42331385247196335,
"grad_norm": 0.21426306664943695,
"learning_rate": 8.457648546144122e-06,
"loss": 0.3217,
"step": 670
},
{
"epoch": 0.42963196967303746,
"grad_norm": 0.2167753279209137,
"learning_rate": 8.584070796460177e-06,
"loss": 0.322,
"step": 680
},
{
"epoch": 0.4359500868741115,
"grad_norm": 0.20410393178462982,
"learning_rate": 8.710493046776234e-06,
"loss": 0.3208,
"step": 690
},
{
"epoch": 0.4422682040751856,
"grad_norm": 0.20711293816566467,
"learning_rate": 8.83691529709229e-06,
"loss": 0.319,
"step": 700
},
{
"epoch": 0.4485863212762597,
"grad_norm": 0.20640410482883453,
"learning_rate": 8.963337547408345e-06,
"loss": 0.3172,
"step": 710
},
{
"epoch": 0.45490443847733375,
"grad_norm": 0.2493702918291092,
"learning_rate": 9.0897597977244e-06,
"loss": 0.3177,
"step": 720
},
{
"epoch": 0.4612225556784078,
"grad_norm": 0.24222460389137268,
"learning_rate": 9.216182048040457e-06,
"loss": 0.3167,
"step": 730
},
{
"epoch": 0.4675406728794819,
"grad_norm": 0.20584948360919952,
"learning_rate": 9.34260429835651e-06,
"loss": 0.3165,
"step": 740
},
{
"epoch": 0.473858790080556,
"grad_norm": 0.19482427835464478,
"learning_rate": 9.469026548672568e-06,
"loss": 0.3138,
"step": 750
},
{
"epoch": 0.4801769072816301,
"grad_norm": 0.19475619494915009,
"learning_rate": 9.595448798988623e-06,
"loss": 0.3171,
"step": 760
},
{
"epoch": 0.48649502448270415,
"grad_norm": 0.179108127951622,
"learning_rate": 9.721871049304678e-06,
"loss": 0.3103,
"step": 770
},
{
"epoch": 0.4928131416837782,
"grad_norm": 0.19913727045059204,
"learning_rate": 9.848293299620733e-06,
"loss": 0.314,
"step": 780
},
{
"epoch": 0.4991312588848523,
"grad_norm": 0.23399747908115387,
"learning_rate": 9.97471554993679e-06,
"loss": 0.3125,
"step": 790
},
{
"epoch": 0.5054493760859264,
"grad_norm": 0.19475406408309937,
"learning_rate": 9.999968841159285e-06,
"loss": 0.3118,
"step": 800
},
{
"epoch": 0.5117674932870004,
"grad_norm": 0.18734484910964966,
"learning_rate": 9.999842259034458e-06,
"loss": 0.3128,
"step": 810
},
{
"epoch": 0.5180856104880746,
"grad_norm": 0.2116527259349823,
"learning_rate": 9.99961830866117e-06,
"loss": 0.3072,
"step": 820
},
{
"epoch": 0.5244037276891487,
"grad_norm": 0.20056816935539246,
"learning_rate": 9.999296994400692e-06,
"loss": 0.3117,
"step": 830
},
{
"epoch": 0.5307218448902227,
"grad_norm": 0.187480166554451,
"learning_rate": 9.99887832251038e-06,
"loss": 0.3086,
"step": 840
},
{
"epoch": 0.5370399620912968,
"grad_norm": 0.18775825202465057,
"learning_rate": 9.998362301143562e-06,
"loss": 0.3079,
"step": 850
},
{
"epoch": 0.5433580792923709,
"grad_norm": 0.19840385019779205,
"learning_rate": 9.997748940349378e-06,
"loss": 0.3072,
"step": 860
},
{
"epoch": 0.5496761964934449,
"grad_norm": 0.1993194818496704,
"learning_rate": 9.997038252072573e-06,
"loss": 0.3065,
"step": 870
},
{
"epoch": 0.555994313694519,
"grad_norm": 0.188375785946846,
"learning_rate": 9.996230250153283e-06,
"loss": 0.3075,
"step": 880
},
{
"epoch": 0.5623124308955931,
"grad_norm": 0.21323969960212708,
"learning_rate": 9.995324950326746e-06,
"loss": 0.3064,
"step": 890
},
{
"epoch": 0.5686305480966672,
"grad_norm": 0.2170088142156601,
"learning_rate": 9.994322370223011e-06,
"loss": 0.3007,
"step": 900
},
{
"epoch": 0.5749486652977412,
"grad_norm": 0.1998039036989212,
"learning_rate": 9.993222529366591e-06,
"loss": 0.3022,
"step": 910
},
{
"epoch": 0.5812667824988154,
"grad_norm": 0.20587877929210663,
"learning_rate": 9.992025449176073e-06,
"loss": 0.3001,
"step": 920
},
{
"epoch": 0.5875848996998895,
"grad_norm": 0.20559850335121155,
"learning_rate": 9.990731152963715e-06,
"loss": 0.3068,
"step": 930
},
{
"epoch": 0.5939030169009635,
"grad_norm": 0.2025015950202942,
"learning_rate": 9.989339665934983e-06,
"loss": 0.3042,
"step": 940
},
{
"epoch": 0.6002211341020376,
"grad_norm": 0.19664855301380157,
"learning_rate": 9.987851015188064e-06,
"loss": 0.3045,
"step": 950
},
{
"epoch": 0.6065392513031117,
"grad_norm": 0.19013217091560364,
"learning_rate": 9.986265229713332e-06,
"loss": 0.2992,
"step": 960
},
{
"epoch": 0.6128573685041857,
"grad_norm": 0.18943046033382416,
"learning_rate": 9.984582340392797e-06,
"loss": 0.3017,
"step": 970
},
{
"epoch": 0.6191754857052598,
"grad_norm": 0.19746196269989014,
"learning_rate": 9.982802379999486e-06,
"loss": 0.3016,
"step": 980
},
{
"epoch": 0.6254936029063339,
"grad_norm": 0.19490814208984375,
"learning_rate": 9.98092538319682e-06,
"loss": 0.3004,
"step": 990
},
{
"epoch": 0.6318117201074079,
"grad_norm": 0.20448216795921326,
"learning_rate": 9.978951386537929e-06,
"loss": 0.3003,
"step": 1000
},
{
"epoch": 0.638129837308482,
"grad_norm": 0.2098686397075653,
"learning_rate": 9.976880428464948e-06,
"loss": 0.2992,
"step": 1010
},
{
"epoch": 0.6444479545095562,
"grad_norm": 0.2074064463376999,
"learning_rate": 9.974712549308257e-06,
"loss": 0.2984,
"step": 1020
},
{
"epoch": 0.6507660717106303,
"grad_norm": 0.19775456190109253,
"learning_rate": 9.97244779128571e-06,
"loss": 0.2966,
"step": 1030
},
{
"epoch": 0.6570841889117043,
"grad_norm": 0.20709405839443207,
"learning_rate": 9.970086198501803e-06,
"loss": 0.2983,
"step": 1040
},
{
"epoch": 0.6634023061127784,
"grad_norm": 0.21704506874084473,
"learning_rate": 9.967627816946816e-06,
"loss": 0.2989,
"step": 1050
},
{
"epoch": 0.6697204233138525,
"grad_norm": 0.22157025337219238,
"learning_rate": 9.965072694495922e-06,
"loss": 0.298,
"step": 1060
},
{
"epoch": 0.6760385405149265,
"grad_norm": 0.22472302615642548,
"learning_rate": 9.96242088090825e-06,
"loss": 0.2976,
"step": 1070
},
{
"epoch": 0.6823566577160006,
"grad_norm": 0.2012009471654892,
"learning_rate": 9.959672427825917e-06,
"loss": 0.2935,
"step": 1080
},
{
"epoch": 0.6886747749170747,
"grad_norm": 0.19134068489074707,
"learning_rate": 9.956827388773025e-06,
"loss": 0.2974,
"step": 1090
},
{
"epoch": 0.6949928921181487,
"grad_norm": 0.18882884085178375,
"learning_rate": 9.953885819154615e-06,
"loss": 0.2926,
"step": 1100
},
{
"epoch": 0.7013110093192229,
"grad_norm": 0.2316889613866806,
"learning_rate": 9.950847776255592e-06,
"loss": 0.2979,
"step": 1110
},
{
"epoch": 0.707629126520297,
"grad_norm": 0.21829363703727722,
"learning_rate": 9.947713319239605e-06,
"loss": 0.2947,
"step": 1120
},
{
"epoch": 0.7139472437213711,
"grad_norm": 0.19675135612487793,
"learning_rate": 9.944482509147896e-06,
"loss": 0.2939,
"step": 1130
},
{
"epoch": 0.7202653609224451,
"grad_norm": 0.21681798994541168,
"learning_rate": 9.941155408898117e-06,
"loss": 0.2943,
"step": 1140
},
{
"epoch": 0.7265834781235192,
"grad_norm": 0.18257145583629608,
"learning_rate": 9.937732083283096e-06,
"loss": 0.2917,
"step": 1150
},
{
"epoch": 0.7329015953245933,
"grad_norm": 0.20622026920318604,
"learning_rate": 9.934212598969577e-06,
"loss": 0.2948,
"step": 1160
},
{
"epoch": 0.7392197125256673,
"grad_norm": 0.16587024927139282,
"learning_rate": 9.930597024496933e-06,
"loss": 0.2918,
"step": 1170
},
{
"epoch": 0.7455378297267414,
"grad_norm": 0.1997261643409729,
"learning_rate": 9.926885430275807e-06,
"loss": 0.2922,
"step": 1180
},
{
"epoch": 0.7518559469278155,
"grad_norm": 0.20139716565608978,
"learning_rate": 9.923077888586775e-06,
"loss": 0.2891,
"step": 1190
},
{
"epoch": 0.7581740641288895,
"grad_norm": 0.20793363451957703,
"learning_rate": 9.919174473578901e-06,
"loss": 0.2918,
"step": 1200
},
{
"epoch": 0.7644921813299637,
"grad_norm": 0.19905509054660797,
"learning_rate": 9.915175261268327e-06,
"loss": 0.2929,
"step": 1210
},
{
"epoch": 0.7708102985310378,
"grad_norm": 0.19855041801929474,
"learning_rate": 9.911080329536761e-06,
"loss": 0.2921,
"step": 1220
},
{
"epoch": 0.7771284157321118,
"grad_norm": 0.24103382229804993,
"learning_rate": 9.906889758129994e-06,
"loss": 0.2919,
"step": 1230
},
{
"epoch": 0.7834465329331859,
"grad_norm": 0.24005091190338135,
"learning_rate": 9.902603628656312e-06,
"loss": 0.2921,
"step": 1240
},
{
"epoch": 0.78976465013426,
"grad_norm": 0.19127513468265533,
"learning_rate": 9.898222024584938e-06,
"loss": 0.2911,
"step": 1250
},
{
"epoch": 0.7960827673353341,
"grad_norm": 0.2415689080953598,
"learning_rate": 9.893745031244385e-06,
"loss": 0.2893,
"step": 1260
},
{
"epoch": 0.8024008845364081,
"grad_norm": 0.21930722892284393,
"learning_rate": 9.889172735820803e-06,
"loss": 0.293,
"step": 1270
},
{
"epoch": 0.8087190017374822,
"grad_norm": 0.23149755597114563,
"learning_rate": 9.884505227356281e-06,
"loss": 0.291,
"step": 1280
},
{
"epoch": 0.8150371189385563,
"grad_norm": 0.20088982582092285,
"learning_rate": 9.87974259674711e-06,
"loss": 0.2877,
"step": 1290
},
{
"epoch": 0.8213552361396304,
"grad_norm": 0.201844722032547,
"learning_rate": 9.87488493674202e-06,
"loss": 0.2892,
"step": 1300
},
{
"epoch": 0.8276733533407045,
"grad_norm": 0.2128770351409912,
"learning_rate": 9.86993234194036e-06,
"loss": 0.2882,
"step": 1310
},
{
"epoch": 0.8339914705417786,
"grad_norm": 0.21982018649578094,
"learning_rate": 9.86488490879027e-06,
"loss": 0.2889,
"step": 1320
},
{
"epoch": 0.8403095877428526,
"grad_norm": 0.20911258459091187,
"learning_rate": 9.859742735586801e-06,
"loss": 0.2881,
"step": 1330
},
{
"epoch": 0.8466277049439267,
"grad_norm": 0.22615337371826172,
"learning_rate": 9.854505922469985e-06,
"loss": 0.2896,
"step": 1340
},
{
"epoch": 0.8529458221450008,
"grad_norm": 0.1955297738313675,
"learning_rate": 9.849174571422906e-06,
"loss": 0.2885,
"step": 1350
},
{
"epoch": 0.8592639393460749,
"grad_norm": 0.1870257705450058,
"learning_rate": 9.843748786269704e-06,
"loss": 0.2849,
"step": 1360
},
{
"epoch": 0.8655820565471489,
"grad_norm": 0.20946596562862396,
"learning_rate": 9.838228672673551e-06,
"loss": 0.2873,
"step": 1370
},
{
"epoch": 0.871900173748223,
"grad_norm": 0.18047629296779633,
"learning_rate": 9.832614338134595e-06,
"loss": 0.2862,
"step": 1380
},
{
"epoch": 0.8782182909492972,
"grad_norm": 0.19568774104118347,
"learning_rate": 9.826905891987872e-06,
"loss": 0.2857,
"step": 1390
},
{
"epoch": 0.8845364081503712,
"grad_norm": 0.22279143333435059,
"learning_rate": 9.821103445401167e-06,
"loss": 0.2851,
"step": 1400
},
{
"epoch": 0.8908545253514453,
"grad_norm": 0.21086236834526062,
"learning_rate": 9.81520711137286e-06,
"loss": 0.2849,
"step": 1410
},
{
"epoch": 0.8971726425525194,
"grad_norm": 0.2367515116930008,
"learning_rate": 9.809217004729714e-06,
"loss": 0.2821,
"step": 1420
},
{
"epoch": 0.9034907597535934,
"grad_norm": 0.21128222346305847,
"learning_rate": 9.803133242124649e-06,
"loss": 0.2857,
"step": 1430
},
{
"epoch": 0.9098088769546675,
"grad_norm": 0.22519482672214508,
"learning_rate": 9.796955942034465e-06,
"loss": 0.2852,
"step": 1440
},
{
"epoch": 0.9161269941557416,
"grad_norm": 0.19642499089241028,
"learning_rate": 9.790685224757534e-06,
"loss": 0.2823,
"step": 1450
},
{
"epoch": 0.9224451113568156,
"grad_norm": 0.21369688212871552,
"learning_rate": 9.784321212411463e-06,
"loss": 0.2839,
"step": 1460
},
{
"epoch": 0.9287632285578897,
"grad_norm": 0.21286526322364807,
"learning_rate": 9.777864028930705e-06,
"loss": 0.2824,
"step": 1470
},
{
"epoch": 0.9350813457589638,
"grad_norm": 0.22185811400413513,
"learning_rate": 9.771313800064157e-06,
"loss": 0.2835,
"step": 1480
},
{
"epoch": 0.941399462960038,
"grad_norm": 0.2697184383869171,
"learning_rate": 9.764670653372709e-06,
"loss": 0.2827,
"step": 1490
},
{
"epoch": 0.947717580161112,
"grad_norm": 0.18580107390880585,
"learning_rate": 9.757934718226751e-06,
"loss": 0.2835,
"step": 1500
},
{
"epoch": 0.9540356973621861,
"grad_norm": 0.19771607220172882,
"learning_rate": 9.751106125803663e-06,
"loss": 0.2822,
"step": 1510
},
{
"epoch": 0.9603538145632602,
"grad_norm": 0.21847136318683624,
"learning_rate": 9.744185009085258e-06,
"loss": 0.284,
"step": 1520
},
{
"epoch": 0.9666719317643342,
"grad_norm": 0.18815948069095612,
"learning_rate": 9.73717150285519e-06,
"loss": 0.2819,
"step": 1530
},
{
"epoch": 0.9729900489654083,
"grad_norm": 0.19956186413764954,
"learning_rate": 9.730065743696332e-06,
"loss": 0.2828,
"step": 1540
},
{
"epoch": 0.9793081661664824,
"grad_norm": 0.18478693068027496,
"learning_rate": 9.722867869988112e-06,
"loss": 0.2819,
"step": 1550
},
{
"epoch": 0.9856262833675564,
"grad_norm": 0.21556143462657928,
"learning_rate": 9.715578021903827e-06,
"loss": 0.2805,
"step": 1560
},
{
"epoch": 0.9919444005686305,
"grad_norm": 0.1989905834197998,
"learning_rate": 9.7081963414079e-06,
"loss": 0.2788,
"step": 1570
},
{
"epoch": 0.9982625177697046,
"grad_norm": 0.1941995471715927,
"learning_rate": 9.70072297225313e-06,
"loss": 0.2804,
"step": 1580
},
{
"epoch": 1.0050544937608592,
"grad_norm": 0.192391499876976,
"learning_rate": 9.693158059977879e-06,
"loss": 0.2898,
"step": 1590
},
{
"epoch": 1.0113726109619334,
"grad_norm": 0.19495341181755066,
"learning_rate": 9.685501751903246e-06,
"loss": 0.2747,
"step": 1600
},
{
"epoch": 1.0176907281630074,
"grad_norm": 0.1872604936361313,
"learning_rate": 9.677754197130196e-06,
"loss": 0.2749,
"step": 1610
},
{
"epoch": 1.0240088453640814,
"grad_norm": 0.21903474628925323,
"learning_rate": 9.669915546536659e-06,
"loss": 0.2726,
"step": 1620
},
{
"epoch": 1.0303269625651557,
"grad_norm": 0.22876089811325073,
"learning_rate": 9.661985952774584e-06,
"loss": 0.2722,
"step": 1630
},
{
"epoch": 1.0366450797662297,
"grad_norm": 0.19803361594676971,
"learning_rate": 9.653965570266977e-06,
"loss": 0.2723,
"step": 1640
},
{
"epoch": 1.0429631969673037,
"grad_norm": 0.18463590741157532,
"learning_rate": 9.645854555204882e-06,
"loss": 0.2708,
"step": 1650
},
{
"epoch": 1.0492813141683779,
"grad_norm": 0.18571729958057404,
"learning_rate": 9.637653065544349e-06,
"loss": 0.2726,
"step": 1660
},
{
"epoch": 1.055599431369452,
"grad_norm": 0.199079692363739,
"learning_rate": 9.629361261003353e-06,
"loss": 0.2738,
"step": 1670
},
{
"epoch": 1.061917548570526,
"grad_norm": 0.20288918912410736,
"learning_rate": 9.620979303058686e-06,
"loss": 0.2746,
"step": 1680
},
{
"epoch": 1.0682356657716001,
"grad_norm": 0.2032773643732071,
"learning_rate": 9.612507354942811e-06,
"loss": 0.2736,
"step": 1690
},
{
"epoch": 1.0745537829726741,
"grad_norm": 0.19241447746753693,
"learning_rate": 9.603945581640682e-06,
"loss": 0.2721,
"step": 1700
},
{
"epoch": 1.0808719001737481,
"grad_norm": 0.18638016283512115,
"learning_rate": 9.595294149886532e-06,
"loss": 0.27,
"step": 1710
},
{
"epoch": 1.0871900173748223,
"grad_norm": 0.1852736473083496,
"learning_rate": 9.58655322816063e-06,
"loss": 0.2714,
"step": 1720
},
{
"epoch": 1.0935081345758964,
"grad_norm": 0.1990862339735031,
"learning_rate": 9.577722986685992e-06,
"loss": 0.2706,
"step": 1730
},
{
"epoch": 1.0998262517769706,
"grad_norm": 0.19899272918701172,
"learning_rate": 9.568803597425072e-06,
"loss": 0.275,
"step": 1740
},
{
"epoch": 1.1061443689780446,
"grad_norm": 0.18742632865905762,
"learning_rate": 9.559795234076414e-06,
"loss": 0.2721,
"step": 1750
},
{
"epoch": 1.1124624861791186,
"grad_norm": 0.223663330078125,
"learning_rate": 9.550698072071263e-06,
"loss": 0.2716,
"step": 1760
},
{
"epoch": 1.1187806033801928,
"grad_norm": 0.21346202492713928,
"learning_rate": 9.541512288570155e-06,
"loss": 0.274,
"step": 1770
},
{
"epoch": 1.1250987205812668,
"grad_norm": 0.19517794251441956,
"learning_rate": 9.532238062459465e-06,
"loss": 0.2711,
"step": 1780
},
{
"epoch": 1.1314168377823408,
"grad_norm": 0.18628506362438202,
"learning_rate": 9.522875574347917e-06,
"loss": 0.2719,
"step": 1790
},
{
"epoch": 1.137734954983415,
"grad_norm": 0.2409992814064026,
"learning_rate": 9.51342500656308e-06,
"loss": 0.2704,
"step": 1800
},
{
"epoch": 1.144053072184489,
"grad_norm": 0.2048967182636261,
"learning_rate": 9.503886543147804e-06,
"loss": 0.2703,
"step": 1810
},
{
"epoch": 1.150371189385563,
"grad_norm": 0.1800081878900528,
"learning_rate": 9.494260369856649e-06,
"loss": 0.2693,
"step": 1820
},
{
"epoch": 1.1566893065866373,
"grad_norm": 0.1908334493637085,
"learning_rate": 9.484546674152253e-06,
"loss": 0.2705,
"step": 1830
},
{
"epoch": 1.1630074237877113,
"grad_norm": 0.18866339325904846,
"learning_rate": 9.47474564520169e-06,
"loss": 0.2695,
"step": 1840
},
{
"epoch": 1.1693255409887853,
"grad_norm": 0.17103448510169983,
"learning_rate": 9.464857473872788e-06,
"loss": 0.2699,
"step": 1850
},
{
"epoch": 1.1756436581898595,
"grad_norm": 0.1825484037399292,
"learning_rate": 9.454882352730405e-06,
"loss": 0.2702,
"step": 1860
},
{
"epoch": 1.1819617753909335,
"grad_norm": 0.21534956991672516,
"learning_rate": 9.444820476032687e-06,
"loss": 0.2701,
"step": 1870
},
{
"epoch": 1.1882798925920075,
"grad_norm": 0.20504914224147797,
"learning_rate": 9.434672039727275e-06,
"loss": 0.2668,
"step": 1880
},
{
"epoch": 1.1945980097930817,
"grad_norm": 0.1951032131910324,
"learning_rate": 9.424437241447497e-06,
"loss": 0.2681,
"step": 1890
},
{
"epoch": 1.2009161269941557,
"grad_norm": 0.24697691202163696,
"learning_rate": 9.41411628050852e-06,
"loss": 0.2687,
"step": 1900
},
{
"epoch": 1.2072342441952297,
"grad_norm": 0.1977747082710266,
"learning_rate": 9.40370935790346e-06,
"loss": 0.2706,
"step": 1910
},
{
"epoch": 1.213552361396304,
"grad_norm": 0.2046399563550949,
"learning_rate": 9.393216676299481e-06,
"loss": 0.2672,
"step": 1920
},
{
"epoch": 1.219870478597378,
"grad_norm": 0.21050798892974854,
"learning_rate": 9.38263844003383e-06,
"loss": 0.2677,
"step": 1930
},
{
"epoch": 1.226188595798452,
"grad_norm": 0.18349182605743408,
"learning_rate": 9.371974855109876e-06,
"loss": 0.2676,
"step": 1940
},
{
"epoch": 1.2325067129995262,
"grad_norm": 0.2518089711666107,
"learning_rate": 9.361226129193086e-06,
"loss": 0.2659,
"step": 1950
},
{
"epoch": 1.2388248302006002,
"grad_norm": 0.18753299117088318,
"learning_rate": 9.350392471606989e-06,
"loss": 0.2641,
"step": 1960
},
{
"epoch": 1.2451429474016744,
"grad_norm": 0.2322888821363449,
"learning_rate": 9.339474093329094e-06,
"loss": 0.2675,
"step": 1970
},
{
"epoch": 1.2514610646027484,
"grad_norm": 0.19198372960090637,
"learning_rate": 9.328471206986778e-06,
"loss": 0.269,
"step": 1980
},
{
"epoch": 1.2577791818038224,
"grad_norm": 0.1776944249868393,
"learning_rate": 9.317384026853161e-06,
"loss": 0.2673,
"step": 1990
},
{
"epoch": 1.2640972990048964,
"grad_norm": 0.21030068397521973,
"learning_rate": 9.306212768842914e-06,
"loss": 0.2672,
"step": 2000
},
{
"epoch": 1.2704154162059706,
"grad_norm": 0.25448349118232727,
"learning_rate": 9.294957650508065e-06,
"loss": 0.2685,
"step": 2010
},
{
"epoch": 1.2767335334070447,
"grad_norm": 0.1928747445344925,
"learning_rate": 9.283618891033764e-06,
"loss": 0.2669,
"step": 2020
},
{
"epoch": 1.2830516506081189,
"grad_norm": 0.19075071811676025,
"learning_rate": 9.272196711234001e-06,
"loss": 0.2658,
"step": 2030
},
{
"epoch": 1.2893697678091929,
"grad_norm": 0.18030743300914764,
"learning_rate": 9.260691333547329e-06,
"loss": 0.269,
"step": 2040
},
{
"epoch": 1.2956878850102669,
"grad_norm": 0.20846770703792572,
"learning_rate": 9.249102982032506e-06,
"loss": 0.268,
"step": 2050
},
{
"epoch": 1.3020060022113409,
"grad_norm": 0.18990422785282135,
"learning_rate": 9.237431882364149e-06,
"loss": 0.2674,
"step": 2060
},
{
"epoch": 1.308324119412415,
"grad_norm": 0.21943022310733795,
"learning_rate": 9.22567826182834e-06,
"loss": 0.2655,
"step": 2070
},
{
"epoch": 1.3146422366134891,
"grad_norm": 0.21548326313495636,
"learning_rate": 9.213842349318185e-06,
"loss": 0.2657,
"step": 2080
},
{
"epoch": 1.3209603538145633,
"grad_norm": 0.18391166627407074,
"learning_rate": 9.201924375329372e-06,
"loss": 0.2663,
"step": 2090
},
{
"epoch": 1.3272784710156373,
"grad_norm": 0.17586641013622284,
"learning_rate": 9.189924571955671e-06,
"loss": 0.2624,
"step": 2100
},
{
"epoch": 1.3335965882167113,
"grad_norm": 0.19197408854961395,
"learning_rate": 9.177843172884423e-06,
"loss": 0.2647,
"step": 2110
},
{
"epoch": 1.3399147054177856,
"grad_norm": 0.21062326431274414,
"learning_rate": 9.165680413391987e-06,
"loss": 0.265,
"step": 2120
},
{
"epoch": 1.3462328226188596,
"grad_norm": 0.19581826031208038,
"learning_rate": 9.153436530339147e-06,
"loss": 0.2638,
"step": 2130
},
{
"epoch": 1.3525509398199338,
"grad_norm": 0.2166038602590561,
"learning_rate": 9.14111176216652e-06,
"loss": 0.2657,
"step": 2140
},
{
"epoch": 1.3588690570210078,
"grad_norm": 0.2010088860988617,
"learning_rate": 9.128706348889895e-06,
"loss": 0.2638,
"step": 2150
},
{
"epoch": 1.3651871742220818,
"grad_norm": 0.2053796499967575,
"learning_rate": 9.116220532095563e-06,
"loss": 0.264,
"step": 2160
},
{
"epoch": 1.3715052914231558,
"grad_norm": 0.17751292884349823,
"learning_rate": 9.10365455493562e-06,
"loss": 0.2653,
"step": 2170
},
{
"epoch": 1.37782340862423,
"grad_norm": 0.22349873185157776,
"learning_rate": 9.091008662123224e-06,
"loss": 0.2642,
"step": 2180
},
{
"epoch": 1.384141525825304,
"grad_norm": 0.1846960186958313,
"learning_rate": 9.078283099927829e-06,
"loss": 0.2653,
"step": 2190
},
{
"epoch": 1.3904596430263783,
"grad_norm": 0.2242564558982849,
"learning_rate": 9.065478116170394e-06,
"loss": 0.2621,
"step": 2200
},
{
"epoch": 1.3967777602274523,
"grad_norm": 0.241655170917511,
"learning_rate": 9.052593960218556e-06,
"loss": 0.2652,
"step": 2210
},
{
"epoch": 1.4030958774285263,
"grad_norm": 0.19567032158374786,
"learning_rate": 9.039630882981769e-06,
"loss": 0.2642,
"step": 2220
},
{
"epoch": 1.4094139946296003,
"grad_norm": 0.21501778066158295,
"learning_rate": 9.026589136906422e-06,
"loss": 0.2625,
"step": 2230
},
{
"epoch": 1.4157321118306745,
"grad_norm": 0.19091379642486572,
"learning_rate": 9.013468975970923e-06,
"loss": 0.2646,
"step": 2240
},
{
"epoch": 1.4220502290317485,
"grad_norm": 0.17913809418678284,
"learning_rate": 9.00027065568075e-06,
"loss": 0.2638,
"step": 2250
},
{
"epoch": 1.4283683462328227,
"grad_norm": 0.18866880238056183,
"learning_rate": 8.986994433063476e-06,
"loss": 0.2634,
"step": 2260
},
{
"epoch": 1.4346864634338967,
"grad_norm": 0.20900848507881165,
"learning_rate": 8.973640566663769e-06,
"loss": 0.2643,
"step": 2270
},
{
"epoch": 1.4410045806349707,
"grad_norm": 0.1879900097846985,
"learning_rate": 8.96020931653835e-06,
"loss": 0.2633,
"step": 2280
},
{
"epoch": 1.4473226978360447,
"grad_norm": 0.17993497848510742,
"learning_rate": 8.946700944250925e-06,
"loss": 0.2628,
"step": 2290
},
{
"epoch": 1.453640815037119,
"grad_norm": 0.2076902538537979,
"learning_rate": 8.93311571286711e-06,
"loss": 0.2629,
"step": 2300
},
{
"epoch": 1.459958932238193,
"grad_norm": 0.24252377450466156,
"learning_rate": 8.919453886949285e-06,
"loss": 0.2625,
"step": 2310
},
{
"epoch": 1.4662770494392672,
"grad_norm": 0.19852754473686218,
"learning_rate": 8.905715732551457e-06,
"loss": 0.263,
"step": 2320
},
{
"epoch": 1.4725951666403412,
"grad_norm": 0.1704029142856598,
"learning_rate": 8.89190151721407e-06,
"loss": 0.2642,
"step": 2330
},
{
"epoch": 1.4789132838414152,
"grad_norm": 0.19873927533626556,
"learning_rate": 8.878011509958804e-06,
"loss": 0.2612,
"step": 2340
},
{
"epoch": 1.4852314010424894,
"grad_norm": 0.1872422695159912,
"learning_rate": 8.864045981283327e-06,
"loss": 0.259,
"step": 2350
},
{
"epoch": 1.4915495182435634,
"grad_norm": 0.20828309655189514,
"learning_rate": 8.850005203156035e-06,
"loss": 0.2614,
"step": 2360
},
{
"epoch": 1.4978676354446376,
"grad_norm": 0.18343457579612732,
"learning_rate": 8.835889449010743e-06,
"loss": 0.2618,
"step": 2370
},
{
"epoch": 1.5041857526457116,
"grad_norm": 0.1891496777534485,
"learning_rate": 8.821698993741381e-06,
"loss": 0.264,
"step": 2380
},
{
"epoch": 1.5105038698467856,
"grad_norm": 0.19773255288600922,
"learning_rate": 8.80743411369662e-06,
"loss": 0.2609,
"step": 2390
},
{
"epoch": 1.5168219870478596,
"grad_norm": 0.20208434760570526,
"learning_rate": 8.7930950866745e-06,
"loss": 0.2632,
"step": 2400
},
{
"epoch": 1.5231401042489339,
"grad_norm": 0.2181108295917511,
"learning_rate": 8.778682191917019e-06,
"loss": 0.2619,
"step": 2410
},
{
"epoch": 1.5294582214500079,
"grad_norm": 0.20136655867099762,
"learning_rate": 8.764195710104699e-06,
"loss": 0.2625,
"step": 2420
},
{
"epoch": 1.535776338651082,
"grad_norm": 0.254148930311203,
"learning_rate": 8.749635923351108e-06,
"loss": 0.2601,
"step": 2430
},
{
"epoch": 1.542094455852156,
"grad_norm": 0.2224704623222351,
"learning_rate": 8.73500311519738e-06,
"loss": 0.2619,
"step": 2440
},
{
"epoch": 1.54841257305323,
"grad_norm": 0.17686180770397186,
"learning_rate": 8.720297570606686e-06,
"loss": 0.2607,
"step": 2450
},
{
"epoch": 1.554730690254304,
"grad_norm": 0.18937917053699493,
"learning_rate": 8.705519575958684e-06,
"loss": 0.2616,
"step": 2460
},
{
"epoch": 1.5610488074553783,
"grad_norm": 0.19412845373153687,
"learning_rate": 8.690669419043945e-06,
"loss": 0.2622,
"step": 2470
},
{
"epoch": 1.5673669246564523,
"grad_norm": 0.19065144658088684,
"learning_rate": 8.675747389058342e-06,
"loss": 0.2615,
"step": 2480
},
{
"epoch": 1.5736850418575266,
"grad_norm": 0.17359939217567444,
"learning_rate": 8.660753776597433e-06,
"loss": 0.261,
"step": 2490
},
{
"epoch": 1.5800031590586006,
"grad_norm": 0.19566282629966736,
"learning_rate": 8.645688873650785e-06,
"loss": 0.2623,
"step": 2500
},
{
"epoch": 1.5863212762596746,
"grad_norm": 0.1743886023759842,
"learning_rate": 8.630552973596294e-06,
"loss": 0.2613,
"step": 2510
},
{
"epoch": 1.5926393934607486,
"grad_norm": 0.20789675414562225,
"learning_rate": 8.615346371194475e-06,
"loss": 0.2603,
"step": 2520
},
{
"epoch": 1.5989575106618228,
"grad_norm": 0.17617076635360718,
"learning_rate": 8.600069362582722e-06,
"loss": 0.2613,
"step": 2530
},
{
"epoch": 1.605275627862897,
"grad_norm": 0.18429051339626312,
"learning_rate": 8.58472224526953e-06,
"loss": 0.2623,
"step": 2540
},
{
"epoch": 1.611593745063971,
"grad_norm": 0.2026170939207077,
"learning_rate": 8.569305318128717e-06,
"loss": 0.2614,
"step": 2550
},
{
"epoch": 1.617911862265045,
"grad_norm": 0.1982942372560501,
"learning_rate": 8.553818881393595e-06,
"loss": 0.2591,
"step": 2560
},
{
"epoch": 1.624229979466119,
"grad_norm": 0.17273586988449097,
"learning_rate": 8.538263236651119e-06,
"loss": 0.2612,
"step": 2570
},
{
"epoch": 1.630548096667193,
"grad_norm": 0.19549575448036194,
"learning_rate": 8.522638686836024e-06,
"loss": 0.259,
"step": 2580
},
{
"epoch": 1.6368662138682673,
"grad_norm": 0.23418502509593964,
"learning_rate": 8.50694553622492e-06,
"loss": 0.2582,
"step": 2590
},
{
"epoch": 1.6431843310693415,
"grad_norm": 0.19169150292873383,
"learning_rate": 8.491184090430365e-06,
"loss": 0.2592,
"step": 2600
},
{
"epoch": 1.6495024482704155,
"grad_norm": 0.20778028666973114,
"learning_rate": 8.475354656394916e-06,
"loss": 0.2624,
"step": 2610
},
{
"epoch": 1.6558205654714895,
"grad_norm": 0.19188308715820312,
"learning_rate": 8.459457542385154e-06,
"loss": 0.2589,
"step": 2620
},
{
"epoch": 1.6621386826725635,
"grad_norm": 0.187831848859787,
"learning_rate": 8.44349305798567e-06,
"loss": 0.2594,
"step": 2630
},
{
"epoch": 1.6684567998736377,
"grad_norm": 0.20327366888523102,
"learning_rate": 8.427461514093056e-06,
"loss": 0.2595,
"step": 2640
},
{
"epoch": 1.6747749170747117,
"grad_norm": 0.19990861415863037,
"learning_rate": 8.411363222909825e-06,
"loss": 0.2582,
"step": 2650
},
{
"epoch": 1.681093034275786,
"grad_norm": 0.19513264298439026,
"learning_rate": 8.395198497938354e-06,
"loss": 0.2587,
"step": 2660
},
{
"epoch": 1.68741115147686,
"grad_norm": 0.18786491453647614,
"learning_rate": 8.378967653974766e-06,
"loss": 0.2561,
"step": 2670
},
{
"epoch": 1.693729268677934,
"grad_norm": 0.2018646001815796,
"learning_rate": 8.362671007102798e-06,
"loss": 0.2582,
"step": 2680
},
{
"epoch": 1.700047385879008,
"grad_norm": 0.17802584171295166,
"learning_rate": 8.34630887468766e-06,
"loss": 0.2584,
"step": 2690
},
{
"epoch": 1.7063655030800822,
"grad_norm": 0.1678951233625412,
"learning_rate": 8.329881575369838e-06,
"loss": 0.2574,
"step": 2700
},
{
"epoch": 1.7126836202811562,
"grad_norm": 0.18521824479103088,
"learning_rate": 8.313389429058895e-06,
"loss": 0.26,
"step": 2710
},
{
"epoch": 1.7190017374822304,
"grad_norm": 0.18977366387844086,
"learning_rate": 8.296832756927245e-06,
"loss": 0.2586,
"step": 2720
},
{
"epoch": 1.7253198546833044,
"grad_norm": 0.19465599954128265,
"learning_rate": 8.280211881403892e-06,
"loss": 0.2599,
"step": 2730
},
{
"epoch": 1.7316379718843784,
"grad_norm": 0.20573335886001587,
"learning_rate": 8.263527126168156e-06,
"loss": 0.2582,
"step": 2740
},
{
"epoch": 1.7379560890854524,
"grad_norm": 0.18216483294963837,
"learning_rate": 8.246778816143365e-06,
"loss": 0.2594,
"step": 2750
},
{
"epoch": 1.7442742062865266,
"grad_norm": 0.1724158674478531,
"learning_rate": 8.229967277490533e-06,
"loss": 0.2585,
"step": 2760
},
{
"epoch": 1.7505923234876009,
"grad_norm": 0.22212329506874084,
"learning_rate": 8.213092837602004e-06,
"loss": 0.2587,
"step": 2770
},
{
"epoch": 1.7569104406886749,
"grad_norm": 0.21226562559604645,
"learning_rate": 8.196155825095073e-06,
"loss": 0.2592,
"step": 2780
},
{
"epoch": 1.7632285578897489,
"grad_norm": 0.1901644766330719,
"learning_rate": 8.179156569805597e-06,
"loss": 0.2584,
"step": 2790
},
{
"epoch": 1.7695466750908229,
"grad_norm": 0.1988213062286377,
"learning_rate": 8.16209540278156e-06,
"loss": 0.2595,
"step": 2800
},
{
"epoch": 1.7758647922918969,
"grad_norm": 0.1761639416217804,
"learning_rate": 8.144972656276637e-06,
"loss": 0.2576,
"step": 2810
},
{
"epoch": 1.782182909492971,
"grad_norm": 0.2082483023405075,
"learning_rate": 8.127788663743712e-06,
"loss": 0.2576,
"step": 2820
},
{
"epoch": 1.7885010266940453,
"grad_norm": 0.17774218320846558,
"learning_rate": 8.110543759828395e-06,
"loss": 0.2574,
"step": 2830
},
{
"epoch": 1.7948191438951193,
"grad_norm": 0.18034055829048157,
"learning_rate": 8.0932382803625e-06,
"loss": 0.2572,
"step": 2840
},
{
"epoch": 1.8011372610961933,
"grad_norm": 0.21685677766799927,
"learning_rate": 8.075872562357502e-06,
"loss": 0.2585,
"step": 2850
},
{
"epoch": 1.8074553782972673,
"grad_norm": 0.18717004358768463,
"learning_rate": 8.058446943997977e-06,
"loss": 0.258,
"step": 2860
},
{
"epoch": 1.8137734954983415,
"grad_norm": 0.1846955120563507,
"learning_rate": 8.040961764635025e-06,
"loss": 0.2573,
"step": 2870
},
{
"epoch": 1.8200916126994156,
"grad_norm": 0.17588602006435394,
"learning_rate": 8.02341736477964e-06,
"loss": 0.2585,
"step": 2880
},
{
"epoch": 1.8264097299004898,
"grad_norm": 0.16006359457969666,
"learning_rate": 8.0058140860961e-06,
"loss": 0.2581,
"step": 2890
},
{
"epoch": 1.8327278471015638,
"grad_norm": 0.20451048016548157,
"learning_rate": 7.988152271395304e-06,
"loss": 0.2569,
"step": 2900
},
{
"epoch": 1.8390459643026378,
"grad_norm": 0.22039860486984253,
"learning_rate": 7.970432264628094e-06,
"loss": 0.2548,
"step": 2910
},
{
"epoch": 1.8453640815037118,
"grad_norm": 0.20109356939792633,
"learning_rate": 7.95265441087856e-06,
"loss": 0.2557,
"step": 2920
},
{
"epoch": 1.851682198704786,
"grad_norm": 0.18628036975860596,
"learning_rate": 7.934819056357321e-06,
"loss": 0.255,
"step": 2930
},
{
"epoch": 1.85800031590586,
"grad_norm": 0.17076027393341064,
"learning_rate": 7.916926548394783e-06,
"loss": 0.2575,
"step": 2940
},
{
"epoch": 1.8643184331069342,
"grad_norm": 0.1676408052444458,
"learning_rate": 7.898977235434368e-06,
"loss": 0.2569,
"step": 2950
},
{
"epoch": 1.8706365503080082,
"grad_norm": 0.18232934176921844,
"learning_rate": 7.88097146702574e-06,
"loss": 0.2548,
"step": 2960
},
{
"epoch": 1.8769546675090822,
"grad_norm": 0.1734633445739746,
"learning_rate": 7.862909593817984e-06,
"loss": 0.2568,
"step": 2970
},
{
"epoch": 1.8832727847101562,
"grad_norm": 0.17797045409679413,
"learning_rate": 7.844791967552792e-06,
"loss": 0.2586,
"step": 2980
},
{
"epoch": 1.8895909019112305,
"grad_norm": 0.19380344450473785,
"learning_rate": 7.826618941057597e-06,
"loss": 0.2567,
"step": 2990
},
{
"epoch": 1.8959090191123047,
"grad_norm": 0.20007206499576569,
"learning_rate": 7.808390868238723e-06,
"loss": 0.2575,
"step": 3000
},
{
"epoch": 1.9022271363133787,
"grad_norm": 0.18448038399219513,
"learning_rate": 7.790108104074468e-06,
"loss": 0.2574,
"step": 3010
},
{
"epoch": 1.9085452535144527,
"grad_norm": 0.17711378633975983,
"learning_rate": 7.77177100460821e-06,
"loss": 0.2578,
"step": 3020
},
{
"epoch": 1.9148633707155267,
"grad_norm": 0.18232811987400055,
"learning_rate": 7.753379926941468e-06,
"loss": 0.2577,
"step": 3030
},
{
"epoch": 1.9211814879166007,
"grad_norm": 0.1973661184310913,
"learning_rate": 7.734935229226945e-06,
"loss": 0.254,
"step": 3040
},
{
"epoch": 1.927499605117675,
"grad_norm": 0.17610979080200195,
"learning_rate": 7.716437270661552e-06,
"loss": 0.2541,
"step": 3050
},
{
"epoch": 1.9338177223187492,
"grad_norm": 0.18116143345832825,
"learning_rate": 7.697886411479422e-06,
"loss": 0.2562,
"step": 3060
},
{
"epoch": 1.9401358395198232,
"grad_norm": 0.19937658309936523,
"learning_rate": 7.679283012944887e-06,
"loss": 0.2565,
"step": 3070
},
{
"epoch": 1.9464539567208972,
"grad_norm": 0.17094001173973083,
"learning_rate": 7.660627437345438e-06,
"loss": 0.2546,
"step": 3080
},
{
"epoch": 1.9527720739219712,
"grad_norm": 0.17260311543941498,
"learning_rate": 7.641920047984683e-06,
"loss": 0.2535,
"step": 3090
},
{
"epoch": 1.9590901911230452,
"grad_norm": 0.16419674456119537,
"learning_rate": 7.6231612091752625e-06,
"loss": 0.2574,
"step": 3100
},
{
"epoch": 1.9654083083241194,
"grad_norm": 0.17597036063671112,
"learning_rate": 7.604351286231759e-06,
"loss": 0.2538,
"step": 3110
},
{
"epoch": 1.9717264255251936,
"grad_norm": 0.19706901907920837,
"learning_rate": 7.585490645463574e-06,
"loss": 0.2525,
"step": 3120
},
{
"epoch": 1.9780445427262676,
"grad_norm": 0.16717633605003357,
"learning_rate": 7.5665796541678106e-06,
"loss": 0.2561,
"step": 3130
},
{
"epoch": 1.9843626599273416,
"grad_norm": 0.18098637461662292,
"learning_rate": 7.547618680622104e-06,
"loss": 0.2538,
"step": 3140
},
{
"epoch": 1.9906807771284156,
"grad_norm": 0.19447918236255646,
"learning_rate": 7.528608094077464e-06,
"loss": 0.2556,
"step": 3150
},
{
"epoch": 1.9969988943294898,
"grad_norm": 0.21584630012512207,
"learning_rate": 7.50954826475107e-06,
"loss": 0.2532,
"step": 3160
},
{
"epoch": 2.0037908703206444,
"grad_norm": 0.18063998222351074,
"learning_rate": 7.490439563819073e-06,
"loss": 0.2674,
"step": 3170
},
{
"epoch": 2.0101089875217184,
"grad_norm": 0.20729950070381165,
"learning_rate": 7.4712823634093605e-06,
"loss": 0.2439,
"step": 3180
},
{
"epoch": 2.0164271047227924,
"grad_norm": 0.16232196986675262,
"learning_rate": 7.452077036594311e-06,
"loss": 0.245,
"step": 3190
},
{
"epoch": 2.022745221923867,
"grad_norm": 0.172638937830925,
"learning_rate": 7.432823957383533e-06,
"loss": 0.245,
"step": 3200
},
{
"epoch": 2.029063339124941,
"grad_norm": 0.16291241347789764,
"learning_rate": 7.413523500716571e-06,
"loss": 0.2437,
"step": 3210
},
{
"epoch": 2.035381456326015,
"grad_norm": 0.1787315011024475,
"learning_rate": 7.394176042455619e-06,
"loss": 0.2467,
"step": 3220
},
{
"epoch": 2.041699573527089,
"grad_norm": 0.19181819260120392,
"learning_rate": 7.374781959378185e-06,
"loss": 0.2449,
"step": 3230
},
{
"epoch": 2.048017690728163,
"grad_norm": 0.17782440781593323,
"learning_rate": 7.355341629169768e-06,
"loss": 0.2457,
"step": 3240
},
{
"epoch": 2.0543358079292373,
"grad_norm": 0.18428935110569,
"learning_rate": 7.335855430416489e-06,
"loss": 0.2475,
"step": 3250
},
{
"epoch": 2.0606539251303113,
"grad_norm": 0.16668711602687836,
"learning_rate": 7.3163237425977305e-06,
"loss": 0.2442,
"step": 3260
},
{
"epoch": 2.0669720423313853,
"grad_norm": 0.20328602194786072,
"learning_rate": 7.296746946078737e-06,
"loss": 0.2428,
"step": 3270
},
{
"epoch": 2.0732901595324593,
"grad_norm": 0.17452338337898254,
"learning_rate": 7.277125422103213e-06,
"loss": 0.2434,
"step": 3280
},
{
"epoch": 2.0796082767335333,
"grad_norm": 0.19674983620643616,
"learning_rate": 7.2574595527859e-06,
"loss": 0.2459,
"step": 3290
},
{
"epoch": 2.0859263939346073,
"grad_norm": 0.16700546443462372,
"learning_rate": 7.23774972110513e-06,
"loss": 0.2441,
"step": 3300
},
{
"epoch": 2.0922445111356818,
"grad_norm": 0.1824389100074768,
"learning_rate": 7.217996310895367e-06,
"loss": 0.2447,
"step": 3310
},
{
"epoch": 2.0985626283367558,
"grad_norm": 0.1628822386264801,
"learning_rate": 7.19819970683974e-06,
"loss": 0.245,
"step": 3320
},
{
"epoch": 2.10488074553783,
"grad_norm": 0.19150730967521667,
"learning_rate": 7.178360294462545e-06,
"loss": 0.2439,
"step": 3330
},
{
"epoch": 2.111198862738904,
"grad_norm": 0.1673995554447174,
"learning_rate": 7.158478460121735e-06,
"loss": 0.2442,
"step": 3340
},
{
"epoch": 2.117516979939978,
"grad_norm": 0.19296851754188538,
"learning_rate": 7.138554591001405e-06,
"loss": 0.246,
"step": 3350
},
{
"epoch": 2.123835097141052,
"grad_norm": 0.17618988454341888,
"learning_rate": 7.118589075104243e-06,
"loss": 0.2418,
"step": 3360
},
{
"epoch": 2.1301532143421262,
"grad_norm": 0.19375811517238617,
"learning_rate": 7.0985823012439745e-06,
"loss": 0.2429,
"step": 3370
},
{
"epoch": 2.1364713315432002,
"grad_norm": 0.20015262067317963,
"learning_rate": 7.078534659037801e-06,
"loss": 0.2439,
"step": 3380
},
{
"epoch": 2.1427894487442742,
"grad_norm": 0.1756194531917572,
"learning_rate": 7.0584465388988e-06,
"loss": 0.2441,
"step": 3390
},
{
"epoch": 2.1491075659453482,
"grad_norm": 0.18751130998134613,
"learning_rate": 7.038318332028326e-06,
"loss": 0.2442,
"step": 3400
},
{
"epoch": 2.1554256831464222,
"grad_norm": 0.16298574209213257,
"learning_rate": 7.018150430408394e-06,
"loss": 0.2447,
"step": 3410
},
{
"epoch": 2.1617438003474962,
"grad_norm": 0.20823705196380615,
"learning_rate": 6.997943226794051e-06,
"loss": 0.2441,
"step": 3420
},
{
"epoch": 2.1680619175485707,
"grad_norm": 0.19422686100006104,
"learning_rate": 6.97769711470571e-06,
"loss": 0.2432,
"step": 3430
},
{
"epoch": 2.1743800347496447,
"grad_norm": 0.16952840983867645,
"learning_rate": 6.95741248842151e-06,
"loss": 0.2443,
"step": 3440
},
{
"epoch": 2.1806981519507187,
"grad_norm": 0.17325712740421295,
"learning_rate": 6.937089742969615e-06,
"loss": 0.2441,
"step": 3450
},
{
"epoch": 2.1870162691517927,
"grad_norm": 0.1852918565273285,
"learning_rate": 6.916729274120539e-06,
"loss": 0.2465,
"step": 3460
},
{
"epoch": 2.1933343863528667,
"grad_norm": 0.16571369767189026,
"learning_rate": 6.896331478379429e-06,
"loss": 0.2434,
"step": 3470
},
{
"epoch": 2.199652503553941,
"grad_norm": 0.18638812005519867,
"learning_rate": 6.875896752978345e-06,
"loss": 0.2461,
"step": 3480
},
{
"epoch": 2.205970620755015,
"grad_norm": 0.18144486844539642,
"learning_rate": 6.855425495868524e-06,
"loss": 0.2438,
"step": 3490
},
{
"epoch": 2.212288737956089,
"grad_norm": 0.1876654475927353,
"learning_rate": 6.834918105712638e-06,
"loss": 0.244,
"step": 3500
},
{
"epoch": 2.218606855157163,
"grad_norm": 0.18819020688533783,
"learning_rate": 6.814374981877013e-06,
"loss": 0.2432,
"step": 3510
},
{
"epoch": 2.224924972358237,
"grad_norm": 0.1788501888513565,
"learning_rate": 6.793796524423868e-06,
"loss": 0.245,
"step": 3520
},
{
"epoch": 2.231243089559311,
"grad_norm": 0.19036491215229034,
"learning_rate": 6.773183134103522e-06,
"loss": 0.2428,
"step": 3530
},
{
"epoch": 2.2375612067603856,
"grad_norm": 0.18438424170017242,
"learning_rate": 6.752535212346576e-06,
"loss": 0.2422,
"step": 3540
},
{
"epoch": 2.2438793239614596,
"grad_norm": 0.16770315170288086,
"learning_rate": 6.7318531612561145e-06,
"loss": 0.2426,
"step": 3550
},
{
"epoch": 2.2501974411625336,
"grad_norm": 0.1698455810546875,
"learning_rate": 6.711137383599859e-06,
"loss": 0.2441,
"step": 3560
},
{
"epoch": 2.2565155583636076,
"grad_norm": 0.16267286241054535,
"learning_rate": 6.690388282802338e-06,
"loss": 0.2435,
"step": 3570
},
{
"epoch": 2.2628336755646816,
"grad_norm": 0.19407695531845093,
"learning_rate": 6.6696062629370155e-06,
"loss": 0.2417,
"step": 3580
},
{
"epoch": 2.2691517927657556,
"grad_norm": 0.20387399196624756,
"learning_rate": 6.648791728718436e-06,
"loss": 0.2407,
"step": 3590
},
{
"epoch": 2.27546990996683,
"grad_norm": 0.17418253421783447,
"learning_rate": 6.627945085494335e-06,
"loss": 0.2451,
"step": 3600
},
{
"epoch": 2.281788027167904,
"grad_norm": 0.1878381371498108,
"learning_rate": 6.607066739237748e-06,
"loss": 0.2442,
"step": 3610
},
{
"epoch": 2.288106144368978,
"grad_norm": 0.16501325368881226,
"learning_rate": 6.586157096539105e-06,
"loss": 0.2427,
"step": 3620
},
{
"epoch": 2.294424261570052,
"grad_norm": 0.17008960247039795,
"learning_rate": 6.565216564598307e-06,
"loss": 0.2459,
"step": 3630
},
{
"epoch": 2.300742378771126,
"grad_norm": 0.167978435754776,
"learning_rate": 6.544245551216804e-06,
"loss": 0.2416,
"step": 3640
},
{
"epoch": 2.3070604959722,
"grad_norm": 0.17641465365886688,
"learning_rate": 6.5232444647896465e-06,
"loss": 0.2435,
"step": 3650
},
{
"epoch": 2.3133786131732745,
"grad_norm": 0.1629774123430252,
"learning_rate": 6.50221371429754e-06,
"loss": 0.244,
"step": 3660
},
{
"epoch": 2.3196967303743485,
"grad_norm": 0.1710384041070938,
"learning_rate": 6.481153709298872e-06,
"loss": 0.2437,
"step": 3670
},
{
"epoch": 2.3260148475754225,
"grad_norm": 0.1770370900630951,
"learning_rate": 6.4600648599217394e-06,
"loss": 0.2421,
"step": 3680
},
{
"epoch": 2.3323329647764965,
"grad_norm": 0.17405395209789276,
"learning_rate": 6.4389475768559675e-06,
"loss": 0.2414,
"step": 3690
},
{
"epoch": 2.3386510819775705,
"grad_norm": 0.1998765915632248,
"learning_rate": 6.417802271345102e-06,
"loss": 0.2416,
"step": 3700
},
{
"epoch": 2.344969199178645,
"grad_norm": 0.18685515224933624,
"learning_rate": 6.3966293551784035e-06,
"loss": 0.2431,
"step": 3710
},
{
"epoch": 2.351287316379719,
"grad_norm": 0.17079129815101624,
"learning_rate": 6.375429240682837e-06,
"loss": 0.2423,
"step": 3720
},
{
"epoch": 2.357605433580793,
"grad_norm": 0.18592600524425507,
"learning_rate": 6.354202340715027e-06,
"loss": 0.2419,
"step": 3730
},
{
"epoch": 2.363923550781867,
"grad_norm": 0.17736919224262238,
"learning_rate": 6.332949068653229e-06,
"loss": 0.2424,
"step": 3740
},
{
"epoch": 2.370241667982941,
"grad_norm": 0.1869024783372879,
"learning_rate": 6.311669838389279e-06,
"loss": 0.2446,
"step": 3750
},
{
"epoch": 2.376559785184015,
"grad_norm": 0.17358314990997314,
"learning_rate": 6.290365064320521e-06,
"loss": 0.2425,
"step": 3760
},
{
"epoch": 2.382877902385089,
"grad_norm": 0.16948603093624115,
"learning_rate": 6.2690351613417545e-06,
"loss": 0.2441,
"step": 3770
},
{
"epoch": 2.3891960195861635,
"grad_norm": 0.16800999641418457,
"learning_rate": 6.247680544837142e-06,
"loss": 0.2425,
"step": 3780
},
{
"epoch": 2.3955141367872375,
"grad_norm": 0.17783384025096893,
"learning_rate": 6.226301630672127e-06,
"loss": 0.2437,
"step": 3790
},
{
"epoch": 2.4018322539883115,
"grad_norm": 0.16958226263523102,
"learning_rate": 6.204898835185325e-06,
"loss": 0.2435,
"step": 3800
},
{
"epoch": 2.4081503711893855,
"grad_norm": 0.19137728214263916,
"learning_rate": 6.18347257518043e-06,
"loss": 0.2442,
"step": 3810
},
{
"epoch": 2.4144684883904595,
"grad_norm": 0.1784157156944275,
"learning_rate": 6.162023267918086e-06,
"loss": 0.2421,
"step": 3820
},
{
"epoch": 2.420786605591534,
"grad_norm": 0.15680409967899323,
"learning_rate": 6.140551331107767e-06,
"loss": 0.2421,
"step": 3830
},
{
"epoch": 2.427104722792608,
"grad_norm": 0.18923278152942657,
"learning_rate": 6.1190571828996425e-06,
"loss": 0.241,
"step": 3840
},
{
"epoch": 2.433422839993682,
"grad_norm": 0.2097504884004593,
"learning_rate": 6.097541241876428e-06,
"loss": 0.243,
"step": 3850
},
{
"epoch": 2.439740957194756,
"grad_norm": 0.18435165286064148,
"learning_rate": 6.076003927045242e-06,
"loss": 0.2427,
"step": 3860
},
{
"epoch": 2.44605907439583,
"grad_norm": 0.181401789188385,
"learning_rate": 6.05444565782944e-06,
"loss": 0.2416,
"step": 3870
},
{
"epoch": 2.452377191596904,
"grad_norm": 0.17077374458312988,
"learning_rate": 6.032866854060451e-06,
"loss": 0.2435,
"step": 3880
},
{
"epoch": 2.4586953087979784,
"grad_norm": 0.18238386511802673,
"learning_rate": 6.011267935969596e-06,
"loss": 0.2424,
"step": 3890
},
{
"epoch": 2.4650134259990524,
"grad_norm": 0.18740853667259216,
"learning_rate": 5.9896493241799115e-06,
"loss": 0.2415,
"step": 3900
},
{
"epoch": 2.4713315432001264,
"grad_norm": 0.1816156506538391,
"learning_rate": 5.968011439697951e-06,
"loss": 0.2432,
"step": 3910
},
{
"epoch": 2.4776496604012004,
"grad_norm": 0.16910015046596527,
"learning_rate": 5.946354703905591e-06,
"loss": 0.243,
"step": 3920
},
{
"epoch": 2.4839677776022744,
"grad_norm": 0.1906070113182068,
"learning_rate": 5.924679538551825e-06,
"loss": 0.2416,
"step": 3930
},
{
"epoch": 2.490285894803349,
"grad_norm": 0.1867346614599228,
"learning_rate": 5.902986365744544e-06,
"loss": 0.2437,
"step": 3940
},
{
"epoch": 2.496604012004423,
"grad_norm": 0.187602698802948,
"learning_rate": 5.881275607942325e-06,
"loss": 0.2408,
"step": 3950
},
{
"epoch": 2.502922129205497,
"grad_norm": 0.1724424809217453,
"learning_rate": 5.859547687946199e-06,
"loss": 0.2426,
"step": 3960
},
{
"epoch": 2.509240246406571,
"grad_norm": 0.1793140023946762,
"learning_rate": 5.837803028891418e-06,
"loss": 0.2425,
"step": 3970
},
{
"epoch": 2.515558363607645,
"grad_norm": 0.17329296469688416,
"learning_rate": 5.816042054239212e-06,
"loss": 0.2441,
"step": 3980
},
{
"epoch": 2.521876480808719,
"grad_norm": 0.22843770682811737,
"learning_rate": 5.794265187768551e-06,
"loss": 0.241,
"step": 3990
},
{
"epoch": 2.528194598009793,
"grad_norm": 0.1654650717973709,
"learning_rate": 5.772472853567882e-06,
"loss": 0.2426,
"step": 4000
},
{
"epoch": 2.5345127152108673,
"grad_norm": 0.17043884098529816,
"learning_rate": 5.750665476026875e-06,
"loss": 0.2406,
"step": 4010
},
{
"epoch": 2.5408308324119413,
"grad_norm": 0.16985023021697998,
"learning_rate": 5.728843479828161e-06,
"loss": 0.2401,
"step": 4020
},
{
"epoch": 2.5471489496130153,
"grad_norm": 0.17778819799423218,
"learning_rate": 5.707007289939055e-06,
"loss": 0.2441,
"step": 4030
},
{
"epoch": 2.5534670668140893,
"grad_norm": 0.1612013876438141,
"learning_rate": 5.6851573316032845e-06,
"loss": 0.2399,
"step": 4040
},
{
"epoch": 2.5597851840151638,
"grad_norm": 0.17063820362091064,
"learning_rate": 5.66329403033271e-06,
"loss": 0.2412,
"step": 4050
},
{
"epoch": 2.5661033012162378,
"grad_norm": 0.16587677597999573,
"learning_rate": 5.641417811899033e-06,
"loss": 0.239,
"step": 4060
},
{
"epoch": 2.5724214184173118,
"grad_norm": 0.17766372859477997,
"learning_rate": 5.619529102325507e-06,
"loss": 0.2411,
"step": 4070
},
{
"epoch": 2.5787395356183858,
"grad_norm": 0.18175509572029114,
"learning_rate": 5.597628327878645e-06,
"loss": 0.242,
"step": 4080
},
{
"epoch": 2.5850576528194598,
"grad_norm": 0.16519029438495636,
"learning_rate": 5.575715915059909e-06,
"loss": 0.2425,
"step": 4090
},
{
"epoch": 2.5913757700205338,
"grad_norm": 0.17657625675201416,
"learning_rate": 5.553792290597414e-06,
"loss": 0.2406,
"step": 4100
},
{
"epoch": 2.5976938872216078,
"grad_norm": 0.17835581302642822,
"learning_rate": 5.531857881437612e-06,
"loss": 0.2412,
"step": 4110
},
{
"epoch": 2.6040120044226818,
"grad_norm": 0.2040930986404419,
"learning_rate": 5.509913114736981e-06,
"loss": 0.2389,
"step": 4120
},
{
"epoch": 2.610330121623756,
"grad_norm": 0.17634861171245575,
"learning_rate": 5.487958417853699e-06,
"loss": 0.2409,
"step": 4130
},
{
"epoch": 2.61664823882483,
"grad_norm": 0.16980887949466705,
"learning_rate": 5.465994218339333e-06,
"loss": 0.2397,
"step": 4140
},
{
"epoch": 2.6229663560259042,
"grad_norm": 0.16278938949108124,
"learning_rate": 5.444020943930506e-06,
"loss": 0.2419,
"step": 4150
},
{
"epoch": 2.6292844732269782,
"grad_norm": 0.18307939171791077,
"learning_rate": 5.4220390225405606e-06,
"loss": 0.241,
"step": 4160
},
{
"epoch": 2.6356025904280527,
"grad_norm": 0.16562727093696594,
"learning_rate": 5.400048882251245e-06,
"loss": 0.2391,
"step": 4170
},
{
"epoch": 2.6419207076291267,
"grad_norm": 0.18560691177845,
"learning_rate": 5.378050951304356e-06,
"loss": 0.2417,
"step": 4180
},
{
"epoch": 2.6482388248302007,
"grad_norm": 0.18558987975120544,
"learning_rate": 5.3560456580934085e-06,
"loss": 0.2415,
"step": 4190
},
{
"epoch": 2.6545569420312747,
"grad_norm": 0.16538389027118683,
"learning_rate": 5.334033431155294e-06,
"loss": 0.2423,
"step": 4200
},
{
"epoch": 2.6608750592323487,
"grad_norm": 0.17581807076931,
"learning_rate": 5.312014699161935e-06,
"loss": 0.2402,
"step": 4210
},
{
"epoch": 2.6671931764334227,
"grad_norm": 0.18032985925674438,
"learning_rate": 5.289989890911928e-06,
"loss": 0.2421,
"step": 4220
},
{
"epoch": 2.6735112936344967,
"grad_norm": 0.18549709022045135,
"learning_rate": 5.267959435322209e-06,
"loss": 0.2413,
"step": 4230
},
{
"epoch": 2.679829410835571,
"grad_norm": 0.1603822559118271,
"learning_rate": 5.245923761419688e-06,
"loss": 0.2407,
"step": 4240
},
{
"epoch": 2.686147528036645,
"grad_norm": 0.17524629831314087,
"learning_rate": 5.223883298332894e-06,
"loss": 0.2395,
"step": 4250
},
{
"epoch": 2.692465645237719,
"grad_norm": 0.16933143138885498,
"learning_rate": 5.20183847528363e-06,
"loss": 0.2387,
"step": 4260
},
{
"epoch": 2.698783762438793,
"grad_norm": 0.17397332191467285,
"learning_rate": 5.179789721578597e-06,
"loss": 0.2392,
"step": 4270
},
{
"epoch": 2.7051018796398676,
"grad_norm": 0.1716376543045044,
"learning_rate": 5.157737466601049e-06,
"loss": 0.2412,
"step": 4280
},
{
"epoch": 2.7114199968409416,
"grad_norm": 0.17333756387233734,
"learning_rate": 5.135682139802422e-06,
"loss": 0.241,
"step": 4290
},
{
"epoch": 2.7177381140420156,
"grad_norm": 0.1601376235485077,
"learning_rate": 5.113624170693977e-06,
"loss": 0.2423,
"step": 4300
},
{
"epoch": 2.7240562312430896,
"grad_norm": 0.18671631813049316,
"learning_rate": 5.091563988838425e-06,
"loss": 0.2396,
"step": 4310
},
{
"epoch": 2.7303743484441636,
"grad_norm": 0.17103099822998047,
"learning_rate": 5.069502023841576e-06,
"loss": 0.2399,
"step": 4320
},
{
"epoch": 2.7366924656452376,
"grad_norm": 0.17045724391937256,
"learning_rate": 5.047438705343961e-06,
"loss": 0.2407,
"step": 4330
},
{
"epoch": 2.7430105828463116,
"grad_norm": 0.171345517039299,
"learning_rate": 5.025374463012472e-06,
"loss": 0.2411,
"step": 4340
},
{
"epoch": 2.7493287000473856,
"grad_norm": 0.16573168337345123,
"learning_rate": 5.00330972653199e-06,
"loss": 0.2394,
"step": 4350
},
{
"epoch": 2.75564681724846,
"grad_norm": 0.16506439447402954,
"learning_rate": 4.981244925597018e-06,
"loss": 0.24,
"step": 4360
},
{
"epoch": 2.761964934449534,
"grad_norm": 0.17510944604873657,
"learning_rate": 4.959180489903318e-06,
"loss": 0.2406,
"step": 4370
},
{
"epoch": 2.768283051650608,
"grad_norm": 0.17315103113651276,
"learning_rate": 4.937116849139538e-06,
"loss": 0.2407,
"step": 4380
},
{
"epoch": 2.774601168851682,
"grad_norm": 0.17643538117408752,
"learning_rate": 4.915054432978842e-06,
"loss": 0.2407,
"step": 4390
},
{
"epoch": 2.7809192860527565,
"grad_norm": 0.1600533127784729,
"learning_rate": 4.89299367107055e-06,
"loss": 0.2407,
"step": 4400
},
{
"epoch": 2.7872374032538305,
"grad_norm": 0.1802552044391632,
"learning_rate": 4.870934993031763e-06,
"loss": 0.2419,
"step": 4410
},
{
"epoch": 2.7935555204549045,
"grad_norm": 0.1862618327140808,
"learning_rate": 4.848878828439008e-06,
"loss": 0.2411,
"step": 4420
},
{
"epoch": 2.7998736376559785,
"grad_norm": 0.17863595485687256,
"learning_rate": 4.8268256068198525e-06,
"loss": 0.242,
"step": 4430
},
{
"epoch": 2.8061917548570525,
"grad_norm": 0.1779400110244751,
"learning_rate": 4.804775757644558e-06,
"loss": 0.241,
"step": 4440
},
{
"epoch": 2.8125098720581265,
"grad_norm": 0.16401080787181854,
"learning_rate": 4.782729710317713e-06,
"loss": 0.2412,
"step": 4450
},
{
"epoch": 2.8188279892592005,
"grad_norm": 0.16927611827850342,
"learning_rate": 4.760687894169867e-06,
"loss": 0.2385,
"step": 4460
},
{
"epoch": 2.825146106460275,
"grad_norm": 0.1770433932542801,
"learning_rate": 4.738650738449161e-06,
"loss": 0.2379,
"step": 4470
},
{
"epoch": 2.831464223661349,
"grad_norm": 0.16366536915302277,
"learning_rate": 4.7166186723129895e-06,
"loss": 0.2409,
"step": 4480
},
{
"epoch": 2.837782340862423,
"grad_norm": 0.15729236602783203,
"learning_rate": 4.694592124819628e-06,
"loss": 0.2408,
"step": 4490
},
{
"epoch": 2.844100458063497,
"grad_norm": 0.16710855066776276,
"learning_rate": 4.672571524919875e-06,
"loss": 0.2404,
"step": 4500
},
{
"epoch": 2.8504185752645714,
"grad_norm": 0.15631146728992462,
"learning_rate": 4.65055730144871e-06,
"loss": 0.2385,
"step": 4510
},
{
"epoch": 2.8567366924656454,
"grad_norm": 0.15912912786006927,
"learning_rate": 4.628549883116933e-06,
"loss": 0.2404,
"step": 4520
},
{
"epoch": 2.8630548096667194,
"grad_norm": 0.1697085201740265,
"learning_rate": 4.606549698502824e-06,
"loss": 0.238,
"step": 4530
},
{
"epoch": 2.8693729268677934,
"grad_norm": 0.1617184579372406,
"learning_rate": 4.584557176043782e-06,
"loss": 0.2386,
"step": 4540
},
{
"epoch": 2.8756910440688674,
"grad_norm": 0.16806644201278687,
"learning_rate": 4.562572744028e-06,
"loss": 0.2396,
"step": 4550
},
{
"epoch": 2.8820091612699414,
"grad_norm": 0.15261489152908325,
"learning_rate": 4.540596830586113e-06,
"loss": 0.2398,
"step": 4560
},
{
"epoch": 2.8883272784710154,
"grad_norm": 0.16236743330955505,
"learning_rate": 4.518629863682861e-06,
"loss": 0.2404,
"step": 4570
},
{
"epoch": 2.8946453956720894,
"grad_norm": 0.16257306933403015,
"learning_rate": 4.496672271108758e-06,
"loss": 0.2381,
"step": 4580
},
{
"epoch": 2.900963512873164,
"grad_norm": 0.1624518632888794,
"learning_rate": 4.474724480471762e-06,
"loss": 0.2422,
"step": 4590
},
{
"epoch": 2.907281630074238,
"grad_norm": 0.1868724673986435,
"learning_rate": 4.452786919188943e-06,
"loss": 0.2387,
"step": 4600
},
{
"epoch": 2.913599747275312,
"grad_norm": 0.15944679081439972,
"learning_rate": 4.430860014478162e-06,
"loss": 0.2375,
"step": 4610
},
{
"epoch": 2.919917864476386,
"grad_norm": 0.16003431379795074,
"learning_rate": 4.40894419334975e-06,
"loss": 0.2397,
"step": 4620
},
{
"epoch": 2.9262359816774604,
"grad_norm": 0.17390407621860504,
"learning_rate": 4.387039882598198e-06,
"loss": 0.2399,
"step": 4630
},
{
"epoch": 2.9325540988785344,
"grad_norm": 0.17524614930152893,
"learning_rate": 4.365147508793839e-06,
"loss": 0.2387,
"step": 4640
},
{
"epoch": 2.9388722160796084,
"grad_norm": 0.17224489152431488,
"learning_rate": 4.343267498274535e-06,
"loss": 0.2399,
"step": 4650
},
{
"epoch": 2.9451903332806824,
"grad_norm": 0.17266203463077545,
"learning_rate": 4.321400277137395e-06,
"loss": 0.2376,
"step": 4660
},
{
"epoch": 2.9515084504817564,
"grad_norm": 0.15991342067718506,
"learning_rate": 4.299546271230457e-06,
"loss": 0.2367,
"step": 4670
},
{
"epoch": 2.9578265676828304,
"grad_norm": 0.15629249811172485,
"learning_rate": 4.277705906144399e-06,
"loss": 0.2386,
"step": 4680
},
{
"epoch": 2.9641446848839044,
"grad_norm": 0.16798162460327148,
"learning_rate": 4.255879607204262e-06,
"loss": 0.2387,
"step": 4690
},
{
"epoch": 2.970462802084979,
"grad_norm": 0.16710205376148224,
"learning_rate": 4.234067799461153e-06,
"loss": 0.24,
"step": 4700
},
{
"epoch": 2.976780919286053,
"grad_norm": 0.16920699179172516,
"learning_rate": 4.212270907683979e-06,
"loss": 0.2415,
"step": 4710
},
{
"epoch": 2.983099036487127,
"grad_norm": 0.1665589064359665,
"learning_rate": 4.190489356351163e-06,
"loss": 0.2395,
"step": 4720
},
{
"epoch": 2.989417153688201,
"grad_norm": 0.1775292009115219,
"learning_rate": 4.168723569642388e-06,
"loss": 0.2377,
"step": 4730
},
{
"epoch": 2.9957352708892753,
"grad_norm": 0.1743878573179245,
"learning_rate": 4.146973971430333e-06,
"loss": 0.2384,
"step": 4740
},
{
"epoch": 3.0025272468804296,
"grad_norm": 0.17591196298599243,
"learning_rate": 4.125240985272419e-06,
"loss": 0.2507,
"step": 4750
},
{
"epoch": 3.0088453640815036,
"grad_norm": 0.17899581789970398,
"learning_rate": 4.103525034402554e-06,
"loss": 0.2312,
"step": 4760
},
{
"epoch": 3.0151634812825776,
"grad_norm": 0.16859206557273865,
"learning_rate": 4.0818265417228995e-06,
"loss": 0.2318,
"step": 4770
},
{
"epoch": 3.021481598483652,
"grad_norm": 0.16476421058177948,
"learning_rate": 4.060145929795635e-06,
"loss": 0.2291,
"step": 4780
},
{
"epoch": 3.027799715684726,
"grad_norm": 0.16536416113376617,
"learning_rate": 4.03848362083472e-06,
"loss": 0.2316,
"step": 4790
},
{
"epoch": 3.0341178328858,
"grad_norm": 0.1791021227836609,
"learning_rate": 4.01684003669768e-06,
"loss": 0.2297,
"step": 4800
},
{
"epoch": 3.040435950086874,
"grad_norm": 0.16363908350467682,
"learning_rate": 3.9952155988773876e-06,
"loss": 0.2309,
"step": 4810
},
{
"epoch": 3.046754067287948,
"grad_norm": 0.16255658864974976,
"learning_rate": 3.973610728493859e-06,
"loss": 0.2297,
"step": 4820
},
{
"epoch": 3.053072184489022,
"grad_norm": 0.16152887046337128,
"learning_rate": 3.952025846286039e-06,
"loss": 0.2297,
"step": 4830
},
{
"epoch": 3.0593903016900965,
"grad_norm": 0.16641443967819214,
"learning_rate": 3.930461372603627e-06,
"loss": 0.2331,
"step": 4840
},
{
"epoch": 3.0657084188911705,
"grad_norm": 0.1551787257194519,
"learning_rate": 3.9089177273988776e-06,
"loss": 0.2297,
"step": 4850
},
{
"epoch": 3.0720265360922445,
"grad_norm": 0.16921547055244446,
"learning_rate": 3.887395330218429e-06,
"loss": 0.2312,
"step": 4860
},
{
"epoch": 3.0783446532933185,
"grad_norm": 0.16538488864898682,
"learning_rate": 3.865894600195123e-06,
"loss": 0.2292,
"step": 4870
},
{
"epoch": 3.0846627704943925,
"grad_norm": 0.16358362138271332,
"learning_rate": 3.844415956039856e-06,
"loss": 0.2314,
"step": 4880
},
{
"epoch": 3.0909808876954665,
"grad_norm": 0.161546528339386,
"learning_rate": 3.822959816033417e-06,
"loss": 0.2298,
"step": 4890
},
{
"epoch": 3.097299004896541,
"grad_norm": 0.16105768084526062,
"learning_rate": 3.80152659801834e-06,
"loss": 0.2309,
"step": 4900
},
{
"epoch": 3.103617122097615,
"grad_norm": 0.16659840941429138,
"learning_rate": 3.7801167193907746e-06,
"loss": 0.232,
"step": 4910
},
{
"epoch": 3.109935239298689,
"grad_norm": 0.1748570203781128,
"learning_rate": 3.7587305970923495e-06,
"loss": 0.2314,
"step": 4920
},
{
"epoch": 3.116253356499763,
"grad_norm": 0.18045374751091003,
"learning_rate": 3.73736864760206e-06,
"loss": 0.2298,
"step": 4930
},
{
"epoch": 3.122571473700837,
"grad_norm": 0.1649467945098877,
"learning_rate": 3.7160312869281476e-06,
"loss": 0.2317,
"step": 4940
},
{
"epoch": 3.1288895909019114,
"grad_norm": 0.16685360670089722,
"learning_rate": 3.694718930600012e-06,
"loss": 0.2282,
"step": 4950
},
{
"epoch": 3.1352077081029854,
"grad_norm": 0.1727149933576584,
"learning_rate": 3.673431993660106e-06,
"loss": 0.2291,
"step": 4960
},
{
"epoch": 3.1415258253040594,
"grad_norm": 0.17158806324005127,
"learning_rate": 3.6521708906558653e-06,
"loss": 0.2308,
"step": 4970
},
{
"epoch": 3.1478439425051334,
"grad_norm": 0.16147060692310333,
"learning_rate": 3.6309360356316183e-06,
"loss": 0.2297,
"step": 4980
},
{
"epoch": 3.1541620597062074,
"grad_norm": 0.1733555942773819,
"learning_rate": 3.6097278421205408e-06,
"loss": 0.2293,
"step": 4990
},
{
"epoch": 3.1604801769072814,
"grad_norm": 0.15878255665302277,
"learning_rate": 3.588546723136598e-06,
"loss": 0.2309,
"step": 5000
},
{
"epoch": 3.166798294108356,
"grad_norm": 0.1642056703567505,
"learning_rate": 3.567393091166489e-06,
"loss": 0.2292,
"step": 5010
},
{
"epoch": 3.17311641130943,
"grad_norm": 0.16453072428703308,
"learning_rate": 3.5462673581616298e-06,
"loss": 0.2314,
"step": 5020
},
{
"epoch": 3.179434528510504,
"grad_norm": 0.16374921798706055,
"learning_rate": 3.5251699355301253e-06,
"loss": 0.2314,
"step": 5030
},
{
"epoch": 3.185752645711578,
"grad_norm": 0.1619606912136078,
"learning_rate": 3.504101234128757e-06,
"loss": 0.2321,
"step": 5040
},
{
"epoch": 3.192070762912652,
"grad_norm": 0.16570039093494415,
"learning_rate": 3.4830616642549734e-06,
"loss": 0.231,
"step": 5050
},
{
"epoch": 3.198388880113726,
"grad_norm": 0.15959931910037994,
"learning_rate": 3.462051635638919e-06,
"loss": 0.2316,
"step": 5060
},
{
"epoch": 3.2047069973148004,
"grad_norm": 0.16513067483901978,
"learning_rate": 3.441071557435438e-06,
"loss": 0.2317,
"step": 5070
},
{
"epoch": 3.2110251145158744,
"grad_norm": 0.15392282605171204,
"learning_rate": 3.420121838216114e-06,
"loss": 0.2305,
"step": 5080
},
{
"epoch": 3.2173432317169484,
"grad_norm": 0.1638430505990982,
"learning_rate": 3.39920288596131e-06,
"loss": 0.2308,
"step": 5090
},
{
"epoch": 3.2236613489180224,
"grad_norm": 0.15217792987823486,
"learning_rate": 3.378315108052227e-06,
"loss": 0.2322,
"step": 5100
},
{
"epoch": 3.2299794661190964,
"grad_norm": 0.17944923043251038,
"learning_rate": 3.3574589112629683e-06,
"loss": 0.2319,
"step": 5110
},
{
"epoch": 3.2362975833201704,
"grad_norm": 0.16493919491767883,
"learning_rate": 3.3366347017526162e-06,
"loss": 0.2314,
"step": 5120
},
{
"epoch": 3.242615700521245,
"grad_norm": 0.15931478142738342,
"learning_rate": 3.3158428850573273e-06,
"loss": 0.2308,
"step": 5130
},
{
"epoch": 3.248933817722319,
"grad_norm": 0.16134731471538544,
"learning_rate": 3.295083866082429e-06,
"loss": 0.2298,
"step": 5140
},
{
"epoch": 3.255251934923393,
"grad_norm": 0.1602196991443634,
"learning_rate": 3.274358049094541e-06,
"loss": 0.231,
"step": 5150
},
{
"epoch": 3.261570052124467,
"grad_norm": 0.15763860940933228,
"learning_rate": 3.253665837713694e-06,
"loss": 0.2296,
"step": 5160
},
{
"epoch": 3.267888169325541,
"grad_norm": 0.15692386031150818,
"learning_rate": 3.2330076349054767e-06,
"loss": 0.2301,
"step": 5170
},
{
"epoch": 3.2742062865266153,
"grad_norm": 0.17300792038440704,
"learning_rate": 3.2123838429731858e-06,
"loss": 0.2297,
"step": 5180
},
{
"epoch": 3.2805244037276893,
"grad_norm": 0.17287859320640564,
"learning_rate": 3.1917948635499956e-06,
"loss": 0.2301,
"step": 5190
},
{
"epoch": 3.2868425209287633,
"grad_norm": 0.169038787484169,
"learning_rate": 3.1712410975911224e-06,
"loss": 0.2293,
"step": 5200
},
{
"epoch": 3.2931606381298373,
"grad_norm": 0.1676977425813675,
"learning_rate": 3.150722945366035e-06,
"loss": 0.2307,
"step": 5210
},
{
"epoch": 3.2994787553309113,
"grad_norm": 0.1699369102716446,
"learning_rate": 3.1302408064506496e-06,
"loss": 0.2288,
"step": 5220
},
{
"epoch": 3.3057968725319853,
"grad_norm": 0.16887900233268738,
"learning_rate": 3.109795079719544e-06,
"loss": 0.2329,
"step": 5230
},
{
"epoch": 3.3121149897330597,
"grad_norm": 0.17086876928806305,
"learning_rate": 3.0893861633382015e-06,
"loss": 0.2297,
"step": 5240
},
{
"epoch": 3.3184331069341337,
"grad_norm": 0.19851693511009216,
"learning_rate": 3.0690144547552513e-06,
"loss": 0.2309,
"step": 5250
},
{
"epoch": 3.3247512241352077,
"grad_norm": 0.18008504807949066,
"learning_rate": 3.048680350694724e-06,
"loss": 0.234,
"step": 5260
},
{
"epoch": 3.3310693413362817,
"grad_norm": 0.18023867905139923,
"learning_rate": 3.0283842471483314e-06,
"loss": 0.2299,
"step": 5270
},
{
"epoch": 3.3373874585373557,
"grad_norm": 0.17149996757507324,
"learning_rate": 3.008126539367754e-06,
"loss": 0.2309,
"step": 5280
},
{
"epoch": 3.34370557573843,
"grad_norm": 0.1631331443786621,
"learning_rate": 2.9879076218569426e-06,
"loss": 0.2304,
"step": 5290
},
{
"epoch": 3.350023692939504,
"grad_norm": 0.1726110428571701,
"learning_rate": 2.9677278883644367e-06,
"loss": 0.2289,
"step": 5300
},
{
"epoch": 3.356341810140578,
"grad_norm": 0.16877932846546173,
"learning_rate": 2.9475877318756928e-06,
"loss": 0.2307,
"step": 5310
},
{
"epoch": 3.362659927341652,
"grad_norm": 0.1572154462337494,
"learning_rate": 2.9274875446054397e-06,
"loss": 0.2307,
"step": 5320
},
{
"epoch": 3.368978044542726,
"grad_norm": 0.16440938413143158,
"learning_rate": 2.9074277179900324e-06,
"loss": 0.2302,
"step": 5330
},
{
"epoch": 3.3752961617438,
"grad_norm": 0.16500261425971985,
"learning_rate": 2.887408642679825e-06,
"loss": 0.2307,
"step": 5340
},
{
"epoch": 3.381614278944874,
"grad_norm": 0.16005001962184906,
"learning_rate": 2.867430708531585e-06,
"loss": 0.2293,
"step": 5350
},
{
"epoch": 3.3879323961459487,
"grad_norm": 0.1674973964691162,
"learning_rate": 2.847494304600874e-06,
"loss": 0.2301,
"step": 5360
},
{
"epoch": 3.3942505133470227,
"grad_norm": 0.16450868546962738,
"learning_rate": 2.827599819134489e-06,
"loss": 0.23,
"step": 5370
},
{
"epoch": 3.4005686305480967,
"grad_norm": 0.1648285835981369,
"learning_rate": 2.807747639562889e-06,
"loss": 0.2305,
"step": 5380
},
{
"epoch": 3.4068867477491707,
"grad_norm": 0.17181555926799774,
"learning_rate": 2.7879381524926635e-06,
"loss": 0.2311,
"step": 5390
},
{
"epoch": 3.4132048649502447,
"grad_norm": 0.16114503145217896,
"learning_rate": 2.7681717436989954e-06,
"loss": 0.2307,
"step": 5400
},
{
"epoch": 3.419522982151319,
"grad_norm": 0.15842120349407196,
"learning_rate": 2.748448798118149e-06,
"loss": 0.2301,
"step": 5410
},
{
"epoch": 3.425841099352393,
"grad_norm": 0.16943858563899994,
"learning_rate": 2.728769699839975e-06,
"loss": 0.2305,
"step": 5420
},
{
"epoch": 3.432159216553467,
"grad_norm": 0.1570242941379547,
"learning_rate": 2.7091348321004286e-06,
"loss": 0.2286,
"step": 5430
},
{
"epoch": 3.438477333754541,
"grad_norm": 0.16255582869052887,
"learning_rate": 2.689544577274113e-06,
"loss": 0.2305,
"step": 5440
},
{
"epoch": 3.444795450955615,
"grad_norm": 0.16005097329616547,
"learning_rate": 2.669999316866819e-06,
"loss": 0.2303,
"step": 5450
},
{
"epoch": 3.451113568156689,
"grad_norm": 0.1680128127336502,
"learning_rate": 2.6504994315081114e-06,
"loss": 0.2295,
"step": 5460
},
{
"epoch": 3.4574316853577636,
"grad_norm": 0.1641710102558136,
"learning_rate": 2.631045300943904e-06,
"loss": 0.2318,
"step": 5470
},
{
"epoch": 3.4637498025588376,
"grad_norm": 0.1590966135263443,
"learning_rate": 2.61163730402908e-06,
"loss": 0.2298,
"step": 5480
},
{
"epoch": 3.4700679197599116,
"grad_norm": 0.16159506142139435,
"learning_rate": 2.5922758187200893e-06,
"loss": 0.2292,
"step": 5490
},
{
"epoch": 3.4763860369609856,
"grad_norm": 0.1627105474472046,
"learning_rate": 2.572961222067612e-06,
"loss": 0.2287,
"step": 5500
},
{
"epoch": 3.4827041541620596,
"grad_norm": 0.1647382527589798,
"learning_rate": 2.5536938902092056e-06,
"loss": 0.2297,
"step": 5510
},
{
"epoch": 3.489022271363134,
"grad_norm": 0.17726825177669525,
"learning_rate": 2.5344741983619734e-06,
"loss": 0.2275,
"step": 5520
},
{
"epoch": 3.495340388564208,
"grad_norm": 0.18429596722126007,
"learning_rate": 2.515302520815275e-06,
"loss": 0.2304,
"step": 5530
},
{
"epoch": 3.501658505765282,
"grad_norm": 0.16635169088840485,
"learning_rate": 2.4961792309234194e-06,
"loss": 0.2301,
"step": 5540
},
{
"epoch": 3.507976622966356,
"grad_norm": 0.17560289800167084,
"learning_rate": 2.4771047010984066e-06,
"loss": 0.2303,
"step": 5550
},
{
"epoch": 3.51429474016743,
"grad_norm": 0.16308391094207764,
"learning_rate": 2.4580793028026636e-06,
"loss": 0.2283,
"step": 5560
},
{
"epoch": 3.520612857368504,
"grad_norm": 0.16081936657428741,
"learning_rate": 2.439103406541821e-06,
"loss": 0.2323,
"step": 5570
},
{
"epoch": 3.526930974569578,
"grad_norm": 0.15498140454292297,
"learning_rate": 2.4201773818574956e-06,
"loss": 0.2305,
"step": 5580
},
{
"epoch": 3.5332490917706525,
"grad_norm": 0.16058135032653809,
"learning_rate": 2.4013015973200895e-06,
"loss": 0.2308,
"step": 5590
},
{
"epoch": 3.5395672089717265,
"grad_norm": 0.16022346913814545,
"learning_rate": 2.3824764205216144e-06,
"loss": 0.2308,
"step": 5600
},
{
"epoch": 3.5458853261728005,
"grad_norm": 0.1624903380870819,
"learning_rate": 2.363702218068535e-06,
"loss": 0.2316,
"step": 5610
},
{
"epoch": 3.5522034433738745,
"grad_norm": 0.15978513658046722,
"learning_rate": 2.344979355574629e-06,
"loss": 0.2279,
"step": 5620
},
{
"epoch": 3.5585215605749485,
"grad_norm": 0.15280455350875854,
"learning_rate": 2.326308197653862e-06,
"loss": 0.2283,
"step": 5630
},
{
"epoch": 3.564839677776023,
"grad_norm": 0.16099567711353302,
"learning_rate": 2.307689107913295e-06,
"loss": 0.2289,
"step": 5640
},
{
"epoch": 3.571157794977097,
"grad_norm": 0.15736475586891174,
"learning_rate": 2.289122448945997e-06,
"loss": 0.2293,
"step": 5650
},
{
"epoch": 3.577475912178171,
"grad_norm": 0.15528954565525055,
"learning_rate": 2.270608582323992e-06,
"loss": 0.2283,
"step": 5660
},
{
"epoch": 3.583794029379245,
"grad_norm": 0.1545080840587616,
"learning_rate": 2.2521478685912027e-06,
"loss": 0.2279,
"step": 5670
},
{
"epoch": 3.590112146580319,
"grad_norm": 0.17268432676792145,
"learning_rate": 2.233740667256446e-06,
"loss": 0.2264,
"step": 5680
},
{
"epoch": 3.596430263781393,
"grad_norm": 0.17080992460250854,
"learning_rate": 2.2153873367864203e-06,
"loss": 0.2307,
"step": 5690
},
{
"epoch": 3.602748380982467,
"grad_norm": 0.16012567281723022,
"learning_rate": 2.19708823459873e-06,
"loss": 0.2304,
"step": 5700
},
{
"epoch": 3.6090664981835414,
"grad_norm": 0.1589348316192627,
"learning_rate": 2.178843717054923e-06,
"loss": 0.229,
"step": 5710
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.16951771080493927,
"learning_rate": 2.1606541394535528e-06,
"loss": 0.2276,
"step": 5720
},
{
"epoch": 3.6217027325856894,
"grad_norm": 0.1633329540491104,
"learning_rate": 2.1425198560232585e-06,
"loss": 0.2286,
"step": 5730
},
{
"epoch": 3.6280208497867634,
"grad_norm": 0.15090343356132507,
"learning_rate": 2.12444121991586e-06,
"loss": 0.2299,
"step": 5740
},
{
"epoch": 3.634338966987838,
"grad_norm": 0.15929211676120758,
"learning_rate": 2.106418583199493e-06,
"loss": 0.231,
"step": 5750
},
{
"epoch": 3.640657084188912,
"grad_norm": 0.16133394837379456,
"learning_rate": 2.088452296851744e-06,
"loss": 0.2299,
"step": 5760
},
{
"epoch": 3.646975201389986,
"grad_norm": 0.15688304603099823,
"learning_rate": 2.070542710752818e-06,
"loss": 0.2282,
"step": 5770
},
{
"epoch": 3.65329331859106,
"grad_norm": 0.1701997071504593,
"learning_rate": 2.052690173678724e-06,
"loss": 0.2287,
"step": 5780
},
{
"epoch": 3.659611435792134,
"grad_norm": 0.16671252250671387,
"learning_rate": 2.034895033294483e-06,
"loss": 0.2299,
"step": 5790
},
{
"epoch": 3.665929552993208,
"grad_norm": 0.16977478563785553,
"learning_rate": 2.0171576361473587e-06,
"loss": 0.2282,
"step": 5800
},
{
"epoch": 3.672247670194282,
"grad_norm": 0.1764647513628006,
"learning_rate": 1.999478327660109e-06,
"loss": 0.2294,
"step": 5810
},
{
"epoch": 3.6785657873953563,
"grad_norm": 0.16209015250205994,
"learning_rate": 1.9818574521242507e-06,
"loss": 0.2306,
"step": 5820
},
{
"epoch": 3.6848839045964303,
"grad_norm": 0.16386057436466217,
"learning_rate": 1.9642953526933685e-06,
"loss": 0.2273,
"step": 5830
},
{
"epoch": 3.6912020217975043,
"grad_norm": 0.20157091319561005,
"learning_rate": 1.9467923713764296e-06,
"loss": 0.2285,
"step": 5840
},
{
"epoch": 3.6975201389985783,
"grad_norm": 0.14894433319568634,
"learning_rate": 1.9293488490311085e-06,
"loss": 0.2297,
"step": 5850
},
{
"epoch": 3.7038382561996523,
"grad_norm": 0.16043171286582947,
"learning_rate": 1.9119651253571676e-06,
"loss": 0.2301,
"step": 5860
},
{
"epoch": 3.710156373400727,
"grad_norm": 0.15590202808380127,
"learning_rate": 1.894641538889832e-06,
"loss": 0.2303,
"step": 5870
},
{
"epoch": 3.716474490601801,
"grad_norm": 0.15428245067596436,
"learning_rate": 1.877378426993201e-06,
"loss": 0.2268,
"step": 5880
},
{
"epoch": 3.722792607802875,
"grad_norm": 0.15511804819107056,
"learning_rate": 1.86017612585367e-06,
"loss": 0.2293,
"step": 5890
},
{
"epoch": 3.729110725003949,
"grad_norm": 0.15739892423152924,
"learning_rate": 1.843034970473398e-06,
"loss": 0.2307,
"step": 5900
},
{
"epoch": 3.735428842205023,
"grad_norm": 0.1598675698041916,
"learning_rate": 1.82595529466377e-06,
"loss": 0.2292,
"step": 5910
},
{
"epoch": 3.741746959406097,
"grad_norm": 0.1549026221036911,
"learning_rate": 1.8089374310389052e-06,
"loss": 0.2306,
"step": 5920
},
{
"epoch": 3.748065076607171,
"grad_norm": 0.16567422449588776,
"learning_rate": 1.7919817110091691e-06,
"loss": 0.2314,
"step": 5930
},
{
"epoch": 3.7543831938082453,
"grad_norm": 0.16314323246479034,
"learning_rate": 1.775088464774734e-06,
"loss": 0.231,
"step": 5940
},
{
"epoch": 3.7607013110093193,
"grad_norm": 0.15875166654586792,
"learning_rate": 1.7582580213191381e-06,
"loss": 0.2281,
"step": 5950
},
{
"epoch": 3.7670194282103933,
"grad_norm": 0.15357348322868347,
"learning_rate": 1.7414907084028804e-06,
"loss": 0.2265,
"step": 5960
},
{
"epoch": 3.7733375454114673,
"grad_norm": 0.16420722007751465,
"learning_rate": 1.724786852557041e-06,
"loss": 0.2307,
"step": 5970
},
{
"epoch": 3.7796556626125417,
"grad_norm": 0.1632334589958191,
"learning_rate": 1.70814677907692e-06,
"loss": 0.2309,
"step": 5980
},
{
"epoch": 3.7859737798136157,
"grad_norm": 0.16144877672195435,
"learning_rate": 1.6915708120157042e-06,
"loss": 0.2283,
"step": 5990
},
{
"epoch": 3.7922918970146897,
"grad_norm": 0.1612851768732071,
"learning_rate": 1.6750592741781496e-06,
"loss": 0.2284,
"step": 6000
},
{
"epoch": 3.7986100142157637,
"grad_norm": 0.1625714898109436,
"learning_rate": 1.6586124871143062e-06,
"loss": 0.2307,
"step": 6010
},
{
"epoch": 3.8049281314168377,
"grad_norm": 0.15983229875564575,
"learning_rate": 1.6422307711132462e-06,
"loss": 0.23,
"step": 6020
},
{
"epoch": 3.8112462486179117,
"grad_norm": 0.16138029098510742,
"learning_rate": 1.6259144451968383e-06,
"loss": 0.2293,
"step": 6030
},
{
"epoch": 3.8175643658189857,
"grad_norm": 0.15706180036067963,
"learning_rate": 1.6096638271135172e-06,
"loss": 0.2293,
"step": 6040
},
{
"epoch": 3.82388248302006,
"grad_norm": 0.15325595438480377,
"learning_rate": 1.593479233332112e-06,
"loss": 0.2276,
"step": 6050
},
{
"epoch": 3.830200600221134,
"grad_norm": 0.1517479419708252,
"learning_rate": 1.577360979035678e-06,
"loss": 0.2296,
"step": 6060
},
{
"epoch": 3.836518717422208,
"grad_norm": 0.15618766844272614,
"learning_rate": 1.5613093781153503e-06,
"loss": 0.2292,
"step": 6070
},
{
"epoch": 3.842836834623282,
"grad_norm": 0.1522364616394043,
"learning_rate": 1.5453247431642493e-06,
"loss": 0.2286,
"step": 6080
},
{
"epoch": 3.849154951824356,
"grad_norm": 0.1619284451007843,
"learning_rate": 1.5294073854713754e-06,
"loss": 0.2302,
"step": 6090
},
{
"epoch": 3.8554730690254306,
"grad_norm": 0.15237174928188324,
"learning_rate": 1.5135576150155567e-06,
"loss": 0.2303,
"step": 6100
},
{
"epoch": 3.8617911862265046,
"grad_norm": 0.15762847661972046,
"learning_rate": 1.4977757404594063e-06,
"loss": 0.2282,
"step": 6110
},
{
"epoch": 3.8681093034275786,
"grad_norm": 0.15904614329338074,
"learning_rate": 1.4820620691433175e-06,
"loss": 0.2298,
"step": 6120
},
{
"epoch": 3.8744274206286526,
"grad_norm": 0.159016951918602,
"learning_rate": 1.4664169070794753e-06,
"loss": 0.2301,
"step": 6130
},
{
"epoch": 3.8807455378297266,
"grad_norm": 0.15268373489379883,
"learning_rate": 1.4508405589458968e-06,
"loss": 0.2299,
"step": 6140
},
{
"epoch": 3.8870636550308006,
"grad_norm": 0.16221952438354492,
"learning_rate": 1.4353333280805e-06,
"loss": 0.2263,
"step": 6150
},
{
"epoch": 3.8933817722318746,
"grad_norm": 0.1568318009376526,
"learning_rate": 1.419895516475192e-06,
"loss": 0.2285,
"step": 6160
},
{
"epoch": 3.899699889432949,
"grad_norm": 0.15674127638339996,
"learning_rate": 1.4045274247699957e-06,
"loss": 0.2315,
"step": 6170
},
{
"epoch": 3.906018006634023,
"grad_norm": 0.15392176806926727,
"learning_rate": 1.3892293522471834e-06,
"loss": 0.2304,
"step": 6180
},
{
"epoch": 3.912336123835097,
"grad_norm": 0.15840460360050201,
"learning_rate": 1.374001596825461e-06,
"loss": 0.2272,
"step": 6190
},
{
"epoch": 3.918654241036171,
"grad_norm": 0.15263865888118744,
"learning_rate": 1.3588444550541568e-06,
"loss": 0.2313,
"step": 6200
},
{
"epoch": 3.9249723582372456,
"grad_norm": 0.14992570877075195,
"learning_rate": 1.3437582221074574e-06,
"loss": 0.2289,
"step": 6210
},
{
"epoch": 3.9312904754383196,
"grad_norm": 0.14820538461208344,
"learning_rate": 1.3287431917786426e-06,
"loss": 0.2302,
"step": 6220
},
{
"epoch": 3.9376085926393936,
"grad_norm": 0.15514026582241058,
"learning_rate": 1.3137996564743783e-06,
"loss": 0.2286,
"step": 6230
},
{
"epoch": 3.9439267098404676,
"grad_norm": 0.15012729167938232,
"learning_rate": 1.2989279072090184e-06,
"loss": 0.2301,
"step": 6240
},
{
"epoch": 3.9502448270415416,
"grad_norm": 0.15299195051193237,
"learning_rate": 1.2841282335989363e-06,
"loss": 0.2308,
"step": 6250
},
{
"epoch": 3.9565629442426156,
"grad_norm": 0.1466607302427292,
"learning_rate": 1.2694009238568794e-06,
"loss": 0.2291,
"step": 6260
},
{
"epoch": 3.9628810614436896,
"grad_norm": 0.15444868803024292,
"learning_rate": 1.2547462647863711e-06,
"loss": 0.2296,
"step": 6270
},
{
"epoch": 3.969199178644764,
"grad_norm": 0.15740527212619781,
"learning_rate": 1.2401645417761126e-06,
"loss": 0.2298,
"step": 6280
},
{
"epoch": 3.975517295845838,
"grad_norm": 0.1578647792339325,
"learning_rate": 1.225656038794425e-06,
"loss": 0.2321,
"step": 6290
},
{
"epoch": 3.981835413046912,
"grad_norm": 0.15657520294189453,
"learning_rate": 1.211221038383728e-06,
"loss": 0.2285,
"step": 6300
},
{
"epoch": 3.988153530247986,
"grad_norm": 0.1587335765361786,
"learning_rate": 1.1968598216550315e-06,
"loss": 0.2278,
"step": 6310
},
{
"epoch": 3.99447164744906,
"grad_norm": 0.15161466598510742,
"learning_rate": 1.182572668282463e-06,
"loss": 0.2261,
"step": 6320
},
{
"epoch": 4.001263623440215,
"grad_norm": 0.15584523975849152,
"learning_rate": 1.1683598564978188e-06,
"loss": 0.2443,
"step": 6330
},
{
"epoch": 4.007581740641289,
"grad_norm": 0.15158313512802124,
"learning_rate": 1.15422166308515e-06,
"loss": 0.2254,
"step": 6340
},
{
"epoch": 4.013899857842363,
"grad_norm": 0.15783625841140747,
"learning_rate": 1.1401583633753683e-06,
"loss": 0.2218,
"step": 6350
},
{
"epoch": 4.020217975043437,
"grad_norm": 0.151853546500206,
"learning_rate": 1.1261702312408867e-06,
"loss": 0.223,
"step": 6360
},
{
"epoch": 4.026536092244511,
"grad_norm": 0.14669708907604218,
"learning_rate": 1.1122575390902824e-06,
"loss": 0.2233,
"step": 6370
},
{
"epoch": 4.032854209445585,
"grad_norm": 0.1561277061700821,
"learning_rate": 1.0984205578629958e-06,
"loss": 0.2262,
"step": 6380
},
{
"epoch": 4.03917232664666,
"grad_norm": 0.15337461233139038,
"learning_rate": 1.084659557024057e-06,
"loss": 0.2248,
"step": 6390
},
{
"epoch": 4.045490443847734,
"grad_norm": 0.15551766753196716,
"learning_rate": 1.0709748045588269e-06,
"loss": 0.2248,
"step": 6400
},
{
"epoch": 4.051808561048808,
"grad_norm": 0.1567201167345047,
"learning_rate": 1.057366566967789e-06,
"loss": 0.2246,
"step": 6410
},
{
"epoch": 4.058126678249882,
"grad_norm": 0.14856794476509094,
"learning_rate": 1.043835109261357e-06,
"loss": 0.2241,
"step": 6420
},
{
"epoch": 4.064444795450956,
"grad_norm": 0.1545330137014389,
"learning_rate": 1.0303806949547118e-06,
"loss": 0.224,
"step": 6430
},
{
"epoch": 4.07076291265203,
"grad_norm": 0.1541059911251068,
"learning_rate": 1.0170035860626676e-06,
"loss": 0.2262,
"step": 6440
},
{
"epoch": 4.077081029853104,
"grad_norm": 0.15895813703536987,
"learning_rate": 1.0037040430945782e-06,
"loss": 0.2254,
"step": 6450
},
{
"epoch": 4.083399147054178,
"grad_norm": 0.15541358292102814,
"learning_rate": 9.904823250492546e-07,
"loss": 0.2258,
"step": 6460
},
{
"epoch": 4.089717264255252,
"grad_norm": 0.16455316543579102,
"learning_rate": 9.773386894099269e-07,
"loss": 0.2234,
"step": 6470
},
{
"epoch": 4.096035381456326,
"grad_norm": 0.15118283033370972,
"learning_rate": 9.642733921392233e-07,
"loss": 0.2252,
"step": 6480
},
{
"epoch": 4.1023534986574,
"grad_norm": 0.14733092486858368,
"learning_rate": 9.512866876741949e-07,
"loss": 0.2231,
"step": 6490
},
{
"epoch": 4.108671615858475,
"grad_norm": 0.15276247262954712,
"learning_rate": 9.383788289213541e-07,
"loss": 0.225,
"step": 6500
},
{
"epoch": 4.114989733059549,
"grad_norm": 0.1504809558391571,
"learning_rate": 9.255500672517497e-07,
"loss": 0.2242,
"step": 6510
},
{
"epoch": 4.121307850260623,
"grad_norm": 0.1528443992137909,
"learning_rate": 9.128006524960747e-07,
"loss": 0.2249,
"step": 6520
},
{
"epoch": 4.127625967461697,
"grad_norm": 0.147428497672081,
"learning_rate": 9.001308329397996e-07,
"loss": 0.2214,
"step": 6530
},
{
"epoch": 4.133944084662771,
"grad_norm": 0.1520494669675827,
"learning_rate": 8.875408553183357e-07,
"loss": 0.2249,
"step": 6540
},
{
"epoch": 4.140262201863845,
"grad_norm": 0.16425903141498566,
"learning_rate": 8.750309648122307e-07,
"loss": 0.2264,
"step": 6550
},
{
"epoch": 4.146580319064919,
"grad_norm": 0.15226700901985168,
"learning_rate": 8.62601405042397e-07,
"loss": 0.2245,
"step": 6560
},
{
"epoch": 4.152898436265993,
"grad_norm": 0.15050509572029114,
"learning_rate": 8.502524180653632e-07,
"loss": 0.2272,
"step": 6570
},
{
"epoch": 4.159216553467067,
"grad_norm": 0.15115346014499664,
"learning_rate": 8.379842443685626e-07,
"loss": 0.2259,
"step": 6580
},
{
"epoch": 4.165534670668141,
"grad_norm": 0.14687852561473846,
"learning_rate": 8.257971228656502e-07,
"loss": 0.224,
"step": 6590
},
{
"epoch": 4.171852787869215,
"grad_norm": 0.15161781013011932,
"learning_rate": 8.136912908918482e-07,
"loss": 0.2251,
"step": 6600
},
{
"epoch": 4.178170905070289,
"grad_norm": 0.15190783143043518,
"learning_rate": 8.016669841993258e-07,
"loss": 0.2241,
"step": 6610
},
{
"epoch": 4.1844890222713635,
"grad_norm": 0.14926742017269135,
"learning_rate": 7.897244369526036e-07,
"loss": 0.2249,
"step": 6620
},
{
"epoch": 4.1908071394724375,
"grad_norm": 0.16139356791973114,
"learning_rate": 7.778638817240042e-07,
"loss": 0.2264,
"step": 6630
},
{
"epoch": 4.1971252566735116,
"grad_norm": 0.15186412632465363,
"learning_rate": 7.660855494891107e-07,
"loss": 0.222,
"step": 6640
},
{
"epoch": 4.2034433738745856,
"grad_norm": 0.14822430908679962,
"learning_rate": 7.543896696222763e-07,
"loss": 0.2247,
"step": 6650
},
{
"epoch": 4.20976149107566,
"grad_norm": 0.15279975533485413,
"learning_rate": 7.427764698921519e-07,
"loss": 0.2239,
"step": 6660
},
{
"epoch": 4.216079608276734,
"grad_norm": 0.1534373015165329,
"learning_rate": 7.312461764572571e-07,
"loss": 0.2231,
"step": 6670
},
{
"epoch": 4.222397725477808,
"grad_norm": 0.15967117249965668,
"learning_rate": 7.197990138615712e-07,
"loss": 0.2247,
"step": 6680
},
{
"epoch": 4.228715842678882,
"grad_norm": 0.16143904626369476,
"learning_rate": 7.084352050301607e-07,
"loss": 0.2257,
"step": 6690
},
{
"epoch": 4.235033959879956,
"grad_norm": 0.1529376357793808,
"learning_rate": 6.971549712648401e-07,
"loss": 0.2242,
"step": 6700
},
{
"epoch": 4.24135207708103,
"grad_norm": 0.15520897507667542,
"learning_rate": 6.859585322398605e-07,
"loss": 0.2238,
"step": 6710
},
{
"epoch": 4.247670194282104,
"grad_norm": 0.1552317589521408,
"learning_rate": 6.74846105997633e-07,
"loss": 0.2235,
"step": 6720
},
{
"epoch": 4.2539883114831785,
"grad_norm": 0.15890224277973175,
"learning_rate": 6.638179089444791e-07,
"loss": 0.2253,
"step": 6730
},
{
"epoch": 4.2603064286842525,
"grad_norm": 0.15153637528419495,
"learning_rate": 6.528741558464207e-07,
"loss": 0.2232,
"step": 6740
},
{
"epoch": 4.2666245458853265,
"grad_norm": 0.15470515191555023,
"learning_rate": 6.420150598249947e-07,
"loss": 0.2244,
"step": 6750
},
{
"epoch": 4.2729426630864005,
"grad_norm": 0.1615689992904663,
"learning_rate": 6.312408323531083e-07,
"loss": 0.2246,
"step": 6760
},
{
"epoch": 4.2792607802874745,
"grad_norm": 0.1515345424413681,
"learning_rate": 6.205516832509089e-07,
"loss": 0.2239,
"step": 6770
},
{
"epoch": 4.2855788974885485,
"grad_norm": 0.14731772243976593,
"learning_rate": 6.0994782068171e-07,
"loss": 0.2249,
"step": 6780
},
{
"epoch": 4.2918970146896225,
"grad_norm": 0.15626221895217896,
"learning_rate": 5.99429451147932e-07,
"loss": 0.2264,
"step": 6790
},
{
"epoch": 4.2982151318906965,
"grad_norm": 0.15102536976337433,
"learning_rate": 5.889967794870794e-07,
"loss": 0.2244,
"step": 6800
},
{
"epoch": 4.3045332490917705,
"grad_norm": 0.150013267993927,
"learning_rate": 5.786500088677543e-07,
"loss": 0.2262,
"step": 6810
},
{
"epoch": 4.3108513662928445,
"grad_norm": 0.15447860956192017,
"learning_rate": 5.683893407857027e-07,
"loss": 0.2234,
"step": 6820
},
{
"epoch": 4.3171694834939185,
"grad_norm": 0.15842311084270477,
"learning_rate": 5.582149750598842e-07,
"loss": 0.2229,
"step": 6830
},
{
"epoch": 4.3234876006949925,
"grad_norm": 0.16022993624210358,
"learning_rate": 5.481271098285818e-07,
"loss": 0.2262,
"step": 6840
},
{
"epoch": 4.329805717896067,
"grad_norm": 0.15244120359420776,
"learning_rate": 5.381259415455475e-07,
"loss": 0.2241,
"step": 6850
},
{
"epoch": 4.336123835097141,
"grad_norm": 0.154579758644104,
"learning_rate": 5.282116649761738e-07,
"loss": 0.2225,
"step": 6860
},
{
"epoch": 4.342441952298215,
"grad_norm": 0.1539810597896576,
"learning_rate": 5.183844731937004e-07,
"loss": 0.2238,
"step": 6870
},
{
"epoch": 4.348760069499289,
"grad_norm": 0.15259358286857605,
"learning_rate": 5.086445575754551e-07,
"loss": 0.2249,
"step": 6880
},
{
"epoch": 4.355078186700363,
"grad_norm": 0.15488529205322266,
"learning_rate": 4.989921077991272e-07,
"loss": 0.2253,
"step": 6890
},
{
"epoch": 4.361396303901437,
"grad_norm": 0.15208259224891663,
"learning_rate": 4.89427311839073e-07,
"loss": 0.2232,
"step": 6900
},
{
"epoch": 4.367714421102511,
"grad_norm": 0.15295451879501343,
"learning_rate": 4.799503559626528e-07,
"loss": 0.2231,
"step": 6910
},
{
"epoch": 4.374032538303585,
"grad_norm": 0.14501479268074036,
"learning_rate": 4.7056142472660993e-07,
"loss": 0.2226,
"step": 6920
},
{
"epoch": 4.380350655504659,
"grad_norm": 0.15459899604320526,
"learning_rate": 4.6126070097346933e-07,
"loss": 0.2242,
"step": 6930
},
{
"epoch": 4.386668772705733,
"grad_norm": 0.14847847819328308,
"learning_rate": 4.520483658279817e-07,
"loss": 0.2223,
"step": 6940
},
{
"epoch": 4.392986889906807,
"grad_norm": 0.15139150619506836,
"learning_rate": 4.4292459869359484e-07,
"loss": 0.2253,
"step": 6950
},
{
"epoch": 4.399305007107882,
"grad_norm": 0.1547953188419342,
"learning_rate": 4.3388957724895874e-07,
"loss": 0.2246,
"step": 6960
},
{
"epoch": 4.405623124308956,
"grad_norm": 0.15533864498138428,
"learning_rate": 4.249434774444672e-07,
"loss": 0.2235,
"step": 6970
},
{
"epoch": 4.41194124151003,
"grad_norm": 0.15549246966838837,
"learning_rate": 4.1608647349883123e-07,
"loss": 0.2234,
"step": 6980
},
{
"epoch": 4.418259358711104,
"grad_norm": 0.15497823059558868,
"learning_rate": 4.073187378956811e-07,
"loss": 0.2258,
"step": 6990
},
{
"epoch": 4.424577475912178,
"grad_norm": 0.14938384294509888,
"learning_rate": 3.9864044138021915e-07,
"loss": 0.2238,
"step": 7000
},
{
"epoch": 4.430895593113252,
"grad_norm": 0.1605786234140396,
"learning_rate": 3.9005175295588227e-07,
"loss": 0.2269,
"step": 7010
},
{
"epoch": 4.437213710314326,
"grad_norm": 0.14927615225315094,
"learning_rate": 3.815528398810553e-07,
"loss": 0.2239,
"step": 7020
},
{
"epoch": 4.4435318275154,
"grad_norm": 0.15817302465438843,
"learning_rate": 3.7314386766581725e-07,
"loss": 0.2245,
"step": 7030
},
{
"epoch": 4.449849944716474,
"grad_norm": 0.14866997301578522,
"learning_rate": 3.6482500006871315e-07,
"loss": 0.2235,
"step": 7040
},
{
"epoch": 4.456168061917548,
"grad_norm": 0.15268754959106445,
"learning_rate": 3.5659639909356725e-07,
"loss": 0.2238,
"step": 7050
},
{
"epoch": 4.462486179118622,
"grad_norm": 0.15132968127727509,
"learning_rate": 3.4845822498632773e-07,
"loss": 0.2255,
"step": 7060
},
{
"epoch": 4.468804296319696,
"grad_norm": 0.15053577721118927,
"learning_rate": 3.4041063623194705e-07,
"loss": 0.2244,
"step": 7070
},
{
"epoch": 4.475122413520771,
"grad_norm": 0.15395694971084595,
"learning_rate": 3.3245378955129306e-07,
"loss": 0.2248,
"step": 7080
},
{
"epoch": 4.481440530721845,
"grad_norm": 0.15405914187431335,
"learning_rate": 3.245878398980995e-07,
"loss": 0.2238,
"step": 7090
},
{
"epoch": 4.487758647922919,
"grad_norm": 0.1462317258119583,
"learning_rate": 3.168129404559467e-07,
"loss": 0.2232,
"step": 7100
},
{
"epoch": 4.494076765123993,
"grad_norm": 0.15240703523159027,
"learning_rate": 3.0912924263527934e-07,
"loss": 0.2226,
"step": 7110
},
{
"epoch": 4.500394882325067,
"grad_norm": 0.1582237184047699,
"learning_rate": 3.015368960704584e-07,
"loss": 0.2238,
"step": 7120
},
{
"epoch": 4.506712999526141,
"grad_norm": 0.14886438846588135,
"learning_rate": 2.940360486168453e-07,
"loss": 0.2245,
"step": 7130
},
{
"epoch": 4.513031116727215,
"grad_norm": 0.15885640680789948,
"learning_rate": 2.8662684634792436e-07,
"loss": 0.2261,
"step": 7140
},
{
"epoch": 4.519349233928289,
"grad_norm": 0.1518273651599884,
"learning_rate": 2.793094335524571e-07,
"loss": 0.2236,
"step": 7150
},
{
"epoch": 4.525667351129363,
"grad_norm": 0.14824968576431274,
"learning_rate": 2.7208395273167376e-07,
"loss": 0.2243,
"step": 7160
},
{
"epoch": 4.531985468330437,
"grad_norm": 0.16252179443836212,
"learning_rate": 2.6495054459649285e-07,
"loss": 0.224,
"step": 7170
},
{
"epoch": 4.538303585531511,
"grad_norm": 0.1533941775560379,
"learning_rate": 2.5790934806479095e-07,
"loss": 0.2241,
"step": 7180
},
{
"epoch": 4.544621702732586,
"grad_norm": 0.1516910344362259,
"learning_rate": 2.5096050025868734e-07,
"loss": 0.2233,
"step": 7190
},
{
"epoch": 4.55093981993366,
"grad_norm": 0.15374279022216797,
"learning_rate": 2.4410413650188035e-07,
"loss": 0.2251,
"step": 7200
},
{
"epoch": 4.557257937134734,
"grad_norm": 0.15357162058353424,
"learning_rate": 2.3734039031700684e-07,
"loss": 0.2246,
"step": 7210
},
{
"epoch": 4.563576054335808,
"grad_norm": 0.1557237058877945,
"learning_rate": 2.3066939342304696e-07,
"loss": 0.2216,
"step": 7220
},
{
"epoch": 4.569894171536882,
"grad_norm": 0.15244849026203156,
"learning_rate": 2.240912757327557e-07,
"loss": 0.2248,
"step": 7230
},
{
"epoch": 4.576212288737956,
"grad_norm": 0.15775157511234283,
"learning_rate": 2.176061653501338e-07,
"loss": 0.2242,
"step": 7240
},
{
"epoch": 4.58253040593903,
"grad_norm": 0.15207399427890778,
"learning_rate": 2.1121418856793363e-07,
"loss": 0.2245,
"step": 7250
},
{
"epoch": 4.588848523140104,
"grad_norm": 0.1468561291694641,
"learning_rate": 2.0491546986519896e-07,
"loss": 0.2229,
"step": 7260
},
{
"epoch": 4.595166640341178,
"grad_norm": 0.15533696115016937,
"learning_rate": 1.987101319048418e-07,
"loss": 0.2245,
"step": 7270
},
{
"epoch": 4.601484757542252,
"grad_norm": 0.15302863717079163,
"learning_rate": 1.925982955312511e-07,
"loss": 0.2221,
"step": 7280
},
{
"epoch": 4.607802874743326,
"grad_norm": 0.15562526881694794,
"learning_rate": 1.8658007976794235e-07,
"loss": 0.226,
"step": 7290
},
{
"epoch": 4.6141209919444,
"grad_norm": 0.14804142713546753,
"learning_rate": 1.8065560181523889e-07,
"loss": 0.2225,
"step": 7300
},
{
"epoch": 4.620439109145474,
"grad_norm": 0.144419863820076,
"learning_rate": 1.748249770479893e-07,
"loss": 0.2246,
"step": 7310
},
{
"epoch": 4.626757226346549,
"grad_norm": 0.15179699659347534,
"learning_rate": 1.6908831901331968e-07,
"loss": 0.2279,
"step": 7320
},
{
"epoch": 4.633075343547623,
"grad_norm": 0.15003693103790283,
"learning_rate": 1.6344573942842333e-07,
"loss": 0.2248,
"step": 7330
},
{
"epoch": 4.639393460748697,
"grad_norm": 0.14552009105682373,
"learning_rate": 1.5789734817838577e-07,
"loss": 0.2237,
"step": 7340
},
{
"epoch": 4.645711577949771,
"grad_norm": 0.15160489082336426,
"learning_rate": 1.5244325331404242e-07,
"loss": 0.2232,
"step": 7350
},
{
"epoch": 4.652029695150845,
"grad_norm": 0.15886756777763367,
"learning_rate": 1.470835610498761e-07,
"loss": 0.2217,
"step": 7360
},
{
"epoch": 4.658347812351919,
"grad_norm": 0.1517808735370636,
"learning_rate": 1.4181837576195179e-07,
"loss": 0.2235,
"step": 7370
},
{
"epoch": 4.664665929552993,
"grad_norm": 0.14629001915454865,
"learning_rate": 1.366477999858773e-07,
"loss": 0.2251,
"step": 7380
},
{
"epoch": 4.670984046754067,
"grad_norm": 0.15069714188575745,
"learning_rate": 1.315719344148092e-07,
"loss": 0.2233,
"step": 7390
},
{
"epoch": 4.677302163955141,
"grad_norm": 0.15253259241580963,
"learning_rate": 1.2659087789749557e-07,
"loss": 0.2238,
"step": 7400
},
{
"epoch": 4.683620281156215,
"grad_norm": 0.15447266399860382,
"learning_rate": 1.2170472743634588e-07,
"loss": 0.2218,
"step": 7410
},
{
"epoch": 4.68993839835729,
"grad_norm": 0.15333184599876404,
"learning_rate": 1.1691357818554405e-07,
"loss": 0.226,
"step": 7420
},
{
"epoch": 4.696256515558364,
"grad_norm": 0.1548086404800415,
"learning_rate": 1.1221752344919679e-07,
"loss": 0.2252,
"step": 7430
},
{
"epoch": 4.702574632759438,
"grad_norm": 0.14932510256767273,
"learning_rate": 1.0761665467951321e-07,
"loss": 0.2232,
"step": 7440
},
{
"epoch": 4.708892749960512,
"grad_norm": 0.15253929793834686,
"learning_rate": 1.0311106147502747e-07,
"loss": 0.223,
"step": 7450
},
{
"epoch": 4.715210867161586,
"grad_norm": 0.1560056507587433,
"learning_rate": 9.870083157885068e-08,
"loss": 0.2248,
"step": 7460
},
{
"epoch": 4.72152898436266,
"grad_norm": 0.1542298048734665,
"learning_rate": 9.43860508769645e-08,
"loss": 0.2223,
"step": 7470
},
{
"epoch": 4.727847101563734,
"grad_norm": 0.1528465300798416,
"learning_rate": 9.01668033965486e-08,
"loss": 0.223,
"step": 7480
},
{
"epoch": 4.734165218764808,
"grad_norm": 0.14702333509922028,
"learning_rate": 8.604317130434137e-08,
"loss": 0.2242,
"step": 7490
},
{
"epoch": 4.740483335965882,
"grad_norm": 0.1520882397890091,
"learning_rate": 8.201523490504404e-08,
"loss": 0.2232,
"step": 7500
},
{
"epoch": 4.746801453166956,
"grad_norm": 0.14876076579093933,
"learning_rate": 7.808307263975301e-08,
"loss": 0.2244,
"step": 7510
},
{
"epoch": 4.75311957036803,
"grad_norm": 0.14352434873580933,
"learning_rate": 7.424676108443551e-08,
"loss": 0.2248,
"step": 7520
},
{
"epoch": 4.759437687569104,
"grad_norm": 0.14998690783977509,
"learning_rate": 7.050637494843526e-08,
"loss": 0.225,
"step": 7530
},
{
"epoch": 4.765755804770178,
"grad_norm": 0.15730910003185272,
"learning_rate": 6.686198707301861e-08,
"loss": 0.2256,
"step": 7540
},
{
"epoch": 4.772073921971253,
"grad_norm": 0.15142279863357544,
"learning_rate": 6.331366842995901e-08,
"loss": 0.2251,
"step": 7550
},
{
"epoch": 4.778392039172327,
"grad_norm": 0.15004810690879822,
"learning_rate": 5.986148812015036e-08,
"loss": 0.2271,
"step": 7560
},
{
"epoch": 4.784710156373401,
"grad_norm": 0.15459021925926208,
"learning_rate": 5.650551337226362e-08,
"loss": 0.2247,
"step": 7570
},
{
"epoch": 4.791028273574475,
"grad_norm": 0.1535128951072693,
"learning_rate": 5.324580954143621e-08,
"loss": 0.225,
"step": 7580
},
{
"epoch": 4.797346390775549,
"grad_norm": 0.14755982160568237,
"learning_rate": 5.008244010800245e-08,
"loss": 0.2244,
"step": 7590
},
{
"epoch": 4.803664507976623,
"grad_norm": 0.15288862586021423,
"learning_rate": 4.701546667625401e-08,
"loss": 0.2242,
"step": 7600
},
{
"epoch": 4.809982625177697,
"grad_norm": 0.14481881260871887,
"learning_rate": 4.4044948973240855e-08,
"loss": 0.2241,
"step": 7610
},
{
"epoch": 4.816300742378771,
"grad_norm": 0.1468980610370636,
"learning_rate": 4.117094484760942e-08,
"loss": 0.2223,
"step": 7620
},
{
"epoch": 4.822618859579845,
"grad_norm": 0.1565941572189331,
"learning_rate": 3.8393510268475155e-08,
"loss": 0.2245,
"step": 7630
},
{
"epoch": 4.828936976780919,
"grad_norm": 0.15280728042125702,
"learning_rate": 3.5712699324331745e-08,
"loss": 0.2237,
"step": 7640
},
{
"epoch": 4.835255093981994,
"grad_norm": 0.14640846848487854,
"learning_rate": 3.312856422200028e-08,
"loss": 0.2249,
"step": 7650
},
{
"epoch": 4.841573211183068,
"grad_norm": 0.15066128969192505,
"learning_rate": 3.064115528561007e-08,
"loss": 0.224,
"step": 7660
},
{
"epoch": 4.847891328384142,
"grad_norm": 0.14239919185638428,
"learning_rate": 2.8250520955618864e-08,
"loss": 0.2206,
"step": 7670
},
{
"epoch": 4.854209445585216,
"grad_norm": 0.1439589262008667,
"learning_rate": 2.595670778787196e-08,
"loss": 0.2254,
"step": 7680
},
{
"epoch": 4.86052756278629,
"grad_norm": 0.15104269981384277,
"learning_rate": 2.3759760452691794e-08,
"loss": 0.2249,
"step": 7690
},
{
"epoch": 4.866845679987364,
"grad_norm": 0.1463245004415512,
"learning_rate": 2.165972173401143e-08,
"loss": 0.2237,
"step": 7700
},
{
"epoch": 4.873163797188438,
"grad_norm": 0.14985409379005432,
"learning_rate": 1.965663252853911e-08,
"loss": 0.2246,
"step": 7710
},
{
"epoch": 4.879481914389512,
"grad_norm": 0.1502169370651245,
"learning_rate": 1.7750531844963335e-08,
"loss": 0.2245,
"step": 7720
},
{
"epoch": 4.885800031590586,
"grad_norm": 0.15175861120224,
"learning_rate": 1.5941456803191812e-08,
"loss": 0.2221,
"step": 7730
},
{
"epoch": 4.89211814879166,
"grad_norm": 0.15319672226905823,
"learning_rate": 1.4229442633630353e-08,
"loss": 0.2224,
"step": 7740
},
{
"epoch": 4.898436265992734,
"grad_norm": 0.14487479627132416,
"learning_rate": 1.2614522676493435e-08,
"loss": 0.2217,
"step": 7750
},
{
"epoch": 4.904754383193808,
"grad_norm": 0.14722299575805664,
"learning_rate": 1.1096728381160271e-08,
"loss": 0.2236,
"step": 7760
},
{
"epoch": 4.911072500394882,
"grad_norm": 0.16063782572746277,
"learning_rate": 9.676089305557523e-09,
"loss": 0.2244,
"step": 7770
},
{
"epoch": 4.917390617595957,
"grad_norm": 0.15409326553344727,
"learning_rate": 8.352633115584764e-09,
"loss": 0.2257,
"step": 7780
},
{
"epoch": 4.923708734797031,
"grad_norm": 0.1531437486410141,
"learning_rate": 7.1263855845782325e-09,
"loss": 0.2231,
"step": 7790
},
{
"epoch": 4.930026851998105,
"grad_norm": 0.15183156728744507,
"learning_rate": 5.997370592806251e-09,
"loss": 0.2241,
"step": 7800
},
{
"epoch": 4.936344969199179,
"grad_norm": 0.1480141282081604,
"learning_rate": 4.965610127004028e-09,
"loss": 0.2229,
"step": 7810
},
{
"epoch": 4.942663086400253,
"grad_norm": 0.15001103281974792,
"learning_rate": 4.031124279948451e-09,
"loss": 0.2231,
"step": 7820
},
{
"epoch": 4.948981203601327,
"grad_norm": 0.15596900880336761,
"learning_rate": 3.193931250062843e-09,
"loss": 0.2249,
"step": 7830
},
{
"epoch": 4.955299320802401,
"grad_norm": 0.15520991384983063,
"learning_rate": 2.45404734106558e-09,
"loss": 0.2253,
"step": 7840
},
{
"epoch": 4.961617438003475,
"grad_norm": 0.15404628217220306,
"learning_rate": 1.811486961650899e-09,
"loss": 0.2246,
"step": 7850
},
{
"epoch": 4.967935555204549,
"grad_norm": 0.1471293717622757,
"learning_rate": 1.266262625210235e-09,
"loss": 0.2225,
"step": 7860
},
{
"epoch": 4.974253672405623,
"grad_norm": 0.1584838181734085,
"learning_rate": 8.183849495851937e-10,
"loss": 0.2237,
"step": 7870
},
{
"epoch": 4.980571789606698,
"grad_norm": 0.14231979846954346,
"learning_rate": 4.678626568649369e-10,
"loss": 0.223,
"step": 7880
},
{
"epoch": 4.986889906807772,
"grad_norm": 0.15368352830410004,
"learning_rate": 2.1470257321298815e-10,
"loss": 0.2237,
"step": 7890
},
{
"epoch": 4.993208024008846,
"grad_norm": 0.1479187160730362,
"learning_rate": 5.890962873456029e-11,
"loss": 0.2235,
"step": 7900
},
{
"epoch": 4.99952614120992,
"grad_norm": 0.14893342554569244,
"learning_rate": 4.868573838523461e-13,
"loss": 0.2249,
"step": 7910
},
{
"epoch": 4.99952614120992,
"step": 7910,
"total_flos": 3.246606278526417e+20,
"train_loss": 0.08344055705064467,
"train_runtime": 26260.8831,
"train_samples_per_second": 308.537,
"train_steps_per_second": 0.301
}
],
"logging_steps": 10,
"max_steps": 7910,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.246606278526417e+20,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}