ESICA / ESICA_Lite /trainer_state.json
MagicXin's picture
Upload folder using huggingface_hub
2c4f694 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 7730,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00517464424320828,
"grad_norm": 4.884782314300537,
"learning_rate": 3.0172413793103453e-07,
"loss": 0.4996,
"step": 8
},
{
"epoch": 0.01034928848641656,
"grad_norm": 10.670414924621582,
"learning_rate": 6.465517241379311e-07,
"loss": 0.4662,
"step": 16
},
{
"epoch": 0.015523932729624839,
"grad_norm": 2.3693668842315674,
"learning_rate": 9.913793103448276e-07,
"loss": 0.4448,
"step": 24
},
{
"epoch": 0.02069857697283312,
"grad_norm": 4.083646297454834,
"learning_rate": 1.336206896551724e-06,
"loss": 0.4917,
"step": 32
},
{
"epoch": 0.0258732212160414,
"grad_norm": 8.517114639282227,
"learning_rate": 1.681034482758621e-06,
"loss": 0.5036,
"step": 40
},
{
"epoch": 0.031047865459249677,
"grad_norm": 6.137472629547119,
"learning_rate": 2.025862068965517e-06,
"loss": 0.4726,
"step": 48
},
{
"epoch": 0.03622250970245795,
"grad_norm": 2.339662551879883,
"learning_rate": 2.370689655172414e-06,
"loss": 0.4691,
"step": 56
},
{
"epoch": 0.04139715394566624,
"grad_norm": 1.5891815423965454,
"learning_rate": 2.7155172413793105e-06,
"loss": 0.4667,
"step": 64
},
{
"epoch": 0.04657179818887452,
"grad_norm": 8.638023376464844,
"learning_rate": 3.0603448275862068e-06,
"loss": 0.4776,
"step": 72
},
{
"epoch": 0.0517464424320828,
"grad_norm": 1.5453442335128784,
"learning_rate": 3.4051724137931034e-06,
"loss": 0.4724,
"step": 80
},
{
"epoch": 0.056921086675291076,
"grad_norm": 3.350733757019043,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.4675,
"step": 88
},
{
"epoch": 0.062095730918499355,
"grad_norm": 4.810270309448242,
"learning_rate": 4.094827586206897e-06,
"loss": 0.4592,
"step": 96
},
{
"epoch": 0.06727037516170763,
"grad_norm": 1.514150619506836,
"learning_rate": 4.439655172413794e-06,
"loss": 0.4825,
"step": 104
},
{
"epoch": 0.0724450194049159,
"grad_norm": 8.344229698181152,
"learning_rate": 4.78448275862069e-06,
"loss": 0.4604,
"step": 112
},
{
"epoch": 0.07761966364812418,
"grad_norm": 6.710835933685303,
"learning_rate": 5.129310344827587e-06,
"loss": 0.4659,
"step": 120
},
{
"epoch": 0.08279430789133248,
"grad_norm": 3.7732086181640625,
"learning_rate": 5.474137931034483e-06,
"loss": 0.4575,
"step": 128
},
{
"epoch": 0.08796895213454076,
"grad_norm": 1.9706354141235352,
"learning_rate": 5.81896551724138e-06,
"loss": 0.4473,
"step": 136
},
{
"epoch": 0.09314359637774904,
"grad_norm": 1.358317494392395,
"learning_rate": 6.163793103448276e-06,
"loss": 0.4724,
"step": 144
},
{
"epoch": 0.09831824062095731,
"grad_norm": 2.7171244621276855,
"learning_rate": 6.508620689655173e-06,
"loss": 0.4687,
"step": 152
},
{
"epoch": 0.1034928848641656,
"grad_norm": 1.794846773147583,
"learning_rate": 6.853448275862069e-06,
"loss": 0.466,
"step": 160
},
{
"epoch": 0.10866752910737387,
"grad_norm": 2.4740686416625977,
"learning_rate": 7.198275862068966e-06,
"loss": 0.4562,
"step": 168
},
{
"epoch": 0.11384217335058215,
"grad_norm": 12.792396545410156,
"learning_rate": 7.543103448275862e-06,
"loss": 0.4553,
"step": 176
},
{
"epoch": 0.11901681759379043,
"grad_norm": 4.893735408782959,
"learning_rate": 7.88793103448276e-06,
"loss": 0.4499,
"step": 184
},
{
"epoch": 0.12419146183699871,
"grad_norm": 2.4978456497192383,
"learning_rate": 8.232758620689656e-06,
"loss": 0.4405,
"step": 192
},
{
"epoch": 0.129366106080207,
"grad_norm": 1.71527099609375,
"learning_rate": 8.577586206896551e-06,
"loss": 0.4694,
"step": 200
},
{
"epoch": 0.13454075032341525,
"grad_norm": 1.814359188079834,
"learning_rate": 8.922413793103449e-06,
"loss": 0.4213,
"step": 208
},
{
"epoch": 0.13971539456662355,
"grad_norm": 1.0052739381790161,
"learning_rate": 9.267241379310346e-06,
"loss": 0.4165,
"step": 216
},
{
"epoch": 0.1448900388098318,
"grad_norm": 3.3144054412841797,
"learning_rate": 9.612068965517242e-06,
"loss": 0.4324,
"step": 224
},
{
"epoch": 0.1500646830530401,
"grad_norm": 14.789776802062988,
"learning_rate": 9.95689655172414e-06,
"loss": 0.4362,
"step": 232
},
{
"epoch": 0.15523932729624837,
"grad_norm": 2.4650723934173584,
"learning_rate": 9.999978494742326e-06,
"loss": 0.4394,
"step": 240
},
{
"epoch": 0.16041397153945666,
"grad_norm": 1.6728233098983765,
"learning_rate": 9.999901251622079e-06,
"loss": 0.4686,
"step": 248
},
{
"epoch": 0.16558861578266496,
"grad_norm": 1.9105558395385742,
"learning_rate": 9.999767832624e-06,
"loss": 0.4479,
"step": 256
},
{
"epoch": 0.17076326002587322,
"grad_norm": 2.219156503677368,
"learning_rate": 9.999578239247104e-06,
"loss": 0.4562,
"step": 264
},
{
"epoch": 0.1759379042690815,
"grad_norm": 2.4550633430480957,
"learning_rate": 9.999332473621544e-06,
"loss": 0.4546,
"step": 272
},
{
"epoch": 0.18111254851228978,
"grad_norm": 2.5587055683135986,
"learning_rate": 9.999030538508598e-06,
"loss": 0.4527,
"step": 280
},
{
"epoch": 0.18628719275549807,
"grad_norm": 3.2998480796813965,
"learning_rate": 9.99867243730063e-06,
"loss": 0.4528,
"step": 288
},
{
"epoch": 0.19146183699870634,
"grad_norm": 6.697995662689209,
"learning_rate": 9.998258174021043e-06,
"loss": 0.4488,
"step": 296
},
{
"epoch": 0.19663648124191463,
"grad_norm": 2.801957607269287,
"learning_rate": 9.997787753324253e-06,
"loss": 0.4644,
"step": 304
},
{
"epoch": 0.2018111254851229,
"grad_norm": 7.488362789154053,
"learning_rate": 9.997261180495623e-06,
"loss": 0.4567,
"step": 312
},
{
"epoch": 0.2069857697283312,
"grad_norm": 11.816390991210938,
"learning_rate": 9.996678461451408e-06,
"loss": 0.4542,
"step": 320
},
{
"epoch": 0.21216041397153945,
"grad_norm": 4.340507984161377,
"learning_rate": 9.996039602738688e-06,
"loss": 0.4299,
"step": 328
},
{
"epoch": 0.21733505821474774,
"grad_norm": 2.5847530364990234,
"learning_rate": 9.995344611535295e-06,
"loss": 0.4199,
"step": 336
},
{
"epoch": 0.222509702457956,
"grad_norm": 7.996049404144287,
"learning_rate": 9.994593495649733e-06,
"loss": 0.4006,
"step": 344
},
{
"epoch": 0.2276843467011643,
"grad_norm": 8.294257164001465,
"learning_rate": 9.993786263521083e-06,
"loss": 0.3779,
"step": 352
},
{
"epoch": 0.23285899094437257,
"grad_norm": 8.907233238220215,
"learning_rate": 9.992922924218924e-06,
"loss": 0.3424,
"step": 360
},
{
"epoch": 0.23803363518758086,
"grad_norm": 20.33425521850586,
"learning_rate": 9.99200348744321e-06,
"loss": 0.3199,
"step": 368
},
{
"epoch": 0.24320827943078913,
"grad_norm": 13.30887508392334,
"learning_rate": 9.991027963524188e-06,
"loss": 0.2968,
"step": 376
},
{
"epoch": 0.24838292367399742,
"grad_norm": 24.578723907470703,
"learning_rate": 9.989996363422246e-06,
"loss": 0.2971,
"step": 384
},
{
"epoch": 0.2535575679172057,
"grad_norm": 7.940666198730469,
"learning_rate": 9.988908698727828e-06,
"loss": 0.2772,
"step": 392
},
{
"epoch": 0.258732212160414,
"grad_norm": 48.541175842285156,
"learning_rate": 9.987764981661278e-06,
"loss": 0.2937,
"step": 400
},
{
"epoch": 0.26390685640362227,
"grad_norm": 8.2228364944458,
"learning_rate": 9.986565225072713e-06,
"loss": 0.2775,
"step": 408
},
{
"epoch": 0.2690815006468305,
"grad_norm": 37.225929260253906,
"learning_rate": 9.98530944244187e-06,
"loss": 0.2628,
"step": 416
},
{
"epoch": 0.2742561448900388,
"grad_norm": 28.484724044799805,
"learning_rate": 9.983997647877973e-06,
"loss": 0.2671,
"step": 424
},
{
"epoch": 0.2794307891332471,
"grad_norm": 7.030237197875977,
"learning_rate": 9.98262985611955e-06,
"loss": 0.2598,
"step": 432
},
{
"epoch": 0.2846054333764554,
"grad_norm": 6.647619247436523,
"learning_rate": 9.981206082534287e-06,
"loss": 0.2519,
"step": 440
},
{
"epoch": 0.2897800776196636,
"grad_norm": 17.09227180480957,
"learning_rate": 9.979726343118847e-06,
"loss": 0.2567,
"step": 448
},
{
"epoch": 0.2949547218628719,
"grad_norm": 24.190093994140625,
"learning_rate": 9.978190654498687e-06,
"loss": 0.2476,
"step": 456
},
{
"epoch": 0.3001293661060802,
"grad_norm": 160.548095703125,
"learning_rate": 9.976599033927884e-06,
"loss": 0.2546,
"step": 464
},
{
"epoch": 0.3053040103492885,
"grad_norm": 5.440623760223389,
"learning_rate": 9.974951499288925e-06,
"loss": 0.2497,
"step": 472
},
{
"epoch": 0.31047865459249674,
"grad_norm": 21.864320755004883,
"learning_rate": 9.973248069092516e-06,
"loss": 0.2766,
"step": 480
},
{
"epoch": 0.31565329883570503,
"grad_norm": 1.826672911643982,
"learning_rate": 9.971488762477373e-06,
"loss": 0.2578,
"step": 488
},
{
"epoch": 0.3208279430789133,
"grad_norm": 11.376463890075684,
"learning_rate": 9.969673599210006e-06,
"loss": 0.2619,
"step": 496
},
{
"epoch": 0.3260025873221216,
"grad_norm": 4.004533290863037,
"learning_rate": 9.967802599684494e-06,
"loss": 0.2586,
"step": 504
},
{
"epoch": 0.3311772315653299,
"grad_norm": 14.940202713012695,
"learning_rate": 9.965875784922261e-06,
"loss": 0.2624,
"step": 512
},
{
"epoch": 0.33635187580853815,
"grad_norm": 5.1946940422058105,
"learning_rate": 9.963893176571836e-06,
"loss": 0.2569,
"step": 520
},
{
"epoch": 0.34152652005174644,
"grad_norm": 6.96012544631958,
"learning_rate": 9.961854796908615e-06,
"loss": 0.2501,
"step": 528
},
{
"epoch": 0.34670116429495473,
"grad_norm": 20.033016204833984,
"learning_rate": 9.959760668834601e-06,
"loss": 0.2466,
"step": 536
},
{
"epoch": 0.351875808538163,
"grad_norm": 5.902473449707031,
"learning_rate": 9.957610815878156e-06,
"loss": 0.2424,
"step": 544
},
{
"epoch": 0.35705045278137126,
"grad_norm": 2.925157070159912,
"learning_rate": 9.955405262193731e-06,
"loss": 0.2429,
"step": 552
},
{
"epoch": 0.36222509702457956,
"grad_norm": 3.106954336166382,
"learning_rate": 9.9531440325616e-06,
"loss": 0.2367,
"step": 560
},
{
"epoch": 0.36739974126778785,
"grad_norm": 8.999578475952148,
"learning_rate": 9.950827152387575e-06,
"loss": 0.2428,
"step": 568
},
{
"epoch": 0.37257438551099614,
"grad_norm": 8.012633323669434,
"learning_rate": 9.948454647702727e-06,
"loss": 0.2447,
"step": 576
},
{
"epoch": 0.3777490297542044,
"grad_norm": 17.479957580566406,
"learning_rate": 9.94602654516309e-06,
"loss": 0.2539,
"step": 584
},
{
"epoch": 0.3829236739974127,
"grad_norm": 111.3963851928711,
"learning_rate": 9.94354287204936e-06,
"loss": 0.2536,
"step": 592
},
{
"epoch": 0.38809831824062097,
"grad_norm": 67.85525512695312,
"learning_rate": 9.941003656266589e-06,
"loss": 0.263,
"step": 600
},
{
"epoch": 0.39327296248382926,
"grad_norm": 5.47231912612915,
"learning_rate": 9.93840892634388e-06,
"loss": 0.2395,
"step": 608
},
{
"epoch": 0.3984476067270375,
"grad_norm": 20.308883666992188,
"learning_rate": 9.935758711434052e-06,
"loss": 0.2433,
"step": 616
},
{
"epoch": 0.4036222509702458,
"grad_norm": 5.908266544342041,
"learning_rate": 9.933053041313325e-06,
"loss": 0.2531,
"step": 624
},
{
"epoch": 0.4087968952134541,
"grad_norm": 4.62359094619751,
"learning_rate": 9.930291946380977e-06,
"loss": 0.2572,
"step": 632
},
{
"epoch": 0.4139715394566624,
"grad_norm": 33.82321548461914,
"learning_rate": 9.927475457659007e-06,
"loss": 0.252,
"step": 640
},
{
"epoch": 0.4191461836998706,
"grad_norm": 2.1320619583129883,
"learning_rate": 9.924603606791786e-06,
"loss": 0.2484,
"step": 648
},
{
"epoch": 0.4243208279430789,
"grad_norm": 5.690158367156982,
"learning_rate": 9.921676426045698e-06,
"loss": 0.242,
"step": 656
},
{
"epoch": 0.4294954721862872,
"grad_norm": 4.768186092376709,
"learning_rate": 9.918693948308783e-06,
"loss": 0.2258,
"step": 664
},
{
"epoch": 0.4346701164294955,
"grad_norm": 7.269800662994385,
"learning_rate": 9.915656207090367e-06,
"loss": 0.2397,
"step": 672
},
{
"epoch": 0.4398447606727037,
"grad_norm": 30.480161666870117,
"learning_rate": 9.912563236520675e-06,
"loss": 0.2296,
"step": 680
},
{
"epoch": 0.445019404915912,
"grad_norm": 22.762638092041016,
"learning_rate": 9.909415071350464e-06,
"loss": 0.233,
"step": 688
},
{
"epoch": 0.4501940491591203,
"grad_norm": 13.787392616271973,
"learning_rate": 9.90621174695062e-06,
"loss": 0.2333,
"step": 696
},
{
"epoch": 0.4553686934023286,
"grad_norm": 3.74238657951355,
"learning_rate": 9.902953299311763e-06,
"loss": 0.236,
"step": 704
},
{
"epoch": 0.46054333764553684,
"grad_norm": 71.07587432861328,
"learning_rate": 9.899639765043854e-06,
"loss": 0.2549,
"step": 712
},
{
"epoch": 0.46571798188874514,
"grad_norm": 3.8564956188201904,
"learning_rate": 9.89627118137576e-06,
"loss": 0.2488,
"step": 720
},
{
"epoch": 0.47089262613195343,
"grad_norm": 3.945932626724243,
"learning_rate": 9.892847586154863e-06,
"loss": 0.2435,
"step": 728
},
{
"epoch": 0.4760672703751617,
"grad_norm": 8.651078224182129,
"learning_rate": 9.889369017846616e-06,
"loss": 0.2301,
"step": 736
},
{
"epoch": 0.48124191461837,
"grad_norm": 23.654037475585938,
"learning_rate": 9.88583551553411e-06,
"loss": 0.2408,
"step": 744
},
{
"epoch": 0.48641655886157825,
"grad_norm": 9.601846694946289,
"learning_rate": 9.882247118917656e-06,
"loss": 0.2578,
"step": 752
},
{
"epoch": 0.49159120310478654,
"grad_norm": 7.171313762664795,
"learning_rate": 9.87860386831431e-06,
"loss": 0.2412,
"step": 760
},
{
"epoch": 0.49676584734799484,
"grad_norm": 13.73882007598877,
"learning_rate": 9.874905804657445e-06,
"loss": 0.235,
"step": 768
},
{
"epoch": 0.5019404915912031,
"grad_norm": 8.866873741149902,
"learning_rate": 9.871152969496274e-06,
"loss": 0.2259,
"step": 776
},
{
"epoch": 0.5071151358344114,
"grad_norm": 2.9023540019989014,
"learning_rate": 9.867345404995393e-06,
"loss": 0.2382,
"step": 784
},
{
"epoch": 0.5122897800776197,
"grad_norm": 8.159908294677734,
"learning_rate": 9.8634831539343e-06,
"loss": 0.2362,
"step": 792
},
{
"epoch": 0.517464424320828,
"grad_norm": 5.698594093322754,
"learning_rate": 9.85956625970692e-06,
"loss": 0.2312,
"step": 800
},
{
"epoch": 0.5226390685640362,
"grad_norm": 1.7953461408615112,
"learning_rate": 9.855594766321122e-06,
"loss": 0.2341,
"step": 808
},
{
"epoch": 0.5278137128072445,
"grad_norm": 6.394684791564941,
"learning_rate": 9.85156871839821e-06,
"loss": 0.2467,
"step": 816
},
{
"epoch": 0.5329883570504528,
"grad_norm": 3.978597402572632,
"learning_rate": 9.847488161172429e-06,
"loss": 0.2411,
"step": 824
},
{
"epoch": 0.538163001293661,
"grad_norm": 49.182125091552734,
"learning_rate": 9.843353140490466e-06,
"loss": 0.2394,
"step": 832
},
{
"epoch": 0.5433376455368694,
"grad_norm": 95.73147583007812,
"learning_rate": 9.839163702810922e-06,
"loss": 0.2247,
"step": 840
},
{
"epoch": 0.5485122897800776,
"grad_norm": 14.831591606140137,
"learning_rate": 9.834919895203789e-06,
"loss": 0.2471,
"step": 848
},
{
"epoch": 0.553686934023286,
"grad_norm": 5.494821548461914,
"learning_rate": 9.83062176534994e-06,
"loss": 0.244,
"step": 856
},
{
"epoch": 0.5588615782664942,
"grad_norm": 2.4115333557128906,
"learning_rate": 9.826269361540565e-06,
"loss": 0.2532,
"step": 864
},
{
"epoch": 0.5640362225097024,
"grad_norm": 13.977892875671387,
"learning_rate": 9.821862732676655e-06,
"loss": 0.2507,
"step": 872
},
{
"epoch": 0.5692108667529108,
"grad_norm": 9.79362964630127,
"learning_rate": 9.817401928268435e-06,
"loss": 0.2237,
"step": 880
},
{
"epoch": 0.574385510996119,
"grad_norm": 1.0879400968551636,
"learning_rate": 9.812886998434817e-06,
"loss": 0.2281,
"step": 888
},
{
"epoch": 0.5795601552393272,
"grad_norm": 7.494878768920898,
"learning_rate": 9.80831799390283e-06,
"loss": 0.2375,
"step": 896
},
{
"epoch": 0.5847347994825356,
"grad_norm": 6.011063098907471,
"learning_rate": 9.803694966007059e-06,
"loss": 0.2363,
"step": 904
},
{
"epoch": 0.5899094437257438,
"grad_norm": 3.7052977085113525,
"learning_rate": 9.799017966689057e-06,
"loss": 0.2217,
"step": 912
},
{
"epoch": 0.5950840879689522,
"grad_norm": 1.6096714735031128,
"learning_rate": 9.794287048496771e-06,
"loss": 0.2399,
"step": 920
},
{
"epoch": 0.6002587322121604,
"grad_norm": 4.586109638214111,
"learning_rate": 9.789502264583949e-06,
"loss": 0.2511,
"step": 928
},
{
"epoch": 0.6054333764553687,
"grad_norm": 6.05584192276001,
"learning_rate": 9.784663668709537e-06,
"loss": 0.2411,
"step": 936
},
{
"epoch": 0.610608020698577,
"grad_norm": 10.09545612335205,
"learning_rate": 9.779771315237086e-06,
"loss": 0.2565,
"step": 944
},
{
"epoch": 0.6157826649417852,
"grad_norm": 21.93640899658203,
"learning_rate": 9.77482525913413e-06,
"loss": 0.2294,
"step": 952
},
{
"epoch": 0.6209573091849935,
"grad_norm": 12.018024444580078,
"learning_rate": 9.769825555971575e-06,
"loss": 0.2268,
"step": 960
},
{
"epoch": 0.6261319534282018,
"grad_norm": 17.39588165283203,
"learning_rate": 9.764772261923074e-06,
"loss": 0.2349,
"step": 968
},
{
"epoch": 0.6313065976714101,
"grad_norm": 15.817480087280273,
"learning_rate": 9.759665433764393e-06,
"loss": 0.2238,
"step": 976
},
{
"epoch": 0.6364812419146184,
"grad_norm": 6.889886379241943,
"learning_rate": 9.754505128872778e-06,
"loss": 0.2409,
"step": 984
},
{
"epoch": 0.6416558861578266,
"grad_norm": 5.029130935668945,
"learning_rate": 9.749291405226304e-06,
"loss": 0.2388,
"step": 992
},
{
"epoch": 0.6468305304010349,
"grad_norm": 7.987931728363037,
"learning_rate": 9.744024321403229e-06,
"loss": 0.2306,
"step": 1000
},
{
"epoch": 0.6520051746442432,
"grad_norm": 3.22019100189209,
"learning_rate": 9.738703936581333e-06,
"loss": 0.2444,
"step": 1008
},
{
"epoch": 0.6571798188874515,
"grad_norm": 13.65085220336914,
"learning_rate": 9.733330310537255e-06,
"loss": 0.248,
"step": 1016
},
{
"epoch": 0.6623544631306598,
"grad_norm": 12.927434921264648,
"learning_rate": 9.727903503645818e-06,
"loss": 0.2226,
"step": 1024
},
{
"epoch": 0.6675291073738681,
"grad_norm": 1.27034592628479,
"learning_rate": 9.722423576879354e-06,
"loss": 0.237,
"step": 1032
},
{
"epoch": 0.6727037516170763,
"grad_norm": 266.6712951660156,
"learning_rate": 9.71689059180702e-06,
"loss": 0.2384,
"step": 1040
},
{
"epoch": 0.6778783958602846,
"grad_norm": 8.363792419433594,
"learning_rate": 9.711304610594104e-06,
"loss": 0.2278,
"step": 1048
},
{
"epoch": 0.6830530401034929,
"grad_norm": 2.284507989883423,
"learning_rate": 9.70566569600132e-06,
"loss": 0.2452,
"step": 1056
},
{
"epoch": 0.6882276843467011,
"grad_norm": 3.4681687355041504,
"learning_rate": 9.699973911384119e-06,
"loss": 0.2338,
"step": 1064
},
{
"epoch": 0.6934023285899095,
"grad_norm": 21.047893524169922,
"learning_rate": 9.694229320691961e-06,
"loss": 0.2189,
"step": 1072
},
{
"epoch": 0.6985769728331177,
"grad_norm": 7.996535778045654,
"learning_rate": 9.688431988467609e-06,
"loss": 0.2475,
"step": 1080
},
{
"epoch": 0.703751617076326,
"grad_norm": 3.221071481704712,
"learning_rate": 9.682581979846388e-06,
"loss": 0.2501,
"step": 1088
},
{
"epoch": 0.7089262613195343,
"grad_norm": 14.775407791137695,
"learning_rate": 9.676679360555479e-06,
"loss": 0.2245,
"step": 1096
},
{
"epoch": 0.7141009055627425,
"grad_norm": 3.179734945297241,
"learning_rate": 9.670724196913149e-06,
"loss": 0.2314,
"step": 1104
},
{
"epoch": 0.7192755498059509,
"grad_norm": 36.58845901489258,
"learning_rate": 9.66471655582803e-06,
"loss": 0.2414,
"step": 1112
},
{
"epoch": 0.7244501940491591,
"grad_norm": 2.0989410877227783,
"learning_rate": 9.658656504798361e-06,
"loss": 0.2373,
"step": 1120
},
{
"epoch": 0.7296248382923674,
"grad_norm": 67.86420440673828,
"learning_rate": 9.652544111911218e-06,
"loss": 0.2414,
"step": 1128
},
{
"epoch": 0.7347994825355757,
"grad_norm": 64.04298400878906,
"learning_rate": 9.646379445841769e-06,
"loss": 0.2419,
"step": 1136
},
{
"epoch": 0.7399741267787839,
"grad_norm": 5.908459663391113,
"learning_rate": 9.640162575852487e-06,
"loss": 0.2328,
"step": 1144
},
{
"epoch": 0.7451487710219923,
"grad_norm": 5.114171981811523,
"learning_rate": 9.633893571792375e-06,
"loss": 0.2117,
"step": 1152
},
{
"epoch": 0.7503234152652005,
"grad_norm": 8.035240173339844,
"learning_rate": 9.627572504096188e-06,
"loss": 0.2344,
"step": 1160
},
{
"epoch": 0.7554980595084088,
"grad_norm": 3.9648971557617188,
"learning_rate": 9.621199443783633e-06,
"loss": 0.2248,
"step": 1168
},
{
"epoch": 0.7606727037516171,
"grad_norm": 17.44873046875,
"learning_rate": 9.614774462458573e-06,
"loss": 0.2377,
"step": 1176
},
{
"epoch": 0.7658473479948253,
"grad_norm": 10.615303039550781,
"learning_rate": 9.608297632308233e-06,
"loss": 0.2278,
"step": 1184
},
{
"epoch": 0.7710219922380336,
"grad_norm": 29.23194122314453,
"learning_rate": 9.601769026102368e-06,
"loss": 0.2213,
"step": 1192
},
{
"epoch": 0.7761966364812419,
"grad_norm": 21.02916717529297,
"learning_rate": 9.595188717192466e-06,
"loss": 0.2382,
"step": 1200
},
{
"epoch": 0.7813712807244502,
"grad_norm": 9.737796783447266,
"learning_rate": 9.58855677951091e-06,
"loss": 0.2347,
"step": 1208
},
{
"epoch": 0.7865459249676585,
"grad_norm": 59.887420654296875,
"learning_rate": 9.581873287570164e-06,
"loss": 0.2379,
"step": 1216
},
{
"epoch": 0.7917205692108668,
"grad_norm": 25.03373146057129,
"learning_rate": 9.575138316461909e-06,
"loss": 0.2358,
"step": 1224
},
{
"epoch": 0.796895213454075,
"grad_norm": 3.6164708137512207,
"learning_rate": 9.568351941856223e-06,
"loss": 0.2511,
"step": 1232
},
{
"epoch": 0.8020698576972833,
"grad_norm": 86.80042266845703,
"learning_rate": 9.561514240000724e-06,
"loss": 0.2262,
"step": 1240
},
{
"epoch": 0.8072445019404916,
"grad_norm": 6.826155185699463,
"learning_rate": 9.554625287719711e-06,
"loss": 0.231,
"step": 1248
},
{
"epoch": 0.8124191461836999,
"grad_norm": 1.1142396926879883,
"learning_rate": 9.547685162413298e-06,
"loss": 0.2187,
"step": 1256
},
{
"epoch": 0.8175937904269082,
"grad_norm": 1.7491034269332886,
"learning_rate": 9.540693942056553e-06,
"loss": 0.237,
"step": 1264
},
{
"epoch": 0.8227684346701164,
"grad_norm": 43.96466064453125,
"learning_rate": 9.533651705198616e-06,
"loss": 0.2347,
"step": 1272
},
{
"epoch": 0.8279430789133247,
"grad_norm": 2.50284481048584,
"learning_rate": 9.526558530961817e-06,
"loss": 0.2199,
"step": 1280
},
{
"epoch": 0.833117723156533,
"grad_norm": 3.6089391708374023,
"learning_rate": 9.519414499040785e-06,
"loss": 0.2341,
"step": 1288
},
{
"epoch": 0.8382923673997412,
"grad_norm": 2.025322914123535,
"learning_rate": 9.51221968970156e-06,
"loss": 0.2317,
"step": 1296
},
{
"epoch": 0.8434670116429496,
"grad_norm": 3.1698544025421143,
"learning_rate": 9.504974183780686e-06,
"loss": 0.2133,
"step": 1304
},
{
"epoch": 0.8486416558861578,
"grad_norm": 2.9666683673858643,
"learning_rate": 9.497678062684301e-06,
"loss": 0.2224,
"step": 1312
},
{
"epoch": 0.8538163001293662,
"grad_norm": 6.248025894165039,
"learning_rate": 9.490331408387225e-06,
"loss": 0.2145,
"step": 1320
},
{
"epoch": 0.8589909443725744,
"grad_norm": 5.850505828857422,
"learning_rate": 9.482934303432038e-06,
"loss": 0.2277,
"step": 1328
},
{
"epoch": 0.8641655886157826,
"grad_norm": 8.283476829528809,
"learning_rate": 9.475486830928155e-06,
"loss": 0.2219,
"step": 1336
},
{
"epoch": 0.869340232858991,
"grad_norm": 1.7238540649414062,
"learning_rate": 9.467989074550891e-06,
"loss": 0.2384,
"step": 1344
},
{
"epoch": 0.8745148771021992,
"grad_norm": 2.008312225341797,
"learning_rate": 9.46044111854052e-06,
"loss": 0.2006,
"step": 1352
},
{
"epoch": 0.8796895213454075,
"grad_norm": 5.544405937194824,
"learning_rate": 9.452843047701324e-06,
"loss": 0.2313,
"step": 1360
},
{
"epoch": 0.8848641655886158,
"grad_norm": 2.758505344390869,
"learning_rate": 9.44519494740065e-06,
"loss": 0.2333,
"step": 1368
},
{
"epoch": 0.890038809831824,
"grad_norm": 1.708716630935669,
"learning_rate": 9.437496903567946e-06,
"loss": 0.2276,
"step": 1376
},
{
"epoch": 0.8952134540750324,
"grad_norm": 14.639440536499023,
"learning_rate": 9.429749002693793e-06,
"loss": 0.233,
"step": 1384
},
{
"epoch": 0.9003880983182406,
"grad_norm": 18.10475730895996,
"learning_rate": 9.421951331828938e-06,
"loss": 0.2351,
"step": 1392
},
{
"epoch": 0.9055627425614489,
"grad_norm": 15.641255378723145,
"learning_rate": 9.414103978583312e-06,
"loss": 0.2297,
"step": 1400
},
{
"epoch": 0.9107373868046572,
"grad_norm": 13.53865909576416,
"learning_rate": 9.406207031125048e-06,
"loss": 0.2171,
"step": 1408
},
{
"epoch": 0.9159120310478654,
"grad_norm": 7.33931827545166,
"learning_rate": 9.398260578179487e-06,
"loss": 0.2258,
"step": 1416
},
{
"epoch": 0.9210866752910737,
"grad_norm": 1.1460613012313843,
"learning_rate": 9.390264709028189e-06,
"loss": 0.2223,
"step": 1424
},
{
"epoch": 0.926261319534282,
"grad_norm": 3.664703607559204,
"learning_rate": 9.382219513507922e-06,
"loss": 0.2291,
"step": 1432
},
{
"epoch": 0.9314359637774903,
"grad_norm": 2.028292655944824,
"learning_rate": 9.374125082009654e-06,
"loss": 0.2197,
"step": 1440
},
{
"epoch": 0.9366106080206986,
"grad_norm": 1.853790044784546,
"learning_rate": 9.365981505477541e-06,
"loss": 0.2086,
"step": 1448
},
{
"epoch": 0.9417852522639069,
"grad_norm": 49.15569305419922,
"learning_rate": 9.3577888754079e-06,
"loss": 0.2245,
"step": 1456
},
{
"epoch": 0.9469598965071151,
"grad_norm": 3.0362915992736816,
"learning_rate": 9.34954728384819e-06,
"loss": 0.2304,
"step": 1464
},
{
"epoch": 0.9521345407503234,
"grad_norm": 4.143098831176758,
"learning_rate": 9.341256823395965e-06,
"loss": 0.237,
"step": 1472
},
{
"epoch": 0.9573091849935317,
"grad_norm": 5.784548759460449,
"learning_rate": 9.332917587197844e-06,
"loss": 0.2176,
"step": 1480
},
{
"epoch": 0.96248382923674,
"grad_norm": 13.278392791748047,
"learning_rate": 9.324529668948459e-06,
"loss": 0.2303,
"step": 1488
},
{
"epoch": 0.9676584734799483,
"grad_norm": 46.16282272338867,
"learning_rate": 9.316093162889407e-06,
"loss": 0.2226,
"step": 1496
},
{
"epoch": 0.9728331177231565,
"grad_norm": 2.1848196983337402,
"learning_rate": 9.307608163808189e-06,
"loss": 0.2481,
"step": 1504
},
{
"epoch": 0.9780077619663649,
"grad_norm": 3.2306673526763916,
"learning_rate": 9.299074767037137e-06,
"loss": 0.2144,
"step": 1512
},
{
"epoch": 0.9831824062095731,
"grad_norm": 7.1894755363464355,
"learning_rate": 9.290493068452357e-06,
"loss": 0.2319,
"step": 1520
},
{
"epoch": 0.9883570504527813,
"grad_norm": 13.211376190185547,
"learning_rate": 9.281863164472647e-06,
"loss": 0.2368,
"step": 1528
},
{
"epoch": 0.9935316946959897,
"grad_norm": 3.837167739868164,
"learning_rate": 9.273185152058406e-06,
"loss": 0.237,
"step": 1536
},
{
"epoch": 0.9987063389391979,
"grad_norm": 26.63664436340332,
"learning_rate": 9.26445912871055e-06,
"loss": 0.2236,
"step": 1544
},
{
"epoch": 1.0038809831824063,
"grad_norm": 5.332308292388916,
"learning_rate": 9.255685192469424e-06,
"loss": 0.2325,
"step": 1552
},
{
"epoch": 1.0090556274256144,
"grad_norm": 1.7717430591583252,
"learning_rate": 9.246863441913685e-06,
"loss": 0.2244,
"step": 1560
},
{
"epoch": 1.0142302716688227,
"grad_norm": 36.790260314941406,
"learning_rate": 9.237993976159211e-06,
"loss": 0.2414,
"step": 1568
},
{
"epoch": 1.019404915912031,
"grad_norm": 7.347696781158447,
"learning_rate": 9.229076894857973e-06,
"loss": 0.2339,
"step": 1576
},
{
"epoch": 1.0245795601552394,
"grad_norm": 7.630255699157715,
"learning_rate": 9.220112298196922e-06,
"loss": 0.2127,
"step": 1584
},
{
"epoch": 1.0297542043984476,
"grad_norm": 4.760798454284668,
"learning_rate": 9.211100286896865e-06,
"loss": 0.231,
"step": 1592
},
{
"epoch": 1.034928848641656,
"grad_norm": 17.15314483642578,
"learning_rate": 9.202040962211334e-06,
"loss": 0.233,
"step": 1600
},
{
"epoch": 1.0401034928848643,
"grad_norm": 7.275695323944092,
"learning_rate": 9.19293442592544e-06,
"loss": 0.2205,
"step": 1608
},
{
"epoch": 1.0452781371280724,
"grad_norm": 10.593427658081055,
"learning_rate": 9.183780780354736e-06,
"loss": 0.2137,
"step": 1616
},
{
"epoch": 1.0504527813712807,
"grad_norm": 84.94709014892578,
"learning_rate": 9.174580128344073e-06,
"loss": 0.2119,
"step": 1624
},
{
"epoch": 1.055627425614489,
"grad_norm": 6.472126007080078,
"learning_rate": 9.16533257326643e-06,
"loss": 0.235,
"step": 1632
},
{
"epoch": 1.0608020698576972,
"grad_norm": 2.1055634021759033,
"learning_rate": 9.156038219021764e-06,
"loss": 0.2329,
"step": 1640
},
{
"epoch": 1.0659767141009056,
"grad_norm": 4.520327091217041,
"learning_rate": 9.146697170035839e-06,
"loss": 0.2199,
"step": 1648
},
{
"epoch": 1.071151358344114,
"grad_norm": 2.18851375579834,
"learning_rate": 9.137309531259054e-06,
"loss": 0.2305,
"step": 1656
},
{
"epoch": 1.076326002587322,
"grad_norm": 7.414587497711182,
"learning_rate": 9.127875408165261e-06,
"loss": 0.2172,
"step": 1664
},
{
"epoch": 1.0815006468305304,
"grad_norm": 40.298065185546875,
"learning_rate": 9.118394906750585e-06,
"loss": 0.2222,
"step": 1672
},
{
"epoch": 1.0866752910737387,
"grad_norm": 2.0129568576812744,
"learning_rate": 9.108868133532224e-06,
"loss": 0.2385,
"step": 1680
},
{
"epoch": 1.0918499353169469,
"grad_norm": 10.248085975646973,
"learning_rate": 9.099295195547264e-06,
"loss": 0.2252,
"step": 1688
},
{
"epoch": 1.0970245795601552,
"grad_norm": 3.0196053981781006,
"learning_rate": 9.089676200351467e-06,
"loss": 0.2266,
"step": 1696
},
{
"epoch": 1.1021992238033635,
"grad_norm": 2.057884931564331,
"learning_rate": 9.08001125601807e-06,
"loss": 0.2353,
"step": 1704
},
{
"epoch": 1.107373868046572,
"grad_norm": 3.0884134769439697,
"learning_rate": 9.07030047113656e-06,
"loss": 0.2137,
"step": 1712
},
{
"epoch": 1.11254851228978,
"grad_norm": 2.399594783782959,
"learning_rate": 9.060543954811464e-06,
"loss": 0.2122,
"step": 1720
},
{
"epoch": 1.1177231565329884,
"grad_norm": 20.144880294799805,
"learning_rate": 9.050741816661128e-06,
"loss": 0.222,
"step": 1728
},
{
"epoch": 1.1228978007761967,
"grad_norm": 3.0582709312438965,
"learning_rate": 9.040894166816461e-06,
"loss": 0.2162,
"step": 1736
},
{
"epoch": 1.1280724450194048,
"grad_norm": 3.313769578933716,
"learning_rate": 9.031001115919732e-06,
"loss": 0.23,
"step": 1744
},
{
"epoch": 1.1332470892626132,
"grad_norm": 2.804013967514038,
"learning_rate": 9.02106277512329e-06,
"loss": 0.2367,
"step": 1752
},
{
"epoch": 1.1384217335058215,
"grad_norm": 9.238598823547363,
"learning_rate": 9.011079256088355e-06,
"loss": 0.2371,
"step": 1760
},
{
"epoch": 1.1435963777490297,
"grad_norm": 10.921507835388184,
"learning_rate": 9.001050670983721e-06,
"loss": 0.2327,
"step": 1768
},
{
"epoch": 1.148771021992238,
"grad_norm": 1.8150590658187866,
"learning_rate": 8.990977132484535e-06,
"loss": 0.233,
"step": 1776
},
{
"epoch": 1.1539456662354464,
"grad_norm": 1.9657435417175293,
"learning_rate": 8.980858753771002e-06,
"loss": 0.2172,
"step": 1784
},
{
"epoch": 1.1591203104786545,
"grad_norm": 3.5203697681427,
"learning_rate": 8.970695648527132e-06,
"loss": 0.2129,
"step": 1792
},
{
"epoch": 1.1642949547218628,
"grad_norm": 8.29598617553711,
"learning_rate": 8.96048793093945e-06,
"loss": 0.2325,
"step": 1800
},
{
"epoch": 1.1694695989650712,
"grad_norm": 1.7340894937515259,
"learning_rate": 8.950235715695717e-06,
"loss": 0.2177,
"step": 1808
},
{
"epoch": 1.1746442432082795,
"grad_norm": 8.192721366882324,
"learning_rate": 8.93993911798365e-06,
"loss": 0.2385,
"step": 1816
},
{
"epoch": 1.1798188874514877,
"grad_norm": 15.727509498596191,
"learning_rate": 8.929598253489617e-06,
"loss": 0.2367,
"step": 1824
},
{
"epoch": 1.184993531694696,
"grad_norm": 15.4706449508667,
"learning_rate": 8.91921323839734e-06,
"loss": 0.2373,
"step": 1832
},
{
"epoch": 1.1901681759379044,
"grad_norm": 7.3651227951049805,
"learning_rate": 8.908784189386589e-06,
"loss": 0.2352,
"step": 1840
},
{
"epoch": 1.1953428201811125,
"grad_norm": 15.465596199035645,
"learning_rate": 8.898311223631876e-06,
"loss": 0.223,
"step": 1848
},
{
"epoch": 1.2005174644243208,
"grad_norm": 7.5635528564453125,
"learning_rate": 8.887794458801137e-06,
"loss": 0.2179,
"step": 1856
},
{
"epoch": 1.2056921086675292,
"grad_norm": 9.759207725524902,
"learning_rate": 8.8772340130544e-06,
"loss": 0.2189,
"step": 1864
},
{
"epoch": 1.2108667529107373,
"grad_norm": 2.7430686950683594,
"learning_rate": 8.866630005042476e-06,
"loss": 0.2354,
"step": 1872
},
{
"epoch": 1.2160413971539457,
"grad_norm": 6.708351135253906,
"learning_rate": 8.855982553905604e-06,
"loss": 0.2191,
"step": 1880
},
{
"epoch": 1.221216041397154,
"grad_norm": 155.43638610839844,
"learning_rate": 8.845291779272131e-06,
"loss": 0.226,
"step": 1888
},
{
"epoch": 1.2263906856403621,
"grad_norm": 2.132230281829834,
"learning_rate": 8.834557801257162e-06,
"loss": 0.2087,
"step": 1896
},
{
"epoch": 1.2315653298835705,
"grad_norm": 4.70977783203125,
"learning_rate": 8.823780740461204e-06,
"loss": 0.2122,
"step": 1904
},
{
"epoch": 1.2367399741267788,
"grad_norm": 1.3468469381332397,
"learning_rate": 8.81296071796882e-06,
"loss": 0.2225,
"step": 1912
},
{
"epoch": 1.2419146183699872,
"grad_norm": 3.733586549758911,
"learning_rate": 8.80209785534726e-06,
"loss": 0.2395,
"step": 1920
},
{
"epoch": 1.2470892626131953,
"grad_norm": 4.492749214172363,
"learning_rate": 8.791192274645107e-06,
"loss": 0.2138,
"step": 1928
},
{
"epoch": 1.2522639068564037,
"grad_norm": 19.35169219970703,
"learning_rate": 8.780244098390891e-06,
"loss": 0.2287,
"step": 1936
},
{
"epoch": 1.2574385510996118,
"grad_norm": 1.480233073234558,
"learning_rate": 8.769253449591728e-06,
"loss": 0.2347,
"step": 1944
},
{
"epoch": 1.2626131953428201,
"grad_norm": 8.458354949951172,
"learning_rate": 8.758220451731922e-06,
"loss": 0.2327,
"step": 1952
},
{
"epoch": 1.2677878395860285,
"grad_norm": 10.992264747619629,
"learning_rate": 8.74714522877159e-06,
"loss": 0.221,
"step": 1960
},
{
"epoch": 1.2729624838292368,
"grad_norm": 15.178800582885742,
"learning_rate": 8.736027905145265e-06,
"loss": 0.2282,
"step": 1968
},
{
"epoch": 1.278137128072445,
"grad_norm": 22.52798843383789,
"learning_rate": 8.724868605760497e-06,
"loss": 0.2238,
"step": 1976
},
{
"epoch": 1.2833117723156533,
"grad_norm": 2.1530232429504395,
"learning_rate": 8.713667455996449e-06,
"loss": 0.2304,
"step": 1984
},
{
"epoch": 1.2884864165588616,
"grad_norm": 3.1006431579589844,
"learning_rate": 8.70242458170249e-06,
"loss": 0.2453,
"step": 1992
},
{
"epoch": 1.2936610608020698,
"grad_norm": 35.956756591796875,
"learning_rate": 8.691140109196782e-06,
"loss": 0.2018,
"step": 2000
},
{
"epoch": 1.2988357050452781,
"grad_norm": 32.85151672363281,
"learning_rate": 8.67981416526486e-06,
"loss": 0.2131,
"step": 2008
},
{
"epoch": 1.3040103492884865,
"grad_norm": 3.432091236114502,
"learning_rate": 8.668446877158205e-06,
"loss": 0.2433,
"step": 2016
},
{
"epoch": 1.3091849935316948,
"grad_norm": 14.427757263183594,
"learning_rate": 8.657038372592815e-06,
"loss": 0.2315,
"step": 2024
},
{
"epoch": 1.314359637774903,
"grad_norm": 2.7362518310546875,
"learning_rate": 8.645588779747775e-06,
"loss": 0.2295,
"step": 2032
},
{
"epoch": 1.3195342820181113,
"grad_norm": 3.2137720584869385,
"learning_rate": 8.634098227263809e-06,
"loss": 0.221,
"step": 2040
},
{
"epoch": 1.3247089262613194,
"grad_norm": 31.10691261291504,
"learning_rate": 8.622566844241846e-06,
"loss": 0.2174,
"step": 2048
},
{
"epoch": 1.3298835705045278,
"grad_norm": 4.460792541503906,
"learning_rate": 8.610994760241555e-06,
"loss": 0.2277,
"step": 2056
},
{
"epoch": 1.3350582147477361,
"grad_norm": 1.419631838798523,
"learning_rate": 8.599382105279899e-06,
"loss": 0.2259,
"step": 2064
},
{
"epoch": 1.3402328589909445,
"grad_norm": 6.29133939743042,
"learning_rate": 8.58772900982967e-06,
"loss": 0.2272,
"step": 2072
},
{
"epoch": 1.3454075032341526,
"grad_norm": 4.922664642333984,
"learning_rate": 8.576035604818031e-06,
"loss": 0.216,
"step": 2080
},
{
"epoch": 1.350582147477361,
"grad_norm": 4.346408843994141,
"learning_rate": 8.564302021625033e-06,
"loss": 0.212,
"step": 2088
},
{
"epoch": 1.3557567917205693,
"grad_norm": 27.521772384643555,
"learning_rate": 8.552528392082147e-06,
"loss": 0.2423,
"step": 2096
},
{
"epoch": 1.3609314359637774,
"grad_norm": 4.437674522399902,
"learning_rate": 8.54071484847078e-06,
"loss": 0.2133,
"step": 2104
},
{
"epoch": 1.3661060802069858,
"grad_norm": 18.69135284423828,
"learning_rate": 8.528861523520792e-06,
"loss": 0.2248,
"step": 2112
},
{
"epoch": 1.371280724450194,
"grad_norm": 1.3831626176834106,
"learning_rate": 8.516968550408998e-06,
"loss": 0.2158,
"step": 2120
},
{
"epoch": 1.3764553686934025,
"grad_norm": 2.99196720123291,
"learning_rate": 8.505036062757677e-06,
"loss": 0.2301,
"step": 2128
},
{
"epoch": 1.3816300129366106,
"grad_norm": 25.202573776245117,
"learning_rate": 8.493064194633072e-06,
"loss": 0.213,
"step": 2136
},
{
"epoch": 1.386804657179819,
"grad_norm": 40.16120529174805,
"learning_rate": 8.481053080543879e-06,
"loss": 0.2394,
"step": 2144
},
{
"epoch": 1.391979301423027,
"grad_norm": 32.01914596557617,
"learning_rate": 8.469002855439741e-06,
"loss": 0.2155,
"step": 2152
},
{
"epoch": 1.3971539456662354,
"grad_norm": 11.138020515441895,
"learning_rate": 8.456913654709725e-06,
"loss": 0.2337,
"step": 2160
},
{
"epoch": 1.4023285899094438,
"grad_norm": 4.366079807281494,
"learning_rate": 8.444785614180807e-06,
"loss": 0.2186,
"step": 2168
},
{
"epoch": 1.407503234152652,
"grad_norm": 19.91827392578125,
"learning_rate": 8.432618870116339e-06,
"loss": 0.2493,
"step": 2176
},
{
"epoch": 1.4126778783958602,
"grad_norm": 9.727320671081543,
"learning_rate": 8.42041355921453e-06,
"loss": 0.2207,
"step": 2184
},
{
"epoch": 1.4178525226390686,
"grad_norm": 15.481773376464844,
"learning_rate": 8.4081698186069e-06,
"loss": 0.2179,
"step": 2192
},
{
"epoch": 1.4230271668822767,
"grad_norm": 4.066092491149902,
"learning_rate": 8.39588778585674e-06,
"loss": 0.2304,
"step": 2200
},
{
"epoch": 1.428201811125485,
"grad_norm": 35.265018463134766,
"learning_rate": 8.383567598957567e-06,
"loss": 0.2237,
"step": 2208
},
{
"epoch": 1.4333764553686934,
"grad_norm": 10.66234302520752,
"learning_rate": 8.37120939633158e-06,
"loss": 0.2202,
"step": 2216
},
{
"epoch": 1.4385510996119018,
"grad_norm": 1.3266409635543823,
"learning_rate": 8.358813316828097e-06,
"loss": 0.2194,
"step": 2224
},
{
"epoch": 1.44372574385511,
"grad_norm": 4.940774440765381,
"learning_rate": 8.346379499722e-06,
"loss": 0.205,
"step": 2232
},
{
"epoch": 1.4489003880983182,
"grad_norm": 4.44492769241333,
"learning_rate": 8.333908084712163e-06,
"loss": 0.2241,
"step": 2240
},
{
"epoch": 1.4540750323415266,
"grad_norm": 2.2839596271514893,
"learning_rate": 8.321399211919893e-06,
"loss": 0.2245,
"step": 2248
},
{
"epoch": 1.4592496765847347,
"grad_norm": 22.686996459960938,
"learning_rate": 8.308853021887346e-06,
"loss": 0.2472,
"step": 2256
},
{
"epoch": 1.464424320827943,
"grad_norm": 3.752237319946289,
"learning_rate": 8.296269655575956e-06,
"loss": 0.2201,
"step": 2264
},
{
"epoch": 1.4695989650711514,
"grad_norm": 16.266639709472656,
"learning_rate": 8.283649254364843e-06,
"loss": 0.2298,
"step": 2272
},
{
"epoch": 1.4747736093143597,
"grad_norm": 1.5946576595306396,
"learning_rate": 8.270991960049231e-06,
"loss": 0.2144,
"step": 2280
},
{
"epoch": 1.4799482535575679,
"grad_norm": 38.729488372802734,
"learning_rate": 8.25829791483885e-06,
"loss": 0.2181,
"step": 2288
},
{
"epoch": 1.4851228978007762,
"grad_norm": 8.792683601379395,
"learning_rate": 8.245567261356347e-06,
"loss": 0.2204,
"step": 2296
},
{
"epoch": 1.4902975420439843,
"grad_norm": 3.90529203414917,
"learning_rate": 8.232800142635675e-06,
"loss": 0.2179,
"step": 2304
},
{
"epoch": 1.4954721862871927,
"grad_norm": 4.205716609954834,
"learning_rate": 8.219996702120482e-06,
"loss": 0.2391,
"step": 2312
},
{
"epoch": 1.500646830530401,
"grad_norm": 12.205682754516602,
"learning_rate": 8.207157083662516e-06,
"loss": 0.2365,
"step": 2320
},
{
"epoch": 1.5058214747736094,
"grad_norm": 3.6196138858795166,
"learning_rate": 8.19428143151999e-06,
"loss": 0.2246,
"step": 2328
},
{
"epoch": 1.5109961190168177,
"grad_norm": 3.0989346504211426,
"learning_rate": 8.181369890355975e-06,
"loss": 0.2266,
"step": 2336
},
{
"epoch": 1.5161707632600259,
"grad_norm": 3.95747447013855,
"learning_rate": 8.16842260523677e-06,
"loss": 0.2292,
"step": 2344
},
{
"epoch": 1.521345407503234,
"grad_norm": 4.447324752807617,
"learning_rate": 8.155439721630265e-06,
"loss": 0.2128,
"step": 2352
},
{
"epoch": 1.5265200517464423,
"grad_norm": 4.018327713012695,
"learning_rate": 8.14242138540432e-06,
"loss": 0.2098,
"step": 2360
},
{
"epoch": 1.5316946959896507,
"grad_norm": 6.639673709869385,
"learning_rate": 8.129367742825117e-06,
"loss": 0.2232,
"step": 2368
},
{
"epoch": 1.536869340232859,
"grad_norm": 17.369224548339844,
"learning_rate": 8.116278940555517e-06,
"loss": 0.2291,
"step": 2376
},
{
"epoch": 1.5420439844760674,
"grad_norm": 6.68892765045166,
"learning_rate": 8.103155125653419e-06,
"loss": 0.2425,
"step": 2384
},
{
"epoch": 1.5472186287192755,
"grad_norm": 34.08677673339844,
"learning_rate": 8.089996445570097e-06,
"loss": 0.2296,
"step": 2392
},
{
"epoch": 1.5523932729624839,
"grad_norm": 25.89212417602539,
"learning_rate": 8.076803048148553e-06,
"loss": 0.2526,
"step": 2400
},
{
"epoch": 1.557567917205692,
"grad_norm": 12.489855766296387,
"learning_rate": 8.06357508162185e-06,
"loss": 0.2218,
"step": 2408
},
{
"epoch": 1.5627425614489003,
"grad_norm": 1.4443012475967407,
"learning_rate": 8.050312694611451e-06,
"loss": 0.2239,
"step": 2416
},
{
"epoch": 1.5679172056921087,
"grad_norm": 9.047863960266113,
"learning_rate": 8.037016036125542e-06,
"loss": 0.2096,
"step": 2424
},
{
"epoch": 1.573091849935317,
"grad_norm": 8.109607696533203,
"learning_rate": 8.023685255557368e-06,
"loss": 0.2118,
"step": 2432
},
{
"epoch": 1.5782664941785254,
"grad_norm": 36.13426971435547,
"learning_rate": 8.010320502683549e-06,
"loss": 0.2083,
"step": 2440
},
{
"epoch": 1.5834411384217335,
"grad_norm": 17.517616271972656,
"learning_rate": 7.996921927662395e-06,
"loss": 0.2078,
"step": 2448
},
{
"epoch": 1.5886157826649416,
"grad_norm": 4.454436779022217,
"learning_rate": 7.983489681032219e-06,
"loss": 0.2428,
"step": 2456
},
{
"epoch": 1.59379042690815,
"grad_norm": 26.744815826416016,
"learning_rate": 7.970023913709652e-06,
"loss": 0.2263,
"step": 2464
},
{
"epoch": 1.5989650711513583,
"grad_norm": 2.163850784301758,
"learning_rate": 7.956524776987945e-06,
"loss": 0.2253,
"step": 2472
},
{
"epoch": 1.6041397153945667,
"grad_norm": 2.051708936691284,
"learning_rate": 7.94299242253526e-06,
"loss": 0.2352,
"step": 2480
},
{
"epoch": 1.609314359637775,
"grad_norm": 9.460783004760742,
"learning_rate": 7.929427002392981e-06,
"loss": 0.2407,
"step": 2488
},
{
"epoch": 1.6144890038809832,
"grad_norm": 13.439695358276367,
"learning_rate": 7.915828668973992e-06,
"loss": 0.2189,
"step": 2496
},
{
"epoch": 1.6196636481241915,
"grad_norm": 14.777132987976074,
"learning_rate": 7.902197575060978e-06,
"loss": 0.2232,
"step": 2504
},
{
"epoch": 1.6248382923673996,
"grad_norm": 36.123809814453125,
"learning_rate": 7.888533873804693e-06,
"loss": 0.2258,
"step": 2512
},
{
"epoch": 1.630012936610608,
"grad_norm": 1.7682701349258423,
"learning_rate": 7.874837718722254e-06,
"loss": 0.2339,
"step": 2520
},
{
"epoch": 1.6351875808538163,
"grad_norm": 13.471175193786621,
"learning_rate": 7.861109263695405e-06,
"loss": 0.2441,
"step": 2528
},
{
"epoch": 1.6403622250970247,
"grad_norm": 2.4435620307922363,
"learning_rate": 7.847348662968796e-06,
"loss": 0.2245,
"step": 2536
},
{
"epoch": 1.645536869340233,
"grad_norm": 2.9155242443084717,
"learning_rate": 7.833556071148245e-06,
"loss": 0.2229,
"step": 2544
},
{
"epoch": 1.6507115135834411,
"grad_norm": 1.8082088232040405,
"learning_rate": 7.819731643199006e-06,
"loss": 0.2273,
"step": 2552
},
{
"epoch": 1.6558861578266493,
"grad_norm": 1.26021409034729,
"learning_rate": 7.805875534444016e-06,
"loss": 0.2234,
"step": 2560
},
{
"epoch": 1.6610608020698576,
"grad_norm": 1.4038159847259521,
"learning_rate": 7.79198790056217e-06,
"loss": 0.2318,
"step": 2568
},
{
"epoch": 1.666235446313066,
"grad_norm": 217.4139862060547,
"learning_rate": 7.77806889758655e-06,
"loss": 0.2318,
"step": 2576
},
{
"epoch": 1.6714100905562743,
"grad_norm": 3.6848537921905518,
"learning_rate": 7.764118681902688e-06,
"loss": 0.2276,
"step": 2584
},
{
"epoch": 1.6765847347994827,
"grad_norm": 5.489861965179443,
"learning_rate": 7.750137410246803e-06,
"loss": 0.2255,
"step": 2592
},
{
"epoch": 1.6817593790426908,
"grad_norm": 3.472351312637329,
"learning_rate": 7.73612523970404e-06,
"loss": 0.2106,
"step": 2600
},
{
"epoch": 1.6869340232858991,
"grad_norm": 4.789738178253174,
"learning_rate": 7.722082327706701e-06,
"loss": 0.2432,
"step": 2608
},
{
"epoch": 1.6921086675291073,
"grad_norm": 1.9528974294662476,
"learning_rate": 7.708008832032485e-06,
"loss": 0.2263,
"step": 2616
},
{
"epoch": 1.6972833117723156,
"grad_norm": 21.182388305664062,
"learning_rate": 7.693904910802712e-06,
"loss": 0.2346,
"step": 2624
},
{
"epoch": 1.702457956015524,
"grad_norm": 12.879313468933105,
"learning_rate": 7.679770722480539e-06,
"loss": 0.2041,
"step": 2632
},
{
"epoch": 1.7076326002587323,
"grad_norm": 8.2993745803833,
"learning_rate": 7.665606425869194e-06,
"loss": 0.2193,
"step": 2640
},
{
"epoch": 1.7128072445019404,
"grad_norm": 8.905396461486816,
"learning_rate": 7.651412180110176e-06,
"loss": 0.2067,
"step": 2648
},
{
"epoch": 1.7179818887451488,
"grad_norm": 2.264564037322998,
"learning_rate": 7.637188144681478e-06,
"loss": 0.2225,
"step": 2656
},
{
"epoch": 1.723156532988357,
"grad_norm": 6.169471263885498,
"learning_rate": 7.622934479395792e-06,
"loss": 0.2128,
"step": 2664
},
{
"epoch": 1.7283311772315653,
"grad_norm": 13.35564136505127,
"learning_rate": 7.608651344398713e-06,
"loss": 0.2185,
"step": 2672
},
{
"epoch": 1.7335058214747736,
"grad_norm": 3.8094420433044434,
"learning_rate": 7.5943389001669395e-06,
"loss": 0.2038,
"step": 2680
},
{
"epoch": 1.738680465717982,
"grad_norm": 7.062588214874268,
"learning_rate": 7.579997307506472e-06,
"loss": 0.2247,
"step": 2688
},
{
"epoch": 1.7438551099611903,
"grad_norm": 4.3430962562561035,
"learning_rate": 7.565626727550804e-06,
"loss": 0.213,
"step": 2696
},
{
"epoch": 1.7490297542043984,
"grad_norm": 0.8228983283042908,
"learning_rate": 7.551227321759111e-06,
"loss": 0.2116,
"step": 2704
},
{
"epoch": 1.7542043984476066,
"grad_norm": 2.99155592918396,
"learning_rate": 7.536799251914442e-06,
"loss": 0.2295,
"step": 2712
},
{
"epoch": 1.759379042690815,
"grad_norm": 1.8452483415603638,
"learning_rate": 7.522342680121897e-06,
"loss": 0.2174,
"step": 2720
},
{
"epoch": 1.7645536869340233,
"grad_norm": 9.412191390991211,
"learning_rate": 7.507857768806803e-06,
"loss": 0.2125,
"step": 2728
},
{
"epoch": 1.7697283311772316,
"grad_norm": 1.4425644874572754,
"learning_rate": 7.4933446807129e-06,
"loss": 0.2283,
"step": 2736
},
{
"epoch": 1.77490297542044,
"grad_norm": 2.3930792808532715,
"learning_rate": 7.4788035789005e-06,
"loss": 0.2288,
"step": 2744
},
{
"epoch": 1.780077619663648,
"grad_norm": 8.610286712646484,
"learning_rate": 7.464234626744659e-06,
"loss": 0.2197,
"step": 2752
},
{
"epoch": 1.7852522639068564,
"grad_norm": 2.6517043113708496,
"learning_rate": 7.449637987933347e-06,
"loss": 0.2278,
"step": 2760
},
{
"epoch": 1.7904269081500646,
"grad_norm": 1.6406168937683105,
"learning_rate": 7.435013826465601e-06,
"loss": 0.2227,
"step": 2768
},
{
"epoch": 1.795601552393273,
"grad_norm": 11.188384056091309,
"learning_rate": 7.420362306649691e-06,
"loss": 0.2139,
"step": 2776
},
{
"epoch": 1.8007761966364813,
"grad_norm": 4.76533317565918,
"learning_rate": 7.405683593101263e-06,
"loss": 0.2279,
"step": 2784
},
{
"epoch": 1.8059508408796896,
"grad_norm": 3.7414357662200928,
"learning_rate": 7.390977850741498e-06,
"loss": 0.2098,
"step": 2792
},
{
"epoch": 1.811125485122898,
"grad_norm": 37.51686096191406,
"learning_rate": 7.376245244795255e-06,
"loss": 0.2204,
"step": 2800
},
{
"epoch": 1.816300129366106,
"grad_norm": 29.107175827026367,
"learning_rate": 7.361485940789221e-06,
"loss": 0.2254,
"step": 2808
},
{
"epoch": 1.8214747736093142,
"grad_norm": 51.822723388671875,
"learning_rate": 7.346700104550042e-06,
"loss": 0.2304,
"step": 2816
},
{
"epoch": 1.8266494178525226,
"grad_norm": 20.180925369262695,
"learning_rate": 7.331887902202463e-06,
"loss": 0.2262,
"step": 2824
},
{
"epoch": 1.831824062095731,
"grad_norm": 8.97240161895752,
"learning_rate": 7.317049500167466e-06,
"loss": 0.253,
"step": 2832
},
{
"epoch": 1.8369987063389392,
"grad_norm": 24.334049224853516,
"learning_rate": 7.3021850651603955e-06,
"loss": 0.219,
"step": 2840
},
{
"epoch": 1.8421733505821476,
"grad_norm": 14.938766479492188,
"learning_rate": 7.2872947641890854e-06,
"loss": 0.232,
"step": 2848
},
{
"epoch": 1.8473479948253557,
"grad_norm": 0.8988103270530701,
"learning_rate": 7.272378764551988e-06,
"loss": 0.213,
"step": 2856
},
{
"epoch": 1.852522639068564,
"grad_norm": 25.765361785888672,
"learning_rate": 7.257437233836285e-06,
"loss": 0.2185,
"step": 2864
},
{
"epoch": 1.8576972833117722,
"grad_norm": 21.85658836364746,
"learning_rate": 7.242470339916014e-06,
"loss": 0.2175,
"step": 2872
},
{
"epoch": 1.8628719275549805,
"grad_norm": 4.687010288238525,
"learning_rate": 7.227478250950178e-06,
"loss": 0.2221,
"step": 2880
},
{
"epoch": 1.868046571798189,
"grad_norm": 1.0599603652954102,
"learning_rate": 7.212461135380855e-06,
"loss": 0.214,
"step": 2888
},
{
"epoch": 1.8732212160413972,
"grad_norm": 8.412078857421875,
"learning_rate": 7.197419161931305e-06,
"loss": 0.2103,
"step": 2896
},
{
"epoch": 1.8783958602846056,
"grad_norm": 3.0134875774383545,
"learning_rate": 7.182352499604081e-06,
"loss": 0.2114,
"step": 2904
},
{
"epoch": 1.8835705045278137,
"grad_norm": 0.7685543298721313,
"learning_rate": 7.167261317679121e-06,
"loss": 0.2036,
"step": 2912
},
{
"epoch": 1.8887451487710218,
"grad_norm": 2.3020284175872803,
"learning_rate": 7.1521457857118525e-06,
"loss": 0.2265,
"step": 2920
},
{
"epoch": 1.8939197930142302,
"grad_norm": 18.331098556518555,
"learning_rate": 7.137006073531285e-06,
"loss": 0.2318,
"step": 2928
},
{
"epoch": 1.8990944372574385,
"grad_norm": 6.871685981750488,
"learning_rate": 7.121842351238102e-06,
"loss": 0.1977,
"step": 2936
},
{
"epoch": 1.9042690815006469,
"grad_norm": 8.529058456420898,
"learning_rate": 7.106654789202751e-06,
"loss": 0.1992,
"step": 2944
},
{
"epoch": 1.9094437257438552,
"grad_norm": 11.424173355102539,
"learning_rate": 7.0914435580635286e-06,
"loss": 0.2185,
"step": 2952
},
{
"epoch": 1.9146183699870634,
"grad_norm": 5.662269592285156,
"learning_rate": 7.076208828724661e-06,
"loss": 0.217,
"step": 2960
},
{
"epoch": 1.9197930142302717,
"grad_norm": 17.576711654663086,
"learning_rate": 7.060950772354389e-06,
"loss": 0.2251,
"step": 2968
},
{
"epoch": 1.9249676584734798,
"grad_norm": 65.82316589355469,
"learning_rate": 7.045669560383039e-06,
"loss": 0.2131,
"step": 2976
},
{
"epoch": 1.9301423027166882,
"grad_norm": 5.220096588134766,
"learning_rate": 7.030365364501104e-06,
"loss": 0.2263,
"step": 2984
},
{
"epoch": 1.9353169469598965,
"grad_norm": 13.813321113586426,
"learning_rate": 7.015038356657303e-06,
"loss": 0.2229,
"step": 2992
},
{
"epoch": 1.9404915912031049,
"grad_norm": 3.5180106163024902,
"learning_rate": 6.9996887090566645e-06,
"loss": 0.2055,
"step": 3000
},
{
"epoch": 1.9456662354463132,
"grad_norm": 13.823914527893066,
"learning_rate": 6.98431659415858e-06,
"loss": 0.2298,
"step": 3008
},
{
"epoch": 1.9508408796895214,
"grad_norm": 9.340493202209473,
"learning_rate": 6.968922184674868e-06,
"loss": 0.21,
"step": 3016
},
{
"epoch": 1.9560155239327295,
"grad_norm": 8.113641738891602,
"learning_rate": 6.95350565356784e-06,
"loss": 0.2215,
"step": 3024
},
{
"epoch": 1.9611901681759378,
"grad_norm": 4.113650321960449,
"learning_rate": 6.93806717404835e-06,
"loss": 0.2176,
"step": 3032
},
{
"epoch": 1.9663648124191462,
"grad_norm": 44.47676086425781,
"learning_rate": 6.922606919573851e-06,
"loss": 0.2188,
"step": 3040
},
{
"epoch": 1.9715394566623545,
"grad_norm": 4.712839126586914,
"learning_rate": 6.907125063846447e-06,
"loss": 0.2101,
"step": 3048
},
{
"epoch": 1.9767141009055629,
"grad_norm": 13.299867630004883,
"learning_rate": 6.891621780810941e-06,
"loss": 0.2207,
"step": 3056
},
{
"epoch": 1.981888745148771,
"grad_norm": 2.58492112159729,
"learning_rate": 6.876097244652879e-06,
"loss": 0.2233,
"step": 3064
},
{
"epoch": 1.9870633893919794,
"grad_norm": 4.740981101989746,
"learning_rate": 6.860551629796597e-06,
"loss": 0.2386,
"step": 3072
},
{
"epoch": 1.9922380336351875,
"grad_norm": 3.3356964588165283,
"learning_rate": 6.844985110903255e-06,
"loss": 0.208,
"step": 3080
},
{
"epoch": 1.9974126778783958,
"grad_norm": 11.826971054077148,
"learning_rate": 6.829397862868878e-06,
"loss": 0.2142,
"step": 3088
},
{
"epoch": 2.002587322121604,
"grad_norm": 5.242908477783203,
"learning_rate": 6.8137900608223985e-06,
"loss": 0.2276,
"step": 3096
},
{
"epoch": 2.0077619663648125,
"grad_norm": 7.4030327796936035,
"learning_rate": 6.798161880123671e-06,
"loss": 0.2199,
"step": 3104
},
{
"epoch": 2.012936610608021,
"grad_norm": 1.0129386186599731,
"learning_rate": 6.78251349636152e-06,
"loss": 0.2151,
"step": 3112
},
{
"epoch": 2.0181112548512288,
"grad_norm": 12.936131477355957,
"learning_rate": 6.766845085351755e-06,
"loss": 0.2103,
"step": 3120
},
{
"epoch": 2.023285899094437,
"grad_norm": 3.0860114097595215,
"learning_rate": 6.751156823135203e-06,
"loss": 0.2312,
"step": 3128
},
{
"epoch": 2.0284605433376455,
"grad_norm": 10.499250411987305,
"learning_rate": 6.735448885975724e-06,
"loss": 0.2236,
"step": 3136
},
{
"epoch": 2.033635187580854,
"grad_norm": 1.9619132280349731,
"learning_rate": 6.7197214503582355e-06,
"loss": 0.2222,
"step": 3144
},
{
"epoch": 2.038809831824062,
"grad_norm": 5.836687088012695,
"learning_rate": 6.703974692986729e-06,
"loss": 0.2057,
"step": 3152
},
{
"epoch": 2.0439844760672705,
"grad_norm": 1.3874237537384033,
"learning_rate": 6.68820879078228e-06,
"loss": 0.2332,
"step": 3160
},
{
"epoch": 2.049159120310479,
"grad_norm": 1.146099328994751,
"learning_rate": 6.672423920881068e-06,
"loss": 0.2266,
"step": 3168
},
{
"epoch": 2.0543337645536868,
"grad_norm": 3.993424654006958,
"learning_rate": 6.6566202606323806e-06,
"loss": 0.2172,
"step": 3176
},
{
"epoch": 2.059508408796895,
"grad_norm": 57.19075393676758,
"learning_rate": 6.640797987596621e-06,
"loss": 0.2056,
"step": 3184
},
{
"epoch": 2.0646830530401035,
"grad_norm": 2.535454511642456,
"learning_rate": 6.6249572795433155e-06,
"loss": 0.2082,
"step": 3192
},
{
"epoch": 2.069857697283312,
"grad_norm": 2.062868595123291,
"learning_rate": 6.609098314449116e-06,
"loss": 0.2182,
"step": 3200
},
{
"epoch": 2.07503234152652,
"grad_norm": 0.7799398303031921,
"learning_rate": 6.593221270495797e-06,
"loss": 0.2168,
"step": 3208
},
{
"epoch": 2.0802069857697285,
"grad_norm": 57.6228141784668,
"learning_rate": 6.5773263260682595e-06,
"loss": 0.2344,
"step": 3216
},
{
"epoch": 2.0853816300129364,
"grad_norm": 1.8435810804367065,
"learning_rate": 6.561413659752521e-06,
"loss": 0.2159,
"step": 3224
},
{
"epoch": 2.0905562742561448,
"grad_norm": 4.932186126708984,
"learning_rate": 6.545483450333712e-06,
"loss": 0.214,
"step": 3232
},
{
"epoch": 2.095730918499353,
"grad_norm": 2.2944324016571045,
"learning_rate": 6.529535876794069e-06,
"loss": 0.2212,
"step": 3240
},
{
"epoch": 2.1009055627425615,
"grad_norm": 2.7697372436523438,
"learning_rate": 6.5135711183109156e-06,
"loss": 0.2193,
"step": 3248
},
{
"epoch": 2.10608020698577,
"grad_norm": 2.9319217205047607,
"learning_rate": 6.497589354254662e-06,
"loss": 0.2292,
"step": 3256
},
{
"epoch": 2.111254851228978,
"grad_norm": 4.4201154708862305,
"learning_rate": 6.481590764186778e-06,
"loss": 0.2141,
"step": 3264
},
{
"epoch": 2.116429495472186,
"grad_norm": 14.810309410095215,
"learning_rate": 6.465575527857781e-06,
"loss": 0.1982,
"step": 3272
},
{
"epoch": 2.1216041397153944,
"grad_norm": 3.787808656692505,
"learning_rate": 6.44954382520522e-06,
"loss": 0.2116,
"step": 3280
},
{
"epoch": 2.1267787839586028,
"grad_norm": 7.30560827255249,
"learning_rate": 6.433495836351643e-06,
"loss": 0.2088,
"step": 3288
},
{
"epoch": 2.131953428201811,
"grad_norm": 1.6610970497131348,
"learning_rate": 6.417431741602585e-06,
"loss": 0.2189,
"step": 3296
},
{
"epoch": 2.1371280724450195,
"grad_norm": 3.0157978534698486,
"learning_rate": 6.401351721444533e-06,
"loss": 0.2197,
"step": 3304
},
{
"epoch": 2.142302716688228,
"grad_norm": 29.640609741210938,
"learning_rate": 6.385255956542907e-06,
"loss": 0.2209,
"step": 3312
},
{
"epoch": 2.147477360931436,
"grad_norm": 1.1481863260269165,
"learning_rate": 6.369144627740023e-06,
"loss": 0.2099,
"step": 3320
},
{
"epoch": 2.152652005174644,
"grad_norm": 4.599576473236084,
"learning_rate": 6.353017916053063e-06,
"loss": 0.2159,
"step": 3328
},
{
"epoch": 2.1578266494178524,
"grad_norm": 4.746689319610596,
"learning_rate": 6.336876002672042e-06,
"loss": 0.2289,
"step": 3336
},
{
"epoch": 2.1630012936610608,
"grad_norm": 14.463395118713379,
"learning_rate": 6.3207190689577745e-06,
"loss": 0.219,
"step": 3344
},
{
"epoch": 2.168175937904269,
"grad_norm": 7.709287166595459,
"learning_rate": 6.304547296439831e-06,
"loss": 0.2419,
"step": 3352
},
{
"epoch": 2.1733505821474774,
"grad_norm": 4.06973934173584,
"learning_rate": 6.288360866814504e-06,
"loss": 0.2434,
"step": 3360
},
{
"epoch": 2.178525226390686,
"grad_norm": 2.9332635402679443,
"learning_rate": 6.272159961942764e-06,
"loss": 0.2202,
"step": 3368
},
{
"epoch": 2.1836998706338937,
"grad_norm": 7.125377655029297,
"learning_rate": 6.255944763848215e-06,
"loss": 0.2049,
"step": 3376
},
{
"epoch": 2.188874514877102,
"grad_norm": 2.0361697673797607,
"learning_rate": 6.239715454715054e-06,
"loss": 0.2237,
"step": 3384
},
{
"epoch": 2.1940491591203104,
"grad_norm": 25.10670280456543,
"learning_rate": 6.223472216886021e-06,
"loss": 0.2088,
"step": 3392
},
{
"epoch": 2.1992238033635187,
"grad_norm": 16.464994430541992,
"learning_rate": 6.2072152328603464e-06,
"loss": 0.2224,
"step": 3400
},
{
"epoch": 2.204398447606727,
"grad_norm": 3.6457226276397705,
"learning_rate": 6.190944685291708e-06,
"loss": 0.2081,
"step": 3408
},
{
"epoch": 2.2095730918499354,
"grad_norm": 1.3341186046600342,
"learning_rate": 6.174660756986175e-06,
"loss": 0.211,
"step": 3416
},
{
"epoch": 2.214747736093144,
"grad_norm": 14.349130630493164,
"learning_rate": 6.158363630900155e-06,
"loss": 0.225,
"step": 3424
},
{
"epoch": 2.2199223803363517,
"grad_norm": 13.548636436462402,
"learning_rate": 6.142053490138335e-06,
"loss": 0.2251,
"step": 3432
},
{
"epoch": 2.22509702457956,
"grad_norm": 8.892589569091797,
"learning_rate": 6.1257305179516315e-06,
"loss": 0.2518,
"step": 3440
},
{
"epoch": 2.2302716688227684,
"grad_norm": 3.8122220039367676,
"learning_rate": 6.109394897735121e-06,
"loss": 0.2342,
"step": 3448
},
{
"epoch": 2.2354463130659767,
"grad_norm": 6.099566459655762,
"learning_rate": 6.093046813025995e-06,
"loss": 0.2175,
"step": 3456
},
{
"epoch": 2.240620957309185,
"grad_norm": 1.4928990602493286,
"learning_rate": 6.0766864475014785e-06,
"loss": 0.2383,
"step": 3464
},
{
"epoch": 2.2457956015523934,
"grad_norm": 11.29719352722168,
"learning_rate": 6.060313984976783e-06,
"loss": 0.2389,
"step": 3472
},
{
"epoch": 2.2509702457956013,
"grad_norm": 2.987386465072632,
"learning_rate": 6.043929609403032e-06,
"loss": 0.2113,
"step": 3480
},
{
"epoch": 2.2561448900388097,
"grad_norm": 2.351633310317993,
"learning_rate": 6.027533504865196e-06,
"loss": 0.2235,
"step": 3488
},
{
"epoch": 2.261319534282018,
"grad_norm": 5.436527729034424,
"learning_rate": 6.011125855580026e-06,
"loss": 0.2204,
"step": 3496
},
{
"epoch": 2.2664941785252264,
"grad_norm": 1.6124496459960938,
"learning_rate": 5.994706845893986e-06,
"loss": 0.2247,
"step": 3504
},
{
"epoch": 2.2716688227684347,
"grad_norm": 8.689626693725586,
"learning_rate": 5.978276660281174e-06,
"loss": 0.2345,
"step": 3512
},
{
"epoch": 2.276843467011643,
"grad_norm": 29.871366500854492,
"learning_rate": 5.961835483341255e-06,
"loss": 0.2154,
"step": 3520
},
{
"epoch": 2.2820181112548514,
"grad_norm": 11.956281661987305,
"learning_rate": 5.945383499797388e-06,
"loss": 0.2351,
"step": 3528
},
{
"epoch": 2.2871927554980593,
"grad_norm": 1.748079538345337,
"learning_rate": 5.928920894494147e-06,
"loss": 0.2083,
"step": 3536
},
{
"epoch": 2.2923673997412677,
"grad_norm": 22.06855010986328,
"learning_rate": 5.912447852395444e-06,
"loss": 0.2149,
"step": 3544
},
{
"epoch": 2.297542043984476,
"grad_norm": 7.1412529945373535,
"learning_rate": 5.8959645585824575e-06,
"loss": 0.2176,
"step": 3552
},
{
"epoch": 2.3027166882276844,
"grad_norm": 11.964238166809082,
"learning_rate": 5.879471198251544e-06,
"loss": 0.2235,
"step": 3560
},
{
"epoch": 2.3078913324708927,
"grad_norm": 4.59617280960083,
"learning_rate": 5.86296795671216e-06,
"loss": 0.2066,
"step": 3568
},
{
"epoch": 2.313065976714101,
"grad_norm": 27.847808837890625,
"learning_rate": 5.846455019384787e-06,
"loss": 0.2031,
"step": 3576
},
{
"epoch": 2.318240620957309,
"grad_norm": 1.6919810771942139,
"learning_rate": 5.8299325717988355e-06,
"loss": 0.2163,
"step": 3584
},
{
"epoch": 2.3234152652005173,
"grad_norm": 2.3351593017578125,
"learning_rate": 5.813400799590573e-06,
"loss": 0.2211,
"step": 3592
},
{
"epoch": 2.3285899094437257,
"grad_norm": 23.962520599365234,
"learning_rate": 5.7968598885010315e-06,
"loss": 0.2116,
"step": 3600
},
{
"epoch": 2.333764553686934,
"grad_norm": 19.117528915405273,
"learning_rate": 5.780310024373923e-06,
"loss": 0.2227,
"step": 3608
},
{
"epoch": 2.3389391979301424,
"grad_norm": 48.07025146484375,
"learning_rate": 5.763751393153545e-06,
"loss": 0.2183,
"step": 3616
},
{
"epoch": 2.3441138421733507,
"grad_norm": 1.309008240699768,
"learning_rate": 5.747184180882704e-06,
"loss": 0.2098,
"step": 3624
},
{
"epoch": 2.349288486416559,
"grad_norm": 2.3568334579467773,
"learning_rate": 5.730608573700613e-06,
"loss": 0.2062,
"step": 3632
},
{
"epoch": 2.354463130659767,
"grad_norm": 1.478174090385437,
"learning_rate": 5.714024757840806e-06,
"loss": 0.2175,
"step": 3640
},
{
"epoch": 2.3596377749029753,
"grad_norm": 4.405735492706299,
"learning_rate": 5.697432919629048e-06,
"loss": 0.2204,
"step": 3648
},
{
"epoch": 2.3648124191461837,
"grad_norm": 1.8726969957351685,
"learning_rate": 5.680833245481234e-06,
"loss": 0.2205,
"step": 3656
},
{
"epoch": 2.369987063389392,
"grad_norm": 2.596064805984497,
"learning_rate": 5.664225921901302e-06,
"loss": 0.2197,
"step": 3664
},
{
"epoch": 2.3751617076326004,
"grad_norm": 3.998558282852173,
"learning_rate": 5.647611135479133e-06,
"loss": 0.2132,
"step": 3672
},
{
"epoch": 2.3803363518758087,
"grad_norm": 23.586936950683594,
"learning_rate": 5.6309890728884555e-06,
"loss": 0.2174,
"step": 3680
},
{
"epoch": 2.3855109961190166,
"grad_norm": 41.73709487915039,
"learning_rate": 5.614359920884751e-06,
"loss": 0.2214,
"step": 3688
},
{
"epoch": 2.390685640362225,
"grad_norm": 25.809192657470703,
"learning_rate": 5.5977238663031495e-06,
"loss": 0.2193,
"step": 3696
},
{
"epoch": 2.3958602846054333,
"grad_norm": 2.3301281929016113,
"learning_rate": 5.581081096056337e-06,
"loss": 0.2192,
"step": 3704
},
{
"epoch": 2.4010349288486417,
"grad_norm": 2.47526478767395,
"learning_rate": 5.564431797132454e-06,
"loss": 0.2042,
"step": 3712
},
{
"epoch": 2.40620957309185,
"grad_norm": 24.38184928894043,
"learning_rate": 5.547776156592989e-06,
"loss": 0.2235,
"step": 3720
},
{
"epoch": 2.4113842173350584,
"grad_norm": 1.7953455448150635,
"learning_rate": 5.531114361570684e-06,
"loss": 0.231,
"step": 3728
},
{
"epoch": 2.4165588615782667,
"grad_norm": 2.250443935394287,
"learning_rate": 5.514446599267429e-06,
"loss": 0.2206,
"step": 3736
},
{
"epoch": 2.4217335058214746,
"grad_norm": 3.726274251937866,
"learning_rate": 5.497773056952159e-06,
"loss": 0.2133,
"step": 3744
},
{
"epoch": 2.426908150064683,
"grad_norm": 1.7468153238296509,
"learning_rate": 5.481093921958749e-06,
"loss": 0.2299,
"step": 3752
},
{
"epoch": 2.4320827943078913,
"grad_norm": 15.300987243652344,
"learning_rate": 5.4644093816839086e-06,
"loss": 0.2238,
"step": 3760
},
{
"epoch": 2.4372574385510997,
"grad_norm": 1.6782792806625366,
"learning_rate": 5.44771962358508e-06,
"loss": 0.2315,
"step": 3768
},
{
"epoch": 2.442432082794308,
"grad_norm": 18.028644561767578,
"learning_rate": 5.4310248351783264e-06,
"loss": 0.2366,
"step": 3776
},
{
"epoch": 2.4476067270375164,
"grad_norm": 4.083730220794678,
"learning_rate": 5.414325204036237e-06,
"loss": 0.207,
"step": 3784
},
{
"epoch": 2.4527813712807243,
"grad_norm": 1.102040410041809,
"learning_rate": 5.397620917785799e-06,
"loss": 0.2198,
"step": 3792
},
{
"epoch": 2.4579560155239326,
"grad_norm": 13.378700256347656,
"learning_rate": 5.380912164106312e-06,
"loss": 0.2193,
"step": 3800
},
{
"epoch": 2.463130659767141,
"grad_norm": 1.2787953615188599,
"learning_rate": 5.364199130727262e-06,
"loss": 0.2146,
"step": 3808
},
{
"epoch": 2.4683053040103493,
"grad_norm": 5.001540184020996,
"learning_rate": 5.347482005426224e-06,
"loss": 0.2128,
"step": 3816
},
{
"epoch": 2.4734799482535577,
"grad_norm": 32.12523651123047,
"learning_rate": 5.330760976026744e-06,
"loss": 0.2146,
"step": 3824
},
{
"epoch": 2.478654592496766,
"grad_norm": 2.327051877975464,
"learning_rate": 5.314036230396233e-06,
"loss": 0.2224,
"step": 3832
},
{
"epoch": 2.4838292367399744,
"grad_norm": 26.68153953552246,
"learning_rate": 5.297307956443856e-06,
"loss": 0.2238,
"step": 3840
},
{
"epoch": 2.4890038809831823,
"grad_norm": 2.472919225692749,
"learning_rate": 5.28057634211842e-06,
"loss": 0.2116,
"step": 3848
},
{
"epoch": 2.4941785252263906,
"grad_norm": 90.6447982788086,
"learning_rate": 5.2638415754062625e-06,
"loss": 0.2207,
"step": 3856
},
{
"epoch": 2.499353169469599,
"grad_norm": 1.0954121351242065,
"learning_rate": 5.247103844329137e-06,
"loss": 0.2277,
"step": 3864
},
{
"epoch": 2.5045278137128073,
"grad_norm": 1.563537836074829,
"learning_rate": 5.230363336942105e-06,
"loss": 0.2093,
"step": 3872
},
{
"epoch": 2.5097024579560157,
"grad_norm": 42.30322265625,
"learning_rate": 5.213620241331424e-06,
"loss": 0.2162,
"step": 3880
},
{
"epoch": 2.5148771021992236,
"grad_norm": 1.3426226377487183,
"learning_rate": 5.196874745612425e-06,
"loss": 0.2232,
"step": 3888
},
{
"epoch": 2.520051746442432,
"grad_norm": 9.39691162109375,
"learning_rate": 5.180127037927408e-06,
"loss": 0.2242,
"step": 3896
},
{
"epoch": 2.5252263906856403,
"grad_norm": 4.963808536529541,
"learning_rate": 5.163377306443527e-06,
"loss": 0.2156,
"step": 3904
},
{
"epoch": 2.5304010349288486,
"grad_norm": 5.381854057312012,
"learning_rate": 5.146625739350671e-06,
"loss": 0.2267,
"step": 3912
},
{
"epoch": 2.535575679172057,
"grad_norm": 11.522015571594238,
"learning_rate": 5.129872524859356e-06,
"loss": 0.2337,
"step": 3920
},
{
"epoch": 2.5407503234152653,
"grad_norm": 1.7935131788253784,
"learning_rate": 5.1131178511986045e-06,
"loss": 0.2213,
"step": 3928
},
{
"epoch": 2.5459249676584736,
"grad_norm": 3.484163284301758,
"learning_rate": 5.096361906613836e-06,
"loss": 0.2215,
"step": 3936
},
{
"epoch": 2.551099611901682,
"grad_norm": 1.6095975637435913,
"learning_rate": 5.079604879364746e-06,
"loss": 0.2164,
"step": 3944
},
{
"epoch": 2.55627425614489,
"grad_norm": 1.0419116020202637,
"learning_rate": 5.062846957723194e-06,
"loss": 0.2071,
"step": 3952
},
{
"epoch": 2.5614489003880982,
"grad_norm": 2.196648359298706,
"learning_rate": 5.046088329971095e-06,
"loss": 0.2071,
"step": 3960
},
{
"epoch": 2.5666235446313066,
"grad_norm": 11.791790962219238,
"learning_rate": 5.0293291843982896e-06,
"loss": 0.2325,
"step": 3968
},
{
"epoch": 2.571798188874515,
"grad_norm": 3.8279497623443604,
"learning_rate": 5.012569709300441e-06,
"loss": 0.2219,
"step": 3976
},
{
"epoch": 2.5769728331177233,
"grad_norm": 6.66563081741333,
"learning_rate": 4.995810092976912e-06,
"loss": 0.2211,
"step": 3984
},
{
"epoch": 2.582147477360931,
"grad_norm": 2.60664701461792,
"learning_rate": 4.979050523728654e-06,
"loss": 0.2128,
"step": 3992
},
{
"epoch": 2.5873221216041395,
"grad_norm": 7.721459865570068,
"learning_rate": 4.962291189856089e-06,
"loss": 0.2089,
"step": 4000
},
{
"epoch": 2.592496765847348,
"grad_norm": 4.121895790100098,
"learning_rate": 4.945532279656993e-06,
"loss": 0.2183,
"step": 4008
},
{
"epoch": 2.5976714100905562,
"grad_norm": 4.680197238922119,
"learning_rate": 4.9287739814243835e-06,
"loss": 0.2173,
"step": 4016
},
{
"epoch": 2.6028460543337646,
"grad_norm": 3.2336695194244385,
"learning_rate": 4.912016483444403e-06,
"loss": 0.2069,
"step": 4024
},
{
"epoch": 2.608020698576973,
"grad_norm": 5.2549309730529785,
"learning_rate": 4.8952599739942015e-06,
"loss": 0.2342,
"step": 4032
},
{
"epoch": 2.6131953428201813,
"grad_norm": 1.6960891485214233,
"learning_rate": 4.878504641339822e-06,
"loss": 0.2158,
"step": 4040
},
{
"epoch": 2.6183699870633896,
"grad_norm": 11.856524467468262,
"learning_rate": 4.861750673734085e-06,
"loss": 0.2135,
"step": 4048
},
{
"epoch": 2.6235446313065975,
"grad_norm": 25.278499603271484,
"learning_rate": 4.8449982594144786e-06,
"loss": 0.2054,
"step": 4056
},
{
"epoch": 2.628719275549806,
"grad_norm": 9.263405799865723,
"learning_rate": 4.828247586601035e-06,
"loss": 0.2099,
"step": 4064
},
{
"epoch": 2.6338939197930142,
"grad_norm": 5.896005153656006,
"learning_rate": 4.811498843494222e-06,
"loss": 0.207,
"step": 4072
},
{
"epoch": 2.6390685640362226,
"grad_norm": 1.7929736375808716,
"learning_rate": 4.794752218272824e-06,
"loss": 0.2267,
"step": 4080
},
{
"epoch": 2.644243208279431,
"grad_norm": 1.0446430444717407,
"learning_rate": 4.7780078990918326e-06,
"loss": 0.2206,
"step": 4088
},
{
"epoch": 2.649417852522639,
"grad_norm": 5.339548110961914,
"learning_rate": 4.761266074080326e-06,
"loss": 0.2187,
"step": 4096
},
{
"epoch": 2.654592496765847,
"grad_norm": 14.899389266967773,
"learning_rate": 4.744526931339367e-06,
"loss": 0.207,
"step": 4104
},
{
"epoch": 2.6597671410090555,
"grad_norm": 1.1871739625930786,
"learning_rate": 4.727790658939875e-06,
"loss": 0.2211,
"step": 4112
},
{
"epoch": 2.664941785252264,
"grad_norm": 1.4944238662719727,
"learning_rate": 4.711057444920522e-06,
"loss": 0.2206,
"step": 4120
},
{
"epoch": 2.6701164294954722,
"grad_norm": 62.428558349609375,
"learning_rate": 4.694327477285619e-06,
"loss": 0.2163,
"step": 4128
},
{
"epoch": 2.6752910737386806,
"grad_norm": 9.660483360290527,
"learning_rate": 4.6776009440030035e-06,
"loss": 0.2123,
"step": 4136
},
{
"epoch": 2.680465717981889,
"grad_norm": 7.151432991027832,
"learning_rate": 4.660878033001922e-06,
"loss": 0.2163,
"step": 4144
},
{
"epoch": 2.6856403622250973,
"grad_norm": 8.276389122009277,
"learning_rate": 4.644158932170929e-06,
"loss": 0.2239,
"step": 4152
},
{
"epoch": 2.690815006468305,
"grad_norm": 9.777862548828125,
"learning_rate": 4.627443829355765e-06,
"loss": 0.22,
"step": 4160
},
{
"epoch": 2.6959896507115135,
"grad_norm": 3.226142168045044,
"learning_rate": 4.610732912357256e-06,
"loss": 0.2278,
"step": 4168
},
{
"epoch": 2.701164294954722,
"grad_norm": 4.0428385734558105,
"learning_rate": 4.5940263689291955e-06,
"loss": 0.2135,
"step": 4176
},
{
"epoch": 2.7063389391979302,
"grad_norm": 2.2790136337280273,
"learning_rate": 4.57732438677624e-06,
"loss": 0.2022,
"step": 4184
},
{
"epoch": 2.7115135834411386,
"grad_norm": 8.921182632446289,
"learning_rate": 4.560627153551795e-06,
"loss": 0.2195,
"step": 4192
},
{
"epoch": 2.7166882276843465,
"grad_norm": 1.772870421409607,
"learning_rate": 4.543934856855913e-06,
"loss": 0.2088,
"step": 4200
},
{
"epoch": 2.721862871927555,
"grad_norm": 3.5123379230499268,
"learning_rate": 4.527247684233185e-06,
"loss": 0.2105,
"step": 4208
},
{
"epoch": 2.727037516170763,
"grad_norm": 2.6325693130493164,
"learning_rate": 4.510565823170625e-06,
"loss": 0.2158,
"step": 4216
},
{
"epoch": 2.7322121604139715,
"grad_norm": 79.38518524169922,
"learning_rate": 4.493889461095574e-06,
"loss": 0.2012,
"step": 4224
},
{
"epoch": 2.73738680465718,
"grad_norm": 12.619338035583496,
"learning_rate": 4.477218785373587e-06,
"loss": 0.2151,
"step": 4232
},
{
"epoch": 2.742561448900388,
"grad_norm": 1.3559527397155762,
"learning_rate": 4.460553983306332e-06,
"loss": 0.2048,
"step": 4240
},
{
"epoch": 2.7477360931435966,
"grad_norm": 9.754837036132812,
"learning_rate": 4.443895242129484e-06,
"loss": 0.2134,
"step": 4248
},
{
"epoch": 2.752910737386805,
"grad_norm": 4.612194538116455,
"learning_rate": 4.4272427490106215e-06,
"loss": 0.2063,
"step": 4256
},
{
"epoch": 2.758085381630013,
"grad_norm": 5.114107608795166,
"learning_rate": 4.410596691047123e-06,
"loss": 0.2185,
"step": 4264
},
{
"epoch": 2.763260025873221,
"grad_norm": 9.316654205322266,
"learning_rate": 4.3939572552640645e-06,
"loss": 0.2153,
"step": 4272
},
{
"epoch": 2.7684346701164295,
"grad_norm": 6.500330448150635,
"learning_rate": 4.377324628612123e-06,
"loss": 0.2101,
"step": 4280
},
{
"epoch": 2.773609314359638,
"grad_norm": 1.7955437898635864,
"learning_rate": 4.36069899796547e-06,
"loss": 0.2072,
"step": 4288
},
{
"epoch": 2.778783958602846,
"grad_norm": 113.924072265625,
"learning_rate": 4.344080550119672e-06,
"loss": 0.2066,
"step": 4296
},
{
"epoch": 2.783958602846054,
"grad_norm": 2.6472039222717285,
"learning_rate": 4.327469471789597e-06,
"loss": 0.2122,
"step": 4304
},
{
"epoch": 2.7891332470892625,
"grad_norm": 7.95417594909668,
"learning_rate": 4.310865949607311e-06,
"loss": 0.1984,
"step": 4312
},
{
"epoch": 2.794307891332471,
"grad_norm": 2.1271450519561768,
"learning_rate": 4.294270170119987e-06,
"loss": 0.2263,
"step": 4320
},
{
"epoch": 2.799482535575679,
"grad_norm": 1.0342472791671753,
"learning_rate": 4.277682319787802e-06,
"loss": 0.2248,
"step": 4328
},
{
"epoch": 2.8046571798188875,
"grad_norm": 143.9209747314453,
"learning_rate": 4.261102584981848e-06,
"loss": 0.2026,
"step": 4336
},
{
"epoch": 2.809831824062096,
"grad_norm": 7.666977405548096,
"learning_rate": 4.244531151982034e-06,
"loss": 0.2195,
"step": 4344
},
{
"epoch": 2.815006468305304,
"grad_norm": 123.94723510742188,
"learning_rate": 4.227968206974999e-06,
"loss": 0.2207,
"step": 4352
},
{
"epoch": 2.8201811125485126,
"grad_norm": 52.61326599121094,
"learning_rate": 4.211413936052013e-06,
"loss": 0.2026,
"step": 4360
},
{
"epoch": 2.8253557567917205,
"grad_norm": 6.78623628616333,
"learning_rate": 4.194868525206887e-06,
"loss": 0.2131,
"step": 4368
},
{
"epoch": 2.830530401034929,
"grad_norm": 1.9670113325119019,
"learning_rate": 4.178332160333891e-06,
"loss": 0.2268,
"step": 4376
},
{
"epoch": 2.835705045278137,
"grad_norm": 1.38176691532135,
"learning_rate": 4.161805027225655e-06,
"loss": 0.2192,
"step": 4384
},
{
"epoch": 2.8408796895213455,
"grad_norm": 3.1774935722351074,
"learning_rate": 4.145287311571089e-06,
"loss": 0.2164,
"step": 4392
},
{
"epoch": 2.8460543337645534,
"grad_norm": 4.968908786773682,
"learning_rate": 4.1287791989532935e-06,
"loss": 0.2089,
"step": 4400
},
{
"epoch": 2.8512289780077618,
"grad_norm": 16.97532081604004,
"learning_rate": 4.1122808748474745e-06,
"loss": 0.2143,
"step": 4408
},
{
"epoch": 2.85640362225097,
"grad_norm": 0.9508041739463806,
"learning_rate": 4.095792524618861e-06,
"loss": 0.2205,
"step": 4416
},
{
"epoch": 2.8615782664941785,
"grad_norm": 15.078865051269531,
"learning_rate": 4.079314333520623e-06,
"loss": 0.2224,
"step": 4424
},
{
"epoch": 2.866752910737387,
"grad_norm": 5.82922887802124,
"learning_rate": 4.062846486691784e-06,
"loss": 0.1991,
"step": 4432
},
{
"epoch": 2.871927554980595,
"grad_norm": 1.030604362487793,
"learning_rate": 4.04638916915515e-06,
"loss": 0.2134,
"step": 4440
},
{
"epoch": 2.8771021992238035,
"grad_norm": 1.6634279489517212,
"learning_rate": 4.0299425658152255e-06,
"loss": 0.2113,
"step": 4448
},
{
"epoch": 2.882276843467012,
"grad_norm": 28.07861328125,
"learning_rate": 4.013506861456136e-06,
"loss": 0.2113,
"step": 4456
},
{
"epoch": 2.88745148771022,
"grad_norm": 6.467888355255127,
"learning_rate": 3.997082240739551e-06,
"loss": 0.2299,
"step": 4464
},
{
"epoch": 2.892626131953428,
"grad_norm": 2.268150568008423,
"learning_rate": 3.9806688882026125e-06,
"loss": 0.2134,
"step": 4472
},
{
"epoch": 2.8978007761966365,
"grad_norm": 19.535581588745117,
"learning_rate": 3.964266988255861e-06,
"loss": 0.2224,
"step": 4480
},
{
"epoch": 2.902975420439845,
"grad_norm": 3.764432907104492,
"learning_rate": 3.94787672518116e-06,
"loss": 0.2122,
"step": 4488
},
{
"epoch": 2.908150064683053,
"grad_norm": 27.98623275756836,
"learning_rate": 3.931498283129631e-06,
"loss": 0.2009,
"step": 4496
},
{
"epoch": 2.913324708926261,
"grad_norm": 1.0380629301071167,
"learning_rate": 3.915131846119581e-06,
"loss": 0.2076,
"step": 4504
},
{
"epoch": 2.9184993531694694,
"grad_norm": 3.117368698120117,
"learning_rate": 3.898777598034434e-06,
"loss": 0.2179,
"step": 4512
},
{
"epoch": 2.9236739974126777,
"grad_norm": 13.296104431152344,
"learning_rate": 3.882435722620667e-06,
"loss": 0.2045,
"step": 4520
},
{
"epoch": 2.928848641655886,
"grad_norm": 2.6790828704833984,
"learning_rate": 3.866106403485745e-06,
"loss": 0.2138,
"step": 4528
},
{
"epoch": 2.9340232858990944,
"grad_norm": 0.8345991373062134,
"learning_rate": 3.849789824096061e-06,
"loss": 0.1957,
"step": 4536
},
{
"epoch": 2.939197930142303,
"grad_norm": 8.058691024780273,
"learning_rate": 3.833486167774867e-06,
"loss": 0.2193,
"step": 4544
},
{
"epoch": 2.944372574385511,
"grad_norm": 2.5544962882995605,
"learning_rate": 3.817195617700224e-06,
"loss": 0.2215,
"step": 4552
},
{
"epoch": 2.9495472186287195,
"grad_norm": 2.118175983428955,
"learning_rate": 3.800918356902936e-06,
"loss": 0.2082,
"step": 4560
},
{
"epoch": 2.9547218628719274,
"grad_norm": 1.6129382848739624,
"learning_rate": 3.784654568264497e-06,
"loss": 0.2148,
"step": 4568
},
{
"epoch": 2.9598965071151357,
"grad_norm": 14.738019943237305,
"learning_rate": 3.768404434515038e-06,
"loss": 0.216,
"step": 4576
},
{
"epoch": 2.965071151358344,
"grad_norm": 10.588520050048828,
"learning_rate": 3.7521681382312693e-06,
"loss": 0.2179,
"step": 4584
},
{
"epoch": 2.9702457956015524,
"grad_norm": 26.88380241394043,
"learning_rate": 3.735945861834434e-06,
"loss": 0.2132,
"step": 4592
},
{
"epoch": 2.975420439844761,
"grad_norm": 8.701356887817383,
"learning_rate": 3.7197377875882547e-06,
"loss": 0.2174,
"step": 4600
},
{
"epoch": 2.9805950840879687,
"grad_norm": 9.462160110473633,
"learning_rate": 3.703544097596887e-06,
"loss": 0.2296,
"step": 4608
},
{
"epoch": 2.985769728331177,
"grad_norm": 7.985735893249512,
"learning_rate": 3.6873649738028737e-06,
"loss": 0.2121,
"step": 4616
},
{
"epoch": 2.9909443725743854,
"grad_norm": 24.12420654296875,
"learning_rate": 3.671200597985104e-06,
"loss": 0.206,
"step": 4624
},
{
"epoch": 2.9961190168175937,
"grad_norm": 1.4237028360366821,
"learning_rate": 3.655051151756762e-06,
"loss": 0.2072,
"step": 4632
},
{
"epoch": 3.001293661060802,
"grad_norm": 4.103756427764893,
"learning_rate": 3.638916816563298e-06,
"loss": 0.1977,
"step": 4640
},
{
"epoch": 3.0064683053040104,
"grad_norm": 2.717452049255371,
"learning_rate": 3.622797773680379e-06,
"loss": 0.2233,
"step": 4648
},
{
"epoch": 3.011642949547219,
"grad_norm": 3.143430233001709,
"learning_rate": 3.6066942042118568e-06,
"loss": 0.2246,
"step": 4656
},
{
"epoch": 3.0168175937904267,
"grad_norm": 0.8686035871505737,
"learning_rate": 3.5906062890877368e-06,
"loss": 0.2112,
"step": 4664
},
{
"epoch": 3.021992238033635,
"grad_norm": 7.7689056396484375,
"learning_rate": 3.5745342090621406e-06,
"loss": 0.2288,
"step": 4672
},
{
"epoch": 3.0271668822768434,
"grad_norm": 7.347503185272217,
"learning_rate": 3.5584781447112737e-06,
"loss": 0.1989,
"step": 4680
},
{
"epoch": 3.0323415265200517,
"grad_norm": 29.066707611083984,
"learning_rate": 3.542438276431401e-06,
"loss": 0.1981,
"step": 4688
},
{
"epoch": 3.03751617076326,
"grad_norm": 16.907032012939453,
"learning_rate": 3.526414784436819e-06,
"loss": 0.2241,
"step": 4696
},
{
"epoch": 3.0426908150064684,
"grad_norm": 50.19180679321289,
"learning_rate": 3.510407848757828e-06,
"loss": 0.2103,
"step": 4704
},
{
"epoch": 3.047865459249677,
"grad_norm": 3.969433069229126,
"learning_rate": 3.494417649238713e-06,
"loss": 0.2084,
"step": 4712
},
{
"epoch": 3.0530401034928847,
"grad_norm": 1.613051176071167,
"learning_rate": 3.47844436553572e-06,
"loss": 0.207,
"step": 4720
},
{
"epoch": 3.058214747736093,
"grad_norm": 15.627549171447754,
"learning_rate": 3.462488177115041e-06,
"loss": 0.2232,
"step": 4728
},
{
"epoch": 3.0633893919793014,
"grad_norm": 4.300905704498291,
"learning_rate": 3.4465492632507946e-06,
"loss": 0.2122,
"step": 4736
},
{
"epoch": 3.0685640362225097,
"grad_norm": 7.382449150085449,
"learning_rate": 3.4306278030230143e-06,
"loss": 0.2146,
"step": 4744
},
{
"epoch": 3.073738680465718,
"grad_norm": 2.4655721187591553,
"learning_rate": 3.4147239753156324e-06,
"loss": 0.2172,
"step": 4752
},
{
"epoch": 3.0789133247089264,
"grad_norm": 3.6668355464935303,
"learning_rate": 3.398837958814475e-06,
"loss": 0.2068,
"step": 4760
},
{
"epoch": 3.0840879689521343,
"grad_norm": 2.1171956062316895,
"learning_rate": 3.382969932005252e-06,
"loss": 0.2049,
"step": 4768
},
{
"epoch": 3.0892626131953427,
"grad_norm": 2.6610488891601562,
"learning_rate": 3.367120073171548e-06,
"loss": 0.2132,
"step": 4776
},
{
"epoch": 3.094437257438551,
"grad_norm": 3.1115005016326904,
"learning_rate": 3.351288560392833e-06,
"loss": 0.2113,
"step": 4784
},
{
"epoch": 3.0996119016817594,
"grad_norm": 39.48991775512695,
"learning_rate": 3.335475571542442e-06,
"loss": 0.1985,
"step": 4792
},
{
"epoch": 3.1047865459249677,
"grad_norm": 4.187602996826172,
"learning_rate": 3.3196812842855895e-06,
"loss": 0.2209,
"step": 4800
},
{
"epoch": 3.109961190168176,
"grad_norm": 2.5106544494628906,
"learning_rate": 3.303905876077372e-06,
"loss": 0.2136,
"step": 4808
},
{
"epoch": 3.1151358344113844,
"grad_norm": 5.031343460083008,
"learning_rate": 3.28814952416077e-06,
"loss": 0.2079,
"step": 4816
},
{
"epoch": 3.1203104786545923,
"grad_norm": 4.405430316925049,
"learning_rate": 3.272412405564659e-06,
"loss": 0.2209,
"step": 4824
},
{
"epoch": 3.1254851228978007,
"grad_norm": 4.106354713439941,
"learning_rate": 3.2566946971018225e-06,
"loss": 0.2219,
"step": 4832
},
{
"epoch": 3.130659767141009,
"grad_norm": 74.13800811767578,
"learning_rate": 3.240996575366961e-06,
"loss": 0.2264,
"step": 4840
},
{
"epoch": 3.1358344113842174,
"grad_norm": 2.211841344833374,
"learning_rate": 3.225318216734713e-06,
"loss": 0.2095,
"step": 4848
},
{
"epoch": 3.1410090556274257,
"grad_norm": 5.970486164093018,
"learning_rate": 3.209659797357669e-06,
"loss": 0.2156,
"step": 4856
},
{
"epoch": 3.146183699870634,
"grad_norm": 2.485638380050659,
"learning_rate": 3.1940214931643945e-06,
"loss": 0.2137,
"step": 4864
},
{
"epoch": 3.151358344113842,
"grad_norm": 1.4314054250717163,
"learning_rate": 3.1784034798574514e-06,
"loss": 0.2071,
"step": 4872
},
{
"epoch": 3.1565329883570503,
"grad_norm": 2.9645638465881348,
"learning_rate": 3.1628059329114286e-06,
"loss": 0.2172,
"step": 4880
},
{
"epoch": 3.1617076326002587,
"grad_norm": 2.3624343872070312,
"learning_rate": 3.1472290275709642e-06,
"loss": 0.2201,
"step": 4888
},
{
"epoch": 3.166882276843467,
"grad_norm": 8.607010841369629,
"learning_rate": 3.1316729388487815e-06,
"loss": 0.2092,
"step": 4896
},
{
"epoch": 3.1720569210866754,
"grad_norm": 1.539337396621704,
"learning_rate": 3.1161378415237197e-06,
"loss": 0.2105,
"step": 4904
},
{
"epoch": 3.1772315653298837,
"grad_norm": 2.8210718631744385,
"learning_rate": 3.1006239101387725e-06,
"loss": 0.2279,
"step": 4912
},
{
"epoch": 3.1824062095730916,
"grad_norm": 2.121821641921997,
"learning_rate": 3.0851313189991226e-06,
"loss": 0.2033,
"step": 4920
},
{
"epoch": 3.1875808538163,
"grad_norm": 1.296933650970459,
"learning_rate": 3.0696602421701943e-06,
"loss": 0.2021,
"step": 4928
},
{
"epoch": 3.1927554980595083,
"grad_norm": 6.50001335144043,
"learning_rate": 3.054210853475682e-06,
"loss": 0.209,
"step": 4936
},
{
"epoch": 3.1979301423027167,
"grad_norm": 4.615538120269775,
"learning_rate": 3.0387833264956078e-06,
"loss": 0.2133,
"step": 4944
},
{
"epoch": 3.203104786545925,
"grad_norm": 2.2612783908843994,
"learning_rate": 3.02337783456437e-06,
"loss": 0.2207,
"step": 4952
},
{
"epoch": 3.2082794307891334,
"grad_norm": 5.742753028869629,
"learning_rate": 3.007994550768793e-06,
"loss": 0.2244,
"step": 4960
},
{
"epoch": 3.2134540750323417,
"grad_norm": 2.2950289249420166,
"learning_rate": 2.9926336479461846e-06,
"loss": 0.2055,
"step": 4968
},
{
"epoch": 3.2186287192755496,
"grad_norm": 1.1664949655532837,
"learning_rate": 2.9772952986823943e-06,
"loss": 0.2003,
"step": 4976
},
{
"epoch": 3.223803363518758,
"grad_norm": 16.022438049316406,
"learning_rate": 2.9619796753098716e-06,
"loss": 0.2171,
"step": 4984
},
{
"epoch": 3.2289780077619663,
"grad_norm": 1.7189433574676514,
"learning_rate": 2.946686949905733e-06,
"loss": 0.2308,
"step": 4992
},
{
"epoch": 3.2341526520051747,
"grad_norm": 9.547940254211426,
"learning_rate": 2.9314172942898257e-06,
"loss": 0.2124,
"step": 5000
},
{
"epoch": 3.239327296248383,
"grad_norm": 2.51373028755188,
"learning_rate": 2.9161708800228e-06,
"loss": 0.196,
"step": 5008
},
{
"epoch": 3.2445019404915914,
"grad_norm": 2.1994516849517822,
"learning_rate": 2.900947878404181e-06,
"loss": 0.2283,
"step": 5016
},
{
"epoch": 3.2496765847347993,
"grad_norm": 3.118130922317505,
"learning_rate": 2.8857484604704415e-06,
"loss": 0.2067,
"step": 5024
},
{
"epoch": 3.2548512289780076,
"grad_norm": 11.828572273254395,
"learning_rate": 2.870572796993084e-06,
"loss": 0.1918,
"step": 5032
},
{
"epoch": 3.260025873221216,
"grad_norm": 9.986909866333008,
"learning_rate": 2.8554210584767188e-06,
"loss": 0.2205,
"step": 5040
},
{
"epoch": 3.2652005174644243,
"grad_norm": 10.845985412597656,
"learning_rate": 2.8402934151571505e-06,
"loss": 0.2055,
"step": 5048
},
{
"epoch": 3.2703751617076326,
"grad_norm": 6.319619655609131,
"learning_rate": 2.8251900369994645e-06,
"loss": 0.2106,
"step": 5056
},
{
"epoch": 3.275549805950841,
"grad_norm": 6.275879859924316,
"learning_rate": 2.8101110936961153e-06,
"loss": 0.2055,
"step": 5064
},
{
"epoch": 3.2807244501940493,
"grad_norm": 56.02274703979492,
"learning_rate": 2.795056754665028e-06,
"loss": 0.2066,
"step": 5072
},
{
"epoch": 3.2858990944372573,
"grad_norm": 4.814873218536377,
"learning_rate": 2.7800271890476836e-06,
"loss": 0.2145,
"step": 5080
},
{
"epoch": 3.2910737386804656,
"grad_norm": 2.2435498237609863,
"learning_rate": 2.765022565707226e-06,
"loss": 0.2214,
"step": 5088
},
{
"epoch": 3.296248382923674,
"grad_norm": 13.148270606994629,
"learning_rate": 2.750043053226561e-06,
"loss": 0.2017,
"step": 5096
},
{
"epoch": 3.3014230271668823,
"grad_norm": 8.37467098236084,
"learning_rate": 2.735088819906465e-06,
"loss": 0.202,
"step": 5104
},
{
"epoch": 3.3065976714100906,
"grad_norm": 17.98732566833496,
"learning_rate": 2.7201600337636946e-06,
"loss": 0.2121,
"step": 5112
},
{
"epoch": 3.311772315653299,
"grad_norm": 6.076496601104736,
"learning_rate": 2.7052568625290955e-06,
"loss": 0.2187,
"step": 5120
},
{
"epoch": 3.316946959896507,
"grad_norm": 2.1653237342834473,
"learning_rate": 2.690379473645718e-06,
"loss": 0.2119,
"step": 5128
},
{
"epoch": 3.3221216041397152,
"grad_norm": 10.182047843933105,
"learning_rate": 2.675528034266941e-06,
"loss": 0.2204,
"step": 5136
},
{
"epoch": 3.3272962483829236,
"grad_norm": 29.412364959716797,
"learning_rate": 2.6607027112545893e-06,
"loss": 0.2093,
"step": 5144
},
{
"epoch": 3.332470892626132,
"grad_norm": 4.263775825500488,
"learning_rate": 2.645903671177058e-06,
"loss": 0.2191,
"step": 5152
},
{
"epoch": 3.3376455368693403,
"grad_norm": 30.59326934814453,
"learning_rate": 2.631131080307445e-06,
"loss": 0.2026,
"step": 5160
},
{
"epoch": 3.3428201811125486,
"grad_norm": 5.779555320739746,
"learning_rate": 2.6163851046216813e-06,
"loss": 0.2137,
"step": 5168
},
{
"epoch": 3.347994825355757,
"grad_norm": 3.681560754776001,
"learning_rate": 2.6016659097966636e-06,
"loss": 0.2146,
"step": 5176
},
{
"epoch": 3.353169469598965,
"grad_norm": 1.4380924701690674,
"learning_rate": 2.5869736612083955e-06,
"loss": 0.2087,
"step": 5184
},
{
"epoch": 3.3583441138421732,
"grad_norm": 12.789270401000977,
"learning_rate": 2.572308523930131e-06,
"loss": 0.216,
"step": 5192
},
{
"epoch": 3.3635187580853816,
"grad_norm": 17.20673370361328,
"learning_rate": 2.557670662730515e-06,
"loss": 0.2145,
"step": 5200
},
{
"epoch": 3.36869340232859,
"grad_norm": 1.4245859384536743,
"learning_rate": 2.5430602420717355e-06,
"loss": 0.2107,
"step": 5208
},
{
"epoch": 3.3738680465717983,
"grad_norm": 13.390450477600098,
"learning_rate": 2.528477426107678e-06,
"loss": 0.204,
"step": 5216
},
{
"epoch": 3.3790426908150066,
"grad_norm": 1.8627618551254272,
"learning_rate": 2.513922378682075e-06,
"loss": 0.2112,
"step": 5224
},
{
"epoch": 3.3842173350582145,
"grad_norm": 1.2581387758255005,
"learning_rate": 2.499395263326669e-06,
"loss": 0.2056,
"step": 5232
},
{
"epoch": 3.389391979301423,
"grad_norm": 1.6016255617141724,
"learning_rate": 2.484896243259375e-06,
"loss": 0.2077,
"step": 5240
},
{
"epoch": 3.3945666235446312,
"grad_norm": 6.12626314163208,
"learning_rate": 2.470425481382447e-06,
"loss": 0.2113,
"step": 5248
},
{
"epoch": 3.3997412677878396,
"grad_norm": 2.2390005588531494,
"learning_rate": 2.4559831402806454e-06,
"loss": 0.2097,
"step": 5256
},
{
"epoch": 3.404915912031048,
"grad_norm": 5.566039085388184,
"learning_rate": 2.441569382219413e-06,
"loss": 0.2065,
"step": 5264
},
{
"epoch": 3.4100905562742563,
"grad_norm": 1.4189672470092773,
"learning_rate": 2.427184369143051e-06,
"loss": 0.2182,
"step": 5272
},
{
"epoch": 3.4152652005174646,
"grad_norm": 22.4144287109375,
"learning_rate": 2.4128282626728985e-06,
"loss": 0.2052,
"step": 5280
},
{
"epoch": 3.4204398447606725,
"grad_norm": 2.110011339187622,
"learning_rate": 2.398501224105517e-06,
"loss": 0.2091,
"step": 5288
},
{
"epoch": 3.425614489003881,
"grad_norm": 2.668170928955078,
"learning_rate": 2.384203414410878e-06,
"loss": 0.2092,
"step": 5296
},
{
"epoch": 3.4307891332470892,
"grad_norm": 3.0023293495178223,
"learning_rate": 2.3699349942305603e-06,
"loss": 0.2116,
"step": 5304
},
{
"epoch": 3.4359637774902976,
"grad_norm": 4.757721900939941,
"learning_rate": 2.355696123875934e-06,
"loss": 0.2025,
"step": 5312
},
{
"epoch": 3.441138421733506,
"grad_norm": 19.3017635345459,
"learning_rate": 2.341486963326366e-06,
"loss": 0.2227,
"step": 5320
},
{
"epoch": 3.4463130659767143,
"grad_norm": 1.613916039466858,
"learning_rate": 2.3273076722274233e-06,
"loss": 0.1964,
"step": 5328
},
{
"epoch": 3.451487710219922,
"grad_norm": 2.9506986141204834,
"learning_rate": 2.3131584098890775e-06,
"loss": 0.2258,
"step": 5336
},
{
"epoch": 3.4566623544631305,
"grad_norm": 6.207396984100342,
"learning_rate": 2.299039335283914e-06,
"loss": 0.2156,
"step": 5344
},
{
"epoch": 3.461836998706339,
"grad_norm": 1.0315911769866943,
"learning_rate": 2.2849506070453466e-06,
"loss": 0.1993,
"step": 5352
},
{
"epoch": 3.4670116429495472,
"grad_norm": 39.45634078979492,
"learning_rate": 2.27089238346584e-06,
"loss": 0.201,
"step": 5360
},
{
"epoch": 3.4721862871927556,
"grad_norm": 13.167795181274414,
"learning_rate": 2.2568648224951217e-06,
"loss": 0.2168,
"step": 5368
},
{
"epoch": 3.477360931435964,
"grad_norm": 19.175676345825195,
"learning_rate": 2.2428680817384153e-06,
"loss": 0.1958,
"step": 5376
},
{
"epoch": 3.4825355756791723,
"grad_norm": 6.49905252456665,
"learning_rate": 2.228902318454666e-06,
"loss": 0.2009,
"step": 5384
},
{
"epoch": 3.48771021992238,
"grad_norm": 2.5502731800079346,
"learning_rate": 2.214967689554775e-06,
"loss": 0.2018,
"step": 5392
},
{
"epoch": 3.4928848641655885,
"grad_norm": 17.03938102722168,
"learning_rate": 2.201064351599837e-06,
"loss": 0.2102,
"step": 5400
},
{
"epoch": 3.498059508408797,
"grad_norm": 3.042534112930298,
"learning_rate": 2.18719246079938e-06,
"loss": 0.212,
"step": 5408
},
{
"epoch": 3.503234152652005,
"grad_norm": 1.2638347148895264,
"learning_rate": 2.17335217300961e-06,
"loss": 0.2273,
"step": 5416
},
{
"epoch": 3.5084087968952136,
"grad_norm": 13.239524841308594,
"learning_rate": 2.1595436437316614e-06,
"loss": 0.2107,
"step": 5424
},
{
"epoch": 3.5135834411384215,
"grad_norm": 1.3805886507034302,
"learning_rate": 2.1457670281098493e-06,
"loss": 0.2167,
"step": 5432
},
{
"epoch": 3.51875808538163,
"grad_norm": 4.028883934020996,
"learning_rate": 2.132022480929926e-06,
"loss": 0.2158,
"step": 5440
},
{
"epoch": 3.523932729624838,
"grad_norm": 5.846325397491455,
"learning_rate": 2.118310156617342e-06,
"loss": 0.2237,
"step": 5448
},
{
"epoch": 3.5291073738680465,
"grad_norm": 1.7205686569213867,
"learning_rate": 2.1046302092355107e-06,
"loss": 0.2206,
"step": 5456
},
{
"epoch": 3.534282018111255,
"grad_norm": 16.556434631347656,
"learning_rate": 2.0909827924840787e-06,
"loss": 0.208,
"step": 5464
},
{
"epoch": 3.539456662354463,
"grad_norm": 1.7649803161621094,
"learning_rate": 2.0773680596971976e-06,
"loss": 0.2087,
"step": 5472
},
{
"epoch": 3.5446313065976716,
"grad_norm": 33.007301330566406,
"learning_rate": 2.0637861638418003e-06,
"loss": 0.2162,
"step": 5480
},
{
"epoch": 3.54980595084088,
"grad_norm": 55.374752044677734,
"learning_rate": 2.0502372575158865e-06,
"loss": 0.2078,
"step": 5488
},
{
"epoch": 3.554980595084088,
"grad_norm": 9.658388137817383,
"learning_rate": 2.0367214929468036e-06,
"loss": 0.2036,
"step": 5496
},
{
"epoch": 3.560155239327296,
"grad_norm": 15.524160385131836,
"learning_rate": 2.0232390219895364e-06,
"loss": 0.2035,
"step": 5504
},
{
"epoch": 3.5653298835705045,
"grad_norm": 3.2139875888824463,
"learning_rate": 2.009789996125009e-06,
"loss": 0.2099,
"step": 5512
},
{
"epoch": 3.570504527813713,
"grad_norm": 1.951788067817688,
"learning_rate": 1.99637456645837e-06,
"loss": 0.2027,
"step": 5520
},
{
"epoch": 3.575679172056921,
"grad_norm": 1.508257508277893,
"learning_rate": 1.982992883717304e-06,
"loss": 0.2064,
"step": 5528
},
{
"epoch": 3.580853816300129,
"grad_norm": 1.6039284467697144,
"learning_rate": 1.9696450982503356e-06,
"loss": 0.2065,
"step": 5536
},
{
"epoch": 3.5860284605433375,
"grad_norm": 7.59080696105957,
"learning_rate": 1.95633136002514e-06,
"loss": 0.2112,
"step": 5544
},
{
"epoch": 3.591203104786546,
"grad_norm": 25.365097045898438,
"learning_rate": 1.943051818626857e-06,
"loss": 0.2115,
"step": 5552
},
{
"epoch": 3.596377749029754,
"grad_norm": 1.8311065435409546,
"learning_rate": 1.9298066232564135e-06,
"loss": 0.203,
"step": 5560
},
{
"epoch": 3.6015523932729625,
"grad_norm": 12.468267440795898,
"learning_rate": 1.916595922728843e-06,
"loss": 0.2106,
"step": 5568
},
{
"epoch": 3.606727037516171,
"grad_norm": 3.0780019760131836,
"learning_rate": 1.9034198654716163e-06,
"loss": 0.2152,
"step": 5576
},
{
"epoch": 3.611901681759379,
"grad_norm": 1.8586463928222656,
"learning_rate": 1.890278599522975e-06,
"loss": 0.203,
"step": 5584
},
{
"epoch": 3.6170763260025875,
"grad_norm": 1.7706098556518555,
"learning_rate": 1.8771722725302644e-06,
"loss": 0.2188,
"step": 5592
},
{
"epoch": 3.6222509702457955,
"grad_norm": 2.8252525329589844,
"learning_rate": 1.864101031748277e-06,
"loss": 0.2101,
"step": 5600
},
{
"epoch": 3.627425614489004,
"grad_norm": 2.265062093734741,
"learning_rate": 1.8510650240376e-06,
"loss": 0.2018,
"step": 5608
},
{
"epoch": 3.632600258732212,
"grad_norm": 2.893099546432495,
"learning_rate": 1.8380643958629596e-06,
"loss": 0.2047,
"step": 5616
},
{
"epoch": 3.6377749029754205,
"grad_norm": 1.9583306312561035,
"learning_rate": 1.8250992932915811e-06,
"loss": 0.2101,
"step": 5624
},
{
"epoch": 3.642949547218629,
"grad_norm": 1.5815550088882446,
"learning_rate": 1.8121698619915457e-06,
"loss": 0.2153,
"step": 5632
},
{
"epoch": 3.6481241914618368,
"grad_norm": 20.85481834411621,
"learning_rate": 1.7992762472301511e-06,
"loss": 0.2095,
"step": 5640
},
{
"epoch": 3.653298835705045,
"grad_norm": 3.830641269683838,
"learning_rate": 1.7864185938722868e-06,
"loss": 0.2056,
"step": 5648
},
{
"epoch": 3.6584734799482534,
"grad_norm": 15.252459526062012,
"learning_rate": 1.7735970463787967e-06,
"loss": 0.2233,
"step": 5656
},
{
"epoch": 3.663648124191462,
"grad_norm": 2.826512336730957,
"learning_rate": 1.7608117488048636e-06,
"loss": 0.2275,
"step": 5664
},
{
"epoch": 3.66882276843467,
"grad_norm": 2.0030300617218018,
"learning_rate": 1.7480628447983878e-06,
"loss": 0.2101,
"step": 5672
},
{
"epoch": 3.6739974126778785,
"grad_norm": 2.019261598587036,
"learning_rate": 1.735350477598372e-06,
"loss": 0.2121,
"step": 5680
},
{
"epoch": 3.679172056921087,
"grad_norm": 2.8684234619140625,
"learning_rate": 1.7226747900333135e-06,
"loss": 0.2239,
"step": 5688
},
{
"epoch": 3.684346701164295,
"grad_norm": 18.148910522460938,
"learning_rate": 1.7100359245196035e-06,
"loss": 0.2087,
"step": 5696
},
{
"epoch": 3.689521345407503,
"grad_norm": 2.289294719696045,
"learning_rate": 1.6974340230599173e-06,
"loss": 0.1977,
"step": 5704
},
{
"epoch": 3.6946959896507114,
"grad_norm": 7.737388610839844,
"learning_rate": 1.6848692272416268e-06,
"loss": 0.2152,
"step": 5712
},
{
"epoch": 3.69987063389392,
"grad_norm": 17.832653045654297,
"learning_rate": 1.6723416782352076e-06,
"loss": 0.2132,
"step": 5720
},
{
"epoch": 3.705045278137128,
"grad_norm": 2.064863443374634,
"learning_rate": 1.659851516792651e-06,
"loss": 0.2106,
"step": 5728
},
{
"epoch": 3.7102199223803365,
"grad_norm": 4.075965404510498,
"learning_rate": 1.647398883245886e-06,
"loss": 0.2105,
"step": 5736
},
{
"epoch": 3.7153945666235444,
"grad_norm": 18.446760177612305,
"learning_rate": 1.6349839175051995e-06,
"loss": 0.213,
"step": 5744
},
{
"epoch": 3.7205692108667527,
"grad_norm": 22.746885299682617,
"learning_rate": 1.622606759057666e-06,
"loss": 0.2037,
"step": 5752
},
{
"epoch": 3.725743855109961,
"grad_norm": 1.7176026105880737,
"learning_rate": 1.610267546965581e-06,
"loss": 0.2129,
"step": 5760
},
{
"epoch": 3.7309184993531694,
"grad_norm": 1.911559820175171,
"learning_rate": 1.5979664198648959e-06,
"loss": 0.227,
"step": 5768
},
{
"epoch": 3.736093143596378,
"grad_norm": 21.561500549316406,
"learning_rate": 1.5857035159636625e-06,
"loss": 0.2178,
"step": 5776
},
{
"epoch": 3.741267787839586,
"grad_norm": 14.1412353515625,
"learning_rate": 1.5734789730404815e-06,
"loss": 0.2048,
"step": 5784
},
{
"epoch": 3.7464424320827945,
"grad_norm": 8.0577392578125,
"learning_rate": 1.5612929284429484e-06,
"loss": 0.2079,
"step": 5792
},
{
"epoch": 3.751617076326003,
"grad_norm": 10.920722007751465,
"learning_rate": 1.549145519086122e-06,
"loss": 0.1922,
"step": 5800
},
{
"epoch": 3.7567917205692107,
"grad_norm": 1.764875888824463,
"learning_rate": 1.5370368814509727e-06,
"loss": 0.1979,
"step": 5808
},
{
"epoch": 3.761966364812419,
"grad_norm": 4.495997428894043,
"learning_rate": 1.5249671515828569e-06,
"loss": 0.2098,
"step": 5816
},
{
"epoch": 3.7671410090556274,
"grad_norm": 4.6347503662109375,
"learning_rate": 1.5129364650899869e-06,
"loss": 0.2254,
"step": 5824
},
{
"epoch": 3.772315653298836,
"grad_norm": 7.4554009437561035,
"learning_rate": 1.5009449571419077e-06,
"loss": 0.2071,
"step": 5832
},
{
"epoch": 3.777490297542044,
"grad_norm": 1.338654637336731,
"learning_rate": 1.4889927624679762e-06,
"loss": 0.2029,
"step": 5840
},
{
"epoch": 3.782664941785252,
"grad_norm": 3.0357022285461426,
"learning_rate": 1.4770800153558513e-06,
"loss": 0.2136,
"step": 5848
},
{
"epoch": 3.7878395860284604,
"grad_norm": 11.845126152038574,
"learning_rate": 1.4652068496499804e-06,
"loss": 0.2241,
"step": 5856
},
{
"epoch": 3.7930142302716687,
"grad_norm": 1.81815505027771,
"learning_rate": 1.4533733987501004e-06,
"loss": 0.2151,
"step": 5864
},
{
"epoch": 3.798188874514877,
"grad_norm": 1.015817403793335,
"learning_rate": 1.4415797956097356e-06,
"loss": 0.2179,
"step": 5872
},
{
"epoch": 3.8033635187580854,
"grad_norm": 5.7766218185424805,
"learning_rate": 1.4298261727347034e-06,
"loss": 0.2151,
"step": 5880
},
{
"epoch": 3.8085381630012938,
"grad_norm": 2.376643180847168,
"learning_rate": 1.41811266218163e-06,
"loss": 0.1969,
"step": 5888
},
{
"epoch": 3.813712807244502,
"grad_norm": 3.778541088104248,
"learning_rate": 1.4064393955564615e-06,
"loss": 0.211,
"step": 5896
},
{
"epoch": 3.8188874514877105,
"grad_norm": 8.010899543762207,
"learning_rate": 1.3948065040129882e-06,
"loss": 0.2075,
"step": 5904
},
{
"epoch": 3.8240620957309184,
"grad_norm": 11.207403182983398,
"learning_rate": 1.3832141182513699e-06,
"loss": 0.2022,
"step": 5912
},
{
"epoch": 3.8292367399741267,
"grad_norm": 1.3528246879577637,
"learning_rate": 1.3716623685166685e-06,
"loss": 0.2143,
"step": 5920
},
{
"epoch": 3.834411384217335,
"grad_norm": 26.18692970275879,
"learning_rate": 1.3601513845973835e-06,
"loss": 0.2224,
"step": 5928
},
{
"epoch": 3.8395860284605434,
"grad_norm": 1.8334400653839111,
"learning_rate": 1.3486812958239931e-06,
"loss": 0.2178,
"step": 5936
},
{
"epoch": 3.8447606727037518,
"grad_norm": 5.360263824462891,
"learning_rate": 1.3372522310675063e-06,
"loss": 0.2175,
"step": 5944
},
{
"epoch": 3.8499353169469597,
"grad_norm": 1.6526539325714111,
"learning_rate": 1.3258643187380071e-06,
"loss": 0.2074,
"step": 5952
},
{
"epoch": 3.855109961190168,
"grad_norm": 10.19829273223877,
"learning_rate": 1.3145176867832165e-06,
"loss": 0.2067,
"step": 5960
},
{
"epoch": 3.8602846054333764,
"grad_norm": 3.2970707416534424,
"learning_rate": 1.3032124626870546e-06,
"loss": 0.2229,
"step": 5968
},
{
"epoch": 3.8654592496765847,
"grad_norm": 2.3017404079437256,
"learning_rate": 1.2919487734682073e-06,
"loss": 0.2071,
"step": 5976
},
{
"epoch": 3.870633893919793,
"grad_norm": 11.9258394241333,
"learning_rate": 1.2807267456787004e-06,
"loss": 0.204,
"step": 5984
},
{
"epoch": 3.8758085381630014,
"grad_norm": 1.4012444019317627,
"learning_rate": 1.2695465054024752e-06,
"loss": 0.2191,
"step": 5992
},
{
"epoch": 3.8809831824062098,
"grad_norm": 1.8280630111694336,
"learning_rate": 1.2584081782539764e-06,
"loss": 0.2163,
"step": 6000
},
{
"epoch": 3.886157826649418,
"grad_norm": 5.412600994110107,
"learning_rate": 1.247311889376736e-06,
"loss": 0.2066,
"step": 6008
},
{
"epoch": 3.891332470892626,
"grad_norm": 18.54897117614746,
"learning_rate": 1.2362577634419692e-06,
"loss": 0.2104,
"step": 6016
},
{
"epoch": 3.8965071151358344,
"grad_norm": 21.60243034362793,
"learning_rate": 1.2252459246471754e-06,
"loss": 0.2074,
"step": 6024
},
{
"epoch": 3.9016817593790427,
"grad_norm": 24.753875732421875,
"learning_rate": 1.2142764967147385e-06,
"loss": 0.2005,
"step": 6032
},
{
"epoch": 3.906856403622251,
"grad_norm": 12.543983459472656,
"learning_rate": 1.2033496028905445e-06,
"loss": 0.204,
"step": 6040
},
{
"epoch": 3.9120310478654594,
"grad_norm": 8.506756782531738,
"learning_rate": 1.1924653659425862e-06,
"loss": 0.2109,
"step": 6048
},
{
"epoch": 3.9172056921086673,
"grad_norm": 6.222147464752197,
"learning_rate": 1.1816239081595926e-06,
"loss": 0.203,
"step": 6056
},
{
"epoch": 3.9223803363518757,
"grad_norm": 1.7484989166259766,
"learning_rate": 1.1708253513496504e-06,
"loss": 0.2183,
"step": 6064
},
{
"epoch": 3.927554980595084,
"grad_norm": 1.252619981765747,
"learning_rate": 1.160069816838838e-06,
"loss": 0.2018,
"step": 6072
},
{
"epoch": 3.9327296248382924,
"grad_norm": 8.604789733886719,
"learning_rate": 1.1493574254698598e-06,
"loss": 0.1997,
"step": 6080
},
{
"epoch": 3.9379042690815007,
"grad_norm": 21.417587280273438,
"learning_rate": 1.1386882976006897e-06,
"loss": 0.1985,
"step": 6088
},
{
"epoch": 3.943078913324709,
"grad_norm": 1.4360429048538208,
"learning_rate": 1.128062553103223e-06,
"loss": 0.214,
"step": 6096
},
{
"epoch": 3.9482535575679174,
"grad_norm": 28.317441940307617,
"learning_rate": 1.1174803113619204e-06,
"loss": 0.2086,
"step": 6104
},
{
"epoch": 3.9534282018111258,
"grad_norm": 4.1924052238464355,
"learning_rate": 1.106941691272474e-06,
"loss": 0.214,
"step": 6112
},
{
"epoch": 3.9586028460543337,
"grad_norm": 11.092174530029297,
"learning_rate": 1.0964468112404691e-06,
"loss": 0.2052,
"step": 6120
},
{
"epoch": 3.963777490297542,
"grad_norm": 0.8266966938972473,
"learning_rate": 1.0859957891800548e-06,
"loss": 0.2056,
"step": 6128
},
{
"epoch": 3.9689521345407504,
"grad_norm": 1.4189459085464478,
"learning_rate": 1.075588742512617e-06,
"loss": 0.2043,
"step": 6136
},
{
"epoch": 3.9741267787839587,
"grad_norm": 10.665148735046387,
"learning_rate": 1.0652257881654625e-06,
"loss": 0.2146,
"step": 6144
},
{
"epoch": 3.9793014230271666,
"grad_norm": 11.610654830932617,
"learning_rate": 1.0549070425705017e-06,
"loss": 0.2126,
"step": 6152
},
{
"epoch": 3.984476067270375,
"grad_norm": 1.4883841276168823,
"learning_rate": 1.0446326216629422e-06,
"loss": 0.2093,
"step": 6160
},
{
"epoch": 3.9896507115135833,
"grad_norm": 3.509707450866699,
"learning_rate": 1.0344026408799868e-06,
"loss": 0.2055,
"step": 6168
},
{
"epoch": 3.9948253557567917,
"grad_norm": 7.780543804168701,
"learning_rate": 1.0242172151595365e-06,
"loss": 0.2123,
"step": 6176
},
{
"epoch": 4.0,
"grad_norm": 12.673335075378418,
"learning_rate": 1.0140764589388963e-06,
"loss": 0.2044,
"step": 6184
},
{
"epoch": 4.005174644243208,
"grad_norm": 3.754645824432373,
"learning_rate": 1.003980486153494e-06,
"loss": 0.2155,
"step": 6192
},
{
"epoch": 4.010349288486417,
"grad_norm": 0.9349867105484009,
"learning_rate": 9.939294102355957e-07,
"loss": 0.211,
"step": 6200
},
{
"epoch": 4.015523932729625,
"grad_norm": 9.735404014587402,
"learning_rate": 9.839233441130353e-07,
"loss": 0.2043,
"step": 6208
},
{
"epoch": 4.020698576972833,
"grad_norm": 4.292990207672119,
"learning_rate": 9.739624002079412e-07,
"loss": 0.2239,
"step": 6216
},
{
"epoch": 4.025873221216042,
"grad_norm": 12.860719680786133,
"learning_rate": 9.640466904354778e-07,
"loss": 0.2163,
"step": 6224
},
{
"epoch": 4.03104786545925,
"grad_norm": 1.3650522232055664,
"learning_rate": 9.541763262025866e-07,
"loss": 0.2082,
"step": 6232
},
{
"epoch": 4.0362225097024576,
"grad_norm": 17.642620086669922,
"learning_rate": 9.443514184067326e-07,
"loss": 0.197,
"step": 6240
},
{
"epoch": 4.041397153945666,
"grad_norm": 5.8212714195251465,
"learning_rate": 9.345720774346589e-07,
"loss": 0.2059,
"step": 6248
},
{
"epoch": 4.046571798188874,
"grad_norm": 14.918822288513184,
"learning_rate": 9.248384131611493e-07,
"loss": 0.2074,
"step": 6256
},
{
"epoch": 4.051746442432083,
"grad_norm": 3.4913625717163086,
"learning_rate": 9.151505349477901e-07,
"loss": 0.2251,
"step": 6264
},
{
"epoch": 4.056921086675291,
"grad_norm": 17.606687545776367,
"learning_rate": 9.055085516417439e-07,
"loss": 0.2141,
"step": 6272
},
{
"epoch": 4.062095730918499,
"grad_norm": 0.8561938405036926,
"learning_rate": 8.959125715745248e-07,
"loss": 0.2123,
"step": 6280
},
{
"epoch": 4.067270375161708,
"grad_norm": 6.487976551055908,
"learning_rate": 8.863627025607835e-07,
"loss": 0.2218,
"step": 6288
},
{
"epoch": 4.072445019404916,
"grad_norm": 24.814111709594727,
"learning_rate": 8.768590518970938e-07,
"loss": 0.1991,
"step": 6296
},
{
"epoch": 4.077619663648124,
"grad_norm": 0.9031611084938049,
"learning_rate": 8.674017263607488e-07,
"loss": 0.2011,
"step": 6304
},
{
"epoch": 4.082794307891333,
"grad_norm": 32.17177200317383,
"learning_rate": 8.57990832208559e-07,
"loss": 0.2109,
"step": 6312
},
{
"epoch": 4.087968952134541,
"grad_norm": 4.342803478240967,
"learning_rate": 8.486264751756607e-07,
"loss": 0.1977,
"step": 6320
},
{
"epoch": 4.093143596377749,
"grad_norm": 4.845144748687744,
"learning_rate": 8.393087604743283e-07,
"loss": 0.2082,
"step": 6328
},
{
"epoch": 4.098318240620958,
"grad_norm": 7.410824298858643,
"learning_rate": 8.300377927927888e-07,
"loss": 0.2096,
"step": 6336
},
{
"epoch": 4.103492884864165,
"grad_norm": 21.402700424194336,
"learning_rate": 8.208136762940489e-07,
"loss": 0.2133,
"step": 6344
},
{
"epoch": 4.1086675291073735,
"grad_norm": 10.131089210510254,
"learning_rate": 8.116365146147243e-07,
"loss": 0.2217,
"step": 6352
},
{
"epoch": 4.113842173350582,
"grad_norm": 2.5814476013183594,
"learning_rate": 8.025064108638742e-07,
"loss": 0.1901,
"step": 6360
},
{
"epoch": 4.11901681759379,
"grad_norm": 3.8243138790130615,
"learning_rate": 7.934234676218411e-07,
"loss": 0.2239,
"step": 6368
},
{
"epoch": 4.124191461836999,
"grad_norm": 1.469176173210144,
"learning_rate": 7.843877869391053e-07,
"loss": 0.2088,
"step": 6376
},
{
"epoch": 4.129366106080207,
"grad_norm": 8.301488876342773,
"learning_rate": 7.753994703351298e-07,
"loss": 0.2082,
"step": 6384
},
{
"epoch": 4.134540750323415,
"grad_norm": 5.9099249839782715,
"learning_rate": 7.664586187972234e-07,
"loss": 0.1966,
"step": 6392
},
{
"epoch": 4.139715394566624,
"grad_norm": 10.461531639099121,
"learning_rate": 7.575653327794075e-07,
"loss": 0.2058,
"step": 6400
},
{
"epoch": 4.144890038809832,
"grad_norm": 6.918221950531006,
"learning_rate": 7.48719712201284e-07,
"loss": 0.212,
"step": 6408
},
{
"epoch": 4.15006468305304,
"grad_norm": 2.304258346557617,
"learning_rate": 7.399218564469174e-07,
"loss": 0.2005,
"step": 6416
},
{
"epoch": 4.155239327296249,
"grad_norm": 6.370954990386963,
"learning_rate": 7.311718643637134e-07,
"loss": 0.1985,
"step": 6424
},
{
"epoch": 4.160413971539457,
"grad_norm": 11.649602890014648,
"learning_rate": 7.224698342613096e-07,
"loss": 0.1978,
"step": 6432
},
{
"epoch": 4.165588615782665,
"grad_norm": 6.221530437469482,
"learning_rate": 7.138158639104748e-07,
"loss": 0.2098,
"step": 6440
},
{
"epoch": 4.170763260025873,
"grad_norm": 10.76377010345459,
"learning_rate": 7.052100505420051e-07,
"loss": 0.2189,
"step": 6448
},
{
"epoch": 4.175937904269081,
"grad_norm": 12.636096954345703,
"learning_rate": 6.96652490845634e-07,
"loss": 0.2253,
"step": 6456
},
{
"epoch": 4.1811125485122895,
"grad_norm": 1.3980625867843628,
"learning_rate": 6.881432809689459e-07,
"loss": 0.2044,
"step": 6464
},
{
"epoch": 4.186287192755498,
"grad_norm": 40.41677474975586,
"learning_rate": 6.796825165162951e-07,
"loss": 0.2063,
"step": 6472
},
{
"epoch": 4.191461836998706,
"grad_norm": 6.918355941772461,
"learning_rate": 6.712702925477343e-07,
"loss": 0.2095,
"step": 6480
},
{
"epoch": 4.196636481241915,
"grad_norm": 5.543708801269531,
"learning_rate": 6.62906703577943e-07,
"loss": 0.203,
"step": 6488
},
{
"epoch": 4.201811125485123,
"grad_norm": 0.7908322811126709,
"learning_rate": 6.545918435751669e-07,
"loss": 0.2164,
"step": 6496
},
{
"epoch": 4.206985769728331,
"grad_norm": 23.913345336914062,
"learning_rate": 6.463258059601635e-07,
"loss": 0.1994,
"step": 6504
},
{
"epoch": 4.21216041397154,
"grad_norm": 20.359169006347656,
"learning_rate": 6.381086836051498e-07,
"loss": 0.2175,
"step": 6512
},
{
"epoch": 4.217335058214748,
"grad_norm": 2.3722023963928223,
"learning_rate": 6.299405688327631e-07,
"loss": 0.2055,
"step": 6520
},
{
"epoch": 4.222509702457956,
"grad_norm": 96.0938491821289,
"learning_rate": 6.218215534150185e-07,
"loss": 0.1927,
"step": 6528
},
{
"epoch": 4.227684346701165,
"grad_norm": 7.64237642288208,
"learning_rate": 6.137517285722816e-07,
"loss": 0.2043,
"step": 6536
},
{
"epoch": 4.232858990944372,
"grad_norm": 4.935359477996826,
"learning_rate": 6.057311849722419e-07,
"loss": 0.2184,
"step": 6544
},
{
"epoch": 4.2380336351875805,
"grad_norm": 3.5845251083374023,
"learning_rate": 5.977600127288941e-07,
"loss": 0.2137,
"step": 6552
},
{
"epoch": 4.243208279430789,
"grad_norm": 16.71499252319336,
"learning_rate": 5.898383014015275e-07,
"loss": 0.2096,
"step": 6560
},
{
"epoch": 4.248382923673997,
"grad_norm": 17.370206832885742,
"learning_rate": 5.81966139993716e-07,
"loss": 0.2067,
"step": 6568
},
{
"epoch": 4.2535575679172055,
"grad_norm": 22.58631134033203,
"learning_rate": 5.741436169523234e-07,
"loss": 0.2232,
"step": 6576
},
{
"epoch": 4.258732212160414,
"grad_norm": 3.885636806488037,
"learning_rate": 5.663708201665041e-07,
"loss": 0.2096,
"step": 6584
},
{
"epoch": 4.263906856403622,
"grad_norm": 1.6453157663345337,
"learning_rate": 5.586478369667203e-07,
"loss": 0.2082,
"step": 6592
},
{
"epoch": 4.269081500646831,
"grad_norm": 6.002994060516357,
"learning_rate": 5.50974754123757e-07,
"loss": 0.201,
"step": 6600
},
{
"epoch": 4.274256144890039,
"grad_norm": 1.0541515350341797,
"learning_rate": 5.433516578477504e-07,
"loss": 0.2105,
"step": 6608
},
{
"epoch": 4.279430789133247,
"grad_norm": 1.4043939113616943,
"learning_rate": 5.357786337872168e-07,
"loss": 0.2143,
"step": 6616
},
{
"epoch": 4.284605433376456,
"grad_norm": 2.2832283973693848,
"learning_rate": 5.282557670280914e-07,
"loss": 0.2075,
"step": 6624
},
{
"epoch": 4.289780077619664,
"grad_norm": 1.8749516010284424,
"learning_rate": 5.207831420927722e-07,
"loss": 0.1923,
"step": 6632
},
{
"epoch": 4.294954721862872,
"grad_norm": 9.822781562805176,
"learning_rate": 5.133608429391706e-07,
"loss": 0.2093,
"step": 6640
},
{
"epoch": 4.300129366106081,
"grad_norm": 8.075210571289062,
"learning_rate": 5.059889529597678e-07,
"loss": 0.1995,
"step": 6648
},
{
"epoch": 4.305304010349288,
"grad_norm": 13.020132064819336,
"learning_rate": 4.986675549806769e-07,
"loss": 0.208,
"step": 6656
},
{
"epoch": 4.3104786545924965,
"grad_norm": 1.0930315256118774,
"learning_rate": 4.913967312607154e-07,
"loss": 0.1978,
"step": 6664
},
{
"epoch": 4.315653298835705,
"grad_norm": 1.784521222114563,
"learning_rate": 4.841765634904777e-07,
"loss": 0.1921,
"step": 6672
},
{
"epoch": 4.320827943078913,
"grad_norm": 10.063488960266113,
"learning_rate": 4.770071327914177e-07,
"loss": 0.2094,
"step": 6680
},
{
"epoch": 4.3260025873221215,
"grad_norm": 1.535333514213562,
"learning_rate": 4.6988851971493886e-07,
"loss": 0.2041,
"step": 6688
},
{
"epoch": 4.33117723156533,
"grad_norm": 3.97757625579834,
"learning_rate": 4.628208042414889e-07,
"loss": 0.2225,
"step": 6696
},
{
"epoch": 4.336351875808538,
"grad_norm": 8.291097640991211,
"learning_rate": 4.558040657796603e-07,
"loss": 0.2119,
"step": 6704
},
{
"epoch": 4.3415265200517466,
"grad_norm": 5.780373573303223,
"learning_rate": 4.4883838316529816e-07,
"loss": 0.2099,
"step": 6712
},
{
"epoch": 4.346701164294955,
"grad_norm": 23.213045120239258,
"learning_rate": 4.4192383466061583e-07,
"loss": 0.1992,
"step": 6720
},
{
"epoch": 4.351875808538163,
"grad_norm": 6.386680603027344,
"learning_rate": 4.350604979533135e-07,
"loss": 0.2085,
"step": 6728
},
{
"epoch": 4.357050452781372,
"grad_norm": 1.2865214347839355,
"learning_rate": 4.2824845015570713e-07,
"loss": 0.2168,
"step": 6736
},
{
"epoch": 4.36222509702458,
"grad_norm": 3.7142112255096436,
"learning_rate": 4.214877678038609e-07,
"loss": 0.2087,
"step": 6744
},
{
"epoch": 4.367399741267787,
"grad_norm": 15.213637351989746,
"learning_rate": 4.1477852685672895e-07,
"loss": 0.2107,
"step": 6752
},
{
"epoch": 4.372574385510996,
"grad_norm": 2.587907075881958,
"learning_rate": 4.0812080269529983e-07,
"loss": 0.2178,
"step": 6760
},
{
"epoch": 4.377749029754204,
"grad_norm": 3.547725200653076,
"learning_rate": 4.015146701217493e-07,
"loss": 0.2255,
"step": 6768
},
{
"epoch": 4.3829236739974125,
"grad_norm": 2.6407039165496826,
"learning_rate": 3.949602033586047e-07,
"loss": 0.2035,
"step": 6776
},
{
"epoch": 4.388098318240621,
"grad_norm": 7.901521682739258,
"learning_rate": 3.884574760479037e-07,
"loss": 0.2069,
"step": 6784
},
{
"epoch": 4.393272962483829,
"grad_norm": 2.049207925796509,
"learning_rate": 3.820065612503732e-07,
"loss": 0.2042,
"step": 6792
},
{
"epoch": 4.3984476067270375,
"grad_norm": 1.1140714883804321,
"learning_rate": 3.756075314446045e-07,
"loss": 0.2081,
"step": 6800
},
{
"epoch": 4.403622250970246,
"grad_norm": 1.4943286180496216,
"learning_rate": 3.6926045852624106e-07,
"loss": 0.2066,
"step": 6808
},
{
"epoch": 4.408796895213454,
"grad_norm": 2.857074499130249,
"learning_rate": 3.629654138071692e-07,
"loss": 0.2095,
"step": 6816
},
{
"epoch": 4.4139715394566625,
"grad_norm": 8.338090896606445,
"learning_rate": 3.56722468014718e-07,
"loss": 0.2238,
"step": 6824
},
{
"epoch": 4.419146183699871,
"grad_norm": 5.449411869049072,
"learning_rate": 3.505316912908668e-07,
"loss": 0.1984,
"step": 6832
},
{
"epoch": 4.424320827943079,
"grad_norm": 26.998971939086914,
"learning_rate": 3.443931531914507e-07,
"loss": 0.199,
"step": 6840
},
{
"epoch": 4.429495472186288,
"grad_norm": 2.477626085281372,
"learning_rate": 3.3830692268538637e-07,
"loss": 0.205,
"step": 6848
},
{
"epoch": 4.434670116429496,
"grad_norm": 5.2165632247924805,
"learning_rate": 3.3227306815389213e-07,
"loss": 0.2037,
"step": 6856
},
{
"epoch": 4.439844760672703,
"grad_norm": 1.6941031217575073,
"learning_rate": 3.262916573897218e-07,
"loss": 0.2006,
"step": 6864
},
{
"epoch": 4.445019404915912,
"grad_norm": 1.2534488439559937,
"learning_rate": 3.2036275759640245e-07,
"loss": 0.1979,
"step": 6872
},
{
"epoch": 4.45019404915912,
"grad_norm": 16.04631233215332,
"learning_rate": 3.1448643538748045e-07,
"loss": 0.2027,
"step": 6880
},
{
"epoch": 4.455368693402328,
"grad_norm": 2.139188289642334,
"learning_rate": 3.086627567857703e-07,
"loss": 0.2088,
"step": 6888
},
{
"epoch": 4.460543337645537,
"grad_norm": 1.968658685684204,
"learning_rate": 3.0289178722261726e-07,
"loss": 0.213,
"step": 6896
},
{
"epoch": 4.465717981888745,
"grad_norm": 5.241852760314941,
"learning_rate": 2.9717359153715707e-07,
"loss": 0.2227,
"step": 6904
},
{
"epoch": 4.4708926261319535,
"grad_norm": 2.9422459602355957,
"learning_rate": 2.9150823397559094e-07,
"loss": 0.2046,
"step": 6912
},
{
"epoch": 4.476067270375162,
"grad_norm": 8.094049453735352,
"learning_rate": 2.8589577819046364e-07,
"loss": 0.198,
"step": 6920
},
{
"epoch": 4.48124191461837,
"grad_norm": 1.4471999406814575,
"learning_rate": 2.8033628723994623e-07,
"loss": 0.2106,
"step": 6928
},
{
"epoch": 4.4864165588615785,
"grad_norm": 2.6254570484161377,
"learning_rate": 2.7482982358712885e-07,
"loss": 0.211,
"step": 6936
},
{
"epoch": 4.491591203104787,
"grad_norm": 11.787996292114258,
"learning_rate": 2.6937644909931893e-07,
"loss": 0.2103,
"step": 6944
},
{
"epoch": 4.496765847347995,
"grad_norm": 4.06654691696167,
"learning_rate": 2.639762250473482e-07,
"loss": 0.2116,
"step": 6952
},
{
"epoch": 4.501940491591203,
"grad_norm": 1.4215199947357178,
"learning_rate": 2.5862921210487833e-07,
"loss": 0.2039,
"step": 6960
},
{
"epoch": 4.507115135834411,
"grad_norm": 4.604936122894287,
"learning_rate": 2.5333547034772645e-07,
"loss": 0.2126,
"step": 6968
},
{
"epoch": 4.512289780077619,
"grad_norm": 29.087053298950195,
"learning_rate": 2.480950592531844e-07,
"loss": 0.195,
"step": 6976
},
{
"epoch": 4.517464424320828,
"grad_norm": 6.165823459625244,
"learning_rate": 2.429080376993537e-07,
"loss": 0.2141,
"step": 6984
},
{
"epoch": 4.522639068564036,
"grad_norm": 7.566137313842773,
"learning_rate": 2.37774463964483e-07,
"loss": 0.2013,
"step": 6992
},
{
"epoch": 4.527813712807244,
"grad_norm": 17.188518524169922,
"learning_rate": 2.3269439572631448e-07,
"loss": 0.213,
"step": 7000
},
{
"epoch": 4.532988357050453,
"grad_norm": 2.3657376766204834,
"learning_rate": 2.2766789006143265e-07,
"loss": 0.2087,
"step": 7008
},
{
"epoch": 4.538163001293661,
"grad_norm": 131.63343811035156,
"learning_rate": 2.226950034446279e-07,
"loss": 0.2219,
"step": 7016
},
{
"epoch": 4.5433376455368695,
"grad_norm": 9.159808158874512,
"learning_rate": 2.1777579174825703e-07,
"loss": 0.2194,
"step": 7024
},
{
"epoch": 4.548512289780078,
"grad_norm": 1.9207276105880737,
"learning_rate": 2.1291031024161856e-07,
"loss": 0.2093,
"step": 7032
},
{
"epoch": 4.553686934023286,
"grad_norm": 12.356361389160156,
"learning_rate": 2.0809861359033124e-07,
"loss": 0.214,
"step": 7040
},
{
"epoch": 4.5588615782664945,
"grad_norm": 2.2436163425445557,
"learning_rate": 2.0334075585571988e-07,
"loss": 0.2149,
"step": 7048
},
{
"epoch": 4.564036222509703,
"grad_norm": 3.7744526863098145,
"learning_rate": 1.986367904942066e-07,
"loss": 0.1967,
"step": 7056
},
{
"epoch": 4.569210866752911,
"grad_norm": 1.290109395980835,
"learning_rate": 1.9398677035671222e-07,
"loss": 0.2186,
"step": 7064
},
{
"epoch": 4.574385510996119,
"grad_norm": 1.6141724586486816,
"learning_rate": 1.8939074768806076e-07,
"loss": 0.2067,
"step": 7072
},
{
"epoch": 4.579560155239327,
"grad_norm": 2.7260425090789795,
"learning_rate": 1.8484877412639435e-07,
"loss": 0.1964,
"step": 7080
},
{
"epoch": 4.584734799482535,
"grad_norm": 1.980734944343567,
"learning_rate": 1.8036090070259026e-07,
"loss": 0.1991,
"step": 7088
},
{
"epoch": 4.589909443725744,
"grad_norm": 2.165452241897583,
"learning_rate": 1.7592717783969094e-07,
"loss": 0.2146,
"step": 7096
},
{
"epoch": 4.595084087968952,
"grad_norm": 1.303862452507019,
"learning_rate": 1.7154765535233486e-07,
"loss": 0.2152,
"step": 7104
},
{
"epoch": 4.60025873221216,
"grad_norm": 7.211986064910889,
"learning_rate": 1.6722238244619827e-07,
"loss": 0.2248,
"step": 7112
},
{
"epoch": 4.605433376455369,
"grad_norm": 7.8271870613098145,
"learning_rate": 1.6295140771744044e-07,
"loss": 0.209,
"step": 7120
},
{
"epoch": 4.610608020698577,
"grad_norm": 1.4870136976242065,
"learning_rate": 1.587347791521604e-07,
"loss": 0.2148,
"step": 7128
},
{
"epoch": 4.6157826649417855,
"grad_norm": 1.905441403388977,
"learning_rate": 1.5457254412585666e-07,
"loss": 0.2107,
"step": 7136
},
{
"epoch": 4.620957309184994,
"grad_norm": 0.9246317148208618,
"learning_rate": 1.5046474940289268e-07,
"loss": 0.2177,
"step": 7144
},
{
"epoch": 4.626131953428202,
"grad_norm": 12.103673934936523,
"learning_rate": 1.4641144113597628e-07,
"loss": 0.2049,
"step": 7152
},
{
"epoch": 4.63130659767141,
"grad_norm": 1.047852635383606,
"learning_rate": 1.4241266486563654e-07,
"loss": 0.2062,
"step": 7160
},
{
"epoch": 4.636481241914618,
"grad_norm": 0.9444906115531921,
"learning_rate": 1.3846846551971272e-07,
"loss": 0.2019,
"step": 7168
},
{
"epoch": 4.641655886157826,
"grad_norm": 1.3876017332077026,
"learning_rate": 1.3457888741285452e-07,
"loss": 0.1979,
"step": 7176
},
{
"epoch": 4.646830530401035,
"grad_norm": 1.2842018604278564,
"learning_rate": 1.307439742460165e-07,
"loss": 0.207,
"step": 7184
},
{
"epoch": 4.652005174644243,
"grad_norm": 15.236653327941895,
"learning_rate": 1.2696376910597275e-07,
"loss": 0.2146,
"step": 7192
},
{
"epoch": 4.657179818887451,
"grad_norm": 41.783870697021484,
"learning_rate": 1.2323831446483025e-07,
"loss": 0.207,
"step": 7200
},
{
"epoch": 4.66235446313066,
"grad_norm": 1.3897194862365723,
"learning_rate": 1.1956765217955302e-07,
"loss": 0.1963,
"step": 7208
},
{
"epoch": 4.667529107373868,
"grad_norm": 10.643123626708984,
"learning_rate": 1.1595182349149026e-07,
"loss": 0.2189,
"step": 7216
},
{
"epoch": 4.672703751617076,
"grad_norm": 1.3099812269210815,
"learning_rate": 1.1239086902591512e-07,
"loss": 0.2271,
"step": 7224
},
{
"epoch": 4.677878395860285,
"grad_norm": 9.640113830566406,
"learning_rate": 1.0888482879156503e-07,
"loss": 0.2085,
"step": 7232
},
{
"epoch": 4.683053040103493,
"grad_norm": 8.829296112060547,
"learning_rate": 1.0543374218019708e-07,
"loss": 0.2029,
"step": 7240
},
{
"epoch": 4.6882276843467015,
"grad_norm": 15.328158378601074,
"learning_rate": 1.0203764796614057e-07,
"loss": 0.2266,
"step": 7248
},
{
"epoch": 4.69340232858991,
"grad_norm": 3.7764499187469482,
"learning_rate": 9.869658430586349e-08,
"loss": 0.216,
"step": 7256
},
{
"epoch": 4.698576972833118,
"grad_norm": 8.380655288696289,
"learning_rate": 9.541058873754394e-08,
"loss": 0.213,
"step": 7264
},
{
"epoch": 4.7037516170763265,
"grad_norm": 5.44411563873291,
"learning_rate": 9.217969818064832e-08,
"loss": 0.1983,
"step": 7272
},
{
"epoch": 4.708926261319534,
"grad_norm": 20.862810134887695,
"learning_rate": 8.900394893551655e-08,
"loss": 0.2082,
"step": 7280
},
{
"epoch": 4.714100905562742,
"grad_norm": 8.134232521057129,
"learning_rate": 8.588337668295366e-08,
"loss": 0.1995,
"step": 7288
},
{
"epoch": 4.719275549805951,
"grad_norm": 5.843049049377441,
"learning_rate": 8.28180164838288e-08,
"loss": 0.1962,
"step": 7296
},
{
"epoch": 4.724450194049159,
"grad_norm": 14.079636573791504,
"learning_rate": 7.980790277868189e-08,
"loss": 0.2213,
"step": 7304
},
{
"epoch": 4.729624838292367,
"grad_norm": 1.857102870941162,
"learning_rate": 7.685306938733761e-08,
"loss": 0.2115,
"step": 7312
},
{
"epoch": 4.734799482535576,
"grad_norm": 4.96279239654541,
"learning_rate": 7.395354950852307e-08,
"loss": 0.2191,
"step": 7320
},
{
"epoch": 4.739974126778784,
"grad_norm": 3.5607805252075195,
"learning_rate": 7.110937571949639e-08,
"loss": 0.2076,
"step": 7328
},
{
"epoch": 4.745148771021992,
"grad_norm": 3.995842218399048,
"learning_rate": 6.832057997568087e-08,
"loss": 0.1983,
"step": 7336
},
{
"epoch": 4.750323415265201,
"grad_norm": 19.480527877807617,
"learning_rate": 6.55871936103053e-08,
"loss": 0.2037,
"step": 7344
},
{
"epoch": 4.755498059508409,
"grad_norm": 2.151970624923706,
"learning_rate": 6.290924733405201e-08,
"loss": 0.2137,
"step": 7352
},
{
"epoch": 4.760672703751617,
"grad_norm": 6.302921772003174,
"learning_rate": 6.028677123471105e-08,
"loss": 0.2095,
"step": 7360
},
{
"epoch": 4.765847347994825,
"grad_norm": 1.0118603706359863,
"learning_rate": 5.771979477684375e-08,
"loss": 0.221,
"step": 7368
},
{
"epoch": 4.771021992238033,
"grad_norm": 20.808563232421875,
"learning_rate": 5.5208346801451376e-08,
"loss": 0.2034,
"step": 7376
},
{
"epoch": 4.776196636481242,
"grad_norm": 4.907052040100098,
"learning_rate": 5.2752455525650334e-08,
"loss": 0.2076,
"step": 7384
},
{
"epoch": 4.78137128072445,
"grad_norm": 29.325668334960938,
"learning_rate": 5.035214854235526e-08,
"loss": 0.1882,
"step": 7392
},
{
"epoch": 4.786545924967658,
"grad_norm": 6.801876544952393,
"learning_rate": 4.8007452819968107e-08,
"loss": 0.2004,
"step": 7400
},
{
"epoch": 4.791720569210867,
"grad_norm": 42.309303283691406,
"learning_rate": 4.571839470207839e-08,
"loss": 0.2132,
"step": 7408
},
{
"epoch": 4.796895213454075,
"grad_norm": 5.460014820098877,
"learning_rate": 4.3484999907163484e-08,
"loss": 0.1956,
"step": 7416
},
{
"epoch": 4.802069857697283,
"grad_norm": 4.588939666748047,
"learning_rate": 4.130729352830154e-08,
"loss": 0.1942,
"step": 7424
},
{
"epoch": 4.807244501940492,
"grad_norm": 12.402462005615234,
"learning_rate": 3.9185300032889005e-08,
"loss": 0.2013,
"step": 7432
},
{
"epoch": 4.8124191461837,
"grad_norm": 2.2621071338653564,
"learning_rate": 3.711904326236693e-08,
"loss": 0.1916,
"step": 7440
},
{
"epoch": 4.817593790426908,
"grad_norm": 0.7548274993896484,
"learning_rate": 3.510854643195061e-08,
"loss": 0.2083,
"step": 7448
},
{
"epoch": 4.822768434670117,
"grad_norm": 6.128635883331299,
"learning_rate": 3.3153832130371486e-08,
"loss": 0.2125,
"step": 7456
},
{
"epoch": 4.827943078913325,
"grad_norm": 4.110105514526367,
"learning_rate": 3.1254922319621794e-08,
"loss": 0.2127,
"step": 7464
},
{
"epoch": 4.833117723156533,
"grad_norm": 13.073273658752441,
"learning_rate": 2.941183833470751e-08,
"loss": 0.2125,
"step": 7472
},
{
"epoch": 4.838292367399741,
"grad_norm": 4.8851542472839355,
"learning_rate": 2.7624600883410235e-08,
"loss": 0.2054,
"step": 7480
},
{
"epoch": 4.843467011642949,
"grad_norm": 12.184813499450684,
"learning_rate": 2.589323004605293e-08,
"loss": 0.2078,
"step": 7488
},
{
"epoch": 4.848641655886158,
"grad_norm": 22.534168243408203,
"learning_rate": 2.4217745275275094e-08,
"loss": 0.1981,
"step": 7496
},
{
"epoch": 4.853816300129366,
"grad_norm": 15.434218406677246,
"learning_rate": 2.2598165395813498e-08,
"loss": 0.2012,
"step": 7504
},
{
"epoch": 4.858990944372574,
"grad_norm": 1.728147029876709,
"learning_rate": 2.1034508604292904e-08,
"loss": 0.2149,
"step": 7512
},
{
"epoch": 4.864165588615783,
"grad_norm": 49.4590950012207,
"learning_rate": 1.9526792469017896e-08,
"loss": 0.2191,
"step": 7520
},
{
"epoch": 4.869340232858991,
"grad_norm": 0.7744470834732056,
"learning_rate": 1.807503392977916e-08,
"loss": 0.2149,
"step": 7528
},
{
"epoch": 4.874514877102199,
"grad_norm": 4.530129909515381,
"learning_rate": 1.6679249297660847e-08,
"loss": 0.2137,
"step": 7536
},
{
"epoch": 4.879689521345408,
"grad_norm": 6.442441463470459,
"learning_rate": 1.533945425485739e-08,
"loss": 0.208,
"step": 7544
},
{
"epoch": 4.884864165588616,
"grad_norm": 29.557950973510742,
"learning_rate": 1.405566385449919e-08,
"loss": 0.2039,
"step": 7552
},
{
"epoch": 4.890038809831824,
"grad_norm": 1.7751787900924683,
"learning_rate": 1.2827892520481667e-08,
"loss": 0.2158,
"step": 7560
},
{
"epoch": 4.895213454075033,
"grad_norm": 1.7882133722305298,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.196,
"step": 7568
},
{
"epoch": 4.90038809831824,
"grad_norm": 1.204350233078003,
"learning_rate": 1.0540461599913287e-08,
"loss": 0.1944,
"step": 7576
},
{
"epoch": 4.9055627425614485,
"grad_norm": 3.3513760566711426,
"learning_rate": 9.480827713557183e-09,
"loss": 0.1995,
"step": 7584
},
{
"epoch": 4.910737386804657,
"grad_norm": 4.574145317077637,
"learning_rate": 8.47726429364426e-09,
"loss": 0.1927,
"step": 7592
},
{
"epoch": 4.915912031047865,
"grad_norm": 2.1629719734191895,
"learning_rate": 7.529782615608439e-09,
"loss": 0.1989,
"step": 7600
},
{
"epoch": 4.921086675291074,
"grad_norm": 1.5303400754928589,
"learning_rate": 6.638393324782111e-09,
"loss": 0.2084,
"step": 7608
},
{
"epoch": 4.926261319534282,
"grad_norm": 1.0018268823623657,
"learning_rate": 5.803106436279571e-09,
"loss": 0.2008,
"step": 7616
},
{
"epoch": 4.93143596377749,
"grad_norm": 1.660049319267273,
"learning_rate": 5.023931334879883e-09,
"loss": 0.201,
"step": 7624
},
{
"epoch": 4.936610608020699,
"grad_norm": 16.29326820373535,
"learning_rate": 4.3008767749253e-09,
"loss": 0.2237,
"step": 7632
},
{
"epoch": 4.941785252263907,
"grad_norm": 1.205179214477539,
"learning_rate": 3.6339508802213374e-09,
"loss": 0.2053,
"step": 7640
},
{
"epoch": 4.946959896507115,
"grad_norm": 1.7497557401657104,
"learning_rate": 3.0231611439457407e-09,
"loss": 0.2056,
"step": 7648
},
{
"epoch": 4.952134540750324,
"grad_norm": 2.757598400115967,
"learning_rate": 2.468514428563551e-09,
"loss": 0.2057,
"step": 7656
},
{
"epoch": 4.957309184993532,
"grad_norm": 0.9228636026382446,
"learning_rate": 1.9700169657510537e-09,
"loss": 0.2067,
"step": 7664
},
{
"epoch": 4.96248382923674,
"grad_norm": 1.4917523860931396,
"learning_rate": 1.5276743563258367e-09,
"loss": 0.2222,
"step": 7672
},
{
"epoch": 4.967658473479949,
"grad_norm": 4.442543029785156,
"learning_rate": 1.141491570182396e-09,
"loss": 0.2168,
"step": 7680
},
{
"epoch": 4.972833117723156,
"grad_norm": 1.5017764568328857,
"learning_rate": 8.114729462377346e-10,
"loss": 0.2137,
"step": 7688
},
{
"epoch": 4.9780077619663645,
"grad_norm": 3.795858144760132,
"learning_rate": 5.376221923830694e-10,
"loss": 0.2188,
"step": 7696
},
{
"epoch": 4.983182406209573,
"grad_norm": 2.518415927886963,
"learning_rate": 3.1994238543997526e-10,
"loss": 0.2288,
"step": 7704
},
{
"epoch": 4.988357050452781,
"grad_norm": 1.4229096174240112,
"learning_rate": 1.5843597112707997e-10,
"loss": 0.2053,
"step": 7712
},
{
"epoch": 4.99353169469599,
"grad_norm": 8.819074630737305,
"learning_rate": 5.3104764033973245e-11,
"loss": 0.2206,
"step": 7720
},
{
"epoch": 4.998706338939198,
"grad_norm": 4.728495121002197,
"learning_rate": 3.949947598447246e-12,
"loss": 0.2096,
"step": 7728
},
{
"epoch": 5.0,
"step": 7730,
"total_flos": 2.7880125934075904e+16,
"train_loss": 0.22995005332097354,
"train_runtime": 14201.6757,
"train_samples_per_second": 69.632,
"train_steps_per_second": 0.544
}
],
"logging_steps": 8,
"max_steps": 7730,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 387,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.7880125934075904e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}