qwen_7b_instruct_extra_unverified / trainer_state.json
gsmyrnis's picture
End of training
c4217e3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9989187240944313,
"eval_steps": 500,
"global_step": 5547,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005406379527842855,
"grad_norm": 5.67321238470604,
"learning_rate": 1.801801801801802e-07,
"loss": 0.8785,
"step": 10
},
{
"epoch": 0.01081275905568571,
"grad_norm": 5.2575759647356906,
"learning_rate": 3.603603603603604e-07,
"loss": 0.8654,
"step": 20
},
{
"epoch": 0.016219138583528563,
"grad_norm": 3.8360253130807958,
"learning_rate": 5.405405405405406e-07,
"loss": 0.8205,
"step": 30
},
{
"epoch": 0.02162551811137142,
"grad_norm": 1.722668988638544,
"learning_rate": 7.207207207207208e-07,
"loss": 0.778,
"step": 40
},
{
"epoch": 0.027031897639214274,
"grad_norm": 1.3197714991034968,
"learning_rate": 9.00900900900901e-07,
"loss": 0.7286,
"step": 50
},
{
"epoch": 0.03243827716705713,
"grad_norm": 0.8474482237034886,
"learning_rate": 1.0810810810810812e-06,
"loss": 0.6968,
"step": 60
},
{
"epoch": 0.03784465669489998,
"grad_norm": 0.5645420283585227,
"learning_rate": 1.2612612612612613e-06,
"loss": 0.6689,
"step": 70
},
{
"epoch": 0.04325103622274284,
"grad_norm": 0.43605656948964683,
"learning_rate": 1.4414414414414416e-06,
"loss": 0.6408,
"step": 80
},
{
"epoch": 0.04865741575058569,
"grad_norm": 0.4339497028480959,
"learning_rate": 1.6216216216216219e-06,
"loss": 0.6153,
"step": 90
},
{
"epoch": 0.05406379527842855,
"grad_norm": 0.3843592033040236,
"learning_rate": 1.801801801801802e-06,
"loss": 0.6082,
"step": 100
},
{
"epoch": 0.0594701748062714,
"grad_norm": 0.37685068673558353,
"learning_rate": 1.9819819819819822e-06,
"loss": 0.6049,
"step": 110
},
{
"epoch": 0.06487655433411425,
"grad_norm": 0.4392453448959536,
"learning_rate": 2.1621621621621623e-06,
"loss": 0.5889,
"step": 120
},
{
"epoch": 0.07028293386195711,
"grad_norm": 0.4212233804351266,
"learning_rate": 2.3423423423423424e-06,
"loss": 0.5842,
"step": 130
},
{
"epoch": 0.07568931338979996,
"grad_norm": 0.38709432000579613,
"learning_rate": 2.5225225225225225e-06,
"loss": 0.592,
"step": 140
},
{
"epoch": 0.08109569291764282,
"grad_norm": 0.3988233764060424,
"learning_rate": 2.702702702702703e-06,
"loss": 0.5732,
"step": 150
},
{
"epoch": 0.08650207244548568,
"grad_norm": 0.41395637177292804,
"learning_rate": 2.882882882882883e-06,
"loss": 0.5679,
"step": 160
},
{
"epoch": 0.09190845197332853,
"grad_norm": 0.37677030114794524,
"learning_rate": 3.063063063063063e-06,
"loss": 0.5583,
"step": 170
},
{
"epoch": 0.09731483150117139,
"grad_norm": 0.38451911721974225,
"learning_rate": 3.2432432432432437e-06,
"loss": 0.5658,
"step": 180
},
{
"epoch": 0.10272121102901424,
"grad_norm": 0.36190379869625294,
"learning_rate": 3.423423423423424e-06,
"loss": 0.5554,
"step": 190
},
{
"epoch": 0.1081275905568571,
"grad_norm": 0.3927866832932917,
"learning_rate": 3.603603603603604e-06,
"loss": 0.5534,
"step": 200
},
{
"epoch": 0.11353397008469994,
"grad_norm": 0.4109637951464883,
"learning_rate": 3.7837837837837844e-06,
"loss": 0.5527,
"step": 210
},
{
"epoch": 0.1189403496125428,
"grad_norm": 0.4189875109517182,
"learning_rate": 3.9639639639639645e-06,
"loss": 0.5521,
"step": 220
},
{
"epoch": 0.12434672914038565,
"grad_norm": 0.44103289873218365,
"learning_rate": 4.1441441441441446e-06,
"loss": 0.55,
"step": 230
},
{
"epoch": 0.1297531086682285,
"grad_norm": 0.47624121719255225,
"learning_rate": 4.324324324324325e-06,
"loss": 0.5455,
"step": 240
},
{
"epoch": 0.13515948819607138,
"grad_norm": 0.4127382950104387,
"learning_rate": 4.504504504504505e-06,
"loss": 0.5392,
"step": 250
},
{
"epoch": 0.14056586772391422,
"grad_norm": 0.42849081039324655,
"learning_rate": 4.684684684684685e-06,
"loss": 0.5317,
"step": 260
},
{
"epoch": 0.1459722472517571,
"grad_norm": 0.4104060308344588,
"learning_rate": 4.864864864864866e-06,
"loss": 0.5317,
"step": 270
},
{
"epoch": 0.15137862677959993,
"grad_norm": 0.5046982359974199,
"learning_rate": 5.045045045045045e-06,
"loss": 0.5342,
"step": 280
},
{
"epoch": 0.15678500630744277,
"grad_norm": 0.4507880118410215,
"learning_rate": 5.225225225225226e-06,
"loss": 0.5325,
"step": 290
},
{
"epoch": 0.16219138583528564,
"grad_norm": 0.42877102726223915,
"learning_rate": 5.405405405405406e-06,
"loss": 0.5236,
"step": 300
},
{
"epoch": 0.16759776536312848,
"grad_norm": 0.5283894117116334,
"learning_rate": 5.585585585585585e-06,
"loss": 0.5316,
"step": 310
},
{
"epoch": 0.17300414489097135,
"grad_norm": 0.45448942603717846,
"learning_rate": 5.765765765765766e-06,
"loss": 0.5304,
"step": 320
},
{
"epoch": 0.1784105244188142,
"grad_norm": 0.4459611601163911,
"learning_rate": 5.945945945945947e-06,
"loss": 0.5307,
"step": 330
},
{
"epoch": 0.18381690394665706,
"grad_norm": 0.4167802385045301,
"learning_rate": 6.126126126126126e-06,
"loss": 0.5142,
"step": 340
},
{
"epoch": 0.1892232834744999,
"grad_norm": 0.45167071134408077,
"learning_rate": 6.3063063063063065e-06,
"loss": 0.5252,
"step": 350
},
{
"epoch": 0.19462966300234277,
"grad_norm": 0.3815004250489287,
"learning_rate": 6.486486486486487e-06,
"loss": 0.5203,
"step": 360
},
{
"epoch": 0.20003604253018561,
"grad_norm": 0.4189611440474181,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5198,
"step": 370
},
{
"epoch": 0.20544242205802848,
"grad_norm": 0.4356383135556994,
"learning_rate": 6.846846846846848e-06,
"loss": 0.5164,
"step": 380
},
{
"epoch": 0.21084880158587133,
"grad_norm": 0.4146665581812368,
"learning_rate": 7.027027027027028e-06,
"loss": 0.5201,
"step": 390
},
{
"epoch": 0.2162551811137142,
"grad_norm": 0.46098403607909094,
"learning_rate": 7.207207207207208e-06,
"loss": 0.5241,
"step": 400
},
{
"epoch": 0.22166156064155704,
"grad_norm": 0.4173832279688485,
"learning_rate": 7.387387387387388e-06,
"loss": 0.5141,
"step": 410
},
{
"epoch": 0.22706794016939988,
"grad_norm": 0.45342411753034784,
"learning_rate": 7.567567567567569e-06,
"loss": 0.5058,
"step": 420
},
{
"epoch": 0.23247431969724275,
"grad_norm": 0.5556218847582134,
"learning_rate": 7.747747747747749e-06,
"loss": 0.5132,
"step": 430
},
{
"epoch": 0.2378806992250856,
"grad_norm": 0.4159604294450067,
"learning_rate": 7.927927927927929e-06,
"loss": 0.5116,
"step": 440
},
{
"epoch": 0.24328707875292846,
"grad_norm": 0.5011827344554423,
"learning_rate": 8.108108108108109e-06,
"loss": 0.5168,
"step": 450
},
{
"epoch": 0.2486934582807713,
"grad_norm": 0.4837033851909487,
"learning_rate": 8.288288288288289e-06,
"loss": 0.5078,
"step": 460
},
{
"epoch": 0.25409983780861417,
"grad_norm": 0.43704376571990733,
"learning_rate": 8.46846846846847e-06,
"loss": 0.5033,
"step": 470
},
{
"epoch": 0.259506217336457,
"grad_norm": 0.3998543920237395,
"learning_rate": 8.64864864864865e-06,
"loss": 0.5023,
"step": 480
},
{
"epoch": 0.26491259686429985,
"grad_norm": 0.5026204387708488,
"learning_rate": 8.82882882882883e-06,
"loss": 0.5101,
"step": 490
},
{
"epoch": 0.27031897639214275,
"grad_norm": 0.5354755864920291,
"learning_rate": 9.00900900900901e-06,
"loss": 0.508,
"step": 500
},
{
"epoch": 0.2757253559199856,
"grad_norm": 0.4703091181508223,
"learning_rate": 9.189189189189191e-06,
"loss": 0.5057,
"step": 510
},
{
"epoch": 0.28113173544782843,
"grad_norm": 0.5066877793509437,
"learning_rate": 9.36936936936937e-06,
"loss": 0.5026,
"step": 520
},
{
"epoch": 0.2865381149756713,
"grad_norm": 0.46090960041448786,
"learning_rate": 9.54954954954955e-06,
"loss": 0.5106,
"step": 530
},
{
"epoch": 0.2919444945035142,
"grad_norm": 0.48562395925030005,
"learning_rate": 9.729729729729732e-06,
"loss": 0.4974,
"step": 540
},
{
"epoch": 0.297350874031357,
"grad_norm": 0.4646077201771921,
"learning_rate": 9.90990990990991e-06,
"loss": 0.4999,
"step": 550
},
{
"epoch": 0.30275725355919986,
"grad_norm": 0.4546070354869126,
"learning_rate": 9.999975246862685e-06,
"loss": 0.5103,
"step": 560
},
{
"epoch": 0.3081636330870427,
"grad_norm": 0.4529892857679444,
"learning_rate": 9.999777223234682e-06,
"loss": 0.5015,
"step": 570
},
{
"epoch": 0.31357001261488554,
"grad_norm": 0.42533238661448763,
"learning_rate": 9.999381183821387e-06,
"loss": 0.5079,
"step": 580
},
{
"epoch": 0.31897639214272844,
"grad_norm": 0.4319966793689572,
"learning_rate": 9.998787144307906e-06,
"loss": 0.4946,
"step": 590
},
{
"epoch": 0.3243827716705713,
"grad_norm": 0.5664739889982127,
"learning_rate": 9.997995128221131e-06,
"loss": 0.4963,
"step": 600
},
{
"epoch": 0.3297891511984141,
"grad_norm": 0.4571640893613164,
"learning_rate": 9.9970051669288e-06,
"loss": 0.4937,
"step": 610
},
{
"epoch": 0.33519553072625696,
"grad_norm": 0.46148944851299945,
"learning_rate": 9.995817299638244e-06,
"loss": 0.5002,
"step": 620
},
{
"epoch": 0.34060191025409986,
"grad_norm": 0.4844168889608816,
"learning_rate": 9.994431573394861e-06,
"loss": 0.5029,
"step": 630
},
{
"epoch": 0.3460082897819427,
"grad_norm": 0.4279693386473206,
"learning_rate": 9.99284804308023e-06,
"loss": 0.4952,
"step": 640
},
{
"epoch": 0.35141466930978554,
"grad_norm": 0.5233101609153901,
"learning_rate": 9.991066771409941e-06,
"loss": 0.4915,
"step": 650
},
{
"epoch": 0.3568210488376284,
"grad_norm": 0.4633208414221673,
"learning_rate": 9.989087828931121e-06,
"loss": 0.4981,
"step": 660
},
{
"epoch": 0.3622274283654713,
"grad_norm": 0.450997223108701,
"learning_rate": 9.986911294019631e-06,
"loss": 0.4975,
"step": 670
},
{
"epoch": 0.3676338078933141,
"grad_norm": 0.42452529740346523,
"learning_rate": 9.984537252876969e-06,
"loss": 0.4908,
"step": 680
},
{
"epoch": 0.37304018742115697,
"grad_norm": 0.46365207035760786,
"learning_rate": 9.981965799526846e-06,
"loss": 0.5016,
"step": 690
},
{
"epoch": 0.3784465669489998,
"grad_norm": 0.5296232726547591,
"learning_rate": 9.97919703581147e-06,
"loss": 0.4876,
"step": 700
},
{
"epoch": 0.38385294647684265,
"grad_norm": 0.401880074927354,
"learning_rate": 9.976231071387513e-06,
"loss": 0.4903,
"step": 710
},
{
"epoch": 0.38925932600468555,
"grad_norm": 0.42396559048043103,
"learning_rate": 9.973068023721761e-06,
"loss": 0.4898,
"step": 720
},
{
"epoch": 0.3946657055325284,
"grad_norm": 0.46944427807049693,
"learning_rate": 9.969708018086472e-06,
"loss": 0.4881,
"step": 730
},
{
"epoch": 0.40007208506037123,
"grad_norm": 0.4333253518146232,
"learning_rate": 9.966151187554403e-06,
"loss": 0.4895,
"step": 740
},
{
"epoch": 0.40547846458821407,
"grad_norm": 0.37661719489991125,
"learning_rate": 9.962397672993552e-06,
"loss": 0.487,
"step": 750
},
{
"epoch": 0.41088484411605697,
"grad_norm": 0.4603392631171023,
"learning_rate": 9.958447623061564e-06,
"loss": 0.4872,
"step": 760
},
{
"epoch": 0.4162912236438998,
"grad_norm": 0.3927558003883759,
"learning_rate": 9.954301194199864e-06,
"loss": 0.4903,
"step": 770
},
{
"epoch": 0.42169760317174265,
"grad_norm": 0.42897879593990096,
"learning_rate": 9.949958550627436e-06,
"loss": 0.4885,
"step": 780
},
{
"epoch": 0.4271039826995855,
"grad_norm": 0.4924374446694773,
"learning_rate": 9.945419864334344e-06,
"loss": 0.4774,
"step": 790
},
{
"epoch": 0.4325103622274284,
"grad_norm": 0.42518945879483444,
"learning_rate": 9.940685315074898e-06,
"loss": 0.4754,
"step": 800
},
{
"epoch": 0.43791674175527123,
"grad_norm": 0.399260485682431,
"learning_rate": 9.935755090360554e-06,
"loss": 0.4765,
"step": 810
},
{
"epoch": 0.4433231212831141,
"grad_norm": 0.37083672732602235,
"learning_rate": 9.930629385452475e-06,
"loss": 0.4757,
"step": 820
},
{
"epoch": 0.4487295008109569,
"grad_norm": 0.41759222116367195,
"learning_rate": 9.925308403353801e-06,
"loss": 0.4871,
"step": 830
},
{
"epoch": 0.45413588033879976,
"grad_norm": 0.4969932090759188,
"learning_rate": 9.919792354801614e-06,
"loss": 0.4792,
"step": 840
},
{
"epoch": 0.45954225986664266,
"grad_norm": 0.5029960802938596,
"learning_rate": 9.914081458258582e-06,
"loss": 0.4896,
"step": 850
},
{
"epoch": 0.4649486393944855,
"grad_norm": 0.40244747307174517,
"learning_rate": 9.908175939904317e-06,
"loss": 0.492,
"step": 860
},
{
"epoch": 0.47035501892232834,
"grad_norm": 0.4109529990790928,
"learning_rate": 9.902076033626409e-06,
"loss": 0.4863,
"step": 870
},
{
"epoch": 0.4757613984501712,
"grad_norm": 0.4151789891424962,
"learning_rate": 9.89578198101117e-06,
"loss": 0.48,
"step": 880
},
{
"epoch": 0.4811677779780141,
"grad_norm": 0.4884869421566706,
"learning_rate": 9.88929403133406e-06,
"loss": 0.4875,
"step": 890
},
{
"epoch": 0.4865741575058569,
"grad_norm": 0.39469839728031286,
"learning_rate": 9.882612441549817e-06,
"loss": 0.4886,
"step": 900
},
{
"epoch": 0.49198053703369976,
"grad_norm": 0.41142281651530643,
"learning_rate": 9.875737476282283e-06,
"loss": 0.4837,
"step": 910
},
{
"epoch": 0.4973869165615426,
"grad_norm": 0.4420691443729092,
"learning_rate": 9.868669407813919e-06,
"loss": 0.4877,
"step": 920
},
{
"epoch": 0.5027932960893855,
"grad_norm": 0.37836126000922937,
"learning_rate": 9.86140851607502e-06,
"loss": 0.4826,
"step": 930
},
{
"epoch": 0.5081996756172283,
"grad_norm": 0.42066137745562854,
"learning_rate": 9.85395508863264e-06,
"loss": 0.4827,
"step": 940
},
{
"epoch": 0.5136060551450712,
"grad_norm": 0.45522508321704436,
"learning_rate": 9.846309420679181e-06,
"loss": 0.4807,
"step": 950
},
{
"epoch": 0.519012434672914,
"grad_norm": 0.424109403832704,
"learning_rate": 9.838471815020731e-06,
"loss": 0.483,
"step": 960
},
{
"epoch": 0.5244188142007569,
"grad_norm": 0.4571075574503357,
"learning_rate": 9.830442582065046e-06,
"loss": 0.4847,
"step": 970
},
{
"epoch": 0.5298251937285997,
"grad_norm": 0.39544147521974715,
"learning_rate": 9.822222039809265e-06,
"loss": 0.4894,
"step": 980
},
{
"epoch": 0.5352315732564425,
"grad_norm": 0.41512982878770877,
"learning_rate": 9.813810513827324e-06,
"loss": 0.4757,
"step": 990
},
{
"epoch": 0.5406379527842855,
"grad_norm": 0.44241530882704766,
"learning_rate": 9.805208337257048e-06,
"loss": 0.4844,
"step": 1000
},
{
"epoch": 0.5460443323121283,
"grad_norm": 0.39829234416158904,
"learning_rate": 9.79641585078697e-06,
"loss": 0.4712,
"step": 1010
},
{
"epoch": 0.5514507118399712,
"grad_norm": 0.37741532471866907,
"learning_rate": 9.787433402642823e-06,
"loss": 0.4793,
"step": 1020
},
{
"epoch": 0.556857091367814,
"grad_norm": 0.4148300916885638,
"learning_rate": 9.778261348573766e-06,
"loss": 0.4838,
"step": 1030
},
{
"epoch": 0.5622634708956569,
"grad_norm": 0.4432803310345476,
"learning_rate": 9.76890005183828e-06,
"loss": 0.4808,
"step": 1040
},
{
"epoch": 0.5676698504234997,
"grad_norm": 0.44053440283249773,
"learning_rate": 9.759349883189788e-06,
"loss": 0.4855,
"step": 1050
},
{
"epoch": 0.5730762299513426,
"grad_norm": 0.47129417304470445,
"learning_rate": 9.749611220861975e-06,
"loss": 0.4825,
"step": 1060
},
{
"epoch": 0.5784826094791854,
"grad_norm": 0.3519052622952217,
"learning_rate": 9.739684450553796e-06,
"loss": 0.4672,
"step": 1070
},
{
"epoch": 0.5838889890070283,
"grad_norm": 0.41946435282373756,
"learning_rate": 9.729569965414214e-06,
"loss": 0.4749,
"step": 1080
},
{
"epoch": 0.5892953685348712,
"grad_norm": 0.40367405116733107,
"learning_rate": 9.719268166026619e-06,
"loss": 0.4714,
"step": 1090
},
{
"epoch": 0.594701748062714,
"grad_norm": 0.389163994716956,
"learning_rate": 9.70877946039297e-06,
"loss": 0.4762,
"step": 1100
},
{
"epoch": 0.6001081275905569,
"grad_norm": 0.3924144038563765,
"learning_rate": 9.698104263917632e-06,
"loss": 0.479,
"step": 1110
},
{
"epoch": 0.6055145071183997,
"grad_norm": 0.38077440580004723,
"learning_rate": 9.687242999390923e-06,
"loss": 0.4743,
"step": 1120
},
{
"epoch": 0.6109208866462426,
"grad_norm": 0.4144915670436874,
"learning_rate": 9.676196096972375e-06,
"loss": 0.4831,
"step": 1130
},
{
"epoch": 0.6163272661740854,
"grad_norm": 0.4019523099418982,
"learning_rate": 9.664963994173695e-06,
"loss": 0.4811,
"step": 1140
},
{
"epoch": 0.6217336457019282,
"grad_norm": 0.3870772083799463,
"learning_rate": 9.653547135841432e-06,
"loss": 0.482,
"step": 1150
},
{
"epoch": 0.6271400252297711,
"grad_norm": 0.3774486403943126,
"learning_rate": 9.641945974139368e-06,
"loss": 0.4808,
"step": 1160
},
{
"epoch": 0.632546404757614,
"grad_norm": 0.3669418201630717,
"learning_rate": 9.630160968530601e-06,
"loss": 0.4742,
"step": 1170
},
{
"epoch": 0.6379527842854569,
"grad_norm": 0.3767330377559856,
"learning_rate": 9.618192585759358e-06,
"loss": 0.4793,
"step": 1180
},
{
"epoch": 0.6433591638132997,
"grad_norm": 0.4109728050110914,
"learning_rate": 9.606041299832499e-06,
"loss": 0.476,
"step": 1190
},
{
"epoch": 0.6487655433411426,
"grad_norm": 0.42214280261521075,
"learning_rate": 9.593707592000751e-06,
"loss": 0.4719,
"step": 1200
},
{
"epoch": 0.6541719228689854,
"grad_norm": 0.40015675805718526,
"learning_rate": 9.581191950739651e-06,
"loss": 0.4802,
"step": 1210
},
{
"epoch": 0.6595783023968282,
"grad_norm": 0.3652325798758447,
"learning_rate": 9.568494871730184e-06,
"loss": 0.4751,
"step": 1220
},
{
"epoch": 0.6649846819246711,
"grad_norm": 0.4758040665812572,
"learning_rate": 9.555616857839171e-06,
"loss": 0.476,
"step": 1230
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.4088256926011169,
"learning_rate": 9.542558419099348e-06,
"loss": 0.4671,
"step": 1240
},
{
"epoch": 0.6757974409803568,
"grad_norm": 0.3777516778350075,
"learning_rate": 9.529320072689157e-06,
"loss": 0.4663,
"step": 1250
},
{
"epoch": 0.6812038205081997,
"grad_norm": 0.40279858714603456,
"learning_rate": 9.515902342912268e-06,
"loss": 0.4696,
"step": 1260
},
{
"epoch": 0.6866102000360426,
"grad_norm": 0.4553420901856075,
"learning_rate": 9.50230576117682e-06,
"loss": 0.4742,
"step": 1270
},
{
"epoch": 0.6920165795638854,
"grad_norm": 0.4339586123054069,
"learning_rate": 9.488530865974365e-06,
"loss": 0.4701,
"step": 1280
},
{
"epoch": 0.6974229590917282,
"grad_norm": 0.4249972919470697,
"learning_rate": 9.47457820285855e-06,
"loss": 0.4701,
"step": 1290
},
{
"epoch": 0.7028293386195711,
"grad_norm": 0.5108244833979698,
"learning_rate": 9.460448324423508e-06,
"loss": 0.4767,
"step": 1300
},
{
"epoch": 0.7082357181474139,
"grad_norm": 0.41029950466124815,
"learning_rate": 9.446141790281961e-06,
"loss": 0.4757,
"step": 1310
},
{
"epoch": 0.7136420976752568,
"grad_norm": 0.395665406767247,
"learning_rate": 9.431659167043079e-06,
"loss": 0.4657,
"step": 1320
},
{
"epoch": 0.7190484772030996,
"grad_norm": 0.3916187354896928,
"learning_rate": 9.417001028290019e-06,
"loss": 0.47,
"step": 1330
},
{
"epoch": 0.7244548567309426,
"grad_norm": 0.3841663885450239,
"learning_rate": 9.402167954557218e-06,
"loss": 0.4622,
"step": 1340
},
{
"epoch": 0.7298612362587854,
"grad_norm": 0.33000158409293234,
"learning_rate": 9.387160533307398e-06,
"loss": 0.4735,
"step": 1350
},
{
"epoch": 0.7352676157866282,
"grad_norm": 0.35110054752545317,
"learning_rate": 9.371979358908302e-06,
"loss": 0.4647,
"step": 1360
},
{
"epoch": 0.7406739953144711,
"grad_norm": 0.4060026085740451,
"learning_rate": 9.356625032609157e-06,
"loss": 0.4716,
"step": 1370
},
{
"epoch": 0.7460803748423139,
"grad_norm": 0.4014001214789219,
"learning_rate": 9.341098162516848e-06,
"loss": 0.4753,
"step": 1380
},
{
"epoch": 0.7514867543701568,
"grad_norm": 0.4466537387424745,
"learning_rate": 9.325399363571853e-06,
"loss": 0.4637,
"step": 1390
},
{
"epoch": 0.7568931338979996,
"grad_norm": 0.3789496760613153,
"learning_rate": 9.309529257523873e-06,
"loss": 0.4833,
"step": 1400
},
{
"epoch": 0.7622995134258425,
"grad_norm": 0.3871711262176569,
"learning_rate": 9.293488472907213e-06,
"loss": 0.4741,
"step": 1410
},
{
"epoch": 0.7677058929536853,
"grad_norm": 0.33522935773230744,
"learning_rate": 9.277277645015895e-06,
"loss": 0.4645,
"step": 1420
},
{
"epoch": 0.7731122724815283,
"grad_norm": 0.36926574454217775,
"learning_rate": 9.260897415878484e-06,
"loss": 0.4737,
"step": 1430
},
{
"epoch": 0.7785186520093711,
"grad_norm": 0.38628683202935965,
"learning_rate": 9.244348434232676e-06,
"loss": 0.4807,
"step": 1440
},
{
"epoch": 0.7839250315372139,
"grad_norm": 0.3723802508008121,
"learning_rate": 9.227631355499588e-06,
"loss": 0.4711,
"step": 1450
},
{
"epoch": 0.7893314110650568,
"grad_norm": 0.43275316141725356,
"learning_rate": 9.210746841757816e-06,
"loss": 0.4606,
"step": 1460
},
{
"epoch": 0.7947377905928996,
"grad_norm": 0.36470233384616396,
"learning_rate": 9.193695561717207e-06,
"loss": 0.4789,
"step": 1470
},
{
"epoch": 0.8001441701207425,
"grad_norm": 0.39548085338311784,
"learning_rate": 9.176478190692369e-06,
"loss": 0.4713,
"step": 1480
},
{
"epoch": 0.8055505496485853,
"grad_norm": 0.3553750033222167,
"learning_rate": 9.159095410575931e-06,
"loss": 0.4725,
"step": 1490
},
{
"epoch": 0.8109569291764281,
"grad_norm": 0.3637209745858356,
"learning_rate": 9.14154790981154e-06,
"loss": 0.4594,
"step": 1500
},
{
"epoch": 0.816363308704271,
"grad_norm": 0.3827679215177506,
"learning_rate": 9.12383638336659e-06,
"loss": 0.4731,
"step": 1510
},
{
"epoch": 0.8217696882321139,
"grad_norm": 0.3932319357502074,
"learning_rate": 9.105961532704695e-06,
"loss": 0.4744,
"step": 1520
},
{
"epoch": 0.8271760677599568,
"grad_norm": 0.37420610924572006,
"learning_rate": 9.08792406575792e-06,
"loss": 0.4596,
"step": 1530
},
{
"epoch": 0.8325824472877996,
"grad_norm": 0.36958869694379687,
"learning_rate": 9.069724696898727e-06,
"loss": 0.4644,
"step": 1540
},
{
"epoch": 0.8379888268156425,
"grad_norm": 0.4296266126218128,
"learning_rate": 9.051364146911696e-06,
"loss": 0.4695,
"step": 1550
},
{
"epoch": 0.8433952063434853,
"grad_norm": 0.3552866307907092,
"learning_rate": 9.03284314296497e-06,
"loss": 0.4699,
"step": 1560
},
{
"epoch": 0.8488015858713281,
"grad_norm": 0.36327016829544306,
"learning_rate": 9.01416241858146e-06,
"loss": 0.4669,
"step": 1570
},
{
"epoch": 0.854207965399171,
"grad_norm": 0.375420429355353,
"learning_rate": 8.995322713609792e-06,
"loss": 0.4672,
"step": 1580
},
{
"epoch": 0.8596143449270138,
"grad_norm": 0.5173900256611019,
"learning_rate": 8.976324774195005e-06,
"loss": 0.4683,
"step": 1590
},
{
"epoch": 0.8650207244548568,
"grad_norm": 0.39427484151317893,
"learning_rate": 8.957169352749005e-06,
"loss": 0.4652,
"step": 1600
},
{
"epoch": 0.8704271039826996,
"grad_norm": 0.4127231026821577,
"learning_rate": 8.937857207920751e-06,
"loss": 0.4693,
"step": 1610
},
{
"epoch": 0.8758334835105425,
"grad_norm": 0.3557084122875894,
"learning_rate": 8.918389104566232e-06,
"loss": 0.4653,
"step": 1620
},
{
"epoch": 0.8812398630383853,
"grad_norm": 0.32279027303173025,
"learning_rate": 8.898765813718155e-06,
"loss": 0.4575,
"step": 1630
},
{
"epoch": 0.8866462425662281,
"grad_norm": 0.3597815860403744,
"learning_rate": 8.878988112555415e-06,
"loss": 0.4635,
"step": 1640
},
{
"epoch": 0.892052622094071,
"grad_norm": 0.3672011391559523,
"learning_rate": 8.85905678437232e-06,
"loss": 0.4637,
"step": 1650
},
{
"epoch": 0.8974590016219138,
"grad_norm": 0.39802107641409196,
"learning_rate": 8.838972618547561e-06,
"loss": 0.4668,
"step": 1660
},
{
"epoch": 0.9028653811497567,
"grad_norm": 0.35901725656975336,
"learning_rate": 8.81873641051295e-06,
"loss": 0.4626,
"step": 1670
},
{
"epoch": 0.9082717606775995,
"grad_norm": 0.45574284613082794,
"learning_rate": 8.798348961721925e-06,
"loss": 0.4618,
"step": 1680
},
{
"epoch": 0.9136781402054425,
"grad_norm": 0.33960849857370073,
"learning_rate": 8.777811079617793e-06,
"loss": 0.4735,
"step": 1690
},
{
"epoch": 0.9190845197332853,
"grad_norm": 0.36806947123886746,
"learning_rate": 8.757123577601771e-06,
"loss": 0.4642,
"step": 1700
},
{
"epoch": 0.9244908992611282,
"grad_norm": 0.36728162811734544,
"learning_rate": 8.736287275000755e-06,
"loss": 0.465,
"step": 1710
},
{
"epoch": 0.929897278788971,
"grad_norm": 0.38164336488797146,
"learning_rate": 8.715302997034876e-06,
"loss": 0.4702,
"step": 1720
},
{
"epoch": 0.9353036583168138,
"grad_norm": 0.34605322849280384,
"learning_rate": 8.694171574784818e-06,
"loss": 0.4674,
"step": 1730
},
{
"epoch": 0.9407100378446567,
"grad_norm": 0.3353439147558085,
"learning_rate": 8.672893845158908e-06,
"loss": 0.4701,
"step": 1740
},
{
"epoch": 0.9461164173724995,
"grad_norm": 0.3437002297587831,
"learning_rate": 8.651470650859955e-06,
"loss": 0.4599,
"step": 1750
},
{
"epoch": 0.9515227969003424,
"grad_norm": 0.3431363969879203,
"learning_rate": 8.629902840351898e-06,
"loss": 0.4637,
"step": 1760
},
{
"epoch": 0.9569291764281853,
"grad_norm": 0.3765462141591892,
"learning_rate": 8.608191267826179e-06,
"loss": 0.4694,
"step": 1770
},
{
"epoch": 0.9623355559560282,
"grad_norm": 0.420048049416004,
"learning_rate": 8.586336793167926e-06,
"loss": 0.4641,
"step": 1780
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.412279889648995,
"learning_rate": 8.5643402819219e-06,
"loss": 0.4566,
"step": 1790
},
{
"epoch": 0.9731483150117138,
"grad_norm": 0.3299568555620076,
"learning_rate": 8.542202605258204e-06,
"loss": 0.463,
"step": 1800
},
{
"epoch": 0.9785546945395567,
"grad_norm": 0.32198105439404867,
"learning_rate": 8.519924639937786e-06,
"loss": 0.4617,
"step": 1810
},
{
"epoch": 0.9839610740673995,
"grad_norm": 0.3549245136848414,
"learning_rate": 8.49750726827772e-06,
"loss": 0.4565,
"step": 1820
},
{
"epoch": 0.9893674535952424,
"grad_norm": 0.3392271575380573,
"learning_rate": 8.474951378116253e-06,
"loss": 0.4639,
"step": 1830
},
{
"epoch": 0.9947738331230852,
"grad_norm": 0.3208227345701,
"learning_rate": 8.452257862777653e-06,
"loss": 0.4546,
"step": 1840
},
{
"epoch": 1.000180212650928,
"grad_norm": 0.4559641919273857,
"learning_rate": 8.42942762103681e-06,
"loss": 0.4837,
"step": 1850
},
{
"epoch": 1.005586592178771,
"grad_norm": 0.3598410288175877,
"learning_rate": 8.406461557083666e-06,
"loss": 0.4404,
"step": 1860
},
{
"epoch": 1.0109929717066137,
"grad_norm": 0.3857145460836866,
"learning_rate": 8.383360580487378e-06,
"loss": 0.4393,
"step": 1870
},
{
"epoch": 1.0163993512344567,
"grad_norm": 0.34505752597289024,
"learning_rate": 8.360125606160323e-06,
"loss": 0.4422,
"step": 1880
},
{
"epoch": 1.0218057307622994,
"grad_norm": 0.3739277339941646,
"learning_rate": 8.336757554321832e-06,
"loss": 0.4424,
"step": 1890
},
{
"epoch": 1.0272121102901424,
"grad_norm": 0.3968787668713752,
"learning_rate": 8.313257350461774e-06,
"loss": 0.4376,
"step": 1900
},
{
"epoch": 1.0326184898179853,
"grad_norm": 0.3451897271410753,
"learning_rate": 8.289625925303877e-06,
"loss": 0.4425,
"step": 1910
},
{
"epoch": 1.038024869345828,
"grad_norm": 0.40010047495902706,
"learning_rate": 8.265864214768883e-06,
"loss": 0.4503,
"step": 1920
},
{
"epoch": 1.043431248873671,
"grad_norm": 0.3736188460908676,
"learning_rate": 8.241973159937482e-06,
"loss": 0.4406,
"step": 1930
},
{
"epoch": 1.0488376284015137,
"grad_norm": 0.3394542766186862,
"learning_rate": 8.217953707013025e-06,
"loss": 0.4393,
"step": 1940
},
{
"epoch": 1.0542440079293567,
"grad_norm": 0.35077872709329283,
"learning_rate": 8.193806807284064e-06,
"loss": 0.4383,
"step": 1950
},
{
"epoch": 1.0596503874571994,
"grad_norm": 0.3441941331677373,
"learning_rate": 8.169533417086673e-06,
"loss": 0.4286,
"step": 1960
},
{
"epoch": 1.0650567669850424,
"grad_norm": 0.34884852607611294,
"learning_rate": 8.145134497766566e-06,
"loss": 0.4467,
"step": 1970
},
{
"epoch": 1.070463146512885,
"grad_norm": 0.40097746242132437,
"learning_rate": 8.120611015641036e-06,
"loss": 0.4363,
"step": 1980
},
{
"epoch": 1.075869526040728,
"grad_norm": 0.33184835023647064,
"learning_rate": 8.095963941960667e-06,
"loss": 0.437,
"step": 1990
},
{
"epoch": 1.081275905568571,
"grad_norm": 0.394546885758411,
"learning_rate": 8.071194252870887e-06,
"loss": 0.432,
"step": 2000
},
{
"epoch": 1.0866822850964137,
"grad_norm": 0.472784994513626,
"learning_rate": 8.046302929373286e-06,
"loss": 0.4367,
"step": 2010
},
{
"epoch": 1.0920886646242567,
"grad_norm": 0.3602670786653786,
"learning_rate": 8.021290957286787e-06,
"loss": 0.4352,
"step": 2020
},
{
"epoch": 1.0974950441520994,
"grad_norm": 0.3963387130392289,
"learning_rate": 7.996159327208581e-06,
"loss": 0.4434,
"step": 2030
},
{
"epoch": 1.1029014236799424,
"grad_norm": 0.37403782295160953,
"learning_rate": 7.97090903447491e-06,
"loss": 0.4326,
"step": 2040
},
{
"epoch": 1.108307803207785,
"grad_norm": 0.37350913921356577,
"learning_rate": 7.945541079121642e-06,
"loss": 0.4485,
"step": 2050
},
{
"epoch": 1.113714182735628,
"grad_norm": 0.3661212920976343,
"learning_rate": 7.920056465844658e-06,
"loss": 0.4328,
"step": 2060
},
{
"epoch": 1.119120562263471,
"grad_norm": 0.3507951321263283,
"learning_rate": 7.894456203960075e-06,
"loss": 0.4339,
"step": 2070
},
{
"epoch": 1.1245269417913137,
"grad_norm": 0.31935101139873434,
"learning_rate": 7.868741307364255e-06,
"loss": 0.4307,
"step": 2080
},
{
"epoch": 1.1299333213191567,
"grad_norm": 0.3240469373544592,
"learning_rate": 7.842912794493667e-06,
"loss": 0.4357,
"step": 2090
},
{
"epoch": 1.1353397008469994,
"grad_norm": 0.4024576218630106,
"learning_rate": 7.81697168828454e-06,
"loss": 0.4429,
"step": 2100
},
{
"epoch": 1.1407460803748424,
"grad_norm": 0.4057186928939639,
"learning_rate": 7.790919016132351e-06,
"loss": 0.4435,
"step": 2110
},
{
"epoch": 1.146152459902685,
"grad_norm": 0.4339123108369387,
"learning_rate": 7.764755809851141e-06,
"loss": 0.4375,
"step": 2120
},
{
"epoch": 1.151558839430528,
"grad_norm": 0.3423301493159426,
"learning_rate": 7.738483105632644e-06,
"loss": 0.4408,
"step": 2130
},
{
"epoch": 1.1569652189583708,
"grad_norm": 0.3049599421413694,
"learning_rate": 7.712101944005256e-06,
"loss": 0.442,
"step": 2140
},
{
"epoch": 1.1623715984862137,
"grad_norm": 0.3235699906736669,
"learning_rate": 7.685613369792815e-06,
"loss": 0.4389,
"step": 2150
},
{
"epoch": 1.1677779780140565,
"grad_norm": 0.38824198475727123,
"learning_rate": 7.65901843207323e-06,
"loss": 0.4372,
"step": 2160
},
{
"epoch": 1.1731843575418994,
"grad_norm": 0.3485465278129701,
"learning_rate": 7.63231818413692e-06,
"loss": 0.4313,
"step": 2170
},
{
"epoch": 1.1785907370697424,
"grad_norm": 0.3607061695090595,
"learning_rate": 7.605513683445118e-06,
"loss": 0.433,
"step": 2180
},
{
"epoch": 1.183997116597585,
"grad_norm": 0.35864049794241826,
"learning_rate": 7.578605991587974e-06,
"loss": 0.43,
"step": 2190
},
{
"epoch": 1.189403496125428,
"grad_norm": 0.3622129404816991,
"learning_rate": 7.5515961742425146e-06,
"loss": 0.4357,
"step": 2200
},
{
"epoch": 1.1948098756532708,
"grad_norm": 0.37719764002603634,
"learning_rate": 7.524485301130443e-06,
"loss": 0.4363,
"step": 2210
},
{
"epoch": 1.2002162551811137,
"grad_norm": 0.32038054153975193,
"learning_rate": 7.497274445975762e-06,
"loss": 0.4283,
"step": 2220
},
{
"epoch": 1.2056226347089565,
"grad_norm": 0.3897896894072551,
"learning_rate": 7.469964686462261e-06,
"loss": 0.4416,
"step": 2230
},
{
"epoch": 1.2110290142367994,
"grad_norm": 0.32144151391797593,
"learning_rate": 7.4425571041908254e-06,
"loss": 0.4388,
"step": 2240
},
{
"epoch": 1.2164353937646424,
"grad_norm": 0.3553047783046372,
"learning_rate": 7.415052784636603e-06,
"loss": 0.4401,
"step": 2250
},
{
"epoch": 1.2218417732924851,
"grad_norm": 0.31787401750902194,
"learning_rate": 7.387452817106017e-06,
"loss": 0.4313,
"step": 2260
},
{
"epoch": 1.227248152820328,
"grad_norm": 0.3736244875654426,
"learning_rate": 7.359758294693618e-06,
"loss": 0.4392,
"step": 2270
},
{
"epoch": 1.2326545323481708,
"grad_norm": 0.34863542131710556,
"learning_rate": 7.331970314238799e-06,
"loss": 0.4405,
"step": 2280
},
{
"epoch": 1.2380609118760137,
"grad_norm": 0.414690288534652,
"learning_rate": 7.304089976282348e-06,
"loss": 0.4401,
"step": 2290
},
{
"epoch": 1.2434672914038565,
"grad_norm": 0.356866165228421,
"learning_rate": 7.276118385022865e-06,
"loss": 0.4241,
"step": 2300
},
{
"epoch": 1.2488736709316994,
"grad_norm": 0.33264484884680307,
"learning_rate": 7.248056648273034e-06,
"loss": 0.4425,
"step": 2310
},
{
"epoch": 1.2542800504595424,
"grad_norm": 0.4175310788334551,
"learning_rate": 7.2199058774157375e-06,
"loss": 0.4276,
"step": 2320
},
{
"epoch": 1.2596864299873851,
"grad_norm": 0.38229588901030637,
"learning_rate": 7.1916671873600515e-06,
"loss": 0.4312,
"step": 2330
},
{
"epoch": 1.2650928095152278,
"grad_norm": 0.338696312422094,
"learning_rate": 7.163341696497084e-06,
"loss": 0.4405,
"step": 2340
},
{
"epoch": 1.2704991890430708,
"grad_norm": 0.32136223620818055,
"learning_rate": 7.134930526655679e-06,
"loss": 0.4347,
"step": 2350
},
{
"epoch": 1.2759055685709138,
"grad_norm": 0.3590441906111087,
"learning_rate": 7.106434803057998e-06,
"loss": 0.4392,
"step": 2360
},
{
"epoch": 1.2813119480987565,
"grad_norm": 0.3822900334441054,
"learning_rate": 7.077855654274939e-06,
"loss": 0.4329,
"step": 2370
},
{
"epoch": 1.2867183276265994,
"grad_norm": 0.4150924729603716,
"learning_rate": 7.04919421218145e-06,
"loss": 0.4344,
"step": 2380
},
{
"epoch": 1.2921247071544422,
"grad_norm": 0.31977805162237566,
"learning_rate": 7.020451611911703e-06,
"loss": 0.4274,
"step": 2390
},
{
"epoch": 1.2975310866822851,
"grad_norm": 0.4042413750463481,
"learning_rate": 6.9916289918141265e-06,
"loss": 0.4383,
"step": 2400
},
{
"epoch": 1.3029374662101278,
"grad_norm": 0.32750161889881924,
"learning_rate": 6.962727493406335e-06,
"loss": 0.4363,
"step": 2410
},
{
"epoch": 1.3083438457379708,
"grad_norm": 0.34681784503652924,
"learning_rate": 6.9337482613299065e-06,
"loss": 0.4251,
"step": 2420
},
{
"epoch": 1.3137502252658138,
"grad_norm": 0.31392667825247955,
"learning_rate": 6.904692443305059e-06,
"loss": 0.439,
"step": 2430
},
{
"epoch": 1.3191566047936565,
"grad_norm": 0.3080535811767778,
"learning_rate": 6.87556119008519e-06,
"loss": 0.4268,
"step": 2440
},
{
"epoch": 1.3245629843214994,
"grad_norm": 0.37030845399385603,
"learning_rate": 6.8463556554113005e-06,
"loss": 0.4353,
"step": 2450
},
{
"epoch": 1.3299693638493422,
"grad_norm": 0.3473034342384458,
"learning_rate": 6.8170769959663045e-06,
"loss": 0.4292,
"step": 2460
},
{
"epoch": 1.3353757433771851,
"grad_norm": 0.322256198293079,
"learning_rate": 6.787726371329214e-06,
"loss": 0.4402,
"step": 2470
},
{
"epoch": 1.3407821229050279,
"grad_norm": 0.3907219151376363,
"learning_rate": 6.7583049439292205e-06,
"loss": 0.4369,
"step": 2480
},
{
"epoch": 1.3461885024328708,
"grad_norm": 0.34928113227903806,
"learning_rate": 6.728813878999652e-06,
"loss": 0.4377,
"step": 2490
},
{
"epoch": 1.3515948819607138,
"grad_norm": 0.35544626757027864,
"learning_rate": 6.699254344531821e-06,
"loss": 0.4309,
"step": 2500
},
{
"epoch": 1.3570012614885565,
"grad_norm": 0.366218747083373,
"learning_rate": 6.669627511228778e-06,
"loss": 0.434,
"step": 2510
},
{
"epoch": 1.3624076410163992,
"grad_norm": 0.3580871935273299,
"learning_rate": 6.6399345524589366e-06,
"loss": 0.4401,
"step": 2520
},
{
"epoch": 1.3678140205442422,
"grad_norm": 0.29886314913995143,
"learning_rate": 6.610176644209602e-06,
"loss": 0.4266,
"step": 2530
},
{
"epoch": 1.3732204000720851,
"grad_norm": 0.3571328312104908,
"learning_rate": 6.580354965040396e-06,
"loss": 0.4393,
"step": 2540
},
{
"epoch": 1.3786267795999279,
"grad_norm": 0.3568154757493318,
"learning_rate": 6.550470696036591e-06,
"loss": 0.4276,
"step": 2550
},
{
"epoch": 1.3840331591277708,
"grad_norm": 0.3020834353942124,
"learning_rate": 6.520525020762318e-06,
"loss": 0.4374,
"step": 2560
},
{
"epoch": 1.3894395386556138,
"grad_norm": 0.4345861239807074,
"learning_rate": 6.490519125213701e-06,
"loss": 0.44,
"step": 2570
},
{
"epoch": 1.3948459181834565,
"grad_norm": 0.4164116140474957,
"learning_rate": 6.460454197771881e-06,
"loss": 0.4347,
"step": 2580
},
{
"epoch": 1.4002522977112992,
"grad_norm": 0.3698597319632245,
"learning_rate": 6.430331429155956e-06,
"loss": 0.4398,
"step": 2590
},
{
"epoch": 1.4056586772391422,
"grad_norm": 0.3557941383592286,
"learning_rate": 6.400152012375818e-06,
"loss": 0.4361,
"step": 2600
},
{
"epoch": 1.4110650567669851,
"grad_norm": 0.3703620913980966,
"learning_rate": 6.3699171426849036e-06,
"loss": 0.433,
"step": 2610
},
{
"epoch": 1.4164714362948279,
"grad_norm": 0.312372238883981,
"learning_rate": 6.339628017532858e-06,
"loss": 0.4305,
"step": 2620
},
{
"epoch": 1.4218778158226708,
"grad_norm": 0.32819677760603516,
"learning_rate": 6.309285836518113e-06,
"loss": 0.4289,
"step": 2630
},
{
"epoch": 1.4272841953505135,
"grad_norm": 0.34835896987461035,
"learning_rate": 6.2788918013403695e-06,
"loss": 0.4312,
"step": 2640
},
{
"epoch": 1.4326905748783565,
"grad_norm": 0.34043287674955064,
"learning_rate": 6.248447115753009e-06,
"loss": 0.4327,
"step": 2650
},
{
"epoch": 1.4380969544061992,
"grad_norm": 0.32777806734674225,
"learning_rate": 6.21795298551542e-06,
"loss": 0.4206,
"step": 2660
},
{
"epoch": 1.4435033339340422,
"grad_norm": 0.2839690869238431,
"learning_rate": 6.187410618345241e-06,
"loss": 0.4337,
"step": 2670
},
{
"epoch": 1.4489097134618851,
"grad_norm": 0.2845491198333412,
"learning_rate": 6.156821223870533e-06,
"loss": 0.428,
"step": 2680
},
{
"epoch": 1.4543160929897279,
"grad_norm": 0.3381278947086419,
"learning_rate": 6.126186013581868e-06,
"loss": 0.4442,
"step": 2690
},
{
"epoch": 1.4597224725175708,
"grad_norm": 0.2678673584947001,
"learning_rate": 6.095506200784349e-06,
"loss": 0.4313,
"step": 2700
},
{
"epoch": 1.4651288520454135,
"grad_norm": 0.32064492812884415,
"learning_rate": 6.06478300054956e-06,
"loss": 0.4443,
"step": 2710
},
{
"epoch": 1.4705352315732565,
"grad_norm": 0.33114310721210843,
"learning_rate": 6.034017629667439e-06,
"loss": 0.4321,
"step": 2720
},
{
"epoch": 1.4759416111010992,
"grad_norm": 0.3407274170049336,
"learning_rate": 6.003211306598089e-06,
"loss": 0.4302,
"step": 2730
},
{
"epoch": 1.4813479906289422,
"grad_norm": 0.3655959799961016,
"learning_rate": 5.972365251423521e-06,
"loss": 0.4331,
"step": 2740
},
{
"epoch": 1.4867543701567851,
"grad_norm": 0.3707027911602118,
"learning_rate": 5.941480685799338e-06,
"loss": 0.433,
"step": 2750
},
{
"epoch": 1.4921607496846279,
"grad_norm": 0.30224309374010494,
"learning_rate": 5.910558832906341e-06,
"loss": 0.4378,
"step": 2760
},
{
"epoch": 1.4975671292124706,
"grad_norm": 0.3421553953269554,
"learning_rate": 5.879600917402089e-06,
"loss": 0.4322,
"step": 2770
},
{
"epoch": 1.5029735087403135,
"grad_norm": 0.33381909956811917,
"learning_rate": 5.848608165372403e-06,
"loss": 0.425,
"step": 2780
},
{
"epoch": 1.5083798882681565,
"grad_norm": 0.3189833875248174,
"learning_rate": 5.8175818042828e-06,
"loss": 0.4357,
"step": 2790
},
{
"epoch": 1.5137862677959992,
"grad_norm": 0.36173513055424256,
"learning_rate": 5.78652306292988e-06,
"loss": 0.4395,
"step": 2800
},
{
"epoch": 1.5191926473238422,
"grad_norm": 0.3265416603091211,
"learning_rate": 5.75543317139266e-06,
"loss": 0.4426,
"step": 2810
},
{
"epoch": 1.5245990268516851,
"grad_norm": 0.33495795652653004,
"learning_rate": 5.724313360983859e-06,
"loss": 0.4335,
"step": 2820
},
{
"epoch": 1.5300054063795279,
"grad_norm": 0.35637908471545576,
"learning_rate": 5.693164864201134e-06,
"loss": 0.4343,
"step": 2830
},
{
"epoch": 1.5354117859073706,
"grad_norm": 0.3422755476029069,
"learning_rate": 5.661988914678257e-06,
"loss": 0.4201,
"step": 2840
},
{
"epoch": 1.5408181654352135,
"grad_norm": 0.29401423880776295,
"learning_rate": 5.630786747136269e-06,
"loss": 0.4263,
"step": 2850
},
{
"epoch": 1.5462245449630565,
"grad_norm": 0.35559246067713574,
"learning_rate": 5.599559597334568e-06,
"loss": 0.4327,
"step": 2860
},
{
"epoch": 1.5516309244908992,
"grad_norm": 0.3234026109207772,
"learning_rate": 5.56830870202198e-06,
"loss": 0.4284,
"step": 2870
},
{
"epoch": 1.557037304018742,
"grad_norm": 0.3041181368480941,
"learning_rate": 5.537035298887764e-06,
"loss": 0.4291,
"step": 2880
},
{
"epoch": 1.562443683546585,
"grad_norm": 0.4152034967270183,
"learning_rate": 5.505740626512601e-06,
"loss": 0.4333,
"step": 2890
},
{
"epoch": 1.5678500630744279,
"grad_norm": 0.32189843480023705,
"learning_rate": 5.474425924319538e-06,
"loss": 0.4313,
"step": 2900
},
{
"epoch": 1.5732564426022706,
"grad_norm": 0.3400408960358337,
"learning_rate": 5.443092432524906e-06,
"loss": 0.4446,
"step": 2910
},
{
"epoch": 1.5786628221301136,
"grad_norm": 0.3253331216756115,
"learning_rate": 5.411741392089192e-06,
"loss": 0.4276,
"step": 2920
},
{
"epoch": 1.5840692016579565,
"grad_norm": 0.34364169352732366,
"learning_rate": 5.380374044667896e-06,
"loss": 0.4363,
"step": 2930
},
{
"epoch": 1.5894755811857992,
"grad_norm": 0.2993302543547276,
"learning_rate": 5.348991632562355e-06,
"loss": 0.4347,
"step": 2940
},
{
"epoch": 1.594881960713642,
"grad_norm": 0.31140003151111195,
"learning_rate": 5.317595398670543e-06,
"loss": 0.4203,
"step": 2950
},
{
"epoch": 1.600288340241485,
"grad_norm": 0.34917215566088183,
"learning_rate": 5.286186586437845e-06,
"loss": 0.4394,
"step": 2960
},
{
"epoch": 1.6056947197693279,
"grad_norm": 0.3099678473182354,
"learning_rate": 5.254766439807807e-06,
"loss": 0.4224,
"step": 2970
},
{
"epoch": 1.6111010992971706,
"grad_norm": 0.32027842285858055,
"learning_rate": 5.223336203172874e-06,
"loss": 0.4289,
"step": 2980
},
{
"epoch": 1.6165074788250136,
"grad_norm": 0.29377503624337103,
"learning_rate": 5.191897121325111e-06,
"loss": 0.43,
"step": 2990
},
{
"epoch": 1.6219138583528565,
"grad_norm": 0.3286814138894788,
"learning_rate": 5.16045043940689e-06,
"loss": 0.4344,
"step": 3000
},
{
"epoch": 1.6273202378806992,
"grad_norm": 0.35588674616258936,
"learning_rate": 5.128997402861584e-06,
"loss": 0.4306,
"step": 3010
},
{
"epoch": 1.632726617408542,
"grad_norm": 0.33501603495492577,
"learning_rate": 5.09753925738424e-06,
"loss": 0.4154,
"step": 3020
},
{
"epoch": 1.638132996936385,
"grad_norm": 0.3011476898703049,
"learning_rate": 5.06607724887225e-06,
"loss": 0.4314,
"step": 3030
},
{
"epoch": 1.6435393764642279,
"grad_norm": 0.3879201939655995,
"learning_rate": 5.034612623375993e-06,
"loss": 0.4412,
"step": 3040
},
{
"epoch": 1.6489457559920706,
"grad_norm": 0.3426764786646151,
"learning_rate": 5.003146627049499e-06,
"loss": 0.4295,
"step": 3050
},
{
"epoch": 1.6543521355199133,
"grad_norm": 0.3408786770769329,
"learning_rate": 4.971680506101086e-06,
"loss": 0.4259,
"step": 3060
},
{
"epoch": 1.6597585150477565,
"grad_norm": 0.3689333373771858,
"learning_rate": 4.940215506744011e-06,
"loss": 0.4254,
"step": 3070
},
{
"epoch": 1.6651648945755992,
"grad_norm": 0.33725311763702437,
"learning_rate": 4.90875287514711e-06,
"loss": 0.4286,
"step": 3080
},
{
"epoch": 1.670571274103442,
"grad_norm": 0.3106105413402686,
"learning_rate": 4.87729385738544e-06,
"loss": 0.426,
"step": 3090
},
{
"epoch": 1.675977653631285,
"grad_norm": 0.361491556160267,
"learning_rate": 4.845839699390936e-06,
"loss": 0.4229,
"step": 3100
},
{
"epoch": 1.6813840331591279,
"grad_norm": 0.3012437306295753,
"learning_rate": 4.814391646903063e-06,
"loss": 0.4296,
"step": 3110
},
{
"epoch": 1.6867904126869706,
"grad_norm": 0.3142934287582159,
"learning_rate": 4.782950945419475e-06,
"loss": 0.4304,
"step": 3120
},
{
"epoch": 1.6921967922148133,
"grad_norm": 0.3024864799296645,
"learning_rate": 4.751518840146695e-06,
"loss": 0.4329,
"step": 3130
},
{
"epoch": 1.6976031717426563,
"grad_norm": 0.3081924919099197,
"learning_rate": 4.720096575950784e-06,
"loss": 0.4319,
"step": 3140
},
{
"epoch": 1.7030095512704992,
"grad_norm": 0.32189094915170496,
"learning_rate": 4.688685397308061e-06,
"loss": 0.42,
"step": 3150
},
{
"epoch": 1.708415930798342,
"grad_norm": 0.33972262308693657,
"learning_rate": 4.657286548255789e-06,
"loss": 0.4369,
"step": 3160
},
{
"epoch": 1.713822310326185,
"grad_norm": 0.30741331028975344,
"learning_rate": 4.6259012723429285e-06,
"loss": 0.4274,
"step": 3170
},
{
"epoch": 1.7192286898540279,
"grad_norm": 0.28971622178653267,
"learning_rate": 4.594530812580876e-06,
"loss": 0.4216,
"step": 3180
},
{
"epoch": 1.7246350693818706,
"grad_norm": 0.2792098363578085,
"learning_rate": 4.563176411394229e-06,
"loss": 0.4238,
"step": 3190
},
{
"epoch": 1.7300414489097133,
"grad_norm": 0.29274514837335597,
"learning_rate": 4.531839310571595e-06,
"loss": 0.4291,
"step": 3200
},
{
"epoch": 1.7354478284375563,
"grad_norm": 0.32996912353874136,
"learning_rate": 4.5005207512163914e-06,
"loss": 0.4388,
"step": 3210
},
{
"epoch": 1.7408542079653992,
"grad_norm": 0.34282857698540753,
"learning_rate": 4.469221973697714e-06,
"loss": 0.4373,
"step": 3220
},
{
"epoch": 1.746260587493242,
"grad_norm": 0.3147983795136612,
"learning_rate": 4.43794421760119e-06,
"loss": 0.4291,
"step": 3230
},
{
"epoch": 1.751666967021085,
"grad_norm": 0.2953517288607898,
"learning_rate": 4.4066887216799055e-06,
"loss": 0.4219,
"step": 3240
},
{
"epoch": 1.7570733465489279,
"grad_norm": 0.30489564567587807,
"learning_rate": 4.375456723805321e-06,
"loss": 0.4308,
"step": 3250
},
{
"epoch": 1.7624797260767706,
"grad_norm": 0.30950501632812377,
"learning_rate": 4.344249460918271e-06,
"loss": 0.4213,
"step": 3260
},
{
"epoch": 1.7678861056046133,
"grad_norm": 0.30230325895579757,
"learning_rate": 4.313068168979957e-06,
"loss": 0.4364,
"step": 3270
},
{
"epoch": 1.7732924851324563,
"grad_norm": 0.30774095159515363,
"learning_rate": 4.281914082923002e-06,
"loss": 0.4165,
"step": 3280
},
{
"epoch": 1.7786988646602993,
"grad_norm": 0.3275433264912912,
"learning_rate": 4.250788436602548e-06,
"loss": 0.4269,
"step": 3290
},
{
"epoch": 1.784105244188142,
"grad_norm": 0.3270523212461865,
"learning_rate": 4.2196924627473715e-06,
"loss": 0.4304,
"step": 3300
},
{
"epoch": 1.7895116237159847,
"grad_norm": 0.28953105726529316,
"learning_rate": 4.188627392911091e-06,
"loss": 0.4281,
"step": 3310
},
{
"epoch": 1.7949180032438277,
"grad_norm": 0.34157770345495453,
"learning_rate": 4.157594457423357e-06,
"loss": 0.432,
"step": 3320
},
{
"epoch": 1.8003243827716706,
"grad_norm": 0.2952227481543905,
"learning_rate": 4.1265948853411506e-06,
"loss": 0.427,
"step": 3330
},
{
"epoch": 1.8057307622995133,
"grad_norm": 0.3058432699391948,
"learning_rate": 4.095629904400097e-06,
"loss": 0.4268,
"step": 3340
},
{
"epoch": 1.8111371418273563,
"grad_norm": 0.32888818257409286,
"learning_rate": 4.06470074096584e-06,
"loss": 0.4334,
"step": 3350
},
{
"epoch": 1.8165435213551993,
"grad_norm": 0.29929296938295863,
"learning_rate": 4.0338086199854765e-06,
"loss": 0.4248,
"step": 3360
},
{
"epoch": 1.821949900883042,
"grad_norm": 0.33418978699429813,
"learning_rate": 4.0029547649390346e-06,
"loss": 0.4307,
"step": 3370
},
{
"epoch": 1.8273562804108847,
"grad_norm": 0.2991040804166494,
"learning_rate": 3.97214039779103e-06,
"loss": 0.435,
"step": 3380
},
{
"epoch": 1.8327626599387277,
"grad_norm": 0.2829911428105187,
"learning_rate": 3.941366738942058e-06,
"loss": 0.4246,
"step": 3390
},
{
"epoch": 1.8381690394665706,
"grad_norm": 0.2990384176756561,
"learning_rate": 3.910635007180468e-06,
"loss": 0.4394,
"step": 3400
},
{
"epoch": 1.8435754189944134,
"grad_norm": 0.28487793163600966,
"learning_rate": 3.879946419634087e-06,
"loss": 0.4268,
"step": 3410
},
{
"epoch": 1.8489817985222563,
"grad_norm": 0.30066911074015307,
"learning_rate": 3.8493021917220225e-06,
"loss": 0.4289,
"step": 3420
},
{
"epoch": 1.8543881780500993,
"grad_norm": 0.3145700146426358,
"learning_rate": 3.818703537106522e-06,
"loss": 0.427,
"step": 3430
},
{
"epoch": 1.859794557577942,
"grad_norm": 0.3121437364875441,
"learning_rate": 3.7881516676449014e-06,
"loss": 0.4334,
"step": 3440
},
{
"epoch": 1.8652009371057847,
"grad_norm": 0.2914138429548545,
"learning_rate": 3.7576477933415612e-06,
"loss": 0.4358,
"step": 3450
},
{
"epoch": 1.8706073166336277,
"grad_norm": 0.3263366427961882,
"learning_rate": 3.7271931223000507e-06,
"loss": 0.4294,
"step": 3460
},
{
"epoch": 1.8760136961614706,
"grad_norm": 0.3181986581808925,
"learning_rate": 3.6967888606752345e-06,
"loss": 0.433,
"step": 3470
},
{
"epoch": 1.8814200756893134,
"grad_norm": 0.31837041508546626,
"learning_rate": 3.6664362126255087e-06,
"loss": 0.4283,
"step": 3480
},
{
"epoch": 1.886826455217156,
"grad_norm": 0.2876960972161682,
"learning_rate": 3.636136380265124e-06,
"loss": 0.4189,
"step": 3490
},
{
"epoch": 1.8922328347449993,
"grad_norm": 0.30867320900321366,
"learning_rate": 3.6058905636165674e-06,
"loss": 0.4309,
"step": 3500
},
{
"epoch": 1.897639214272842,
"grad_norm": 0.29104980848951667,
"learning_rate": 3.575699960563038e-06,
"loss": 0.4184,
"step": 3510
},
{
"epoch": 1.9030455938006847,
"grad_norm": 0.2859389528274554,
"learning_rate": 3.5455657668010057e-06,
"loss": 0.4253,
"step": 3520
},
{
"epoch": 1.9084519733285277,
"grad_norm": 0.30910611127718657,
"learning_rate": 3.5154891757928523e-06,
"loss": 0.4257,
"step": 3530
},
{
"epoch": 1.9138583528563706,
"grad_norm": 0.31381289055858025,
"learning_rate": 3.4854713787196105e-06,
"loss": 0.4324,
"step": 3540
},
{
"epoch": 1.9192647323842134,
"grad_norm": 0.33654431291917486,
"learning_rate": 3.4555135644337803e-06,
"loss": 0.4262,
"step": 3550
},
{
"epoch": 1.924671111912056,
"grad_norm": 0.30712399081960845,
"learning_rate": 3.42561691941225e-06,
"loss": 0.4344,
"step": 3560
},
{
"epoch": 1.930077491439899,
"grad_norm": 0.2989668977037765,
"learning_rate": 3.3957826277093074e-06,
"loss": 0.4278,
"step": 3570
},
{
"epoch": 1.935483870967742,
"grad_norm": 0.3259516671848096,
"learning_rate": 3.3660118709097347e-06,
"loss": 0.4242,
"step": 3580
},
{
"epoch": 1.9408902504955847,
"grad_norm": 0.29719187591192203,
"learning_rate": 3.336305828082024e-06,
"loss": 0.4319,
"step": 3590
},
{
"epoch": 1.9462966300234277,
"grad_norm": 0.3250815058947025,
"learning_rate": 3.306665675731674e-06,
"loss": 0.4324,
"step": 3600
},
{
"epoch": 1.9517030095512706,
"grad_norm": 0.3196705993035981,
"learning_rate": 3.277092587754598e-06,
"loss": 0.4283,
"step": 3610
},
{
"epoch": 1.9571093890791134,
"grad_norm": 0.2836241969868925,
"learning_rate": 3.247587735390628e-06,
"loss": 0.4285,
"step": 3620
},
{
"epoch": 1.962515768606956,
"grad_norm": 0.2963451307813687,
"learning_rate": 3.218152287177133e-06,
"loss": 0.4233,
"step": 3630
},
{
"epoch": 1.967922148134799,
"grad_norm": 0.32162438964611967,
"learning_rate": 3.1887874089027304e-06,
"loss": 0.4275,
"step": 3640
},
{
"epoch": 1.973328527662642,
"grad_norm": 0.2858747270839711,
"learning_rate": 3.159494263561126e-06,
"loss": 0.429,
"step": 3650
},
{
"epoch": 1.9787349071904847,
"grad_norm": 0.294205581889964,
"learning_rate": 3.130274011305047e-06,
"loss": 0.4261,
"step": 3660
},
{
"epoch": 1.9841412867183277,
"grad_norm": 0.3271655262933234,
"learning_rate": 3.1011278094002928e-06,
"loss": 0.4352,
"step": 3670
},
{
"epoch": 1.9895476662461706,
"grad_norm": 0.3151321646815863,
"learning_rate": 3.0720568121799105e-06,
"loss": 0.4302,
"step": 3680
},
{
"epoch": 1.9949540457740134,
"grad_norm": 0.3069606817223593,
"learning_rate": 3.043062170998464e-06,
"loss": 0.4274,
"step": 3690
},
{
"epoch": 2.000360425301856,
"grad_norm": 0.3418886732932903,
"learning_rate": 3.0141450341864486e-06,
"loss": 0.4368,
"step": 3700
},
{
"epoch": 2.005766804829699,
"grad_norm": 0.28231273100784204,
"learning_rate": 2.9853065470048016e-06,
"loss": 0.4084,
"step": 3710
},
{
"epoch": 2.011173184357542,
"grad_norm": 0.27285411121752895,
"learning_rate": 2.956547851599548e-06,
"loss": 0.3899,
"step": 3720
},
{
"epoch": 2.0165795638853847,
"grad_norm": 0.31740692003997667,
"learning_rate": 2.9278700869565713e-06,
"loss": 0.406,
"step": 3730
},
{
"epoch": 2.0219859434132275,
"grad_norm": 0.32723222207620034,
"learning_rate": 2.8992743888564886e-06,
"loss": 0.4107,
"step": 3740
},
{
"epoch": 2.0273923229410706,
"grad_norm": 0.3293876655149398,
"learning_rate": 2.8707618898296864e-06,
"loss": 0.4052,
"step": 3750
},
{
"epoch": 2.0327987024689134,
"grad_norm": 0.26473497263074053,
"learning_rate": 2.8423337191114495e-06,
"loss": 0.402,
"step": 3760
},
{
"epoch": 2.038205081996756,
"grad_norm": 0.31910999655360905,
"learning_rate": 2.8139910025972622e-06,
"loss": 0.4134,
"step": 3770
},
{
"epoch": 2.043611461524599,
"grad_norm": 0.29154253424627524,
"learning_rate": 2.785734862798184e-06,
"loss": 0.4086,
"step": 3780
},
{
"epoch": 2.049017841052442,
"grad_norm": 0.2910125618297838,
"learning_rate": 2.7575664187964236e-06,
"loss": 0.4007,
"step": 3790
},
{
"epoch": 2.0544242205802847,
"grad_norm": 0.28793585101610353,
"learning_rate": 2.7294867862009937e-06,
"loss": 0.4053,
"step": 3800
},
{
"epoch": 2.0598306001081275,
"grad_norm": 0.2731032601573403,
"learning_rate": 2.7014970771035474e-06,
"loss": 0.4138,
"step": 3810
},
{
"epoch": 2.0652369796359706,
"grad_norm": 0.29876809472359783,
"learning_rate": 2.6735984000343216e-06,
"loss": 0.4156,
"step": 3820
},
{
"epoch": 2.0706433591638134,
"grad_norm": 0.3100743441240049,
"learning_rate": 2.645791859918234e-06,
"loss": 0.4089,
"step": 3830
},
{
"epoch": 2.076049738691656,
"grad_norm": 0.34676569440909566,
"learning_rate": 2.6180785580311284e-06,
"loss": 0.3998,
"step": 3840
},
{
"epoch": 2.081456118219499,
"grad_norm": 0.28331404223893575,
"learning_rate": 2.5904595919561563e-06,
"loss": 0.3935,
"step": 3850
},
{
"epoch": 2.086862497747342,
"grad_norm": 0.2892120423588288,
"learning_rate": 2.562936055540307e-06,
"loss": 0.411,
"step": 3860
},
{
"epoch": 2.0922688772751847,
"grad_norm": 0.29210558202813347,
"learning_rate": 2.5355090388510806e-06,
"loss": 0.4108,
"step": 3870
},
{
"epoch": 2.0976752568030275,
"grad_norm": 0.29027866503096267,
"learning_rate": 2.508179628133326e-06,
"loss": 0.4016,
"step": 3880
},
{
"epoch": 2.1030816363308706,
"grad_norm": 0.2876065349136538,
"learning_rate": 2.4809489057662168e-06,
"loss": 0.4101,
"step": 3890
},
{
"epoch": 2.1084880158587134,
"grad_norm": 0.3135899601532618,
"learning_rate": 2.4538179502203753e-06,
"loss": 0.4001,
"step": 3900
},
{
"epoch": 2.113894395386556,
"grad_norm": 0.30848425065584256,
"learning_rate": 2.4267878360151747e-06,
"loss": 0.3997,
"step": 3910
},
{
"epoch": 2.119300774914399,
"grad_norm": 0.2923032276510183,
"learning_rate": 2.399859633676165e-06,
"loss": 0.4049,
"step": 3920
},
{
"epoch": 2.124707154442242,
"grad_norm": 0.29055776768248115,
"learning_rate": 2.3730344096926974e-06,
"loss": 0.3981,
"step": 3930
},
{
"epoch": 2.1301135339700847,
"grad_norm": 0.3161385412337821,
"learning_rate": 2.3463132264756617e-06,
"loss": 0.4075,
"step": 3940
},
{
"epoch": 2.1355199134979275,
"grad_norm": 0.2828900068372096,
"learning_rate": 2.319697142315428e-06,
"loss": 0.3906,
"step": 3950
},
{
"epoch": 2.14092629302577,
"grad_norm": 0.26292390614915356,
"learning_rate": 2.293187211339926e-06,
"loss": 0.3991,
"step": 3960
},
{
"epoch": 2.1463326725536134,
"grad_norm": 0.2987394527032652,
"learning_rate": 2.2667844834728923e-06,
"loss": 0.3999,
"step": 3970
},
{
"epoch": 2.151739052081456,
"grad_norm": 0.27915670540136367,
"learning_rate": 2.2404900043922996e-06,
"loss": 0.3995,
"step": 3980
},
{
"epoch": 2.157145431609299,
"grad_norm": 0.2818164391888048,
"learning_rate": 2.2143048154889272e-06,
"loss": 0.4015,
"step": 3990
},
{
"epoch": 2.162551811137142,
"grad_norm": 0.26044900685376793,
"learning_rate": 2.1882299538251352e-06,
"loss": 0.4003,
"step": 4000
},
{
"epoch": 2.1679581906649847,
"grad_norm": 0.27297932069072756,
"learning_rate": 2.162266452093774e-06,
"loss": 0.4149,
"step": 4010
},
{
"epoch": 2.1733645701928275,
"grad_norm": 0.2978434115081757,
"learning_rate": 2.1364153385773007e-06,
"loss": 0.4018,
"step": 4020
},
{
"epoch": 2.17877094972067,
"grad_norm": 0.31586609932366294,
"learning_rate": 2.110677637107036e-06,
"loss": 0.4053,
"step": 4030
},
{
"epoch": 2.1841773292485134,
"grad_norm": 0.29030802044428805,
"learning_rate": 2.0850543670226318e-06,
"loss": 0.4065,
"step": 4040
},
{
"epoch": 2.189583708776356,
"grad_norm": 0.3365802334808058,
"learning_rate": 2.059546543131696e-06,
"loss": 0.405,
"step": 4050
},
{
"epoch": 2.194990088304199,
"grad_norm": 0.2995355365322975,
"learning_rate": 2.034155175669592e-06,
"loss": 0.4044,
"step": 4060
},
{
"epoch": 2.200396467832042,
"grad_norm": 0.2868235821916637,
"learning_rate": 2.0088812702594424e-06,
"loss": 0.4023,
"step": 4070
},
{
"epoch": 2.2058028473598847,
"grad_norm": 0.29532698621262965,
"learning_rate": 1.9837258278722855e-06,
"loss": 0.413,
"step": 4080
},
{
"epoch": 2.2112092268877275,
"grad_norm": 0.282345122194298,
"learning_rate": 1.9586898447874543e-06,
"loss": 0.4033,
"step": 4090
},
{
"epoch": 2.21661560641557,
"grad_norm": 0.28744059302390934,
"learning_rate": 1.933774312553092e-06,
"loss": 0.4002,
"step": 4100
},
{
"epoch": 2.2220219859434134,
"grad_norm": 0.29637974416632634,
"learning_rate": 1.9089802179469036e-06,
"loss": 0.397,
"step": 4110
},
{
"epoch": 2.227428365471256,
"grad_norm": 0.29136812414474506,
"learning_rate": 1.884308542937065e-06,
"loss": 0.4198,
"step": 4120
},
{
"epoch": 2.232834744999099,
"grad_norm": 0.28845833396948634,
"learning_rate": 1.8597602646433294e-06,
"loss": 0.4012,
"step": 4130
},
{
"epoch": 2.238241124526942,
"grad_norm": 0.31515767696033387,
"learning_rate": 1.8353363552983382e-06,
"loss": 0.4084,
"step": 4140
},
{
"epoch": 2.2436475040547847,
"grad_norm": 0.2852056906534805,
"learning_rate": 1.8110377822091057e-06,
"loss": 0.4129,
"step": 4150
},
{
"epoch": 2.2490538835826275,
"grad_norm": 0.2961534698999477,
"learning_rate": 1.7868655077187175e-06,
"loss": 0.404,
"step": 4160
},
{
"epoch": 2.25446026311047,
"grad_norm": 0.3026130823215708,
"learning_rate": 1.76282048916821e-06,
"loss": 0.4105,
"step": 4170
},
{
"epoch": 2.2598666426383134,
"grad_norm": 0.295103201693147,
"learning_rate": 1.7389036788586627e-06,
"loss": 0.4057,
"step": 4180
},
{
"epoch": 2.265273022166156,
"grad_norm": 0.26979492433946,
"learning_rate": 1.7151160240134702e-06,
"loss": 0.4027,
"step": 4190
},
{
"epoch": 2.270679401693999,
"grad_norm": 0.3069718829915049,
"learning_rate": 1.6914584667408408e-06,
"loss": 0.407,
"step": 4200
},
{
"epoch": 2.276085781221842,
"grad_norm": 0.2582555297518662,
"learning_rate": 1.6679319439964797e-06,
"loss": 0.3943,
"step": 4210
},
{
"epoch": 2.2814921607496847,
"grad_norm": 0.30300112933414725,
"learning_rate": 1.6445373875464738e-06,
"loss": 0.4073,
"step": 4220
},
{
"epoch": 2.2868985402775275,
"grad_norm": 0.27640155584834986,
"learning_rate": 1.6212757239304e-06,
"loss": 0.4074,
"step": 4230
},
{
"epoch": 2.29230491980537,
"grad_norm": 0.288482277273483,
"learning_rate": 1.5981478744246242e-06,
"loss": 0.3961,
"step": 4240
},
{
"epoch": 2.297711299333213,
"grad_norm": 0.2968944260811366,
"learning_rate": 1.575154755005816e-06,
"loss": 0.403,
"step": 4250
},
{
"epoch": 2.303117678861056,
"grad_norm": 0.29278471655933946,
"learning_rate": 1.5522972763146653e-06,
"loss": 0.4019,
"step": 4260
},
{
"epoch": 2.308524058388899,
"grad_norm": 0.2729883421366084,
"learning_rate": 1.5295763436198274e-06,
"loss": 0.4148,
"step": 4270
},
{
"epoch": 2.3139304379167416,
"grad_norm": 0.30284845140590294,
"learning_rate": 1.5069928567820635e-06,
"loss": 0.4016,
"step": 4280
},
{
"epoch": 2.3193368174445848,
"grad_norm": 0.3044664985270554,
"learning_rate": 1.4845477102185974e-06,
"loss": 0.4092,
"step": 4290
},
{
"epoch": 2.3247431969724275,
"grad_norm": 0.30467048506977945,
"learning_rate": 1.4622417928677034e-06,
"loss": 0.3997,
"step": 4300
},
{
"epoch": 2.33014957650027,
"grad_norm": 0.25546815283849933,
"learning_rate": 1.4400759881534886e-06,
"loss": 0.3988,
"step": 4310
},
{
"epoch": 2.335555956028113,
"grad_norm": 0.2852027186621198,
"learning_rate": 1.418051173950914e-06,
"loss": 0.4124,
"step": 4320
},
{
"epoch": 2.340962335555956,
"grad_norm": 0.28906302811953016,
"learning_rate": 1.3961682225510203e-06,
"loss": 0.3993,
"step": 4330
},
{
"epoch": 2.346368715083799,
"grad_norm": 0.27197836639387235,
"learning_rate": 1.3744280006263839e-06,
"loss": 0.408,
"step": 4340
},
{
"epoch": 2.3517750946116416,
"grad_norm": 0.2668399923208869,
"learning_rate": 1.3528313691967926e-06,
"loss": 0.4134,
"step": 4350
},
{
"epoch": 2.3571814741394848,
"grad_norm": 0.2872848077693314,
"learning_rate": 1.3313791835951396e-06,
"loss": 0.4045,
"step": 4360
},
{
"epoch": 2.3625878536673275,
"grad_norm": 0.29802601615160446,
"learning_rate": 1.310072293433558e-06,
"loss": 0.4014,
"step": 4370
},
{
"epoch": 2.36799423319517,
"grad_norm": 0.25723071187565805,
"learning_rate": 1.2889115425697612e-06,
"loss": 0.399,
"step": 4380
},
{
"epoch": 2.373400612723013,
"grad_norm": 0.2842104581531295,
"learning_rate": 1.2678977690736311e-06,
"loss": 0.4015,
"step": 4390
},
{
"epoch": 2.378806992250856,
"grad_norm": 0.2813179130833351,
"learning_rate": 1.2470318051940205e-06,
"loss": 0.4026,
"step": 4400
},
{
"epoch": 2.384213371778699,
"grad_norm": 0.27762098429764004,
"learning_rate": 1.2263144773257967e-06,
"loss": 0.4068,
"step": 4410
},
{
"epoch": 2.3896197513065416,
"grad_norm": 0.27848678899943174,
"learning_rate": 1.2057466059771035e-06,
"loss": 0.4006,
"step": 4420
},
{
"epoch": 2.3950261308343848,
"grad_norm": 0.27875535013460345,
"learning_rate": 1.1853290057368754e-06,
"loss": 0.4088,
"step": 4430
},
{
"epoch": 2.4004325103622275,
"grad_norm": 0.2662344684523685,
"learning_rate": 1.165062485242574e-06,
"loss": 0.4019,
"step": 4440
},
{
"epoch": 2.40583888989007,
"grad_norm": 0.3005215328293971,
"learning_rate": 1.1449478471481512e-06,
"loss": 0.411,
"step": 4450
},
{
"epoch": 2.411245269417913,
"grad_norm": 0.2712567161403629,
"learning_rate": 1.1249858880922771e-06,
"loss": 0.4059,
"step": 4460
},
{
"epoch": 2.416651648945756,
"grad_norm": 0.26211955276644977,
"learning_rate": 1.1051773986667735e-06,
"loss": 0.4051,
"step": 4470
},
{
"epoch": 2.422058028473599,
"grad_norm": 0.26165210615685336,
"learning_rate": 1.0855231633853137e-06,
"loss": 0.4068,
"step": 4480
},
{
"epoch": 2.4274644080014416,
"grad_norm": 0.2765363606523804,
"learning_rate": 1.0660239606523466e-06,
"loss": 0.4128,
"step": 4490
},
{
"epoch": 2.4328707875292848,
"grad_norm": 0.2770223660740028,
"learning_rate": 1.0466805627322685e-06,
"loss": 0.4055,
"step": 4500
},
{
"epoch": 2.4382771670571275,
"grad_norm": 0.266013699998984,
"learning_rate": 1.0274937357188414e-06,
"loss": 0.4049,
"step": 4510
},
{
"epoch": 2.4436835465849702,
"grad_norm": 0.25683355130670393,
"learning_rate": 1.0084642395048428e-06,
"loss": 0.4078,
"step": 4520
},
{
"epoch": 2.449089926112813,
"grad_norm": 0.2811697424270643,
"learning_rate": 9.895928277519822e-07,
"loss": 0.4092,
"step": 4530
},
{
"epoch": 2.454496305640656,
"grad_norm": 0.2836256278223854,
"learning_rate": 9.708802478610413e-07,
"loss": 0.4059,
"step": 4540
},
{
"epoch": 2.459902685168499,
"grad_norm": 0.2771952071252828,
"learning_rate": 9.523272409422829e-07,
"loss": 0.4112,
"step": 4550
},
{
"epoch": 2.4653090646963416,
"grad_norm": 0.2965292468618203,
"learning_rate": 9.339345417860918e-07,
"loss": 0.4028,
"step": 4560
},
{
"epoch": 2.4707154442241848,
"grad_norm": 0.307263683184186,
"learning_rate": 9.157028788338795e-07,
"loss": 0.4029,
"step": 4570
},
{
"epoch": 2.4761218237520275,
"grad_norm": 0.2922545833760392,
"learning_rate": 8.976329741492262e-07,
"loss": 0.3939,
"step": 4580
},
{
"epoch": 2.4815282032798702,
"grad_norm": 0.29211120065069335,
"learning_rate": 8.797255433892926e-07,
"loss": 0.4086,
"step": 4590
},
{
"epoch": 2.486934582807713,
"grad_norm": 0.28634400793358533,
"learning_rate": 8.619812957764729e-07,
"loss": 0.4059,
"step": 4600
},
{
"epoch": 2.492340962335556,
"grad_norm": 0.2646272575948771,
"learning_rate": 8.444009340703008e-07,
"loss": 0.398,
"step": 4610
},
{
"epoch": 2.497747341863399,
"grad_norm": 0.29066647888917396,
"learning_rate": 8.269851545396279e-07,
"loss": 0.4025,
"step": 4620
},
{
"epoch": 2.5031537213912416,
"grad_norm": 0.28424280479329644,
"learning_rate": 8.097346469350348e-07,
"loss": 0.4013,
"step": 4630
},
{
"epoch": 2.5085601009190848,
"grad_norm": 0.2896529003620974,
"learning_rate": 7.926500944615267e-07,
"loss": 0.4108,
"step": 4640
},
{
"epoch": 2.5139664804469275,
"grad_norm": 0.27346406286896946,
"learning_rate": 7.757321737514645e-07,
"loss": 0.3941,
"step": 4650
},
{
"epoch": 2.5193728599747702,
"grad_norm": 0.26882609264045565,
"learning_rate": 7.589815548377738e-07,
"loss": 0.4035,
"step": 4660
},
{
"epoch": 2.524779239502613,
"grad_norm": 0.27733293233890505,
"learning_rate": 7.423989011274052e-07,
"loss": 0.4085,
"step": 4670
},
{
"epoch": 2.5301856190304557,
"grad_norm": 0.25627085107348396,
"learning_rate": 7.259848693750582e-07,
"loss": 0.4017,
"step": 4680
},
{
"epoch": 2.535591998558299,
"grad_norm": 0.2691243234604463,
"learning_rate": 7.097401096571765e-07,
"loss": 0.3996,
"step": 4690
},
{
"epoch": 2.5409983780861416,
"grad_norm": 0.2764529789534093,
"learning_rate": 6.936652653461939e-07,
"loss": 0.4145,
"step": 4700
},
{
"epoch": 2.5464047576139848,
"grad_norm": 0.2902741811813119,
"learning_rate": 6.777609730850615e-07,
"loss": 0.4007,
"step": 4710
},
{
"epoch": 2.5518111371418275,
"grad_norm": 0.265969991168333,
"learning_rate": 6.620278627620286e-07,
"loss": 0.402,
"step": 4720
},
{
"epoch": 2.5572175166696702,
"grad_norm": 0.259196836837019,
"learning_rate": 6.464665574856977e-07,
"loss": 0.4124,
"step": 4730
},
{
"epoch": 2.562623896197513,
"grad_norm": 0.2829926842253021,
"learning_rate": 6.310776735603452e-07,
"loss": 0.3989,
"step": 4740
},
{
"epoch": 2.5680302757253557,
"grad_norm": 0.2694529736291035,
"learning_rate": 6.158618204615119e-07,
"loss": 0.4032,
"step": 4750
},
{
"epoch": 2.573436655253199,
"grad_norm": 0.2630102431201598,
"learning_rate": 6.008196008118705e-07,
"loss": 0.407,
"step": 4760
},
{
"epoch": 2.5788430347810416,
"grad_norm": 0.27146999027694685,
"learning_rate": 5.859516103573492e-07,
"loss": 0.3982,
"step": 4770
},
{
"epoch": 2.5842494143088843,
"grad_norm": 0.28346284777141134,
"learning_rate": 5.712584379435482e-07,
"loss": 0.3984,
"step": 4780
},
{
"epoch": 2.5896557938367275,
"grad_norm": 0.28197172604169823,
"learning_rate": 5.567406654924074e-07,
"loss": 0.3988,
"step": 4790
},
{
"epoch": 2.5950621733645702,
"grad_norm": 0.2717022634001503,
"learning_rate": 5.423988679791686e-07,
"loss": 0.4098,
"step": 4800
},
{
"epoch": 2.600468552892413,
"grad_norm": 0.276903744178795,
"learning_rate": 5.282336134095994e-07,
"loss": 0.4043,
"step": 4810
},
{
"epoch": 2.6058749324202557,
"grad_norm": 0.25453566586188486,
"learning_rate": 5.142454627974969e-07,
"loss": 0.3976,
"step": 4820
},
{
"epoch": 2.611281311948099,
"grad_norm": 0.2784736093310705,
"learning_rate": 5.00434970142471e-07,
"loss": 0.4062,
"step": 4830
},
{
"epoch": 2.6166876914759416,
"grad_norm": 0.24784017038474418,
"learning_rate": 4.868026824080008e-07,
"loss": 0.4061,
"step": 4840
},
{
"epoch": 2.6220940710037843,
"grad_norm": 0.2807417719405863,
"learning_rate": 4.7334913949977526e-07,
"loss": 0.4075,
"step": 4850
},
{
"epoch": 2.6275004505316275,
"grad_norm": 0.25346910500895187,
"learning_rate": 4.6007487424430565e-07,
"loss": 0.3964,
"step": 4860
},
{
"epoch": 2.6329068300594702,
"grad_norm": 0.27364761903392193,
"learning_rate": 4.46980412367829e-07,
"loss": 0.3938,
"step": 4870
},
{
"epoch": 2.638313209587313,
"grad_norm": 0.2765709048501121,
"learning_rate": 4.3406627247548184e-07,
"loss": 0.4074,
"step": 4880
},
{
"epoch": 2.6437195891151557,
"grad_norm": 0.2776500402889704,
"learning_rate": 4.21332966030763e-07,
"loss": 0.3994,
"step": 4890
},
{
"epoch": 2.649125968642999,
"grad_norm": 0.26079072827311783,
"learning_rate": 4.08780997335278e-07,
"loss": 0.4045,
"step": 4900
},
{
"epoch": 2.6545323481708416,
"grad_norm": 0.2397016051949167,
"learning_rate": 3.9641086350876155e-07,
"loss": 0.4029,
"step": 4910
},
{
"epoch": 2.6599387276986843,
"grad_norm": 0.29754617724142174,
"learning_rate": 3.84223054469397e-07,
"loss": 0.4018,
"step": 4920
},
{
"epoch": 2.6653451072265275,
"grad_norm": 0.27568276310419043,
"learning_rate": 3.722180529144054e-07,
"loss": 0.4096,
"step": 4930
},
{
"epoch": 2.6707514867543702,
"grad_norm": 0.25544292907340554,
"learning_rate": 3.6039633430093367e-07,
"loss": 0.4006,
"step": 4940
},
{
"epoch": 2.676157866282213,
"grad_norm": 0.2904302979415872,
"learning_rate": 3.4875836682722096e-07,
"loss": 0.4093,
"step": 4950
},
{
"epoch": 2.6815642458100557,
"grad_norm": 0.2796446372356396,
"learning_rate": 3.373046114140571e-07,
"loss": 0.4037,
"step": 4960
},
{
"epoch": 2.686970625337899,
"grad_norm": 0.2690617997319961,
"learning_rate": 3.260355216865291e-07,
"loss": 0.4058,
"step": 4970
},
{
"epoch": 2.6923770048657416,
"grad_norm": 0.27708751977237855,
"learning_rate": 3.149515439560524e-07,
"loss": 0.4084,
"step": 4980
},
{
"epoch": 2.6977833843935843,
"grad_norm": 0.25923770611284674,
"learning_rate": 3.040531172026978e-07,
"loss": 0.4035,
"step": 4990
},
{
"epoch": 2.7031897639214275,
"grad_norm": 0.2503752240400745,
"learning_rate": 2.933406730578009e-07,
"loss": 0.4094,
"step": 5000
},
{
"epoch": 2.7085961434492702,
"grad_norm": 0.27256002841564525,
"learning_rate": 2.828146357868755e-07,
"loss": 0.4049,
"step": 5010
},
{
"epoch": 2.714002522977113,
"grad_norm": 0.262526407381437,
"learning_rate": 2.7247542227280155e-07,
"loss": 0.399,
"step": 5020
},
{
"epoch": 2.7194089025049557,
"grad_norm": 0.26889496739047675,
"learning_rate": 2.6232344199932034e-07,
"loss": 0.3974,
"step": 5030
},
{
"epoch": 2.7248152820327984,
"grad_norm": 0.2581699169174531,
"learning_rate": 2.523590970348166e-07,
"loss": 0.4078,
"step": 5040
},
{
"epoch": 2.7302216615606416,
"grad_norm": 0.2681313769671267,
"learning_rate": 2.4258278201639117e-07,
"loss": 0.4083,
"step": 5050
},
{
"epoch": 2.7356280410884843,
"grad_norm": 0.2583458633767275,
"learning_rate": 2.3299488413423554e-07,
"loss": 0.4033,
"step": 5060
},
{
"epoch": 2.7410344206163275,
"grad_norm": 0.27176652448537475,
"learning_rate": 2.2359578311629272e-07,
"loss": 0.41,
"step": 5070
},
{
"epoch": 2.7464408001441702,
"grad_norm": 0.2651677980954859,
"learning_rate": 2.1438585121322465e-07,
"loss": 0.4048,
"step": 5080
},
{
"epoch": 2.751847179672013,
"grad_norm": 0.26468667998207535,
"learning_rate": 2.0536545318366018e-07,
"loss": 0.4089,
"step": 5090
},
{
"epoch": 2.7572535591998557,
"grad_norm": 0.2682578170402083,
"learning_rate": 1.9653494627975888e-07,
"loss": 0.404,
"step": 5100
},
{
"epoch": 2.7626599387276984,
"grad_norm": 0.27087994511441277,
"learning_rate": 1.8789468023305334e-07,
"loss": 0.4033,
"step": 5110
},
{
"epoch": 2.7680663182555416,
"grad_norm": 0.25252752081120117,
"learning_rate": 1.7944499724060484e-07,
"loss": 0.4086,
"step": 5120
},
{
"epoch": 2.7734726977833843,
"grad_norm": 0.2765603337180068,
"learning_rate": 1.711862319514457e-07,
"loss": 0.4058,
"step": 5130
},
{
"epoch": 2.7788790773112275,
"grad_norm": 0.2662570880480703,
"learning_rate": 1.6311871145332836e-07,
"loss": 0.4016,
"step": 5140
},
{
"epoch": 2.7842854568390702,
"grad_norm": 0.26536562491010973,
"learning_rate": 1.5524275525977073e-07,
"loss": 0.3961,
"step": 5150
},
{
"epoch": 2.789691836366913,
"grad_norm": 0.2696933797225792,
"learning_rate": 1.4755867529740064e-07,
"loss": 0.402,
"step": 5160
},
{
"epoch": 2.7950982158947557,
"grad_norm": 0.26230277928432566,
"learning_rate": 1.4006677589360307e-07,
"loss": 0.4006,
"step": 5170
},
{
"epoch": 2.8005045954225984,
"grad_norm": 0.2618189445881308,
"learning_rate": 1.3276735376446693e-07,
"loss": 0.4101,
"step": 5180
},
{
"epoch": 2.8059109749504416,
"grad_norm": 0.26154419260033057,
"learning_rate": 1.2566069800303393e-07,
"loss": 0.4007,
"step": 5190
},
{
"epoch": 2.8113173544782843,
"grad_norm": 0.26129803510244903,
"learning_rate": 1.1874709006784891e-07,
"loss": 0.4108,
"step": 5200
},
{
"epoch": 2.816723734006127,
"grad_norm": 0.2755262239215911,
"learning_rate": 1.1202680377181252e-07,
"loss": 0.4081,
"step": 5210
},
{
"epoch": 2.8221301135339703,
"grad_norm": 0.27615467193849846,
"learning_rate": 1.055001052713378e-07,
"loss": 0.4057,
"step": 5220
},
{
"epoch": 2.827536493061813,
"grad_norm": 0.2565394448779921,
"learning_rate": 9.916725305580632e-08,
"loss": 0.4074,
"step": 5230
},
{
"epoch": 2.8329428725896557,
"grad_norm": 0.29481883515723867,
"learning_rate": 9.302849793733526e-08,
"loss": 0.4037,
"step": 5240
},
{
"epoch": 2.8383492521174984,
"grad_norm": 0.2628737439763179,
"learning_rate": 8.708408304083927e-08,
"loss": 0.3982,
"step": 5250
},
{
"epoch": 2.8437556316453416,
"grad_norm": 0.2856973586242492,
"learning_rate": 8.133424379440535e-08,
"loss": 0.4098,
"step": 5260
},
{
"epoch": 2.8491620111731844,
"grad_norm": 0.2573191532815954,
"learning_rate": 7.577920791996595e-08,
"loss": 0.4021,
"step": 5270
},
{
"epoch": 2.854568390701027,
"grad_norm": 0.2671924144995498,
"learning_rate": 7.041919542428221e-08,
"loss": 0.4046,
"step": 5280
},
{
"epoch": 2.8599747702288703,
"grad_norm": 0.27125026996972024,
"learning_rate": 6.525441859022873e-08,
"loss": 0.3996,
"step": 5290
},
{
"epoch": 2.865381149756713,
"grad_norm": 0.2597885306736867,
"learning_rate": 6.028508196838811e-08,
"loss": 0.3991,
"step": 5300
},
{
"epoch": 2.8707875292845557,
"grad_norm": 0.2661065612840173,
"learning_rate": 5.551138236894793e-08,
"loss": 0.4082,
"step": 5310
},
{
"epoch": 2.8761939088123984,
"grad_norm": 0.27596106902272594,
"learning_rate": 5.093350885390591e-08,
"loss": 0.4092,
"step": 5320
},
{
"epoch": 2.8816002883402416,
"grad_norm": 0.2798778899386736,
"learning_rate": 4.655164272958534e-08,
"loss": 0.3935,
"step": 5330
},
{
"epoch": 2.8870066678680844,
"grad_norm": 0.2675281011170649,
"learning_rate": 4.236595753944972e-08,
"loss": 0.4049,
"step": 5340
},
{
"epoch": 2.892413047395927,
"grad_norm": 0.24219018671622744,
"learning_rate": 3.837661905723378e-08,
"loss": 0.4061,
"step": 5350
},
{
"epoch": 2.8978194269237703,
"grad_norm": 0.26852051522723963,
"learning_rate": 3.458378528037598e-08,
"loss": 0.3982,
"step": 5360
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.2598218760743794,
"learning_rate": 3.0987606423759644e-08,
"loss": 0.3978,
"step": 5370
},
{
"epoch": 2.9086321859794557,
"grad_norm": 0.24224454585639746,
"learning_rate": 2.7588224913768225e-08,
"loss": 0.4056,
"step": 5380
},
{
"epoch": 2.9140385655072985,
"grad_norm": 0.28293842876891173,
"learning_rate": 2.438577538263931e-08,
"loss": 0.4041,
"step": 5390
},
{
"epoch": 2.9194449450351416,
"grad_norm": 0.24273867782068695,
"learning_rate": 2.1380384663135523e-08,
"loss": 0.4046,
"step": 5400
},
{
"epoch": 2.9248513245629844,
"grad_norm": 0.2589867572465761,
"learning_rate": 1.8572171783521885e-08,
"loss": 0.4016,
"step": 5410
},
{
"epoch": 2.930257704090827,
"grad_norm": 0.26040920179163585,
"learning_rate": 1.596124796284848e-08,
"loss": 0.4048,
"step": 5420
},
{
"epoch": 2.9356640836186703,
"grad_norm": 0.28129280293565423,
"learning_rate": 1.3547716606548967e-08,
"loss": 0.4082,
"step": 5430
},
{
"epoch": 2.941070463146513,
"grad_norm": 0.27263421805264343,
"learning_rate": 1.133167330234386e-08,
"loss": 0.3957,
"step": 5440
},
{
"epoch": 2.9464768426743557,
"grad_norm": 0.27306797377575853,
"learning_rate": 9.313205816454674e-09,
"loss": 0.4097,
"step": 5450
},
{
"epoch": 2.9518832222021985,
"grad_norm": 0.26535989264790094,
"learning_rate": 7.492394090128364e-09,
"loss": 0.4091,
"step": 5460
},
{
"epoch": 2.957289601730041,
"grad_norm": 0.26682062170730547,
"learning_rate": 5.8693102364698604e-09,
"loss": 0.3975,
"step": 5470
},
{
"epoch": 2.9626959812578844,
"grad_norm": 0.2848285894683682,
"learning_rate": 4.444018537588801e-09,
"loss": 0.4075,
"step": 5480
},
{
"epoch": 2.968102360785727,
"grad_norm": 0.2853108418534249,
"learning_rate": 3.2165754420510063e-09,
"loss": 0.4107,
"step": 5490
},
{
"epoch": 2.9735087403135703,
"grad_norm": 0.26447810990716136,
"learning_rate": 2.1870295626441607e-09,
"loss": 0.4022,
"step": 5500
},
{
"epoch": 2.978915119841413,
"grad_norm": 0.2661971477507847,
"learning_rate": 1.3554216744521287e-09,
"loss": 0.4041,
"step": 5510
},
{
"epoch": 2.9843214993692557,
"grad_norm": 0.25582504114161564,
"learning_rate": 7.217847132401367e-10,
"loss": 0.4064,
"step": 5520
},
{
"epoch": 2.9897278788970985,
"grad_norm": 0.26069476073784237,
"learning_rate": 2.861437741508155e-10,
"loss": 0.4115,
"step": 5530
},
{
"epoch": 2.995134258424941,
"grad_norm": 0.27554755453273777,
"learning_rate": 4.851611070832984e-11,
"loss": 0.4016,
"step": 5540
},
{
"epoch": 2.9989187240944313,
"step": 5547,
"total_flos": 8484146955288576.0,
"train_loss": 0.44718967426225087,
"train_runtime": 93872.001,
"train_samples_per_second": 5.675,
"train_steps_per_second": 0.059
}
],
"logging_steps": 10,
"max_steps": 5547,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8484146955288576.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}