SFT-165k-V28 / trainer_state.json
Taskii's picture
Upload folder using huggingface_hub
31835a7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9274431057563588,
"eval_steps": 500,
"global_step": 4500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00428380187416332,
"grad_norm": 1.7204455338427447,
"learning_rate": 9.635974304068523e-08,
"loss": 0.310353684425354,
"step": 10
},
{
"epoch": 0.00856760374832664,
"grad_norm": 1.5542088461890025,
"learning_rate": 2.0342612419700217e-07,
"loss": 0.3165663003921509,
"step": 20
},
{
"epoch": 0.01285140562248996,
"grad_norm": 1.2986368981078442,
"learning_rate": 3.1049250535331905e-07,
"loss": 0.2931360721588135,
"step": 30
},
{
"epoch": 0.01713520749665328,
"grad_norm": 1.3372226018458075,
"learning_rate": 4.1755888650963603e-07,
"loss": 0.29114551544189454,
"step": 40
},
{
"epoch": 0.0214190093708166,
"grad_norm": 1.065879980657948,
"learning_rate": 5.24625267665953e-07,
"loss": 0.2506369352340698,
"step": 50
},
{
"epoch": 0.02570281124497992,
"grad_norm": 0.6781359530445974,
"learning_rate": 6.3169164882227e-07,
"loss": 0.22955031394958497,
"step": 60
},
{
"epoch": 0.02998661311914324,
"grad_norm": 0.5249895834183264,
"learning_rate": 7.387580299785868e-07,
"loss": 0.22067618370056152,
"step": 70
},
{
"epoch": 0.03427041499330656,
"grad_norm": 0.5375825140839746,
"learning_rate": 8.458244111349037e-07,
"loss": 0.21640052795410156,
"step": 80
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.5549702260654302,
"learning_rate": 9.528907922912206e-07,
"loss": 0.22633485794067382,
"step": 90
},
{
"epoch": 0.0428380187416332,
"grad_norm": 0.43849850542690183,
"learning_rate": 1.0599571734475375e-06,
"loss": 0.20759968757629393,
"step": 100
},
{
"epoch": 0.04712182061579652,
"grad_norm": 0.46359825184269493,
"learning_rate": 1.1670235546038546e-06,
"loss": 0.1973546862602234,
"step": 110
},
{
"epoch": 0.05140562248995984,
"grad_norm": 0.41875998061321557,
"learning_rate": 1.2740899357601712e-06,
"loss": 0.19023516178131103,
"step": 120
},
{
"epoch": 0.055689424364123156,
"grad_norm": 0.5127942743282548,
"learning_rate": 1.3811563169164883e-06,
"loss": 0.2127223491668701,
"step": 130
},
{
"epoch": 0.05997322623828648,
"grad_norm": 0.46830156678706125,
"learning_rate": 1.4882226980728054e-06,
"loss": 0.1934453845024109,
"step": 140
},
{
"epoch": 0.0642570281124498,
"grad_norm": 0.508710707179685,
"learning_rate": 1.5952890792291223e-06,
"loss": 0.2092526912689209,
"step": 150
},
{
"epoch": 0.06854082998661312,
"grad_norm": 0.5421780654693655,
"learning_rate": 1.7023554603854392e-06,
"loss": 0.2070756435394287,
"step": 160
},
{
"epoch": 0.07282463186077644,
"grad_norm": 0.493152534196984,
"learning_rate": 1.809421841541756e-06,
"loss": 0.19875586032867432,
"step": 170
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.4980754232181657,
"learning_rate": 1.916488222698073e-06,
"loss": 0.18669115304946898,
"step": 180
},
{
"epoch": 0.08139223560910308,
"grad_norm": 0.5304243381141518,
"learning_rate": 2.02355460385439e-06,
"loss": 0.2146810531616211,
"step": 190
},
{
"epoch": 0.0856760374832664,
"grad_norm": 0.5500032540817943,
"learning_rate": 2.1306209850107067e-06,
"loss": 0.19487186670303344,
"step": 200
},
{
"epoch": 0.08995983935742972,
"grad_norm": 0.42930678514550324,
"learning_rate": 2.2376873661670238e-06,
"loss": 0.18126411437988282,
"step": 210
},
{
"epoch": 0.09424364123159304,
"grad_norm": 0.5711883922291429,
"learning_rate": 2.3447537473233404e-06,
"loss": 0.2076016664505005,
"step": 220
},
{
"epoch": 0.09852744310575635,
"grad_norm": 0.6678390082859929,
"learning_rate": 2.4518201284796575e-06,
"loss": 0.20209894180297852,
"step": 230
},
{
"epoch": 0.10281124497991968,
"grad_norm": 0.4835994184206877,
"learning_rate": 2.558886509635974e-06,
"loss": 0.16395035982131959,
"step": 240
},
{
"epoch": 0.107095046854083,
"grad_norm": 0.42091478620818606,
"learning_rate": 2.6659528907922917e-06,
"loss": 0.17233937978744507,
"step": 250
},
{
"epoch": 0.11137884872824631,
"grad_norm": 0.5320893808200788,
"learning_rate": 2.7730192719486084e-06,
"loss": 0.18311020135879516,
"step": 260
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.5257058584608517,
"learning_rate": 2.8800856531049255e-06,
"loss": 0.20057764053344726,
"step": 270
},
{
"epoch": 0.11994645247657296,
"grad_norm": 0.5293308935757324,
"learning_rate": 2.987152034261242e-06,
"loss": 0.1837336540222168,
"step": 280
},
{
"epoch": 0.12423025435073627,
"grad_norm": 0.46642029540934604,
"learning_rate": 3.0942184154175592e-06,
"loss": 0.19081385135650636,
"step": 290
},
{
"epoch": 0.1285140562248996,
"grad_norm": 0.5628183880631954,
"learning_rate": 3.201284796573876e-06,
"loss": 0.17274467945098876,
"step": 300
},
{
"epoch": 0.13279785809906292,
"grad_norm": 0.48916829855974603,
"learning_rate": 3.308351177730193e-06,
"loss": 0.18039458990097046,
"step": 310
},
{
"epoch": 0.13708165997322624,
"grad_norm": 0.5298792138248726,
"learning_rate": 3.41541755888651e-06,
"loss": 0.1898115634918213,
"step": 320
},
{
"epoch": 0.14136546184738955,
"grad_norm": 0.439555530924186,
"learning_rate": 3.5224839400428268e-06,
"loss": 0.17530070543289183,
"step": 330
},
{
"epoch": 0.14564926372155287,
"grad_norm": 0.5117824224937999,
"learning_rate": 3.629550321199144e-06,
"loss": 0.17437742948532103,
"step": 340
},
{
"epoch": 0.1499330655957162,
"grad_norm": 0.4753694102031763,
"learning_rate": 3.7366167023554605e-06,
"loss": 0.17850689888000487,
"step": 350
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.6306563088828507,
"learning_rate": 3.843683083511778e-06,
"loss": 0.18741222620010375,
"step": 360
},
{
"epoch": 0.15850066934404283,
"grad_norm": 0.4694179327929818,
"learning_rate": 3.950749464668095e-06,
"loss": 0.17026090621948242,
"step": 370
},
{
"epoch": 0.16278447121820616,
"grad_norm": 0.5048197989896139,
"learning_rate": 4.057815845824411e-06,
"loss": 0.1726588487625122,
"step": 380
},
{
"epoch": 0.1670682730923695,
"grad_norm": 0.524700358081214,
"learning_rate": 4.164882226980728e-06,
"loss": 0.18944069147109985,
"step": 390
},
{
"epoch": 0.1713520749665328,
"grad_norm": 0.4571670229694066,
"learning_rate": 4.2719486081370455e-06,
"loss": 0.16420159339904786,
"step": 400
},
{
"epoch": 0.17563587684069612,
"grad_norm": 0.46820018814554304,
"learning_rate": 4.379014989293362e-06,
"loss": 0.19183117151260376,
"step": 410
},
{
"epoch": 0.17991967871485945,
"grad_norm": 0.46894253012471776,
"learning_rate": 4.486081370449679e-06,
"loss": 0.17184211015701295,
"step": 420
},
{
"epoch": 0.18420348058902275,
"grad_norm": 0.46858595995063135,
"learning_rate": 4.593147751605996e-06,
"loss": 0.17618422508239745,
"step": 430
},
{
"epoch": 0.18848728246318608,
"grad_norm": 0.5091947698167847,
"learning_rate": 4.700214132762313e-06,
"loss": 0.18246437311172486,
"step": 440
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.5203679422298269,
"learning_rate": 4.807280513918631e-06,
"loss": 0.16799516677856446,
"step": 450
},
{
"epoch": 0.1970548862115127,
"grad_norm": 0.4306784639956151,
"learning_rate": 4.914346895074946e-06,
"loss": 0.1661084771156311,
"step": 460
},
{
"epoch": 0.20133868808567604,
"grad_norm": 0.4604689622728843,
"learning_rate": 5.021413276231264e-06,
"loss": 0.17491416931152343,
"step": 470
},
{
"epoch": 0.20562248995983937,
"grad_norm": 0.47273083263588245,
"learning_rate": 5.128479657387581e-06,
"loss": 0.16252427101135253,
"step": 480
},
{
"epoch": 0.20990629183400267,
"grad_norm": 0.4611929063195057,
"learning_rate": 5.235546038543897e-06,
"loss": 0.1942029356956482,
"step": 490
},
{
"epoch": 0.214190093708166,
"grad_norm": 0.4640899103515948,
"learning_rate": 5.342612419700215e-06,
"loss": 0.1781969666481018,
"step": 500
},
{
"epoch": 0.214190093708166,
"eval_loss": 0.17298774421215057,
"eval_runtime": 813.3235,
"eval_samples_per_second": 20.41,
"eval_steps_per_second": 5.103,
"step": 500
},
{
"epoch": 0.21847389558232932,
"grad_norm": 0.4892486598590822,
"learning_rate": 5.4496788008565314e-06,
"loss": 0.17805953025817872,
"step": 510
},
{
"epoch": 0.22275769745649263,
"grad_norm": 0.45908639825034264,
"learning_rate": 5.556745182012848e-06,
"loss": 0.1704517126083374,
"step": 520
},
{
"epoch": 0.22704149933065595,
"grad_norm": 0.4606868972349124,
"learning_rate": 5.663811563169165e-06,
"loss": 0.17605620622634888,
"step": 530
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.4883203630934758,
"learning_rate": 5.770877944325482e-06,
"loss": 0.185607647895813,
"step": 540
},
{
"epoch": 0.23560910307898258,
"grad_norm": 0.480706769968442,
"learning_rate": 5.877944325481799e-06,
"loss": 0.1776334285736084,
"step": 550
},
{
"epoch": 0.2398929049531459,
"grad_norm": 0.43013827677127364,
"learning_rate": 5.985010706638116e-06,
"loss": 0.17925962209701538,
"step": 560
},
{
"epoch": 0.24417670682730924,
"grad_norm": 0.43681041122775155,
"learning_rate": 6.092077087794433e-06,
"loss": 0.15904269218444825,
"step": 570
},
{
"epoch": 0.24846050870147254,
"grad_norm": 0.4057531376060292,
"learning_rate": 6.19914346895075e-06,
"loss": 0.17201122045516967,
"step": 580
},
{
"epoch": 0.2527443105756359,
"grad_norm": 0.5693511659878766,
"learning_rate": 6.3062098501070665e-06,
"loss": 0.1783498764038086,
"step": 590
},
{
"epoch": 0.2570281124497992,
"grad_norm": 0.5038894023292907,
"learning_rate": 6.413276231263383e-06,
"loss": 0.16208181381225586,
"step": 600
},
{
"epoch": 0.2613119143239625,
"grad_norm": 0.4255056407918071,
"learning_rate": 6.5203426124197015e-06,
"loss": 0.1778697967529297,
"step": 610
},
{
"epoch": 0.26559571619812583,
"grad_norm": 0.42463834883952506,
"learning_rate": 6.627408993576018e-06,
"loss": 0.17847087383270263,
"step": 620
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.4280904585197745,
"learning_rate": 6.734475374732334e-06,
"loss": 0.16192808151245117,
"step": 630
},
{
"epoch": 0.2741633199464525,
"grad_norm": 0.4032310396751306,
"learning_rate": 6.841541755888651e-06,
"loss": 0.1500581383705139,
"step": 640
},
{
"epoch": 0.2784471218206158,
"grad_norm": 0.36936808807497884,
"learning_rate": 6.948608137044969e-06,
"loss": 0.1805708885192871,
"step": 650
},
{
"epoch": 0.2827309236947791,
"grad_norm": 0.41279770820447376,
"learning_rate": 7.055674518201286e-06,
"loss": 0.15682549476623536,
"step": 660
},
{
"epoch": 0.2870147255689424,
"grad_norm": 0.48113068018089383,
"learning_rate": 7.162740899357602e-06,
"loss": 0.17637710571289061,
"step": 670
},
{
"epoch": 0.29129852744310575,
"grad_norm": 0.45019312769869485,
"learning_rate": 7.26980728051392e-06,
"loss": 0.16801434755325317,
"step": 680
},
{
"epoch": 0.2955823293172691,
"grad_norm": 0.4323771559896418,
"learning_rate": 7.3768736616702365e-06,
"loss": 0.1738981246948242,
"step": 690
},
{
"epoch": 0.2998661311914324,
"grad_norm": 0.4445466528485117,
"learning_rate": 7.483940042826553e-06,
"loss": 0.17883800268173217,
"step": 700
},
{
"epoch": 0.30414993306559573,
"grad_norm": 0.4169235110055358,
"learning_rate": 7.59100642398287e-06,
"loss": 0.1757150650024414,
"step": 710
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.46124417838321063,
"learning_rate": 7.698072805139187e-06,
"loss": 0.16563992500305175,
"step": 720
},
{
"epoch": 0.31271753681392234,
"grad_norm": 0.4455214464656937,
"learning_rate": 7.805139186295504e-06,
"loss": 0.15891735553741454,
"step": 730
},
{
"epoch": 0.31700133868808567,
"grad_norm": 0.48435793526108334,
"learning_rate": 7.91220556745182e-06,
"loss": 0.16565344333648682,
"step": 740
},
{
"epoch": 0.321285140562249,
"grad_norm": 0.425099998591317,
"learning_rate": 8.019271948608137e-06,
"loss": 0.16711184978485108,
"step": 750
},
{
"epoch": 0.3255689424364123,
"grad_norm": 0.4137507644842352,
"learning_rate": 8.126338329764456e-06,
"loss": 0.17436256408691406,
"step": 760
},
{
"epoch": 0.32985274431057565,
"grad_norm": 0.5261718559693129,
"learning_rate": 8.23340471092077e-06,
"loss": 0.17338960170745848,
"step": 770
},
{
"epoch": 0.334136546184739,
"grad_norm": 0.44161850092055,
"learning_rate": 8.340471092077087e-06,
"loss": 0.15373395681381224,
"step": 780
},
{
"epoch": 0.33842034805890225,
"grad_norm": 0.42667362111196244,
"learning_rate": 8.447537473233406e-06,
"loss": 0.170109760761261,
"step": 790
},
{
"epoch": 0.3427041499330656,
"grad_norm": 0.4289414936466275,
"learning_rate": 8.554603854389722e-06,
"loss": 0.16255849599838257,
"step": 800
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.4462302049947027,
"learning_rate": 8.661670235546039e-06,
"loss": 0.1558121919631958,
"step": 810
},
{
"epoch": 0.35127175368139224,
"grad_norm": 0.39502484462695925,
"learning_rate": 8.768736616702356e-06,
"loss": 0.14783246517181398,
"step": 820
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.4872082027579418,
"learning_rate": 8.875802997858674e-06,
"loss": 0.162847638130188,
"step": 830
},
{
"epoch": 0.3598393574297189,
"grad_norm": 0.427450758730554,
"learning_rate": 8.98286937901499e-06,
"loss": 0.16352038383483886,
"step": 840
},
{
"epoch": 0.36412315930388217,
"grad_norm": 0.4528788471261664,
"learning_rate": 9.089935760171307e-06,
"loss": 0.16523147821426393,
"step": 850
},
{
"epoch": 0.3684069611780455,
"grad_norm": 0.37599550924220604,
"learning_rate": 9.197002141327624e-06,
"loss": 0.15126256942749022,
"step": 860
},
{
"epoch": 0.37269076305220883,
"grad_norm": 0.40592589779270666,
"learning_rate": 9.30406852248394e-06,
"loss": 0.15496088266372682,
"step": 870
},
{
"epoch": 0.37697456492637216,
"grad_norm": 0.39024589028386475,
"learning_rate": 9.411134903640257e-06,
"loss": 0.16612087488174437,
"step": 880
},
{
"epoch": 0.3812583668005355,
"grad_norm": 0.46291145028584035,
"learning_rate": 9.518201284796574e-06,
"loss": 0.16229329109191895,
"step": 890
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.4622950426469592,
"learning_rate": 9.625267665952892e-06,
"loss": 0.16289321184158326,
"step": 900
},
{
"epoch": 0.3898259705488621,
"grad_norm": 0.4404561037311073,
"learning_rate": 9.732334047109209e-06,
"loss": 0.16939005851745606,
"step": 910
},
{
"epoch": 0.3941097724230254,
"grad_norm": 0.4522735218377503,
"learning_rate": 9.839400428265526e-06,
"loss": 0.1664318323135376,
"step": 920
},
{
"epoch": 0.39839357429718875,
"grad_norm": 0.39873486371619626,
"learning_rate": 9.946466809421842e-06,
"loss": 0.17302082777023314,
"step": 930
},
{
"epoch": 0.4026773761713521,
"grad_norm": 0.4403816711799427,
"learning_rate": 9.99999127026893e-06,
"loss": 0.1635822534561157,
"step": 940
},
{
"epoch": 0.4069611780455154,
"grad_norm": 0.42736129250630583,
"learning_rate": 9.999921432603256e-06,
"loss": 0.16364479064941406,
"step": 950
},
{
"epoch": 0.41124497991967873,
"grad_norm": 0.48227838403112244,
"learning_rate": 9.999781758247374e-06,
"loss": 0.1692502498626709,
"step": 960
},
{
"epoch": 0.41552878179384206,
"grad_norm": 0.4325606288398738,
"learning_rate": 9.999572249152187e-06,
"loss": 0.1753953218460083,
"step": 970
},
{
"epoch": 0.41981258366800533,
"grad_norm": 0.4184812393572346,
"learning_rate": 9.999292908244031e-06,
"loss": 0.15361449718475342,
"step": 980
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.3536766183699388,
"learning_rate": 9.998943739424614e-06,
"loss": 0.16968698501586915,
"step": 990
},
{
"epoch": 0.428380187416332,
"grad_norm": 0.4421427075174403,
"learning_rate": 9.99852474757097e-06,
"loss": 0.17062946557998657,
"step": 1000
},
{
"epoch": 0.428380187416332,
"eval_loss": 0.1632310301065445,
"eval_runtime": 809.0798,
"eval_samples_per_second": 20.517,
"eval_steps_per_second": 5.129,
"step": 1000
},
{
"epoch": 0.4326639892904953,
"grad_norm": 0.4501749565827156,
"learning_rate": 9.998035938535395e-06,
"loss": 0.17221925258636475,
"step": 1010
},
{
"epoch": 0.43694779116465865,
"grad_norm": 0.35159158648894256,
"learning_rate": 9.997477319145354e-06,
"loss": 0.18630390167236327,
"step": 1020
},
{
"epoch": 0.441231593038822,
"grad_norm": 0.38517475012295227,
"learning_rate": 9.9968488972034e-06,
"loss": 0.17598154544830322,
"step": 1030
},
{
"epoch": 0.44551539491298525,
"grad_norm": 0.3612688847646603,
"learning_rate": 9.996150681487047e-06,
"loss": 0.1822005033493042,
"step": 1040
},
{
"epoch": 0.4497991967871486,
"grad_norm": 0.3577703505886406,
"learning_rate": 9.995382681748667e-06,
"loss": 0.16494649648666382,
"step": 1050
},
{
"epoch": 0.4540829986613119,
"grad_norm": 0.4006846696021192,
"learning_rate": 9.99454490871534e-06,
"loss": 0.1681265115737915,
"step": 1060
},
{
"epoch": 0.45836680053547524,
"grad_norm": 0.41680986168641504,
"learning_rate": 9.99363737408871e-06,
"loss": 0.15723063945770263,
"step": 1070
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.3955828911870276,
"learning_rate": 9.992660090544814e-06,
"loss": 0.17240710258483888,
"step": 1080
},
{
"epoch": 0.4669344042838019,
"grad_norm": 0.40819367835971887,
"learning_rate": 9.991613071733923e-06,
"loss": 0.1590951204299927,
"step": 1090
},
{
"epoch": 0.47121820615796517,
"grad_norm": 0.4961313776161533,
"learning_rate": 9.990496332280327e-06,
"loss": 0.16744821071624755,
"step": 1100
},
{
"epoch": 0.4755020080321285,
"grad_norm": 0.40118583702904315,
"learning_rate": 9.989309887782153e-06,
"loss": 0.16566444635391236,
"step": 1110
},
{
"epoch": 0.4797858099062918,
"grad_norm": 0.38801799234687073,
"learning_rate": 9.988053754811129e-06,
"loss": 0.16186387538909913,
"step": 1120
},
{
"epoch": 0.48406961178045516,
"grad_norm": 0.40747871131177194,
"learning_rate": 9.986727950912364e-06,
"loss": 0.162397563457489,
"step": 1130
},
{
"epoch": 0.4883534136546185,
"grad_norm": 0.3728066796444714,
"learning_rate": 9.985332494604107e-06,
"loss": 0.1676606059074402,
"step": 1140
},
{
"epoch": 0.4926372155287818,
"grad_norm": 0.40985672457156785,
"learning_rate": 9.983867405377467e-06,
"loss": 0.1700581431388855,
"step": 1150
},
{
"epoch": 0.4969210174029451,
"grad_norm": 0.4419642574041659,
"learning_rate": 9.982332703696165e-06,
"loss": 0.16604260206222535,
"step": 1160
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.4019173064441985,
"learning_rate": 9.980728410996235e-06,
"loss": 0.16702601909637452,
"step": 1170
},
{
"epoch": 0.5054886211512718,
"grad_norm": 0.37849315597886735,
"learning_rate": 9.979054549685726e-06,
"loss": 0.17048054933547974,
"step": 1180
},
{
"epoch": 0.5097724230254351,
"grad_norm": 0.41757098420175776,
"learning_rate": 9.977311143144392e-06,
"loss": 0.1623483419418335,
"step": 1190
},
{
"epoch": 0.5140562248995983,
"grad_norm": 0.39624453257545467,
"learning_rate": 9.97549821572337e-06,
"loss": 0.18060542345046998,
"step": 1200
},
{
"epoch": 0.5183400267737617,
"grad_norm": 0.3790478315082819,
"learning_rate": 9.97361579274482e-06,
"loss": 0.15714950561523439,
"step": 1210
},
{
"epoch": 0.522623828647925,
"grad_norm": 0.47455877319994494,
"learning_rate": 9.971663900501597e-06,
"loss": 0.1706780195236206,
"step": 1220
},
{
"epoch": 0.5269076305220883,
"grad_norm": 0.41051182237414957,
"learning_rate": 9.969642566256869e-06,
"loss": 0.17303004264831542,
"step": 1230
},
{
"epoch": 0.5311914323962517,
"grad_norm": 0.3664624770254722,
"learning_rate": 9.967551818243738e-06,
"loss": 0.16188311576843262,
"step": 1240
},
{
"epoch": 0.535475234270415,
"grad_norm": 0.37091548258017915,
"learning_rate": 9.965391685664844e-06,
"loss": 0.14944344758987427,
"step": 1250
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.36490771306848957,
"learning_rate": 9.963162198691967e-06,
"loss": 0.17565066814422609,
"step": 1260
},
{
"epoch": 0.5440428380187416,
"grad_norm": 0.36889359791667947,
"learning_rate": 9.960863388465592e-06,
"loss": 0.14779505729675294,
"step": 1270
},
{
"epoch": 0.548326639892905,
"grad_norm": 0.437009279584505,
"learning_rate": 9.958495287094485e-06,
"loss": 0.16427998542785643,
"step": 1280
},
{
"epoch": 0.5526104417670683,
"grad_norm": 0.39554167977080396,
"learning_rate": 9.956057927655236e-06,
"loss": 0.15541106462478638,
"step": 1290
},
{
"epoch": 0.5568942436412316,
"grad_norm": 0.45029869907045383,
"learning_rate": 9.953551344191806e-06,
"loss": 0.16692056655883789,
"step": 1300
},
{
"epoch": 0.561178045515395,
"grad_norm": 0.4199591876603144,
"learning_rate": 9.95097557171504e-06,
"loss": 0.14758901596069335,
"step": 1310
},
{
"epoch": 0.5654618473895582,
"grad_norm": 0.4075053894893693,
"learning_rate": 9.948330646202192e-06,
"loss": 0.14503839015960693,
"step": 1320
},
{
"epoch": 0.5697456492637215,
"grad_norm": 0.43881797258551375,
"learning_rate": 9.94561660459641e-06,
"loss": 0.16932222843170167,
"step": 1330
},
{
"epoch": 0.5740294511378848,
"grad_norm": 0.3663150123238361,
"learning_rate": 9.942833484806224e-06,
"loss": 0.1607386827468872,
"step": 1340
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.3407640340916305,
"learning_rate": 9.939981325705022e-06,
"loss": 0.1527782440185547,
"step": 1350
},
{
"epoch": 0.5825970548862115,
"grad_norm": 0.37405131401648734,
"learning_rate": 9.937060167130499e-06,
"loss": 0.171100389957428,
"step": 1360
},
{
"epoch": 0.5868808567603748,
"grad_norm": 0.37761828710703715,
"learning_rate": 9.934070049884108e-06,
"loss": 0.15846436023712157,
"step": 1370
},
{
"epoch": 0.5911646586345382,
"grad_norm": 0.3693798885089601,
"learning_rate": 9.931011015730481e-06,
"loss": 0.16067838668823242,
"step": 1380
},
{
"epoch": 0.5954484605087015,
"grad_norm": 0.3911912390175172,
"learning_rate": 9.927883107396855e-06,
"loss": 0.17477972507476808,
"step": 1390
},
{
"epoch": 0.5997322623828648,
"grad_norm": 0.3338272015147582,
"learning_rate": 9.924686368572467e-06,
"loss": 0.15092020034790038,
"step": 1400
},
{
"epoch": 0.6040160642570281,
"grad_norm": 0.4078089060822029,
"learning_rate": 9.921420843907954e-06,
"loss": 0.1569045066833496,
"step": 1410
},
{
"epoch": 0.6082998661311915,
"grad_norm": 0.4794604605869132,
"learning_rate": 9.918086579014719e-06,
"loss": 0.15916914939880372,
"step": 1420
},
{
"epoch": 0.6125836680053548,
"grad_norm": 0.3657509810107675,
"learning_rate": 9.914683620464296e-06,
"loss": 0.1613703727722168,
"step": 1430
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.3781037238260815,
"learning_rate": 9.911212015787705e-06,
"loss": 0.1711595058441162,
"step": 1440
},
{
"epoch": 0.6211512717536813,
"grad_norm": 0.4177181930728637,
"learning_rate": 9.907671813474787e-06,
"loss": 0.1607887864112854,
"step": 1450
},
{
"epoch": 0.6254350736278447,
"grad_norm": 0.43202846060475897,
"learning_rate": 9.904063062973518e-06,
"loss": 0.1596811056137085,
"step": 1460
},
{
"epoch": 0.629718875502008,
"grad_norm": 0.37967408761934135,
"learning_rate": 9.90038581468933e-06,
"loss": 0.15738776922225953,
"step": 1470
},
{
"epoch": 0.6340026773761713,
"grad_norm": 0.35431221490178816,
"learning_rate": 9.8966401199844e-06,
"loss": 0.16409718990325928,
"step": 1480
},
{
"epoch": 0.6382864792503347,
"grad_norm": 0.5424318304534815,
"learning_rate": 9.892826031176932e-06,
"loss": 0.1624216079711914,
"step": 1490
},
{
"epoch": 0.642570281124498,
"grad_norm": 0.4154998221845867,
"learning_rate": 9.888943601540435e-06,
"loss": 0.16612136363983154,
"step": 1500
},
{
"epoch": 0.642570281124498,
"eval_loss": 0.15596744418144226,
"eval_runtime": 5287.3746,
"eval_samples_per_second": 3.14,
"eval_steps_per_second": 0.785,
"step": 1500
},
{
"epoch": 0.6468540829986613,
"grad_norm": 0.31788237500758254,
"learning_rate": 9.884992885302964e-06,
"loss": 0.16352603435516358,
"step": 1510
},
{
"epoch": 0.6511378848728246,
"grad_norm": 0.3933875722388967,
"learning_rate": 9.880973937646376e-06,
"loss": 0.16239913702011108,
"step": 1520
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.3911043138186677,
"learning_rate": 9.876886814705557e-06,
"loss": 0.15573612451553345,
"step": 1530
},
{
"epoch": 0.6597054886211513,
"grad_norm": 0.4165342919082731,
"learning_rate": 9.87273157356763e-06,
"loss": 0.15565356016159057,
"step": 1540
},
{
"epoch": 0.6639892904953146,
"grad_norm": 0.4041990333202639,
"learning_rate": 9.868508272271162e-06,
"loss": 0.15832991600036622,
"step": 1550
},
{
"epoch": 0.668273092369478,
"grad_norm": 0.42249142494241126,
"learning_rate": 9.86421696980536e-06,
"loss": 0.15069495439529418,
"step": 1560
},
{
"epoch": 0.6725568942436412,
"grad_norm": 0.3703727697545347,
"learning_rate": 9.859857726109237e-06,
"loss": 0.1529747486114502,
"step": 1570
},
{
"epoch": 0.6768406961178045,
"grad_norm": 0.340178722202618,
"learning_rate": 9.85543060207078e-06,
"loss": 0.15539826154708863,
"step": 1580
},
{
"epoch": 0.6811244979919678,
"grad_norm": 0.3731049216784043,
"learning_rate": 9.850935659526097e-06,
"loss": 0.15447347164154052,
"step": 1590
},
{
"epoch": 0.6854082998661312,
"grad_norm": 0.4042949825278044,
"learning_rate": 9.84637296125856e-06,
"loss": 0.17724437713623048,
"step": 1600
},
{
"epoch": 0.6896921017402945,
"grad_norm": 0.3660587911460726,
"learning_rate": 9.841742570997916e-06,
"loss": 0.16080789566040038,
"step": 1610
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.41736075936721456,
"learning_rate": 9.837044553419411e-06,
"loss": 0.16406190395355225,
"step": 1620
},
{
"epoch": 0.6982597054886212,
"grad_norm": 0.36046398358975057,
"learning_rate": 9.832278974142872e-06,
"loss": 0.15605542659759522,
"step": 1630
},
{
"epoch": 0.7025435073627845,
"grad_norm": 0.3954125125143182,
"learning_rate": 9.827445899731805e-06,
"loss": 0.16570944786071778,
"step": 1640
},
{
"epoch": 0.7068273092369478,
"grad_norm": 0.40637254190631067,
"learning_rate": 9.822545397692453e-06,
"loss": 0.16883254051208496,
"step": 1650
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.41139029483344075,
"learning_rate": 9.81757753647286e-06,
"loss": 0.16364901065826415,
"step": 1660
},
{
"epoch": 0.7153949129852745,
"grad_norm": 0.4536938064672351,
"learning_rate": 9.812542385461912e-06,
"loss": 0.1675459623336792,
"step": 1670
},
{
"epoch": 0.7196787148594378,
"grad_norm": 0.3585184083438791,
"learning_rate": 9.807440014988375e-06,
"loss": 0.16231054067611694,
"step": 1680
},
{
"epoch": 0.7239625167336011,
"grad_norm": 0.3167942544933684,
"learning_rate": 9.802270496319896e-06,
"loss": 0.14959096908569336,
"step": 1690
},
{
"epoch": 0.7282463186077643,
"grad_norm": 0.3699602110663905,
"learning_rate": 9.79703390166203e-06,
"loss": 0.14526536464691162,
"step": 1700
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.3613102627272191,
"learning_rate": 9.791730304157212e-06,
"loss": 0.15053074359893798,
"step": 1710
},
{
"epoch": 0.736813922356091,
"grad_norm": 0.36315339995103474,
"learning_rate": 9.786359777883743e-06,
"loss": 0.1579727292060852,
"step": 1720
},
{
"epoch": 0.7410977242302543,
"grad_norm": 0.35352051713516114,
"learning_rate": 9.78092239785476e-06,
"loss": 0.16381702423095704,
"step": 1730
},
{
"epoch": 0.7453815261044177,
"grad_norm": 0.41420218762506095,
"learning_rate": 9.775418240017183e-06,
"loss": 0.16737335920333862,
"step": 1740
},
{
"epoch": 0.749665327978581,
"grad_norm": 0.32647328326287134,
"learning_rate": 9.769847381250647e-06,
"loss": 0.16527401208877562,
"step": 1750
},
{
"epoch": 0.7539491298527443,
"grad_norm": 0.35594572768523836,
"learning_rate": 9.764209899366451e-06,
"loss": 0.17207796573638917,
"step": 1760
},
{
"epoch": 0.7582329317269076,
"grad_norm": 0.3529492671194906,
"learning_rate": 9.75850587310644e-06,
"loss": 0.15534259080886842,
"step": 1770
},
{
"epoch": 0.762516733601071,
"grad_norm": 0.41518414925000824,
"learning_rate": 9.752735382141931e-06,
"loss": 0.17126250267028809,
"step": 1780
},
{
"epoch": 0.7668005354752343,
"grad_norm": 0.3416212552791915,
"learning_rate": 9.74689850707259e-06,
"loss": 0.17300653457641602,
"step": 1790
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.4506868511706448,
"learning_rate": 9.740995329425304e-06,
"loss": 0.16119366884231567,
"step": 1800
},
{
"epoch": 0.775368139223561,
"grad_norm": 0.4127032617250803,
"learning_rate": 9.735025931653047e-06,
"loss": 0.1660417675971985,
"step": 1810
},
{
"epoch": 0.7796519410977242,
"grad_norm": 0.3711426866374276,
"learning_rate": 9.728990397133725e-06,
"loss": 0.15557256937026978,
"step": 1820
},
{
"epoch": 0.7839357429718875,
"grad_norm": 0.4403684798533838,
"learning_rate": 9.722888810169015e-06,
"loss": 0.14504989385604858,
"step": 1830
},
{
"epoch": 0.7882195448460508,
"grad_norm": 0.43523370881285106,
"learning_rate": 9.716721255983184e-06,
"loss": 0.18080484867095947,
"step": 1840
},
{
"epoch": 0.7925033467202142,
"grad_norm": 0.39796977768343905,
"learning_rate": 9.710487820721897e-06,
"loss": 0.16169551610946656,
"step": 1850
},
{
"epoch": 0.7967871485943775,
"grad_norm": 0.3696365244924919,
"learning_rate": 9.704188591451021e-06,
"loss": 0.1710440158843994,
"step": 1860
},
{
"epoch": 0.8010709504685408,
"grad_norm": 0.3501836475183059,
"learning_rate": 9.697823656155404e-06,
"loss": 0.14459784030914308,
"step": 1870
},
{
"epoch": 0.8053547523427041,
"grad_norm": 0.4008873451421875,
"learning_rate": 9.691393103737646e-06,
"loss": 0.15653254985809326,
"step": 1880
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.3726778794979056,
"learning_rate": 9.684897024016856e-06,
"loss": 0.15802738666534424,
"step": 1890
},
{
"epoch": 0.8139223560910308,
"grad_norm": 0.3602622222902254,
"learning_rate": 9.678335507727406e-06,
"loss": 0.15577685832977295,
"step": 1900
},
{
"epoch": 0.8182061579651941,
"grad_norm": 0.36180831789633733,
"learning_rate": 9.671708646517644e-06,
"loss": 0.1501927375793457,
"step": 1910
},
{
"epoch": 0.8224899598393575,
"grad_norm": 0.4291946610668789,
"learning_rate": 9.665016532948643e-06,
"loss": 0.1524329662322998,
"step": 1920
},
{
"epoch": 0.8267737617135208,
"grad_norm": 0.36439021529215626,
"learning_rate": 9.658259260492879e-06,
"loss": 0.1579957962036133,
"step": 1930
},
{
"epoch": 0.8310575635876841,
"grad_norm": 0.36185634405902617,
"learning_rate": 9.651436923532947e-06,
"loss": 0.1648595690727234,
"step": 1940
},
{
"epoch": 0.8353413654618473,
"grad_norm": 0.3896804732201538,
"learning_rate": 9.644549617360227e-06,
"loss": 0.14703061580657958,
"step": 1950
},
{
"epoch": 0.8396251673360107,
"grad_norm": 0.3270830246578632,
"learning_rate": 9.63759743817357e-06,
"loss": 0.14643968343734742,
"step": 1960
},
{
"epoch": 0.843908969210174,
"grad_norm": 0.36683351098847644,
"learning_rate": 9.630580483077934e-06,
"loss": 0.15101373195648193,
"step": 1970
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.2637127315901447,
"learning_rate": 9.623498850083043e-06,
"loss": 0.1591057300567627,
"step": 1980
},
{
"epoch": 0.8524765729585007,
"grad_norm": 0.3681053572408943,
"learning_rate": 9.616352638102017e-06,
"loss": 0.1697171926498413,
"step": 1990
},
{
"epoch": 0.856760374832664,
"grad_norm": 0.40805430553066435,
"learning_rate": 9.609141946949978e-06,
"loss": 0.1591539740562439,
"step": 2000
},
{
"epoch": 0.856760374832664,
"eval_loss": 0.15128476917743683,
"eval_runtime": 813.5807,
"eval_samples_per_second": 20.404,
"eval_steps_per_second": 5.101,
"step": 2000
},
{
"epoch": 0.8610441767068273,
"grad_norm": 0.3914592710894462,
"learning_rate": 9.601866877342673e-06,
"loss": 0.15913846492767333,
"step": 2010
},
{
"epoch": 0.8653279785809906,
"grad_norm": 0.34232621179600625,
"learning_rate": 9.594527530895055e-06,
"loss": 0.1589035987854004,
"step": 2020
},
{
"epoch": 0.869611780455154,
"grad_norm": 0.35138032967412824,
"learning_rate": 9.587124010119866e-06,
"loss": 0.15038516521453857,
"step": 2030
},
{
"epoch": 0.8738955823293173,
"grad_norm": 0.38790494555500904,
"learning_rate": 9.579656418426208e-06,
"loss": 0.14970223903656005,
"step": 2040
},
{
"epoch": 0.8781793842034806,
"grad_norm": 0.453347749337455,
"learning_rate": 9.572124860118099e-06,
"loss": 0.15592522621154786,
"step": 2050
},
{
"epoch": 0.882463186077644,
"grad_norm": 0.36254040692639466,
"learning_rate": 9.564529440393013e-06,
"loss": 0.14756847620010377,
"step": 2060
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.28500745218910845,
"learning_rate": 9.55687026534041e-06,
"loss": 0.15284668207168578,
"step": 2070
},
{
"epoch": 0.8910307898259705,
"grad_norm": 0.33059182026983963,
"learning_rate": 9.54914744194026e-06,
"loss": 0.13931398391723632,
"step": 2080
},
{
"epoch": 0.8953145917001338,
"grad_norm": 0.3308533363527482,
"learning_rate": 9.541361078061543e-06,
"loss": 0.152490496635437,
"step": 2090
},
{
"epoch": 0.8995983935742972,
"grad_norm": 0.39410920160803786,
"learning_rate": 9.533511282460744e-06,
"loss": 0.15455267429351807,
"step": 2100
},
{
"epoch": 0.9038821954484605,
"grad_norm": 0.3510760250872994,
"learning_rate": 9.525598164780335e-06,
"loss": 0.15271444320678712,
"step": 2110
},
{
"epoch": 0.9081659973226238,
"grad_norm": 0.34879574406946134,
"learning_rate": 9.51762183554724e-06,
"loss": 0.145074462890625,
"step": 2120
},
{
"epoch": 0.9124497991967871,
"grad_norm": 0.4102513842794922,
"learning_rate": 9.5095824061713e-06,
"loss": 0.1671789288520813,
"step": 2130
},
{
"epoch": 0.9167336010709505,
"grad_norm": 0.32100215647635666,
"learning_rate": 9.501479988943705e-06,
"loss": 0.14845454692840576,
"step": 2140
},
{
"epoch": 0.9210174029451138,
"grad_norm": 0.35303111230486783,
"learning_rate": 9.493314697035433e-06,
"loss": 0.14766921997070312,
"step": 2150
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.3595530843531657,
"learning_rate": 9.48508664449567e-06,
"loss": 0.1577920436859131,
"step": 2160
},
{
"epoch": 0.9295850066934405,
"grad_norm": 0.3500784633268657,
"learning_rate": 9.476795946250213e-06,
"loss": 0.15419769287109375,
"step": 2170
},
{
"epoch": 0.9338688085676038,
"grad_norm": 0.5035759293187142,
"learning_rate": 9.468442718099866e-06,
"loss": 0.15254662036895753,
"step": 2180
},
{
"epoch": 0.9381526104417671,
"grad_norm": 0.3597669443798906,
"learning_rate": 9.460027076718825e-06,
"loss": 0.15965031385421752,
"step": 2190
},
{
"epoch": 0.9424364123159303,
"grad_norm": 0.32302117680971176,
"learning_rate": 9.451549139653043e-06,
"loss": 0.15642788410186767,
"step": 2200
},
{
"epoch": 0.9467202141900937,
"grad_norm": 0.37709479129796397,
"learning_rate": 9.443009025318595e-06,
"loss": 0.16215311288833617,
"step": 2210
},
{
"epoch": 0.951004016064257,
"grad_norm": 0.41863991954422164,
"learning_rate": 9.434406853000017e-06,
"loss": 0.16595734357833863,
"step": 2220
},
{
"epoch": 0.9552878179384203,
"grad_norm": 0.3895832137317719,
"learning_rate": 9.425742742848652e-06,
"loss": 0.1542948842048645,
"step": 2230
},
{
"epoch": 0.9595716198125837,
"grad_norm": 0.3383760951721925,
"learning_rate": 9.417016815880948e-06,
"loss": 0.1523042917251587,
"step": 2240
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.4388306567649398,
"learning_rate": 9.4082291939768e-06,
"loss": 0.13539564609527588,
"step": 2250
},
{
"epoch": 0.9681392235609103,
"grad_norm": 0.392487987824093,
"learning_rate": 9.399379999877816e-06,
"loss": 0.16397664546966553,
"step": 2260
},
{
"epoch": 0.9724230254350736,
"grad_norm": 0.37843944567360804,
"learning_rate": 9.390469357185626e-06,
"loss": 0.1599686861038208,
"step": 2270
},
{
"epoch": 0.976706827309237,
"grad_norm": 0.36495911845917256,
"learning_rate": 9.381497390360146e-06,
"loss": 0.16280412673950195,
"step": 2280
},
{
"epoch": 0.9809906291834003,
"grad_norm": 0.3098293192725145,
"learning_rate": 9.372464224717836e-06,
"loss": 0.16709411144256592,
"step": 2290
},
{
"epoch": 0.9852744310575636,
"grad_norm": 0.36503501082057177,
"learning_rate": 9.36336998642996e-06,
"loss": 0.14577250480651854,
"step": 2300
},
{
"epoch": 0.989558232931727,
"grad_norm": 0.369748777319339,
"learning_rate": 9.354214802520813e-06,
"loss": 0.15008455514907837,
"step": 2310
},
{
"epoch": 0.9938420348058902,
"grad_norm": 0.38954595915895235,
"learning_rate": 9.344998800865949e-06,
"loss": 0.16494543552398683,
"step": 2320
},
{
"epoch": 0.9981258366800535,
"grad_norm": 0.3263933545214738,
"learning_rate": 9.335722110190409e-06,
"loss": 0.1547703266143799,
"step": 2330
},
{
"epoch": 1.0021419009370816,
"grad_norm": 0.3240736359093112,
"learning_rate": 9.326384860066894e-06,
"loss": 0.1678773880004883,
"step": 2340
},
{
"epoch": 1.0064257028112449,
"grad_norm": 0.4121951074794008,
"learning_rate": 9.316987180913993e-06,
"loss": 0.13320955038070678,
"step": 2350
},
{
"epoch": 1.0107095046854082,
"grad_norm": 0.35703547426799104,
"learning_rate": 9.30752920399432e-06,
"loss": 0.12546956539154053,
"step": 2360
},
{
"epoch": 1.0149933065595715,
"grad_norm": 0.40623072991807463,
"learning_rate": 9.298011061412718e-06,
"loss": 0.13189778327941895,
"step": 2370
},
{
"epoch": 1.0192771084337349,
"grad_norm": 0.44222195266756315,
"learning_rate": 9.288432886114388e-06,
"loss": 0.12098613977432252,
"step": 2380
},
{
"epoch": 1.0235609103078982,
"grad_norm": 0.367684966832025,
"learning_rate": 9.278794811883047e-06,
"loss": 0.11746659278869628,
"step": 2390
},
{
"epoch": 1.0278447121820615,
"grad_norm": 0.33284392755056474,
"learning_rate": 9.26909697333905e-06,
"loss": 0.12567752599716187,
"step": 2400
},
{
"epoch": 1.0321285140562249,
"grad_norm": 0.3455297587313404,
"learning_rate": 9.259339505937514e-06,
"loss": 0.12083170413970948,
"step": 2410
},
{
"epoch": 1.0364123159303882,
"grad_norm": 0.3389705981902334,
"learning_rate": 9.249522545966427e-06,
"loss": 0.12095551490783692,
"step": 2420
},
{
"epoch": 1.0406961178045515,
"grad_norm": 0.40601150502203404,
"learning_rate": 9.239646230544741e-06,
"loss": 0.14402755498886108,
"step": 2430
},
{
"epoch": 1.0449799196787148,
"grad_norm": 0.3559777449007349,
"learning_rate": 9.229710697620462e-06,
"loss": 0.1495804786682129,
"step": 2440
},
{
"epoch": 1.0492637215528782,
"grad_norm": 0.36896684434500243,
"learning_rate": 9.219716085968716e-06,
"loss": 0.12875673770904542,
"step": 2450
},
{
"epoch": 1.0535475234270415,
"grad_norm": 0.39146972255890167,
"learning_rate": 9.209662535189814e-06,
"loss": 0.13340590000152588,
"step": 2460
},
{
"epoch": 1.0578313253012048,
"grad_norm": 0.40291541972442413,
"learning_rate": 9.199550185707309e-06,
"loss": 0.1337528109550476,
"step": 2470
},
{
"epoch": 1.0621151271753682,
"grad_norm": 0.37956437532491505,
"learning_rate": 9.189379178766022e-06,
"loss": 0.12576285600662232,
"step": 2480
},
{
"epoch": 1.0663989290495315,
"grad_norm": 0.41298961387679495,
"learning_rate": 9.179149656430077e-06,
"loss": 0.1333579182624817,
"step": 2490
},
{
"epoch": 1.0706827309236948,
"grad_norm": 0.36210102393181387,
"learning_rate": 9.168861761580916e-06,
"loss": 0.13212097883224488,
"step": 2500
},
{
"epoch": 1.0706827309236948,
"eval_loss": 0.15047596395015717,
"eval_runtime": 816.7373,
"eval_samples_per_second": 20.325,
"eval_steps_per_second": 5.081,
"step": 2500
},
{
"epoch": 1.0749665327978581,
"grad_norm": 0.3726254379576281,
"learning_rate": 9.158515637915303e-06,
"loss": 0.12463078498840333,
"step": 2510
},
{
"epoch": 1.0792503346720215,
"grad_norm": 0.37712160221949104,
"learning_rate": 9.148111429943316e-06,
"loss": 0.12076478004455567,
"step": 2520
},
{
"epoch": 1.0835341365461848,
"grad_norm": 0.34263415579260603,
"learning_rate": 9.137649282986326e-06,
"loss": 0.11901497840881348,
"step": 2530
},
{
"epoch": 1.0878179384203481,
"grad_norm": 0.43199587554265134,
"learning_rate": 9.127129343174974e-06,
"loss": 0.1473910093307495,
"step": 2540
},
{
"epoch": 1.0921017402945115,
"grad_norm": 0.3888436375726906,
"learning_rate": 9.116551757447124e-06,
"loss": 0.12526917457580566,
"step": 2550
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.3720699240255782,
"learning_rate": 9.105916673545811e-06,
"loss": 0.12781134843826295,
"step": 2560
},
{
"epoch": 1.1006693440428381,
"grad_norm": 0.3528738970780735,
"learning_rate": 9.095224240017187e-06,
"loss": 0.12412866353988647,
"step": 2570
},
{
"epoch": 1.1049531459170012,
"grad_norm": 0.4631292939251323,
"learning_rate": 9.084474606208426e-06,
"loss": 0.11998128890991211,
"step": 2580
},
{
"epoch": 1.1092369477911648,
"grad_norm": 0.3596397164936987,
"learning_rate": 9.073667922265659e-06,
"loss": 0.13821544647216796,
"step": 2590
},
{
"epoch": 1.1135207496653279,
"grad_norm": 0.37491061752134996,
"learning_rate": 9.062804339131865e-06,
"loss": 0.12905315160751343,
"step": 2600
},
{
"epoch": 1.1178045515394912,
"grad_norm": 0.33236296782840824,
"learning_rate": 9.051884008544769e-06,
"loss": 0.11152592897415162,
"step": 2610
},
{
"epoch": 1.1220883534136545,
"grad_norm": 0.41099647558351027,
"learning_rate": 9.040907083034714e-06,
"loss": 0.120727276802063,
"step": 2620
},
{
"epoch": 1.1263721552878179,
"grad_norm": 0.3859893211528485,
"learning_rate": 9.02987371592254e-06,
"loss": 0.14195597171783447,
"step": 2630
},
{
"epoch": 1.1306559571619812,
"grad_norm": 0.36839784644083184,
"learning_rate": 9.018784061317434e-06,
"loss": 0.12041090726852417,
"step": 2640
},
{
"epoch": 1.1349397590361445,
"grad_norm": 0.41679177088273905,
"learning_rate": 9.007638274114787e-06,
"loss": 0.13752386569976807,
"step": 2650
},
{
"epoch": 1.1392235609103079,
"grad_norm": 0.38785605712752647,
"learning_rate": 8.996436509994022e-06,
"loss": 0.12111247777938842,
"step": 2660
},
{
"epoch": 1.1435073627844712,
"grad_norm": 0.37367256419499406,
"learning_rate": 8.985178925416424e-06,
"loss": 0.13275750875473022,
"step": 2670
},
{
"epoch": 1.1477911646586345,
"grad_norm": 0.3674681161529881,
"learning_rate": 8.973865677622954e-06,
"loss": 0.13491373062133788,
"step": 2680
},
{
"epoch": 1.1520749665327978,
"grad_norm": 0.34447615774959234,
"learning_rate": 8.962496924632051e-06,
"loss": 0.13558318614959716,
"step": 2690
},
{
"epoch": 1.1563587684069612,
"grad_norm": 0.3564170987558211,
"learning_rate": 8.951072825237426e-06,
"loss": 0.1193579912185669,
"step": 2700
},
{
"epoch": 1.1606425702811245,
"grad_norm": 0.35305355840674923,
"learning_rate": 8.939593539005842e-06,
"loss": 0.13529754877090455,
"step": 2710
},
{
"epoch": 1.1649263721552878,
"grad_norm": 0.35352573268560833,
"learning_rate": 8.928059226274894e-06,
"loss": 0.12423286437988282,
"step": 2720
},
{
"epoch": 1.1692101740294512,
"grad_norm": 0.36689834147420736,
"learning_rate": 8.916470048150756e-06,
"loss": 0.13518364429473878,
"step": 2730
},
{
"epoch": 1.1734939759036145,
"grad_norm": 0.38162988673475534,
"learning_rate": 8.90482616650594e-06,
"loss": 0.13908401727676392,
"step": 2740
},
{
"epoch": 1.1777777777777778,
"grad_norm": 0.40336299975505086,
"learning_rate": 8.893127743977036e-06,
"loss": 0.13255722522735597,
"step": 2750
},
{
"epoch": 1.1820615796519411,
"grad_norm": 0.3324644606155607,
"learning_rate": 8.881374943962426e-06,
"loss": 0.1357291579246521,
"step": 2760
},
{
"epoch": 1.1863453815261045,
"grad_norm": 0.3994255780678427,
"learning_rate": 8.869567930620027e-06,
"loss": 0.12042539119720459,
"step": 2770
},
{
"epoch": 1.1906291834002678,
"grad_norm": 0.35581614764806313,
"learning_rate": 8.857706868864977e-06,
"loss": 0.13282716274261475,
"step": 2780
},
{
"epoch": 1.1949129852744311,
"grad_norm": 0.3106663113756059,
"learning_rate": 8.845791924367334e-06,
"loss": 0.12471635341644287,
"step": 2790
},
{
"epoch": 1.1991967871485945,
"grad_norm": 0.4256051698707425,
"learning_rate": 8.833823263549775e-06,
"loss": 0.11954612731933593,
"step": 2800
},
{
"epoch": 1.2034805890227578,
"grad_norm": 0.41689423223672023,
"learning_rate": 8.821801053585254e-06,
"loss": 0.12010161876678467,
"step": 2810
},
{
"epoch": 1.2077643908969211,
"grad_norm": 0.3817725350186758,
"learning_rate": 8.809725462394684e-06,
"loss": 0.11917848587036133,
"step": 2820
},
{
"epoch": 1.2120481927710842,
"grad_norm": 0.35927209564755835,
"learning_rate": 8.797596658644581e-06,
"loss": 0.12020325660705566,
"step": 2830
},
{
"epoch": 1.2163319946452478,
"grad_norm": 0.4238739504322855,
"learning_rate": 8.785414811744703e-06,
"loss": 0.13289868831634521,
"step": 2840
},
{
"epoch": 1.2206157965194109,
"grad_norm": 0.39167399451224444,
"learning_rate": 8.773180091845701e-06,
"loss": 0.12138681411743164,
"step": 2850
},
{
"epoch": 1.2248995983935742,
"grad_norm": 0.3947355797116567,
"learning_rate": 8.760892669836729e-06,
"loss": 0.14103634357452394,
"step": 2860
},
{
"epoch": 1.2291834002677375,
"grad_norm": 0.39740732729868383,
"learning_rate": 8.74855271734306e-06,
"loss": 0.13904783725738526,
"step": 2870
},
{
"epoch": 1.2334672021419009,
"grad_norm": 0.41730678044784,
"learning_rate": 8.736160406723688e-06,
"loss": 0.12443190813064575,
"step": 2880
},
{
"epoch": 1.2377510040160642,
"grad_norm": 0.3765448851707534,
"learning_rate": 8.723715911068931e-06,
"loss": 0.1321355938911438,
"step": 2890
},
{
"epoch": 1.2420348058902275,
"grad_norm": 0.3634462539369135,
"learning_rate": 8.71121940419799e-06,
"loss": 0.14078364372253419,
"step": 2900
},
{
"epoch": 1.2463186077643909,
"grad_norm": 0.4094668843551737,
"learning_rate": 8.698671060656549e-06,
"loss": 0.13006095886230468,
"step": 2910
},
{
"epoch": 1.2506024096385542,
"grad_norm": 0.37644871257316387,
"learning_rate": 8.686071055714318e-06,
"loss": 0.12324719429016114,
"step": 2920
},
{
"epoch": 1.2548862115127175,
"grad_norm": 0.4032403895979568,
"learning_rate": 8.673419565362587e-06,
"loss": 0.14000382423400878,
"step": 2930
},
{
"epoch": 1.2591700133868808,
"grad_norm": 0.4398800669174728,
"learning_rate": 8.660716766311778e-06,
"loss": 0.11818475723266601,
"step": 2940
},
{
"epoch": 1.2634538152610442,
"grad_norm": 0.3615959969952865,
"learning_rate": 8.647962835988968e-06,
"loss": 0.1338767886161804,
"step": 2950
},
{
"epoch": 1.2677376171352075,
"grad_norm": 0.31737564808536584,
"learning_rate": 8.635157952535411e-06,
"loss": 0.1270219087600708,
"step": 2960
},
{
"epoch": 1.2720214190093708,
"grad_norm": 0.3582723999510731,
"learning_rate": 8.622302294804052e-06,
"loss": 0.12293977737426758,
"step": 2970
},
{
"epoch": 1.2763052208835342,
"grad_norm": 0.35972611924117803,
"learning_rate": 8.609396042357033e-06,
"loss": 0.12699666023254394,
"step": 2980
},
{
"epoch": 1.2805890227576975,
"grad_norm": 0.4025263144287077,
"learning_rate": 8.596439375463174e-06,
"loss": 0.13745148181915284,
"step": 2990
},
{
"epoch": 1.2848728246318608,
"grad_norm": 0.3227213368538748,
"learning_rate": 8.583432475095468e-06,
"loss": 0.11785190105438233,
"step": 3000
},
{
"epoch": 1.2848728246318608,
"eval_loss": 0.1485673487186432,
"eval_runtime": 812.0391,
"eval_samples_per_second": 20.442,
"eval_steps_per_second": 5.111,
"step": 3000
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.35470826251565785,
"learning_rate": 8.570375522928543e-06,
"loss": 0.12998595237731933,
"step": 3010
},
{
"epoch": 1.2934404283801875,
"grad_norm": 0.3775363159731956,
"learning_rate": 8.55726870133613e-06,
"loss": 0.11246494054794312,
"step": 3020
},
{
"epoch": 1.2977242302543508,
"grad_norm": 0.36015262199345144,
"learning_rate": 8.544112193388513e-06,
"loss": 0.1255005955696106,
"step": 3030
},
{
"epoch": 1.3020080321285141,
"grad_norm": 0.32095363304273905,
"learning_rate": 8.530906182849971e-06,
"loss": 0.14123222827911378,
"step": 3040
},
{
"epoch": 1.3062918340026775,
"grad_norm": 0.35963777187492285,
"learning_rate": 8.51765085417622e-06,
"loss": 0.12764023542404174,
"step": 3050
},
{
"epoch": 1.3105756358768406,
"grad_norm": 0.3545121600646447,
"learning_rate": 8.504346392511824e-06,
"loss": 0.12473820447921753,
"step": 3060
},
{
"epoch": 1.3148594377510041,
"grad_norm": 0.3752599966671012,
"learning_rate": 8.490992983687617e-06,
"loss": 0.12995026111602784,
"step": 3070
},
{
"epoch": 1.3191432396251672,
"grad_norm": 0.3326424253698993,
"learning_rate": 8.477590814218104e-06,
"loss": 0.13189772367477418,
"step": 3080
},
{
"epoch": 1.3234270414993308,
"grad_norm": 0.33944521013309487,
"learning_rate": 8.464140071298858e-06,
"loss": 0.12935359477996827,
"step": 3090
},
{
"epoch": 1.3277108433734939,
"grad_norm": 0.37010591984301416,
"learning_rate": 8.450640942803904e-06,
"loss": 0.13249437808990477,
"step": 3100
},
{
"epoch": 1.3319946452476574,
"grad_norm": 0.3615034420980659,
"learning_rate": 8.437093617283099e-06,
"loss": 0.12562718391418456,
"step": 3110
},
{
"epoch": 1.3362784471218205,
"grad_norm": 0.3917767190914898,
"learning_rate": 8.423498283959487e-06,
"loss": 0.13038911819458007,
"step": 3120
},
{
"epoch": 1.3405622489959839,
"grad_norm": 0.38109850464604067,
"learning_rate": 8.40985513272667e-06,
"loss": 0.13978877067565917,
"step": 3130
},
{
"epoch": 1.3448460508701472,
"grad_norm": 0.4445890595042772,
"learning_rate": 8.39616435414615e-06,
"loss": 0.13834033012390137,
"step": 3140
},
{
"epoch": 1.3491298527443105,
"grad_norm": 0.31874973345433283,
"learning_rate": 8.38242613944466e-06,
"loss": 0.1258203625679016,
"step": 3150
},
{
"epoch": 1.3534136546184738,
"grad_norm": 0.33127205404029225,
"learning_rate": 8.368640680511507e-06,
"loss": 0.12356986999511718,
"step": 3160
},
{
"epoch": 1.3576974564926372,
"grad_norm": 0.416559211705474,
"learning_rate": 8.35480816989588e-06,
"loss": 0.11982156038284301,
"step": 3170
},
{
"epoch": 1.3619812583668005,
"grad_norm": 0.3660453384090912,
"learning_rate": 8.34092880080417e-06,
"loss": 0.11788184642791748,
"step": 3180
},
{
"epoch": 1.3662650602409638,
"grad_norm": 0.34339125686903177,
"learning_rate": 8.32700276709726e-06,
"loss": 0.13102638721466064,
"step": 3190
},
{
"epoch": 1.3705488621151272,
"grad_norm": 0.35489193494077403,
"learning_rate": 8.313030263287825e-06,
"loss": 0.1122696876525879,
"step": 3200
},
{
"epoch": 1.3748326639892905,
"grad_norm": 0.3746174683003833,
"learning_rate": 8.299011484537621e-06,
"loss": 0.1276139497756958,
"step": 3210
},
{
"epoch": 1.3791164658634538,
"grad_norm": 0.7123969948931433,
"learning_rate": 8.284946626654743e-06,
"loss": 0.1328984022140503,
"step": 3220
},
{
"epoch": 1.3834002677376172,
"grad_norm": 0.3822847406441411,
"learning_rate": 8.270835886090901e-06,
"loss": 0.11024882793426513,
"step": 3230
},
{
"epoch": 1.3876840696117805,
"grad_norm": 0.300720958006405,
"learning_rate": 8.256679459938681e-06,
"loss": 0.11192436218261718,
"step": 3240
},
{
"epoch": 1.3919678714859438,
"grad_norm": 0.36933913785412426,
"learning_rate": 8.242477545928775e-06,
"loss": 0.1279488682746887,
"step": 3250
},
{
"epoch": 1.3962516733601071,
"grad_norm": 0.3733629104677544,
"learning_rate": 8.228230342427237e-06,
"loss": 0.12411469221115112,
"step": 3260
},
{
"epoch": 1.4005354752342705,
"grad_norm": 0.37527660608807045,
"learning_rate": 8.213938048432697e-06,
"loss": 0.12071568965911865,
"step": 3270
},
{
"epoch": 1.4048192771084338,
"grad_norm": 0.32477552350056993,
"learning_rate": 8.199600863573599e-06,
"loss": 0.10580611228942871,
"step": 3280
},
{
"epoch": 1.4091030789825971,
"grad_norm": 0.4717886686044222,
"learning_rate": 8.185218988105392e-06,
"loss": 0.14088404178619385,
"step": 3290
},
{
"epoch": 1.4133868808567605,
"grad_norm": 0.3538053722734618,
"learning_rate": 8.170792622907751e-06,
"loss": 0.14626517295837402,
"step": 3300
},
{
"epoch": 1.4176706827309236,
"grad_norm": 0.3513040684652719,
"learning_rate": 8.156321969481762e-06,
"loss": 0.11440718173980713,
"step": 3310
},
{
"epoch": 1.421954484605087,
"grad_norm": 0.3591684736408224,
"learning_rate": 8.14180722994711e-06,
"loss": 0.13487778902053832,
"step": 3320
},
{
"epoch": 1.4262382864792502,
"grad_norm": 0.3658104845595591,
"learning_rate": 8.127248607039254e-06,
"loss": 0.11574537754058838,
"step": 3330
},
{
"epoch": 1.4305220883534138,
"grad_norm": 0.3197726257189657,
"learning_rate": 8.112646304106593e-06,
"loss": 0.12187765836715699,
"step": 3340
},
{
"epoch": 1.4348058902275769,
"grad_norm": 0.3846940368635854,
"learning_rate": 8.09800052510764e-06,
"loss": 0.11478321552276612,
"step": 3350
},
{
"epoch": 1.4390896921017404,
"grad_norm": 0.4176339612370988,
"learning_rate": 8.08331147460815e-06,
"loss": 0.1217038869857788,
"step": 3360
},
{
"epoch": 1.4433734939759035,
"grad_norm": 0.37885782340374674,
"learning_rate": 8.068579357778284e-06,
"loss": 0.12176965475082398,
"step": 3370
},
{
"epoch": 1.4476572958500669,
"grad_norm": 0.41287039590285307,
"learning_rate": 8.053804380389728e-06,
"loss": 0.12061818838119506,
"step": 3380
},
{
"epoch": 1.4519410977242302,
"grad_norm": 0.3494750223733423,
"learning_rate": 8.038986748812832e-06,
"loss": 0.131140398979187,
"step": 3390
},
{
"epoch": 1.4562248995983935,
"grad_norm": 0.3952195504175884,
"learning_rate": 8.024126670013716e-06,
"loss": 0.11915416717529297,
"step": 3400
},
{
"epoch": 1.4605087014725568,
"grad_norm": 0.3737316963804442,
"learning_rate": 8.009224351551386e-06,
"loss": 0.11794298887252808,
"step": 3410
},
{
"epoch": 1.4647925033467202,
"grad_norm": 0.3446468035702987,
"learning_rate": 7.99428000157483e-06,
"loss": 0.1277950167655945,
"step": 3420
},
{
"epoch": 1.4690763052208835,
"grad_norm": 0.37272918562452995,
"learning_rate": 7.979293828820119e-06,
"loss": 0.14721099138259888,
"step": 3430
},
{
"epoch": 1.4733601070950468,
"grad_norm": 0.33085072922732706,
"learning_rate": 7.96426604260748e-06,
"loss": 0.11756453514099122,
"step": 3440
},
{
"epoch": 1.4776439089692102,
"grad_norm": 0.320178401178284,
"learning_rate": 7.949196852838383e-06,
"loss": 0.1269507050514221,
"step": 3450
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.3580459421820677,
"learning_rate": 7.934086469992605e-06,
"loss": 0.1412634253501892,
"step": 3460
},
{
"epoch": 1.4862115127175368,
"grad_norm": 0.36913989344261383,
"learning_rate": 7.918935105125283e-06,
"loss": 0.15048539638519287,
"step": 3470
},
{
"epoch": 1.4904953145917001,
"grad_norm": 0.38425139309308326,
"learning_rate": 7.903742969863982e-06,
"loss": 0.13397784233093263,
"step": 3480
},
{
"epoch": 1.4947791164658635,
"grad_norm": 0.3627531984044689,
"learning_rate": 7.88851027640572e-06,
"loss": 0.11737120151519775,
"step": 3490
},
{
"epoch": 1.4990629183400268,
"grad_norm": 0.30678086877528343,
"learning_rate": 7.873237237514024e-06,
"loss": 0.1271947741508484,
"step": 3500
},
{
"epoch": 1.4990629183400268,
"eval_loss": 0.14634032547473907,
"eval_runtime": 11508.5453,
"eval_samples_per_second": 1.442,
"eval_steps_per_second": 0.361,
"step": 3500
},
{
"epoch": 1.5033467202141901,
"grad_norm": 0.3989430501599751,
"learning_rate": 7.857924066515941e-06,
"loss": 0.1253154993057251,
"step": 3510
},
{
"epoch": 1.5076305220883535,
"grad_norm": 0.3712393247049027,
"learning_rate": 7.842570977299067e-06,
"loss": 0.13159399032592772,
"step": 3520
},
{
"epoch": 1.5119143239625168,
"grad_norm": 0.39043985321189406,
"learning_rate": 7.827178184308559e-06,
"loss": 0.12818803787231445,
"step": 3530
},
{
"epoch": 1.51619812583668,
"grad_norm": 0.3783719248133356,
"learning_rate": 7.81174590254414e-06,
"loss": 0.12482264041900634,
"step": 3540
},
{
"epoch": 1.5204819277108435,
"grad_norm": 0.33627341086836304,
"learning_rate": 7.796274347557094e-06,
"loss": 0.1259792685508728,
"step": 3550
},
{
"epoch": 1.5247657295850066,
"grad_norm": 0.3307003231873695,
"learning_rate": 7.780763735447252e-06,
"loss": 0.11816374063491822,
"step": 3560
},
{
"epoch": 1.52904953145917,
"grad_norm": 0.41275730039950287,
"learning_rate": 7.765214282859981e-06,
"loss": 0.12664893865585328,
"step": 3570
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.3033638102712773,
"learning_rate": 7.749626206983157e-06,
"loss": 0.1236607551574707,
"step": 3580
},
{
"epoch": 1.5376171352074968,
"grad_norm": 0.3554700928985279,
"learning_rate": 7.733999725544126e-06,
"loss": 0.12761454582214354,
"step": 3590
},
{
"epoch": 1.5419009370816599,
"grad_norm": 0.35291611398156203,
"learning_rate": 7.718335056806665e-06,
"loss": 0.1287233352661133,
"step": 3600
},
{
"epoch": 1.5461847389558234,
"grad_norm": 0.3567309323303257,
"learning_rate": 7.702632419567937e-06,
"loss": 0.14273253679275513,
"step": 3610
},
{
"epoch": 1.5504685408299865,
"grad_norm": 0.3185688429925057,
"learning_rate": 7.68689203315543e-06,
"loss": 0.12159850597381591,
"step": 3620
},
{
"epoch": 1.55475234270415,
"grad_norm": 0.36346114221079345,
"learning_rate": 7.671114117423896e-06,
"loss": 0.12236592769622803,
"step": 3630
},
{
"epoch": 1.5590361445783132,
"grad_norm": 0.28944951523091206,
"learning_rate": 7.655298892752281e-06,
"loss": 0.1200286865234375,
"step": 3640
},
{
"epoch": 1.5633199464524767,
"grad_norm": 0.3320579302602756,
"learning_rate": 7.639446580040647e-06,
"loss": 0.13653804063796998,
"step": 3650
},
{
"epoch": 1.5676037483266398,
"grad_norm": 0.3367309725103469,
"learning_rate": 7.623557400707081e-06,
"loss": 0.12761712074279785,
"step": 3660
},
{
"epoch": 1.5718875502008032,
"grad_norm": 0.36883406849675304,
"learning_rate": 7.607631576684611e-06,
"loss": 0.12503886222839355,
"step": 3670
},
{
"epoch": 1.5761713520749665,
"grad_norm": 0.35021731907363346,
"learning_rate": 7.5916693304181e-06,
"loss": 0.11194202899932862,
"step": 3680
},
{
"epoch": 1.5804551539491298,
"grad_norm": 0.3466816731323651,
"learning_rate": 7.575670884861142e-06,
"loss": 0.11533315181732177,
"step": 3690
},
{
"epoch": 1.5847389558232932,
"grad_norm": 0.3695468619685566,
"learning_rate": 7.559636463472941e-06,
"loss": 0.12558252811431886,
"step": 3700
},
{
"epoch": 1.5890227576974565,
"grad_norm": 0.33317140225660996,
"learning_rate": 7.543566290215205e-06,
"loss": 0.11223011016845703,
"step": 3710
},
{
"epoch": 1.5933065595716198,
"grad_norm": 0.3802726049715593,
"learning_rate": 7.5274605895490014e-06,
"loss": 0.11428353786468506,
"step": 3720
},
{
"epoch": 1.5975903614457831,
"grad_norm": 0.3502543345535625,
"learning_rate": 7.511319586431631e-06,
"loss": 0.12747797966003419,
"step": 3730
},
{
"epoch": 1.6018741633199465,
"grad_norm": 0.48600388474175416,
"learning_rate": 7.495143506313484e-06,
"loss": 0.12503063678741455,
"step": 3740
},
{
"epoch": 1.6061579651941098,
"grad_norm": 0.402765639804346,
"learning_rate": 7.478932575134887e-06,
"loss": 0.1338959217071533,
"step": 3750
},
{
"epoch": 1.6104417670682731,
"grad_norm": 0.3921866593643898,
"learning_rate": 7.462687019322957e-06,
"loss": 0.11669353246688843,
"step": 3760
},
{
"epoch": 1.6147255689424365,
"grad_norm": 0.33652188082752615,
"learning_rate": 7.446407065788428e-06,
"loss": 0.12007842063903809,
"step": 3770
},
{
"epoch": 1.6190093708165998,
"grad_norm": 0.4118194067707435,
"learning_rate": 7.4300929419224866e-06,
"loss": 0.12169758081436158,
"step": 3780
},
{
"epoch": 1.623293172690763,
"grad_norm": 0.36857356877163894,
"learning_rate": 7.413744875593597e-06,
"loss": 0.12564884424209594,
"step": 3790
},
{
"epoch": 1.6275769745649264,
"grad_norm": 0.350693413841003,
"learning_rate": 7.397363095144318e-06,
"loss": 0.12418256998062134,
"step": 3800
},
{
"epoch": 1.6318607764390896,
"grad_norm": 0.3712677998496879,
"learning_rate": 7.380947829388108e-06,
"loss": 0.12151600122451782,
"step": 3810
},
{
"epoch": 1.636144578313253,
"grad_norm": 0.4183039288576934,
"learning_rate": 7.364499307606136e-06,
"loss": 0.11588020324707031,
"step": 3820
},
{
"epoch": 1.6404283801874162,
"grad_norm": 0.39624838378484395,
"learning_rate": 7.348017759544075e-06,
"loss": 0.12545753717422486,
"step": 3830
},
{
"epoch": 1.6447121820615798,
"grad_norm": 0.335543915765519,
"learning_rate": 7.331503415408899e-06,
"loss": 0.11865659952163696,
"step": 3840
},
{
"epoch": 1.6489959839357429,
"grad_norm": 0.30699590589486353,
"learning_rate": 7.3149565058656545e-06,
"loss": 0.11257133483886719,
"step": 3850
},
{
"epoch": 1.6532797858099064,
"grad_norm": 0.4211864176178027,
"learning_rate": 7.298377262034258e-06,
"loss": 0.12412948608398437,
"step": 3860
},
{
"epoch": 1.6575635876840695,
"grad_norm": 0.5480668142726313,
"learning_rate": 7.281765915486247e-06,
"loss": 0.11110868453979492,
"step": 3870
},
{
"epoch": 1.661847389558233,
"grad_norm": 0.38707346036306395,
"learning_rate": 7.265122698241562e-06,
"loss": 0.1353888154029846,
"step": 3880
},
{
"epoch": 1.6661311914323962,
"grad_norm": 0.3496172916459521,
"learning_rate": 7.248447842765298e-06,
"loss": 0.12294532060623169,
"step": 3890
},
{
"epoch": 1.6704149933065597,
"grad_norm": 0.3178905513170639,
"learning_rate": 7.231741581964455e-06,
"loss": 0.11635351181030273,
"step": 3900
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.37730744546548595,
"learning_rate": 7.2150041491846965e-06,
"loss": 0.13707247972488404,
"step": 3910
},
{
"epoch": 1.6789825970548862,
"grad_norm": 0.4258774014748926,
"learning_rate": 7.198235778207072e-06,
"loss": 0.11108559370040894,
"step": 3920
},
{
"epoch": 1.6832663989290495,
"grad_norm": 0.3269689561831232,
"learning_rate": 7.181436703244773e-06,
"loss": 0.13123619556427002,
"step": 3930
},
{
"epoch": 1.6875502008032128,
"grad_norm": 0.3328432989440898,
"learning_rate": 7.1646071589398406e-06,
"loss": 0.11167018413543701,
"step": 3940
},
{
"epoch": 1.6918340026773762,
"grad_norm": 0.3844316794696797,
"learning_rate": 7.147747380359905e-06,
"loss": 0.11800698041915894,
"step": 3950
},
{
"epoch": 1.6961178045515395,
"grad_norm": 0.33099179444642823,
"learning_rate": 7.130857602994894e-06,
"loss": 0.13457157611846923,
"step": 3960
},
{
"epoch": 1.7004016064257028,
"grad_norm": 0.3147285218500962,
"learning_rate": 7.113938062753742e-06,
"loss": 0.13172318935394287,
"step": 3970
},
{
"epoch": 1.7046854082998661,
"grad_norm": 0.3592833207498237,
"learning_rate": 7.0969889959611045e-06,
"loss": 0.1196314811706543,
"step": 3980
},
{
"epoch": 1.7089692101740295,
"grad_norm": 0.29811223409083043,
"learning_rate": 7.080010639354045e-06,
"loss": 0.11256670951843262,
"step": 3990
},
{
"epoch": 1.7132530120481928,
"grad_norm": 0.4270868815948092,
"learning_rate": 7.063003230078734e-06,
"loss": 0.12309803962707519,
"step": 4000
},
{
"epoch": 1.7132530120481928,
"eval_loss": 0.144321471452713,
"eval_runtime": 817.2721,
"eval_samples_per_second": 20.311,
"eval_steps_per_second": 5.078,
"step": 4000
},
{
"epoch": 1.7175368139223561,
"grad_norm": 0.3733167797076492,
"learning_rate": 7.045967005687141e-06,
"loss": 0.11690073013305664,
"step": 4010
},
{
"epoch": 1.7218206157965195,
"grad_norm": 0.32612931848843507,
"learning_rate": 7.028902204133711e-06,
"loss": 0.1235615611076355,
"step": 4020
},
{
"epoch": 1.7261044176706828,
"grad_norm": 0.3537546537362819,
"learning_rate": 7.011809063772038e-06,
"loss": 0.1282111883163452,
"step": 4030
},
{
"epoch": 1.730388219544846,
"grad_norm": 0.41855495134878623,
"learning_rate": 6.994687823351547e-06,
"loss": 0.13276000022888185,
"step": 4040
},
{
"epoch": 1.7346720214190094,
"grad_norm": 0.3640723677373699,
"learning_rate": 6.9775387220141465e-06,
"loss": 0.12338956594467163,
"step": 4050
},
{
"epoch": 1.7389558232931726,
"grad_norm": 0.348482478201222,
"learning_rate": 6.960361999290894e-06,
"loss": 0.1142328143119812,
"step": 4060
},
{
"epoch": 1.743239625167336,
"grad_norm": 0.41291989661610773,
"learning_rate": 6.943157895098656e-06,
"loss": 0.12496788501739502,
"step": 4070
},
{
"epoch": 1.7475234270414992,
"grad_norm": 0.31746340210362767,
"learning_rate": 6.925926649736745e-06,
"loss": 0.11045465469360352,
"step": 4080
},
{
"epoch": 1.7518072289156628,
"grad_norm": 0.32179304285895316,
"learning_rate": 6.9086685038835725e-06,
"loss": 0.13367241621017456,
"step": 4090
},
{
"epoch": 1.7560910307898259,
"grad_norm": 0.3467502021616522,
"learning_rate": 6.891383698593283e-06,
"loss": 0.11450705528259278,
"step": 4100
},
{
"epoch": 1.7603748326639894,
"grad_norm": 0.37824785627911034,
"learning_rate": 6.874072475292388e-06,
"loss": 0.11085845232009887,
"step": 4110
},
{
"epoch": 1.7646586345381525,
"grad_norm": 0.33242640245264393,
"learning_rate": 6.856735075776395e-06,
"loss": 0.12101356983184815,
"step": 4120
},
{
"epoch": 1.768942436412316,
"grad_norm": 0.3295693613929198,
"learning_rate": 6.839371742206432e-06,
"loss": 0.11143279075622559,
"step": 4130
},
{
"epoch": 1.7732262382864792,
"grad_norm": 0.41043258389255455,
"learning_rate": 6.821982717105855e-06,
"loss": 0.11657199859619141,
"step": 4140
},
{
"epoch": 1.7775100401606427,
"grad_norm": 0.3336241961556357,
"learning_rate": 6.804568243356876e-06,
"loss": 0.12107970714569091,
"step": 4150
},
{
"epoch": 1.7817938420348058,
"grad_norm": 0.404764797519025,
"learning_rate": 6.7871285641971576e-06,
"loss": 0.12142288684844971,
"step": 4160
},
{
"epoch": 1.7860776439089692,
"grad_norm": 0.35528280014790076,
"learning_rate": 6.769663923216419e-06,
"loss": 0.14445422887802123,
"step": 4170
},
{
"epoch": 1.7903614457831325,
"grad_norm": 0.36424811344112645,
"learning_rate": 6.75217456435304e-06,
"loss": 0.11748452186584472,
"step": 4180
},
{
"epoch": 1.7946452476572958,
"grad_norm": 0.3580564279402089,
"learning_rate": 6.734660731890645e-06,
"loss": 0.11877243518829346,
"step": 4190
},
{
"epoch": 1.7989290495314592,
"grad_norm": 0.3945693311810663,
"learning_rate": 6.717122670454701e-06,
"loss": 0.12274388074874878,
"step": 4200
},
{
"epoch": 1.8032128514056225,
"grad_norm": 0.3274495553953029,
"learning_rate": 6.699560625009085e-06,
"loss": 0.11418673992156983,
"step": 4210
},
{
"epoch": 1.8074966532797858,
"grad_norm": 0.43405948322435506,
"learning_rate": 6.6819748408526775e-06,
"loss": 0.11989142894744872,
"step": 4220
},
{
"epoch": 1.8117804551539491,
"grad_norm": 0.34302792735595455,
"learning_rate": 6.6643655636159325e-06,
"loss": 0.10752333402633667,
"step": 4230
},
{
"epoch": 1.8160642570281125,
"grad_norm": 0.38396837748701773,
"learning_rate": 6.646733039257442e-06,
"loss": 0.12758421897888184,
"step": 4240
},
{
"epoch": 1.8203480589022758,
"grad_norm": 0.35943197748111966,
"learning_rate": 6.629077514060501e-06,
"loss": 0.11687214374542236,
"step": 4250
},
{
"epoch": 1.8246318607764391,
"grad_norm": 0.33900665564961463,
"learning_rate": 6.611399234629679e-06,
"loss": 0.1235961675643921,
"step": 4260
},
{
"epoch": 1.8289156626506025,
"grad_norm": 0.36539098779168305,
"learning_rate": 6.593698447887357e-06,
"loss": 0.12241628170013427,
"step": 4270
},
{
"epoch": 1.8331994645247658,
"grad_norm": 0.38361329899883734,
"learning_rate": 6.575975401070291e-06,
"loss": 0.12448443174362182,
"step": 4280
},
{
"epoch": 1.837483266398929,
"grad_norm": 0.4014122394041882,
"learning_rate": 6.5582303417261605e-06,
"loss": 0.1193004846572876,
"step": 4290
},
{
"epoch": 1.8417670682730924,
"grad_norm": 0.3678903848404944,
"learning_rate": 6.540463517710099e-06,
"loss": 0.1212453842163086,
"step": 4300
},
{
"epoch": 1.8460508701472556,
"grad_norm": 0.3251163301086072,
"learning_rate": 6.5226751771812476e-06,
"loss": 0.12798908948898316,
"step": 4310
},
{
"epoch": 1.850334672021419,
"grad_norm": 0.3415099254328554,
"learning_rate": 6.5048655685992705e-06,
"loss": 0.13018690347671508,
"step": 4320
},
{
"epoch": 1.8546184738955822,
"grad_norm": 0.3905905047279772,
"learning_rate": 6.487034940720902e-06,
"loss": 0.12057719230651856,
"step": 4330
},
{
"epoch": 1.8589022757697458,
"grad_norm": 0.3646836032160996,
"learning_rate": 6.469183542596464e-06,
"loss": 0.13052282333374024,
"step": 4340
},
{
"epoch": 1.8631860776439089,
"grad_norm": 0.33435104754269,
"learning_rate": 6.451311623566386e-06,
"loss": 0.11543186902999877,
"step": 4350
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.3562601136655919,
"learning_rate": 6.433419433257726e-06,
"loss": 0.12250864505767822,
"step": 4360
},
{
"epoch": 1.8717536813922355,
"grad_norm": 0.3226539154934918,
"learning_rate": 6.415507221580678e-06,
"loss": 0.12082786560058593,
"step": 4370
},
{
"epoch": 1.876037483266399,
"grad_norm": 0.3635681601652211,
"learning_rate": 6.397575238725091e-06,
"loss": 0.12619302272796631,
"step": 4380
},
{
"epoch": 1.8803212851405622,
"grad_norm": 0.3607934399845053,
"learning_rate": 6.379623735156968e-06,
"loss": 0.12855522632598876,
"step": 4390
},
{
"epoch": 1.8846050870147257,
"grad_norm": 0.33220984445822355,
"learning_rate": 6.361652961614966e-06,
"loss": 0.11576036214828492,
"step": 4400
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.3957629269071009,
"learning_rate": 6.343663169106897e-06,
"loss": 0.12123892307281495,
"step": 4410
},
{
"epoch": 1.8931726907630522,
"grad_norm": 0.38648528140436955,
"learning_rate": 6.325654608906228e-06,
"loss": 0.13391902446746826,
"step": 4420
},
{
"epoch": 1.8974564926372155,
"grad_norm": 0.322831029116286,
"learning_rate": 6.307627532548554e-06,
"loss": 0.11682146787643433,
"step": 4430
},
{
"epoch": 1.9017402945113788,
"grad_norm": 0.34943896220332243,
"learning_rate": 6.289582191828102e-06,
"loss": 0.10885384082794189,
"step": 4440
},
{
"epoch": 1.9060240963855422,
"grad_norm": 0.3885943387224764,
"learning_rate": 6.2715188387942085e-06,
"loss": 0.11223304271697998,
"step": 4450
},
{
"epoch": 1.9103078982597055,
"grad_norm": 0.336737608622642,
"learning_rate": 6.253437725747795e-06,
"loss": 0.11982736587524415,
"step": 4460
},
{
"epoch": 1.9145917001338688,
"grad_norm": 0.3692087496930761,
"learning_rate": 6.235339105237849e-06,
"loss": 0.12470091581344604,
"step": 4470
},
{
"epoch": 1.9188755020080321,
"grad_norm": 0.39542747620483304,
"learning_rate": 6.217223230057891e-06,
"loss": 0.1260706901550293,
"step": 4480
},
{
"epoch": 1.9231593038821955,
"grad_norm": 0.36423974792020714,
"learning_rate": 6.199090353242452e-06,
"loss": 0.11962894201278687,
"step": 4490
},
{
"epoch": 1.9274431057563588,
"grad_norm": 0.36962654876484385,
"learning_rate": 6.18094072806353e-06,
"loss": 0.11819722652435302,
"step": 4500
},
{
"epoch": 1.9274431057563588,
"eval_loss": 0.14271628856658936,
"eval_runtime": 1019.4039,
"eval_samples_per_second": 16.284,
"eval_steps_per_second": 4.071,
"step": 4500
}
],
"logging_steps": 10,
"max_steps": 9340,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 481757134651392.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}