reranker-large-v1 / checkpoint-20000 /trainer_state.json
peter831's picture
Upload folder using huggingface_hub
669b002 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9181102994761033,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002295275748690258,
"grad_norm": 2.0098984241485596,
"learning_rate": 1.4856966372720683e-06,
"loss": 2.0874,
"step": 50
},
{
"epoch": 0.004590551497380516,
"grad_norm": 7.3466691970825195,
"learning_rate": 1.7773258340037704e-06,
"loss": 2.0765,
"step": 100
},
{
"epoch": 0.006885827246070775,
"grad_norm": 64.36688232421875,
"learning_rate": 1.9369412440149804e-06,
"loss": 1.9359,
"step": 150
},
{
"epoch": 0.009181102994761032,
"grad_norm": 30.22006607055664,
"learning_rate": 2.0539675208065135e-06,
"loss": 1.4178,
"step": 200
},
{
"epoch": 0.01147637874345129,
"grad_norm": 26.016658782958984,
"learning_rate": 2.1422693353722617e-06,
"loss": 1.0832,
"step": 250
},
{
"epoch": 0.01377165449214155,
"grad_norm": 20.14085578918457,
"learning_rate": 2.2142328937343315e-06,
"loss": 0.9394,
"step": 300
},
{
"epoch": 0.01606693024083181,
"grad_norm": 37.867881774902344,
"learning_rate": 2.2761167318484284e-06,
"loss": 0.8149,
"step": 350
},
{
"epoch": 0.018362205989522064,
"grad_norm": 29.71302604675293,
"learning_rate": 2.329517989724819e-06,
"loss": 0.7927,
"step": 400
},
{
"epoch": 0.020657481738212323,
"grad_norm": 23.509214401245117,
"learning_rate": 2.3764842614654632e-06,
"loss": 0.7256,
"step": 450
},
{
"epoch": 0.02295275748690258,
"grad_norm": 25.001569747924805,
"learning_rate": 2.418400955810514e-06,
"loss": 0.6645,
"step": 500
},
{
"epoch": 0.02524803323559284,
"grad_norm": 17.22730827331543,
"learning_rate": 2.4562492431761594e-06,
"loss": 0.6266,
"step": 550
},
{
"epoch": 0.0275433089842831,
"grad_norm": 27.04228973388672,
"learning_rate": 2.490749474698331e-06,
"loss": 0.6193,
"step": 600
},
{
"epoch": 0.029838584732973357,
"grad_norm": 29.859375,
"learning_rate": 2.5224461471278787e-06,
"loss": 0.6038,
"step": 650
},
{
"epoch": 0.03213386048166362,
"grad_norm": 23.304059982299805,
"learning_rate": 2.5517608604098523e-06,
"loss": 0.5889,
"step": 700
},
{
"epoch": 0.034429136230353874,
"grad_norm": 34.74980163574219,
"learning_rate": 2.5790267286571216e-06,
"loss": 0.5936,
"step": 750
},
{
"epoch": 0.03672441197904413,
"grad_norm": 25.56021499633789,
"learning_rate": 2.6045115312613743e-06,
"loss": 0.5748,
"step": 800
},
{
"epoch": 0.03901968772773439,
"grad_norm": 12.490568161010742,
"learning_rate": 2.6284337551622617e-06,
"loss": 0.558,
"step": 850
},
{
"epoch": 0.041314963476424646,
"grad_norm": 17.215822219848633,
"learning_rate": 2.6509739968294195e-06,
"loss": 0.5535,
"step": 900
},
{
"epoch": 0.04361023922511491,
"grad_norm": 21.178848266601562,
"learning_rate": 2.6722832469155484e-06,
"loss": 0.5271,
"step": 950
},
{
"epoch": 0.04590551497380516,
"grad_norm": 14.22229290008545,
"learning_rate": 2.6924890275135524e-06,
"loss": 0.5421,
"step": 1000
},
{
"epoch": 0.048200790722495425,
"grad_norm": 77.64906311035156,
"learning_rate": 2.711700017156508e-06,
"loss": 0.5329,
"step": 1050
},
{
"epoch": 0.05049606647118568,
"grad_norm": 25.775665283203125,
"learning_rate": 2.73000958979208e-06,
"loss": 0.5169,
"step": 1100
},
{
"epoch": 0.05279134221987594,
"grad_norm": 16.82468032836914,
"learning_rate": 2.7471563517988443e-06,
"loss": 0.4989,
"step": 1150
},
{
"epoch": 0.0550866179685662,
"grad_norm": 20.88793182373047,
"learning_rate": 2.7639095036115487e-06,
"loss": 0.5076,
"step": 1200
},
{
"epoch": 0.05738189371725646,
"grad_norm": 15.96468448638916,
"learning_rate": 2.779973026465106e-06,
"loss": 0.4996,
"step": 1250
},
{
"epoch": 0.059677169465946714,
"grad_norm": 29.40988540649414,
"learning_rate": 2.7954014586854842e-06,
"loss": 0.523,
"step": 1300
},
{
"epoch": 0.061972445214636976,
"grad_norm": 19.3817081451416,
"learning_rate": 2.810243114309793e-06,
"loss": 0.4857,
"step": 1350
},
{
"epoch": 0.06426772096332724,
"grad_norm": 30.938968658447266,
"learning_rate": 2.824540995622056e-06,
"loss": 0.4948,
"step": 1400
},
{
"epoch": 0.06656299671201749,
"grad_norm": 26.087671279907227,
"learning_rate": 2.83833354435266e-06,
"loss": 0.4907,
"step": 1450
},
{
"epoch": 0.06885827246070775,
"grad_norm": 16.257780075073242,
"learning_rate": 2.8516552646048146e-06,
"loss": 0.4792,
"step": 1500
},
{
"epoch": 0.07115354820939801,
"grad_norm": 20.534271240234375,
"learning_rate": 2.8645372429265973e-06,
"loss": 0.4889,
"step": 1550
},
{
"epoch": 0.07344882395808826,
"grad_norm": 13.257925987243652,
"learning_rate": 2.877007585258154e-06,
"loss": 0.4745,
"step": 1600
},
{
"epoch": 0.07574409970677852,
"grad_norm": 14.961082458496094,
"learning_rate": 2.889091786204755e-06,
"loss": 0.4922,
"step": 1650
},
{
"epoch": 0.07803937545546878,
"grad_norm": 15.788146018981934,
"learning_rate": 2.90081304283633e-06,
"loss": 0.4781,
"step": 1700
},
{
"epoch": 0.08033465120415904,
"grad_norm": 15.50967025756836,
"learning_rate": 2.912192522722599e-06,
"loss": 0.4875,
"step": 1750
},
{
"epoch": 0.08262992695284929,
"grad_norm": 13.156537055969238,
"learning_rate": 2.9232495939864444e-06,
"loss": 0.4703,
"step": 1800
},
{
"epoch": 0.08492520270153955,
"grad_norm": 32.00178909301758,
"learning_rate": 2.9340020236565454e-06,
"loss": 0.4702,
"step": 1850
},
{
"epoch": 0.08722047845022982,
"grad_norm": 19.58460807800293,
"learning_rate": 2.9444661494209185e-06,
"loss": 0.4938,
"step": 1900
},
{
"epoch": 0.08951575419892008,
"grad_norm": 40.492218017578125,
"learning_rate": 2.954657028950126e-06,
"loss": 0.4523,
"step": 1950
},
{
"epoch": 0.09181102994761033,
"grad_norm": 84.74713897705078,
"learning_rate": 2.9645885702160444e-06,
"loss": 0.4712,
"step": 2000
},
{
"epoch": 0.09410630569630059,
"grad_norm": 26.168777465820312,
"learning_rate": 2.9742736456367274e-06,
"loss": 0.4397,
"step": 2050
},
{
"epoch": 0.09640158144499085,
"grad_norm": 22.866653442382812,
"learning_rate": 2.9837241923979606e-06,
"loss": 0.4404,
"step": 2100
},
{
"epoch": 0.09869685719368111,
"grad_norm": 20.17527198791504,
"learning_rate": 2.992951300912944e-06,
"loss": 0.4592,
"step": 2150
},
{
"epoch": 0.10099213294237136,
"grad_norm": 24.15152359008789,
"learning_rate": 2.998469700061212e-06,
"loss": 0.4293,
"step": 2200
},
{
"epoch": 0.10328740869106162,
"grad_norm": 35.48339080810547,
"learning_rate": 2.990818200367272e-06,
"loss": 0.447,
"step": 2250
},
{
"epoch": 0.10558268443975188,
"grad_norm": 19.331541061401367,
"learning_rate": 2.9831667006733323e-06,
"loss": 0.4335,
"step": 2300
},
{
"epoch": 0.10787796018844215,
"grad_norm": 16.164037704467773,
"learning_rate": 2.975515200979392e-06,
"loss": 0.4428,
"step": 2350
},
{
"epoch": 0.1101732359371324,
"grad_norm": 14.07393741607666,
"learning_rate": 2.967863701285452e-06,
"loss": 0.4337,
"step": 2400
},
{
"epoch": 0.11246851168582266,
"grad_norm": 18.08838653564453,
"learning_rate": 2.960212201591512e-06,
"loss": 0.4366,
"step": 2450
},
{
"epoch": 0.11476378743451292,
"grad_norm": 20.341272354125977,
"learning_rate": 2.952560701897572e-06,
"loss": 0.4624,
"step": 2500
},
{
"epoch": 0.11705906318320317,
"grad_norm": 17.554807662963867,
"learning_rate": 2.944909202203632e-06,
"loss": 0.4517,
"step": 2550
},
{
"epoch": 0.11935433893189343,
"grad_norm": 12.912613868713379,
"learning_rate": 2.9372577025096922e-06,
"loss": 0.4191,
"step": 2600
},
{
"epoch": 0.12164961468058369,
"grad_norm": 11.591246604919434,
"learning_rate": 2.9296062028157517e-06,
"loss": 0.4577,
"step": 2650
},
{
"epoch": 0.12394489042927395,
"grad_norm": 12.689074516296387,
"learning_rate": 2.921954703121812e-06,
"loss": 0.4291,
"step": 2700
},
{
"epoch": 0.1262401661779642,
"grad_norm": 12.471826553344727,
"learning_rate": 2.914303203427872e-06,
"loss": 0.4528,
"step": 2750
},
{
"epoch": 0.12853544192665448,
"grad_norm": 12.39678955078125,
"learning_rate": 2.906804733727811e-06,
"loss": 0.4328,
"step": 2800
},
{
"epoch": 0.1308307176753447,
"grad_norm": 15.050296783447266,
"learning_rate": 2.8991532340338704e-06,
"loss": 0.4258,
"step": 2850
},
{
"epoch": 0.13312599342403497,
"grad_norm": 18.09667205810547,
"learning_rate": 2.8915017343399307e-06,
"loss": 0.4313,
"step": 2900
},
{
"epoch": 0.13542126917272523,
"grad_norm": 11.64979362487793,
"learning_rate": 2.8838502346459907e-06,
"loss": 0.4306,
"step": 2950
},
{
"epoch": 0.1377165449214155,
"grad_norm": 13.034558296203613,
"learning_rate": 2.8761987349520506e-06,
"loss": 0.4337,
"step": 3000
},
{
"epoch": 0.14001182067010576,
"grad_norm": 18.609317779541016,
"learning_rate": 2.868547235258111e-06,
"loss": 0.4308,
"step": 3050
},
{
"epoch": 0.14230709641879602,
"grad_norm": 12.441306114196777,
"learning_rate": 2.8608957355641704e-06,
"loss": 0.4267,
"step": 3100
},
{
"epoch": 0.14460237216748628,
"grad_norm": 20.22740364074707,
"learning_rate": 2.8532442358702308e-06,
"loss": 0.4229,
"step": 3150
},
{
"epoch": 0.14689764791617652,
"grad_norm": 14.57115650177002,
"learning_rate": 2.8455927361762907e-06,
"loss": 0.4196,
"step": 3200
},
{
"epoch": 0.14919292366486678,
"grad_norm": 15.352262496948242,
"learning_rate": 2.8379412364823506e-06,
"loss": 0.4367,
"step": 3250
},
{
"epoch": 0.15148819941355704,
"grad_norm": 36.657840728759766,
"learning_rate": 2.8302897367884105e-06,
"loss": 0.4529,
"step": 3300
},
{
"epoch": 0.1537834751622473,
"grad_norm": 14.538572311401367,
"learning_rate": 2.822638237094471e-06,
"loss": 0.4098,
"step": 3350
},
{
"epoch": 0.15607875091093756,
"grad_norm": 14.28237533569336,
"learning_rate": 2.8149867374005303e-06,
"loss": 0.4128,
"step": 3400
},
{
"epoch": 0.15837402665962783,
"grad_norm": 13.038817405700684,
"learning_rate": 2.8073352377065907e-06,
"loss": 0.4396,
"step": 3450
},
{
"epoch": 0.1606693024083181,
"grad_norm": 12.356093406677246,
"learning_rate": 2.799836768006529e-06,
"loss": 0.4445,
"step": 3500
},
{
"epoch": 0.16296457815700835,
"grad_norm": 11.58459186553955,
"learning_rate": 2.7921852683125895e-06,
"loss": 0.4383,
"step": 3550
},
{
"epoch": 0.16525985390569858,
"grad_norm": 13.039240837097168,
"learning_rate": 2.784533768618649e-06,
"loss": 0.4109,
"step": 3600
},
{
"epoch": 0.16755512965438885,
"grad_norm": 34.44761276245117,
"learning_rate": 2.7768822689247094e-06,
"loss": 0.427,
"step": 3650
},
{
"epoch": 0.1698504054030791,
"grad_norm": 17.821250915527344,
"learning_rate": 2.7692307692307693e-06,
"loss": 0.4197,
"step": 3700
},
{
"epoch": 0.17214568115176937,
"grad_norm": 21.884822845458984,
"learning_rate": 2.761579269536829e-06,
"loss": 0.4071,
"step": 3750
},
{
"epoch": 0.17444095690045963,
"grad_norm": 18.419849395751953,
"learning_rate": 2.7539277698428895e-06,
"loss": 0.4531,
"step": 3800
},
{
"epoch": 0.1767362326491499,
"grad_norm": 13.88978099822998,
"learning_rate": 2.746276270148949e-06,
"loss": 0.4231,
"step": 3850
},
{
"epoch": 0.17903150839784016,
"grad_norm": 18.00520133972168,
"learning_rate": 2.7386247704550094e-06,
"loss": 0.4307,
"step": 3900
},
{
"epoch": 0.18132678414653042,
"grad_norm": 11.312788963317871,
"learning_rate": 2.7309732707610693e-06,
"loss": 0.4236,
"step": 3950
},
{
"epoch": 0.18362205989522065,
"grad_norm": 18.427597045898438,
"learning_rate": 2.723321771067129e-06,
"loss": 0.4188,
"step": 4000
},
{
"epoch": 0.1859173356439109,
"grad_norm": 15.946565628051758,
"learning_rate": 2.715670271373189e-06,
"loss": 0.4294,
"step": 4050
},
{
"epoch": 0.18821261139260118,
"grad_norm": 10.010071754455566,
"learning_rate": 2.7080187716792494e-06,
"loss": 0.4069,
"step": 4100
},
{
"epoch": 0.19050788714129144,
"grad_norm": 14.598346710205078,
"learning_rate": 2.700367271985309e-06,
"loss": 0.4198,
"step": 4150
},
{
"epoch": 0.1928031628899817,
"grad_norm": 14.54223918914795,
"learning_rate": 2.6927157722913693e-06,
"loss": 0.4051,
"step": 4200
},
{
"epoch": 0.19509843863867196,
"grad_norm": 22.899091720581055,
"learning_rate": 2.685064272597429e-06,
"loss": 0.418,
"step": 4250
},
{
"epoch": 0.19739371438736222,
"grad_norm": 11.836400032043457,
"learning_rate": 2.677412772903489e-06,
"loss": 0.4164,
"step": 4300
},
{
"epoch": 0.19968899013605246,
"grad_norm": 10.089373588562012,
"learning_rate": 2.669761273209549e-06,
"loss": 0.4088,
"step": 4350
},
{
"epoch": 0.20198426588474272,
"grad_norm": 13.1040620803833,
"learning_rate": 2.662109773515609e-06,
"loss": 0.4104,
"step": 4400
},
{
"epoch": 0.20427954163343298,
"grad_norm": 19.278318405151367,
"learning_rate": 2.6544582738216693e-06,
"loss": 0.4109,
"step": 4450
},
{
"epoch": 0.20657481738212324,
"grad_norm": 22.169904708862305,
"learning_rate": 2.646806774127729e-06,
"loss": 0.4067,
"step": 4500
},
{
"epoch": 0.2088700931308135,
"grad_norm": 39.922847747802734,
"learning_rate": 2.639155274433789e-06,
"loss": 0.4035,
"step": 4550
},
{
"epoch": 0.21116536887950377,
"grad_norm": 13.038665771484375,
"learning_rate": 2.631503774739849e-06,
"loss": 0.4208,
"step": 4600
},
{
"epoch": 0.21346064462819403,
"grad_norm": 8.684808731079102,
"learning_rate": 2.6238522750459094e-06,
"loss": 0.4071,
"step": 4650
},
{
"epoch": 0.2157559203768843,
"grad_norm": 19.35813331604004,
"learning_rate": 2.616200775351969e-06,
"loss": 0.415,
"step": 4700
},
{
"epoch": 0.21805119612557453,
"grad_norm": 13.930493354797363,
"learning_rate": 2.608549275658029e-06,
"loss": 0.4126,
"step": 4750
},
{
"epoch": 0.2203464718742648,
"grad_norm": 11.336668968200684,
"learning_rate": 2.600897775964089e-06,
"loss": 0.4006,
"step": 4800
},
{
"epoch": 0.22264174762295505,
"grad_norm": 11.625572204589844,
"learning_rate": 2.593246276270149e-06,
"loss": 0.4016,
"step": 4850
},
{
"epoch": 0.2249370233716453,
"grad_norm": 13.675556182861328,
"learning_rate": 2.585594776576209e-06,
"loss": 0.4002,
"step": 4900
},
{
"epoch": 0.22723229912033557,
"grad_norm": 10.017621994018555,
"learning_rate": 2.577943276882269e-06,
"loss": 0.3831,
"step": 4950
},
{
"epoch": 0.22952757486902584,
"grad_norm": 9.974747657775879,
"learning_rate": 2.570444807182208e-06,
"loss": 0.4271,
"step": 5000
},
{
"epoch": 0.2318228506177161,
"grad_norm": 17.395328521728516,
"learning_rate": 2.5627933074882677e-06,
"loss": 0.4088,
"step": 5050
},
{
"epoch": 0.23411812636640633,
"grad_norm": 11.723817825317383,
"learning_rate": 2.5551418077943276e-06,
"loss": 0.3957,
"step": 5100
},
{
"epoch": 0.2364134021150966,
"grad_norm": 11.623498916625977,
"learning_rate": 2.5474903081003875e-06,
"loss": 0.4107,
"step": 5150
},
{
"epoch": 0.23870867786378686,
"grad_norm": 18.517606735229492,
"learning_rate": 2.539838808406448e-06,
"loss": 0.4228,
"step": 5200
},
{
"epoch": 0.24100395361247712,
"grad_norm": 16.989261627197266,
"learning_rate": 2.532187308712508e-06,
"loss": 0.4057,
"step": 5250
},
{
"epoch": 0.24329922936116738,
"grad_norm": 14.439526557922363,
"learning_rate": 2.5245358090185677e-06,
"loss": 0.4152,
"step": 5300
},
{
"epoch": 0.24559450510985764,
"grad_norm": 17.96590805053711,
"learning_rate": 2.5168843093246276e-06,
"loss": 0.412,
"step": 5350
},
{
"epoch": 0.2478897808585479,
"grad_norm": 18.715192794799805,
"learning_rate": 2.509232809630688e-06,
"loss": 0.3926,
"step": 5400
},
{
"epoch": 0.25018505660723817,
"grad_norm": 9.67862606048584,
"learning_rate": 2.5015813099367475e-06,
"loss": 0.3897,
"step": 5450
},
{
"epoch": 0.2524803323559284,
"grad_norm": 10.955434799194336,
"learning_rate": 2.493929810242808e-06,
"loss": 0.4127,
"step": 5500
},
{
"epoch": 0.2547756081046187,
"grad_norm": 13.451887130737305,
"learning_rate": 2.4862783105488677e-06,
"loss": 0.4027,
"step": 5550
},
{
"epoch": 0.25707088385330895,
"grad_norm": 17.29932403564453,
"learning_rate": 2.4786268108549276e-06,
"loss": 0.3976,
"step": 5600
},
{
"epoch": 0.25936615960199916,
"grad_norm": 9.819147109985352,
"learning_rate": 2.4709753111609876e-06,
"loss": 0.3667,
"step": 5650
},
{
"epoch": 0.2616614353506894,
"grad_norm": 25.263425827026367,
"learning_rate": 2.4633238114670475e-06,
"loss": 0.3816,
"step": 5700
},
{
"epoch": 0.2639567110993797,
"grad_norm": 12.653718948364258,
"learning_rate": 2.4556723117731074e-06,
"loss": 0.4104,
"step": 5750
},
{
"epoch": 0.26625198684806994,
"grad_norm": 25.030275344848633,
"learning_rate": 2.4480208120791677e-06,
"loss": 0.3916,
"step": 5800
},
{
"epoch": 0.2685472625967602,
"grad_norm": 11.694486618041992,
"learning_rate": 2.4403693123852272e-06,
"loss": 0.3901,
"step": 5850
},
{
"epoch": 0.27084253834545047,
"grad_norm": 13.822489738464355,
"learning_rate": 2.4327178126912876e-06,
"loss": 0.3861,
"step": 5900
},
{
"epoch": 0.27313781409414073,
"grad_norm": 13.548593521118164,
"learning_rate": 2.4250663129973475e-06,
"loss": 0.4043,
"step": 5950
},
{
"epoch": 0.275433089842831,
"grad_norm": 16.44365119934082,
"learning_rate": 2.4174148133034074e-06,
"loss": 0.3927,
"step": 6000
},
{
"epoch": 0.27772836559152125,
"grad_norm": 23.707990646362305,
"learning_rate": 2.4099163436033463e-06,
"loss": 0.3864,
"step": 6050
},
{
"epoch": 0.2800236413402115,
"grad_norm": 14.827530860900879,
"learning_rate": 2.4022648439094062e-06,
"loss": 0.3904,
"step": 6100
},
{
"epoch": 0.2823189170889018,
"grad_norm": 21.141281127929688,
"learning_rate": 2.394766374209345e-06,
"loss": 0.4152,
"step": 6150
},
{
"epoch": 0.28461419283759204,
"grad_norm": 20.442838668823242,
"learning_rate": 2.387114874515405e-06,
"loss": 0.4072,
"step": 6200
},
{
"epoch": 0.2869094685862823,
"grad_norm": 10.607434272766113,
"learning_rate": 2.379463374821465e-06,
"loss": 0.4083,
"step": 6250
},
{
"epoch": 0.28920474433497256,
"grad_norm": 15.464150428771973,
"learning_rate": 2.371811875127525e-06,
"loss": 0.3963,
"step": 6300
},
{
"epoch": 0.2915000200836628,
"grad_norm": 11.382309913635254,
"learning_rate": 2.3641603754335853e-06,
"loss": 0.3966,
"step": 6350
},
{
"epoch": 0.29379529583235303,
"grad_norm": 13.88147258758545,
"learning_rate": 2.3565088757396448e-06,
"loss": 0.3961,
"step": 6400
},
{
"epoch": 0.2960905715810433,
"grad_norm": 11.285943031311035,
"learning_rate": 2.348857376045705e-06,
"loss": 0.3667,
"step": 6450
},
{
"epoch": 0.29838584732973356,
"grad_norm": 13.869823455810547,
"learning_rate": 2.341205876351765e-06,
"loss": 0.4129,
"step": 6500
},
{
"epoch": 0.3006811230784238,
"grad_norm": 31.149734497070312,
"learning_rate": 2.333554376657825e-06,
"loss": 0.4148,
"step": 6550
},
{
"epoch": 0.3029763988271141,
"grad_norm": 125.76400756835938,
"learning_rate": 2.325902876963885e-06,
"loss": 0.3898,
"step": 6600
},
{
"epoch": 0.30527167457580434,
"grad_norm": 12.42544937133789,
"learning_rate": 2.3182513772699448e-06,
"loss": 0.406,
"step": 6650
},
{
"epoch": 0.3075669503244946,
"grad_norm": 17.587514877319336,
"learning_rate": 2.310599877576005e-06,
"loss": 0.3945,
"step": 6700
},
{
"epoch": 0.30986222607318487,
"grad_norm": 10.764501571655273,
"learning_rate": 2.302948377882065e-06,
"loss": 0.3842,
"step": 6750
},
{
"epoch": 0.3121575018218751,
"grad_norm": 9.208351135253906,
"learning_rate": 2.295296878188125e-06,
"loss": 0.3923,
"step": 6800
},
{
"epoch": 0.3144527775705654,
"grad_norm": 11.656021118164062,
"learning_rate": 2.287645378494185e-06,
"loss": 0.3886,
"step": 6850
},
{
"epoch": 0.31674805331925565,
"grad_norm": 12.415077209472656,
"learning_rate": 2.279993878800245e-06,
"loss": 0.3739,
"step": 6900
},
{
"epoch": 0.3190433290679459,
"grad_norm": 9.68492317199707,
"learning_rate": 2.2723423791063047e-06,
"loss": 0.3891,
"step": 6950
},
{
"epoch": 0.3213386048166362,
"grad_norm": 10.831034660339355,
"learning_rate": 2.264690879412365e-06,
"loss": 0.402,
"step": 7000
},
{
"epoch": 0.32363388056532644,
"grad_norm": 15.833775520324707,
"learning_rate": 2.257039379718425e-06,
"loss": 0.3936,
"step": 7050
},
{
"epoch": 0.3259291563140167,
"grad_norm": 20.576457977294922,
"learning_rate": 2.249387880024485e-06,
"loss": 0.3855,
"step": 7100
},
{
"epoch": 0.32822443206270696,
"grad_norm": 17.22242546081543,
"learning_rate": 2.2417363803305448e-06,
"loss": 0.4012,
"step": 7150
},
{
"epoch": 0.33051970781139717,
"grad_norm": 9.966532707214355,
"learning_rate": 2.2340848806366047e-06,
"loss": 0.3984,
"step": 7200
},
{
"epoch": 0.33281498356008743,
"grad_norm": 7.574636936187744,
"learning_rate": 2.2264333809426646e-06,
"loss": 0.38,
"step": 7250
},
{
"epoch": 0.3351102593087777,
"grad_norm": 11.461008071899414,
"learning_rate": 2.218781881248725e-06,
"loss": 0.3902,
"step": 7300
},
{
"epoch": 0.33740553505746795,
"grad_norm": 17.567983627319336,
"learning_rate": 2.2111303815547844e-06,
"loss": 0.3944,
"step": 7350
},
{
"epoch": 0.3397008108061582,
"grad_norm": 11.135239601135254,
"learning_rate": 2.2036319118547234e-06,
"loss": 0.3916,
"step": 7400
},
{
"epoch": 0.3419960865548485,
"grad_norm": 9.449058532714844,
"learning_rate": 2.1959804121607837e-06,
"loss": 0.3929,
"step": 7450
},
{
"epoch": 0.34429136230353874,
"grad_norm": 8.860933303833008,
"learning_rate": 2.1883289124668436e-06,
"loss": 0.3831,
"step": 7500
},
{
"epoch": 0.346586638052229,
"grad_norm": 15.684256553649902,
"learning_rate": 2.1806774127729035e-06,
"loss": 0.3954,
"step": 7550
},
{
"epoch": 0.34888191380091926,
"grad_norm": 9.633450508117676,
"learning_rate": 2.1730259130789634e-06,
"loss": 0.3965,
"step": 7600
},
{
"epoch": 0.3511771895496095,
"grad_norm": 9.775280952453613,
"learning_rate": 2.165374413385024e-06,
"loss": 0.3858,
"step": 7650
},
{
"epoch": 0.3534724652982998,
"grad_norm": 18.91486930847168,
"learning_rate": 2.1577229136910833e-06,
"loss": 0.4009,
"step": 7700
},
{
"epoch": 0.35576774104699005,
"grad_norm": 9.630500793457031,
"learning_rate": 2.1500714139971436e-06,
"loss": 0.4133,
"step": 7750
},
{
"epoch": 0.3580630167956803,
"grad_norm": 10.628037452697754,
"learning_rate": 2.1424199143032035e-06,
"loss": 0.3597,
"step": 7800
},
{
"epoch": 0.3603582925443706,
"grad_norm": 12.760895729064941,
"learning_rate": 2.1347684146092635e-06,
"loss": 0.3967,
"step": 7850
},
{
"epoch": 0.36265356829306084,
"grad_norm": 20.6715145111084,
"learning_rate": 2.1271169149153234e-06,
"loss": 0.384,
"step": 7900
},
{
"epoch": 0.36494884404175104,
"grad_norm": 9.833721160888672,
"learning_rate": 2.1196184452152623e-06,
"loss": 0.3994,
"step": 7950
},
{
"epoch": 0.3672441197904413,
"grad_norm": 11.794584274291992,
"learning_rate": 2.1119669455213222e-06,
"loss": 0.3819,
"step": 8000
},
{
"epoch": 0.36953939553913157,
"grad_norm": 11.88609790802002,
"learning_rate": 2.104315445827382e-06,
"loss": 0.3848,
"step": 8050
},
{
"epoch": 0.3718346712878218,
"grad_norm": 18.272483825683594,
"learning_rate": 2.0966639461334425e-06,
"loss": 0.3828,
"step": 8100
},
{
"epoch": 0.3741299470365121,
"grad_norm": 12.808188438415527,
"learning_rate": 2.089012446439502e-06,
"loss": 0.3771,
"step": 8150
},
{
"epoch": 0.37642522278520235,
"grad_norm": 17.478059768676758,
"learning_rate": 2.0813609467455623e-06,
"loss": 0.4058,
"step": 8200
},
{
"epoch": 0.3787204985338926,
"grad_norm": 9.85326099395752,
"learning_rate": 2.0737094470516222e-06,
"loss": 0.3605,
"step": 8250
},
{
"epoch": 0.3810157742825829,
"grad_norm": 11.933965682983398,
"learning_rate": 2.066057947357682e-06,
"loss": 0.3979,
"step": 8300
},
{
"epoch": 0.38331105003127314,
"grad_norm": 14.13598346710205,
"learning_rate": 2.058406447663742e-06,
"loss": 0.372,
"step": 8350
},
{
"epoch": 0.3856063257799634,
"grad_norm": 18.427085876464844,
"learning_rate": 2.0507549479698024e-06,
"loss": 0.382,
"step": 8400
},
{
"epoch": 0.38790160152865366,
"grad_norm": 11.309691429138184,
"learning_rate": 2.043103448275862e-06,
"loss": 0.3965,
"step": 8450
},
{
"epoch": 0.3901968772773439,
"grad_norm": 26.351280212402344,
"learning_rate": 2.0354519485819222e-06,
"loss": 0.3794,
"step": 8500
},
{
"epoch": 0.3924921530260342,
"grad_norm": 39.77188491821289,
"learning_rate": 2.027800448887982e-06,
"loss": 0.3715,
"step": 8550
},
{
"epoch": 0.39478742877472445,
"grad_norm": 11.656793594360352,
"learning_rate": 2.020148949194042e-06,
"loss": 0.3724,
"step": 8600
},
{
"epoch": 0.3970827045234147,
"grad_norm": 10.332427978515625,
"learning_rate": 2.012497449500102e-06,
"loss": 0.3913,
"step": 8650
},
{
"epoch": 0.3993779802721049,
"grad_norm": 10.971294403076172,
"learning_rate": 2.004845949806162e-06,
"loss": 0.3852,
"step": 8700
},
{
"epoch": 0.4016732560207952,
"grad_norm": 11.635868072509766,
"learning_rate": 1.997194450112222e-06,
"loss": 0.3918,
"step": 8750
},
{
"epoch": 0.40396853176948544,
"grad_norm": 14.208995819091797,
"learning_rate": 1.989542950418282e-06,
"loss": 0.38,
"step": 8800
},
{
"epoch": 0.4062638075181757,
"grad_norm": 14.743267059326172,
"learning_rate": 1.981891450724342e-06,
"loss": 0.378,
"step": 8850
},
{
"epoch": 0.40855908326686596,
"grad_norm": 11.725529670715332,
"learning_rate": 1.974239951030402e-06,
"loss": 0.3592,
"step": 8900
},
{
"epoch": 0.4108543590155562,
"grad_norm": 9.404533386230469,
"learning_rate": 1.966741481330341e-06,
"loss": 0.4008,
"step": 8950
},
{
"epoch": 0.4131496347642465,
"grad_norm": 10.354850769042969,
"learning_rate": 1.959089981636401e-06,
"loss": 0.3891,
"step": 9000
},
{
"epoch": 0.41544491051293675,
"grad_norm": 10.396027565002441,
"learning_rate": 1.9515915119363398e-06,
"loss": 0.3756,
"step": 9050
},
{
"epoch": 0.417740186261627,
"grad_norm": 14.872404098510742,
"learning_rate": 1.9439400122423997e-06,
"loss": 0.3672,
"step": 9100
},
{
"epoch": 0.4200354620103173,
"grad_norm": 12.267988204956055,
"learning_rate": 1.9362885125484596e-06,
"loss": 0.3694,
"step": 9150
},
{
"epoch": 0.42233073775900754,
"grad_norm": 10.50540828704834,
"learning_rate": 1.9286370128545195e-06,
"loss": 0.3746,
"step": 9200
},
{
"epoch": 0.4246260135076978,
"grad_norm": 9.12032699584961,
"learning_rate": 1.9209855131605794e-06,
"loss": 0.3739,
"step": 9250
},
{
"epoch": 0.42692128925638806,
"grad_norm": 19.14887046813965,
"learning_rate": 1.9133340134666393e-06,
"loss": 0.3725,
"step": 9300
},
{
"epoch": 0.4292165650050783,
"grad_norm": 12.185490608215332,
"learning_rate": 1.9056825137726995e-06,
"loss": 0.3742,
"step": 9350
},
{
"epoch": 0.4315118407537686,
"grad_norm": 6.92899751663208,
"learning_rate": 1.8980310140787594e-06,
"loss": 0.3818,
"step": 9400
},
{
"epoch": 0.4338071165024588,
"grad_norm": 11.001760482788086,
"learning_rate": 1.8905325443786983e-06,
"loss": 0.3744,
"step": 9450
},
{
"epoch": 0.43610239225114905,
"grad_norm": 14.638999938964844,
"learning_rate": 1.8828810446847584e-06,
"loss": 0.3873,
"step": 9500
},
{
"epoch": 0.4383976679998393,
"grad_norm": 11.44972038269043,
"learning_rate": 1.8752295449908181e-06,
"loss": 0.3743,
"step": 9550
},
{
"epoch": 0.4406929437485296,
"grad_norm": 10.270658493041992,
"learning_rate": 1.8675780452968783e-06,
"loss": 0.4017,
"step": 9600
},
{
"epoch": 0.44298821949721984,
"grad_norm": 55.21454620361328,
"learning_rate": 1.8599265456029382e-06,
"loss": 0.3776,
"step": 9650
},
{
"epoch": 0.4452834952459101,
"grad_norm": 16.605663299560547,
"learning_rate": 1.8522750459089983e-06,
"loss": 0.3807,
"step": 9700
},
{
"epoch": 0.44757877099460036,
"grad_norm": 10.325900077819824,
"learning_rate": 1.8446235462150582e-06,
"loss": 0.3842,
"step": 9750
},
{
"epoch": 0.4498740467432906,
"grad_norm": 11.08858585357666,
"learning_rate": 1.8369720465211184e-06,
"loss": 0.3913,
"step": 9800
},
{
"epoch": 0.4521693224919809,
"grad_norm": 11.547320365905762,
"learning_rate": 1.829320546827178e-06,
"loss": 0.3857,
"step": 9850
},
{
"epoch": 0.45446459824067115,
"grad_norm": 7.3234171867370605,
"learning_rate": 1.8216690471332382e-06,
"loss": 0.3591,
"step": 9900
},
{
"epoch": 0.4567598739893614,
"grad_norm": 15.973671913146973,
"learning_rate": 1.8140175474392981e-06,
"loss": 0.3862,
"step": 9950
},
{
"epoch": 0.45905514973805167,
"grad_norm": 9.876672744750977,
"learning_rate": 1.8063660477453582e-06,
"loss": 0.3736,
"step": 10000
},
{
"epoch": 0.46135042548674193,
"grad_norm": 10.364340782165527,
"learning_rate": 1.798714548051418e-06,
"loss": 0.3753,
"step": 10050
},
{
"epoch": 0.4636457012354322,
"grad_norm": 12.696479797363281,
"learning_rate": 1.791063048357478e-06,
"loss": 0.3925,
"step": 10100
},
{
"epoch": 0.46594097698412246,
"grad_norm": 9.908900260925293,
"learning_rate": 1.783411548663538e-06,
"loss": 0.3909,
"step": 10150
},
{
"epoch": 0.46823625273281266,
"grad_norm": 9.967103958129883,
"learning_rate": 1.7757600489695981e-06,
"loss": 0.3645,
"step": 10200
},
{
"epoch": 0.4705315284815029,
"grad_norm": 18.3125057220459,
"learning_rate": 1.7681085492756582e-06,
"loss": 0.3909,
"step": 10250
},
{
"epoch": 0.4728268042301932,
"grad_norm": 9.652514457702637,
"learning_rate": 1.760457049581718e-06,
"loss": 0.3677,
"step": 10300
},
{
"epoch": 0.47512207997888345,
"grad_norm": 11.558232307434082,
"learning_rate": 1.7528055498877783e-06,
"loss": 0.3898,
"step": 10350
},
{
"epoch": 0.4774173557275737,
"grad_norm": 12.501564025878906,
"learning_rate": 1.745154050193838e-06,
"loss": 0.3712,
"step": 10400
},
{
"epoch": 0.479712631476264,
"grad_norm": 16.78426742553711,
"learning_rate": 1.7375025504998981e-06,
"loss": 0.3563,
"step": 10450
},
{
"epoch": 0.48200790722495424,
"grad_norm": 17.035160064697266,
"learning_rate": 1.729851050805958e-06,
"loss": 0.3754,
"step": 10500
},
{
"epoch": 0.4843031829736445,
"grad_norm": 10.664167404174805,
"learning_rate": 1.7221995511120182e-06,
"loss": 0.396,
"step": 10550
},
{
"epoch": 0.48659845872233476,
"grad_norm": 11.713300704956055,
"learning_rate": 1.7145480514180779e-06,
"loss": 0.3612,
"step": 10600
},
{
"epoch": 0.488893734471025,
"grad_norm": 10.911885261535645,
"learning_rate": 1.706896551724138e-06,
"loss": 0.3943,
"step": 10650
},
{
"epoch": 0.4911890102197153,
"grad_norm": 20.872018814086914,
"learning_rate": 1.699245052030198e-06,
"loss": 0.3781,
"step": 10700
},
{
"epoch": 0.49348428596840554,
"grad_norm": 10.593243598937988,
"learning_rate": 1.691593552336258e-06,
"loss": 0.3885,
"step": 10750
},
{
"epoch": 0.4957795617170958,
"grad_norm": 14.705113410949707,
"learning_rate": 1.6839420526423177e-06,
"loss": 0.364,
"step": 10800
},
{
"epoch": 0.49807483746578607,
"grad_norm": 10.650996208190918,
"learning_rate": 1.6762905529483779e-06,
"loss": 0.3723,
"step": 10850
},
{
"epoch": 0.5003701132144763,
"grad_norm": 16.529132843017578,
"learning_rate": 1.6686390532544378e-06,
"loss": 0.3663,
"step": 10900
},
{
"epoch": 0.5026653889631666,
"grad_norm": 13.437551498413086,
"learning_rate": 1.660987553560498e-06,
"loss": 0.3704,
"step": 10950
},
{
"epoch": 0.5049606647118569,
"grad_norm": 9.07573127746582,
"learning_rate": 1.6533360538665578e-06,
"loss": 0.3767,
"step": 11000
},
{
"epoch": 0.5072559404605471,
"grad_norm": 29.058597564697266,
"learning_rate": 1.645684554172618e-06,
"loss": 0.3693,
"step": 11050
},
{
"epoch": 0.5095512162092374,
"grad_norm": 9.73399543762207,
"learning_rate": 1.638033054478678e-06,
"loss": 0.3755,
"step": 11100
},
{
"epoch": 0.5118464919579276,
"grad_norm": 19.876361846923828,
"learning_rate": 1.6303815547847378e-06,
"loss": 0.3807,
"step": 11150
},
{
"epoch": 0.5141417677066179,
"grad_norm": 12.44278621673584,
"learning_rate": 1.622730055090798e-06,
"loss": 0.3732,
"step": 11200
},
{
"epoch": 0.5164370434553082,
"grad_norm": 10.846384048461914,
"learning_rate": 1.6150785553968578e-06,
"loss": 0.3786,
"step": 11250
},
{
"epoch": 0.5187323192039983,
"grad_norm": 14.698681831359863,
"learning_rate": 1.607427055702918e-06,
"loss": 0.3792,
"step": 11300
},
{
"epoch": 0.5210275949526886,
"grad_norm": 10.92941951751709,
"learning_rate": 1.5997755560089777e-06,
"loss": 0.3849,
"step": 11350
},
{
"epoch": 0.5233228707013788,
"grad_norm": 12.259003639221191,
"learning_rate": 1.5921240563150378e-06,
"loss": 0.3764,
"step": 11400
},
{
"epoch": 0.5256181464500691,
"grad_norm": 12.608525276184082,
"learning_rate": 1.5844725566210977e-06,
"loss": 0.381,
"step": 11450
},
{
"epoch": 0.5279134221987594,
"grad_norm": 14.017390251159668,
"learning_rate": 1.5768210569271578e-06,
"loss": 0.3761,
"step": 11500
},
{
"epoch": 0.5302086979474496,
"grad_norm": 11.806578636169434,
"learning_rate": 1.5691695572332175e-06,
"loss": 0.3738,
"step": 11550
},
{
"epoch": 0.5325039736961399,
"grad_norm": 14.099353790283203,
"learning_rate": 1.5615180575392779e-06,
"loss": 0.3631,
"step": 11600
},
{
"epoch": 0.5347992494448301,
"grad_norm": 12.497589111328125,
"learning_rate": 1.5538665578453376e-06,
"loss": 0.3861,
"step": 11650
},
{
"epoch": 0.5370945251935204,
"grad_norm": 12.558547973632812,
"learning_rate": 1.5462150581513977e-06,
"loss": 0.3632,
"step": 11700
},
{
"epoch": 0.5393898009422107,
"grad_norm": 30.996755599975586,
"learning_rate": 1.5385635584574576e-06,
"loss": 0.3793,
"step": 11750
},
{
"epoch": 0.5416850766909009,
"grad_norm": 7.529469013214111,
"learning_rate": 1.5309120587635178e-06,
"loss": 0.3503,
"step": 11800
},
{
"epoch": 0.5439803524395912,
"grad_norm": 9.519927978515625,
"learning_rate": 1.5232605590695779e-06,
"loss": 0.374,
"step": 11850
},
{
"epoch": 0.5462756281882815,
"grad_norm": 16.44211769104004,
"learning_rate": 1.5156090593756376e-06,
"loss": 0.3647,
"step": 11900
},
{
"epoch": 0.5485709039369717,
"grad_norm": 30.076602935791016,
"learning_rate": 1.5079575596816977e-06,
"loss": 0.3795,
"step": 11950
},
{
"epoch": 0.550866179685662,
"grad_norm": 12.661836624145508,
"learning_rate": 1.5003060599877576e-06,
"loss": 0.3615,
"step": 12000
},
{
"epoch": 0.5531614554343522,
"grad_norm": 11.008061408996582,
"learning_rate": 1.4926545602938176e-06,
"loss": 0.3898,
"step": 12050
},
{
"epoch": 0.5554567311830425,
"grad_norm": 8.93970775604248,
"learning_rate": 1.4851560905937563e-06,
"loss": 0.3768,
"step": 12100
},
{
"epoch": 0.5577520069317328,
"grad_norm": 11.853392601013184,
"learning_rate": 1.4775045908998164e-06,
"loss": 0.3755,
"step": 12150
},
{
"epoch": 0.560047282680423,
"grad_norm": 13.085156440734863,
"learning_rate": 1.4698530912058765e-06,
"loss": 0.3617,
"step": 12200
},
{
"epoch": 0.5623425584291133,
"grad_norm": 10.487837791442871,
"learning_rate": 1.4622015915119364e-06,
"loss": 0.3806,
"step": 12250
},
{
"epoch": 0.5646378341778036,
"grad_norm": 9.621622085571289,
"learning_rate": 1.4545500918179964e-06,
"loss": 0.3789,
"step": 12300
},
{
"epoch": 0.5669331099264938,
"grad_norm": 18.29271125793457,
"learning_rate": 1.4468985921240565e-06,
"loss": 0.3809,
"step": 12350
},
{
"epoch": 0.5692283856751841,
"grad_norm": 15.107403755187988,
"learning_rate": 1.4392470924301164e-06,
"loss": 0.3847,
"step": 12400
},
{
"epoch": 0.5715236614238743,
"grad_norm": 9.128915786743164,
"learning_rate": 1.4315955927361763e-06,
"loss": 0.3718,
"step": 12450
},
{
"epoch": 0.5738189371725646,
"grad_norm": 10.191695213317871,
"learning_rate": 1.424097123036115e-06,
"loss": 0.3678,
"step": 12500
},
{
"epoch": 0.5761142129212549,
"grad_norm": 10.501177787780762,
"learning_rate": 1.4164456233421752e-06,
"loss": 0.3727,
"step": 12550
},
{
"epoch": 0.5784094886699451,
"grad_norm": 17.270280838012695,
"learning_rate": 1.408794123648235e-06,
"loss": 0.3725,
"step": 12600
},
{
"epoch": 0.5807047644186354,
"grad_norm": 11.970187187194824,
"learning_rate": 1.401142623954295e-06,
"loss": 0.3641,
"step": 12650
},
{
"epoch": 0.5830000401673257,
"grad_norm": 8.886324882507324,
"learning_rate": 1.393491124260355e-06,
"loss": 0.3663,
"step": 12700
},
{
"epoch": 0.5852953159160159,
"grad_norm": 10.107802391052246,
"learning_rate": 1.385839624566415e-06,
"loss": 0.3927,
"step": 12750
},
{
"epoch": 0.5875905916647061,
"grad_norm": 10.90116024017334,
"learning_rate": 1.378188124872475e-06,
"loss": 0.3745,
"step": 12800
},
{
"epoch": 0.5898858674133963,
"grad_norm": 27.726293563842773,
"learning_rate": 1.3705366251785349e-06,
"loss": 0.3691,
"step": 12850
},
{
"epoch": 0.5921811431620866,
"grad_norm": 14.674005508422852,
"learning_rate": 1.362885125484595e-06,
"loss": 0.3566,
"step": 12900
},
{
"epoch": 0.5944764189107768,
"grad_norm": 8.121840476989746,
"learning_rate": 1.3552336257906551e-06,
"loss": 0.3711,
"step": 12950
},
{
"epoch": 0.5967716946594671,
"grad_norm": 30.958778381347656,
"learning_rate": 1.347582126096715e-06,
"loss": 0.371,
"step": 13000
},
{
"epoch": 0.5990669704081574,
"grad_norm": 7.38535213470459,
"learning_rate": 1.339930626402775e-06,
"loss": 0.3679,
"step": 13050
},
{
"epoch": 0.6013622461568476,
"grad_norm": 34.25715255737305,
"learning_rate": 1.332279126708835e-06,
"loss": 0.3897,
"step": 13100
},
{
"epoch": 0.6036575219055379,
"grad_norm": 13.392374992370605,
"learning_rate": 1.324627627014895e-06,
"loss": 0.3873,
"step": 13150
},
{
"epoch": 0.6059527976542282,
"grad_norm": 12.169110298156738,
"learning_rate": 1.316976127320955e-06,
"loss": 0.3533,
"step": 13200
},
{
"epoch": 0.6082480734029184,
"grad_norm": 10.7171049118042,
"learning_rate": 1.3093246276270148e-06,
"loss": 0.3814,
"step": 13250
},
{
"epoch": 0.6105433491516087,
"grad_norm": 9.975994110107422,
"learning_rate": 1.301673127933075e-06,
"loss": 0.3817,
"step": 13300
},
{
"epoch": 0.612838624900299,
"grad_norm": 9.311064720153809,
"learning_rate": 1.2940216282391349e-06,
"loss": 0.3602,
"step": 13350
},
{
"epoch": 0.6151339006489892,
"grad_norm": 6.753169536590576,
"learning_rate": 1.2863701285451948e-06,
"loss": 0.3739,
"step": 13400
},
{
"epoch": 0.6174291763976795,
"grad_norm": 14.21111011505127,
"learning_rate": 1.278718628851255e-06,
"loss": 0.3677,
"step": 13450
},
{
"epoch": 0.6197244521463697,
"grad_norm": 10.715375900268555,
"learning_rate": 1.2710671291573149e-06,
"loss": 0.3575,
"step": 13500
},
{
"epoch": 0.62201972789506,
"grad_norm": 18.988964080810547,
"learning_rate": 1.2635686594572536e-06,
"loss": 0.3678,
"step": 13550
},
{
"epoch": 0.6243150036437503,
"grad_norm": 11.761975288391113,
"learning_rate": 1.2559171597633135e-06,
"loss": 0.3552,
"step": 13600
},
{
"epoch": 0.6266102793924405,
"grad_norm": 15.087313652038574,
"learning_rate": 1.2484186900632524e-06,
"loss": 0.3811,
"step": 13650
},
{
"epoch": 0.6289055551411308,
"grad_norm": 10.330754280090332,
"learning_rate": 1.2407671903693125e-06,
"loss": 0.3772,
"step": 13700
},
{
"epoch": 0.631200830889821,
"grad_norm": 18.443931579589844,
"learning_rate": 1.2331156906753725e-06,
"loss": 0.3739,
"step": 13750
},
{
"epoch": 0.6334961066385113,
"grad_norm": 10.0236177444458,
"learning_rate": 1.2254641909814324e-06,
"loss": 0.3811,
"step": 13800
},
{
"epoch": 0.6357913823872016,
"grad_norm": 12.954588890075684,
"learning_rate": 1.2178126912874923e-06,
"loss": 0.3658,
"step": 13850
},
{
"epoch": 0.6380866581358918,
"grad_norm": 11.381658554077148,
"learning_rate": 1.2101611915935524e-06,
"loss": 0.3718,
"step": 13900
},
{
"epoch": 0.6403819338845821,
"grad_norm": 12.019516944885254,
"learning_rate": 1.2025096918996123e-06,
"loss": 0.3622,
"step": 13950
},
{
"epoch": 0.6426772096332724,
"grad_norm": 10.334041595458984,
"learning_rate": 1.1948581922056723e-06,
"loss": 0.3636,
"step": 14000
},
{
"epoch": 0.6449724853819626,
"grad_norm": 10.666448593139648,
"learning_rate": 1.1872066925117324e-06,
"loss": 0.3451,
"step": 14050
},
{
"epoch": 0.6472677611306529,
"grad_norm": 9.785908699035645,
"learning_rate": 1.1795551928177923e-06,
"loss": 0.364,
"step": 14100
},
{
"epoch": 0.6495630368793431,
"grad_norm": 17.802858352661133,
"learning_rate": 1.1719036931238522e-06,
"loss": 0.365,
"step": 14150
},
{
"epoch": 0.6518583126280334,
"grad_norm": 11.654463768005371,
"learning_rate": 1.1642521934299123e-06,
"loss": 0.3755,
"step": 14200
},
{
"epoch": 0.6541535883767237,
"grad_norm": 27.977380752563477,
"learning_rate": 1.1566006937359723e-06,
"loss": 0.3865,
"step": 14250
},
{
"epoch": 0.6564488641254139,
"grad_norm": 12.833617210388184,
"learning_rate": 1.1489491940420322e-06,
"loss": 0.3761,
"step": 14300
},
{
"epoch": 0.6587441398741041,
"grad_norm": 10.381138801574707,
"learning_rate": 1.1412976943480923e-06,
"loss": 0.3693,
"step": 14350
},
{
"epoch": 0.6610394156227943,
"grad_norm": 10.039835929870605,
"learning_rate": 1.1336461946541522e-06,
"loss": 0.3621,
"step": 14400
},
{
"epoch": 0.6633346913714846,
"grad_norm": 9.079568862915039,
"learning_rate": 1.1259946949602123e-06,
"loss": 0.385,
"step": 14450
},
{
"epoch": 0.6656299671201749,
"grad_norm": 9.107003211975098,
"learning_rate": 1.1183431952662723e-06,
"loss": 0.3798,
"step": 14500
},
{
"epoch": 0.6679252428688651,
"grad_norm": 10.977982521057129,
"learning_rate": 1.1106916955723322e-06,
"loss": 0.3721,
"step": 14550
},
{
"epoch": 0.6702205186175554,
"grad_norm": 14.997126579284668,
"learning_rate": 1.1030401958783923e-06,
"loss": 0.347,
"step": 14600
},
{
"epoch": 0.6725157943662456,
"grad_norm": 13.861760139465332,
"learning_rate": 1.0953886961844522e-06,
"loss": 0.3727,
"step": 14650
},
{
"epoch": 0.6748110701149359,
"grad_norm": 10.945677757263184,
"learning_rate": 1.0877371964905121e-06,
"loss": 0.3741,
"step": 14700
},
{
"epoch": 0.6771063458636262,
"grad_norm": 14.002128601074219,
"learning_rate": 1.080085696796572e-06,
"loss": 0.3698,
"step": 14750
},
{
"epoch": 0.6794016216123164,
"grad_norm": 17.263320922851562,
"learning_rate": 1.0724341971026322e-06,
"loss": 0.3614,
"step": 14800
},
{
"epoch": 0.6816968973610067,
"grad_norm": 12.30994987487793,
"learning_rate": 1.064782697408692e-06,
"loss": 0.3683,
"step": 14850
},
{
"epoch": 0.683992173109697,
"grad_norm": 9.857229232788086,
"learning_rate": 1.057131197714752e-06,
"loss": 0.3698,
"step": 14900
},
{
"epoch": 0.6862874488583872,
"grad_norm": 18.498348236083984,
"learning_rate": 1.0494796980208121e-06,
"loss": 0.3665,
"step": 14950
},
{
"epoch": 0.6885827246070775,
"grad_norm": 22.762800216674805,
"learning_rate": 1.041828198326872e-06,
"loss": 0.3656,
"step": 15000
},
{
"epoch": 0.6908780003557677,
"grad_norm": 9.595602035522461,
"learning_rate": 1.034176698632932e-06,
"loss": 0.3658,
"step": 15050
},
{
"epoch": 0.693173276104458,
"grad_norm": 9.138602256774902,
"learning_rate": 1.0265251989389919e-06,
"loss": 0.3459,
"step": 15100
},
{
"epoch": 0.6954685518531483,
"grad_norm": 21.768016815185547,
"learning_rate": 1.0188736992450522e-06,
"loss": 0.3768,
"step": 15150
},
{
"epoch": 0.6977638276018385,
"grad_norm": 16.048538208007812,
"learning_rate": 1.0112221995511121e-06,
"loss": 0.3578,
"step": 15200
},
{
"epoch": 0.7000591033505288,
"grad_norm": 18.636796951293945,
"learning_rate": 1.003570699857172e-06,
"loss": 0.3632,
"step": 15250
},
{
"epoch": 0.702354379099219,
"grad_norm": 21.801054000854492,
"learning_rate": 9.95919200163232e-07,
"loss": 0.3631,
"step": 15300
},
{
"epoch": 0.7046496548479093,
"grad_norm": 25.04891014099121,
"learning_rate": 9.882677004692921e-07,
"loss": 0.3558,
"step": 15350
},
{
"epoch": 0.7069449305965996,
"grad_norm": 10.539541244506836,
"learning_rate": 9.80616200775352e-07,
"loss": 0.3644,
"step": 15400
},
{
"epoch": 0.7092402063452898,
"grad_norm": 13.989766120910645,
"learning_rate": 9.72964701081412e-07,
"loss": 0.3618,
"step": 15450
},
{
"epoch": 0.7115354820939801,
"grad_norm": 10.638326644897461,
"learning_rate": 9.65313201387472e-07,
"loss": 0.3675,
"step": 15500
},
{
"epoch": 0.7138307578426704,
"grad_norm": 11.744914054870605,
"learning_rate": 9.57661701693532e-07,
"loss": 0.3657,
"step": 15550
},
{
"epoch": 0.7161260335913606,
"grad_norm": 10.490307807922363,
"learning_rate": 9.500102019995919e-07,
"loss": 0.3736,
"step": 15600
},
{
"epoch": 0.7184213093400509,
"grad_norm": 24.758380889892578,
"learning_rate": 9.425117322995307e-07,
"loss": 0.3817,
"step": 15650
},
{
"epoch": 0.7207165850887411,
"grad_norm": 13.692597389221191,
"learning_rate": 9.348602326055906e-07,
"loss": 0.3561,
"step": 15700
},
{
"epoch": 0.7230118608374314,
"grad_norm": 9.888838768005371,
"learning_rate": 9.272087329116507e-07,
"loss": 0.3581,
"step": 15750
},
{
"epoch": 0.7253071365861217,
"grad_norm": 12.952566146850586,
"learning_rate": 9.195572332177106e-07,
"loss": 0.3672,
"step": 15800
},
{
"epoch": 0.7276024123348118,
"grad_norm": 11.688536643981934,
"learning_rate": 9.120587635176495e-07,
"loss": 0.3766,
"step": 15850
},
{
"epoch": 0.7298976880835021,
"grad_norm": 17.879161834716797,
"learning_rate": 9.044072638237095e-07,
"loss": 0.3622,
"step": 15900
},
{
"epoch": 0.7321929638321923,
"grad_norm": 20.887718200683594,
"learning_rate": 8.967557641297696e-07,
"loss": 0.3649,
"step": 15950
},
{
"epoch": 0.7344882395808826,
"grad_norm": 9.507134437561035,
"learning_rate": 8.891042644358295e-07,
"loss": 0.3561,
"step": 16000
},
{
"epoch": 0.7367835153295729,
"grad_norm": 15.035599708557129,
"learning_rate": 8.814527647418895e-07,
"loss": 0.3428,
"step": 16050
},
{
"epoch": 0.7390787910782631,
"grad_norm": 18.98991584777832,
"learning_rate": 8.738012650479494e-07,
"loss": 0.3647,
"step": 16100
},
{
"epoch": 0.7413740668269534,
"grad_norm": 11.455354690551758,
"learning_rate": 8.661497653540094e-07,
"loss": 0.3546,
"step": 16150
},
{
"epoch": 0.7436693425756437,
"grad_norm": 10.941442489624023,
"learning_rate": 8.584982656600693e-07,
"loss": 0.3704,
"step": 16200
},
{
"epoch": 0.7459646183243339,
"grad_norm": 11.790033340454102,
"learning_rate": 8.508467659661294e-07,
"loss": 0.3589,
"step": 16250
},
{
"epoch": 0.7482598940730242,
"grad_norm": 7.833493232727051,
"learning_rate": 8.431952662721894e-07,
"loss": 0.3517,
"step": 16300
},
{
"epoch": 0.7505551698217144,
"grad_norm": 12.305365562438965,
"learning_rate": 8.355437665782493e-07,
"loss": 0.3595,
"step": 16350
},
{
"epoch": 0.7528504455704047,
"grad_norm": 14.888938903808594,
"learning_rate": 8.278922668843093e-07,
"loss": 0.3793,
"step": 16400
},
{
"epoch": 0.755145721319095,
"grad_norm": 9.640114784240723,
"learning_rate": 8.202407671903692e-07,
"loss": 0.3482,
"step": 16450
},
{
"epoch": 0.7574409970677852,
"grad_norm": 14.406376838684082,
"learning_rate": 8.125892674964293e-07,
"loss": 0.3607,
"step": 16500
},
{
"epoch": 0.7597362728164755,
"grad_norm": 47.160240173339844,
"learning_rate": 8.049377678024892e-07,
"loss": 0.3612,
"step": 16550
},
{
"epoch": 0.7620315485651658,
"grad_norm": 13.127697944641113,
"learning_rate": 7.972862681085494e-07,
"loss": 0.3573,
"step": 16600
},
{
"epoch": 0.764326824313856,
"grad_norm": 14.993858337402344,
"learning_rate": 7.896347684146093e-07,
"loss": 0.3652,
"step": 16650
},
{
"epoch": 0.7666221000625463,
"grad_norm": 10.984702110290527,
"learning_rate": 7.819832687206694e-07,
"loss": 0.3756,
"step": 16700
},
{
"epoch": 0.7689173758112365,
"grad_norm": 14.359518051147461,
"learning_rate": 7.743317690267293e-07,
"loss": 0.3458,
"step": 16750
},
{
"epoch": 0.7712126515599268,
"grad_norm": 11.842260360717773,
"learning_rate": 7.666802693327893e-07,
"loss": 0.3471,
"step": 16800
},
{
"epoch": 0.7735079273086171,
"grad_norm": 19.50934410095215,
"learning_rate": 7.590287696388492e-07,
"loss": 0.3696,
"step": 16850
},
{
"epoch": 0.7758032030573073,
"grad_norm": 9.951668739318848,
"learning_rate": 7.513772699449092e-07,
"loss": 0.3619,
"step": 16900
},
{
"epoch": 0.7780984788059976,
"grad_norm": 9.378868103027344,
"learning_rate": 7.437257702509693e-07,
"loss": 0.367,
"step": 16950
},
{
"epoch": 0.7803937545546878,
"grad_norm": 10.392294883728027,
"learning_rate": 7.360742705570292e-07,
"loss": 0.3729,
"step": 17000
},
{
"epoch": 0.7826890303033781,
"grad_norm": 10.031806945800781,
"learning_rate": 7.284227708630892e-07,
"loss": 0.3491,
"step": 17050
},
{
"epoch": 0.7849843060520684,
"grad_norm": 9.783834457397461,
"learning_rate": 7.209243011630279e-07,
"loss": 0.364,
"step": 17100
},
{
"epoch": 0.7872795818007586,
"grad_norm": 13.27206802368164,
"learning_rate": 7.13272801469088e-07,
"loss": 0.3544,
"step": 17150
},
{
"epoch": 0.7895748575494489,
"grad_norm": 84.03044891357422,
"learning_rate": 7.05621301775148e-07,
"loss": 0.3701,
"step": 17200
},
{
"epoch": 0.7918701332981392,
"grad_norm": 9.07703971862793,
"learning_rate": 6.97969802081208e-07,
"loss": 0.3366,
"step": 17250
},
{
"epoch": 0.7941654090468294,
"grad_norm": 9.118254661560059,
"learning_rate": 6.904713323811467e-07,
"loss": 0.3728,
"step": 17300
},
{
"epoch": 0.7964606847955196,
"grad_norm": 20.656383514404297,
"learning_rate": 6.828198326872067e-07,
"loss": 0.3576,
"step": 17350
},
{
"epoch": 0.7987559605442098,
"grad_norm": 12.286031723022461,
"learning_rate": 6.751683329932666e-07,
"loss": 0.3612,
"step": 17400
},
{
"epoch": 0.8010512362929001,
"grad_norm": 9.951446533203125,
"learning_rate": 6.675168332993268e-07,
"loss": 0.3527,
"step": 17450
},
{
"epoch": 0.8033465120415904,
"grad_norm": 8.655536651611328,
"learning_rate": 6.598653336053867e-07,
"loss": 0.357,
"step": 17500
},
{
"epoch": 0.8056417877902806,
"grad_norm": 11.027405738830566,
"learning_rate": 6.522138339114467e-07,
"loss": 0.3494,
"step": 17550
},
{
"epoch": 0.8079370635389709,
"grad_norm": 10.6943359375,
"learning_rate": 6.445623342175066e-07,
"loss": 0.3659,
"step": 17600
},
{
"epoch": 0.8102323392876611,
"grad_norm": 12.674545288085938,
"learning_rate": 6.369108345235666e-07,
"loss": 0.3545,
"step": 17650
},
{
"epoch": 0.8125276150363514,
"grad_norm": 12.870709419250488,
"learning_rate": 6.292593348296266e-07,
"loss": 0.3521,
"step": 17700
},
{
"epoch": 0.8148228907850417,
"grad_norm": 11.121269226074219,
"learning_rate": 6.216078351356866e-07,
"loss": 0.3786,
"step": 17750
},
{
"epoch": 0.8171181665337319,
"grad_norm": 13.682574272155762,
"learning_rate": 6.139563354417466e-07,
"loss": 0.3633,
"step": 17800
},
{
"epoch": 0.8194134422824222,
"grad_norm": 11.04853630065918,
"learning_rate": 6.063048357478066e-07,
"loss": 0.3745,
"step": 17850
},
{
"epoch": 0.8217087180311125,
"grad_norm": 9.358290672302246,
"learning_rate": 5.986533360538666e-07,
"loss": 0.369,
"step": 17900
},
{
"epoch": 0.8240039937798027,
"grad_norm": 7.955355644226074,
"learning_rate": 5.910018363599266e-07,
"loss": 0.3639,
"step": 17950
},
{
"epoch": 0.826299269528493,
"grad_norm": 25.4930362701416,
"learning_rate": 5.833503366659866e-07,
"loss": 0.3396,
"step": 18000
},
{
"epoch": 0.8285945452771832,
"grad_norm": 8.573712348937988,
"learning_rate": 5.756988369720465e-07,
"loss": 0.3453,
"step": 18050
},
{
"epoch": 0.8308898210258735,
"grad_norm": 9.966428756713867,
"learning_rate": 5.680473372781065e-07,
"loss": 0.3578,
"step": 18100
},
{
"epoch": 0.8331850967745638,
"grad_norm": 10.190299034118652,
"learning_rate": 5.603958375841664e-07,
"loss": 0.3727,
"step": 18150
},
{
"epoch": 0.835480372523254,
"grad_norm": 16.960046768188477,
"learning_rate": 5.527443378902265e-07,
"loss": 0.3585,
"step": 18200
},
{
"epoch": 0.8377756482719443,
"grad_norm": 15.969688415527344,
"learning_rate": 5.450928381962865e-07,
"loss": 0.3795,
"step": 18250
},
{
"epoch": 0.8400709240206345,
"grad_norm": 10.289957046508789,
"learning_rate": 5.374413385023465e-07,
"loss": 0.3623,
"step": 18300
},
{
"epoch": 0.8423661997693248,
"grad_norm": 15.047087669372559,
"learning_rate": 5.297898388084065e-07,
"loss": 0.371,
"step": 18350
},
{
"epoch": 0.8446614755180151,
"grad_norm": 18.332683563232422,
"learning_rate": 5.221383391144664e-07,
"loss": 0.362,
"step": 18400
},
{
"epoch": 0.8469567512667053,
"grad_norm": 9.587491989135742,
"learning_rate": 5.144868394205265e-07,
"loss": 0.3656,
"step": 18450
},
{
"epoch": 0.8492520270153956,
"grad_norm": 15.642866134643555,
"learning_rate": 5.068353397265864e-07,
"loss": 0.3469,
"step": 18500
},
{
"epoch": 0.8515473027640859,
"grad_norm": 8.69747543334961,
"learning_rate": 4.991838400326464e-07,
"loss": 0.3594,
"step": 18550
},
{
"epoch": 0.8538425785127761,
"grad_norm": 11.755226135253906,
"learning_rate": 4.915323403387063e-07,
"loss": 0.3597,
"step": 18600
},
{
"epoch": 0.8561378542614664,
"grad_norm": 13.479146957397461,
"learning_rate": 4.838808406447664e-07,
"loss": 0.3545,
"step": 18650
},
{
"epoch": 0.8584331300101566,
"grad_norm": 12.416823387145996,
"learning_rate": 4.762293409508264e-07,
"loss": 0.3668,
"step": 18700
},
{
"epoch": 0.8607284057588469,
"grad_norm": 9.566291809082031,
"learning_rate": 4.687308712507652e-07,
"loss": 0.3612,
"step": 18750
},
{
"epoch": 0.8630236815075372,
"grad_norm": 15.225481986999512,
"learning_rate": 4.6107937155682516e-07,
"loss": 0.3604,
"step": 18800
},
{
"epoch": 0.8653189572562273,
"grad_norm": 42.362083435058594,
"learning_rate": 4.5342787186288513e-07,
"loss": 0.3607,
"step": 18850
},
{
"epoch": 0.8676142330049176,
"grad_norm": 29.728242874145508,
"learning_rate": 4.457763721689451e-07,
"loss": 0.3431,
"step": 18900
},
{
"epoch": 0.8699095087536078,
"grad_norm": 11.724947929382324,
"learning_rate": 4.3812487247500506e-07,
"loss": 0.3595,
"step": 18950
},
{
"epoch": 0.8722047845022981,
"grad_norm": 13.001766204833984,
"learning_rate": 4.3047337278106514e-07,
"loss": 0.3551,
"step": 19000
},
{
"epoch": 0.8745000602509884,
"grad_norm": 14.711641311645508,
"learning_rate": 4.228218730871251e-07,
"loss": 0.3598,
"step": 19050
},
{
"epoch": 0.8767953359996786,
"grad_norm": 23.77117347717285,
"learning_rate": 4.151703733931851e-07,
"loss": 0.3667,
"step": 19100
},
{
"epoch": 0.8790906117483689,
"grad_norm": 12.186518669128418,
"learning_rate": 4.0751887369924505e-07,
"loss": 0.3578,
"step": 19150
},
{
"epoch": 0.8813858874970592,
"grad_norm": 9.117465019226074,
"learning_rate": 3.99867374005305e-07,
"loss": 0.3634,
"step": 19200
},
{
"epoch": 0.8836811632457494,
"grad_norm": 7.487546443939209,
"learning_rate": 3.9221587431136504e-07,
"loss": 0.3485,
"step": 19250
},
{
"epoch": 0.8859764389944397,
"grad_norm": 9.04023265838623,
"learning_rate": 3.84564374617425e-07,
"loss": 0.3543,
"step": 19300
},
{
"epoch": 0.8882717147431299,
"grad_norm": 17.847694396972656,
"learning_rate": 3.76912874923485e-07,
"loss": 0.3514,
"step": 19350
},
{
"epoch": 0.8905669904918202,
"grad_norm": 27.124319076538086,
"learning_rate": 3.69261375229545e-07,
"loss": 0.3658,
"step": 19400
},
{
"epoch": 0.8928622662405105,
"grad_norm": 22.0621280670166,
"learning_rate": 3.6160987553560497e-07,
"loss": 0.351,
"step": 19450
},
{
"epoch": 0.8951575419892007,
"grad_norm": 11.973557472229004,
"learning_rate": 3.53958375841665e-07,
"loss": 0.3485,
"step": 19500
},
{
"epoch": 0.897452817737891,
"grad_norm": 15.552183151245117,
"learning_rate": 3.4630687614772496e-07,
"loss": 0.3482,
"step": 19550
},
{
"epoch": 0.8997480934865812,
"grad_norm": 37.236507415771484,
"learning_rate": 3.3865537645378493e-07,
"loss": 0.3493,
"step": 19600
},
{
"epoch": 0.9020433692352715,
"grad_norm": 12.226764678955078,
"learning_rate": 3.3115690675372375e-07,
"loss": 0.3582,
"step": 19650
},
{
"epoch": 0.9043386449839618,
"grad_norm": 9.519804000854492,
"learning_rate": 3.235054070597837e-07,
"loss": 0.3603,
"step": 19700
},
{
"epoch": 0.906633920732652,
"grad_norm": 11.330206871032715,
"learning_rate": 3.158539073658437e-07,
"loss": 0.3564,
"step": 19750
},
{
"epoch": 0.9089291964813423,
"grad_norm": 9.512042999267578,
"learning_rate": 3.082024076719037e-07,
"loss": 0.3578,
"step": 19800
},
{
"epoch": 0.9112244722300326,
"grad_norm": 27.564258575439453,
"learning_rate": 3.005509079779637e-07,
"loss": 0.348,
"step": 19850
},
{
"epoch": 0.9135197479787228,
"grad_norm": 9.83193302154541,
"learning_rate": 2.9289940828402365e-07,
"loss": 0.3635,
"step": 19900
},
{
"epoch": 0.9158150237274131,
"grad_norm": 21.163455963134766,
"learning_rate": 2.852479085900837e-07,
"loss": 0.3457,
"step": 19950
},
{
"epoch": 0.9181102994761033,
"grad_norm": 7.9303975105285645,
"learning_rate": 2.7759640889614364e-07,
"loss": 0.351,
"step": 20000
}
],
"logging_steps": 50,
"max_steps": 21783,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}