german-babylm-cds-char-sample3 / trainer_state.json
bbunzeck's picture
Upload 11 files
cda6ec4 verified
{
"best_metric": 1.1948587894439697,
"best_model_checkpoint": "/Users/bbunzeck/Documents/german-llamas/cxn-llamas/cds3-llama/checkpoint-6012",
"epoch": 0.9999812653390037,
"eval_steps": 668,
"global_step": 6672,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014987728797047418,
"grad_norm": 4.475478649139404,
"learning_rate": 1.4999999999999999e-05,
"loss": 4.4935,
"step": 10
},
{
"epoch": 0.0029975457594094835,
"grad_norm": 3.118635416030884,
"learning_rate": 2.9999999999999997e-05,
"loss": 4.159,
"step": 20
},
{
"epoch": 0.004496318639114225,
"grad_norm": 2.2133734226226807,
"learning_rate": 4.4999999999999996e-05,
"loss": 3.7894,
"step": 30
},
{
"epoch": 0.005995091518818967,
"grad_norm": 1.8941925764083862,
"learning_rate": 5.9999999999999995e-05,
"loss": 3.56,
"step": 40
},
{
"epoch": 0.007493864398523709,
"grad_norm": 1.5571386814117432,
"learning_rate": 7.5e-05,
"loss": 3.4336,
"step": 50
},
{
"epoch": 0.00899263727822845,
"grad_norm": 1.125948190689087,
"learning_rate": 8.999999999999999e-05,
"loss": 3.2953,
"step": 60
},
{
"epoch": 0.010491410157933192,
"grad_norm": 0.9792301058769226,
"learning_rate": 0.00010499999999999999,
"loss": 3.1549,
"step": 70
},
{
"epoch": 0.011990183037637934,
"grad_norm": 1.3225536346435547,
"learning_rate": 0.00011999999999999999,
"loss": 2.9355,
"step": 80
},
{
"epoch": 0.013488955917342676,
"grad_norm": 1.0157508850097656,
"learning_rate": 0.000135,
"loss": 2.7319,
"step": 90
},
{
"epoch": 0.014987728797047418,
"grad_norm": 1.0096309185028076,
"learning_rate": 0.00015,
"loss": 2.6215,
"step": 100
},
{
"epoch": 0.01648650167675216,
"grad_norm": 1.1225345134735107,
"learning_rate": 0.000165,
"loss": 2.5117,
"step": 110
},
{
"epoch": 0.0179852745564569,
"grad_norm": 1.4821795225143433,
"learning_rate": 0.00017999999999999998,
"loss": 2.448,
"step": 120
},
{
"epoch": 0.019484047436161642,
"grad_norm": 1.3336493968963623,
"learning_rate": 0.000195,
"loss": 2.3504,
"step": 130
},
{
"epoch": 0.020982820315866384,
"grad_norm": 1.2394251823425293,
"learning_rate": 0.00020999999999999998,
"loss": 2.2978,
"step": 140
},
{
"epoch": 0.022481593195571126,
"grad_norm": 1.6399582624435425,
"learning_rate": 0.000225,
"loss": 2.2439,
"step": 150
},
{
"epoch": 0.02398036607527587,
"grad_norm": 2.268030881881714,
"learning_rate": 0.00023999999999999998,
"loss": 2.1803,
"step": 160
},
{
"epoch": 0.02547913895498061,
"grad_norm": 2.153691291809082,
"learning_rate": 0.00025499999999999996,
"loss": 2.1265,
"step": 170
},
{
"epoch": 0.026977911834685352,
"grad_norm": 1.5142606496810913,
"learning_rate": 0.00027,
"loss": 2.0739,
"step": 180
},
{
"epoch": 0.028476684714390094,
"grad_norm": 1.531332015991211,
"learning_rate": 0.000285,
"loss": 2.0148,
"step": 190
},
{
"epoch": 0.029975457594094836,
"grad_norm": 1.551711916923523,
"learning_rate": 0.0003,
"loss": 1.9854,
"step": 200
},
{
"epoch": 0.031474230473799575,
"grad_norm": 1.8056877851486206,
"learning_rate": 0.0002999982328104334,
"loss": 1.9536,
"step": 210
},
{
"epoch": 0.03297300335350432,
"grad_norm": 1.5872079133987427,
"learning_rate": 0.00029999293128337313,
"loss": 1.9306,
"step": 220
},
{
"epoch": 0.03447177623320906,
"grad_norm": 1.6114915609359741,
"learning_rate": 0.00029998409554373644,
"loss": 1.8826,
"step": 230
},
{
"epoch": 0.0359705491129138,
"grad_norm": 1.188768744468689,
"learning_rate": 0.00029997172579971585,
"loss": 1.86,
"step": 240
},
{
"epoch": 0.03746932199261854,
"grad_norm": 1.5296919345855713,
"learning_rate": 0.0002999558223427737,
"loss": 1.8221,
"step": 250
},
{
"epoch": 0.038968094872323285,
"grad_norm": 1.196419596672058,
"learning_rate": 0.0002999363855476357,
"loss": 1.8225,
"step": 260
},
{
"epoch": 0.04046686775202803,
"grad_norm": 1.453387975692749,
"learning_rate": 0.0002999134158722818,
"loss": 1.7898,
"step": 270
},
{
"epoch": 0.04196564063173277,
"grad_norm": 1.1857067346572876,
"learning_rate": 0.00029988691385793553,
"loss": 1.7829,
"step": 280
},
{
"epoch": 0.04346441351143751,
"grad_norm": 1.294594645500183,
"learning_rate": 0.00029985688012905155,
"loss": 1.7712,
"step": 290
},
{
"epoch": 0.04496318639114225,
"grad_norm": 1.1869618892669678,
"learning_rate": 0.0002998233153933003,
"loss": 1.7575,
"step": 300
},
{
"epoch": 0.046461959270846995,
"grad_norm": 1.1660012006759644,
"learning_rate": 0.00029978622044155175,
"loss": 1.7293,
"step": 310
},
{
"epoch": 0.04796073215055174,
"grad_norm": 1.0317786931991577,
"learning_rate": 0.0002997455961478568,
"loss": 1.7288,
"step": 320
},
{
"epoch": 0.04945950503025648,
"grad_norm": 1.1452827453613281,
"learning_rate": 0.0002997014434694265,
"loss": 1.7044,
"step": 330
},
{
"epoch": 0.05095827790996122,
"grad_norm": 1.0942342281341553,
"learning_rate": 0.0002996537634466094,
"loss": 1.7116,
"step": 340
},
{
"epoch": 0.05245705078966596,
"grad_norm": 1.1483399868011475,
"learning_rate": 0.00029960255720286755,
"loss": 1.6756,
"step": 350
},
{
"epoch": 0.053955823669370705,
"grad_norm": 1.2330204248428345,
"learning_rate": 0.0002995478259447494,
"loss": 1.6669,
"step": 360
},
{
"epoch": 0.05545459654907545,
"grad_norm": 1.1838743686676025,
"learning_rate": 0.00029948957096186167,
"loss": 1.6474,
"step": 370
},
{
"epoch": 0.05695336942878019,
"grad_norm": 1.2730305194854736,
"learning_rate": 0.00029942779362683906,
"loss": 1.6409,
"step": 380
},
{
"epoch": 0.05845214230848493,
"grad_norm": 1.1066051721572876,
"learning_rate": 0.00029936249539531175,
"loss": 1.6216,
"step": 390
},
{
"epoch": 0.05995091518818967,
"grad_norm": 1.0314743518829346,
"learning_rate": 0.0002992936778058711,
"loss": 1.6263,
"step": 400
},
{
"epoch": 0.061449688067894415,
"grad_norm": 1.0880593061447144,
"learning_rate": 0.00029922134248003344,
"loss": 1.6176,
"step": 410
},
{
"epoch": 0.06294846094759915,
"grad_norm": 1.1548967361450195,
"learning_rate": 0.0002991454911222019,
"loss": 1.6251,
"step": 420
},
{
"epoch": 0.0644472338273039,
"grad_norm": 1.1637712717056274,
"learning_rate": 0.0002990661255196261,
"loss": 1.607,
"step": 430
},
{
"epoch": 0.06594600670700863,
"grad_norm": 1.0483553409576416,
"learning_rate": 0.00029898324754236037,
"loss": 1.6015,
"step": 440
},
{
"epoch": 0.06744477958671338,
"grad_norm": 1.0421444177627563,
"learning_rate": 0.00029889685914321923,
"loss": 1.5941,
"step": 450
},
{
"epoch": 0.06894355246641812,
"grad_norm": 1.091874361038208,
"learning_rate": 0.0002988069623577318,
"loss": 1.5884,
"step": 460
},
{
"epoch": 0.07044232534612287,
"grad_norm": 1.1074745655059814,
"learning_rate": 0.00029871355930409353,
"loss": 1.6035,
"step": 470
},
{
"epoch": 0.0719410982258276,
"grad_norm": 1.0914019346237183,
"learning_rate": 0.00029861665218311646,
"loss": 1.5799,
"step": 480
},
{
"epoch": 0.07343987110553235,
"grad_norm": 0.9614572525024414,
"learning_rate": 0.00029851624327817725,
"loss": 1.5599,
"step": 490
},
{
"epoch": 0.07493864398523709,
"grad_norm": 1.052742600440979,
"learning_rate": 0.0002984123349551635,
"loss": 1.5627,
"step": 500
},
{
"epoch": 0.07643741686494183,
"grad_norm": 1.032119631767273,
"learning_rate": 0.00029830492966241795,
"loss": 1.5547,
"step": 510
},
{
"epoch": 0.07793618974464657,
"grad_norm": 1.0201342105865479,
"learning_rate": 0.0002981940299306808,
"loss": 1.5422,
"step": 520
},
{
"epoch": 0.07943496262435132,
"grad_norm": 1.087877631187439,
"learning_rate": 0.00029807963837303003,
"loss": 1.5543,
"step": 530
},
{
"epoch": 0.08093373550405605,
"grad_norm": 1.0491427183151245,
"learning_rate": 0.00029796175768481974,
"loss": 1.5389,
"step": 540
},
{
"epoch": 0.0824325083837608,
"grad_norm": 0.9637844562530518,
"learning_rate": 0.0002978403906436171,
"loss": 1.5477,
"step": 550
},
{
"epoch": 0.08393128126346554,
"grad_norm": 0.9702052474021912,
"learning_rate": 0.0002977155401091362,
"loss": 1.5303,
"step": 560
},
{
"epoch": 0.08543005414317027,
"grad_norm": 0.9769977331161499,
"learning_rate": 0.0002975872090231713,
"loss": 1.535,
"step": 570
},
{
"epoch": 0.08692882702287502,
"grad_norm": 1.0341148376464844,
"learning_rate": 0.0002974554004095271,
"loss": 1.5356,
"step": 580
},
{
"epoch": 0.08842759990257976,
"grad_norm": 0.995041012763977,
"learning_rate": 0.00029732011737394775,
"loss": 1.5232,
"step": 590
},
{
"epoch": 0.0899263727822845,
"grad_norm": 1.0219428539276123,
"learning_rate": 0.0002971813631040434,
"loss": 1.5193,
"step": 600
},
{
"epoch": 0.09142514566198924,
"grad_norm": 1.099050521850586,
"learning_rate": 0.0002970391408692154,
"loss": 1.535,
"step": 610
},
{
"epoch": 0.09292391854169399,
"grad_norm": 1.247942566871643,
"learning_rate": 0.0002968934540205791,
"loss": 1.5252,
"step": 620
},
{
"epoch": 0.09442269142139872,
"grad_norm": 0.9947668313980103,
"learning_rate": 0.0002967443059908849,
"loss": 1.5118,
"step": 630
},
{
"epoch": 0.09592146430110347,
"grad_norm": 0.9500942230224609,
"learning_rate": 0.0002965917002944373,
"loss": 1.4768,
"step": 640
},
{
"epoch": 0.09742023718080821,
"grad_norm": 0.9748952984809875,
"learning_rate": 0.0002964356405270123,
"loss": 1.4791,
"step": 650
},
{
"epoch": 0.09891901006051296,
"grad_norm": 1.0195879936218262,
"learning_rate": 0.0002962761303657724,
"loss": 1.5021,
"step": 660
},
{
"epoch": 0.10011802836427675,
"eval_loss": 1.5316129922866821,
"eval_runtime": 34.726,
"eval_samples_per_second": 719.921,
"eval_steps_per_second": 89.99,
"step": 668
},
{
"epoch": 0.10041778294021769,
"grad_norm": 1.1510100364685059,
"learning_rate": 0.00029611317356918027,
"loss": 1.4856,
"step": 670
},
{
"epoch": 0.10191655581992244,
"grad_norm": 0.8920977711677551,
"learning_rate": 0.00029594677397690975,
"loss": 1.4896,
"step": 680
},
{
"epoch": 0.10341532869962718,
"grad_norm": 0.9080698490142822,
"learning_rate": 0.00029577693550975596,
"loss": 1.471,
"step": 690
},
{
"epoch": 0.10491410157933193,
"grad_norm": 0.903261661529541,
"learning_rate": 0.0002956036621695424,
"loss": 1.5153,
"step": 700
},
{
"epoch": 0.10641287445903666,
"grad_norm": 0.9598987698554993,
"learning_rate": 0.000295426958039027,
"loss": 1.478,
"step": 710
},
{
"epoch": 0.10791164733874141,
"grad_norm": 0.9518589973449707,
"learning_rate": 0.00029524682728180565,
"loss": 1.4713,
"step": 720
},
{
"epoch": 0.10941042021844614,
"grad_norm": 0.8936556577682495,
"learning_rate": 0.0002950632741422142,
"loss": 1.4658,
"step": 730
},
{
"epoch": 0.1109091930981509,
"grad_norm": 0.9834211468696594,
"learning_rate": 0.0002948763029452287,
"loss": 1.4611,
"step": 740
},
{
"epoch": 0.11240796597785563,
"grad_norm": 0.9097837805747986,
"learning_rate": 0.0002946859180963631,
"loss": 1.4696,
"step": 750
},
{
"epoch": 0.11390673885756038,
"grad_norm": 0.8856462836265564,
"learning_rate": 0.00029449212408156554,
"loss": 1.4676,
"step": 760
},
{
"epoch": 0.11540551173726511,
"grad_norm": 0.9628478288650513,
"learning_rate": 0.0002942949254671129,
"loss": 1.4582,
"step": 770
},
{
"epoch": 0.11690428461696986,
"grad_norm": 0.8835513591766357,
"learning_rate": 0.000294094326899503,
"loss": 1.4547,
"step": 780
},
{
"epoch": 0.1184030574966746,
"grad_norm": 0.9671273827552795,
"learning_rate": 0.00029389033310534517,
"loss": 1.4504,
"step": 790
},
{
"epoch": 0.11990183037637935,
"grad_norm": 0.9154276847839355,
"learning_rate": 0.00029368294889124864,
"loss": 1.4387,
"step": 800
},
{
"epoch": 0.12140060325608408,
"grad_norm": 0.8806156516075134,
"learning_rate": 0.0002934721791437098,
"loss": 1.4452,
"step": 810
},
{
"epoch": 0.12289937613578883,
"grad_norm": 0.8779545426368713,
"learning_rate": 0.0002932580288289966,
"loss": 1.4425,
"step": 820
},
{
"epoch": 0.12439814901549356,
"grad_norm": 0.9174330234527588,
"learning_rate": 0.0002930405029930317,
"loss": 1.4466,
"step": 830
},
{
"epoch": 0.1258969218951983,
"grad_norm": 0.904965341091156,
"learning_rate": 0.00029281960676127365,
"loss": 1.4289,
"step": 840
},
{
"epoch": 0.12739569477490303,
"grad_norm": 0.9320288300514221,
"learning_rate": 0.0002925953453385959,
"loss": 1.44,
"step": 850
},
{
"epoch": 0.1288944676546078,
"grad_norm": 0.8714675307273865,
"learning_rate": 0.00029236772400916455,
"loss": 1.4153,
"step": 860
},
{
"epoch": 0.13039324053431253,
"grad_norm": 0.9434481859207153,
"learning_rate": 0.0002921367481363134,
"loss": 1.421,
"step": 870
},
{
"epoch": 0.13189201341401727,
"grad_norm": 0.9329891800880432,
"learning_rate": 0.00029190242316241773,
"loss": 1.4459,
"step": 880
},
{
"epoch": 0.133390786293722,
"grad_norm": 0.9669409394264221,
"learning_rate": 0.0002916647546087663,
"loss": 1.4254,
"step": 890
},
{
"epoch": 0.13488955917342677,
"grad_norm": 0.9065726399421692,
"learning_rate": 0.00029142374807543083,
"loss": 1.4255,
"step": 900
},
{
"epoch": 0.1363883320531315,
"grad_norm": 0.9425346255302429,
"learning_rate": 0.0002911794092411345,
"loss": 1.4276,
"step": 910
},
{
"epoch": 0.13788710493283624,
"grad_norm": 0.9356669783592224,
"learning_rate": 0.0002909317438631179,
"loss": 1.4211,
"step": 920
},
{
"epoch": 0.13938587781254097,
"grad_norm": 0.9592692852020264,
"learning_rate": 0.0002906807577770031,
"loss": 1.4053,
"step": 930
},
{
"epoch": 0.14088465069224573,
"grad_norm": 0.8722407817840576,
"learning_rate": 0.0002904264568966569,
"loss": 1.406,
"step": 940
},
{
"epoch": 0.14238342357195047,
"grad_norm": 0.812244713306427,
"learning_rate": 0.0002901688472140507,
"loss": 1.4193,
"step": 950
},
{
"epoch": 0.1438821964516552,
"grad_norm": 1.0006746053695679,
"learning_rate": 0.00028990793479911973,
"loss": 1.4092,
"step": 960
},
{
"epoch": 0.14538096933135994,
"grad_norm": 0.9519333243370056,
"learning_rate": 0.00028964372579961997,
"loss": 1.4055,
"step": 970
},
{
"epoch": 0.1468797422110647,
"grad_norm": 0.8306151628494263,
"learning_rate": 0.0002893762264409832,
"loss": 1.3887,
"step": 980
},
{
"epoch": 0.14837851509076944,
"grad_norm": 0.9266255497932434,
"learning_rate": 0.00028910544302617055,
"loss": 1.4288,
"step": 990
},
{
"epoch": 0.14987728797047417,
"grad_norm": 0.890106201171875,
"learning_rate": 0.0002888313819355236,
"loss": 1.4126,
"step": 1000
},
{
"epoch": 0.1513760608501789,
"grad_norm": 0.8950091600418091,
"learning_rate": 0.0002885540496266144,
"loss": 1.4022,
"step": 1010
},
{
"epoch": 0.15287483372988367,
"grad_norm": 0.8897644877433777,
"learning_rate": 0.00028827345263409304,
"loss": 1.4005,
"step": 1020
},
{
"epoch": 0.1543736066095884,
"grad_norm": 0.8642116189002991,
"learning_rate": 0.000287989597569534,
"loss": 1.371,
"step": 1030
},
{
"epoch": 0.15587237948929314,
"grad_norm": 0.9173794984817505,
"learning_rate": 0.00028770249112128,
"loss": 1.3888,
"step": 1040
},
{
"epoch": 0.15737115236899787,
"grad_norm": 0.9254131317138672,
"learning_rate": 0.0002874121400542846,
"loss": 1.3762,
"step": 1050
},
{
"epoch": 0.15886992524870264,
"grad_norm": 0.8322397470474243,
"learning_rate": 0.00028711855120995284,
"loss": 1.4087,
"step": 1060
},
{
"epoch": 0.16036869812840737,
"grad_norm": 0.8882824182510376,
"learning_rate": 0.00028682173150598007,
"loss": 1.4032,
"step": 1070
},
{
"epoch": 0.1618674710081121,
"grad_norm": 0.8823337554931641,
"learning_rate": 0.00028652168793618857,
"loss": 1.3995,
"step": 1080
},
{
"epoch": 0.16336624388781684,
"grad_norm": 1.0256520509719849,
"learning_rate": 0.0002862184275703633,
"loss": 1.3702,
"step": 1090
},
{
"epoch": 0.1648650167675216,
"grad_norm": 0.8852070569992065,
"learning_rate": 0.00028591195755408504,
"loss": 1.3974,
"step": 1100
},
{
"epoch": 0.16636378964722634,
"grad_norm": 0.9529812335968018,
"learning_rate": 0.00028560228510856185,
"loss": 1.3999,
"step": 1110
},
{
"epoch": 0.16786256252693108,
"grad_norm": 0.9794729948043823,
"learning_rate": 0.0002852894175304594,
"loss": 1.3841,
"step": 1120
},
{
"epoch": 0.1693613354066358,
"grad_norm": 0.8799359798431396,
"learning_rate": 0.00028497336219172854,
"loss": 1.38,
"step": 1130
},
{
"epoch": 0.17086010828634055,
"grad_norm": 0.8865130543708801,
"learning_rate": 0.00028465412653943194,
"loss": 1.3698,
"step": 1140
},
{
"epoch": 0.1723588811660453,
"grad_norm": 0.8830373883247375,
"learning_rate": 0.00028433171809556844,
"loss": 1.3813,
"step": 1150
},
{
"epoch": 0.17385765404575004,
"grad_norm": 0.9166454672813416,
"learning_rate": 0.00028400614445689583,
"loss": 1.3681,
"step": 1160
},
{
"epoch": 0.17535642692545478,
"grad_norm": 0.9003053307533264,
"learning_rate": 0.000283677413294752,
"loss": 1.3547,
"step": 1170
},
{
"epoch": 0.1768551998051595,
"grad_norm": 0.8830690979957581,
"learning_rate": 0.0002833455323548741,
"loss": 1.3778,
"step": 1180
},
{
"epoch": 0.17835397268486428,
"grad_norm": 0.8167861104011536,
"learning_rate": 0.00028301050945721577,
"loss": 1.3754,
"step": 1190
},
{
"epoch": 0.179852745564569,
"grad_norm": 0.8883965015411377,
"learning_rate": 0.00028267235249576335,
"loss": 1.369,
"step": 1200
},
{
"epoch": 0.18135151844427375,
"grad_norm": 0.8578314185142517,
"learning_rate": 0.00028233106943834947,
"loss": 1.3535,
"step": 1210
},
{
"epoch": 0.18285029132397848,
"grad_norm": 0.9007347822189331,
"learning_rate": 0.0002819866683264657,
"loss": 1.3629,
"step": 1220
},
{
"epoch": 0.18434906420368324,
"grad_norm": 0.8820904493331909,
"learning_rate": 0.00028163915727507266,
"loss": 1.3603,
"step": 1230
},
{
"epoch": 0.18584783708338798,
"grad_norm": 0.8988449573516846,
"learning_rate": 0.00028128854447240903,
"loss": 1.363,
"step": 1240
},
{
"epoch": 0.18734660996309271,
"grad_norm": 0.9054597616195679,
"learning_rate": 0.0002809348381797988,
"loss": 1.3555,
"step": 1250
},
{
"epoch": 0.18884538284279745,
"grad_norm": 0.901901125907898,
"learning_rate": 0.000280578046731456,
"loss": 1.3636,
"step": 1260
},
{
"epoch": 0.1903441557225022,
"grad_norm": 0.8617561459541321,
"learning_rate": 0.0002802181785342892,
"loss": 1.3747,
"step": 1270
},
{
"epoch": 0.19184292860220695,
"grad_norm": 0.9050679802894592,
"learning_rate": 0.0002798552420677025,
"loss": 1.358,
"step": 1280
},
{
"epoch": 0.19334170148191168,
"grad_norm": 0.8624427318572998,
"learning_rate": 0.00027948924588339655,
"loss": 1.3676,
"step": 1290
},
{
"epoch": 0.19484047436161642,
"grad_norm": 0.9002446532249451,
"learning_rate": 0.00027912019860516644,
"loss": 1.3553,
"step": 1300
},
{
"epoch": 0.19633924724132118,
"grad_norm": 0.8707396388053894,
"learning_rate": 0.0002787481089286989,
"loss": 1.3593,
"step": 1310
},
{
"epoch": 0.19783802012102591,
"grad_norm": 0.8842359781265259,
"learning_rate": 0.0002783729856213671,
"loss": 1.3466,
"step": 1320
},
{
"epoch": 0.19933679300073065,
"grad_norm": 0.9021062850952148,
"learning_rate": 0.00027799483752202444,
"loss": 1.3603,
"step": 1330
},
{
"epoch": 0.2002360567285535,
"eval_loss": 1.3936063051223755,
"eval_runtime": 34.728,
"eval_samples_per_second": 719.881,
"eval_steps_per_second": 89.985,
"step": 1336
},
{
"epoch": 0.20083556588043538,
"grad_norm": 0.8968262672424316,
"learning_rate": 0.00027761367354079574,
"loss": 1.3579,
"step": 1340
},
{
"epoch": 0.20233433876014015,
"grad_norm": 0.8595730662345886,
"learning_rate": 0.00027722950265886796,
"loss": 1.3514,
"step": 1350
},
{
"epoch": 0.20383311163984488,
"grad_norm": 0.8064100742340088,
"learning_rate": 0.00027684233392827806,
"loss": 1.3599,
"step": 1360
},
{
"epoch": 0.20533188451954962,
"grad_norm": 0.9714745879173279,
"learning_rate": 0.0002764521764716999,
"loss": 1.3328,
"step": 1370
},
{
"epoch": 0.20683065739925435,
"grad_norm": 0.8713521957397461,
"learning_rate": 0.0002760590394822293,
"loss": 1.3375,
"step": 1380
},
{
"epoch": 0.20832943027895912,
"grad_norm": 0.9022751450538635,
"learning_rate": 0.00027566293222316734,
"loss": 1.34,
"step": 1390
},
{
"epoch": 0.20982820315866385,
"grad_norm": 0.8861002326011658,
"learning_rate": 0.0002752638640278024,
"loss": 1.3429,
"step": 1400
},
{
"epoch": 0.21132697603836859,
"grad_norm": 0.908525288105011,
"learning_rate": 0.0002748618442991897,
"loss": 1.3416,
"step": 1410
},
{
"epoch": 0.21282574891807332,
"grad_norm": 0.8318501114845276,
"learning_rate": 0.0002744568825099302,
"loss": 1.3453,
"step": 1420
},
{
"epoch": 0.21432452179777806,
"grad_norm": 0.8497442603111267,
"learning_rate": 0.00027404898820194724,
"loss": 1.3434,
"step": 1430
},
{
"epoch": 0.21582329467748282,
"grad_norm": 0.8935710191726685,
"learning_rate": 0.00027363817098626165,
"loss": 1.3443,
"step": 1440
},
{
"epoch": 0.21732206755718755,
"grad_norm": 0.8185181617736816,
"learning_rate": 0.00027322444054276543,
"loss": 1.345,
"step": 1450
},
{
"epoch": 0.2188208404368923,
"grad_norm": 0.8508570194244385,
"learning_rate": 0.00027280780661999353,
"loss": 1.3476,
"step": 1460
},
{
"epoch": 0.22031961331659702,
"grad_norm": 0.9571162462234497,
"learning_rate": 0.00027238827903489424,
"loss": 1.3536,
"step": 1470
},
{
"epoch": 0.2218183861963018,
"grad_norm": 0.8471242785453796,
"learning_rate": 0.0002719658676725979,
"loss": 1.3338,
"step": 1480
},
{
"epoch": 0.22331715907600652,
"grad_norm": 0.817711353302002,
"learning_rate": 0.00027154058248618376,
"loss": 1.3425,
"step": 1490
},
{
"epoch": 0.22481593195571126,
"grad_norm": 0.8738152980804443,
"learning_rate": 0.00027111243349644583,
"loss": 1.365,
"step": 1500
},
{
"epoch": 0.226314704835416,
"grad_norm": 0.8730055689811707,
"learning_rate": 0.0002706814307916565,
"loss": 1.3256,
"step": 1510
},
{
"epoch": 0.22781347771512075,
"grad_norm": 0.8565317392349243,
"learning_rate": 0.00027024758452732876,
"loss": 1.3305,
"step": 1520
},
{
"epoch": 0.2293122505948255,
"grad_norm": 0.8381545543670654,
"learning_rate": 0.0002698109049259773,
"loss": 1.3436,
"step": 1530
},
{
"epoch": 0.23081102347453022,
"grad_norm": 0.8256663680076599,
"learning_rate": 0.0002693714022768772,
"loss": 1.3357,
"step": 1540
},
{
"epoch": 0.23230979635423496,
"grad_norm": 0.8698524832725525,
"learning_rate": 0.00026892908693582166,
"loss": 1.3357,
"step": 1550
},
{
"epoch": 0.23380856923393972,
"grad_norm": 0.8229045271873474,
"learning_rate": 0.00026848396932487826,
"loss": 1.333,
"step": 1560
},
{
"epoch": 0.23530734211364446,
"grad_norm": 0.793002188205719,
"learning_rate": 0.00026803605993214283,
"loss": 1.3492,
"step": 1570
},
{
"epoch": 0.2368061149933492,
"grad_norm": 0.862382709980011,
"learning_rate": 0.0002675853693114929,
"loss": 1.3286,
"step": 1580
},
{
"epoch": 0.23830488787305393,
"grad_norm": 0.8608536124229431,
"learning_rate": 0.00026713190808233853,
"loss": 1.3212,
"step": 1590
},
{
"epoch": 0.2398036607527587,
"grad_norm": 0.7945990562438965,
"learning_rate": 0.00026667568692937245,
"loss": 1.3226,
"step": 1600
},
{
"epoch": 0.24130243363246343,
"grad_norm": 0.7914056777954102,
"learning_rate": 0.0002662167166023182,
"loss": 1.3323,
"step": 1610
},
{
"epoch": 0.24280120651216816,
"grad_norm": 0.8237165212631226,
"learning_rate": 0.0002657550079156767,
"loss": 1.3586,
"step": 1620
},
{
"epoch": 0.2442999793918729,
"grad_norm": 0.8703092932701111,
"learning_rate": 0.0002652905717484716,
"loss": 1.323,
"step": 1630
},
{
"epoch": 0.24579875227157766,
"grad_norm": 0.8693041205406189,
"learning_rate": 0.0002648234190439929,
"loss": 1.3182,
"step": 1640
},
{
"epoch": 0.2472975251512824,
"grad_norm": 0.7807459831237793,
"learning_rate": 0.00026435356080953916,
"loss": 1.3429,
"step": 1650
},
{
"epoch": 0.24879629803098713,
"grad_norm": 0.810480535030365,
"learning_rate": 0.00026388100811615785,
"loss": 1.3036,
"step": 1660
},
{
"epoch": 0.25029507091069186,
"grad_norm": 0.8039631247520447,
"learning_rate": 0.0002634057720983849,
"loss": 1.2937,
"step": 1670
},
{
"epoch": 0.2517938437903966,
"grad_norm": 0.9298089146614075,
"learning_rate": 0.000262927863953982,
"loss": 1.3453,
"step": 1680
},
{
"epoch": 0.25329261667010133,
"grad_norm": 0.8398979902267456,
"learning_rate": 0.00026244729494367307,
"loss": 1.3188,
"step": 1690
},
{
"epoch": 0.25479138954980607,
"grad_norm": 0.8078634142875671,
"learning_rate": 0.0002619640763908786,
"loss": 1.3268,
"step": 1700
},
{
"epoch": 0.25629016242951086,
"grad_norm": 0.7533845901489258,
"learning_rate": 0.000261478219681449,
"loss": 1.3005,
"step": 1710
},
{
"epoch": 0.2577889353092156,
"grad_norm": 0.852730929851532,
"learning_rate": 0.00026098973626339654,
"loss": 1.3106,
"step": 1720
},
{
"epoch": 0.25928770818892033,
"grad_norm": 0.9218925833702087,
"learning_rate": 0.0002604986376466251,
"loss": 1.3169,
"step": 1730
},
{
"epoch": 0.26078648106862506,
"grad_norm": 0.858396589756012,
"learning_rate": 0.00026000493540265934,
"loss": 1.3261,
"step": 1740
},
{
"epoch": 0.2622852539483298,
"grad_norm": 0.8293901085853577,
"learning_rate": 0.000259508641164372,
"loss": 1.3184,
"step": 1750
},
{
"epoch": 0.26378402682803453,
"grad_norm": 0.7981786131858826,
"learning_rate": 0.0002590097666257099,
"loss": 1.2922,
"step": 1760
},
{
"epoch": 0.26528279970773927,
"grad_norm": 0.881025493144989,
"learning_rate": 0.00025850832354141784,
"loss": 1.3039,
"step": 1770
},
{
"epoch": 0.266781572587444,
"grad_norm": 0.8400424718856812,
"learning_rate": 0.0002580043237267625,
"loss": 1.3208,
"step": 1780
},
{
"epoch": 0.2682803454671488,
"grad_norm": 0.8640297055244446,
"learning_rate": 0.00025749777905725336,
"loss": 1.2938,
"step": 1790
},
{
"epoch": 0.26977911834685353,
"grad_norm": 0.820582389831543,
"learning_rate": 0.00025698870146836315,
"loss": 1.3163,
"step": 1800
},
{
"epoch": 0.27127789122655827,
"grad_norm": 0.8231412172317505,
"learning_rate": 0.00025647710295524656,
"loss": 1.2841,
"step": 1810
},
{
"epoch": 0.272776664106263,
"grad_norm": 0.8273252248764038,
"learning_rate": 0.00025596299557245774,
"loss": 1.2995,
"step": 1820
},
{
"epoch": 0.27427543698596774,
"grad_norm": 0.9586191177368164,
"learning_rate": 0.0002554463914336659,
"loss": 1.3058,
"step": 1830
},
{
"epoch": 0.27577420986567247,
"grad_norm": 0.839583158493042,
"learning_rate": 0.0002549273027113704,
"loss": 1.3043,
"step": 1840
},
{
"epoch": 0.2772729827453772,
"grad_norm": 0.8683123588562012,
"learning_rate": 0.00025440574163661364,
"loss": 1.2954,
"step": 1850
},
{
"epoch": 0.27877175562508194,
"grad_norm": 0.7915263175964355,
"learning_rate": 0.0002538817204986926,
"loss": 1.3061,
"step": 1860
},
{
"epoch": 0.28027052850478673,
"grad_norm": 0.7897734045982361,
"learning_rate": 0.00025335525164487,
"loss": 1.3015,
"step": 1870
},
{
"epoch": 0.28176930138449147,
"grad_norm": 0.8129907250404358,
"learning_rate": 0.0002528263474800826,
"loss": 1.3008,
"step": 1880
},
{
"epoch": 0.2832680742641962,
"grad_norm": 0.8599696159362793,
"learning_rate": 0.0002522950204666494,
"loss": 1.2996,
"step": 1890
},
{
"epoch": 0.28476684714390094,
"grad_norm": 0.79160475730896,
"learning_rate": 0.00025176128312397774,
"loss": 1.2994,
"step": 1900
},
{
"epoch": 0.28626562002360567,
"grad_norm": 0.9080151915550232,
"learning_rate": 0.0002512251480282685,
"loss": 1.2902,
"step": 1910
},
{
"epoch": 0.2877643929033104,
"grad_norm": 0.8382280468940735,
"learning_rate": 0.00025068662781221966,
"loss": 1.2938,
"step": 1920
},
{
"epoch": 0.28926316578301514,
"grad_norm": 0.8656803369522095,
"learning_rate": 0.00025014573516472864,
"loss": 1.292,
"step": 1930
},
{
"epoch": 0.2907619386627199,
"grad_norm": 0.8192686438560486,
"learning_rate": 0.0002496024828305933,
"loss": 1.3002,
"step": 1940
},
{
"epoch": 0.29226071154242467,
"grad_norm": 0.7879363894462585,
"learning_rate": 0.0002490568836102118,
"loss": 1.3009,
"step": 1950
},
{
"epoch": 0.2937594844221294,
"grad_norm": 0.8016439080238342,
"learning_rate": 0.0002485089503592808,
"loss": 1.2998,
"step": 1960
},
{
"epoch": 0.29525825730183414,
"grad_norm": 0.8002453446388245,
"learning_rate": 0.0002479586959884926,
"loss": 1.291,
"step": 1970
},
{
"epoch": 0.2967570301815389,
"grad_norm": 0.8366382718086243,
"learning_rate": 0.00024740613346323095,
"loss": 1.3098,
"step": 1980
},
{
"epoch": 0.2982558030612436,
"grad_norm": 0.8672970533370972,
"learning_rate": 0.0002468512758032656,
"loss": 1.2958,
"step": 1990
},
{
"epoch": 0.29975457594094834,
"grad_norm": 0.8241642713546753,
"learning_rate": 0.0002462941360824454,
"loss": 1.3034,
"step": 2000
},
{
"epoch": 0.30035408509283024,
"eval_loss": 1.3332195281982422,
"eval_runtime": 34.8158,
"eval_samples_per_second": 718.065,
"eval_steps_per_second": 89.758,
"step": 2004
},
{
"epoch": 0.3012533488206531,
"grad_norm": 0.874294638633728,
"learning_rate": 0.00024573472742839053,
"loss": 1.2974,
"step": 2010
},
{
"epoch": 0.3027521217003578,
"grad_norm": 0.788142204284668,
"learning_rate": 0.0002451730630221827,
"loss": 1.3018,
"step": 2020
},
{
"epoch": 0.30425089458006255,
"grad_norm": 0.8617194294929504,
"learning_rate": 0.0002446091560980549,
"loss": 1.2913,
"step": 2030
},
{
"epoch": 0.30574966745976734,
"grad_norm": 0.7948266267776489,
"learning_rate": 0.00024404301994307968,
"loss": 1.2885,
"step": 2040
},
{
"epoch": 0.3072484403394721,
"grad_norm": 0.8624528050422668,
"learning_rate": 0.00024347466789685575,
"loss": 1.2823,
"step": 2050
},
{
"epoch": 0.3087472132191768,
"grad_norm": 0.8431428670883179,
"learning_rate": 0.00024290411335119386,
"loss": 1.2784,
"step": 2060
},
{
"epoch": 0.31024598609888154,
"grad_norm": 0.7940576672554016,
"learning_rate": 0.0002423313697498012,
"loss": 1.3008,
"step": 2070
},
{
"epoch": 0.3117447589785863,
"grad_norm": 0.7964574098587036,
"learning_rate": 0.0002417564505879647,
"loss": 1.292,
"step": 2080
},
{
"epoch": 0.313243531858291,
"grad_norm": 0.7948952317237854,
"learning_rate": 0.00024117936941223293,
"loss": 1.2897,
"step": 2090
},
{
"epoch": 0.31474230473799575,
"grad_norm": 0.7771734595298767,
"learning_rate": 0.00024060013982009695,
"loss": 1.2737,
"step": 2100
},
{
"epoch": 0.3162410776177005,
"grad_norm": 0.7866066098213196,
"learning_rate": 0.00024001877545967005,
"loss": 1.2908,
"step": 2110
},
{
"epoch": 0.3177398504974053,
"grad_norm": 0.8679277300834656,
"learning_rate": 0.00023943529002936595,
"loss": 1.2916,
"step": 2120
},
{
"epoch": 0.31923862337711,
"grad_norm": 0.7685953974723816,
"learning_rate": 0.0002388496972775762,
"loss": 1.2669,
"step": 2130
},
{
"epoch": 0.32073739625681474,
"grad_norm": 0.8774948716163635,
"learning_rate": 0.00023826201100234613,
"loss": 1.2834,
"step": 2140
},
{
"epoch": 0.3222361691365195,
"grad_norm": 0.7911577820777893,
"learning_rate": 0.00023767224505104984,
"loss": 1.2833,
"step": 2150
},
{
"epoch": 0.3237349420162242,
"grad_norm": 0.8205265402793884,
"learning_rate": 0.00023708041332006375,
"loss": 1.2902,
"step": 2160
},
{
"epoch": 0.32523371489592895,
"grad_norm": 0.7738403677940369,
"learning_rate": 0.00023648652975443937,
"loss": 1.2879,
"step": 2170
},
{
"epoch": 0.3267324877756337,
"grad_norm": 0.9220579862594604,
"learning_rate": 0.00023589060834757454,
"loss": 1.2841,
"step": 2180
},
{
"epoch": 0.3282312606553384,
"grad_norm": 0.7917136549949646,
"learning_rate": 0.00023529266314088388,
"loss": 1.2795,
"step": 2190
},
{
"epoch": 0.3297300335350432,
"grad_norm": 0.8733565211296082,
"learning_rate": 0.00023469270822346774,
"loss": 1.2896,
"step": 2200
},
{
"epoch": 0.33122880641474794,
"grad_norm": 0.874913215637207,
"learning_rate": 0.00023409075773178045,
"loss": 1.2607,
"step": 2210
},
{
"epoch": 0.3327275792944527,
"grad_norm": 0.7840304970741272,
"learning_rate": 0.00023348682584929702,
"loss": 1.2737,
"step": 2220
},
{
"epoch": 0.3342263521741574,
"grad_norm": 0.8352317810058594,
"learning_rate": 0.00023288092680617912,
"loss": 1.2804,
"step": 2230
},
{
"epoch": 0.33572512505386215,
"grad_norm": 0.8565024137496948,
"learning_rate": 0.00023227307487893957,
"loss": 1.2931,
"step": 2240
},
{
"epoch": 0.3372238979335669,
"grad_norm": 0.8791518211364746,
"learning_rate": 0.00023166328439010625,
"loss": 1.282,
"step": 2250
},
{
"epoch": 0.3387226708132716,
"grad_norm": 0.8080510497093201,
"learning_rate": 0.00023105156970788424,
"loss": 1.2986,
"step": 2260
},
{
"epoch": 0.34022144369297636,
"grad_norm": 0.8406448364257812,
"learning_rate": 0.0002304379452458177,
"loss": 1.2699,
"step": 2270
},
{
"epoch": 0.3417202165726811,
"grad_norm": 0.8735692501068115,
"learning_rate": 0.00022982242546244985,
"loss": 1.2762,
"step": 2280
},
{
"epoch": 0.3432189894523859,
"grad_norm": 0.8007855415344238,
"learning_rate": 0.00022920502486098262,
"loss": 1.27,
"step": 2290
},
{
"epoch": 0.3447177623320906,
"grad_norm": 0.7858870625495911,
"learning_rate": 0.0002285857579889346,
"loss": 1.2785,
"step": 2300
},
{
"epoch": 0.34621653521179535,
"grad_norm": 0.8488030433654785,
"learning_rate": 0.00022796463943779862,
"loss": 1.2687,
"step": 2310
},
{
"epoch": 0.3477153080915001,
"grad_norm": 0.8431515693664551,
"learning_rate": 0.0002273416838426976,
"loss": 1.2522,
"step": 2320
},
{
"epoch": 0.3492140809712048,
"grad_norm": 0.8310567140579224,
"learning_rate": 0.00022671690588203994,
"loss": 1.2826,
"step": 2330
},
{
"epoch": 0.35071285385090956,
"grad_norm": 0.7767550945281982,
"learning_rate": 0.00022609032027717357,
"loss": 1.2721,
"step": 2340
},
{
"epoch": 0.3522116267306143,
"grad_norm": 0.8902162909507751,
"learning_rate": 0.00022546194179203904,
"loss": 1.2795,
"step": 2350
},
{
"epoch": 0.353710399610319,
"grad_norm": 0.7950676083564758,
"learning_rate": 0.0002248317852328217,
"loss": 1.2703,
"step": 2360
},
{
"epoch": 0.3552091724900238,
"grad_norm": 0.8022187352180481,
"learning_rate": 0.00022419986544760284,
"loss": 1.2626,
"step": 2370
},
{
"epoch": 0.35670794536972855,
"grad_norm": 0.7801185846328735,
"learning_rate": 0.00022356619732600988,
"loss": 1.2781,
"step": 2380
},
{
"epoch": 0.3582067182494333,
"grad_norm": 0.7817099094390869,
"learning_rate": 0.0002229307957988653,
"loss": 1.2757,
"step": 2390
},
{
"epoch": 0.359705491129138,
"grad_norm": 0.8795964121818542,
"learning_rate": 0.0002222936758378352,
"loss": 1.2573,
"step": 2400
},
{
"epoch": 0.36120426400884276,
"grad_norm": 0.8010522723197937,
"learning_rate": 0.0002216548524550761,
"loss": 1.2747,
"step": 2410
},
{
"epoch": 0.3627030368885475,
"grad_norm": 0.8054040670394897,
"learning_rate": 0.0002210143407028817,
"loss": 1.2608,
"step": 2420
},
{
"epoch": 0.3642018097682522,
"grad_norm": 0.826788067817688,
"learning_rate": 0.00022037215567332767,
"loss": 1.2691,
"step": 2430
},
{
"epoch": 0.36570058264795696,
"grad_norm": 0.7943429350852966,
"learning_rate": 0.00021972831249791652,
"loss": 1.2683,
"step": 2440
},
{
"epoch": 0.36719935552766175,
"grad_norm": 0.7942007780075073,
"learning_rate": 0.00021908282634722082,
"loss": 1.2685,
"step": 2450
},
{
"epoch": 0.3686981284073665,
"grad_norm": 0.8077208995819092,
"learning_rate": 0.00021843571243052577,
"loss": 1.2548,
"step": 2460
},
{
"epoch": 0.3701969012870712,
"grad_norm": 0.8264975547790527,
"learning_rate": 0.00021778698599547088,
"loss": 1.2642,
"step": 2470
},
{
"epoch": 0.37169567416677596,
"grad_norm": 0.8238746523857117,
"learning_rate": 0.00021713666232769067,
"loss": 1.2619,
"step": 2480
},
{
"epoch": 0.3731944470464807,
"grad_norm": 0.8039039373397827,
"learning_rate": 0.00021648475675045445,
"loss": 1.2541,
"step": 2490
},
{
"epoch": 0.37469321992618543,
"grad_norm": 0.7852529883384705,
"learning_rate": 0.00021583128462430529,
"loss": 1.2595,
"step": 2500
},
{
"epoch": 0.37619199280589016,
"grad_norm": 0.7786864638328552,
"learning_rate": 0.00021517626134669824,
"loss": 1.264,
"step": 2510
},
{
"epoch": 0.3776907656855949,
"grad_norm": 0.7981049418449402,
"learning_rate": 0.0002145197023516374,
"loss": 1.2687,
"step": 2520
},
{
"epoch": 0.3791895385652997,
"grad_norm": 0.8835734128952026,
"learning_rate": 0.000213861623109312,
"loss": 1.2732,
"step": 2530
},
{
"epoch": 0.3806883114450044,
"grad_norm": 0.7418168187141418,
"learning_rate": 0.00021320203912573245,
"loss": 1.248,
"step": 2540
},
{
"epoch": 0.38218708432470916,
"grad_norm": 0.7426604628562927,
"learning_rate": 0.00021254096594236447,
"loss": 1.2472,
"step": 2550
},
{
"epoch": 0.3836858572044139,
"grad_norm": 0.7566102743148804,
"learning_rate": 0.00021187841913576324,
"loss": 1.2612,
"step": 2560
},
{
"epoch": 0.38518463008411863,
"grad_norm": 0.7827373743057251,
"learning_rate": 0.00021121441431720607,
"loss": 1.261,
"step": 2570
},
{
"epoch": 0.38668340296382336,
"grad_norm": 0.8316198587417603,
"learning_rate": 0.00021054896713232482,
"loss": 1.2649,
"step": 2580
},
{
"epoch": 0.3881821758435281,
"grad_norm": 0.7967325448989868,
"learning_rate": 0.00020988209326073713,
"loss": 1.2588,
"step": 2590
},
{
"epoch": 0.38968094872323283,
"grad_norm": 0.7973827719688416,
"learning_rate": 0.00020921380841567702,
"loss": 1.2534,
"step": 2600
},
{
"epoch": 0.39117972160293757,
"grad_norm": 0.8205732703208923,
"learning_rate": 0.00020854412834362445,
"loss": 1.2608,
"step": 2610
},
{
"epoch": 0.39267849448264236,
"grad_norm": 0.7526434063911438,
"learning_rate": 0.00020787306882393464,
"loss": 1.2517,
"step": 2620
},
{
"epoch": 0.3941772673623471,
"grad_norm": 0.8194977045059204,
"learning_rate": 0.00020720064566846603,
"loss": 1.2303,
"step": 2630
},
{
"epoch": 0.39567604024205183,
"grad_norm": 0.790014386177063,
"learning_rate": 0.0002065268747212077,
"loss": 1.2579,
"step": 2640
},
{
"epoch": 0.39717481312175656,
"grad_norm": 0.8768025040626526,
"learning_rate": 0.00020585177185790618,
"loss": 1.2728,
"step": 2650
},
{
"epoch": 0.3986735860014613,
"grad_norm": 0.7993418574333191,
"learning_rate": 0.00020517535298569134,
"loss": 1.2738,
"step": 2660
},
{
"epoch": 0.40017235888116603,
"grad_norm": 0.8214737176895142,
"learning_rate": 0.00020449763404270136,
"loss": 1.2519,
"step": 2670
},
{
"epoch": 0.400472113457107,
"eval_loss": 1.2892086505889893,
"eval_runtime": 34.8937,
"eval_samples_per_second": 716.462,
"eval_steps_per_second": 89.558,
"step": 2672
},
{
"epoch": 0.40167113176087077,
"grad_norm": 0.8561115264892578,
"learning_rate": 0.00020381863099770768,
"loss": 1.2384,
"step": 2680
},
{
"epoch": 0.4031699046405755,
"grad_norm": 0.7806345224380493,
"learning_rate": 0.00020313835984973815,
"loss": 1.2698,
"step": 2690
},
{
"epoch": 0.4046686775202803,
"grad_norm": 0.8666788935661316,
"learning_rate": 0.00020245683662770047,
"loss": 1.2461,
"step": 2700
},
{
"epoch": 0.40616745039998503,
"grad_norm": 0.8963791728019714,
"learning_rate": 0.0002017740773900043,
"loss": 1.2401,
"step": 2710
},
{
"epoch": 0.40766622327968977,
"grad_norm": 0.8115370273590088,
"learning_rate": 0.00020109009822418311,
"loss": 1.252,
"step": 2720
},
{
"epoch": 0.4091649961593945,
"grad_norm": 0.7880274653434753,
"learning_rate": 0.0002004049152465147,
"loss": 1.2622,
"step": 2730
},
{
"epoch": 0.41066376903909924,
"grad_norm": 0.8020321726799011,
"learning_rate": 0.0001997185446016419,
"loss": 1.2632,
"step": 2740
},
{
"epoch": 0.41216254191880397,
"grad_norm": 0.8517338037490845,
"learning_rate": 0.00019903100246219198,
"loss": 1.2514,
"step": 2750
},
{
"epoch": 0.4136613147985087,
"grad_norm": 0.7712230682373047,
"learning_rate": 0.00019834230502839548,
"loss": 1.2572,
"step": 2760
},
{
"epoch": 0.41516008767821344,
"grad_norm": 0.7825449705123901,
"learning_rate": 0.0001976524685277047,
"loss": 1.2391,
"step": 2770
},
{
"epoch": 0.41665886055791823,
"grad_norm": 0.8370509147644043,
"learning_rate": 0.00019696150921441125,
"loss": 1.2505,
"step": 2780
},
{
"epoch": 0.41815763343762297,
"grad_norm": 0.8516371846199036,
"learning_rate": 0.0001962694433692629,
"loss": 1.239,
"step": 2790
},
{
"epoch": 0.4196564063173277,
"grad_norm": 0.8301183581352234,
"learning_rate": 0.0001955762872990803,
"loss": 1.2399,
"step": 2800
},
{
"epoch": 0.42115517919703244,
"grad_norm": 0.7094582319259644,
"learning_rate": 0.00019488205733637234,
"loss": 1.2636,
"step": 2810
},
{
"epoch": 0.42265395207673717,
"grad_norm": 0.8317636847496033,
"learning_rate": 0.00019418676983895167,
"loss": 1.2369,
"step": 2820
},
{
"epoch": 0.4241527249564419,
"grad_norm": 0.8261407017707825,
"learning_rate": 0.00019349044118954916,
"loss": 1.2454,
"step": 2830
},
{
"epoch": 0.42565149783614664,
"grad_norm": 0.8200167417526245,
"learning_rate": 0.00019279308779542782,
"loss": 1.2505,
"step": 2840
},
{
"epoch": 0.4271502707158514,
"grad_norm": 0.8785801529884338,
"learning_rate": 0.00019209472608799604,
"loss": 1.2341,
"step": 2850
},
{
"epoch": 0.4286490435955561,
"grad_norm": 0.8413455486297607,
"learning_rate": 0.000191395372522421,
"loss": 1.2524,
"step": 2860
},
{
"epoch": 0.4301478164752609,
"grad_norm": 0.782440721988678,
"learning_rate": 0.00019069504357724024,
"loss": 1.2673,
"step": 2870
},
{
"epoch": 0.43164658935496564,
"grad_norm": 0.7999353408813477,
"learning_rate": 0.00018999375575397387,
"loss": 1.251,
"step": 2880
},
{
"epoch": 0.4331453622346704,
"grad_norm": 0.832058310508728,
"learning_rate": 0.00018929152557673555,
"loss": 1.253,
"step": 2890
},
{
"epoch": 0.4346441351143751,
"grad_norm": 0.7583170533180237,
"learning_rate": 0.0001885883695918432,
"loss": 1.2449,
"step": 2900
},
{
"epoch": 0.43614290799407984,
"grad_norm": 0.890659749507904,
"learning_rate": 0.000187884304367429,
"loss": 1.221,
"step": 2910
},
{
"epoch": 0.4376416808737846,
"grad_norm": 0.7491910457611084,
"learning_rate": 0.0001871793464930493,
"loss": 1.2231,
"step": 2920
},
{
"epoch": 0.4391404537534893,
"grad_norm": 0.8058717250823975,
"learning_rate": 0.0001864735125792934,
"loss": 1.2362,
"step": 2930
},
{
"epoch": 0.44063922663319405,
"grad_norm": 0.8197429776191711,
"learning_rate": 0.00018576681925739234,
"loss": 1.2177,
"step": 2940
},
{
"epoch": 0.44213799951289884,
"grad_norm": 0.8527374267578125,
"learning_rate": 0.00018505928317882696,
"loss": 1.2395,
"step": 2950
},
{
"epoch": 0.4436367723926036,
"grad_norm": 0.7480415105819702,
"learning_rate": 0.00018435092101493569,
"loss": 1.2462,
"step": 2960
},
{
"epoch": 0.4451355452723083,
"grad_norm": 0.7680370807647705,
"learning_rate": 0.00018364174945652146,
"loss": 1.2358,
"step": 2970
},
{
"epoch": 0.44663431815201304,
"grad_norm": 0.8246043920516968,
"learning_rate": 0.00018293178521345868,
"loss": 1.2222,
"step": 2980
},
{
"epoch": 0.4481330910317178,
"grad_norm": 0.7838782668113708,
"learning_rate": 0.0001822210450142994,
"loss": 1.2292,
"step": 2990
},
{
"epoch": 0.4496318639114225,
"grad_norm": 0.7519893646240234,
"learning_rate": 0.00018150954560587913,
"loss": 1.2536,
"step": 3000
},
{
"epoch": 0.45113063679112725,
"grad_norm": 0.8429172039031982,
"learning_rate": 0.00018079730375292232,
"loss": 1.2141,
"step": 3010
},
{
"epoch": 0.452629409670832,
"grad_norm": 0.8938582539558411,
"learning_rate": 0.00018008433623764721,
"loss": 1.2318,
"step": 3020
},
{
"epoch": 0.4541281825505368,
"grad_norm": 0.7177508473396301,
"learning_rate": 0.00017937065985937055,
"loss": 1.2494,
"step": 3030
},
{
"epoch": 0.4556269554302415,
"grad_norm": 0.8130243420600891,
"learning_rate": 0.00017865629143411162,
"loss": 1.2409,
"step": 3040
},
{
"epoch": 0.45712572830994624,
"grad_norm": 0.8147348165512085,
"learning_rate": 0.0001779412477941962,
"loss": 1.2347,
"step": 3050
},
{
"epoch": 0.458624501189651,
"grad_norm": 0.8635284900665283,
"learning_rate": 0.00017722554578785972,
"loss": 1.24,
"step": 3060
},
{
"epoch": 0.4601232740693557,
"grad_norm": 0.8101358413696289,
"learning_rate": 0.00017650920227885045,
"loss": 1.2441,
"step": 3070
},
{
"epoch": 0.46162204694906045,
"grad_norm": 0.7738655209541321,
"learning_rate": 0.00017579223414603202,
"loss": 1.2395,
"step": 3080
},
{
"epoch": 0.4631208198287652,
"grad_norm": 0.7867494225502014,
"learning_rate": 0.00017507465828298587,
"loss": 1.2334,
"step": 3090
},
{
"epoch": 0.4646195927084699,
"grad_norm": 0.8157032132148743,
"learning_rate": 0.00017435649159761298,
"loss": 1.232,
"step": 3100
},
{
"epoch": 0.46611836558817465,
"grad_norm": 0.7872126698493958,
"learning_rate": 0.0001736377510117357,
"loss": 1.231,
"step": 3110
},
{
"epoch": 0.46761713846787945,
"grad_norm": 0.8418169021606445,
"learning_rate": 0.00017291845346069888,
"loss": 1.238,
"step": 3120
},
{
"epoch": 0.4691159113475842,
"grad_norm": 0.8292982578277588,
"learning_rate": 0.00017219861589297083,
"loss": 1.2445,
"step": 3130
},
{
"epoch": 0.4706146842272889,
"grad_norm": 0.8435372710227966,
"learning_rate": 0.00017147825526974417,
"loss": 1.2397,
"step": 3140
},
{
"epoch": 0.47211345710699365,
"grad_norm": 0.8216134309768677,
"learning_rate": 0.0001707573885645359,
"loss": 1.2427,
"step": 3150
},
{
"epoch": 0.4736122299866984,
"grad_norm": 0.7630952000617981,
"learning_rate": 0.00017003603276278764,
"loss": 1.2406,
"step": 3160
},
{
"epoch": 0.4751110028664031,
"grad_norm": 0.7559238076210022,
"learning_rate": 0.0001693142048614653,
"loss": 1.2268,
"step": 3170
},
{
"epoch": 0.47660977574610786,
"grad_norm": 0.7821029424667358,
"learning_rate": 0.00016859192186865875,
"loss": 1.2235,
"step": 3180
},
{
"epoch": 0.4781085486258126,
"grad_norm": 0.7591744065284729,
"learning_rate": 0.00016786920080318085,
"loss": 1.2291,
"step": 3190
},
{
"epoch": 0.4796073215055174,
"grad_norm": 0.8490301966667175,
"learning_rate": 0.00016714605869416668,
"loss": 1.2494,
"step": 3200
},
{
"epoch": 0.4811060943852221,
"grad_norm": 0.8557929396629333,
"learning_rate": 0.00016642251258067205,
"loss": 1.2428,
"step": 3210
},
{
"epoch": 0.48260486726492685,
"grad_norm": 0.7579270601272583,
"learning_rate": 0.0001656985795112722,
"loss": 1.2354,
"step": 3220
},
{
"epoch": 0.4841036401446316,
"grad_norm": 0.7294782996177673,
"learning_rate": 0.0001649742765436601,
"loss": 1.2302,
"step": 3230
},
{
"epoch": 0.4856024130243363,
"grad_norm": 0.9022475481033325,
"learning_rate": 0.0001642496207442443,
"loss": 1.2174,
"step": 3240
},
{
"epoch": 0.48710118590404106,
"grad_norm": 0.8305564522743225,
"learning_rate": 0.0001635246291877471,
"loss": 1.2357,
"step": 3250
},
{
"epoch": 0.4885999587837458,
"grad_norm": 0.808020293712616,
"learning_rate": 0.000162799318956802,
"loss": 1.2283,
"step": 3260
},
{
"epoch": 0.4900987316634505,
"grad_norm": 0.8041689991950989,
"learning_rate": 0.00016207370714155128,
"loss": 1.223,
"step": 3270
},
{
"epoch": 0.4915975045431553,
"grad_norm": 0.7986804246902466,
"learning_rate": 0.0001613478108392434,
"loss": 1.2296,
"step": 3280
},
{
"epoch": 0.49309627742286005,
"grad_norm": 0.8282227516174316,
"learning_rate": 0.00016062164715382988,
"loss": 1.2211,
"step": 3290
},
{
"epoch": 0.4945950503025648,
"grad_norm": 0.7985265254974365,
"learning_rate": 0.00015989523319556265,
"loss": 1.2028,
"step": 3300
},
{
"epoch": 0.4960938231822695,
"grad_norm": 0.8276951909065247,
"learning_rate": 0.00015916858608059058,
"loss": 1.2534,
"step": 3310
},
{
"epoch": 0.49759259606197426,
"grad_norm": 0.8164412379264832,
"learning_rate": 0.00015844172293055637,
"loss": 1.2222,
"step": 3320
},
{
"epoch": 0.499091368941679,
"grad_norm": 0.7917633652687073,
"learning_rate": 0.000157714660872193,
"loss": 1.2351,
"step": 3330
},
{
"epoch": 0.5005901418213837,
"grad_norm": 0.8134817481040955,
"learning_rate": 0.00015698741703692025,
"loss": 1.2266,
"step": 3340
},
{
"epoch": 0.5005901418213837,
"eval_loss": 1.2627006769180298,
"eval_runtime": 34.7715,
"eval_samples_per_second": 718.979,
"eval_steps_per_second": 89.872,
"step": 3340
},
{
"epoch": 0.5020889147010885,
"grad_norm": 0.7944011092185974,
"learning_rate": 0.00015626000856044106,
"loss": 1.2145,
"step": 3350
},
{
"epoch": 0.5035876875807932,
"grad_norm": 0.8224478363990784,
"learning_rate": 0.00015553245258233763,
"loss": 1.2451,
"step": 3360
},
{
"epoch": 0.505086460460498,
"grad_norm": 0.7887598276138306,
"learning_rate": 0.0001548047662456678,
"loss": 1.2217,
"step": 3370
},
{
"epoch": 0.5065852333402027,
"grad_norm": 0.800266444683075,
"learning_rate": 0.00015407696669656091,
"loss": 1.2187,
"step": 3380
},
{
"epoch": 0.5080840062199075,
"grad_norm": 0.8337759971618652,
"learning_rate": 0.0001533490710838139,
"loss": 1.2172,
"step": 3390
},
{
"epoch": 0.5095827790996121,
"grad_norm": 0.7768263816833496,
"learning_rate": 0.0001526210965584872,
"loss": 1.2252,
"step": 3400
},
{
"epoch": 0.5110815519793169,
"grad_norm": 0.767500638961792,
"learning_rate": 0.00015189306027350063,
"loss": 1.2322,
"step": 3410
},
{
"epoch": 0.5125803248590217,
"grad_norm": 0.7581789493560791,
"learning_rate": 0.00015116497938322913,
"loss": 1.2376,
"step": 3420
},
{
"epoch": 0.5140790977387264,
"grad_norm": 0.7668277621269226,
"learning_rate": 0.00015043687104309886,
"loss": 1.2384,
"step": 3430
},
{
"epoch": 0.5155778706184312,
"grad_norm": 0.8178005218505859,
"learning_rate": 0.00014970875240918262,
"loss": 1.2252,
"step": 3440
},
{
"epoch": 0.5170766434981359,
"grad_norm": 0.7495589256286621,
"learning_rate": 0.00014898064063779574,
"loss": 1.2225,
"step": 3450
},
{
"epoch": 0.5185754163778407,
"grad_norm": 0.8504411578178406,
"learning_rate": 0.00014825255288509193,
"loss": 1.21,
"step": 3460
},
{
"epoch": 0.5200741892575453,
"grad_norm": 0.8137525916099548,
"learning_rate": 0.00014752450630665893,
"loss": 1.2015,
"step": 3470
},
{
"epoch": 0.5215729621372501,
"grad_norm": 0.7947141528129578,
"learning_rate": 0.00014679651805711428,
"loss": 1.2131,
"step": 3480
},
{
"epoch": 0.5230717350169549,
"grad_norm": 0.765978217124939,
"learning_rate": 0.00014606860528970116,
"loss": 1.225,
"step": 3490
},
{
"epoch": 0.5245705078966596,
"grad_norm": 0.8446719646453857,
"learning_rate": 0.00014534078515588425,
"loss": 1.2174,
"step": 3500
},
{
"epoch": 0.5260692807763644,
"grad_norm": 0.8156980276107788,
"learning_rate": 0.00014461307480494553,
"loss": 1.2255,
"step": 3510
},
{
"epoch": 0.5275680536560691,
"grad_norm": 0.747254490852356,
"learning_rate": 0.00014388549138358007,
"loss": 1.2382,
"step": 3520
},
{
"epoch": 0.5290668265357739,
"grad_norm": 0.8017699122428894,
"learning_rate": 0.0001431580520354924,
"loss": 1.2271,
"step": 3530
},
{
"epoch": 0.5305655994154785,
"grad_norm": 0.8388494849205017,
"learning_rate": 0.00014243077390099218,
"loss": 1.211,
"step": 3540
},
{
"epoch": 0.5320643722951833,
"grad_norm": 0.7686325311660767,
"learning_rate": 0.00014170367411659048,
"loss": 1.2073,
"step": 3550
},
{
"epoch": 0.533563145174888,
"grad_norm": 0.8303519487380981,
"learning_rate": 0.00014097676981459598,
"loss": 1.2087,
"step": 3560
},
{
"epoch": 0.5350619180545928,
"grad_norm": 0.7963255047798157,
"learning_rate": 0.0001402500781227114,
"loss": 1.2108,
"step": 3570
},
{
"epoch": 0.5365606909342976,
"grad_norm": 0.8091217875480652,
"learning_rate": 0.00013952361616362968,
"loss": 1.2041,
"step": 3580
},
{
"epoch": 0.5380594638140023,
"grad_norm": 0.8334858417510986,
"learning_rate": 0.00013879740105463074,
"loss": 1.2113,
"step": 3590
},
{
"epoch": 0.5395582366937071,
"grad_norm": 0.7692158818244934,
"learning_rate": 0.00013807144990717816,
"loss": 1.2095,
"step": 3600
},
{
"epoch": 0.5410570095734117,
"grad_norm": 0.8363123536109924,
"learning_rate": 0.00013734577982651584,
"loss": 1.2195,
"step": 3610
},
{
"epoch": 0.5425557824531165,
"grad_norm": 0.8173678517341614,
"learning_rate": 0.00013662040791126502,
"loss": 1.1964,
"step": 3620
},
{
"epoch": 0.5440545553328212,
"grad_norm": 0.7436100244522095,
"learning_rate": 0.0001358953512530215,
"loss": 1.2124,
"step": 3630
},
{
"epoch": 0.545553328212526,
"grad_norm": 0.7557327747344971,
"learning_rate": 0.00013517062693595266,
"loss": 1.208,
"step": 3640
},
{
"epoch": 0.5470521010922308,
"grad_norm": 0.7731929421424866,
"learning_rate": 0.00013444625203639531,
"loss": 1.2023,
"step": 3650
},
{
"epoch": 0.5485508739719355,
"grad_norm": 0.8247756958007812,
"learning_rate": 0.0001337222436224529,
"loss": 1.2056,
"step": 3660
},
{
"epoch": 0.5500496468516403,
"grad_norm": 0.8017778992652893,
"learning_rate": 0.00013299861875359367,
"loss": 1.2057,
"step": 3670
},
{
"epoch": 0.5515484197313449,
"grad_norm": 0.84889817237854,
"learning_rate": 0.00013227539448024855,
"loss": 1.2148,
"step": 3680
},
{
"epoch": 0.5530471926110497,
"grad_norm": 0.808515191078186,
"learning_rate": 0.00013155258784340934,
"loss": 1.2029,
"step": 3690
},
{
"epoch": 0.5545459654907544,
"grad_norm": 0.7231589555740356,
"learning_rate": 0.00013083021587422737,
"loss": 1.1948,
"step": 3700
},
{
"epoch": 0.5560447383704592,
"grad_norm": 0.8200321793556213,
"learning_rate": 0.0001301082955936121,
"loss": 1.2122,
"step": 3710
},
{
"epoch": 0.5575435112501639,
"grad_norm": 0.7973082065582275,
"learning_rate": 0.00012938684401183,
"loss": 1.2214,
"step": 3720
},
{
"epoch": 0.5590422841298687,
"grad_norm": 0.7436721324920654,
"learning_rate": 0.00012866587812810384,
"loss": 1.2124,
"step": 3730
},
{
"epoch": 0.5605410570095735,
"grad_norm": 0.794087290763855,
"learning_rate": 0.00012794541493021217,
"loss": 1.2057,
"step": 3740
},
{
"epoch": 0.5620398298892781,
"grad_norm": 0.815791666507721,
"learning_rate": 0.0001272254713940889,
"loss": 1.2078,
"step": 3750
},
{
"epoch": 0.5635386027689829,
"grad_norm": 0.8807167410850525,
"learning_rate": 0.0001265060644834235,
"loss": 1.228,
"step": 3760
},
{
"epoch": 0.5650373756486876,
"grad_norm": 0.8193700313568115,
"learning_rate": 0.00012578721114926098,
"loss": 1.1931,
"step": 3770
},
{
"epoch": 0.5665361485283924,
"grad_norm": 0.7985761761665344,
"learning_rate": 0.00012506892832960296,
"loss": 1.2146,
"step": 3780
},
{
"epoch": 0.5680349214080971,
"grad_norm": 0.7901711463928223,
"learning_rate": 0.00012435123294900815,
"loss": 1.1876,
"step": 3790
},
{
"epoch": 0.5695336942878019,
"grad_norm": 0.8111531138420105,
"learning_rate": 0.00012363414191819368,
"loss": 1.1815,
"step": 3800
},
{
"epoch": 0.5710324671675066,
"grad_norm": 0.7858944535255432,
"learning_rate": 0.00012291767213363678,
"loss": 1.2132,
"step": 3810
},
{
"epoch": 0.5725312400472113,
"grad_norm": 0.8336951732635498,
"learning_rate": 0.00012220184047717647,
"loss": 1.1849,
"step": 3820
},
{
"epoch": 0.5740300129269161,
"grad_norm": 0.8228222131729126,
"learning_rate": 0.00012148666381561589,
"loss": 1.2119,
"step": 3830
},
{
"epoch": 0.5755287858066208,
"grad_norm": 0.7975912690162659,
"learning_rate": 0.0001207721590003248,
"loss": 1.1882,
"step": 3840
},
{
"epoch": 0.5770275586863256,
"grad_norm": 0.816520631313324,
"learning_rate": 0.00012005834286684263,
"loss": 1.2164,
"step": 3850
},
{
"epoch": 0.5785263315660303,
"grad_norm": 0.8109871745109558,
"learning_rate": 0.00011934523223448168,
"loss": 1.1933,
"step": 3860
},
{
"epoch": 0.5800251044457351,
"grad_norm": 0.8165502548217773,
"learning_rate": 0.00011863284390593089,
"loss": 1.2,
"step": 3870
},
{
"epoch": 0.5815238773254398,
"grad_norm": 0.7544243931770325,
"learning_rate": 0.00011792119466685983,
"loss": 1.2173,
"step": 3880
},
{
"epoch": 0.5830226502051445,
"grad_norm": 0.822040855884552,
"learning_rate": 0.00011721030128552338,
"loss": 1.205,
"step": 3890
},
{
"epoch": 0.5845214230848493,
"grad_norm": 0.8056491017341614,
"learning_rate": 0.0001165001805123664,
"loss": 1.2011,
"step": 3900
},
{
"epoch": 0.586020195964554,
"grad_norm": 0.8770938515663147,
"learning_rate": 0.00011579084907962914,
"loss": 1.1903,
"step": 3910
},
{
"epoch": 0.5875189688442588,
"grad_norm": 0.833992600440979,
"learning_rate": 0.0001150823237009531,
"loss": 1.193,
"step": 3920
},
{
"epoch": 0.5890177417239635,
"grad_norm": 0.8358356356620789,
"learning_rate": 0.00011437462107098694,
"loss": 1.191,
"step": 3930
},
{
"epoch": 0.5905165146036683,
"grad_norm": 0.9014224410057068,
"learning_rate": 0.00011366775786499347,
"loss": 1.2004,
"step": 3940
},
{
"epoch": 0.592015287483373,
"grad_norm": 0.8393827080726624,
"learning_rate": 0.00011296175073845642,
"loss": 1.2118,
"step": 3950
},
{
"epoch": 0.5935140603630777,
"grad_norm": 0.8268956542015076,
"learning_rate": 0.00011225661632668815,
"loss": 1.2083,
"step": 3960
},
{
"epoch": 0.5950128332427824,
"grad_norm": 0.8107516169548035,
"learning_rate": 0.00011155237124443766,
"loss": 1.2094,
"step": 3970
},
{
"epoch": 0.5965116061224872,
"grad_norm": 0.8380411267280579,
"learning_rate": 0.00011084903208549916,
"loss": 1.2159,
"step": 3980
},
{
"epoch": 0.598010379002192,
"grad_norm": 0.8165388107299805,
"learning_rate": 0.00011014661542232089,
"loss": 1.2024,
"step": 3990
},
{
"epoch": 0.5995091518818967,
"grad_norm": 0.8068062663078308,
"learning_rate": 0.00010944513780561495,
"loss": 1.1826,
"step": 4000
},
{
"epoch": 0.6007081701856605,
"eval_loss": 1.2374228239059448,
"eval_runtime": 34.9516,
"eval_samples_per_second": 715.274,
"eval_steps_per_second": 89.409,
"step": 4008
},
{
"epoch": 0.6010079247616015,
"grad_norm": 0.7994382381439209,
"learning_rate": 0.00010874461576396688,
"loss": 1.2043,
"step": 4010
},
{
"epoch": 0.6025066976413062,
"grad_norm": 0.8711692690849304,
"learning_rate": 0.00010804506580344664,
"loss": 1.2145,
"step": 4020
},
{
"epoch": 0.604005470521011,
"grad_norm": 0.8061762452125549,
"learning_rate": 0.00010734650440721944,
"loss": 1.1931,
"step": 4030
},
{
"epoch": 0.6055042434007156,
"grad_norm": 0.8066831827163696,
"learning_rate": 0.00010664894803515744,
"loss": 1.1848,
"step": 4040
},
{
"epoch": 0.6070030162804204,
"grad_norm": 0.8515334725379944,
"learning_rate": 0.00010595241312345186,
"loss": 1.2201,
"step": 4050
},
{
"epoch": 0.6085017891601251,
"grad_norm": 0.8231693506240845,
"learning_rate": 0.00010525691608422577,
"loss": 1.2027,
"step": 4060
},
{
"epoch": 0.6100005620398299,
"grad_norm": 0.8122180104255676,
"learning_rate": 0.00010456247330514733,
"loss": 1.1939,
"step": 4070
},
{
"epoch": 0.6114993349195347,
"grad_norm": 0.8237882852554321,
"learning_rate": 0.00010386910114904364,
"loss": 1.1879,
"step": 4080
},
{
"epoch": 0.6129981077992394,
"grad_norm": 0.8404508829116821,
"learning_rate": 0.00010317681595351525,
"loss": 1.201,
"step": 4090
},
{
"epoch": 0.6144968806789441,
"grad_norm": 0.7942978143692017,
"learning_rate": 0.00010248563403055112,
"loss": 1.1978,
"step": 4100
},
{
"epoch": 0.6159956535586488,
"grad_norm": 0.8365768194198608,
"learning_rate": 0.00010179557166614439,
"loss": 1.1903,
"step": 4110
},
{
"epoch": 0.6174944264383536,
"grad_norm": 0.8552048802375793,
"learning_rate": 0.00010110664511990852,
"loss": 1.1894,
"step": 4120
},
{
"epoch": 0.6189931993180583,
"grad_norm": 0.7810205817222595,
"learning_rate": 0.00010041887062469425,
"loss": 1.2134,
"step": 4130
},
{
"epoch": 0.6204919721977631,
"grad_norm": 0.8970271348953247,
"learning_rate": 9.973226438620703e-05,
"loss": 1.1936,
"step": 4140
},
{
"epoch": 0.6219907450774679,
"grad_norm": 0.8107681274414062,
"learning_rate": 9.904684258262535e-05,
"loss": 1.1842,
"step": 4150
},
{
"epoch": 0.6234895179571726,
"grad_norm": 0.7766697406768799,
"learning_rate": 9.836262136421924e-05,
"loss": 1.2083,
"step": 4160
},
{
"epoch": 0.6249882908368773,
"grad_norm": 0.818970799446106,
"learning_rate": 9.767961685297012e-05,
"loss": 1.2042,
"step": 4170
},
{
"epoch": 0.626487063716582,
"grad_norm": 0.8687489628791809,
"learning_rate": 9.699784514219056e-05,
"loss": 1.2028,
"step": 4180
},
{
"epoch": 0.6279858365962868,
"grad_norm": 0.8318779468536377,
"learning_rate": 9.631732229614529e-05,
"loss": 1.1856,
"step": 4190
},
{
"epoch": 0.6294846094759915,
"grad_norm": 0.8384169340133667,
"learning_rate": 9.56380643496726e-05,
"loss": 1.1887,
"step": 4200
},
{
"epoch": 0.6309833823556963,
"grad_norm": 0.8248523473739624,
"learning_rate": 9.496008730780657e-05,
"loss": 1.1771,
"step": 4210
},
{
"epoch": 0.632482155235401,
"grad_norm": 0.8473740220069885,
"learning_rate": 9.428340714539999e-05,
"loss": 1.1899,
"step": 4220
},
{
"epoch": 0.6339809281151058,
"grad_norm": 0.8694414496421814,
"learning_rate": 9.360803980674773e-05,
"loss": 1.2023,
"step": 4230
},
{
"epoch": 0.6354797009948105,
"grad_norm": 0.7950302958488464,
"learning_rate": 9.29340012052114e-05,
"loss": 1.1996,
"step": 4240
},
{
"epoch": 0.6369784738745152,
"grad_norm": 0.8233305215835571,
"learning_rate": 9.226130722284413e-05,
"loss": 1.1922,
"step": 4250
},
{
"epoch": 0.63847724675422,
"grad_norm": 0.894221842288971,
"learning_rate": 9.158997371001634e-05,
"loss": 1.1839,
"step": 4260
},
{
"epoch": 0.6399760196339247,
"grad_norm": 0.8127320408821106,
"learning_rate": 9.092001648504245e-05,
"loss": 1.186,
"step": 4270
},
{
"epoch": 0.6414747925136295,
"grad_norm": 0.8138620257377625,
"learning_rate": 9.025145133380806e-05,
"loss": 1.1845,
"step": 4280
},
{
"epoch": 0.6429735653933342,
"grad_norm": 0.7820717692375183,
"learning_rate": 8.958429400939794e-05,
"loss": 1.2,
"step": 4290
},
{
"epoch": 0.644472338273039,
"grad_norm": 0.8710906505584717,
"learning_rate": 8.891856023172496e-05,
"loss": 1.1838,
"step": 4300
},
{
"epoch": 0.6459711111527436,
"grad_norm": 0.8574820756912231,
"learning_rate": 8.825426568715958e-05,
"loss": 1.1876,
"step": 4310
},
{
"epoch": 0.6474698840324484,
"grad_norm": 0.8116622567176819,
"learning_rate": 8.759142602816032e-05,
"loss": 1.1908,
"step": 4320
},
{
"epoch": 0.6489686569121532,
"grad_norm": 0.82753586769104,
"learning_rate": 8.693005687290486e-05,
"loss": 1.1915,
"step": 4330
},
{
"epoch": 0.6504674297918579,
"grad_norm": 0.8335912227630615,
"learning_rate": 8.627017380492228e-05,
"loss": 1.191,
"step": 4340
},
{
"epoch": 0.6519662026715627,
"grad_norm": 0.8342856168746948,
"learning_rate": 8.561179237272537e-05,
"loss": 1.2059,
"step": 4350
},
{
"epoch": 0.6534649755512674,
"grad_norm": 0.8831748366355896,
"learning_rate": 8.495492808944492e-05,
"loss": 1.1895,
"step": 4360
},
{
"epoch": 0.6549637484309722,
"grad_norm": 0.7899265885353088,
"learning_rate": 8.429959643246359e-05,
"loss": 1.1913,
"step": 4370
},
{
"epoch": 0.6564625213106768,
"grad_norm": 0.7902125716209412,
"learning_rate": 8.364581284305171e-05,
"loss": 1.1883,
"step": 4380
},
{
"epoch": 0.6579612941903816,
"grad_norm": 0.8503438830375671,
"learning_rate": 8.299359272600301e-05,
"loss": 1.1887,
"step": 4390
},
{
"epoch": 0.6594600670700864,
"grad_norm": 0.8556089997291565,
"learning_rate": 8.234295144927204e-05,
"loss": 1.1751,
"step": 4400
},
{
"epoch": 0.6609588399497911,
"grad_norm": 0.837721586227417,
"learning_rate": 8.169390434361184e-05,
"loss": 1.1833,
"step": 4410
},
{
"epoch": 0.6624576128294959,
"grad_norm": 0.8476043343544006,
"learning_rate": 8.104646670221263e-05,
"loss": 1.2005,
"step": 4420
},
{
"epoch": 0.6639563857092006,
"grad_norm": 0.8832587599754333,
"learning_rate": 8.040065378034176e-05,
"loss": 1.1833,
"step": 4430
},
{
"epoch": 0.6654551585889054,
"grad_norm": 0.8358840346336365,
"learning_rate": 7.975648079498393e-05,
"loss": 1.1691,
"step": 4440
},
{
"epoch": 0.66695393146861,
"grad_norm": 0.8542131185531616,
"learning_rate": 7.911396292448295e-05,
"loss": 1.1891,
"step": 4450
},
{
"epoch": 0.6684527043483148,
"grad_norm": 0.7880673408508301,
"learning_rate": 7.847311530818372e-05,
"loss": 1.1668,
"step": 4460
},
{
"epoch": 0.6699514772280195,
"grad_norm": 0.8573721051216125,
"learning_rate": 7.783395304607596e-05,
"loss": 1.1895,
"step": 4470
},
{
"epoch": 0.6714502501077243,
"grad_norm": 0.8283008337020874,
"learning_rate": 7.719649119843801e-05,
"loss": 1.1685,
"step": 4480
},
{
"epoch": 0.6729490229874291,
"grad_norm": 0.8286172151565552,
"learning_rate": 7.656074478548231e-05,
"loss": 1.1723,
"step": 4490
},
{
"epoch": 0.6744477958671338,
"grad_norm": 0.9178438782691956,
"learning_rate": 7.592672878700118e-05,
"loss": 1.1861,
"step": 4500
},
{
"epoch": 0.6759465687468386,
"grad_norm": 0.8172318339347839,
"learning_rate": 7.529445814201399e-05,
"loss": 1.1899,
"step": 4510
},
{
"epoch": 0.6774453416265432,
"grad_norm": 0.8141478896141052,
"learning_rate": 7.466394774841536e-05,
"loss": 1.1707,
"step": 4520
},
{
"epoch": 0.678944114506248,
"grad_norm": 0.8715807795524597,
"learning_rate": 7.40352124626237e-05,
"loss": 1.1672,
"step": 4530
},
{
"epoch": 0.6804428873859527,
"grad_norm": 0.909372091293335,
"learning_rate": 7.340826709923161e-05,
"loss": 1.1773,
"step": 4540
},
{
"epoch": 0.6819416602656575,
"grad_norm": 0.7548136115074158,
"learning_rate": 7.278312643065637e-05,
"loss": 1.1886,
"step": 4550
},
{
"epoch": 0.6834404331453622,
"grad_norm": 0.8248059749603271,
"learning_rate": 7.215980518679235e-05,
"loss": 1.1779,
"step": 4560
},
{
"epoch": 0.684939206025067,
"grad_norm": 0.8389328122138977,
"learning_rate": 7.153831805466337e-05,
"loss": 1.1894,
"step": 4570
},
{
"epoch": 0.6864379789047718,
"grad_norm": 0.8137337565422058,
"learning_rate": 7.091867967807722e-05,
"loss": 1.1864,
"step": 4580
},
{
"epoch": 0.6879367517844764,
"grad_norm": 0.884548008441925,
"learning_rate": 7.030090465728023e-05,
"loss": 1.198,
"step": 4590
},
{
"epoch": 0.6894355246641812,
"grad_norm": 0.803050696849823,
"learning_rate": 6.968500754861329e-05,
"loss": 1.1778,
"step": 4600
},
{
"epoch": 0.6909342975438859,
"grad_norm": 0.870222270488739,
"learning_rate": 6.907100286416906e-05,
"loss": 1.174,
"step": 4610
},
{
"epoch": 0.6924330704235907,
"grad_norm": 0.8954461812973022,
"learning_rate": 6.845890507144973e-05,
"loss": 1.1967,
"step": 4620
},
{
"epoch": 0.6939318433032954,
"grad_norm": 0.8398353457450867,
"learning_rate": 6.784872859302653e-05,
"loss": 1.1678,
"step": 4630
},
{
"epoch": 0.6954306161830002,
"grad_norm": 0.8398712277412415,
"learning_rate": 6.724048780619943e-05,
"loss": 1.1912,
"step": 4640
},
{
"epoch": 0.696929389062705,
"grad_norm": 0.7871512770652771,
"learning_rate": 6.663419704265887e-05,
"loss": 1.1626,
"step": 4650
},
{
"epoch": 0.6984281619424096,
"grad_norm": 0.7826485633850098,
"learning_rate": 6.602987058814751e-05,
"loss": 1.1594,
"step": 4660
},
{
"epoch": 0.6999269348221144,
"grad_norm": 0.8265202641487122,
"learning_rate": 6.542752268212422e-05,
"loss": 1.1572,
"step": 4670
},
{
"epoch": 0.7008261985499372,
"eval_loss": 1.2169182300567627,
"eval_runtime": 34.6978,
"eval_samples_per_second": 720.507,
"eval_steps_per_second": 90.063,
"step": 4676
},
{
"epoch": 0.7014257077018191,
"grad_norm": 0.8427773714065552,
"learning_rate": 6.482716751742804e-05,
"loss": 1.1973,
"step": 4680
},
{
"epoch": 0.7029244805815239,
"grad_norm": 0.886134684085846,
"learning_rate": 6.422881923994411e-05,
"loss": 1.1851,
"step": 4690
},
{
"epoch": 0.7044232534612286,
"grad_norm": 0.8166589140892029,
"learning_rate": 6.363249194827026e-05,
"loss": 1.1851,
"step": 4700
},
{
"epoch": 0.7059220263409334,
"grad_norm": 0.9108282923698425,
"learning_rate": 6.303819969338465e-05,
"loss": 1.1689,
"step": 4710
},
{
"epoch": 0.707420799220638,
"grad_norm": 0.927238404750824,
"learning_rate": 6.2445956478315e-05,
"loss": 1.1975,
"step": 4720
},
{
"epoch": 0.7089195721003428,
"grad_norm": 0.8622543811798096,
"learning_rate": 6.185577625780826e-05,
"loss": 1.1834,
"step": 4730
},
{
"epoch": 0.7104183449800476,
"grad_norm": 0.8211912512779236,
"learning_rate": 6.126767293800227e-05,
"loss": 1.1775,
"step": 4740
},
{
"epoch": 0.7119171178597523,
"grad_norm": 0.8494730591773987,
"learning_rate": 6.0681660376097654e-05,
"loss": 1.1705,
"step": 4750
},
{
"epoch": 0.7134158907394571,
"grad_norm": 0.8458199501037598,
"learning_rate": 6.00977523800315e-05,
"loss": 1.1944,
"step": 4760
},
{
"epoch": 0.7149146636191618,
"grad_norm": 0.8529589176177979,
"learning_rate": 5.951596270815212e-05,
"loss": 1.1913,
"step": 4770
},
{
"epoch": 0.7164134364988666,
"grad_norm": 0.8020589351654053,
"learning_rate": 5.893630506889463e-05,
"loss": 1.1746,
"step": 4780
},
{
"epoch": 0.7179122093785713,
"grad_norm": 0.8732789158821106,
"learning_rate": 5.835879312045821e-05,
"loss": 1.1859,
"step": 4790
},
{
"epoch": 0.719410982258276,
"grad_norm": 0.8340930342674255,
"learning_rate": 5.7783440470483965e-05,
"loss": 1.1795,
"step": 4800
},
{
"epoch": 0.7209097551379807,
"grad_norm": 0.816852867603302,
"learning_rate": 5.7210260675734656e-05,
"loss": 1.1825,
"step": 4810
},
{
"epoch": 0.7224085280176855,
"grad_norm": 0.9594895243644714,
"learning_rate": 5.663926724177489e-05,
"loss": 1.1665,
"step": 4820
},
{
"epoch": 0.7239073008973903,
"grad_norm": 0.8372864127159119,
"learning_rate": 5.6070473622653293e-05,
"loss": 1.1694,
"step": 4830
},
{
"epoch": 0.725406073777095,
"grad_norm": 0.8622620105743408,
"learning_rate": 5.5503893220585096e-05,
"loss": 1.1676,
"step": 4840
},
{
"epoch": 0.7269048466567998,
"grad_norm": 0.8314744234085083,
"learning_rate": 5.493953938563666e-05,
"loss": 1.1847,
"step": 4850
},
{
"epoch": 0.7284036195365045,
"grad_norm": 0.7822853326797485,
"learning_rate": 5.437742541541085e-05,
"loss": 1.1617,
"step": 4860
},
{
"epoch": 0.7299023924162092,
"grad_norm": 0.8349812030792236,
"learning_rate": 5.381756455473346e-05,
"loss": 1.167,
"step": 4870
},
{
"epoch": 0.7314011652959139,
"grad_norm": 0.8560863733291626,
"learning_rate": 5.3259969995341535e-05,
"loss": 1.1723,
"step": 4880
},
{
"epoch": 0.7328999381756187,
"grad_norm": 0.8179499506950378,
"learning_rate": 5.270465487557218e-05,
"loss": 1.1685,
"step": 4890
},
{
"epoch": 0.7343987110553235,
"grad_norm": 0.8827424645423889,
"learning_rate": 5.215163228005328e-05,
"loss": 1.1701,
"step": 4900
},
{
"epoch": 0.7358974839350282,
"grad_norm": 0.847288191318512,
"learning_rate": 5.16009152393949e-05,
"loss": 1.1827,
"step": 4910
},
{
"epoch": 0.737396256814733,
"grad_norm": 0.8526114225387573,
"learning_rate": 5.105251672988256e-05,
"loss": 1.2011,
"step": 4920
},
{
"epoch": 0.7388950296944377,
"grad_norm": 0.9031111598014832,
"learning_rate": 5.050644967317117e-05,
"loss": 1.1769,
"step": 4930
},
{
"epoch": 0.7403938025741424,
"grad_norm": 0.8632171154022217,
"learning_rate": 4.996272693598088e-05,
"loss": 1.1855,
"step": 4940
},
{
"epoch": 0.7418925754538471,
"grad_norm": 0.8957777619361877,
"learning_rate": 4.9421361329793593e-05,
"loss": 1.162,
"step": 4950
},
{
"epoch": 0.7433913483335519,
"grad_norm": 0.8075733184814453,
"learning_rate": 4.888236561055135e-05,
"loss": 1.1684,
"step": 4960
},
{
"epoch": 0.7448901212132566,
"grad_norm": 0.8080265522003174,
"learning_rate": 4.834575247835571e-05,
"loss": 1.1775,
"step": 4970
},
{
"epoch": 0.7463888940929614,
"grad_norm": 0.8255054950714111,
"learning_rate": 4.7811534577168265e-05,
"loss": 1.2002,
"step": 4980
},
{
"epoch": 0.7478876669726662,
"grad_norm": 0.8013522624969482,
"learning_rate": 4.7279724494513196e-05,
"loss": 1.1784,
"step": 4990
},
{
"epoch": 0.7493864398523709,
"grad_norm": 0.8615186810493469,
"learning_rate": 4.675033476118002e-05,
"loss": 1.169,
"step": 5000
},
{
"epoch": 0.7508852127320756,
"grad_norm": 0.8255153894424438,
"learning_rate": 4.622337785092908e-05,
"loss": 1.1701,
"step": 5010
},
{
"epoch": 0.7523839856117803,
"grad_norm": 0.9061243534088135,
"learning_rate": 4.569886618019698e-05,
"loss": 1.1844,
"step": 5020
},
{
"epoch": 0.7538827584914851,
"grad_norm": 0.8674840927124023,
"learning_rate": 4.517681210780446e-05,
"loss": 1.1731,
"step": 5030
},
{
"epoch": 0.7553815313711898,
"grad_norm": 0.8248318433761597,
"learning_rate": 4.465722793466503e-05,
"loss": 1.1787,
"step": 5040
},
{
"epoch": 0.7568803042508946,
"grad_norm": 0.8240404725074768,
"learning_rate": 4.414012590349503e-05,
"loss": 1.1802,
"step": 5050
},
{
"epoch": 0.7583790771305994,
"grad_norm": 0.9083341956138611,
"learning_rate": 4.362551819852536e-05,
"loss": 1.1687,
"step": 5060
},
{
"epoch": 0.7598778500103041,
"grad_norm": 0.929366409778595,
"learning_rate": 4.3113416945214186e-05,
"loss": 1.2057,
"step": 5070
},
{
"epoch": 0.7613766228900088,
"grad_norm": 0.8615067005157471,
"learning_rate": 4.26038342099615e-05,
"loss": 1.172,
"step": 5080
},
{
"epoch": 0.7628753957697135,
"grad_norm": 0.8247563242912292,
"learning_rate": 4.209678199982441e-05,
"loss": 1.188,
"step": 5090
},
{
"epoch": 0.7643741686494183,
"grad_norm": 0.9103371500968933,
"learning_rate": 4.1592272262234714e-05,
"loss": 1.1673,
"step": 5100
},
{
"epoch": 0.765872941529123,
"grad_norm": 0.9434977769851685,
"learning_rate": 4.109031688471692e-05,
"loss": 1.1764,
"step": 5110
},
{
"epoch": 0.7673717144088278,
"grad_norm": 0.912162721157074,
"learning_rate": 4.059092769460852e-05,
"loss": 1.1589,
"step": 5120
},
{
"epoch": 0.7688704872885325,
"grad_norm": 0.9519455432891846,
"learning_rate": 4.009411645878097e-05,
"loss": 1.1769,
"step": 5130
},
{
"epoch": 0.7703692601682373,
"grad_norm": 0.8289220929145813,
"learning_rate": 3.9599894883362757e-05,
"loss": 1.1705,
"step": 5140
},
{
"epoch": 0.771868033047942,
"grad_norm": 0.8658756613731384,
"learning_rate": 3.910827461346339e-05,
"loss": 1.1668,
"step": 5150
},
{
"epoch": 0.7733668059276467,
"grad_norm": 0.8500393629074097,
"learning_rate": 3.8619267232898974e-05,
"loss": 1.1676,
"step": 5160
},
{
"epoch": 0.7748655788073515,
"grad_norm": 0.8148017525672913,
"learning_rate": 3.813288426391946e-05,
"loss": 1.1767,
"step": 5170
},
{
"epoch": 0.7763643516870562,
"grad_norm": 0.868813693523407,
"learning_rate": 3.7649137166936865e-05,
"loss": 1.1652,
"step": 5180
},
{
"epoch": 0.777863124566761,
"grad_norm": 0.8340365290641785,
"learning_rate": 3.716803734025559e-05,
"loss": 1.1581,
"step": 5190
},
{
"epoch": 0.7793618974464657,
"grad_norm": 0.8384237885475159,
"learning_rate": 3.668959611980345e-05,
"loss": 1.1673,
"step": 5200
},
{
"epoch": 0.7808606703261705,
"grad_norm": 0.9455569386482239,
"learning_rate": 3.6213824778865e-05,
"loss": 1.1678,
"step": 5210
},
{
"epoch": 0.7823594432058751,
"grad_norm": 0.8394315838813782,
"learning_rate": 3.574073452781544e-05,
"loss": 1.1683,
"step": 5220
},
{
"epoch": 0.7838582160855799,
"grad_norm": 0.8814610242843628,
"learning_rate": 3.527033651385699e-05,
"loss": 1.1647,
"step": 5230
},
{
"epoch": 0.7853569889652847,
"grad_norm": 0.8106175661087036,
"learning_rate": 3.480264182075573e-05,
"loss": 1.1664,
"step": 5240
},
{
"epoch": 0.7868557618449894,
"grad_norm": 0.8509274125099182,
"learning_rate": 3.4337661468580715e-05,
"loss": 1.1634,
"step": 5250
},
{
"epoch": 0.7883545347246942,
"grad_norm": 0.8374130725860596,
"learning_rate": 3.387540641344441e-05,
"loss": 1.1955,
"step": 5260
},
{
"epoch": 0.7898533076043989,
"grad_norm": 0.8402609825134277,
"learning_rate": 3.34158875472442e-05,
"loss": 1.1601,
"step": 5270
},
{
"epoch": 0.7913520804841037,
"grad_norm": 0.8734246492385864,
"learning_rate": 3.29591156974061e-05,
"loss": 1.1602,
"step": 5280
},
{
"epoch": 0.7928508533638083,
"grad_norm": 0.9936115741729736,
"learning_rate": 3.250510162662933e-05,
"loss": 1.1824,
"step": 5290
},
{
"epoch": 0.7943496262435131,
"grad_norm": 0.8631161451339722,
"learning_rate": 3.2053856032633016e-05,
"loss": 1.1679,
"step": 5300
},
{
"epoch": 0.7958483991232179,
"grad_norm": 0.8633742928504944,
"learning_rate": 3.160538954790385e-05,
"loss": 1.1603,
"step": 5310
},
{
"epoch": 0.7973471720029226,
"grad_norm": 0.8703827857971191,
"learning_rate": 3.1159712739445755e-05,
"loss": 1.1476,
"step": 5320
},
{
"epoch": 0.7988459448826274,
"grad_norm": 0.9506652355194092,
"learning_rate": 3.071683610853085e-05,
"loss": 1.1796,
"step": 5330
},
{
"epoch": 0.8003447177623321,
"grad_norm": 0.8602906465530396,
"learning_rate": 3.0276770090451873e-05,
"loss": 1.1754,
"step": 5340
},
{
"epoch": 0.800944226914214,
"eval_loss": 1.2033511400222778,
"eval_runtime": 34.2855,
"eval_samples_per_second": 729.172,
"eval_steps_per_second": 91.146,
"step": 5344
},
{
"epoch": 0.8018434906420369,
"grad_norm": 0.8773970007896423,
"learning_rate": 2.983952505427659e-05,
"loss": 1.1674,
"step": 5350
},
{
"epoch": 0.8033422635217415,
"grad_norm": 0.91343754529953,
"learning_rate": 2.940511130260314e-05,
"loss": 1.1703,
"step": 5360
},
{
"epoch": 0.8048410364014463,
"grad_norm": 0.8615610003471375,
"learning_rate": 2.8973539071317558e-05,
"loss": 1.158,
"step": 5370
},
{
"epoch": 0.806339809281151,
"grad_norm": 0.8163581490516663,
"learning_rate": 2.8544818529352408e-05,
"loss": 1.1588,
"step": 5380
},
{
"epoch": 0.8078385821608558,
"grad_norm": 0.8749719262123108,
"learning_rate": 2.8118959778447318e-05,
"loss": 1.1594,
"step": 5390
},
{
"epoch": 0.8093373550405606,
"grad_norm": 0.8937957882881165,
"learning_rate": 2.7695972852910774e-05,
"loss": 1.1598,
"step": 5400
},
{
"epoch": 0.8108361279202653,
"grad_norm": 0.8792014122009277,
"learning_rate": 2.7275867719383954e-05,
"loss": 1.1771,
"step": 5410
},
{
"epoch": 0.8123349007999701,
"grad_norm": 0.9492007493972778,
"learning_rate": 2.6858654276605536e-05,
"loss": 1.1764,
"step": 5420
},
{
"epoch": 0.8138336736796747,
"grad_norm": 0.8084315061569214,
"learning_rate": 2.6444342355178816e-05,
"loss": 1.1667,
"step": 5430
},
{
"epoch": 0.8153324465593795,
"grad_norm": 0.8339678049087524,
"learning_rate": 2.6032941717339882e-05,
"loss": 1.1612,
"step": 5440
},
{
"epoch": 0.8168312194390842,
"grad_norm": 0.8784351348876953,
"learning_rate": 2.5624462056727563e-05,
"loss": 1.1567,
"step": 5450
},
{
"epoch": 0.818329992318789,
"grad_norm": 0.8871691226959229,
"learning_rate": 2.521891299815515e-05,
"loss": 1.1672,
"step": 5460
},
{
"epoch": 0.8198287651984937,
"grad_norm": 0.8839700222015381,
"learning_rate": 2.4816304097383462e-05,
"loss": 1.1592,
"step": 5470
},
{
"epoch": 0.8213275380781985,
"grad_norm": 0.931260883808136,
"learning_rate": 2.4416644840895912e-05,
"loss": 1.1625,
"step": 5480
},
{
"epoch": 0.8228263109579033,
"grad_norm": 0.9054010510444641,
"learning_rate": 2.4019944645674595e-05,
"loss": 1.165,
"step": 5490
},
{
"epoch": 0.8243250838376079,
"grad_norm": 0.9103521704673767,
"learning_rate": 2.3626212858978894e-05,
"loss": 1.1598,
"step": 5500
},
{
"epoch": 0.8258238567173127,
"grad_norm": 0.8303549885749817,
"learning_rate": 2.3235458758124876e-05,
"loss": 1.1667,
"step": 5510
},
{
"epoch": 0.8273226295970174,
"grad_norm": 0.9100333452224731,
"learning_rate": 2.284769155026678e-05,
"loss": 1.1634,
"step": 5520
},
{
"epoch": 0.8288214024767222,
"grad_norm": 0.841582715511322,
"learning_rate": 2.2462920372180154e-05,
"loss": 1.1561,
"step": 5530
},
{
"epoch": 0.8303201753564269,
"grad_norm": 0.8073222637176514,
"learning_rate": 2.2081154290046445e-05,
"loss": 1.1584,
"step": 5540
},
{
"epoch": 0.8318189482361317,
"grad_norm": 0.8581207990646362,
"learning_rate": 2.170240229923954e-05,
"loss": 1.1547,
"step": 5550
},
{
"epoch": 0.8333177211158365,
"grad_norm": 0.859908401966095,
"learning_rate": 2.1326673324113603e-05,
"loss": 1.1783,
"step": 5560
},
{
"epoch": 0.8348164939955411,
"grad_norm": 0.8941123485565186,
"learning_rate": 2.0953976217792995e-05,
"loss": 1.1543,
"step": 5570
},
{
"epoch": 0.8363152668752459,
"grad_norm": 0.8776352405548096,
"learning_rate": 2.0584319761963532e-05,
"loss": 1.1656,
"step": 5580
},
{
"epoch": 0.8378140397549506,
"grad_norm": 0.8063647150993347,
"learning_rate": 2.021771266666568e-05,
"loss": 1.1668,
"step": 5590
},
{
"epoch": 0.8393128126346554,
"grad_norm": 0.856473445892334,
"learning_rate": 1.9854163570089175e-05,
"loss": 1.1535,
"step": 5600
},
{
"epoch": 0.8408115855143601,
"grad_norm": 0.7813337445259094,
"learning_rate": 1.9493681038369634e-05,
"loss": 1.163,
"step": 5610
},
{
"epoch": 0.8423103583940649,
"grad_norm": 0.854742705821991,
"learning_rate": 1.9136273565386674e-05,
"loss": 1.1579,
"step": 5620
},
{
"epoch": 0.8438091312737696,
"grad_norm": 0.8952386975288391,
"learning_rate": 1.8781949572563682e-05,
"loss": 1.167,
"step": 5630
},
{
"epoch": 0.8453079041534743,
"grad_norm": 0.8518572449684143,
"learning_rate": 1.843071740866957e-05,
"loss": 1.1531,
"step": 5640
},
{
"epoch": 0.8468066770331791,
"grad_norm": 0.9791853427886963,
"learning_rate": 1.808258534962179e-05,
"loss": 1.1657,
"step": 5650
},
{
"epoch": 0.8483054499128838,
"grad_norm": 0.8777291178703308,
"learning_rate": 1.7737561598291644e-05,
"loss": 1.1598,
"step": 5660
},
{
"epoch": 0.8498042227925886,
"grad_norm": 0.9136359095573425,
"learning_rate": 1.7395654284310743e-05,
"loss": 1.1567,
"step": 5670
},
{
"epoch": 0.8513029956722933,
"grad_norm": 0.8480714559555054,
"learning_rate": 1.7056871463879616e-05,
"loss": 1.1711,
"step": 5680
},
{
"epoch": 0.8528017685519981,
"grad_norm": 0.9030656814575195,
"learning_rate": 1.6721221119577778e-05,
"loss": 1.1501,
"step": 5690
},
{
"epoch": 0.8543005414317028,
"grad_norm": 0.845726490020752,
"learning_rate": 1.6388711160175744e-05,
"loss": 1.1734,
"step": 5700
},
{
"epoch": 0.8557993143114075,
"grad_norm": 0.8673616647720337,
"learning_rate": 1.6059349420448566e-05,
"loss": 1.1442,
"step": 5710
},
{
"epoch": 0.8572980871911122,
"grad_norm": 0.8604286909103394,
"learning_rate": 1.5733143660991354e-05,
"loss": 1.1631,
"step": 5720
},
{
"epoch": 0.858796860070817,
"grad_norm": 0.9111164808273315,
"learning_rate": 1.5410101568036266e-05,
"loss": 1.1671,
"step": 5730
},
{
"epoch": 0.8602956329505218,
"grad_norm": 0.8710470199584961,
"learning_rate": 1.509023075327151e-05,
"loss": 1.1632,
"step": 5740
},
{
"epoch": 0.8617944058302265,
"grad_norm": 1.0082249641418457,
"learning_rate": 1.4773538753662006e-05,
"loss": 1.1686,
"step": 5750
},
{
"epoch": 0.8632931787099313,
"grad_norm": 0.8322862386703491,
"learning_rate": 1.4460033031271707e-05,
"loss": 1.1677,
"step": 5760
},
{
"epoch": 0.864791951589636,
"grad_norm": 0.897473156452179,
"learning_rate": 1.4149720973087814e-05,
"loss": 1.1712,
"step": 5770
},
{
"epoch": 0.8662907244693407,
"grad_norm": 0.8400557041168213,
"learning_rate": 1.3842609890846795e-05,
"loss": 1.1481,
"step": 5780
},
{
"epoch": 0.8677894973490454,
"grad_norm": 0.8493039608001709,
"learning_rate": 1.353870702086195e-05,
"loss": 1.1645,
"step": 5790
},
{
"epoch": 0.8692882702287502,
"grad_norm": 0.9090917110443115,
"learning_rate": 1.3238019523853043e-05,
"loss": 1.1598,
"step": 5800
},
{
"epoch": 0.870787043108455,
"grad_norm": 0.8982146978378296,
"learning_rate": 1.2940554484777498e-05,
"loss": 1.1804,
"step": 5810
},
{
"epoch": 0.8722858159881597,
"grad_norm": 0.8175159096717834,
"learning_rate": 1.2646318912663522e-05,
"loss": 1.1681,
"step": 5820
},
{
"epoch": 0.8737845888678645,
"grad_norm": 0.7891288995742798,
"learning_rate": 1.235531974044484e-05,
"loss": 1.1586,
"step": 5830
},
{
"epoch": 0.8752833617475692,
"grad_norm": 0.8779332041740417,
"learning_rate": 1.2067563824797516e-05,
"loss": 1.1588,
"step": 5840
},
{
"epoch": 0.876782134627274,
"grad_norm": 0.882290244102478,
"learning_rate": 1.1783057945978203e-05,
"loss": 1.1521,
"step": 5850
},
{
"epoch": 0.8782809075069786,
"grad_norm": 0.8421468138694763,
"learning_rate": 1.1501808807664547e-05,
"loss": 1.1295,
"step": 5860
},
{
"epoch": 0.8797796803866834,
"grad_norm": 0.8814191222190857,
"learning_rate": 1.122382303679708e-05,
"loss": 1.1588,
"step": 5870
},
{
"epoch": 0.8812784532663881,
"grad_norm": 0.819195568561554,
"learning_rate": 1.0949107183423205e-05,
"loss": 1.1536,
"step": 5880
},
{
"epoch": 0.8827772261460929,
"grad_norm": 0.9204806685447693,
"learning_rate": 1.067766772054281e-05,
"loss": 1.1613,
"step": 5890
},
{
"epoch": 0.8842759990257977,
"grad_norm": 0.9718281030654907,
"learning_rate": 1.0409511043955664e-05,
"loss": 1.1609,
"step": 5900
},
{
"epoch": 0.8857747719055024,
"grad_norm": 0.8659542202949524,
"learning_rate": 1.0144643472110919e-05,
"loss": 1.1701,
"step": 5910
},
{
"epoch": 0.8872735447852071,
"grad_norm": 0.8663217425346375,
"learning_rate": 9.883071245957964e-06,
"loss": 1.1524,
"step": 5920
},
{
"epoch": 0.8887723176649118,
"grad_norm": 0.8552528023719788,
"learning_rate": 9.624800528799648e-06,
"loss": 1.1732,
"step": 5930
},
{
"epoch": 0.8902710905446166,
"grad_norm": 0.8265079855918884,
"learning_rate": 9.369837406146802e-06,
"loss": 1.1497,
"step": 5940
},
{
"epoch": 0.8917698634243213,
"grad_norm": 0.9253767132759094,
"learning_rate": 9.118187885575096e-06,
"loss": 1.1591,
"step": 5950
},
{
"epoch": 0.8932686363040261,
"grad_norm": 0.8842642307281494,
"learning_rate": 8.869857896583204e-06,
"loss": 1.1541,
"step": 5960
},
{
"epoch": 0.8947674091837308,
"grad_norm": 0.8671165108680725,
"learning_rate": 8.624853290453438e-06,
"loss": 1.1563,
"step": 5970
},
{
"epoch": 0.8962661820634356,
"grad_norm": 0.8760147094726562,
"learning_rate": 8.383179840113497e-06,
"loss": 1.1505,
"step": 5980
},
{
"epoch": 0.8977649549431403,
"grad_norm": 0.8540050983428955,
"learning_rate": 8.144843240000737e-06,
"loss": 1.144,
"step": 5990
},
{
"epoch": 0.899263727822845,
"grad_norm": 0.9016329050064087,
"learning_rate": 7.909849105927907e-06,
"loss": 1.1496,
"step": 6000
},
{
"epoch": 0.9007625007025498,
"grad_norm": 0.8828489780426025,
"learning_rate": 7.678202974950687e-06,
"loss": 1.1526,
"step": 6010
},
{
"epoch": 0.9010622552784907,
"eval_loss": 1.1948587894439697,
"eval_runtime": 36.3071,
"eval_samples_per_second": 688.57,
"eval_steps_per_second": 86.071,
"step": 6012
},
{
"epoch": 0.9022612735822545,
"grad_norm": 0.8661078810691833,
"learning_rate": 7.4499103052374945e-06,
"loss": 1.1705,
"step": 6020
},
{
"epoch": 0.9037600464619593,
"grad_norm": 0.8815616369247437,
"learning_rate": 7.224976475940603e-06,
"loss": 1.1633,
"step": 6030
},
{
"epoch": 0.905258819341664,
"grad_norm": 0.8972934484481812,
"learning_rate": 7.00340678706961e-06,
"loss": 1.153,
"step": 6040
},
{
"epoch": 0.9067575922213688,
"grad_norm": 0.8003550171852112,
"learning_rate": 6.785206459366355e-06,
"loss": 1.168,
"step": 6050
},
{
"epoch": 0.9082563651010735,
"grad_norm": 0.856256902217865,
"learning_rate": 6.570380634182098e-06,
"loss": 1.156,
"step": 6060
},
{
"epoch": 0.9097551379807782,
"grad_norm": 0.8487430214881897,
"learning_rate": 6.3589343733563055e-06,
"loss": 1.1568,
"step": 6070
},
{
"epoch": 0.911253910860483,
"grad_norm": 0.8809577226638794,
"learning_rate": 6.150872659097255e-06,
"loss": 1.1517,
"step": 6080
},
{
"epoch": 0.9127526837401877,
"grad_norm": 0.8905763626098633,
"learning_rate": 5.946200393864886e-06,
"loss": 1.1632,
"step": 6090
},
{
"epoch": 0.9142514566198925,
"grad_norm": 0.8179495334625244,
"learning_rate": 5.74492240025502e-06,
"loss": 1.1528,
"step": 6100
},
{
"epoch": 0.9157502294995972,
"grad_norm": 0.9552949070930481,
"learning_rate": 5.547043420886005e-06,
"loss": 1.1442,
"step": 6110
},
{
"epoch": 0.917249002379302,
"grad_norm": 0.9348698854446411,
"learning_rate": 5.352568118286671e-06,
"loss": 1.1548,
"step": 6120
},
{
"epoch": 0.9187477752590066,
"grad_norm": 0.8429141640663147,
"learning_rate": 5.16150107478675e-06,
"loss": 1.1619,
"step": 6130
},
{
"epoch": 0.9202465481387114,
"grad_norm": 0.8613331913948059,
"learning_rate": 4.973846792408681e-06,
"loss": 1.1575,
"step": 6140
},
{
"epoch": 0.9217453210184162,
"grad_norm": 0.8871680498123169,
"learning_rate": 4.7896096927616925e-06,
"loss": 1.1686,
"step": 6150
},
{
"epoch": 0.9232440938981209,
"grad_norm": 0.850810170173645,
"learning_rate": 4.608794116937487e-06,
"loss": 1.1697,
"step": 6160
},
{
"epoch": 0.9247428667778257,
"grad_norm": 0.8758169412612915,
"learning_rate": 4.4314043254080725e-06,
"loss": 1.1813,
"step": 6170
},
{
"epoch": 0.9262416396575304,
"grad_norm": 0.8653439879417419,
"learning_rate": 4.257444497925328e-06,
"loss": 1.1517,
"step": 6180
},
{
"epoch": 0.9277404125372352,
"grad_norm": 0.8698475360870361,
"learning_rate": 4.086918733422429e-06,
"loss": 1.1531,
"step": 6190
},
{
"epoch": 0.9292391854169398,
"grad_norm": 0.9468777179718018,
"learning_rate": 3.919831049917444e-06,
"loss": 1.1491,
"step": 6200
},
{
"epoch": 0.9307379582966446,
"grad_norm": 0.8888046741485596,
"learning_rate": 3.7561853844185084e-06,
"loss": 1.1593,
"step": 6210
},
{
"epoch": 0.9322367311763493,
"grad_norm": 0.926652729511261,
"learning_rate": 3.595985592831102e-06,
"loss": 1.1669,
"step": 6220
},
{
"epoch": 0.9337355040560541,
"grad_norm": 0.8611651659011841,
"learning_rate": 3.43923544986725e-06,
"loss": 1.1625,
"step": 6230
},
{
"epoch": 0.9352342769357589,
"grad_norm": 0.8281158804893494,
"learning_rate": 3.285938648956482e-06,
"loss": 1.1468,
"step": 6240
},
{
"epoch": 0.9367330498154636,
"grad_norm": 0.833707332611084,
"learning_rate": 3.1360988021589483e-06,
"loss": 1.1656,
"step": 6250
},
{
"epoch": 0.9382318226951684,
"grad_norm": 0.912096381187439,
"learning_rate": 2.989719440080124e-06,
"loss": 1.1436,
"step": 6260
},
{
"epoch": 0.939730595574873,
"grad_norm": 0.8979214429855347,
"learning_rate": 2.8468040117878065e-06,
"loss": 1.1524,
"step": 6270
},
{
"epoch": 0.9412293684545778,
"grad_norm": 0.8215987682342529,
"learning_rate": 2.70735588473065e-06,
"loss": 1.157,
"step": 6280
},
{
"epoch": 0.9427281413342825,
"grad_norm": 0.8943039774894714,
"learning_rate": 2.571378344659042e-06,
"loss": 1.16,
"step": 6290
},
{
"epoch": 0.9442269142139873,
"grad_norm": 0.8807237148284912,
"learning_rate": 2.438874595547485e-06,
"loss": 1.1391,
"step": 6300
},
{
"epoch": 0.9457256870936921,
"grad_norm": 0.8931160569190979,
"learning_rate": 2.3098477595192566e-06,
"loss": 1.1675,
"step": 6310
},
{
"epoch": 0.9472244599733968,
"grad_norm": 0.8743401765823364,
"learning_rate": 2.1843008767726823e-06,
"loss": 1.1542,
"step": 6320
},
{
"epoch": 0.9487232328531016,
"grad_norm": 0.8389233946800232,
"learning_rate": 2.062236905509712e-06,
"loss": 1.1517,
"step": 6330
},
{
"epoch": 0.9502220057328062,
"grad_norm": 0.911375880241394,
"learning_rate": 1.9436587218659593e-06,
"loss": 1.1294,
"step": 6340
},
{
"epoch": 0.951720778612511,
"grad_norm": 0.8729566931724548,
"learning_rate": 1.828569119843204e-06,
"loss": 1.1672,
"step": 6350
},
{
"epoch": 0.9532195514922157,
"grad_norm": 0.9174323081970215,
"learning_rate": 1.716970811243329e-06,
"loss": 1.1851,
"step": 6360
},
{
"epoch": 0.9547183243719205,
"grad_norm": 0.8884388208389282,
"learning_rate": 1.6088664256045713e-06,
"loss": 1.1473,
"step": 6370
},
{
"epoch": 0.9562170972516252,
"grad_norm": 0.8815491795539856,
"learning_rate": 1.5042585101395055e-06,
"loss": 1.1601,
"step": 6380
},
{
"epoch": 0.95771587013133,
"grad_norm": 0.869717538356781,
"learning_rate": 1.4031495296749906e-06,
"loss": 1.1522,
"step": 6390
},
{
"epoch": 0.9592146430110348,
"grad_norm": 1.0628986358642578,
"learning_rate": 1.3055418665942009e-06,
"loss": 1.1279,
"step": 6400
},
{
"epoch": 0.9607134158907394,
"grad_norm": 0.8930190801620483,
"learning_rate": 1.21143782078037e-06,
"loss": 1.1602,
"step": 6410
},
{
"epoch": 0.9622121887704442,
"grad_norm": 1.1220424175262451,
"learning_rate": 1.1208396095626682e-06,
"loss": 1.1703,
"step": 6420
},
{
"epoch": 0.9637109616501489,
"grad_norm": 0.8779110312461853,
"learning_rate": 1.0337493676639442e-06,
"loss": 1.1473,
"step": 6430
},
{
"epoch": 0.9652097345298537,
"grad_norm": 0.9182150363922119,
"learning_rate": 9.501691471504146e-07,
"loss": 1.164,
"step": 6440
},
{
"epoch": 0.9667085074095584,
"grad_norm": 0.8875131607055664,
"learning_rate": 8.70100917383354e-07,
"loss": 1.1555,
"step": 6450
},
{
"epoch": 0.9682072802892632,
"grad_norm": 0.9799543619155884,
"learning_rate": 7.935465649726136e-07,
"loss": 1.1604,
"step": 6460
},
{
"epoch": 0.969706053168968,
"grad_norm": 0.8576260805130005,
"learning_rate": 7.205078937322417e-07,
"loss": 1.1585,
"step": 6470
},
{
"epoch": 0.9712048260486726,
"grad_norm": 0.9388023614883423,
"learning_rate": 6.50986624637917e-07,
"loss": 1.1675,
"step": 6480
},
{
"epoch": 0.9727035989283774,
"grad_norm": 0.8880642056465149,
"learning_rate": 5.849843957864808e-07,
"loss": 1.1591,
"step": 6490
},
{
"epoch": 0.9742023718080821,
"grad_norm": 0.9257229566574097,
"learning_rate": 5.225027623572686e-07,
"loss": 1.1445,
"step": 6500
},
{
"epoch": 0.9757011446877869,
"grad_norm": 0.9189469814300537,
"learning_rate": 4.635431965754888e-07,
"loss": 1.143,
"step": 6510
},
{
"epoch": 0.9771999175674916,
"grad_norm": 0.9432305693626404,
"learning_rate": 4.081070876775172e-07,
"loss": 1.1525,
"step": 6520
},
{
"epoch": 0.9786986904471964,
"grad_norm": 0.915924072265625,
"learning_rate": 3.5619574187822354e-07,
"loss": 1.1687,
"step": 6530
},
{
"epoch": 0.980197463326901,
"grad_norm": 0.8820152878761292,
"learning_rate": 3.078103823401123e-07,
"loss": 1.1697,
"step": 6540
},
{
"epoch": 0.9816962362066058,
"grad_norm": 0.8938290476799011,
"learning_rate": 2.629521491445463e-07,
"loss": 1.1666,
"step": 6550
},
{
"epoch": 0.9831950090863106,
"grad_norm": 0.8603911995887756,
"learning_rate": 2.216220992648843e-07,
"loss": 1.1634,
"step": 6560
},
{
"epoch": 0.9846937819660153,
"grad_norm": 0.8677682280540466,
"learning_rate": 1.8382120654156785e-07,
"loss": 1.1641,
"step": 6570
},
{
"epoch": 0.9861925548457201,
"grad_norm": 0.8882606625556946,
"learning_rate": 1.495503616591731e-07,
"loss": 1.1415,
"step": 6580
},
{
"epoch": 0.9876913277254248,
"grad_norm": 0.807101309299469,
"learning_rate": 1.1881037212542744e-07,
"loss": 1.1574,
"step": 6590
},
{
"epoch": 0.9891901006051296,
"grad_norm": 0.8506379723548889,
"learning_rate": 9.160196225217465e-08,
"loss": 1.1653,
"step": 6600
},
{
"epoch": 0.9906888734848343,
"grad_norm": 0.871025800704956,
"learning_rate": 6.792577313833868e-08,
"loss": 1.1458,
"step": 6610
},
{
"epoch": 0.992187646364539,
"grad_norm": 0.8364571928977966,
"learning_rate": 4.778236265475244e-08,
"loss": 1.1727,
"step": 6620
},
{
"epoch": 0.9936864192442437,
"grad_norm": 0.8232174515724182,
"learning_rate": 3.117220543110144e-08,
"loss": 1.1693,
"step": 6630
},
{
"epoch": 0.9951851921239485,
"grad_norm": 0.8296674489974976,
"learning_rate": 1.8095692844649625e-08,
"loss": 1.1582,
"step": 6640
},
{
"epoch": 0.9966839650036533,
"grad_norm": 0.8697881698608398,
"learning_rate": 8.553133011113267e-09,
"loss": 1.161,
"step": 6650
},
{
"epoch": 0.998182737883358,
"grad_norm": 0.9544343948364258,
"learning_rate": 2.544750777316862e-09,
"loss": 1.1713,
"step": 6660
},
{
"epoch": 0.9996815107630628,
"grad_norm": 0.9088115096092224,
"learning_rate": 7.068771591400845e-11,
"loss": 1.1531,
"step": 6670
}
],
"logging_steps": 10,
"max_steps": 6672,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 668,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2428429941080064.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}