{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9993075751280986,
"eval_steps": 50.0,
"global_step": 2706,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007385865300281586,
"grad_norm": 56.16927719116211,
"learning_rate": 6.666666666666667e-07,
"loss": 2.2986,
"step": 10
},
{
"epoch": 0.014771730600563172,
"grad_norm": 52.235191345214844,
"learning_rate": 1.3333333333333334e-06,
"loss": 2.1946,
"step": 20
},
{
"epoch": 0.02215759590084476,
"grad_norm": 14.341564178466797,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.9498,
"step": 30
},
{
"epoch": 0.029543461201126345,
"grad_norm": 9.743363380432129,
"learning_rate": 2.666666666666667e-06,
"loss": 1.7599,
"step": 40
},
{
"epoch": 0.03692932650140793,
"grad_norm": 10.694592475891113,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.77,
"step": 50
},
{
"epoch": 0.04431519180168952,
"grad_norm": 6.704069137573242,
"learning_rate": 4.000000000000001e-06,
"loss": 1.5609,
"step": 60
},
{
"epoch": 0.0517010571019711,
"grad_norm": 5.9378342628479,
"learning_rate": 4.666666666666667e-06,
"loss": 1.5134,
"step": 70
},
{
"epoch": 0.05908692240225269,
"grad_norm": 5.821998119354248,
"learning_rate": 5.333333333333334e-06,
"loss": 1.4795,
"step": 80
},
{
"epoch": 0.06647278770253427,
"grad_norm": 6.466773986816406,
"learning_rate": 6e-06,
"loss": 1.4666,
"step": 90
},
{
"epoch": 0.07385865300281585,
"grad_norm": 5.7971625328063965,
"learning_rate": 6.666666666666667e-06,
"loss": 1.4187,
"step": 100
},
{
"epoch": 0.08124451830309745,
"grad_norm": 19.75885581970215,
"learning_rate": 7.333333333333333e-06,
"loss": 1.4012,
"step": 110
},
{
"epoch": 0.08863038360337903,
"grad_norm": 6.692321300506592,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3932,
"step": 120
},
{
"epoch": 0.09601624890366062,
"grad_norm": 8.816634178161621,
"learning_rate": 8.666666666666668e-06,
"loss": 1.3924,
"step": 130
},
{
"epoch": 0.1034021142039422,
"grad_norm": 6.486945152282715,
"learning_rate": 9.333333333333334e-06,
"loss": 1.3117,
"step": 140
},
{
"epoch": 0.1107879795042238,
"grad_norm": 8.362743377685547,
"learning_rate": 1e-05,
"loss": 1.2642,
"step": 150
},
{
"epoch": 0.11817384480450538,
"grad_norm": 7.534619331359863,
"learning_rate": 1.0666666666666667e-05,
"loss": 1.2891,
"step": 160
},
{
"epoch": 0.12555971010478698,
"grad_norm": 7.239850997924805,
"learning_rate": 1.1333333333333334e-05,
"loss": 1.2664,
"step": 170
},
{
"epoch": 0.13294557540506854,
"grad_norm": 6.650047779083252,
"learning_rate": 1.2e-05,
"loss": 1.2494,
"step": 180
},
{
"epoch": 0.14033144070535014,
"grad_norm": 5.859479904174805,
"learning_rate": 1.2666666666666667e-05,
"loss": 1.2844,
"step": 190
},
{
"epoch": 0.1477173060056317,
"grad_norm": 7.5547027587890625,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.2898,
"step": 200
},
{
"epoch": 0.1551031713059133,
"grad_norm": 8.316688537597656,
"learning_rate": 1.4e-05,
"loss": 1.1774,
"step": 210
},
{
"epoch": 0.1624890366061949,
"grad_norm": 7.763572692871094,
"learning_rate": 1.4666666666666666e-05,
"loss": 1.2881,
"step": 220
},
{
"epoch": 0.16987490190647647,
"grad_norm": 7.132694244384766,
"learning_rate": 1.5333333333333334e-05,
"loss": 1.2359,
"step": 230
},
{
"epoch": 0.17726076720675807,
"grad_norm": 6.167331218719482,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.1939,
"step": 240
},
{
"epoch": 0.18464663250703967,
"grad_norm": 7.399999141693115,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.2213,
"step": 250
},
{
"epoch": 0.19203249780732123,
"grad_norm": 5.161776065826416,
"learning_rate": 1.7333333333333336e-05,
"loss": 1.1747,
"step": 260
},
{
"epoch": 0.19941836310760283,
"grad_norm": 9.162799835205078,
"learning_rate": 1.8e-05,
"loss": 1.1751,
"step": 270
},
{
"epoch": 0.2068042284078844,
"grad_norm": 6.043332576751709,
"learning_rate": 1.866666666666667e-05,
"loss": 1.181,
"step": 280
},
{
"epoch": 0.214190093708166,
"grad_norm": 5.533137798309326,
"learning_rate": 1.9333333333333333e-05,
"loss": 1.1727,
"step": 290
},
{
"epoch": 0.2215759590084476,
"grad_norm": 4.7085862159729,
"learning_rate": 2e-05,
"loss": 1.3127,
"step": 300
},
{
"epoch": 0.22896182430872916,
"grad_norm": 5.2254815101623535,
"learning_rate": 1.9999147543290536e-05,
"loss": 1.1853,
"step": 310
},
{
"epoch": 0.23634768960901076,
"grad_norm": 5.015223503112793,
"learning_rate": 1.999659031849863e-05,
"loss": 1.1846,
"step": 320
},
{
"epoch": 0.24373355490929235,
"grad_norm": 6.505156993865967,
"learning_rate": 1.9992328761608965e-05,
"loss": 1.1572,
"step": 330
},
{
"epoch": 0.25111942020957395,
"grad_norm": 4.331061840057373,
"learning_rate": 1.99863635991801e-05,
"loss": 1.0744,
"step": 340
},
{
"epoch": 0.2585052855098555,
"grad_norm": 6.760496616363525,
"learning_rate": 1.997869584822058e-05,
"loss": 1.1019,
"step": 350
},
{
"epoch": 0.2658911508101371,
"grad_norm": 6.3948235511779785,
"learning_rate": 1.9969326816015556e-05,
"loss": 1.1073,
"step": 360
},
{
"epoch": 0.2732770161104187,
"grad_norm": 5.087249279022217,
"learning_rate": 1.9958258099903894e-05,
"loss": 1.0751,
"step": 370
},
{
"epoch": 0.2806628814107003,
"grad_norm": 10.829612731933594,
"learning_rate": 1.9945491587005867e-05,
"loss": 1.083,
"step": 380
},
{
"epoch": 0.28804874671098185,
"grad_norm": 5.7423176765441895,
"learning_rate": 1.9931029453901384e-05,
"loss": 1.0639,
"step": 390
},
{
"epoch": 0.2954346120112634,
"grad_norm": 4.613246440887451,
"learning_rate": 1.9914874166258927e-05,
"loss": 1.0604,
"step": 400
},
{
"epoch": 0.30282047731154504,
"grad_norm": 4.079463005065918,
"learning_rate": 1.9897028478415165e-05,
"loss": 1.0017,
"step": 410
},
{
"epoch": 0.3102063426118266,
"grad_norm": 4.641962051391602,
"learning_rate": 1.9877495432905363e-05,
"loss": 1.0263,
"step": 420
},
{
"epoch": 0.3175922079121082,
"grad_norm": 6.14805269241333,
"learning_rate": 1.9856278359944664e-05,
"loss": 1.0451,
"step": 430
},
{
"epoch": 0.3249780732123898,
"grad_norm": 5.665846824645996,
"learning_rate": 1.9833380876860305e-05,
"loss": 1.0361,
"step": 440
},
{
"epoch": 0.3323639385126714,
"grad_norm": 7.826813220977783,
"learning_rate": 1.9808806887474907e-05,
"loss": 0.9795,
"step": 450
},
{
"epoch": 0.33974980381295294,
"grad_norm": 4.955426216125488,
"learning_rate": 1.9782560581440894e-05,
"loss": 1.0433,
"step": 460
},
{
"epoch": 0.34713566911323457,
"grad_norm": 5.327470302581787,
"learning_rate": 1.97546464335262e-05,
"loss": 0.9605,
"step": 470
},
{
"epoch": 0.35452153441351614,
"grad_norm": 4.838713645935059,
"learning_rate": 1.972506920285136e-05,
"loss": 0.9935,
"step": 480
},
{
"epoch": 0.3619073997137977,
"grad_norm": 6.030056476593018,
"learning_rate": 1.969383393207813e-05,
"loss": 1.0043,
"step": 490
},
{
"epoch": 0.36929326501407933,
"grad_norm": 5.917972087860107,
"learning_rate": 1.9660945946549727e-05,
"loss": 0.9701,
"step": 500
},
{
"epoch": 0.3766791303143609,
"grad_norm": 4.341779708862305,
"learning_rate": 1.962641085338294e-05,
"loss": 0.9913,
"step": 510
},
{
"epoch": 0.38406499561464247,
"grad_norm": 4.399661064147949,
"learning_rate": 1.959023454051215e-05,
"loss": 0.9196,
"step": 520
},
{
"epoch": 0.39145086091492404,
"grad_norm": 4.028534412384033,
"learning_rate": 1.9552423175685478e-05,
"loss": 0.9369,
"step": 530
},
{
"epoch": 0.39883672621520566,
"grad_norm": 4.389466285705566,
"learning_rate": 1.9512983205413253e-05,
"loss": 1.0191,
"step": 540
},
{
"epoch": 0.40622259151548723,
"grad_norm": 5.277081489562988,
"learning_rate": 1.9471921353868932e-05,
"loss": 0.9399,
"step": 550
},
{
"epoch": 0.4136084568157688,
"grad_norm": 4.73630428314209,
"learning_rate": 1.9429244621742685e-05,
"loss": 0.9588,
"step": 560
},
{
"epoch": 0.4209943221160504,
"grad_norm": 3.3033573627471924,
"learning_rate": 1.938496028504784e-05,
"loss": 0.9038,
"step": 570
},
{
"epoch": 0.428380187416332,
"grad_norm": 7.80294942855835,
"learning_rate": 1.9339075893880382e-05,
"loss": 0.9403,
"step": 580
},
{
"epoch": 0.43576605271661356,
"grad_norm": 4.098162651062012,
"learning_rate": 1.9291599271131737e-05,
"loss": 0.9344,
"step": 590
},
{
"epoch": 0.4431519180168952,
"grad_norm": 3.7808070182800293,
"learning_rate": 1.9242538511155024e-05,
"loss": 0.8939,
"step": 600
},
{
"epoch": 0.45053778331717675,
"grad_norm": 4.160403728485107,
"learning_rate": 1.9191901978385048e-05,
"loss": 0.8786,
"step": 610
},
{
"epoch": 0.4579236486174583,
"grad_norm": 3.7800965309143066,
"learning_rate": 1.9139698305912227e-05,
"loss": 0.8977,
"step": 620
},
{
"epoch": 0.46530951391773995,
"grad_norm": 3.8200621604919434,
"learning_rate": 1.9085936394010733e-05,
"loss": 0.8793,
"step": 630
},
{
"epoch": 0.4726953792180215,
"grad_norm": 4.453779220581055,
"learning_rate": 1.903062540862107e-05,
"loss": 0.8813,
"step": 640
},
{
"epoch": 0.4800812445183031,
"grad_norm": 5.653434753417969,
"learning_rate": 1.897377477978736e-05,
"loss": 0.9544,
"step": 650
},
{
"epoch": 0.4874671098185847,
"grad_norm": 4.868826389312744,
"learning_rate": 1.8915394200049597e-05,
"loss": 0.8858,
"step": 660
},
{
"epoch": 0.4948529751188663,
"grad_norm": 4.187640190124512,
"learning_rate": 1.8855493622791163e-05,
"loss": 0.9077,
"step": 670
},
{
"epoch": 0.5022388404191479,
"grad_norm": 4.503122806549072,
"learning_rate": 1.8794083260541853e-05,
"loss": 0.9278,
"step": 680
},
{
"epoch": 0.5096247057194294,
"grad_norm": 4.902103900909424,
"learning_rate": 1.8731173583236737e-05,
"loss": 0.8281,
"step": 690
},
{
"epoch": 0.517010571019711,
"grad_norm": 4.273303031921387,
"learning_rate": 1.8666775316431113e-05,
"loss": 0.8054,
"step": 700
},
{
"epoch": 0.5243964363199927,
"grad_norm": 55.874359130859375,
"learning_rate": 1.8600899439471902e-05,
"loss": 0.8091,
"step": 710
},
{
"epoch": 0.5317823016202742,
"grad_norm": 4.271385192871094,
"learning_rate": 1.8533557183625773e-05,
"loss": 0.788,
"step": 720
},
{
"epoch": 0.5391681669205558,
"grad_norm": 5.59772253036499,
"learning_rate": 1.8464760030164287e-05,
"loss": 0.7942,
"step": 730
},
{
"epoch": 0.5465540322208374,
"grad_norm": 3.724728584289551,
"learning_rate": 1.8394519708406454e-05,
"loss": 0.8234,
"step": 740
},
{
"epoch": 0.5539398975211189,
"grad_norm": 5.2906036376953125,
"learning_rate": 1.8322848193718984e-05,
"loss": 0.8143,
"step": 750
},
{
"epoch": 0.5613257628214006,
"grad_norm": 5.114410877227783,
"learning_rate": 1.82497577054746e-05,
"loss": 0.7946,
"step": 760
},
{
"epoch": 0.5687116281216821,
"grad_norm": 4.730770587921143,
"learning_rate": 1.8175260704968716e-05,
"loss": 0.7771,
"step": 770
},
{
"epoch": 0.5760974934219637,
"grad_norm": 3.0836727619171143,
"learning_rate": 1.809936989329492e-05,
"loss": 0.739,
"step": 780
},
{
"epoch": 0.5834833587222453,
"grad_norm": 2.7664663791656494,
"learning_rate": 1.802209820917952e-05,
"loss": 0.731,
"step": 790
},
{
"epoch": 0.5908692240225268,
"grad_norm": 3.5617446899414062,
"learning_rate": 1.7943458826775646e-05,
"loss": 0.6807,
"step": 800
},
{
"epoch": 0.5982550893228085,
"grad_norm": 7.652963638305664,
"learning_rate": 1.786346515341712e-05,
"loss": 0.6883,
"step": 810
},
{
"epoch": 0.6056409546230901,
"grad_norm": 3.5472395420074463,
"learning_rate": 1.778213082733266e-05,
"loss": 0.6822,
"step": 820
},
{
"epoch": 0.6130268199233716,
"grad_norm": 4.652453899383545,
"learning_rate": 1.7699469715320663e-05,
"loss": 0.6508,
"step": 830
},
{
"epoch": 0.6204126852236532,
"grad_norm": 3.976405620574951,
"learning_rate": 1.7615495910385036e-05,
"loss": 0.6007,
"step": 840
},
{
"epoch": 0.6277985505239349,
"grad_norm": 3.0713090896606445,
"learning_rate": 1.7530223729332464e-05,
"loss": 0.6174,
"step": 850
},
{
"epoch": 0.6351844158242164,
"grad_norm": 4.036540508270264,
"learning_rate": 1.7443667710331523e-05,
"loss": 0.617,
"step": 860
},
{
"epoch": 0.642570281124498,
"grad_norm": 7.731866836547852,
"learning_rate": 1.7355842610434045e-05,
"loss": 0.6245,
"step": 870
},
{
"epoch": 0.6499561464247796,
"grad_norm": 4.550940036773682,
"learning_rate": 1.7266763403059162e-05,
"loss": 0.593,
"step": 880
},
{
"epoch": 0.6573420117250611,
"grad_norm": 2.5473084449768066,
"learning_rate": 1.7176445275440468e-05,
"loss": 0.5677,
"step": 890
},
{
"epoch": 0.6647278770253428,
"grad_norm": 2.1716835498809814,
"learning_rate": 1.7084903626036743e-05,
"loss": 0.5452,
"step": 900
},
{
"epoch": 0.6721137423256244,
"grad_norm": 4.398560523986816,
"learning_rate": 1.6992154061906637e-05,
"loss": 0.5599,
"step": 910
},
{
"epoch": 0.6794996076259059,
"grad_norm": 2.8742692470550537,
"learning_rate": 1.6898212396047788e-05,
"loss": 0.5004,
"step": 920
},
{
"epoch": 0.6868854729261875,
"grad_norm": 3.202517032623291,
"learning_rate": 1.6803094644700878e-05,
"loss": 0.5079,
"step": 930
},
{
"epoch": 0.6942713382264691,
"grad_norm": 5.449188232421875,
"learning_rate": 1.6706817024618966e-05,
"loss": 0.5122,
"step": 940
},
{
"epoch": 0.7016572035267507,
"grad_norm": 5.538541316986084,
"learning_rate": 1.6609395950302693e-05,
"loss": 0.5241,
"step": 950
},
{
"epoch": 0.7090430688270323,
"grad_norm": 3.380526304244995,
"learning_rate": 1.6510848031201755e-05,
"loss": 0.4631,
"step": 960
},
{
"epoch": 0.7164289341273139,
"grad_norm": 3.240527629852295,
"learning_rate": 1.6411190068883114e-05,
"loss": 0.5214,
"step": 970
},
{
"epoch": 0.7238147994275954,
"grad_norm": 16.668127059936523,
"learning_rate": 1.63104390541665e-05,
"loss": 0.5373,
"step": 980
},
{
"epoch": 0.731200664727877,
"grad_norm": 3.9278078079223633,
"learning_rate": 1.6208612164227605e-05,
"loss": 0.4789,
"step": 990
},
{
"epoch": 0.7385865300281587,
"grad_norm": 3.5258326530456543,
"learning_rate": 1.6105726759669534e-05,
"loss": 0.465,
"step": 1000
},
{
"epoch": 0.7459723953284402,
"grad_norm": 2.779311418533325,
"learning_rate": 1.600180038156298e-05,
"loss": 0.4501,
"step": 1010
},
{
"epoch": 0.7533582606287218,
"grad_norm": 3.857485771179199,
"learning_rate": 1.58968507484556e-05,
"loss": 0.4519,
"step": 1020
},
{
"epoch": 0.7607441259290034,
"grad_norm": 2.959052324295044,
"learning_rate": 1.579089575335117e-05,
"loss": 0.4357,
"step": 1030
},
{
"epoch": 0.7681299912292849,
"grad_norm": 1.8662097454071045,
"learning_rate": 1.568395346065899e-05,
"loss": 0.3633,
"step": 1040
},
{
"epoch": 0.7755158565295666,
"grad_norm": 5.543001174926758,
"learning_rate": 1.5576042103114043e-05,
"loss": 0.4111,
"step": 1050
},
{
"epoch": 0.7829017218298481,
"grad_norm": 6.083206653594971,
"learning_rate": 1.5467180078668485e-05,
"loss": 0.3764,
"step": 1060
},
{
"epoch": 0.7902875871301297,
"grad_norm": 2.5218305587768555,
"learning_rate": 1.5357385947354945e-05,
"loss": 0.374,
"step": 1070
},
{
"epoch": 0.7976734524304113,
"grad_norm": 4.317601680755615,
"learning_rate": 1.52466784281222e-05,
"loss": 0.3571,
"step": 1080
},
{
"epoch": 0.8050593177306928,
"grad_norm": 2.0782041549682617,
"learning_rate": 1.5135076395643765e-05,
"loss": 0.3739,
"step": 1090
},
{
"epoch": 0.8124451830309745,
"grad_norm": 2.443953037261963,
"learning_rate": 1.5022598877099913e-05,
"loss": 0.3607,
"step": 1100
},
{
"epoch": 0.8198310483312561,
"grad_norm": 2.276827573776245,
"learning_rate": 1.4909265048933716e-05,
"loss": 0.3607,
"step": 1110
},
{
"epoch": 0.8272169136315376,
"grad_norm": 2.808431386947632,
"learning_rate": 1.4795094233581616e-05,
"loss": 0.3387,
"step": 1120
},
{
"epoch": 0.8346027789318192,
"grad_norm": 2.5325915813446045,
"learning_rate": 1.468010589617913e-05,
"loss": 0.3172,
"step": 1130
},
{
"epoch": 0.8419886442321008,
"grad_norm": 2.4943833351135254,
"learning_rate": 1.4564319641242202e-05,
"loss": 0.3193,
"step": 1140
},
{
"epoch": 0.8493745095323824,
"grad_norm": 2.2182066440582275,
"learning_rate": 1.4447755209324807e-05,
"loss": 0.3118,
"step": 1150
},
{
"epoch": 0.856760374832664,
"grad_norm": 1.920409083366394,
"learning_rate": 1.4330432473653369e-05,
"loss": 0.3246,
"step": 1160
},
{
"epoch": 0.8641462401329456,
"grad_norm": 3.2863781452178955,
"learning_rate": 1.4212371436738518e-05,
"loss": 0.3065,
"step": 1170
},
{
"epoch": 0.8715321054332271,
"grad_norm": 2.6266987323760986,
"learning_rate": 1.4093592226964863e-05,
"loss": 0.2813,
"step": 1180
},
{
"epoch": 0.8789179707335087,
"grad_norm": 2.526742935180664,
"learning_rate": 1.3974115095159273e-05,
"loss": 0.284,
"step": 1190
},
{
"epoch": 0.8863038360337904,
"grad_norm": 2.1190872192382812,
"learning_rate": 1.3853960411138272e-05,
"loss": 0.2865,
"step": 1200
},
{
"epoch": 0.8936897013340719,
"grad_norm": 3.0260584354400635,
"learning_rate": 1.373314866023517e-05,
"loss": 0.3019,
"step": 1210
},
{
"epoch": 0.9010755666343535,
"grad_norm": 4.537729740142822,
"learning_rate": 1.3611700439807503e-05,
"loss": 0.2946,
"step": 1220
},
{
"epoch": 0.9084614319346351,
"grad_norm": 3.150209903717041,
"learning_rate": 1.3489636455725337e-05,
"loss": 0.2795,
"step": 1230
},
{
"epoch": 0.9158472972349166,
"grad_norm": 1.6362818479537964,
"learning_rate": 1.336697751884111e-05,
"loss": 0.2815,
"step": 1240
},
{
"epoch": 0.9232331625351983,
"grad_norm": 1.3282984495162964,
"learning_rate": 1.3243744541441578e-05,
"loss": 0.2679,
"step": 1250
},
{
"epoch": 0.9306190278354799,
"grad_norm": 4.261312961578369,
"learning_rate": 1.3119958533682417e-05,
"loss": 0.2634,
"step": 1260
},
{
"epoch": 0.9380048931357614,
"grad_norm": 2.1109001636505127,
"learning_rate": 1.2995640600006196e-05,
"loss": 0.2566,
"step": 1270
},
{
"epoch": 0.945390758436043,
"grad_norm": 2.4117610454559326,
"learning_rate": 1.2870811935544252e-05,
"loss": 0.2502,
"step": 1280
},
{
"epoch": 0.9527766237363247,
"grad_norm": 2.0748672485351562,
"learning_rate": 1.2745493822503096e-05,
"loss": 0.2422,
"step": 1290
},
{
"epoch": 0.9601624890366062,
"grad_norm": 3.0310394763946533,
"learning_rate": 1.261970762653598e-05,
"loss": 0.2508,
"step": 1300
},
{
"epoch": 0.9675483543368878,
"grad_norm": 2.0341477394104004,
"learning_rate": 1.2493474793100249e-05,
"loss": 0.2467,
"step": 1310
},
{
"epoch": 0.9749342196371694,
"grad_norm": 1.4582960605621338,
"learning_rate": 1.2366816843801066e-05,
"loss": 0.2479,
"step": 1320
},
{
"epoch": 0.9823200849374509,
"grad_norm": 3.3330225944519043,
"learning_rate": 1.2239755372722169e-05,
"loss": 0.2516,
"step": 1330
},
{
"epoch": 0.9897059502377326,
"grad_norm": 1.4349642992019653,
"learning_rate": 1.2112312042744263e-05,
"loss": 0.2153,
"step": 1340
},
{
"epoch": 0.9970918155380141,
"grad_norm": 2.073673725128174,
"learning_rate": 1.1984508581851694e-05,
"loss": 0.1858,
"step": 1350
},
{
"epoch": 1.0051701057101972,
"grad_norm": 4.247702598571777,
"learning_rate": 1.1856366779428008e-05,
"loss": 0.2183,
"step": 1360
},
{
"epoch": 1.0125559710104788,
"grad_norm": 4.242294788360596,
"learning_rate": 1.1727908482541048e-05,
"loss": 0.2059,
"step": 1370
},
{
"epoch": 1.0199418363107602,
"grad_norm": 2.2901999950408936,
"learning_rate": 1.1599155592218234e-05,
"loss": 0.2207,
"step": 1380
},
{
"epoch": 1.0273277016110418,
"grad_norm": 1.7798693180084229,
"learning_rate": 1.1470130059712607e-05,
"loss": 0.1898,
"step": 1390
},
{
"epoch": 1.0347135669113234,
"grad_norm": 1.9651380777359009,
"learning_rate": 1.1340853882760343e-05,
"loss": 0.1958,
"step": 1400
},
{
"epoch": 1.042099432211605,
"grad_norm": 1.8335607051849365,
"learning_rate": 1.1211349101830323e-05,
"loss": 0.2201,
"step": 1410
},
{
"epoch": 1.0494852975118867,
"grad_norm": 2.270725965499878,
"learning_rate": 1.1081637796366432e-05,
"loss": 0.1881,
"step": 1420
},
{
"epoch": 1.0568711628121683,
"grad_norm": 3.337350368499756,
"learning_rate": 1.0951742081023196e-05,
"loss": 0.2176,
"step": 1430
},
{
"epoch": 1.0642570281124497,
"grad_norm": 3.7382607460021973,
"learning_rate": 1.0821684101895429e-05,
"loss": 0.2043,
"step": 1440
},
{
"epoch": 1.0716428934127313,
"grad_norm": 1.3422726392745972,
"learning_rate": 1.0691486032742522e-05,
"loss": 0.1908,
"step": 1450
},
{
"epoch": 1.079028758713013,
"grad_norm": 3.4625842571258545,
"learning_rate": 1.0561170071207987e-05,
"loss": 0.1747,
"step": 1460
},
{
"epoch": 1.0864146240132946,
"grad_norm": 1.8566938638687134,
"learning_rate": 1.0430758435034985e-05,
"loss": 0.2003,
"step": 1470
},
{
"epoch": 1.0938004893135762,
"grad_norm": 4.041960716247559,
"learning_rate": 1.0300273358278362e-05,
"loss": 0.1716,
"step": 1480
},
{
"epoch": 1.1011863546138578,
"grad_norm": 1.5447806119918823,
"learning_rate": 1.016973708751395e-05,
"loss": 0.1911,
"step": 1490
},
{
"epoch": 1.1085722199141392,
"grad_norm": 1.8091706037521362,
"learning_rate": 1.003917187804572e-05,
"loss": 0.1687,
"step": 1500
},
{
"epoch": 1.1159580852144209,
"grad_norm": 1.5981247425079346,
"learning_rate": 9.908599990111438e-06,
"loss": 0.1706,
"step": 1510
},
{
"epoch": 1.1233439505147025,
"grad_norm": 1.5762553215026855,
"learning_rate": 9.778043685087488e-06,
"loss": 0.1896,
"step": 1520
},
{
"epoch": 1.130729815814984,
"grad_norm": 1.4694616794586182,
"learning_rate": 9.64752522169351e-06,
"loss": 0.1718,
"step": 1530
},
{
"epoch": 1.1381156811152657,
"grad_norm": 1.4669324159622192,
"learning_rate": 9.517066852197469e-06,
"loss": 0.1481,
"step": 1540
},
{
"epoch": 1.1455015464155474,
"grad_norm": 2.1808154582977295,
"learning_rate": 9.386690818621845e-06,
"loss": 0.1878,
"step": 1550
},
{
"epoch": 1.1528874117158288,
"grad_norm": 1.0794235467910767,
"learning_rate": 9.256419348951545e-06,
"loss": 0.1809,
"step": 1560
},
{
"epoch": 1.1602732770161104,
"grad_norm": 1.1634767055511475,
"learning_rate": 9.126274653344249e-06,
"loss": 0.1558,
"step": 1570
},
{
"epoch": 1.167659142316392,
"grad_norm": 3.980741024017334,
"learning_rate": 8.996278920343753e-06,
"loss": 0.1714,
"step": 1580
},
{
"epoch": 1.1750450076166736,
"grad_norm": 1.3018531799316406,
"learning_rate": 8.866454313097011e-06,
"loss": 0.1476,
"step": 1590
},
{
"epoch": 1.1824308729169553,
"grad_norm": 1.6033530235290527,
"learning_rate": 8.736822965575526e-06,
"loss": 0.1702,
"step": 1600
},
{
"epoch": 1.1898167382172367,
"grad_norm": 1.6837263107299805,
"learning_rate": 8.607406978801692e-06,
"loss": 0.1622,
"step": 1610
},
{
"epoch": 1.1972026035175183,
"grad_norm": 4.44855260848999,
"learning_rate": 8.478228417080749e-06,
"loss": 0.2111,
"step": 1620
},
{
"epoch": 1.2045884688178,
"grad_norm": 1.133955478668213,
"learning_rate": 8.349309304239033e-06,
"loss": 0.1407,
"step": 1630
},
{
"epoch": 1.2119743341180815,
"grad_norm": 2.430974006652832,
"learning_rate": 8.22067161986909e-06,
"loss": 0.1502,
"step": 1640
},
{
"epoch": 1.2193601994183632,
"grad_norm": 1.0593976974487305,
"learning_rate": 8.092337295582342e-06,
"loss": 0.1461,
"step": 1650
},
{
"epoch": 1.2267460647186448,
"grad_norm": 1.5466171503067017,
"learning_rate": 7.964328211269949e-06,
"loss": 0.1257,
"step": 1660
},
{
"epoch": 1.2341319300189264,
"grad_norm": 3.7850043773651123,
"learning_rate": 7.83666619137247e-06,
"loss": 0.1237,
"step": 1670
},
{
"epoch": 1.2415177953192078,
"grad_norm": 2.987395763397217,
"learning_rate": 7.709373001158989e-06,
"loss": 0.135,
"step": 1680
},
{
"epoch": 1.2489036606194894,
"grad_norm": 1.1026815176010132,
"learning_rate": 7.582470343016315e-06,
"loss": 0.1339,
"step": 1690
},
{
"epoch": 1.256289525919771,
"grad_norm": 0.8675901293754578,
"learning_rate": 7.455979852748926e-06,
"loss": 0.1187,
"step": 1700
},
{
"epoch": 1.2636753912200527,
"grad_norm": 1.0071134567260742,
"learning_rate": 7.3299230958902455e-06,
"loss": 0.1288,
"step": 1710
},
{
"epoch": 1.2710612565203343,
"grad_norm": 1.257807731628418,
"learning_rate": 7.2043215640259045e-06,
"loss": 0.1219,
"step": 1720
},
{
"epoch": 1.2784471218206157,
"grad_norm": 1.5844953060150146,
"learning_rate": 7.079196671129613e-06,
"loss": 0.1293,
"step": 1730
},
{
"epoch": 1.2858329871208973,
"grad_norm": 1.242968201637268,
"learning_rate": 6.954569749912268e-06,
"loss": 0.1242,
"step": 1740
},
{
"epoch": 1.293218852421179,
"grad_norm": 6.035883903503418,
"learning_rate": 6.8304620481849e-06,
"loss": 0.1324,
"step": 1750
},
{
"epoch": 1.3006047177214606,
"grad_norm": 1.1064496040344238,
"learning_rate": 6.706894725236118e-06,
"loss": 0.113,
"step": 1760
},
{
"epoch": 1.3079905830217422,
"grad_norm": 3.75222110748291,
"learning_rate": 6.583888848224628e-06,
"loss": 0.1402,
"step": 1770
},
{
"epoch": 1.3153764483220236,
"grad_norm": 2.064958333969116,
"learning_rate": 6.4614653885874564e-06,
"loss": 0.1354,
"step": 1780
},
{
"epoch": 1.3227623136223052,
"grad_norm": 1.2012087106704712,
"learning_rate": 6.339645218464521e-06,
"loss": 0.1162,
"step": 1790
},
{
"epoch": 1.3301481789225869,
"grad_norm": 3.533600330352783,
"learning_rate": 6.218449107140093e-06,
"loss": 0.114,
"step": 1800
},
{
"epoch": 1.3375340442228685,
"grad_norm": 1.0663248300552368,
"learning_rate": 6.097897717501829e-06,
"loss": 0.1102,
"step": 1810
},
{
"epoch": 1.34491990952315,
"grad_norm": 2.6653411388397217,
"learning_rate": 5.978011602517908e-06,
"loss": 0.1115,
"step": 1820
},
{
"epoch": 1.3523057748234317,
"grad_norm": 2.8922715187072754,
"learning_rate": 5.858811201732952e-06,
"loss": 0.1168,
"step": 1830
},
{
"epoch": 1.3596916401237134,
"grad_norm": 0.7805532813072205,
"learning_rate": 5.740316837783247e-06,
"loss": 0.0985,
"step": 1840
},
{
"epoch": 1.3670775054239948,
"grad_norm": 1.6969873905181885,
"learning_rate": 5.622548712931907e-06,
"loss": 0.115,
"step": 1850
},
{
"epoch": 1.3744633707242764,
"grad_norm": 1.0871217250823975,
"learning_rate": 5.50552690562457e-06,
"loss": 0.1077,
"step": 1860
},
{
"epoch": 1.381849236024558,
"grad_norm": 1.25892174243927,
"learning_rate": 5.389271367066193e-06,
"loss": 0.0974,
"step": 1870
},
{
"epoch": 1.3892351013248396,
"grad_norm": 0.6338607668876648,
"learning_rate": 5.273801917819552e-06,
"loss": 0.098,
"step": 1880
},
{
"epoch": 1.3966209666251213,
"grad_norm": 0.43911364674568176,
"learning_rate": 5.159138244425996e-06,
"loss": 0.0965,
"step": 1890
},
{
"epoch": 1.4040068319254027,
"grad_norm": 0.7171842455863953,
"learning_rate": 5.045299896049063e-06,
"loss": 0.1043,
"step": 1900
},
{
"epoch": 1.4113926972256843,
"grad_norm": 0.7495408058166504,
"learning_rate": 4.932306281141531e-06,
"loss": 0.1067,
"step": 1910
},
{
"epoch": 1.418778562525966,
"grad_norm": 0.6386808753013611,
"learning_rate": 4.82017666413643e-06,
"loss": 0.095,
"step": 1920
},
{
"epoch": 1.4261644278262475,
"grad_norm": 0.4710920751094818,
"learning_rate": 4.7089301621626285e-06,
"loss": 0.0946,
"step": 1930
},
{
"epoch": 1.4335502931265292,
"grad_norm": 2.0037851333618164,
"learning_rate": 4.598585741785529e-06,
"loss": 0.1343,
"step": 1940
},
{
"epoch": 1.4409361584268106,
"grad_norm": 0.731887936592102,
"learning_rate": 4.489162215773437e-06,
"loss": 0.1021,
"step": 1950
},
{
"epoch": 1.4483220237270924,
"grad_norm": 1.012526035308838,
"learning_rate": 4.380678239890128e-06,
"loss": 0.0986,
"step": 1960
},
{
"epoch": 1.4557078890273738,
"grad_norm": 1.7591279745101929,
"learning_rate": 4.273152309714231e-06,
"loss": 0.0921,
"step": 1970
},
{
"epoch": 1.4630937543276554,
"grad_norm": 0.5881451964378357,
"learning_rate": 4.166602757485865e-06,
"loss": 0.0889,
"step": 1980
},
{
"epoch": 1.470479619627937,
"grad_norm": 0.6772285103797913,
"learning_rate": 4.061047748981171e-06,
"loss": 0.0999,
"step": 1990
},
{
"epoch": 1.4778654849282187,
"grad_norm": 1.0633774995803833,
"learning_rate": 3.9565052804151925e-06,
"loss": 0.0929,
"step": 2000
},
{
"epoch": 1.4852513502285003,
"grad_norm": 0.5887898802757263,
"learning_rate": 3.852993175373679e-06,
"loss": 0.0929,
"step": 2010
},
{
"epoch": 1.4926372155287817,
"grad_norm": 0.9685658812522888,
"learning_rate": 3.7505290817743256e-06,
"loss": 0.0932,
"step": 2020
},
{
"epoch": 1.5000230808290633,
"grad_norm": 3.481058120727539,
"learning_rate": 3.6491304688579376e-06,
"loss": 0.1034,
"step": 2030
},
{
"epoch": 1.507408946129345,
"grad_norm": 1.2913931608200073,
"learning_rate": 3.5488146242101018e-06,
"loss": 0.0914,
"step": 2040
},
{
"epoch": 1.5147948114296266,
"grad_norm": 0.49071353673934937,
"learning_rate": 3.4495986508137847e-06,
"loss": 0.097,
"step": 2050
},
{
"epoch": 1.5221806767299082,
"grad_norm": 0.7845070362091064,
"learning_rate": 3.3514994641334274e-06,
"loss": 0.0895,
"step": 2060
},
{
"epoch": 1.5295665420301896,
"grad_norm": 0.7540778517723083,
"learning_rate": 3.254533789231008e-06,
"loss": 0.094,
"step": 2070
},
{
"epoch": 1.5369524073304714,
"grad_norm": 0.8221713900566101,
"learning_rate": 3.158718157914559e-06,
"loss": 0.0857,
"step": 2080
},
{
"epoch": 1.5443382726307529,
"grad_norm": 0.458886057138443,
"learning_rate": 3.0640689059196328e-06,
"loss": 0.0834,
"step": 2090
},
{
"epoch": 1.5517241379310345,
"grad_norm": 5.687739372253418,
"learning_rate": 2.9706021701242127e-06,
"loss": 0.0944,
"step": 2100
},
{
"epoch": 1.559110003231316,
"grad_norm": 0.609434962272644,
"learning_rate": 2.8783338857975087e-06,
"loss": 0.0926,
"step": 2110
},
{
"epoch": 1.5664958685315975,
"grad_norm": 3.346607208251953,
"learning_rate": 2.787279783883129e-06,
"loss": 0.087,
"step": 2120
},
{
"epoch": 1.5738817338318793,
"grad_norm": 2.047215700149536,
"learning_rate": 2.697455388317094e-06,
"loss": 0.0807,
"step": 2130
},
{
"epoch": 1.5812675991321608,
"grad_norm": 1.0655306577682495,
"learning_rate": 2.6088760133811418e-06,
"loss": 0.0857,
"step": 2140
},
{
"epoch": 1.5886534644324424,
"grad_norm": 1.1660749912261963,
"learning_rate": 2.5215567610917623e-06,
"loss": 0.08,
"step": 2150
},
{
"epoch": 1.596039329732724,
"grad_norm": 0.45875102281570435,
"learning_rate": 2.4355125186254547e-06,
"loss": 0.0931,
"step": 2160
},
{
"epoch": 1.6034251950330056,
"grad_norm": 1.5347977876663208,
"learning_rate": 2.3507579557805803e-06,
"loss": 0.083,
"step": 2170
},
{
"epoch": 1.6108110603332872,
"grad_norm": 1.1268221139907837,
"learning_rate": 2.26730752247629e-06,
"loss": 0.0841,
"step": 2180
},
{
"epoch": 1.6181969256335687,
"grad_norm": 0.4492045044898987,
"learning_rate": 2.1851754462889373e-06,
"loss": 0.0791,
"step": 2190
},
{
"epoch": 1.6255827909338505,
"grad_norm": 0.9329794645309448,
"learning_rate": 2.104375730026406e-06,
"loss": 0.0827,
"step": 2200
},
{
"epoch": 1.632968656234132,
"grad_norm": 0.4460253119468689,
"learning_rate": 2.024922149340748e-06,
"loss": 0.0812,
"step": 2210
},
{
"epoch": 1.6403545215344135,
"grad_norm": 3.0073747634887695,
"learning_rate": 1.9468282503795465e-06,
"loss": 0.0836,
"step": 2220
},
{
"epoch": 1.6477403868346951,
"grad_norm": 0.7037497758865356,
"learning_rate": 1.8701073474764342e-06,
"loss": 0.0757,
"step": 2230
},
{
"epoch": 1.6551262521349765,
"grad_norm": 2.326693058013916,
"learning_rate": 1.7947725208810962e-06,
"loss": 0.0743,
"step": 2240
},
{
"epoch": 1.6625121174352584,
"grad_norm": 0.2990873456001282,
"learning_rate": 1.720836614529211e-06,
"loss": 0.0799,
"step": 2250
},
{
"epoch": 1.6698979827355398,
"grad_norm": 0.4213595688343048,
"learning_rate": 1.648312233852666e-06,
"loss": 0.0802,
"step": 2260
},
{
"epoch": 1.6772838480358214,
"grad_norm": 0.5848265290260315,
"learning_rate": 1.5772117436304446e-06,
"loss": 0.0795,
"step": 2270
},
{
"epoch": 1.684669713336103,
"grad_norm": 0.6411451697349548,
"learning_rate": 1.5075472658805301e-06,
"loss": 0.0739,
"step": 2280
},
{
"epoch": 1.6920555786363847,
"grad_norm": 0.8654035925865173,
"learning_rate": 1.4393306777932192e-06,
"loss": 0.0796,
"step": 2290
},
{
"epoch": 1.6994414439366663,
"grad_norm": 0.7043092250823975,
"learning_rate": 1.3725736097061537e-06,
"loss": 0.0811,
"step": 2300
},
{
"epoch": 1.7068273092369477,
"grad_norm": 1.6693702936172485,
"learning_rate": 1.307287443121452e-06,
"loss": 0.094,
"step": 2310
},
{
"epoch": 1.7142131745372293,
"grad_norm": 0.33761119842529297,
"learning_rate": 1.2434833087652642e-06,
"loss": 0.0759,
"step": 2320
},
{
"epoch": 1.721599039837511,
"grad_norm": 0.9389520883560181,
"learning_rate": 1.181172084690072e-06,
"loss": 0.0727,
"step": 2330
},
{
"epoch": 1.7289849051377926,
"grad_norm": 0.2903837263584137,
"learning_rate": 1.120364394420087e-06,
"loss": 0.0743,
"step": 2340
},
{
"epoch": 1.7363707704380742,
"grad_norm": 0.325009822845459,
"learning_rate": 1.0610706051400165e-06,
"loss": 0.0801,
"step": 2350
},
{
"epoch": 1.7437566357383556,
"grad_norm": 0.9325069785118103,
"learning_rate": 1.0033008259275635e-06,
"loss": 0.0759,
"step": 2360
},
{
"epoch": 1.7511425010386374,
"grad_norm": 1.0802961587905884,
"learning_rate": 9.470649060299041e-07,
"loss": 0.0779,
"step": 2370
},
{
"epoch": 1.7585283663389188,
"grad_norm": 0.4947347939014435,
"learning_rate": 8.923724331844875e-07,
"loss": 0.0786,
"step": 2380
},
{
"epoch": 1.7659142316392005,
"grad_norm": 0.47125598788261414,
"learning_rate": 8.392327319843985e-07,
"loss": 0.0751,
"step": 2390
},
{
"epoch": 1.773300096939482,
"grad_norm": 0.3219301402568817,
"learning_rate": 7.876548622886038e-07,
"loss": 0.0702,
"step": 2400
},
{
"epoch": 1.7806859622397635,
"grad_norm": 0.602854311466217,
"learning_rate": 7.376476176773184e-07,
"loss": 0.0772,
"step": 2410
},
{
"epoch": 1.7880718275400453,
"grad_norm": 0.48326513171195984,
"learning_rate": 6.89219523952781e-07,
"loss": 0.0797,
"step": 2420
},
{
"epoch": 1.7954576928403267,
"grad_norm": 0.5595663189888,
"learning_rate": 6.423788376856765e-07,
"loss": 0.066,
"step": 2430
},
{
"epoch": 1.8028435581406084,
"grad_norm": 1.7976887226104736,
"learning_rate": 5.971335448074611e-07,
"loss": 0.0732,
"step": 2440
},
{
"epoch": 1.81022942344089,
"grad_norm": 1.282763957977295,
"learning_rate": 5.534913592488322e-07,
"loss": 0.0816,
"step": 2450
},
{
"epoch": 1.8176152887411716,
"grad_norm": 0.9589461088180542,
"learning_rate": 5.114597216245698e-07,
"loss": 0.0798,
"step": 2460
},
{
"epoch": 1.8250011540414532,
"grad_norm": 0.43628719449043274,
"learning_rate": 4.7104579796497405e-07,
"loss": 0.0835,
"step": 2470
},
{
"epoch": 1.8323870193417346,
"grad_norm": 0.49431607127189636,
"learning_rate": 4.3225647849411854e-07,
"loss": 0.074,
"step": 2480
},
{
"epoch": 1.8397728846420165,
"grad_norm": 0.9135465025901794,
"learning_rate": 3.9509837645513306e-07,
"loss": 0.0736,
"step": 2490
},
{
"epoch": 1.847158749942298,
"grad_norm": 0.6499918103218079,
"learning_rate": 3.595778269826966e-07,
"loss": 0.0723,
"step": 2500
},
{
"epoch": 1.8545446152425795,
"grad_norm": 1.299659013748169,
"learning_rate": 3.257008860229527e-07,
"loss": 0.0735,
"step": 2510
},
{
"epoch": 1.8619304805428611,
"grad_norm": 0.7049327492713928,
"learning_rate": 2.9347332930102503e-07,
"loss": 0.0713,
"step": 2520
},
{
"epoch": 1.8693163458431425,
"grad_norm": 0.29024580121040344,
"learning_rate": 2.6290065133630637e-07,
"loss": 0.0774,
"step": 2530
},
{
"epoch": 1.8767022111434244,
"grad_norm": 0.7386340498924255,
"learning_rate": 2.3398806450568577e-07,
"loss": 0.0739,
"step": 2540
},
{
"epoch": 1.8840880764437058,
"grad_norm": 0.5153611898422241,
"learning_rate": 2.067404981548915e-07,
"loss": 0.0702,
"step": 2550
},
{
"epoch": 1.8914739417439874,
"grad_norm": 1.2201671600341797,
"learning_rate": 1.811625977580722e-07,
"loss": 0.082,
"step": 2560
},
{
"epoch": 1.898859807044269,
"grad_norm": 0.7881399989128113,
"learning_rate": 1.5725872412579058e-07,
"loss": 0.0677,
"step": 2570
},
{
"epoch": 1.9062456723445507,
"grad_norm": 0.3312283456325531,
"learning_rate": 1.3503295266153903e-07,
"loss": 0.0756,
"step": 2580
},
{
"epoch": 1.9136315376448323,
"grad_norm": 0.4955926239490509,
"learning_rate": 1.14489072666919e-07,
"loss": 0.0692,
"step": 2590
},
{
"epoch": 1.9210174029451137,
"grad_norm": 0.45805656909942627,
"learning_rate": 9.563058669559755e-08,
"loss": 0.0753,
"step": 2600
},
{
"epoch": 1.9284032682453955,
"grad_norm": 0.5555469393730164,
"learning_rate": 7.846070995615518e-08,
"loss": 0.0716,
"step": 2610
},
{
"epoch": 1.935789133545677,
"grad_norm": 0.5252045392990112,
"learning_rate": 6.298236976391537e-08,
"loss": 0.0772,
"step": 2620
},
{
"epoch": 1.9431749988459586,
"grad_norm": 1.8346993923187256,
"learning_rate": 4.919820504186934e-08,
"loss": 0.0764,
"step": 2630
},
{
"epoch": 1.9505608641462402,
"grad_norm": 0.4004700481891632,
"learning_rate": 3.711056587075712e-08,
"loss": 0.0739,
"step": 2640
},
{
"epoch": 1.9579467294465216,
"grad_norm": 1.077645182609558,
"learning_rate": 2.672151308840243e-08,
"loss": 0.07,
"step": 2650
},
{
"epoch": 1.9653325947468034,
"grad_norm": 0.6247801184654236,
"learning_rate": 1.8032817938352653e-08,
"loss": 0.0666,
"step": 2660
},
{
"epoch": 1.9727184600470848,
"grad_norm": 0.4016879200935364,
"learning_rate": 1.1045961767904844e-08,
"loss": 0.0695,
"step": 2670
},
{
"epoch": 1.9801043253473665,
"grad_norm": 0.5175566673278809,
"learning_rate": 5.7621357755432984e-09,
"loss": 0.0722,
"step": 2680
},
{
"epoch": 1.987490190647648,
"grad_norm": 0.5656958222389221,
"learning_rate": 2.1822408078508994e-09,
"loss": 0.0728,
"step": 2690
},
{
"epoch": 1.9948760559479295,
"grad_norm": 0.5182742476463318,
"learning_rate": 3.068872059253103e-10,
"loss": 0.0727,
"step": 2700
}
],
"logging_steps": 10,
"max_steps": 2706,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.22919470739456e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}