{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.999818820886328,
"eval_steps": 1000,
"global_step": 34495,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007247164546871037,
"grad_norm": 3.8405699729919434,
"learning_rate": 4.9928975213799106e-05,
"loss": 1.4877,
"step": 50
},
{
"epoch": 0.014494329093742073,
"grad_norm": 4.489353179931641,
"learning_rate": 4.9856500942165534e-05,
"loss": 1.1021,
"step": 100
},
{
"epoch": 0.021741493640613112,
"grad_norm": 5.1422295570373535,
"learning_rate": 4.978402667053196e-05,
"loss": 1.0659,
"step": 150
},
{
"epoch": 0.028988658187484147,
"grad_norm": 4.703744411468506,
"learning_rate": 4.971155239889839e-05,
"loss": 1.0636,
"step": 200
},
{
"epoch": 0.03623582273435518,
"grad_norm": 3.330397605895996,
"learning_rate": 4.963907812726482e-05,
"loss": 1.0025,
"step": 250
},
{
"epoch": 0.043482987281226224,
"grad_norm": 3.511565685272217,
"learning_rate": 4.9566603855631256e-05,
"loss": 0.9945,
"step": 300
},
{
"epoch": 0.05073015182809726,
"grad_norm": 3.470792055130005,
"learning_rate": 4.9494129583997684e-05,
"loss": 1.0228,
"step": 350
},
{
"epoch": 0.057977316374968293,
"grad_norm": 3.1396453380584717,
"learning_rate": 4.942165531236411e-05,
"loss": 0.9935,
"step": 400
},
{
"epoch": 0.06522448092183933,
"grad_norm": 3.4228012561798096,
"learning_rate": 4.934918104073054e-05,
"loss": 0.9904,
"step": 450
},
{
"epoch": 0.07247164546871036,
"grad_norm": 3.423574447631836,
"learning_rate": 4.927670676909698e-05,
"loss": 0.9884,
"step": 500
},
{
"epoch": 0.0797188100155814,
"grad_norm": 2.9020352363586426,
"learning_rate": 4.9204232497463405e-05,
"loss": 0.9093,
"step": 550
},
{
"epoch": 0.08696597456245245,
"grad_norm": 3.0173707008361816,
"learning_rate": 4.9131758225829834e-05,
"loss": 0.9374,
"step": 600
},
{
"epoch": 0.09421313910932348,
"grad_norm": 3.4516496658325195,
"learning_rate": 4.905928395419626e-05,
"loss": 0.9798,
"step": 650
},
{
"epoch": 0.10146030365619452,
"grad_norm": 2.8202695846557617,
"learning_rate": 4.898680968256269e-05,
"loss": 0.9293,
"step": 700
},
{
"epoch": 0.10870746820306555,
"grad_norm": 3.3415849208831787,
"learning_rate": 4.8914335410929126e-05,
"loss": 0.9622,
"step": 750
},
{
"epoch": 0.11595463274993659,
"grad_norm": 3.4417173862457275,
"learning_rate": 4.8841861139295555e-05,
"loss": 0.9363,
"step": 800
},
{
"epoch": 0.12320179729680762,
"grad_norm": 3.051495313644409,
"learning_rate": 4.8769386867661983e-05,
"loss": 0.9086,
"step": 850
},
{
"epoch": 0.13044896184367866,
"grad_norm": 2.5990054607391357,
"learning_rate": 4.869691259602841e-05,
"loss": 0.8976,
"step": 900
},
{
"epoch": 0.1376961263905497,
"grad_norm": 3.3123867511749268,
"learning_rate": 4.862443832439485e-05,
"loss": 0.9568,
"step": 950
},
{
"epoch": 0.14494329093742073,
"grad_norm": 2.7175092697143555,
"learning_rate": 4.8551964052761276e-05,
"loss": 0.8665,
"step": 1000
},
{
"epoch": 0.14494329093742073,
"eval_loss": 0.855056643486023,
"eval_runtime": 245.3157,
"eval_samples_per_second": 140.615,
"eval_steps_per_second": 14.064,
"step": 1000
},
{
"epoch": 0.15219045548429178,
"grad_norm": 2.541044235229492,
"learning_rate": 4.848093926656037e-05,
"loss": 0.887,
"step": 1050
},
{
"epoch": 0.1594376200311628,
"grad_norm": 3.0265707969665527,
"learning_rate": 4.840846499492681e-05,
"loss": 0.8987,
"step": 1100
},
{
"epoch": 0.16668478457803385,
"grad_norm": 3.479360818862915,
"learning_rate": 4.8335990723293237e-05,
"loss": 0.8744,
"step": 1150
},
{
"epoch": 0.1739319491249049,
"grad_norm": 3.381410598754883,
"learning_rate": 4.8263516451659665e-05,
"loss": 0.869,
"step": 1200
},
{
"epoch": 0.18117911367177592,
"grad_norm": 3.526550769805908,
"learning_rate": 4.8191042180026094e-05,
"loss": 0.8753,
"step": 1250
},
{
"epoch": 0.18842627821864696,
"grad_norm": 3.5922884941101074,
"learning_rate": 4.811856790839252e-05,
"loss": 0.8661,
"step": 1300
},
{
"epoch": 0.19567344276551799,
"grad_norm": 3.390925168991089,
"learning_rate": 4.804609363675896e-05,
"loss": 0.865,
"step": 1350
},
{
"epoch": 0.20292060731238903,
"grad_norm": 3.6313884258270264,
"learning_rate": 4.7973619365125386e-05,
"loss": 0.9077,
"step": 1400
},
{
"epoch": 0.21016777185926006,
"grad_norm": 3.6675050258636475,
"learning_rate": 4.7901145093491815e-05,
"loss": 0.8724,
"step": 1450
},
{
"epoch": 0.2174149364061311,
"grad_norm": 2.9119815826416016,
"learning_rate": 4.782867082185824e-05,
"loss": 0.887,
"step": 1500
},
{
"epoch": 0.22466210095300213,
"grad_norm": 3.978513479232788,
"learning_rate": 4.775619655022467e-05,
"loss": 0.909,
"step": 1550
},
{
"epoch": 0.23190926549987317,
"grad_norm": 2.6962685585021973,
"learning_rate": 4.76837222785911e-05,
"loss": 0.9013,
"step": 1600
},
{
"epoch": 0.23915643004674422,
"grad_norm": 3.688558578491211,
"learning_rate": 4.761124800695753e-05,
"loss": 0.8777,
"step": 1650
},
{
"epoch": 0.24640359459361524,
"grad_norm": 2.6610682010650635,
"learning_rate": 4.753877373532396e-05,
"loss": 0.854,
"step": 1700
},
{
"epoch": 0.2536507591404863,
"grad_norm": 2.8136966228485107,
"learning_rate": 4.746629946369039e-05,
"loss": 0.8523,
"step": 1750
},
{
"epoch": 0.2608979236873573,
"grad_norm": 2.660381555557251,
"learning_rate": 4.739382519205682e-05,
"loss": 0.8764,
"step": 1800
},
{
"epoch": 0.26814508823422833,
"grad_norm": 2.8120713233947754,
"learning_rate": 4.732135092042325e-05,
"loss": 0.8401,
"step": 1850
},
{
"epoch": 0.2753922527810994,
"grad_norm": 3.686016321182251,
"learning_rate": 4.724887664878968e-05,
"loss": 0.8891,
"step": 1900
},
{
"epoch": 0.28263941732797043,
"grad_norm": 3.8460748195648193,
"learning_rate": 4.717640237715611e-05,
"loss": 0.8909,
"step": 1950
},
{
"epoch": 0.28988658187484145,
"grad_norm": 3.1707077026367188,
"learning_rate": 4.710392810552254e-05,
"loss": 0.8564,
"step": 2000
},
{
"epoch": 0.28988658187484145,
"eval_loss": 0.8084499835968018,
"eval_runtime": 245.4746,
"eval_samples_per_second": 140.524,
"eval_steps_per_second": 14.054,
"step": 2000
},
{
"epoch": 0.29713374642171253,
"grad_norm": 2.892038106918335,
"learning_rate": 4.703145383388897e-05,
"loss": 0.8496,
"step": 2050
},
{
"epoch": 0.30438091096858355,
"grad_norm": 3.043569803237915,
"learning_rate": 4.69589795622554e-05,
"loss": 0.9013,
"step": 2100
},
{
"epoch": 0.3116280755154546,
"grad_norm": 2.6177148818969727,
"learning_rate": 4.688650529062183e-05,
"loss": 0.8438,
"step": 2150
},
{
"epoch": 0.3188752400623256,
"grad_norm": 3.0475521087646484,
"learning_rate": 4.6814031018988264e-05,
"loss": 0.8892,
"step": 2200
},
{
"epoch": 0.32612240460919667,
"grad_norm": 3.0972824096679688,
"learning_rate": 4.674155674735469e-05,
"loss": 0.8019,
"step": 2250
},
{
"epoch": 0.3333695691560677,
"grad_norm": 3.916921854019165,
"learning_rate": 4.666908247572112e-05,
"loss": 0.8278,
"step": 2300
},
{
"epoch": 0.3406167337029387,
"grad_norm": 2.8206372261047363,
"learning_rate": 4.659660820408755e-05,
"loss": 0.8532,
"step": 2350
},
{
"epoch": 0.3478638982498098,
"grad_norm": 3.05037260055542,
"learning_rate": 4.6524133932453985e-05,
"loss": 0.8516,
"step": 2400
},
{
"epoch": 0.3551110627966808,
"grad_norm": 2.940920829772949,
"learning_rate": 4.645165966082041e-05,
"loss": 0.8326,
"step": 2450
},
{
"epoch": 0.36235822734355183,
"grad_norm": 2.752655267715454,
"learning_rate": 4.637918538918684e-05,
"loss": 0.8705,
"step": 2500
},
{
"epoch": 0.36960539189042285,
"grad_norm": 2.369166851043701,
"learning_rate": 4.630671111755327e-05,
"loss": 0.8111,
"step": 2550
},
{
"epoch": 0.37685255643729393,
"grad_norm": 2.991446018218994,
"learning_rate": 4.62342368459197e-05,
"loss": 0.8379,
"step": 2600
},
{
"epoch": 0.38409972098416495,
"grad_norm": 2.6964468955993652,
"learning_rate": 4.6161762574286134e-05,
"loss": 0.8459,
"step": 2650
},
{
"epoch": 0.39134688553103597,
"grad_norm": 2.8385136127471924,
"learning_rate": 4.608928830265256e-05,
"loss": 0.8301,
"step": 2700
},
{
"epoch": 0.398594050077907,
"grad_norm": 2.3763439655303955,
"learning_rate": 4.601681403101899e-05,
"loss": 0.7899,
"step": 2750
},
{
"epoch": 0.40584121462477807,
"grad_norm": 3.157857894897461,
"learning_rate": 4.594433975938542e-05,
"loss": 0.8118,
"step": 2800
},
{
"epoch": 0.4130883791716491,
"grad_norm": 3.2475242614746094,
"learning_rate": 4.5871865487751855e-05,
"loss": 0.8096,
"step": 2850
},
{
"epoch": 0.4203355437185201,
"grad_norm": 2.474079132080078,
"learning_rate": 4.5799391216118284e-05,
"loss": 0.8164,
"step": 2900
},
{
"epoch": 0.4275827082653912,
"grad_norm": 2.8297600746154785,
"learning_rate": 4.572691694448471e-05,
"loss": 0.7803,
"step": 2950
},
{
"epoch": 0.4348298728122622,
"grad_norm": 3.616157054901123,
"learning_rate": 4.565444267285114e-05,
"loss": 0.7867,
"step": 3000
},
{
"epoch": 0.4348298728122622,
"eval_loss": 0.7722809910774231,
"eval_runtime": 245.4414,
"eval_samples_per_second": 140.543,
"eval_steps_per_second": 14.056,
"step": 3000
},
{
"epoch": 0.44207703735913323,
"grad_norm": 2.828176736831665,
"learning_rate": 4.558196840121757e-05,
"loss": 0.8496,
"step": 3050
},
{
"epoch": 0.44932420190600425,
"grad_norm": 3.1643614768981934,
"learning_rate": 4.5509494129584e-05,
"loss": 0.7921,
"step": 3100
},
{
"epoch": 0.4565713664528753,
"grad_norm": 3.158106803894043,
"learning_rate": 4.543701985795043e-05,
"loss": 0.8128,
"step": 3150
},
{
"epoch": 0.46381853099974635,
"grad_norm": 3.462926149368286,
"learning_rate": 4.536454558631686e-05,
"loss": 0.7772,
"step": 3200
},
{
"epoch": 0.47106569554661737,
"grad_norm": 2.819894552230835,
"learning_rate": 4.529207131468329e-05,
"loss": 0.7911,
"step": 3250
},
{
"epoch": 0.47831286009348845,
"grad_norm": 2.3228747844696045,
"learning_rate": 4.521959704304972e-05,
"loss": 0.7974,
"step": 3300
},
{
"epoch": 0.48556002464035947,
"grad_norm": 2.4810431003570557,
"learning_rate": 4.514712277141615e-05,
"loss": 0.8064,
"step": 3350
},
{
"epoch": 0.4928071891872305,
"grad_norm": 2.91050124168396,
"learning_rate": 4.5074648499782577e-05,
"loss": 0.8269,
"step": 3400
},
{
"epoch": 0.5000543537341016,
"grad_norm": 2.8636369705200195,
"learning_rate": 4.5002174228149005e-05,
"loss": 0.8139,
"step": 3450
},
{
"epoch": 0.5073015182809726,
"grad_norm": 2.4558308124542236,
"learning_rate": 4.4929699956515434e-05,
"loss": 0.7364,
"step": 3500
},
{
"epoch": 0.5145486828278436,
"grad_norm": 2.5896005630493164,
"learning_rate": 4.485722568488187e-05,
"loss": 0.7858,
"step": 3550
},
{
"epoch": 0.5217958473747146,
"grad_norm": 2.745985746383667,
"learning_rate": 4.47847514132483e-05,
"loss": 0.8279,
"step": 3600
},
{
"epoch": 0.5290430119215856,
"grad_norm": 3.3217363357543945,
"learning_rate": 4.4712277141614726e-05,
"loss": 0.7929,
"step": 3650
},
{
"epoch": 0.5362901764684567,
"grad_norm": 2.981140375137329,
"learning_rate": 4.4639802869981155e-05,
"loss": 0.8282,
"step": 3700
},
{
"epoch": 0.5435373410153278,
"grad_norm": 3.112213611602783,
"learning_rate": 4.456732859834759e-05,
"loss": 0.7823,
"step": 3750
},
{
"epoch": 0.5507845055621988,
"grad_norm": 2.5669238567352295,
"learning_rate": 4.449485432671402e-05,
"loss": 0.8118,
"step": 3800
},
{
"epoch": 0.5580316701090698,
"grad_norm": 2.7261078357696533,
"learning_rate": 4.442238005508045e-05,
"loss": 0.8098,
"step": 3850
},
{
"epoch": 0.5652788346559409,
"grad_norm": 2.5646941661834717,
"learning_rate": 4.4349905783446876e-05,
"loss": 0.8,
"step": 3900
},
{
"epoch": 0.5725259992028119,
"grad_norm": 2.627681016921997,
"learning_rate": 4.427743151181331e-05,
"loss": 0.7732,
"step": 3950
},
{
"epoch": 0.5797731637496829,
"grad_norm": 2.443345069885254,
"learning_rate": 4.420495724017974e-05,
"loss": 0.7949,
"step": 4000
},
{
"epoch": 0.5797731637496829,
"eval_loss": 0.7400076389312744,
"eval_runtime": 245.51,
"eval_samples_per_second": 140.503,
"eval_steps_per_second": 14.052,
"step": 4000
},
{
"epoch": 0.5870203282965539,
"grad_norm": 2.263333559036255,
"learning_rate": 4.413248296854617e-05,
"loss": 0.7523,
"step": 4050
},
{
"epoch": 0.5942674928434251,
"grad_norm": 2.655808210372925,
"learning_rate": 4.40600086969126e-05,
"loss": 0.7489,
"step": 4100
},
{
"epoch": 0.6015146573902961,
"grad_norm": 3.2610058784484863,
"learning_rate": 4.3987534425279025e-05,
"loss": 0.7853,
"step": 4150
},
{
"epoch": 0.6087618219371671,
"grad_norm": 2.82106876373291,
"learning_rate": 4.391506015364546e-05,
"loss": 0.7688,
"step": 4200
},
{
"epoch": 0.6160089864840381,
"grad_norm": 1.9377267360687256,
"learning_rate": 4.384258588201189e-05,
"loss": 0.7887,
"step": 4250
},
{
"epoch": 0.6232561510309091,
"grad_norm": 2.4087536334991455,
"learning_rate": 4.377011161037832e-05,
"loss": 0.8098,
"step": 4300
},
{
"epoch": 0.6305033155777802,
"grad_norm": 5.600491046905518,
"learning_rate": 4.3697637338744747e-05,
"loss": 0.7683,
"step": 4350
},
{
"epoch": 0.6377504801246512,
"grad_norm": 3.0500433444976807,
"learning_rate": 4.362516306711118e-05,
"loss": 0.7995,
"step": 4400
},
{
"epoch": 0.6449976446715223,
"grad_norm": 1.7417362928390503,
"learning_rate": 4.355268879547761e-05,
"loss": 0.7822,
"step": 4450
},
{
"epoch": 0.6522448092183933,
"grad_norm": 2.1057236194610596,
"learning_rate": 4.348021452384404e-05,
"loss": 0.7812,
"step": 4500
},
{
"epoch": 0.6594919737652644,
"grad_norm": 2.780358076095581,
"learning_rate": 4.341063922307581e-05,
"loss": 0.849,
"step": 4550
},
{
"epoch": 0.6667391383121354,
"grad_norm": 2.8940329551696777,
"learning_rate": 4.333816495144224e-05,
"loss": 0.7883,
"step": 4600
},
{
"epoch": 0.6739863028590064,
"grad_norm": 2.65120530128479,
"learning_rate": 4.326569067980867e-05,
"loss": 0.8011,
"step": 4650
},
{
"epoch": 0.6812334674058774,
"grad_norm": 3.0340774059295654,
"learning_rate": 4.31932164081751e-05,
"loss": 0.7784,
"step": 4700
},
{
"epoch": 0.6884806319527484,
"grad_norm": 3.227555990219116,
"learning_rate": 4.312074213654153e-05,
"loss": 0.7798,
"step": 4750
},
{
"epoch": 0.6957277964996196,
"grad_norm": 2.9701457023620605,
"learning_rate": 4.304826786490796e-05,
"loss": 0.7623,
"step": 4800
},
{
"epoch": 0.7029749610464906,
"grad_norm": 3.183608293533325,
"learning_rate": 4.297579359327439e-05,
"loss": 0.7591,
"step": 4850
},
{
"epoch": 0.7102221255933616,
"grad_norm": 3.034790277481079,
"learning_rate": 4.290331932164082e-05,
"loss": 0.75,
"step": 4900
},
{
"epoch": 0.7174692901402326,
"grad_norm": 2.8872621059417725,
"learning_rate": 4.283084505000725e-05,
"loss": 0.7672,
"step": 4950
},
{
"epoch": 0.7247164546871037,
"grad_norm": 2.583475112915039,
"learning_rate": 4.275837077837368e-05,
"loss": 0.7396,
"step": 5000
},
{
"epoch": 0.7247164546871037,
"eval_loss": 0.7259831428527832,
"eval_runtime": 245.9324,
"eval_samples_per_second": 140.262,
"eval_steps_per_second": 14.028,
"step": 5000
},
{
"epoch": 0.7319636192339747,
"grad_norm": 2.8117451667785645,
"learning_rate": 4.268589650674011e-05,
"loss": 0.7471,
"step": 5050
},
{
"epoch": 0.7392107837808457,
"grad_norm": 2.556171178817749,
"learning_rate": 4.261342223510654e-05,
"loss": 0.734,
"step": 5100
},
{
"epoch": 0.7464579483277168,
"grad_norm": 2.494264602661133,
"learning_rate": 4.2540947963472974e-05,
"loss": 0.7657,
"step": 5150
},
{
"epoch": 0.7537051128745879,
"grad_norm": 2.5196897983551025,
"learning_rate": 4.24684736918394e-05,
"loss": 0.7373,
"step": 5200
},
{
"epoch": 0.7609522774214589,
"grad_norm": 1.9920554161071777,
"learning_rate": 4.239599942020583e-05,
"loss": 0.7603,
"step": 5250
},
{
"epoch": 0.7681994419683299,
"grad_norm": 3.2272536754608154,
"learning_rate": 4.232352514857226e-05,
"loss": 0.798,
"step": 5300
},
{
"epoch": 0.7754466065152009,
"grad_norm": 2.447430372238159,
"learning_rate": 4.225105087693869e-05,
"loss": 0.7918,
"step": 5350
},
{
"epoch": 0.7826937710620719,
"grad_norm": 2.375603199005127,
"learning_rate": 4.217857660530512e-05,
"loss": 0.7578,
"step": 5400
},
{
"epoch": 0.789940935608943,
"grad_norm": 3.0463624000549316,
"learning_rate": 4.210610233367155e-05,
"loss": 0.749,
"step": 5450
},
{
"epoch": 0.797188100155814,
"grad_norm": 1.8498198986053467,
"learning_rate": 4.203362806203798e-05,
"loss": 0.7619,
"step": 5500
},
{
"epoch": 0.8044352647026851,
"grad_norm": 3.3921091556549072,
"learning_rate": 4.196115379040441e-05,
"loss": 0.7655,
"step": 5550
},
{
"epoch": 0.8116824292495561,
"grad_norm": 3.1778969764709473,
"learning_rate": 4.188867951877084e-05,
"loss": 0.7119,
"step": 5600
},
{
"epoch": 0.8189295937964272,
"grad_norm": 2.292695999145508,
"learning_rate": 4.1816205247137266e-05,
"loss": 0.7378,
"step": 5650
},
{
"epoch": 0.8261767583432982,
"grad_norm": 2.388732671737671,
"learning_rate": 4.1743730975503695e-05,
"loss": 0.7376,
"step": 5700
},
{
"epoch": 0.8334239228901692,
"grad_norm": 2.4461729526519775,
"learning_rate": 4.1671256703870124e-05,
"loss": 0.7656,
"step": 5750
},
{
"epoch": 0.8406710874370402,
"grad_norm": 3.55210280418396,
"learning_rate": 4.159878243223656e-05,
"loss": 0.7814,
"step": 5800
},
{
"epoch": 0.8479182519839112,
"grad_norm": 2.640709400177002,
"learning_rate": 4.152630816060299e-05,
"loss": 0.7541,
"step": 5850
},
{
"epoch": 0.8551654165307824,
"grad_norm": 2.837186336517334,
"learning_rate": 4.1453833888969416e-05,
"loss": 0.7492,
"step": 5900
},
{
"epoch": 0.8624125810776534,
"grad_norm": 3.748387336730957,
"learning_rate": 4.1381359617335845e-05,
"loss": 0.7317,
"step": 5950
},
{
"epoch": 0.8696597456245244,
"grad_norm": 2.7219724655151367,
"learning_rate": 4.130888534570227e-05,
"loss": 0.7546,
"step": 6000
},
{
"epoch": 0.8696597456245244,
"eval_loss": 0.7048457860946655,
"eval_runtime": 245.6596,
"eval_samples_per_second": 140.418,
"eval_steps_per_second": 14.044,
"step": 6000
},
{
"epoch": 0.8769069101713954,
"grad_norm": 3.017869234085083,
"learning_rate": 4.123641107406871e-05,
"loss": 0.7354,
"step": 6050
},
{
"epoch": 0.8841540747182665,
"grad_norm": 2.677828550338745,
"learning_rate": 4.116393680243514e-05,
"loss": 0.746,
"step": 6100
},
{
"epoch": 0.8914012392651375,
"grad_norm": 2.0534462928771973,
"learning_rate": 4.1091462530801566e-05,
"loss": 0.7614,
"step": 6150
},
{
"epoch": 0.8986484038120085,
"grad_norm": 3.0294041633605957,
"learning_rate": 4.1018988259167994e-05,
"loss": 0.7256,
"step": 6200
},
{
"epoch": 0.9058955683588796,
"grad_norm": 2.640902042388916,
"learning_rate": 4.094651398753443e-05,
"loss": 0.7276,
"step": 6250
},
{
"epoch": 0.9131427329057507,
"grad_norm": 2.773237466812134,
"learning_rate": 4.087403971590086e-05,
"loss": 0.7404,
"step": 6300
},
{
"epoch": 0.9203898974526217,
"grad_norm": 2.8102800846099854,
"learning_rate": 4.080156544426729e-05,
"loss": 0.7631,
"step": 6350
},
{
"epoch": 0.9276370619994927,
"grad_norm": 2.767054796218872,
"learning_rate": 4.0729091172633715e-05,
"loss": 0.764,
"step": 6400
},
{
"epoch": 0.9348842265463637,
"grad_norm": 2.9610891342163086,
"learning_rate": 4.065661690100015e-05,
"loss": 0.7536,
"step": 6450
},
{
"epoch": 0.9421313910932347,
"grad_norm": 3.0401785373687744,
"learning_rate": 4.058414262936658e-05,
"loss": 0.7824,
"step": 6500
},
{
"epoch": 0.9493785556401058,
"grad_norm": 2.3925185203552246,
"learning_rate": 4.051166835773301e-05,
"loss": 0.7277,
"step": 6550
},
{
"epoch": 0.9566257201869769,
"grad_norm": 2.707064390182495,
"learning_rate": 4.0439194086099436e-05,
"loss": 0.7133,
"step": 6600
},
{
"epoch": 0.9638728847338479,
"grad_norm": 2.639535665512085,
"learning_rate": 4.0366719814465865e-05,
"loss": 0.7198,
"step": 6650
},
{
"epoch": 0.9711200492807189,
"grad_norm": 2.1093649864196777,
"learning_rate": 4.02942455428323e-05,
"loss": 0.7417,
"step": 6700
},
{
"epoch": 0.97836721382759,
"grad_norm": 4.515178203582764,
"learning_rate": 4.022177127119873e-05,
"loss": 0.7299,
"step": 6750
},
{
"epoch": 0.985614378374461,
"grad_norm": 2.626970052719116,
"learning_rate": 4.014929699956516e-05,
"loss": 0.741,
"step": 6800
},
{
"epoch": 0.992861542921332,
"grad_norm": 3.5896389484405518,
"learning_rate": 4.0076822727931586e-05,
"loss": 0.7568,
"step": 6850
},
{
"epoch": 1.0001087074682031,
"grad_norm": 5.62574577331543,
"learning_rate": 4.000434845629802e-05,
"loss": 0.7374,
"step": 6900
},
{
"epoch": 1.0073558720150741,
"grad_norm": 3.0508015155792236,
"learning_rate": 3.993187418466445e-05,
"loss": 0.5983,
"step": 6950
},
{
"epoch": 1.0146030365619452,
"grad_norm": 1.7448738813400269,
"learning_rate": 3.985939991303088e-05,
"loss": 0.5915,
"step": 7000
},
{
"epoch": 1.0146030365619452,
"eval_loss": 0.696858823299408,
"eval_runtime": 245.4484,
"eval_samples_per_second": 140.539,
"eval_steps_per_second": 14.056,
"step": 7000
},
{
"epoch": 1.0218502011088162,
"grad_norm": 3.0310137271881104,
"learning_rate": 3.978692564139731e-05,
"loss": 0.5752,
"step": 7050
},
{
"epoch": 1.0290973656556872,
"grad_norm": 2.422286033630371,
"learning_rate": 3.9714451369763736e-05,
"loss": 0.6085,
"step": 7100
},
{
"epoch": 1.0363445302025582,
"grad_norm": 2.385861396789551,
"learning_rate": 3.9641977098130164e-05,
"loss": 0.5922,
"step": 7150
},
{
"epoch": 1.0435916947494293,
"grad_norm": 2.8933980464935303,
"learning_rate": 3.956950282649659e-05,
"loss": 0.6146,
"step": 7200
},
{
"epoch": 1.0508388592963003,
"grad_norm": 2.2987074851989746,
"learning_rate": 3.949702855486303e-05,
"loss": 0.6061,
"step": 7250
},
{
"epoch": 1.0580860238431713,
"grad_norm": 2.174790859222412,
"learning_rate": 3.942455428322946e-05,
"loss": 0.6033,
"step": 7300
},
{
"epoch": 1.0653331883900423,
"grad_norm": 2.604837417602539,
"learning_rate": 3.9352080011595885e-05,
"loss": 0.6027,
"step": 7350
},
{
"epoch": 1.0725803529369133,
"grad_norm": 3.116417646408081,
"learning_rate": 3.9279605739962314e-05,
"loss": 0.5783,
"step": 7400
},
{
"epoch": 1.0798275174837846,
"grad_norm": 2.4891812801361084,
"learning_rate": 3.920713146832874e-05,
"loss": 0.6108,
"step": 7450
},
{
"epoch": 1.0870746820306556,
"grad_norm": 2.293124198913574,
"learning_rate": 3.913465719669517e-05,
"loss": 0.5853,
"step": 7500
},
{
"epoch": 1.0943218465775266,
"grad_norm": 2.6642067432403564,
"learning_rate": 3.90621829250616e-05,
"loss": 0.569,
"step": 7550
},
{
"epoch": 1.1015690111243976,
"grad_norm": 2.5504302978515625,
"learning_rate": 3.8989708653428035e-05,
"loss": 0.6019,
"step": 7600
},
{
"epoch": 1.1088161756712687,
"grad_norm": 2.672874927520752,
"learning_rate": 3.8917234381794463e-05,
"loss": 0.5799,
"step": 7650
},
{
"epoch": 1.1160633402181397,
"grad_norm": 2.6319384574890137,
"learning_rate": 3.884476011016089e-05,
"loss": 0.6357,
"step": 7700
},
{
"epoch": 1.1233105047650107,
"grad_norm": 3.3530545234680176,
"learning_rate": 3.877228583852732e-05,
"loss": 0.5737,
"step": 7750
},
{
"epoch": 1.1305576693118817,
"grad_norm": 2.6204335689544678,
"learning_rate": 3.8699811566893756e-05,
"loss": 0.6139,
"step": 7800
},
{
"epoch": 1.1378048338587528,
"grad_norm": 2.570143938064575,
"learning_rate": 3.8627337295260185e-05,
"loss": 0.6038,
"step": 7850
},
{
"epoch": 1.1450519984056238,
"grad_norm": 2.826798439025879,
"learning_rate": 3.855486302362661e-05,
"loss": 0.5985,
"step": 7900
},
{
"epoch": 1.1522991629524948,
"grad_norm": 1.774380087852478,
"learning_rate": 3.848238875199304e-05,
"loss": 0.6042,
"step": 7950
},
{
"epoch": 1.1595463274993658,
"grad_norm": 2.9775238037109375,
"learning_rate": 3.840991448035948e-05,
"loss": 0.5887,
"step": 8000
},
{
"epoch": 1.1595463274993658,
"eval_loss": 0.69095379114151,
"eval_runtime": 245.6192,
"eval_samples_per_second": 140.441,
"eval_steps_per_second": 14.046,
"step": 8000
},
{
"epoch": 1.1667934920462368,
"grad_norm": 2.4235680103302,
"learning_rate": 3.8337440208725906e-05,
"loss": 0.6397,
"step": 8050
},
{
"epoch": 1.1740406565931079,
"grad_norm": 2.372230291366577,
"learning_rate": 3.8264965937092334e-05,
"loss": 0.6083,
"step": 8100
},
{
"epoch": 1.1812878211399789,
"grad_norm": 2.4329166412353516,
"learning_rate": 3.819249166545876e-05,
"loss": 0.5956,
"step": 8150
},
{
"epoch": 1.1885349856868501,
"grad_norm": 3.0749635696411133,
"learning_rate": 3.812001739382519e-05,
"loss": 0.6162,
"step": 8200
},
{
"epoch": 1.1957821502337211,
"grad_norm": 2.3049559593200684,
"learning_rate": 3.804754312219163e-05,
"loss": 0.593,
"step": 8250
},
{
"epoch": 1.2030293147805922,
"grad_norm": 2.3846399784088135,
"learning_rate": 3.7975068850558055e-05,
"loss": 0.6,
"step": 8300
},
{
"epoch": 1.2102764793274632,
"grad_norm": 2.4246721267700195,
"learning_rate": 3.7902594578924484e-05,
"loss": 0.6299,
"step": 8350
},
{
"epoch": 1.2175236438743342,
"grad_norm": 3.0294981002807617,
"learning_rate": 3.783012030729091e-05,
"loss": 0.5957,
"step": 8400
},
{
"epoch": 1.2247708084212052,
"grad_norm": 2.184633255004883,
"learning_rate": 3.775764603565735e-05,
"loss": 0.5781,
"step": 8450
},
{
"epoch": 1.2320179729680762,
"grad_norm": 2.423351287841797,
"learning_rate": 3.7685171764023776e-05,
"loss": 0.571,
"step": 8500
},
{
"epoch": 1.2392651375149473,
"grad_norm": 3.334449529647827,
"learning_rate": 3.7612697492390205e-05,
"loss": 0.579,
"step": 8550
},
{
"epoch": 1.2465123020618183,
"grad_norm": 2.2588014602661133,
"learning_rate": 3.7540223220756633e-05,
"loss": 0.5874,
"step": 8600
},
{
"epoch": 1.2537594666086893,
"grad_norm": 3.5975542068481445,
"learning_rate": 3.746774894912306e-05,
"loss": 0.5633,
"step": 8650
},
{
"epoch": 1.2610066311555603,
"grad_norm": 2.7754578590393066,
"learning_rate": 3.73952746774895e-05,
"loss": 0.6276,
"step": 8700
},
{
"epoch": 1.2682537957024314,
"grad_norm": 2.410757064819336,
"learning_rate": 3.7322800405855926e-05,
"loss": 0.6118,
"step": 8750
},
{
"epoch": 1.2755009602493024,
"grad_norm": 2.5323429107666016,
"learning_rate": 3.7250326134222355e-05,
"loss": 0.5945,
"step": 8800
},
{
"epoch": 1.2827481247961736,
"grad_norm": 2.5905425548553467,
"learning_rate": 3.717785186258878e-05,
"loss": 0.5903,
"step": 8850
},
{
"epoch": 1.2899952893430444,
"grad_norm": 2.042097568511963,
"learning_rate": 3.710537759095521e-05,
"loss": 0.5997,
"step": 8900
},
{
"epoch": 1.2972424538899157,
"grad_norm": 2.1564393043518066,
"learning_rate": 3.703290331932164e-05,
"loss": 0.5697,
"step": 8950
},
{
"epoch": 1.3044896184367867,
"grad_norm": 2.8595190048217773,
"learning_rate": 3.696042904768807e-05,
"loss": 0.6026,
"step": 9000
},
{
"epoch": 1.3044896184367867,
"eval_loss": 0.6820585131645203,
"eval_runtime": 245.5719,
"eval_samples_per_second": 140.468,
"eval_steps_per_second": 14.049,
"step": 9000
},
{
"epoch": 1.3117367829836577,
"grad_norm": 2.401057243347168,
"learning_rate": 3.6887954776054504e-05,
"loss": 0.615,
"step": 9050
},
{
"epoch": 1.3189839475305287,
"grad_norm": 2.1161983013153076,
"learning_rate": 3.681548050442093e-05,
"loss": 0.6468,
"step": 9100
},
{
"epoch": 1.3262311120773997,
"grad_norm": 2.9642062187194824,
"learning_rate": 3.674300623278736e-05,
"loss": 0.5908,
"step": 9150
},
{
"epoch": 1.3334782766242708,
"grad_norm": 2.200223445892334,
"learning_rate": 3.667053196115379e-05,
"loss": 0.5493,
"step": 9200
},
{
"epoch": 1.3407254411711418,
"grad_norm": 2.8368406295776367,
"learning_rate": 3.659805768952022e-05,
"loss": 0.6157,
"step": 9250
},
{
"epoch": 1.3479726057180128,
"grad_norm": 2.500457286834717,
"learning_rate": 3.652558341788665e-05,
"loss": 0.6217,
"step": 9300
},
{
"epoch": 1.3552197702648838,
"grad_norm": 2.435392379760742,
"learning_rate": 3.645310914625308e-05,
"loss": 0.5988,
"step": 9350
},
{
"epoch": 1.3624669348117548,
"grad_norm": 2.5148136615753174,
"learning_rate": 3.638063487461951e-05,
"loss": 0.6043,
"step": 9400
},
{
"epoch": 1.3697140993586259,
"grad_norm": 2.5917887687683105,
"learning_rate": 3.630816060298594e-05,
"loss": 0.6048,
"step": 9450
},
{
"epoch": 1.3769612639054971,
"grad_norm": 3.429553747177124,
"learning_rate": 3.623568633135237e-05,
"loss": 0.5958,
"step": 9500
},
{
"epoch": 1.384208428452368,
"grad_norm": 3.326967239379883,
"learning_rate": 3.6163212059718803e-05,
"loss": 0.5629,
"step": 9550
},
{
"epoch": 1.3914555929992392,
"grad_norm": 2.0327517986297607,
"learning_rate": 3.609073778808523e-05,
"loss": 0.6238,
"step": 9600
},
{
"epoch": 1.39870275754611,
"grad_norm": 2.4792556762695312,
"learning_rate": 3.601826351645166e-05,
"loss": 0.5662,
"step": 9650
},
{
"epoch": 1.4059499220929812,
"grad_norm": 2.692080497741699,
"learning_rate": 3.594578924481809e-05,
"loss": 0.6257,
"step": 9700
},
{
"epoch": 1.4131970866398522,
"grad_norm": 3.3779165744781494,
"learning_rate": 3.587331497318452e-05,
"loss": 0.5654,
"step": 9750
},
{
"epoch": 1.4204442511867232,
"grad_norm": 2.2963967323303223,
"learning_rate": 3.580084070155095e-05,
"loss": 0.6,
"step": 9800
},
{
"epoch": 1.4276914157335943,
"grad_norm": 2.7266321182250977,
"learning_rate": 3.572836642991738e-05,
"loss": 0.6333,
"step": 9850
},
{
"epoch": 1.4349385802804653,
"grad_norm": 1.9801080226898193,
"learning_rate": 3.565589215828381e-05,
"loss": 0.583,
"step": 9900
},
{
"epoch": 1.4421857448273363,
"grad_norm": 2.2797017097473145,
"learning_rate": 3.558341788665024e-05,
"loss": 0.5918,
"step": 9950
},
{
"epoch": 1.4494329093742073,
"grad_norm": 2.325143575668335,
"learning_rate": 3.5510943615016674e-05,
"loss": 0.601,
"step": 10000
},
{
"epoch": 1.4494329093742073,
"eval_loss": 0.6719304919242859,
"eval_runtime": 245.6786,
"eval_samples_per_second": 140.407,
"eval_steps_per_second": 14.043,
"step": 10000
},
{
"epoch": 1.4566800739210783,
"grad_norm": 2.4414122104644775,
"learning_rate": 3.54384693433831e-05,
"loss": 0.5557,
"step": 10050
},
{
"epoch": 1.4639272384679494,
"grad_norm": 3.4254276752471924,
"learning_rate": 3.536599507174953e-05,
"loss": 0.5961,
"step": 10100
},
{
"epoch": 1.4711744030148204,
"grad_norm": 1.9034006595611572,
"learning_rate": 3.529352080011596e-05,
"loss": 0.5716,
"step": 10150
},
{
"epoch": 1.4784215675616914,
"grad_norm": 1.9632625579833984,
"learning_rate": 3.5221046528482395e-05,
"loss": 0.6656,
"step": 10200
},
{
"epoch": 1.4856687321085627,
"grad_norm": 2.634131669998169,
"learning_rate": 3.5148572256848824e-05,
"loss": 0.605,
"step": 10250
},
{
"epoch": 1.4929158966554334,
"grad_norm": 2.171637535095215,
"learning_rate": 3.507609798521525e-05,
"loss": 0.5942,
"step": 10300
},
{
"epoch": 1.5001630612023047,
"grad_norm": 2.2301058769226074,
"learning_rate": 3.500362371358168e-05,
"loss": 0.5776,
"step": 10350
},
{
"epoch": 1.5074102257491755,
"grad_norm": 2.1996707916259766,
"learning_rate": 3.493114944194811e-05,
"loss": 0.6082,
"step": 10400
},
{
"epoch": 1.5146573902960467,
"grad_norm": 2.6578609943389893,
"learning_rate": 3.4858675170314545e-05,
"loss": 0.637,
"step": 10450
},
{
"epoch": 1.5219045548429178,
"grad_norm": 2.902642011642456,
"learning_rate": 3.4786200898680973e-05,
"loss": 0.5898,
"step": 10500
},
{
"epoch": 1.5291517193897888,
"grad_norm": 2.331738233566284,
"learning_rate": 3.47137266270474e-05,
"loss": 0.6314,
"step": 10550
},
{
"epoch": 1.5363988839366598,
"grad_norm": 2.1168956756591797,
"learning_rate": 3.464125235541383e-05,
"loss": 0.6306,
"step": 10600
},
{
"epoch": 1.5436460484835308,
"grad_norm": 3.4085164070129395,
"learning_rate": 3.456877808378026e-05,
"loss": 0.6065,
"step": 10650
},
{
"epoch": 1.5508932130304018,
"grad_norm": 3.533252000808716,
"learning_rate": 3.449630381214669e-05,
"loss": 0.5818,
"step": 10700
},
{
"epoch": 1.5581403775772729,
"grad_norm": 2.3345284461975098,
"learning_rate": 3.4423829540513116e-05,
"loss": 0.5762,
"step": 10750
},
{
"epoch": 1.5653875421241439,
"grad_norm": 2.5388126373291016,
"learning_rate": 3.4351355268879545e-05,
"loss": 0.5967,
"step": 10800
},
{
"epoch": 1.572634706671015,
"grad_norm": 2.583822250366211,
"learning_rate": 3.4278880997245973e-05,
"loss": 0.6388,
"step": 10850
},
{
"epoch": 1.5798818712178861,
"grad_norm": 2.8442914485931396,
"learning_rate": 3.420640672561241e-05,
"loss": 0.6049,
"step": 10900
},
{
"epoch": 1.587129035764757,
"grad_norm": 2.080157518386841,
"learning_rate": 3.413393245397884e-05,
"loss": 0.5681,
"step": 10950
},
{
"epoch": 1.5943762003116282,
"grad_norm": 3.178879976272583,
"learning_rate": 3.4061458182345266e-05,
"loss": 0.5877,
"step": 11000
},
{
"epoch": 1.5943762003116282,
"eval_loss": 0.6599454283714294,
"eval_runtime": 245.7578,
"eval_samples_per_second": 140.362,
"eval_steps_per_second": 14.038,
"step": 11000
},
{
"epoch": 1.601623364858499,
"grad_norm": 2.6149051189422607,
"learning_rate": 3.3988983910711694e-05,
"loss": 0.5706,
"step": 11050
},
{
"epoch": 1.6088705294053702,
"grad_norm": 2.2884621620178223,
"learning_rate": 3.391650963907813e-05,
"loss": 0.5929,
"step": 11100
},
{
"epoch": 1.616117693952241,
"grad_norm": 2.5482795238494873,
"learning_rate": 3.384403536744456e-05,
"loss": 0.5772,
"step": 11150
},
{
"epoch": 1.6233648584991123,
"grad_norm": 2.499694347381592,
"learning_rate": 3.377156109581099e-05,
"loss": 0.589,
"step": 11200
},
{
"epoch": 1.6306120230459833,
"grad_norm": 3.6115667819976807,
"learning_rate": 3.3699086824177416e-05,
"loss": 0.5824,
"step": 11250
},
{
"epoch": 1.6378591875928543,
"grad_norm": 3.2503881454467773,
"learning_rate": 3.3626612552543844e-05,
"loss": 0.5768,
"step": 11300
},
{
"epoch": 1.6451063521397253,
"grad_norm": 2.821199893951416,
"learning_rate": 3.355413828091028e-05,
"loss": 0.5685,
"step": 11350
},
{
"epoch": 1.6523535166865964,
"grad_norm": 2.602804660797119,
"learning_rate": 3.348166400927671e-05,
"loss": 0.595,
"step": 11400
},
{
"epoch": 1.6596006812334674,
"grad_norm": 3.2898738384246826,
"learning_rate": 3.340918973764314e-05,
"loss": 0.5737,
"step": 11450
},
{
"epoch": 1.6668478457803384,
"grad_norm": 3.5453619956970215,
"learning_rate": 3.3336715466009565e-05,
"loss": 0.6154,
"step": 11500
},
{
"epoch": 1.6740950103272096,
"grad_norm": 2.276144027709961,
"learning_rate": 3.3264241194376e-05,
"loss": 0.5962,
"step": 11550
},
{
"epoch": 1.6813421748740804,
"grad_norm": 2.5161609649658203,
"learning_rate": 3.319176692274243e-05,
"loss": 0.5486,
"step": 11600
},
{
"epoch": 1.6885893394209517,
"grad_norm": 2.3692619800567627,
"learning_rate": 3.311929265110886e-05,
"loss": 0.5959,
"step": 11650
},
{
"epoch": 1.6958365039678225,
"grad_norm": 2.2731165885925293,
"learning_rate": 3.3046818379475286e-05,
"loss": 0.5484,
"step": 11700
},
{
"epoch": 1.7030836685146937,
"grad_norm": 2.9670755863189697,
"learning_rate": 3.297434410784172e-05,
"loss": 0.5648,
"step": 11750
},
{
"epoch": 1.7103308330615645,
"grad_norm": 2.7299599647521973,
"learning_rate": 3.290186983620815e-05,
"loss": 0.6107,
"step": 11800
},
{
"epoch": 1.7175779976084358,
"grad_norm": 2.5043752193450928,
"learning_rate": 3.282939556457458e-05,
"loss": 0.5998,
"step": 11850
},
{
"epoch": 1.7248251621553068,
"grad_norm": 2.849438428878784,
"learning_rate": 3.275692129294101e-05,
"loss": 0.5729,
"step": 11900
},
{
"epoch": 1.7320723267021778,
"grad_norm": 2.4421026706695557,
"learning_rate": 3.2684447021307436e-05,
"loss": 0.5821,
"step": 11950
},
{
"epoch": 1.7393194912490488,
"grad_norm": 2.8539223670959473,
"learning_rate": 3.261197274967387e-05,
"loss": 0.566,
"step": 12000
},
{
"epoch": 1.7393194912490488,
"eval_loss": 0.6502553820610046,
"eval_runtime": 247.5286,
"eval_samples_per_second": 139.358,
"eval_steps_per_second": 13.938,
"step": 12000
},
{
"epoch": 1.7465666557959199,
"grad_norm": 2.71627140045166,
"learning_rate": 3.25394984780403e-05,
"loss": 0.6029,
"step": 12050
},
{
"epoch": 1.7538138203427909,
"grad_norm": 2.3385589122772217,
"learning_rate": 3.246702420640673e-05,
"loss": 0.5514,
"step": 12100
},
{
"epoch": 1.761060984889662,
"grad_norm": 2.6175739765167236,
"learning_rate": 3.239454993477316e-05,
"loss": 0.6304,
"step": 12150
},
{
"epoch": 1.768308149436533,
"grad_norm": 2.545201301574707,
"learning_rate": 3.232207566313959e-05,
"loss": 0.5833,
"step": 12200
},
{
"epoch": 1.775555313983404,
"grad_norm": 3.0050761699676514,
"learning_rate": 3.225105087693869e-05,
"loss": 0.5914,
"step": 12250
},
{
"epoch": 1.7828024785302752,
"grad_norm": 2.7314724922180176,
"learning_rate": 3.217857660530512e-05,
"loss": 0.5652,
"step": 12300
},
{
"epoch": 1.790049643077146,
"grad_norm": 2.4256742000579834,
"learning_rate": 3.210610233367155e-05,
"loss": 0.5891,
"step": 12350
},
{
"epoch": 1.7972968076240172,
"grad_norm": 3.057068109512329,
"learning_rate": 3.203362806203798e-05,
"loss": 0.5873,
"step": 12400
},
{
"epoch": 1.804543972170888,
"grad_norm": 2.393101215362549,
"learning_rate": 3.196115379040441e-05,
"loss": 0.5735,
"step": 12450
},
{
"epoch": 1.8117911367177593,
"grad_norm": 2.644343614578247,
"learning_rate": 3.188867951877084e-05,
"loss": 0.569,
"step": 12500
},
{
"epoch": 1.81903830126463,
"grad_norm": 2.401324987411499,
"learning_rate": 3.181620524713727e-05,
"loss": 0.5358,
"step": 12550
},
{
"epoch": 1.8262854658115013,
"grad_norm": 2.5138838291168213,
"learning_rate": 3.17437309755037e-05,
"loss": 0.5348,
"step": 12600
},
{
"epoch": 1.8335326303583723,
"grad_norm": 1.8828959465026855,
"learning_rate": 3.167125670387013e-05,
"loss": 0.5899,
"step": 12650
},
{
"epoch": 1.8407797949052433,
"grad_norm": 2.3968441486358643,
"learning_rate": 3.159878243223656e-05,
"loss": 0.5385,
"step": 12700
},
{
"epoch": 1.8480269594521144,
"grad_norm": 2.0079128742218018,
"learning_rate": 3.152630816060299e-05,
"loss": 0.5341,
"step": 12750
},
{
"epoch": 1.8552741239989854,
"grad_norm": 1.722406029701233,
"learning_rate": 3.145383388896942e-05,
"loss": 0.5592,
"step": 12800
},
{
"epoch": 1.8625212885458564,
"grad_norm": 2.9170126914978027,
"learning_rate": 3.1381359617335845e-05,
"loss": 0.5637,
"step": 12850
},
{
"epoch": 1.8697684530927274,
"grad_norm": 3.159693479537964,
"learning_rate": 3.1308885345702274e-05,
"loss": 0.6119,
"step": 12900
},
{
"epoch": 1.8770156176395985,
"grad_norm": 1.9590739011764526,
"learning_rate": 3.12364110740687e-05,
"loss": 0.6167,
"step": 12950
},
{
"epoch": 1.8842627821864695,
"grad_norm": 2.080077886581421,
"learning_rate": 3.116393680243514e-05,
"loss": 0.5713,
"step": 13000
},
{
"epoch": 1.8842627821864695,
"eval_loss": 0.6442924737930298,
"eval_runtime": 246.2742,
"eval_samples_per_second": 140.067,
"eval_steps_per_second": 14.009,
"step": 13000
},
{
"epoch": 1.8915099467333407,
"grad_norm": 3.022768974304199,
"learning_rate": 3.1091462530801567e-05,
"loss": 0.5889,
"step": 13050
},
{
"epoch": 1.8987571112802115,
"grad_norm": 2.6845195293426514,
"learning_rate": 3.1018988259167995e-05,
"loss": 0.5536,
"step": 13100
},
{
"epoch": 1.9060042758270828,
"grad_norm": 3.178252696990967,
"learning_rate": 3.0946513987534424e-05,
"loss": 0.6397,
"step": 13150
},
{
"epoch": 1.9132514403739536,
"grad_norm": 3.4886860847473145,
"learning_rate": 3.087403971590085e-05,
"loss": 0.5945,
"step": 13200
},
{
"epoch": 1.9204986049208248,
"grad_norm": 2.4418983459472656,
"learning_rate": 3.080156544426729e-05,
"loss": 0.6039,
"step": 13250
},
{
"epoch": 1.9277457694676956,
"grad_norm": 2.7599799633026123,
"learning_rate": 3.0729091172633716e-05,
"loss": 0.5496,
"step": 13300
},
{
"epoch": 1.9349929340145668,
"grad_norm": 2.276477098464966,
"learning_rate": 3.0656616901000145e-05,
"loss": 0.5836,
"step": 13350
},
{
"epoch": 1.9422400985614379,
"grad_norm": 2.3959388732910156,
"learning_rate": 3.058414262936657e-05,
"loss": 0.603,
"step": 13400
},
{
"epoch": 1.9494872631083089,
"grad_norm": 2.58145809173584,
"learning_rate": 3.051166835773301e-05,
"loss": 0.5671,
"step": 13450
},
{
"epoch": 1.95673442765518,
"grad_norm": 1.9884346723556519,
"learning_rate": 3.0439194086099437e-05,
"loss": 0.5766,
"step": 13500
},
{
"epoch": 1.963981592202051,
"grad_norm": 2.8933162689208984,
"learning_rate": 3.0366719814465866e-05,
"loss": 0.5529,
"step": 13550
},
{
"epoch": 1.971228756748922,
"grad_norm": 3.837768316268921,
"learning_rate": 3.0294245542832294e-05,
"loss": 0.5698,
"step": 13600
},
{
"epoch": 1.978475921295793,
"grad_norm": 5.475804805755615,
"learning_rate": 3.0221771271198723e-05,
"loss": 0.5887,
"step": 13650
},
{
"epoch": 1.9857230858426642,
"grad_norm": 2.9813613891601562,
"learning_rate": 3.014929699956516e-05,
"loss": 0.5685,
"step": 13700
},
{
"epoch": 1.992970250389535,
"grad_norm": 2.6293349266052246,
"learning_rate": 3.0076822727931587e-05,
"loss": 0.5773,
"step": 13750
},
{
"epoch": 2.0002174149364063,
"grad_norm": 2.593838691711426,
"learning_rate": 3.0004348456298015e-05,
"loss": 0.5628,
"step": 13800
},
{
"epoch": 2.007464579483277,
"grad_norm": 1.6289088726043701,
"learning_rate": 2.9931874184664444e-05,
"loss": 0.4418,
"step": 13850
},
{
"epoch": 2.0147117440301483,
"grad_norm": 2.8888840675354004,
"learning_rate": 2.985939991303088e-05,
"loss": 0.4652,
"step": 13900
},
{
"epoch": 2.021958908577019,
"grad_norm": 2.572951316833496,
"learning_rate": 2.9786925641397308e-05,
"loss": 0.4685,
"step": 13950
},
{
"epoch": 2.0292060731238903,
"grad_norm": 3.1881916522979736,
"learning_rate": 2.9714451369763737e-05,
"loss": 0.4583,
"step": 14000
},
{
"epoch": 2.0292060731238903,
"eval_loss": 0.6516901254653931,
"eval_runtime": 247.3517,
"eval_samples_per_second": 139.457,
"eval_steps_per_second": 13.948,
"step": 14000
},
{
"epoch": 2.036453237670761,
"grad_norm": 2.9465830326080322,
"learning_rate": 2.9641977098130165e-05,
"loss": 0.4494,
"step": 14050
},
{
"epoch": 2.0437004022176324,
"grad_norm": 1.7231141328811646,
"learning_rate": 2.9569502826496597e-05,
"loss": 0.4183,
"step": 14100
},
{
"epoch": 2.050947566764503,
"grad_norm": 3.4768612384796143,
"learning_rate": 2.9497028554863026e-05,
"loss": 0.4223,
"step": 14150
},
{
"epoch": 2.0581947313113744,
"grad_norm": 2.902104139328003,
"learning_rate": 2.9424554283229454e-05,
"loss": 0.4569,
"step": 14200
},
{
"epoch": 2.0654418958582457,
"grad_norm": 2.661472797393799,
"learning_rate": 2.9352080011595883e-05,
"loss": 0.4551,
"step": 14250
},
{
"epoch": 2.0726890604051165,
"grad_norm": 2.9054114818573,
"learning_rate": 2.927960573996231e-05,
"loss": 0.4279,
"step": 14300
},
{
"epoch": 2.0799362249519877,
"grad_norm": 2.9954373836517334,
"learning_rate": 2.9207131468328747e-05,
"loss": 0.4291,
"step": 14350
},
{
"epoch": 2.0871833894988585,
"grad_norm": 2.9396731853485107,
"learning_rate": 2.9134657196695175e-05,
"loss": 0.4392,
"step": 14400
},
{
"epoch": 2.0944305540457298,
"grad_norm": 2.4928243160247803,
"learning_rate": 2.9062182925061604e-05,
"loss": 0.402,
"step": 14450
},
{
"epoch": 2.1016777185926006,
"grad_norm": 3.2848997116088867,
"learning_rate": 2.8989708653428032e-05,
"loss": 0.4189,
"step": 14500
},
{
"epoch": 2.108924883139472,
"grad_norm": 3.1870994567871094,
"learning_rate": 2.8917234381794468e-05,
"loss": 0.4421,
"step": 14550
},
{
"epoch": 2.1161720476863426,
"grad_norm": 2.7032647132873535,
"learning_rate": 2.8844760110160896e-05,
"loss": 0.4415,
"step": 14600
},
{
"epoch": 2.123419212233214,
"grad_norm": 3.0945403575897217,
"learning_rate": 2.8772285838527325e-05,
"loss": 0.4515,
"step": 14650
},
{
"epoch": 2.1306663767800846,
"grad_norm": 2.7250170707702637,
"learning_rate": 2.8699811566893753e-05,
"loss": 0.4374,
"step": 14700
},
{
"epoch": 2.137913541326956,
"grad_norm": 3.7444469928741455,
"learning_rate": 2.8627337295260182e-05,
"loss": 0.4738,
"step": 14750
},
{
"epoch": 2.1451607058738267,
"grad_norm": 3.042750597000122,
"learning_rate": 2.8554863023626614e-05,
"loss": 0.444,
"step": 14800
},
{
"epoch": 2.152407870420698,
"grad_norm": 2.8017637729644775,
"learning_rate": 2.8482388751993046e-05,
"loss": 0.4425,
"step": 14850
},
{
"epoch": 2.159655034967569,
"grad_norm": 2.268974781036377,
"learning_rate": 2.8409914480359475e-05,
"loss": 0.4594,
"step": 14900
},
{
"epoch": 2.16690219951444,
"grad_norm": 2.553354263305664,
"learning_rate": 2.8337440208725903e-05,
"loss": 0.451,
"step": 14950
},
{
"epoch": 2.174149364061311,
"grad_norm": 3.928313970565796,
"learning_rate": 2.8264965937092335e-05,
"loss": 0.4609,
"step": 15000
},
{
"epoch": 2.174149364061311,
"eval_loss": 0.6510897278785706,
"eval_runtime": 245.6483,
"eval_samples_per_second": 140.424,
"eval_steps_per_second": 14.044,
"step": 15000
},
{
"epoch": 2.181396528608182,
"grad_norm": 2.9727916717529297,
"learning_rate": 2.8192491665458764e-05,
"loss": 0.445,
"step": 15050
},
{
"epoch": 2.1886436931550532,
"grad_norm": 2.9330101013183594,
"learning_rate": 2.8120017393825192e-05,
"loss": 0.4395,
"step": 15100
},
{
"epoch": 2.195890857701924,
"grad_norm": 2.5062899589538574,
"learning_rate": 2.804754312219162e-05,
"loss": 0.4293,
"step": 15150
},
{
"epoch": 2.2031380222487953,
"grad_norm": 3.488398313522339,
"learning_rate": 2.797506885055805e-05,
"loss": 0.4382,
"step": 15200
},
{
"epoch": 2.210385186795666,
"grad_norm": 2.6354928016662598,
"learning_rate": 2.7902594578924485e-05,
"loss": 0.4348,
"step": 15250
},
{
"epoch": 2.2176323513425373,
"grad_norm": 2.0676591396331787,
"learning_rate": 2.7830120307290913e-05,
"loss": 0.4506,
"step": 15300
},
{
"epoch": 2.224879515889408,
"grad_norm": 3.5719449520111084,
"learning_rate": 2.7757646035657342e-05,
"loss": 0.4718,
"step": 15350
},
{
"epoch": 2.2321266804362794,
"grad_norm": 2.755336284637451,
"learning_rate": 2.768517176402377e-05,
"loss": 0.4934,
"step": 15400
},
{
"epoch": 2.23937384498315,
"grad_norm": 2.0585694313049316,
"learning_rate": 2.7612697492390206e-05,
"loss": 0.4199,
"step": 15450
},
{
"epoch": 2.2466210095300214,
"grad_norm": 3.553250551223755,
"learning_rate": 2.7540223220756634e-05,
"loss": 0.4414,
"step": 15500
},
{
"epoch": 2.253868174076892,
"grad_norm": 2.6371939182281494,
"learning_rate": 2.7467748949123063e-05,
"loss": 0.4238,
"step": 15550
},
{
"epoch": 2.2611153386237635,
"grad_norm": 2.935482978820801,
"learning_rate": 2.739527467748949e-05,
"loss": 0.4421,
"step": 15600
},
{
"epoch": 2.2683625031706347,
"grad_norm": 2.8056271076202393,
"learning_rate": 2.7322800405855923e-05,
"loss": 0.4838,
"step": 15650
},
{
"epoch": 2.2756096677175055,
"grad_norm": 1.6445329189300537,
"learning_rate": 2.7250326134222352e-05,
"loss": 0.4727,
"step": 15700
},
{
"epoch": 2.2828568322643767,
"grad_norm": 2.901073694229126,
"learning_rate": 2.7177851862588784e-05,
"loss": 0.451,
"step": 15750
},
{
"epoch": 2.2901039968112475,
"grad_norm": 3.6521453857421875,
"learning_rate": 2.7105377590955213e-05,
"loss": 0.4441,
"step": 15800
},
{
"epoch": 2.297351161358119,
"grad_norm": 3.1472737789154053,
"learning_rate": 2.703290331932164e-05,
"loss": 0.4436,
"step": 15850
},
{
"epoch": 2.3045983259049896,
"grad_norm": 2.873993396759033,
"learning_rate": 2.6960429047688073e-05,
"loss": 0.4555,
"step": 15900
},
{
"epoch": 2.311845490451861,
"grad_norm": 2.5647237300872803,
"learning_rate": 2.68879547760545e-05,
"loss": 0.4621,
"step": 15950
},
{
"epoch": 2.3190926549987316,
"grad_norm": 2.7584807872772217,
"learning_rate": 2.681548050442093e-05,
"loss": 0.4497,
"step": 16000
},
{
"epoch": 2.3190926549987316,
"eval_loss": 0.644283652305603,
"eval_runtime": 245.6009,
"eval_samples_per_second": 140.451,
"eval_steps_per_second": 14.047,
"step": 16000
},
{
"epoch": 2.326339819545603,
"grad_norm": 3.315335750579834,
"learning_rate": 2.674300623278736e-05,
"loss": 0.4505,
"step": 16050
},
{
"epoch": 2.3335869840924737,
"grad_norm": 2.8071439266204834,
"learning_rate": 2.6670531961153794e-05,
"loss": 0.4457,
"step": 16100
},
{
"epoch": 2.340834148639345,
"grad_norm": 2.4625017642974854,
"learning_rate": 2.6598057689520223e-05,
"loss": 0.418,
"step": 16150
},
{
"epoch": 2.3480813131862157,
"grad_norm": 3.5972659587860107,
"learning_rate": 2.652558341788665e-05,
"loss": 0.4188,
"step": 16200
},
{
"epoch": 2.355328477733087,
"grad_norm": 2.557579517364502,
"learning_rate": 2.645310914625308e-05,
"loss": 0.4161,
"step": 16250
},
{
"epoch": 2.3625756422799578,
"grad_norm": 2.0252676010131836,
"learning_rate": 2.638063487461951e-05,
"loss": 0.4504,
"step": 16300
},
{
"epoch": 2.369822806826829,
"grad_norm": 2.324404001235962,
"learning_rate": 2.6308160602985944e-05,
"loss": 0.454,
"step": 16350
},
{
"epoch": 2.3770699713737002,
"grad_norm": 2.3498804569244385,
"learning_rate": 2.6235686331352372e-05,
"loss": 0.4782,
"step": 16400
},
{
"epoch": 2.384317135920571,
"grad_norm": 2.642273187637329,
"learning_rate": 2.61632120597188e-05,
"loss": 0.4531,
"step": 16450
},
{
"epoch": 2.3915643004674423,
"grad_norm": 3.5636932849884033,
"learning_rate": 2.609073778808523e-05,
"loss": 0.4545,
"step": 16500
},
{
"epoch": 2.398811465014313,
"grad_norm": 3.15792179107666,
"learning_rate": 2.601826351645166e-05,
"loss": 0.4827,
"step": 16550
},
{
"epoch": 2.4060586295611843,
"grad_norm": 2.4860501289367676,
"learning_rate": 2.594578924481809e-05,
"loss": 0.4629,
"step": 16600
},
{
"epoch": 2.413305794108055,
"grad_norm": 5.471420764923096,
"learning_rate": 2.5873314973184522e-05,
"loss": 0.4376,
"step": 16650
},
{
"epoch": 2.4205529586549264,
"grad_norm": 3.098515033721924,
"learning_rate": 2.580084070155095e-05,
"loss": 0.4371,
"step": 16700
},
{
"epoch": 2.427800123201797,
"grad_norm": 2.5739009380340576,
"learning_rate": 2.5728366429917383e-05,
"loss": 0.4339,
"step": 16750
},
{
"epoch": 2.4350472877486684,
"grad_norm": 3.0826826095581055,
"learning_rate": 2.565589215828381e-05,
"loss": 0.4541,
"step": 16800
},
{
"epoch": 2.442294452295539,
"grad_norm": 3.0049545764923096,
"learning_rate": 2.558341788665024e-05,
"loss": 0.4492,
"step": 16850
},
{
"epoch": 2.4495416168424105,
"grad_norm": 3.194600820541382,
"learning_rate": 2.5510943615016668e-05,
"loss": 0.4509,
"step": 16900
},
{
"epoch": 2.4567887813892813,
"grad_norm": 2.9486989974975586,
"learning_rate": 2.5438469343383097e-05,
"loss": 0.4104,
"step": 16950
},
{
"epoch": 2.4640359459361525,
"grad_norm": 2.8794190883636475,
"learning_rate": 2.5365995071749532e-05,
"loss": 0.4308,
"step": 17000
},
{
"epoch": 2.4640359459361525,
"eval_loss": 0.6368168592453003,
"eval_runtime": 245.6078,
"eval_samples_per_second": 140.447,
"eval_steps_per_second": 14.047,
"step": 17000
},
{
"epoch": 2.4712831104830233,
"grad_norm": 2.2457711696624756,
"learning_rate": 2.529352080011596e-05,
"loss": 0.4467,
"step": 17050
},
{
"epoch": 2.4785302750298945,
"grad_norm": 3.0438215732574463,
"learning_rate": 2.522104652848239e-05,
"loss": 0.4748,
"step": 17100
},
{
"epoch": 2.485777439576766,
"grad_norm": 3.1453700065612793,
"learning_rate": 2.5148572256848818e-05,
"loss": 0.4594,
"step": 17150
},
{
"epoch": 2.4930246041236366,
"grad_norm": 2.476498603820801,
"learning_rate": 2.5076097985215253e-05,
"loss": 0.4637,
"step": 17200
},
{
"epoch": 2.500271768670508,
"grad_norm": 2.5618162155151367,
"learning_rate": 2.5003623713581682e-05,
"loss": 0.4272,
"step": 17250
},
{
"epoch": 2.5075189332173786,
"grad_norm": 2.719830274581909,
"learning_rate": 2.4932598927380782e-05,
"loss": 0.4436,
"step": 17300
},
{
"epoch": 2.51476609776425,
"grad_norm": 2.393423557281494,
"learning_rate": 2.486012465574721e-05,
"loss": 0.5006,
"step": 17350
},
{
"epoch": 2.5220132623111207,
"grad_norm": 2.518490791320801,
"learning_rate": 2.4787650384113642e-05,
"loss": 0.4378,
"step": 17400
},
{
"epoch": 2.529260426857992,
"grad_norm": 3.152761697769165,
"learning_rate": 2.471517611248007e-05,
"loss": 0.4415,
"step": 17450
},
{
"epoch": 2.5365075914048627,
"grad_norm": 2.689821243286133,
"learning_rate": 2.46427018408465e-05,
"loss": 0.4736,
"step": 17500
},
{
"epoch": 2.543754755951734,
"grad_norm": 2.7041850090026855,
"learning_rate": 2.457022756921293e-05,
"loss": 0.4619,
"step": 17550
},
{
"epoch": 2.5510019204986047,
"grad_norm": 2.8072993755340576,
"learning_rate": 2.449775329757936e-05,
"loss": 0.443,
"step": 17600
},
{
"epoch": 2.558249085045476,
"grad_norm": 2.700951099395752,
"learning_rate": 2.4425279025945792e-05,
"loss": 0.4338,
"step": 17650
},
{
"epoch": 2.5654962495923472,
"grad_norm": 2.2559311389923096,
"learning_rate": 2.435280475431222e-05,
"loss": 0.4371,
"step": 17700
},
{
"epoch": 2.572743414139218,
"grad_norm": 2.7183778285980225,
"learning_rate": 2.4280330482678653e-05,
"loss": 0.4479,
"step": 17750
},
{
"epoch": 2.579990578686089,
"grad_norm": 2.337385654449463,
"learning_rate": 2.420785621104508e-05,
"loss": 0.4703,
"step": 17800
},
{
"epoch": 2.58723774323296,
"grad_norm": 2.985521078109741,
"learning_rate": 2.413538193941151e-05,
"loss": 0.44,
"step": 17850
},
{
"epoch": 2.5944849077798313,
"grad_norm": 2.26230788230896,
"learning_rate": 2.406290766777794e-05,
"loss": 0.4485,
"step": 17900
},
{
"epoch": 2.601732072326702,
"grad_norm": 3.0831480026245117,
"learning_rate": 2.399043339614437e-05,
"loss": 0.439,
"step": 17950
},
{
"epoch": 2.6089792368735734,
"grad_norm": 3.325925827026367,
"learning_rate": 2.39179591245108e-05,
"loss": 0.4809,
"step": 18000
},
{
"epoch": 2.6089792368735734,
"eval_loss": 0.6339168548583984,
"eval_runtime": 245.517,
"eval_samples_per_second": 140.499,
"eval_steps_per_second": 14.052,
"step": 18000
},
{
"epoch": 2.616226401420444,
"grad_norm": 3.043121099472046,
"learning_rate": 2.3845484852877227e-05,
"loss": 0.4382,
"step": 18050
},
{
"epoch": 2.6234735659673154,
"grad_norm": 2.225372314453125,
"learning_rate": 2.377301058124366e-05,
"loss": 0.4379,
"step": 18100
},
{
"epoch": 2.630720730514186,
"grad_norm": 3.059220790863037,
"learning_rate": 2.3700536309610088e-05,
"loss": 0.4475,
"step": 18150
},
{
"epoch": 2.6379678950610574,
"grad_norm": 3.8372066020965576,
"learning_rate": 2.362806203797652e-05,
"loss": 0.4272,
"step": 18200
},
{
"epoch": 2.6452150596079282,
"grad_norm": 1.4763438701629639,
"learning_rate": 2.355558776634295e-05,
"loss": 0.4629,
"step": 18250
},
{
"epoch": 2.6524622241547995,
"grad_norm": 3.0369021892547607,
"learning_rate": 2.348311349470938e-05,
"loss": 0.438,
"step": 18300
},
{
"epoch": 2.6597093887016703,
"grad_norm": 2.6003599166870117,
"learning_rate": 2.341063922307581e-05,
"loss": 0.4205,
"step": 18350
},
{
"epoch": 2.6669565532485415,
"grad_norm": 2.9726734161376953,
"learning_rate": 2.333816495144224e-05,
"loss": 0.4453,
"step": 18400
},
{
"epoch": 2.6742037177954128,
"grad_norm": 3.3538806438446045,
"learning_rate": 2.326569067980867e-05,
"loss": 0.4237,
"step": 18450
},
{
"epoch": 2.6814508823422836,
"grad_norm": 2.160491704940796,
"learning_rate": 2.31932164081751e-05,
"loss": 0.4433,
"step": 18500
},
{
"epoch": 2.6886980468891544,
"grad_norm": 3.117530345916748,
"learning_rate": 2.312074213654153e-05,
"loss": 0.4219,
"step": 18550
},
{
"epoch": 2.6959452114360256,
"grad_norm": 2.787057638168335,
"learning_rate": 2.304826786490796e-05,
"loss": 0.4391,
"step": 18600
},
{
"epoch": 2.703192375982897,
"grad_norm": 3.635530471801758,
"learning_rate": 2.2975793593274387e-05,
"loss": 0.461,
"step": 18650
},
{
"epoch": 2.7104395405297677,
"grad_norm": 2.3657073974609375,
"learning_rate": 2.290331932164082e-05,
"loss": 0.4692,
"step": 18700
},
{
"epoch": 2.717686705076639,
"grad_norm": 2.611757516860962,
"learning_rate": 2.2830845050007248e-05,
"loss": 0.4479,
"step": 18750
},
{
"epoch": 2.7249338696235097,
"grad_norm": 2.967528820037842,
"learning_rate": 2.2758370778373676e-05,
"loss": 0.424,
"step": 18800
},
{
"epoch": 2.732181034170381,
"grad_norm": 5.77213191986084,
"learning_rate": 2.268589650674011e-05,
"loss": 0.4389,
"step": 18850
},
{
"epoch": 2.7394281987172517,
"grad_norm": 2.8429954051971436,
"learning_rate": 2.2613422235106537e-05,
"loss": 0.4829,
"step": 18900
},
{
"epoch": 2.746675363264123,
"grad_norm": 2.9622368812561035,
"learning_rate": 2.254094796347297e-05,
"loss": 0.4781,
"step": 18950
},
{
"epoch": 2.7539225278109942,
"grad_norm": 3.898066997528076,
"learning_rate": 2.2468473691839397e-05,
"loss": 0.4847,
"step": 19000
},
{
"epoch": 2.7539225278109942,
"eval_loss": 0.6265138983726501,
"eval_runtime": 245.5072,
"eval_samples_per_second": 140.505,
"eval_steps_per_second": 14.053,
"step": 19000
},
{
"epoch": 2.761169692357865,
"grad_norm": 2.6696014404296875,
"learning_rate": 2.239599942020583e-05,
"loss": 0.4233,
"step": 19050
},
{
"epoch": 2.768416856904736,
"grad_norm": 2.9069409370422363,
"learning_rate": 2.2323525148572258e-05,
"loss": 0.4632,
"step": 19100
},
{
"epoch": 2.775664021451607,
"grad_norm": 3.3168020248413086,
"learning_rate": 2.2251050876938687e-05,
"loss": 0.4483,
"step": 19150
},
{
"epoch": 2.7829111859984783,
"grad_norm": 2.685267686843872,
"learning_rate": 2.217857660530512e-05,
"loss": 0.4321,
"step": 19200
},
{
"epoch": 2.790158350545349,
"grad_norm": 3.203169584274292,
"learning_rate": 2.2106102333671547e-05,
"loss": 0.4187,
"step": 19250
},
{
"epoch": 2.79740551509222,
"grad_norm": 2.244285821914673,
"learning_rate": 2.203362806203798e-05,
"loss": 0.4336,
"step": 19300
},
{
"epoch": 2.804652679639091,
"grad_norm": 2.8344221115112305,
"learning_rate": 2.1961153790404408e-05,
"loss": 0.4469,
"step": 19350
},
{
"epoch": 2.8118998441859624,
"grad_norm": 2.193204879760742,
"learning_rate": 2.188867951877084e-05,
"loss": 0.4394,
"step": 19400
},
{
"epoch": 2.819147008732833,
"grad_norm": 2.7660672664642334,
"learning_rate": 2.1816205247137268e-05,
"loss": 0.4646,
"step": 19450
},
{
"epoch": 2.8263941732797044,
"grad_norm": 1.7778669595718384,
"learning_rate": 2.1743730975503697e-05,
"loss": 0.4176,
"step": 19500
},
{
"epoch": 2.8336413378265752,
"grad_norm": 2.1182384490966797,
"learning_rate": 2.1671256703870125e-05,
"loss": 0.4266,
"step": 19550
},
{
"epoch": 2.8408885023734465,
"grad_norm": 3.023648262023926,
"learning_rate": 2.1598782432236557e-05,
"loss": 0.4378,
"step": 19600
},
{
"epoch": 2.8481356669203173,
"grad_norm": 3.217515707015991,
"learning_rate": 2.1526308160602986e-05,
"loss": 0.4328,
"step": 19650
},
{
"epoch": 2.8553828314671885,
"grad_norm": 2.5275774002075195,
"learning_rate": 2.1453833888969414e-05,
"loss": 0.3978,
"step": 19700
},
{
"epoch": 2.8626299960140598,
"grad_norm": 3.3163788318634033,
"learning_rate": 2.1381359617335846e-05,
"loss": 0.4594,
"step": 19750
},
{
"epoch": 2.8698771605609306,
"grad_norm": 3.3472440242767334,
"learning_rate": 2.1308885345702275e-05,
"loss": 0.4368,
"step": 19800
},
{
"epoch": 2.8771243251078014,
"grad_norm": 3.1864593029022217,
"learning_rate": 2.1236411074068707e-05,
"loss": 0.4715,
"step": 19850
},
{
"epoch": 2.8843714896546726,
"grad_norm": 2.73544979095459,
"learning_rate": 2.1163936802435135e-05,
"loss": 0.443,
"step": 19900
},
{
"epoch": 2.891618654201544,
"grad_norm": 3.4767727851867676,
"learning_rate": 2.1091462530801567e-05,
"loss": 0.4429,
"step": 19950
},
{
"epoch": 2.8988658187484146,
"grad_norm": 2.7174811363220215,
"learning_rate": 2.1018988259167996e-05,
"loss": 0.4067,
"step": 20000
},
{
"epoch": 2.8988658187484146,
"eval_loss": 0.622748076915741,
"eval_runtime": 245.5244,
"eval_samples_per_second": 140.495,
"eval_steps_per_second": 14.052,
"step": 20000
},
{
"epoch": 2.9061129832952854,
"grad_norm": 2.311414957046509,
"learning_rate": 2.0947963472967096e-05,
"loss": 0.451,
"step": 20050
},
{
"epoch": 2.9133601478421567,
"grad_norm": 2.9489965438842773,
"learning_rate": 2.0875489201333528e-05,
"loss": 0.4621,
"step": 20100
},
{
"epoch": 2.920607312389028,
"grad_norm": 3.1284987926483154,
"learning_rate": 2.0803014929699957e-05,
"loss": 0.4455,
"step": 20150
},
{
"epoch": 2.9278544769358987,
"grad_norm": 2.579033613204956,
"learning_rate": 2.073054065806639e-05,
"loss": 0.4555,
"step": 20200
},
{
"epoch": 2.93510164148277,
"grad_norm": 2.4020378589630127,
"learning_rate": 2.0658066386432817e-05,
"loss": 0.4531,
"step": 20250
},
{
"epoch": 2.9423488060296408,
"grad_norm": 2.3254435062408447,
"learning_rate": 2.0585592114799246e-05,
"loss": 0.4434,
"step": 20300
},
{
"epoch": 2.949595970576512,
"grad_norm": 2.742219924926758,
"learning_rate": 2.0513117843165678e-05,
"loss": 0.444,
"step": 20350
},
{
"epoch": 2.956843135123383,
"grad_norm": 2.3544771671295166,
"learning_rate": 2.0440643571532106e-05,
"loss": 0.4576,
"step": 20400
},
{
"epoch": 2.964090299670254,
"grad_norm": 2.842970609664917,
"learning_rate": 2.0368169299898538e-05,
"loss": 0.4416,
"step": 20450
},
{
"epoch": 2.9713374642171253,
"grad_norm": 2.2534501552581787,
"learning_rate": 2.0295695028264967e-05,
"loss": 0.4348,
"step": 20500
},
{
"epoch": 2.978584628763996,
"grad_norm": 2.976383924484253,
"learning_rate": 2.02232207566314e-05,
"loss": 0.4237,
"step": 20550
},
{
"epoch": 2.985831793310867,
"grad_norm": 3.3762621879577637,
"learning_rate": 2.0150746484997827e-05,
"loss": 0.4506,
"step": 20600
},
{
"epoch": 2.993078957857738,
"grad_norm": 3.0444388389587402,
"learning_rate": 2.0078272213364256e-05,
"loss": 0.4357,
"step": 20650
},
{
"epoch": 3.0003261224046094,
"grad_norm": 3.294370412826538,
"learning_rate": 2.0005797941730688e-05,
"loss": 0.4358,
"step": 20700
},
{
"epoch": 3.00757328695148,
"grad_norm": 2.1435303688049316,
"learning_rate": 1.9933323670097116e-05,
"loss": 0.343,
"step": 20750
},
{
"epoch": 3.0148204514983514,
"grad_norm": 2.5070137977600098,
"learning_rate": 1.9860849398463545e-05,
"loss": 0.3397,
"step": 20800
},
{
"epoch": 3.0220676160452222,
"grad_norm": 2.5345394611358643,
"learning_rate": 1.9788375126829974e-05,
"loss": 0.3221,
"step": 20850
},
{
"epoch": 3.0293147805920935,
"grad_norm": 2.4468677043914795,
"learning_rate": 1.9715900855196406e-05,
"loss": 0.3404,
"step": 20900
},
{
"epoch": 3.0365619451389643,
"grad_norm": 3.5916829109191895,
"learning_rate": 1.964487606899551e-05,
"loss": 0.3412,
"step": 20950
},
{
"epoch": 3.0438091096858355,
"grad_norm": 1.7320780754089355,
"learning_rate": 1.9572401797361938e-05,
"loss": 0.3573,
"step": 21000
},
{
"epoch": 3.0438091096858355,
"eval_loss": 0.6396881937980652,
"eval_runtime": 245.0839,
"eval_samples_per_second": 140.748,
"eval_steps_per_second": 14.077,
"step": 21000
},
{
"epoch": 3.0510562742327063,
"grad_norm": 2.4702775478363037,
"learning_rate": 1.9499927525728366e-05,
"loss": 0.3485,
"step": 21050
},
{
"epoch": 3.0583034387795776,
"grad_norm": 3.04402232170105,
"learning_rate": 1.9427453254094795e-05,
"loss": 0.3575,
"step": 21100
},
{
"epoch": 3.0655506033264484,
"grad_norm": 2.487696886062622,
"learning_rate": 1.9354978982461227e-05,
"loss": 0.3178,
"step": 21150
},
{
"epoch": 3.0727977678733196,
"grad_norm": 2.565079689025879,
"learning_rate": 1.9282504710827655e-05,
"loss": 0.3311,
"step": 21200
},
{
"epoch": 3.0800449324201904,
"grad_norm": 3.043264865875244,
"learning_rate": 1.9210030439194087e-05,
"loss": 0.3201,
"step": 21250
},
{
"epoch": 3.0872920969670616,
"grad_norm": 3.086071729660034,
"learning_rate": 1.9137556167560516e-05,
"loss": 0.3385,
"step": 21300
},
{
"epoch": 3.094539261513933,
"grad_norm": 1.5752780437469482,
"learning_rate": 1.9065081895926948e-05,
"loss": 0.305,
"step": 21350
},
{
"epoch": 3.1017864260608037,
"grad_norm": 2.464972496032715,
"learning_rate": 1.8992607624293376e-05,
"loss": 0.3421,
"step": 21400
},
{
"epoch": 3.109033590607675,
"grad_norm": 2.98641037940979,
"learning_rate": 1.892013335265981e-05,
"loss": 0.3581,
"step": 21450
},
{
"epoch": 3.1162807551545457,
"grad_norm": 2.293949842453003,
"learning_rate": 1.8847659081026237e-05,
"loss": 0.3241,
"step": 21500
},
{
"epoch": 3.123527919701417,
"grad_norm": 2.361656427383423,
"learning_rate": 1.8775184809392666e-05,
"loss": 0.3154,
"step": 21550
},
{
"epoch": 3.1307750842482878,
"grad_norm": 3.095930576324463,
"learning_rate": 1.8702710537759097e-05,
"loss": 0.3043,
"step": 21600
},
{
"epoch": 3.138022248795159,
"grad_norm": 2.254836320877075,
"learning_rate": 1.8630236266125526e-05,
"loss": 0.3381,
"step": 21650
},
{
"epoch": 3.14526941334203,
"grad_norm": 3.281912088394165,
"learning_rate": 1.8557761994491958e-05,
"loss": 0.3292,
"step": 21700
},
{
"epoch": 3.152516577888901,
"grad_norm": 2.4811136722564697,
"learning_rate": 1.8485287722858387e-05,
"loss": 0.3221,
"step": 21750
},
{
"epoch": 3.159763742435772,
"grad_norm": 2.5498745441436768,
"learning_rate": 1.8412813451224815e-05,
"loss": 0.3548,
"step": 21800
},
{
"epoch": 3.167010906982643,
"grad_norm": 2.9773364067077637,
"learning_rate": 1.8340339179591247e-05,
"loss": 0.3199,
"step": 21850
},
{
"epoch": 3.174258071529514,
"grad_norm": 2.239015579223633,
"learning_rate": 1.8267864907957676e-05,
"loss": 0.3258,
"step": 21900
},
{
"epoch": 3.181505236076385,
"grad_norm": 2.470496416091919,
"learning_rate": 1.8195390636324104e-05,
"loss": 0.3199,
"step": 21950
},
{
"epoch": 3.1887524006232564,
"grad_norm": 2.567301034927368,
"learning_rate": 1.8122916364690536e-05,
"loss": 0.3859,
"step": 22000
},
{
"epoch": 3.1887524006232564,
"eval_loss": 0.6378082036972046,
"eval_runtime": 245.7126,
"eval_samples_per_second": 140.388,
"eval_steps_per_second": 14.041,
"step": 22000
},
{
"epoch": 3.195999565170127,
"grad_norm": 3.044546127319336,
"learning_rate": 1.8050442093056965e-05,
"loss": 0.3116,
"step": 22050
},
{
"epoch": 3.2032467297169984,
"grad_norm": 2.832991600036621,
"learning_rate": 1.7977967821423393e-05,
"loss": 0.3343,
"step": 22100
},
{
"epoch": 3.210493894263869,
"grad_norm": 2.9658920764923096,
"learning_rate": 1.7905493549789825e-05,
"loss": 0.3576,
"step": 22150
},
{
"epoch": 3.2177410588107405,
"grad_norm": 3.258549213409424,
"learning_rate": 1.7833019278156254e-05,
"loss": 0.3396,
"step": 22200
},
{
"epoch": 3.2249882233576113,
"grad_norm": 3.3627419471740723,
"learning_rate": 1.7760545006522686e-05,
"loss": 0.3206,
"step": 22250
},
{
"epoch": 3.2322353879044825,
"grad_norm": 2.9110984802246094,
"learning_rate": 1.7688070734889114e-05,
"loss": 0.3612,
"step": 22300
},
{
"epoch": 3.2394825524513533,
"grad_norm": 2.8267645835876465,
"learning_rate": 1.7615596463255546e-05,
"loss": 0.3477,
"step": 22350
},
{
"epoch": 3.2467297169982245,
"grad_norm": 2.8990402221679688,
"learning_rate": 1.7543122191621975e-05,
"loss": 0.3301,
"step": 22400
},
{
"epoch": 3.2539768815450953,
"grad_norm": 2.5321147441864014,
"learning_rate": 1.7470647919988407e-05,
"loss": 0.3212,
"step": 22450
},
{
"epoch": 3.2612240460919666,
"grad_norm": 3.9064178466796875,
"learning_rate": 1.7398173648354835e-05,
"loss": 0.3752,
"step": 22500
},
{
"epoch": 3.2684712106388374,
"grad_norm": 2.5248515605926514,
"learning_rate": 1.7325699376721267e-05,
"loss": 0.3467,
"step": 22550
},
{
"epoch": 3.2757183751857086,
"grad_norm": 2.442370653152466,
"learning_rate": 1.7253225105087696e-05,
"loss": 0.3171,
"step": 22600
},
{
"epoch": 3.2829655397325794,
"grad_norm": 2.523216485977173,
"learning_rate": 1.7180750833454125e-05,
"loss": 0.3422,
"step": 22650
},
{
"epoch": 3.2902127042794507,
"grad_norm": 3.633876323699951,
"learning_rate": 1.7108276561820553e-05,
"loss": 0.3236,
"step": 22700
},
{
"epoch": 3.297459868826322,
"grad_norm": 3.5989325046539307,
"learning_rate": 1.7035802290186985e-05,
"loss": 0.3562,
"step": 22750
},
{
"epoch": 3.3047070333731927,
"grad_norm": 2.8764047622680664,
"learning_rate": 1.6963328018553414e-05,
"loss": 0.3432,
"step": 22800
},
{
"epoch": 3.311954197920064,
"grad_norm": 2.8960604667663574,
"learning_rate": 1.6890853746919842e-05,
"loss": 0.339,
"step": 22850
},
{
"epoch": 3.3192013624669348,
"grad_norm": 3.1542296409606934,
"learning_rate": 1.6818379475286274e-05,
"loss": 0.3578,
"step": 22900
},
{
"epoch": 3.326448527013806,
"grad_norm": 3.966387987136841,
"learning_rate": 1.6745905203652703e-05,
"loss": 0.3437,
"step": 22950
},
{
"epoch": 3.333695691560677,
"grad_norm": 2.1080222129821777,
"learning_rate": 1.6673430932019135e-05,
"loss": 0.3476,
"step": 23000
},
{
"epoch": 3.333695691560677,
"eval_loss": 0.6372683644294739,
"eval_runtime": 245.8192,
"eval_samples_per_second": 140.327,
"eval_steps_per_second": 14.035,
"step": 23000
},
{
"epoch": 3.340942856107548,
"grad_norm": 2.4654605388641357,
"learning_rate": 1.6600956660385563e-05,
"loss": 0.3582,
"step": 23050
},
{
"epoch": 3.348190020654419,
"grad_norm": 3.2088425159454346,
"learning_rate": 1.6528482388751995e-05,
"loss": 0.3121,
"step": 23100
},
{
"epoch": 3.35543718520129,
"grad_norm": 2.5280325412750244,
"learning_rate": 1.6456008117118424e-05,
"loss": 0.3233,
"step": 23150
},
{
"epoch": 3.362684349748161,
"grad_norm": 2.5772511959075928,
"learning_rate": 1.6383533845484852e-05,
"loss": 0.3277,
"step": 23200
},
{
"epoch": 3.369931514295032,
"grad_norm": 2.4769599437713623,
"learning_rate": 1.6311059573851284e-05,
"loss": 0.3349,
"step": 23250
},
{
"epoch": 3.377178678841903,
"grad_norm": 3.1656038761138916,
"learning_rate": 1.6238585302217713e-05,
"loss": 0.3601,
"step": 23300
},
{
"epoch": 3.384425843388774,
"grad_norm": 3.1911141872406006,
"learning_rate": 1.6166111030584145e-05,
"loss": 0.3664,
"step": 23350
},
{
"epoch": 3.391673007935645,
"grad_norm": 3.001246213912964,
"learning_rate": 1.6093636758950574e-05,
"loss": 0.3547,
"step": 23400
},
{
"epoch": 3.398920172482516,
"grad_norm": 2.2357707023620605,
"learning_rate": 1.6021162487317005e-05,
"loss": 0.3581,
"step": 23450
},
{
"epoch": 3.4061673370293875,
"grad_norm": 2.813751459121704,
"learning_rate": 1.5948688215683434e-05,
"loss": 0.3436,
"step": 23500
},
{
"epoch": 3.4134145015762583,
"grad_norm": 2.3340563774108887,
"learning_rate": 1.5876213944049863e-05,
"loss": 0.3424,
"step": 23550
},
{
"epoch": 3.4206616661231295,
"grad_norm": 3.3509624004364014,
"learning_rate": 1.580373967241629e-05,
"loss": 0.365,
"step": 23600
},
{
"epoch": 3.4279088306700003,
"grad_norm": 2.6918370723724365,
"learning_rate": 1.5731265400782723e-05,
"loss": 0.3396,
"step": 23650
},
{
"epoch": 3.4351559952168715,
"grad_norm": 3.110868215560913,
"learning_rate": 1.5658791129149152e-05,
"loss": 0.3303,
"step": 23700
},
{
"epoch": 3.4424031597637423,
"grad_norm": 3.6813771724700928,
"learning_rate": 1.558631685751558e-05,
"loss": 0.3726,
"step": 23750
},
{
"epoch": 3.4496503243106136,
"grad_norm": 2.564406633377075,
"learning_rate": 1.5513842585882012e-05,
"loss": 0.3507,
"step": 23800
},
{
"epoch": 3.4568974888574844,
"grad_norm": 2.496525526046753,
"learning_rate": 1.544136831424844e-05,
"loss": 0.3348,
"step": 23850
},
{
"epoch": 3.4641446534043556,
"grad_norm": 3.00034761428833,
"learning_rate": 1.5368894042614873e-05,
"loss": 0.35,
"step": 23900
},
{
"epoch": 3.4713918179512264,
"grad_norm": 2.688913106918335,
"learning_rate": 1.52964197709813e-05,
"loss": 0.3123,
"step": 23950
},
{
"epoch": 3.4786389824980977,
"grad_norm": 3.100461721420288,
"learning_rate": 1.5223945499347733e-05,
"loss": 0.3559,
"step": 24000
},
{
"epoch": 3.4786389824980977,
"eval_loss": 0.6312422156333923,
"eval_runtime": 246.9159,
"eval_samples_per_second": 139.703,
"eval_steps_per_second": 13.972,
"step": 24000
},
{
"epoch": 3.4858861470449685,
"grad_norm": 2.221304178237915,
"learning_rate": 1.5151471227714162e-05,
"loss": 0.3437,
"step": 24050
},
{
"epoch": 3.4931333115918397,
"grad_norm": 2.807159662246704,
"learning_rate": 1.5078996956080594e-05,
"loss": 0.3284,
"step": 24100
},
{
"epoch": 3.5003804761387105,
"grad_norm": 2.910870313644409,
"learning_rate": 1.5006522684447022e-05,
"loss": 0.3436,
"step": 24150
},
{
"epoch": 3.5076276406855817,
"grad_norm": 2.7148754596710205,
"learning_rate": 1.4934048412813451e-05,
"loss": 0.364,
"step": 24200
},
{
"epoch": 3.514874805232453,
"grad_norm": 3.629567861557007,
"learning_rate": 1.4861574141179881e-05,
"loss": 0.3293,
"step": 24250
},
{
"epoch": 3.522121969779324,
"grad_norm": 1.6957030296325684,
"learning_rate": 1.478909986954631e-05,
"loss": 0.3418,
"step": 24300
},
{
"epoch": 3.529369134326195,
"grad_norm": 2.671588659286499,
"learning_rate": 1.4716625597912742e-05,
"loss": 0.3054,
"step": 24350
},
{
"epoch": 3.536616298873066,
"grad_norm": 2.832435369491577,
"learning_rate": 1.464415132627917e-05,
"loss": 0.3457,
"step": 24400
},
{
"epoch": 3.543863463419937,
"grad_norm": 3.806084156036377,
"learning_rate": 1.4571677054645602e-05,
"loss": 0.3366,
"step": 24450
},
{
"epoch": 3.551110627966808,
"grad_norm": 3.169780731201172,
"learning_rate": 1.4499202783012031e-05,
"loss": 0.3337,
"step": 24500
},
{
"epoch": 3.558357792513679,
"grad_norm": 2.461219310760498,
"learning_rate": 1.4426728511378463e-05,
"loss": 0.3394,
"step": 24550
},
{
"epoch": 3.56560495706055,
"grad_norm": 2.458402633666992,
"learning_rate": 1.4354254239744891e-05,
"loss": 0.3245,
"step": 24600
},
{
"epoch": 3.572852121607421,
"grad_norm": 2.172034740447998,
"learning_rate": 1.4281779968111322e-05,
"loss": 0.3335,
"step": 24650
},
{
"epoch": 3.580099286154292,
"grad_norm": 2.7269339561462402,
"learning_rate": 1.420930569647775e-05,
"loss": 0.3268,
"step": 24700
},
{
"epoch": 3.587346450701163,
"grad_norm": 3.2520856857299805,
"learning_rate": 1.4136831424844179e-05,
"loss": 0.3282,
"step": 24750
},
{
"epoch": 3.5945936152480344,
"grad_norm": 3.6039845943450928,
"learning_rate": 1.406435715321061e-05,
"loss": 0.3375,
"step": 24800
},
{
"epoch": 3.6018407797949052,
"grad_norm": 2.7368907928466797,
"learning_rate": 1.399188288157704e-05,
"loss": 0.3368,
"step": 24850
},
{
"epoch": 3.609087944341776,
"grad_norm": 3.1287124156951904,
"learning_rate": 1.3919408609943471e-05,
"loss": 0.3517,
"step": 24900
},
{
"epoch": 3.6163351088886473,
"grad_norm": 3.3379523754119873,
"learning_rate": 1.38469343383099e-05,
"loss": 0.3334,
"step": 24950
},
{
"epoch": 3.6235822734355185,
"grad_norm": 2.828714609146118,
"learning_rate": 1.3774460066676332e-05,
"loss": 0.3148,
"step": 25000
},
{
"epoch": 3.6235822734355185,
"eval_loss": 0.6326374411582947,
"eval_runtime": 245.5715,
"eval_samples_per_second": 140.468,
"eval_steps_per_second": 14.049,
"step": 25000
},
{
"epoch": 3.6308294379823893,
"grad_norm": 2.7624423503875732,
"learning_rate": 1.370198579504276e-05,
"loss": 0.3333,
"step": 25050
},
{
"epoch": 3.6380766025292606,
"grad_norm": 2.4403984546661377,
"learning_rate": 1.362951152340919e-05,
"loss": 0.3306,
"step": 25100
},
{
"epoch": 3.6453237670761314,
"grad_norm": 3.4315109252929688,
"learning_rate": 1.355703725177562e-05,
"loss": 0.3428,
"step": 25150
},
{
"epoch": 3.6525709316230026,
"grad_norm": 4.232142925262451,
"learning_rate": 1.3484562980142051e-05,
"loss": 0.3437,
"step": 25200
},
{
"epoch": 3.6598180961698734,
"grad_norm": 2.562215805053711,
"learning_rate": 1.341208870850848e-05,
"loss": 0.3437,
"step": 25250
},
{
"epoch": 3.6670652607167447,
"grad_norm": 2.4503726959228516,
"learning_rate": 1.3339614436874908e-05,
"loss": 0.3018,
"step": 25300
},
{
"epoch": 3.6743124252636155,
"grad_norm": 2.709066390991211,
"learning_rate": 1.3268589650674012e-05,
"loss": 0.3582,
"step": 25350
},
{
"epoch": 3.6815595898104867,
"grad_norm": 2.3442864418029785,
"learning_rate": 1.319611537904044e-05,
"loss": 0.3224,
"step": 25400
},
{
"epoch": 3.6888067543573575,
"grad_norm": 4.138051509857178,
"learning_rate": 1.312364110740687e-05,
"loss": 0.3791,
"step": 25450
},
{
"epoch": 3.6960539189042287,
"grad_norm": 3.238833427429199,
"learning_rate": 1.3051166835773301e-05,
"loss": 0.3191,
"step": 25500
},
{
"epoch": 3.7033010834511,
"grad_norm": 3.0697717666625977,
"learning_rate": 1.297869256413973e-05,
"loss": 0.32,
"step": 25550
},
{
"epoch": 3.710548247997971,
"grad_norm": 2.4563581943511963,
"learning_rate": 1.2906218292506162e-05,
"loss": 0.3269,
"step": 25600
},
{
"epoch": 3.7177954125448416,
"grad_norm": 2.1714043617248535,
"learning_rate": 1.283374402087259e-05,
"loss": 0.3085,
"step": 25650
},
{
"epoch": 3.725042577091713,
"grad_norm": 2.205698013305664,
"learning_rate": 1.2761269749239022e-05,
"loss": 0.3558,
"step": 25700
},
{
"epoch": 3.732289741638584,
"grad_norm": 2.4091830253601074,
"learning_rate": 1.268879547760545e-05,
"loss": 0.3631,
"step": 25750
},
{
"epoch": 3.739536906185455,
"grad_norm": 1.7875028848648071,
"learning_rate": 1.2616321205971881e-05,
"loss": 0.3241,
"step": 25800
},
{
"epoch": 3.746784070732326,
"grad_norm": 3.206444501876831,
"learning_rate": 1.2545296419770983e-05,
"loss": 0.3237,
"step": 25850
},
{
"epoch": 3.754031235279197,
"grad_norm": 2.4100027084350586,
"learning_rate": 1.2472822148137411e-05,
"loss": 0.3116,
"step": 25900
},
{
"epoch": 3.761278399826068,
"grad_norm": 3.0889265537261963,
"learning_rate": 1.2400347876503842e-05,
"loss": 0.3599,
"step": 25950
},
{
"epoch": 3.768525564372939,
"grad_norm": 2.965827703475952,
"learning_rate": 1.2327873604870272e-05,
"loss": 0.3477,
"step": 26000
},
{
"epoch": 3.768525564372939,
"eval_loss": 0.6264123320579529,
"eval_runtime": 245.5751,
"eval_samples_per_second": 140.466,
"eval_steps_per_second": 14.049,
"step": 26000
},
{
"epoch": 3.77577272891981,
"grad_norm": 3.572783946990967,
"learning_rate": 1.2255399333236702e-05,
"loss": 0.3519,
"step": 26050
},
{
"epoch": 3.7830198934666814,
"grad_norm": 3.2291600704193115,
"learning_rate": 1.218292506160313e-05,
"loss": 0.3377,
"step": 26100
},
{
"epoch": 3.7902670580135522,
"grad_norm": 3.1608381271362305,
"learning_rate": 1.2110450789969561e-05,
"loss": 0.3306,
"step": 26150
},
{
"epoch": 3.797514222560423,
"grad_norm": 2.326995611190796,
"learning_rate": 1.2037976518335991e-05,
"loss": 0.3425,
"step": 26200
},
{
"epoch": 3.8047613871072943,
"grad_norm": 2.580730438232422,
"learning_rate": 1.1965502246702421e-05,
"loss": 0.3544,
"step": 26250
},
{
"epoch": 3.8120085516541655,
"grad_norm": 3.1434969902038574,
"learning_rate": 1.1893027975068852e-05,
"loss": 0.3475,
"step": 26300
},
{
"epoch": 3.8192557162010363,
"grad_norm": 2.08758282661438,
"learning_rate": 1.1820553703435282e-05,
"loss": 0.3287,
"step": 26350
},
{
"epoch": 3.826502880747907,
"grad_norm": 2.8469362258911133,
"learning_rate": 1.174807943180171e-05,
"loss": 0.3236,
"step": 26400
},
{
"epoch": 3.8337500452947784,
"grad_norm": 3.5601906776428223,
"learning_rate": 1.1675605160168141e-05,
"loss": 0.3263,
"step": 26450
},
{
"epoch": 3.8409972098416496,
"grad_norm": 3.756640911102295,
"learning_rate": 1.1603130888534571e-05,
"loss": 0.3382,
"step": 26500
},
{
"epoch": 3.8482443743885204,
"grad_norm": 2.394885778427124,
"learning_rate": 1.1530656616901e-05,
"loss": 0.3373,
"step": 26550
},
{
"epoch": 3.8554915389353916,
"grad_norm": 2.798363208770752,
"learning_rate": 1.145818234526743e-05,
"loss": 0.3259,
"step": 26600
},
{
"epoch": 3.8627387034822624,
"grad_norm": 2.411869764328003,
"learning_rate": 1.138570807363386e-05,
"loss": 0.3418,
"step": 26650
},
{
"epoch": 3.8699858680291337,
"grad_norm": 3.126814603805542,
"learning_rate": 1.131323380200029e-05,
"loss": 0.3334,
"step": 26700
},
{
"epoch": 3.8772330325760045,
"grad_norm": 3.4210116863250732,
"learning_rate": 1.124075953036672e-05,
"loss": 0.3249,
"step": 26750
},
{
"epoch": 3.8844801971228757,
"grad_norm": 2.846679925918579,
"learning_rate": 1.1168285258733151e-05,
"loss": 0.3377,
"step": 26800
},
{
"epoch": 3.891727361669747,
"grad_norm": 3.338003635406494,
"learning_rate": 1.1095810987099581e-05,
"loss": 0.3626,
"step": 26850
},
{
"epoch": 3.8989745262166178,
"grad_norm": 3.7777626514434814,
"learning_rate": 1.102333671546601e-05,
"loss": 0.3385,
"step": 26900
},
{
"epoch": 3.9062216907634886,
"grad_norm": 2.5645010471343994,
"learning_rate": 1.095086244383244e-05,
"loss": 0.3485,
"step": 26950
},
{
"epoch": 3.91346885531036,
"grad_norm": 2.8242435455322266,
"learning_rate": 1.0878388172198869e-05,
"loss": 0.353,
"step": 27000
},
{
"epoch": 3.91346885531036,
"eval_loss": 0.6270226836204529,
"eval_runtime": 245.4756,
"eval_samples_per_second": 140.523,
"eval_steps_per_second": 14.054,
"step": 27000
},
{
"epoch": 3.920716019857231,
"grad_norm": 3.0427374839782715,
"learning_rate": 1.0805913900565299e-05,
"loss": 0.3625,
"step": 27050
},
{
"epoch": 3.927963184404102,
"grad_norm": 3.5145153999328613,
"learning_rate": 1.073343962893173e-05,
"loss": 0.3377,
"step": 27100
},
{
"epoch": 3.9352103489509727,
"grad_norm": 3.0157294273376465,
"learning_rate": 1.066096535729816e-05,
"loss": 0.3231,
"step": 27150
},
{
"epoch": 3.942457513497844,
"grad_norm": 3.3504700660705566,
"learning_rate": 1.058849108566459e-05,
"loss": 0.3407,
"step": 27200
},
{
"epoch": 3.949704678044715,
"grad_norm": 2.313544988632202,
"learning_rate": 1.051601681403102e-05,
"loss": 0.3252,
"step": 27250
},
{
"epoch": 3.956951842591586,
"grad_norm": 3.034682035446167,
"learning_rate": 1.044354254239745e-05,
"loss": 0.319,
"step": 27300
},
{
"epoch": 3.964199007138457,
"grad_norm": 2.8075735569000244,
"learning_rate": 1.037106827076388e-05,
"loss": 0.3371,
"step": 27350
},
{
"epoch": 3.971446171685328,
"grad_norm": 2.394465446472168,
"learning_rate": 1.0298593999130309e-05,
"loss": 0.3286,
"step": 27400
},
{
"epoch": 3.9786933362321992,
"grad_norm": 2.6049180030822754,
"learning_rate": 1.0226119727496738e-05,
"loss": 0.3295,
"step": 27450
},
{
"epoch": 3.98594050077907,
"grad_norm": 2.6557512283325195,
"learning_rate": 1.0153645455863168e-05,
"loss": 0.3566,
"step": 27500
},
{
"epoch": 3.9931876653259413,
"grad_norm": 2.997840166091919,
"learning_rate": 1.0081171184229598e-05,
"loss": 0.326,
"step": 27550
},
{
"epoch": 4.0004348298728125,
"grad_norm": 2.4651620388031006,
"learning_rate": 1.0008696912596029e-05,
"loss": 0.3328,
"step": 27600
},
{
"epoch": 4.007681994419683,
"grad_norm": 2.93027400970459,
"learning_rate": 9.936222640962459e-06,
"loss": 0.2705,
"step": 27650
},
{
"epoch": 4.014929158966554,
"grad_norm": 3.154695987701416,
"learning_rate": 9.863748369328889e-06,
"loss": 0.251,
"step": 27700
},
{
"epoch": 4.022176323513426,
"grad_norm": 2.877485990524292,
"learning_rate": 9.79127409769532e-06,
"loss": 0.2557,
"step": 27750
},
{
"epoch": 4.029423488060297,
"grad_norm": 2.5868325233459473,
"learning_rate": 9.71879982606175e-06,
"loss": 0.2481,
"step": 27800
},
{
"epoch": 4.036670652607167,
"grad_norm": 4.68599271774292,
"learning_rate": 9.646325554428178e-06,
"loss": 0.2704,
"step": 27850
},
{
"epoch": 4.043917817154038,
"grad_norm": 2.302772045135498,
"learning_rate": 9.573851282794608e-06,
"loss": 0.2537,
"step": 27900
},
{
"epoch": 4.05116498170091,
"grad_norm": 2.2476446628570557,
"learning_rate": 9.501377011161039e-06,
"loss": 0.2488,
"step": 27950
},
{
"epoch": 4.058412146247781,
"grad_norm": 1.8352832794189453,
"learning_rate": 9.428902739527467e-06,
"loss": 0.2693,
"step": 28000
},
{
"epoch": 4.058412146247781,
"eval_loss": 0.6433804631233215,
"eval_runtime": 245.0452,
"eval_samples_per_second": 140.77,
"eval_steps_per_second": 14.079,
"step": 28000
},
{
"epoch": 4.0656593107946515,
"grad_norm": 2.861711263656616,
"learning_rate": 9.356428467893898e-06,
"loss": 0.2552,
"step": 28050
},
{
"epoch": 4.072906475341522,
"grad_norm": 2.669003963470459,
"learning_rate": 9.283954196260328e-06,
"loss": 0.2737,
"step": 28100
},
{
"epoch": 4.080153639888394,
"grad_norm": 3.106980323791504,
"learning_rate": 9.211479924626758e-06,
"loss": 0.265,
"step": 28150
},
{
"epoch": 4.087400804435265,
"grad_norm": 2.7630670070648193,
"learning_rate": 9.139005652993188e-06,
"loss": 0.2457,
"step": 28200
},
{
"epoch": 4.094647968982136,
"grad_norm": 2.5765066146850586,
"learning_rate": 9.066531381359619e-06,
"loss": 0.2815,
"step": 28250
},
{
"epoch": 4.101895133529006,
"grad_norm": 2.980583667755127,
"learning_rate": 8.994057109726047e-06,
"loss": 0.2763,
"step": 28300
},
{
"epoch": 4.109142298075878,
"grad_norm": 2.6509013175964355,
"learning_rate": 8.921582838092477e-06,
"loss": 0.2454,
"step": 28350
},
{
"epoch": 4.116389462622749,
"grad_norm": 2.8553245067596436,
"learning_rate": 8.849108566458908e-06,
"loss": 0.2749,
"step": 28400
},
{
"epoch": 4.12363662716962,
"grad_norm": 2.524636745452881,
"learning_rate": 8.776634294825338e-06,
"loss": 0.2537,
"step": 28450
},
{
"epoch": 4.130883791716491,
"grad_norm": 3.848393440246582,
"learning_rate": 8.704160023191767e-06,
"loss": 0.2635,
"step": 28500
},
{
"epoch": 4.138130956263362,
"grad_norm": 3.296485662460327,
"learning_rate": 8.631685751558197e-06,
"loss": 0.2499,
"step": 28550
},
{
"epoch": 4.145378120810233,
"grad_norm": 3.0012335777282715,
"learning_rate": 8.559211479924627e-06,
"loss": 0.2698,
"step": 28600
},
{
"epoch": 4.152625285357104,
"grad_norm": 1.961544156074524,
"learning_rate": 8.486737208291057e-06,
"loss": 0.23,
"step": 28650
},
{
"epoch": 4.159872449903975,
"grad_norm": 3.157874822616577,
"learning_rate": 8.414262936657488e-06,
"loss": 0.2493,
"step": 28700
},
{
"epoch": 4.167119614450846,
"grad_norm": 2.372300624847412,
"learning_rate": 8.341788665023916e-06,
"loss": 0.2874,
"step": 28750
},
{
"epoch": 4.174366778997717,
"grad_norm": 1.9763847589492798,
"learning_rate": 8.269314393390346e-06,
"loss": 0.2508,
"step": 28800
},
{
"epoch": 4.181613943544588,
"grad_norm": 3.00111985206604,
"learning_rate": 8.196840121756777e-06,
"loss": 0.2732,
"step": 28850
},
{
"epoch": 4.1888611080914595,
"grad_norm": 3.2600185871124268,
"learning_rate": 8.124365850123207e-06,
"loss": 0.2718,
"step": 28900
},
{
"epoch": 4.19610827263833,
"grad_norm": 2.755221366882324,
"learning_rate": 8.051891578489637e-06,
"loss": 0.2401,
"step": 28950
},
{
"epoch": 4.203355437185201,
"grad_norm": 3.3103065490722656,
"learning_rate": 7.979417306856067e-06,
"loss": 0.2898,
"step": 29000
},
{
"epoch": 4.203355437185201,
"eval_loss": 0.6491243243217468,
"eval_runtime": 245.4343,
"eval_samples_per_second": 140.547,
"eval_steps_per_second": 14.057,
"step": 29000
},
{
"epoch": 4.210602601732072,
"grad_norm": 2.783529281616211,
"learning_rate": 7.906943035222496e-06,
"loss": 0.2774,
"step": 29050
},
{
"epoch": 4.217849766278944,
"grad_norm": 2.3917746543884277,
"learning_rate": 7.834468763588926e-06,
"loss": 0.2896,
"step": 29100
},
{
"epoch": 4.225096930825814,
"grad_norm": 3.132794141769409,
"learning_rate": 7.761994491955357e-06,
"loss": 0.2704,
"step": 29150
},
{
"epoch": 4.232344095372685,
"grad_norm": 2.8275017738342285,
"learning_rate": 7.689520220321785e-06,
"loss": 0.2695,
"step": 29200
},
{
"epoch": 4.239591259919557,
"grad_norm": 3.1233084201812744,
"learning_rate": 7.617045948688216e-06,
"loss": 0.2677,
"step": 29250
},
{
"epoch": 4.246838424466428,
"grad_norm": 3.1787428855895996,
"learning_rate": 7.544571677054646e-06,
"loss": 0.2611,
"step": 29300
},
{
"epoch": 4.2540855890132985,
"grad_norm": 3.4065091609954834,
"learning_rate": 7.472097405421076e-06,
"loss": 0.285,
"step": 29350
},
{
"epoch": 4.261332753560169,
"grad_norm": 2.7599704265594482,
"learning_rate": 7.399623133787506e-06,
"loss": 0.2586,
"step": 29400
},
{
"epoch": 4.268579918107041,
"grad_norm": 2.2776358127593994,
"learning_rate": 7.3271488621539365e-06,
"loss": 0.2532,
"step": 29450
},
{
"epoch": 4.275827082653912,
"grad_norm": 3.391362428665161,
"learning_rate": 7.254674590520366e-06,
"loss": 0.257,
"step": 29500
},
{
"epoch": 4.283074247200783,
"grad_norm": 4.152310371398926,
"learning_rate": 7.182200318886796e-06,
"loss": 0.2655,
"step": 29550
},
{
"epoch": 4.290321411747653,
"grad_norm": 1.8384218215942383,
"learning_rate": 7.109726047253225e-06,
"loss": 0.2556,
"step": 29600
},
{
"epoch": 4.297568576294525,
"grad_norm": 2.8006668090820312,
"learning_rate": 7.037251775619655e-06,
"loss": 0.252,
"step": 29650
},
{
"epoch": 4.304815740841396,
"grad_norm": 2.0686655044555664,
"learning_rate": 6.964777503986085e-06,
"loss": 0.2626,
"step": 29700
},
{
"epoch": 4.312062905388267,
"grad_norm": 4.304172515869141,
"learning_rate": 6.892303232352515e-06,
"loss": 0.2566,
"step": 29750
},
{
"epoch": 4.319310069935138,
"grad_norm": 3.3372154235839844,
"learning_rate": 6.819828960718945e-06,
"loss": 0.256,
"step": 29800
},
{
"epoch": 4.326557234482009,
"grad_norm": 2.2065439224243164,
"learning_rate": 6.747354689085375e-06,
"loss": 0.251,
"step": 29850
},
{
"epoch": 4.33380439902888,
"grad_norm": 3.629650354385376,
"learning_rate": 6.6748804174518055e-06,
"loss": 0.2696,
"step": 29900
},
{
"epoch": 4.341051563575751,
"grad_norm": 2.2397236824035645,
"learning_rate": 6.602406145818235e-06,
"loss": 0.2359,
"step": 29950
},
{
"epoch": 4.348298728122622,
"grad_norm": 3.494893789291382,
"learning_rate": 6.529931874184665e-06,
"loss": 0.2714,
"step": 30000
},
{
"epoch": 4.348298728122622,
"eval_loss": 0.6459131836891174,
"eval_runtime": 245.3544,
"eval_samples_per_second": 140.593,
"eval_steps_per_second": 14.061,
"step": 30000
},
{
"epoch": 4.355545892669493,
"grad_norm": 2.9454917907714844,
"learning_rate": 6.4574576025510955e-06,
"loss": 0.2539,
"step": 30050
},
{
"epoch": 4.362793057216364,
"grad_norm": 2.9951882362365723,
"learning_rate": 6.384983330917525e-06,
"loss": 0.2687,
"step": 30100
},
{
"epoch": 4.370040221763235,
"grad_norm": 3.583976984024048,
"learning_rate": 6.312509059283954e-06,
"loss": 0.2879,
"step": 30150
},
{
"epoch": 4.3772873863101065,
"grad_norm": 3.201929807662964,
"learning_rate": 6.2400347876503846e-06,
"loss": 0.2531,
"step": 30200
},
{
"epoch": 4.384534550856977,
"grad_norm": 2.60980486869812,
"learning_rate": 6.167560516016814e-06,
"loss": 0.2681,
"step": 30250
},
{
"epoch": 4.391781715403848,
"grad_norm": 2.1033682823181152,
"learning_rate": 6.095086244383244e-06,
"loss": 0.2774,
"step": 30300
},
{
"epoch": 4.399028879950719,
"grad_norm": 2.239474058151245,
"learning_rate": 6.022611972749674e-06,
"loss": 0.2676,
"step": 30350
},
{
"epoch": 4.406276044497591,
"grad_norm": 3.176302671432495,
"learning_rate": 5.950137701116104e-06,
"loss": 0.2722,
"step": 30400
},
{
"epoch": 4.413523209044461,
"grad_norm": 2.5901739597320557,
"learning_rate": 5.877663429482534e-06,
"loss": 0.2523,
"step": 30450
},
{
"epoch": 4.420770373591332,
"grad_norm": 2.8084895610809326,
"learning_rate": 5.8051891578489645e-06,
"loss": 0.2487,
"step": 30500
},
{
"epoch": 4.428017538138203,
"grad_norm": 3.332167387008667,
"learning_rate": 5.732714886215394e-06,
"loss": 0.2523,
"step": 30550
},
{
"epoch": 4.435264702685075,
"grad_norm": 2.872776746749878,
"learning_rate": 5.660240614581823e-06,
"loss": 0.2739,
"step": 30600
},
{
"epoch": 4.4425118672319455,
"grad_norm": 1.9077197313308716,
"learning_rate": 5.5877663429482536e-06,
"loss": 0.2476,
"step": 30650
},
{
"epoch": 4.449759031778816,
"grad_norm": 2.676182270050049,
"learning_rate": 5.515292071314684e-06,
"loss": 0.2579,
"step": 30700
},
{
"epoch": 4.457006196325688,
"grad_norm": 3.4579455852508545,
"learning_rate": 5.442817799681114e-06,
"loss": 0.2686,
"step": 30750
},
{
"epoch": 4.464253360872559,
"grad_norm": 2.6098556518554688,
"learning_rate": 5.370343528047543e-06,
"loss": 0.2578,
"step": 30800
},
{
"epoch": 4.4715005254194296,
"grad_norm": 3.4452645778656006,
"learning_rate": 5.297869256413973e-06,
"loss": 0.2868,
"step": 30850
},
{
"epoch": 4.4787476899663,
"grad_norm": 3.4186015129089355,
"learning_rate": 5.225394984780403e-06,
"loss": 0.2676,
"step": 30900
},
{
"epoch": 4.485994854513172,
"grad_norm": 3.0700855255126953,
"learning_rate": 5.1529207131468335e-06,
"loss": 0.2642,
"step": 30950
},
{
"epoch": 4.493242019060043,
"grad_norm": 2.718798875808716,
"learning_rate": 5.080446441513263e-06,
"loss": 0.2488,
"step": 31000
},
{
"epoch": 4.493242019060043,
"eval_loss": 0.6449950337409973,
"eval_runtime": 245.4001,
"eval_samples_per_second": 140.566,
"eval_steps_per_second": 14.059,
"step": 31000
},
{
"epoch": 4.500489183606914,
"grad_norm": 3.136319875717163,
"learning_rate": 5.007972169879692e-06,
"loss": 0.2831,
"step": 31050
},
{
"epoch": 4.507736348153784,
"grad_norm": 2.020862340927124,
"learning_rate": 4.935497898246123e-06,
"loss": 0.2312,
"step": 31100
},
{
"epoch": 4.514983512700656,
"grad_norm": 4.0119948387146,
"learning_rate": 4.863023626612553e-06,
"loss": 0.2583,
"step": 31150
},
{
"epoch": 4.522230677247527,
"grad_norm": 3.341949462890625,
"learning_rate": 4.790549354978983e-06,
"loss": 0.2777,
"step": 31200
},
{
"epoch": 4.529477841794398,
"grad_norm": 3.6001293659210205,
"learning_rate": 4.7180750833454125e-06,
"loss": 0.251,
"step": 31250
},
{
"epoch": 4.536725006341269,
"grad_norm": 2.4010775089263916,
"learning_rate": 4.645600811711843e-06,
"loss": 0.2679,
"step": 31300
},
{
"epoch": 4.54397217088814,
"grad_norm": 2.33186674118042,
"learning_rate": 4.573126540078272e-06,
"loss": 0.2702,
"step": 31350
},
{
"epoch": 4.551219335435011,
"grad_norm": 3.366321325302124,
"learning_rate": 4.5006522684447025e-06,
"loss": 0.2383,
"step": 31400
},
{
"epoch": 4.558466499981882,
"grad_norm": 2.606224298477173,
"learning_rate": 4.428177996811132e-06,
"loss": 0.2709,
"step": 31450
},
{
"epoch": 4.5657136645287535,
"grad_norm": 3.850285053253174,
"learning_rate": 4.355703725177562e-06,
"loss": 0.2712,
"step": 31500
},
{
"epoch": 4.572960829075624,
"grad_norm": 2.4807302951812744,
"learning_rate": 4.2832294535439924e-06,
"loss": 0.267,
"step": 31550
},
{
"epoch": 4.580207993622495,
"grad_norm": 1.4953159093856812,
"learning_rate": 4.210755181910422e-06,
"loss": 0.2599,
"step": 31600
},
{
"epoch": 4.587455158169366,
"grad_norm": 2.457629680633545,
"learning_rate": 4.138280910276852e-06,
"loss": 0.2802,
"step": 31650
},
{
"epoch": 4.594702322716238,
"grad_norm": 2.150555372238159,
"learning_rate": 4.0658066386432815e-06,
"loss": 0.2564,
"step": 31700
},
{
"epoch": 4.601949487263108,
"grad_norm": 1.8131722211837769,
"learning_rate": 3.993332367009712e-06,
"loss": 0.2805,
"step": 31750
},
{
"epoch": 4.609196651809979,
"grad_norm": 3.3967912197113037,
"learning_rate": 3.920858095376142e-06,
"loss": 0.2812,
"step": 31800
},
{
"epoch": 4.616443816356851,
"grad_norm": 2.5398590564727783,
"learning_rate": 3.8483838237425715e-06,
"loss": 0.2787,
"step": 31850
},
{
"epoch": 4.623690980903722,
"grad_norm": 2.45865535736084,
"learning_rate": 3.7759095521090013e-06,
"loss": 0.2333,
"step": 31900
},
{
"epoch": 4.6309381454505925,
"grad_norm": 3.3966212272644043,
"learning_rate": 3.704884765908103e-06,
"loss": 0.2619,
"step": 31950
},
{
"epoch": 4.638185309997463,
"grad_norm": 2.421985149383545,
"learning_rate": 3.6324104942745326e-06,
"loss": 0.2851,
"step": 32000
},
{
"epoch": 4.638185309997463,
"eval_loss": 0.6421298980712891,
"eval_runtime": 245.3379,
"eval_samples_per_second": 140.602,
"eval_steps_per_second": 14.062,
"step": 32000
},
{
"epoch": 4.645432474544334,
"grad_norm": 2.5105528831481934,
"learning_rate": 3.5599362226409624e-06,
"loss": 0.2595,
"step": 32050
},
{
"epoch": 4.652679639091206,
"grad_norm": 2.653074026107788,
"learning_rate": 3.4874619510073923e-06,
"loss": 0.2478,
"step": 32100
},
{
"epoch": 4.6599268036380765,
"grad_norm": 1.6157690286636353,
"learning_rate": 3.4149876793738225e-06,
"loss": 0.2563,
"step": 32150
},
{
"epoch": 4.667173968184947,
"grad_norm": 2.220090866088867,
"learning_rate": 3.3425134077402524e-06,
"loss": 0.2825,
"step": 32200
},
{
"epoch": 4.674421132731819,
"grad_norm": 3.191338062286377,
"learning_rate": 3.2700391361066826e-06,
"loss": 0.2421,
"step": 32250
},
{
"epoch": 4.68166829727869,
"grad_norm": 2.6636569499969482,
"learning_rate": 3.1990143499057836e-06,
"loss": 0.274,
"step": 32300
},
{
"epoch": 4.688915461825561,
"grad_norm": 2.6303908824920654,
"learning_rate": 3.126540078272214e-06,
"loss": 0.2763,
"step": 32350
},
{
"epoch": 4.696162626372431,
"grad_norm": 2.2153165340423584,
"learning_rate": 3.0540658066386433e-06,
"loss": 0.2512,
"step": 32400
},
{
"epoch": 4.703409790919303,
"grad_norm": 2.291551351547241,
"learning_rate": 2.9815915350050736e-06,
"loss": 0.2739,
"step": 32450
},
{
"epoch": 4.710656955466174,
"grad_norm": 3.2897346019744873,
"learning_rate": 2.909117263371503e-06,
"loss": 0.239,
"step": 32500
},
{
"epoch": 4.717904120013045,
"grad_norm": 3.1026740074157715,
"learning_rate": 2.8366429917379333e-06,
"loss": 0.2806,
"step": 32550
},
{
"epoch": 4.7251512845599155,
"grad_norm": 4.166581153869629,
"learning_rate": 2.764168720104363e-06,
"loss": 0.2597,
"step": 32600
},
{
"epoch": 4.732398449106787,
"grad_norm": 3.2309772968292236,
"learning_rate": 2.691694448470793e-06,
"loss": 0.2748,
"step": 32650
},
{
"epoch": 4.739645613653658,
"grad_norm": 3.379218816757202,
"learning_rate": 2.619220176837223e-06,
"loss": 0.2665,
"step": 32700
},
{
"epoch": 4.746892778200529,
"grad_norm": 2.8024582862854004,
"learning_rate": 2.5467459052036526e-06,
"loss": 0.2571,
"step": 32750
},
{
"epoch": 4.7541399427474005,
"grad_norm": 1.905375361442566,
"learning_rate": 2.474271633570083e-06,
"loss": 0.2547,
"step": 32800
},
{
"epoch": 4.761387107294271,
"grad_norm": 2.8878672122955322,
"learning_rate": 2.4017973619365127e-06,
"loss": 0.2633,
"step": 32850
},
{
"epoch": 4.768634271841142,
"grad_norm": 2.940661907196045,
"learning_rate": 2.3293230903029426e-06,
"loss": 0.2752,
"step": 32900
},
{
"epoch": 4.775881436388013,
"grad_norm": 2.747434139251709,
"learning_rate": 2.2568488186693724e-06,
"loss": 0.2762,
"step": 32950
},
{
"epoch": 4.783128600934885,
"grad_norm": 3.6419451236724854,
"learning_rate": 2.1843745470358027e-06,
"loss": 0.2685,
"step": 33000
},
{
"epoch": 4.783128600934885,
"eval_loss": 0.6418930888175964,
"eval_runtime": 245.4849,
"eval_samples_per_second": 140.518,
"eval_steps_per_second": 14.054,
"step": 33000
},
{
"epoch": 4.790375765481755,
"grad_norm": 2.916613817214966,
"learning_rate": 2.111900275402232e-06,
"loss": 0.2846,
"step": 33050
},
{
"epoch": 4.797622930028626,
"grad_norm": 2.95576548576355,
"learning_rate": 2.0394260037686624e-06,
"loss": 0.2626,
"step": 33100
},
{
"epoch": 4.804870094575497,
"grad_norm": 2.053476333618164,
"learning_rate": 1.966951732135092e-06,
"loss": 0.2619,
"step": 33150
},
{
"epoch": 4.812117259122369,
"grad_norm": 2.7688095569610596,
"learning_rate": 1.894477460501522e-06,
"loss": 0.2974,
"step": 33200
},
{
"epoch": 4.8193644236692395,
"grad_norm": 2.67800235748291,
"learning_rate": 1.8220031888679521e-06,
"loss": 0.2676,
"step": 33250
},
{
"epoch": 4.82661158821611,
"grad_norm": 3.279421806335449,
"learning_rate": 1.7495289172343818e-06,
"loss": 0.2689,
"step": 33300
},
{
"epoch": 4.833858752762982,
"grad_norm": 1.616542100906372,
"learning_rate": 1.6770546456008118e-06,
"loss": 0.2555,
"step": 33350
},
{
"epoch": 4.841105917309853,
"grad_norm": 3.103170156478882,
"learning_rate": 1.6045803739672419e-06,
"loss": 0.2551,
"step": 33400
},
{
"epoch": 4.8483530818567235,
"grad_norm": 2.5930793285369873,
"learning_rate": 1.5321061023336715e-06,
"loss": 0.2618,
"step": 33450
},
{
"epoch": 4.855600246403594,
"grad_norm": 3.2237420082092285,
"learning_rate": 1.4596318307001016e-06,
"loss": 0.2461,
"step": 33500
},
{
"epoch": 4.862847410950465,
"grad_norm": 2.2981555461883545,
"learning_rate": 1.3871575590665314e-06,
"loss": 0.2707,
"step": 33550
},
{
"epoch": 4.870094575497337,
"grad_norm": 1.4708250761032104,
"learning_rate": 1.3146832874329614e-06,
"loss": 0.254,
"step": 33600
},
{
"epoch": 4.877341740044208,
"grad_norm": 2.699856758117676,
"learning_rate": 1.2422090157993913e-06,
"loss": 0.2593,
"step": 33650
},
{
"epoch": 4.884588904591078,
"grad_norm": 2.0948593616485596,
"learning_rate": 1.1697347441658213e-06,
"loss": 0.2583,
"step": 33700
},
{
"epoch": 4.89183606913795,
"grad_norm": 2.787429094314575,
"learning_rate": 1.0972604725322512e-06,
"loss": 0.2381,
"step": 33750
},
{
"epoch": 4.899083233684821,
"grad_norm": 2.4767441749572754,
"learning_rate": 1.024786200898681e-06,
"loss": 0.2474,
"step": 33800
},
{
"epoch": 4.906330398231692,
"grad_norm": 3.3083810806274414,
"learning_rate": 9.523119292651109e-07,
"loss": 0.2647,
"step": 33850
},
{
"epoch": 4.9135775627785625,
"grad_norm": 2.8774940967559814,
"learning_rate": 8.798376576315409e-07,
"loss": 0.2622,
"step": 33900
},
{
"epoch": 4.920824727325434,
"grad_norm": 2.5657265186309814,
"learning_rate": 8.073633859979708e-07,
"loss": 0.2537,
"step": 33950
},
{
"epoch": 4.928071891872305,
"grad_norm": 1.964735984802246,
"learning_rate": 7.348891143644006e-07,
"loss": 0.2646,
"step": 34000
},
{
"epoch": 4.928071891872305,
"eval_loss": 0.6408438682556152,
"eval_runtime": 245.4737,
"eval_samples_per_second": 140.524,
"eval_steps_per_second": 14.054,
"step": 34000
},
{
"epoch": 4.935319056419176,
"grad_norm": 2.6487998962402344,
"learning_rate": 6.624148427308306e-07,
"loss": 0.2799,
"step": 34050
},
{
"epoch": 4.942566220966047,
"grad_norm": 2.05784010887146,
"learning_rate": 5.899405710972605e-07,
"loss": 0.2657,
"step": 34100
},
{
"epoch": 4.949813385512918,
"grad_norm": 2.4890284538269043,
"learning_rate": 5.174662994636905e-07,
"loss": 0.2477,
"step": 34150
},
{
"epoch": 4.957060550059789,
"grad_norm": 2.277297258377075,
"learning_rate": 4.449920278301203e-07,
"loss": 0.2664,
"step": 34200
},
{
"epoch": 4.96430771460666,
"grad_norm": 3.3281660079956055,
"learning_rate": 3.7251775619655025e-07,
"loss": 0.2582,
"step": 34250
},
{
"epoch": 4.971554879153532,
"grad_norm": 3.9353535175323486,
"learning_rate": 3.0004348456298015e-07,
"loss": 0.2622,
"step": 34300
},
{
"epoch": 4.978802043700402,
"grad_norm": 2.5661704540252686,
"learning_rate": 2.275692129294101e-07,
"loss": 0.2562,
"step": 34350
},
{
"epoch": 4.986049208247273,
"grad_norm": 3.4376327991485596,
"learning_rate": 1.5509494129583997e-07,
"loss": 0.2647,
"step": 34400
},
{
"epoch": 4.993296372794144,
"grad_norm": 3.3473730087280273,
"learning_rate": 8.26206696622699e-08,
"loss": 0.2737,
"step": 34450
}
],
"logging_steps": 50,
"max_steps": 34495,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5981403722758554e+18,
"train_batch_size": 10,
"trial_name": null,
"trial_params": null
}