{
"best_global_step": null,
"best_metric": 1.3073337078094482,
"best_model_checkpoint": null,
"epoch": 0.7067795033944015,
"eval_steps": 50,
"global_step": 1900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018599460615642146,
"grad_norm": 3.400698661804199,
"learning_rate": 1.4037033750100226e-06,
"loss": 2.6219,
"step": 5
},
{
"epoch": 0.003719892123128429,
"grad_norm": 3.7983412742614746,
"learning_rate": 3.1583325937725507e-06,
"loss": 2.5432,
"step": 10
},
{
"epoch": 0.005579838184692644,
"grad_norm": 3.0861151218414307,
"learning_rate": 4.912961812535079e-06,
"loss": 2.3792,
"step": 15
},
{
"epoch": 0.007439784246256858,
"grad_norm": 2.717170476913452,
"learning_rate": 6.667591031297607e-06,
"loss": 2.3455,
"step": 20
},
{
"epoch": 0.009299730307821073,
"grad_norm": 2.2805142402648926,
"learning_rate": 8.422220250060135e-06,
"loss": 2.1424,
"step": 25
},
{
"epoch": 0.011159676369385288,
"grad_norm": 1.9744335412979126,
"learning_rate": 1.0176849468822663e-05,
"loss": 1.8742,
"step": 30
},
{
"epoch": 0.013019622430949503,
"grad_norm": 1.8863332271575928,
"learning_rate": 1.1931478687585193e-05,
"loss": 1.7454,
"step": 35
},
{
"epoch": 0.014879568492513717,
"grad_norm": 1.5830954313278198,
"learning_rate": 1.368610790634772e-05,
"loss": 1.7088,
"step": 40
},
{
"epoch": 0.016739514554077933,
"grad_norm": 1.8321353197097778,
"learning_rate": 1.544073712511025e-05,
"loss": 1.632,
"step": 45
},
{
"epoch": 0.018599460615642147,
"grad_norm": 1.4320825338363647,
"learning_rate": 1.7195366343872776e-05,
"loss": 1.4594,
"step": 50
},
{
"epoch": 0.018599460615642147,
"eval_loss": 1.5426534414291382,
"eval_runtime": 60.7069,
"eval_samples_per_second": 164.726,
"eval_steps_per_second": 5.156,
"step": 50
},
{
"epoch": 0.02045940667720636,
"grad_norm": 1.4581339359283447,
"learning_rate": 1.8949995562635306e-05,
"loss": 1.4841,
"step": 55
},
{
"epoch": 0.022319352738770577,
"grad_norm": 1.3386048078536987,
"learning_rate": 2.0704624781397832e-05,
"loss": 1.5089,
"step": 60
},
{
"epoch": 0.02417929880033479,
"grad_norm": 1.3601100444793701,
"learning_rate": 2.245925400016036e-05,
"loss": 1.4723,
"step": 65
},
{
"epoch": 0.026039244861899007,
"grad_norm": 1.2655599117279053,
"learning_rate": 2.4213883218922888e-05,
"loss": 1.4864,
"step": 70
},
{
"epoch": 0.02789919092346322,
"grad_norm": 1.3355369567871094,
"learning_rate": 2.5968512437685417e-05,
"loss": 1.4451,
"step": 75
},
{
"epoch": 0.029759136985027433,
"grad_norm": 1.274318814277649,
"learning_rate": 2.7723141656447947e-05,
"loss": 1.4944,
"step": 80
},
{
"epoch": 0.031619083046591646,
"grad_norm": 2.0729751586914062,
"learning_rate": 2.9477770875210473e-05,
"loss": 1.437,
"step": 85
},
{
"epoch": 0.03347902910815587,
"grad_norm": 1.2173129320144653,
"learning_rate": 3.1232400093973e-05,
"loss": 1.482,
"step": 90
},
{
"epoch": 0.03533897516972008,
"grad_norm": 1.2413370609283447,
"learning_rate": 3.298702931273553e-05,
"loss": 1.3871,
"step": 95
},
{
"epoch": 0.03719892123128429,
"grad_norm": 1.4032536745071411,
"learning_rate": 3.4741658531498055e-05,
"loss": 1.436,
"step": 100
},
{
"epoch": 0.03719892123128429,
"eval_loss": 1.4507780075073242,
"eval_runtime": 60.5546,
"eval_samples_per_second": 165.14,
"eval_steps_per_second": 5.169,
"step": 100
},
{
"epoch": 0.039058867292848506,
"grad_norm": 1.408437967300415,
"learning_rate": 3.509220060941937e-05,
"loss": 1.4291,
"step": 105
},
{
"epoch": 0.04091881335441272,
"grad_norm": 1.2020059823989868,
"learning_rate": 3.509064158950106e-05,
"loss": 1.4322,
"step": 110
},
{
"epoch": 0.04277875941597694,
"grad_norm": 1.2841988801956177,
"learning_rate": 3.5087883436606155e-05,
"loss": 1.4509,
"step": 115
},
{
"epoch": 0.04463870547754115,
"grad_norm": 1.1238864660263062,
"learning_rate": 3.508392633925074e-05,
"loss": 1.4467,
"step": 120
},
{
"epoch": 0.046498651539105366,
"grad_norm": 1.285390019416809,
"learning_rate": 3.507877056789716e-05,
"loss": 1.4269,
"step": 125
},
{
"epoch": 0.04835859760066958,
"grad_norm": 1.1746469736099243,
"learning_rate": 3.507241647493555e-05,
"loss": 1.4636,
"step": 130
},
{
"epoch": 0.05021854366223379,
"grad_norm": 1.177941083908081,
"learning_rate": 3.506486449465971e-05,
"loss": 1.4078,
"step": 135
},
{
"epoch": 0.05207848972379801,
"grad_norm": 1.2271251678466797,
"learning_rate": 3.505611514323747e-05,
"loss": 1.403,
"step": 140
},
{
"epoch": 0.053938435785362226,
"grad_norm": 1.1131396293640137,
"learning_rate": 3.5046169018675374e-05,
"loss": 1.4511,
"step": 145
},
{
"epoch": 0.05579838184692644,
"grad_norm": 1.195426106452942,
"learning_rate": 3.503502680077782e-05,
"loss": 1.3926,
"step": 150
},
{
"epoch": 0.05579838184692644,
"eval_loss": 1.428297519683838,
"eval_runtime": 60.4864,
"eval_samples_per_second": 165.326,
"eval_steps_per_second": 5.175,
"step": 150
},
{
"epoch": 0.05765832790849065,
"grad_norm": 1.2685747146606445,
"learning_rate": 3.5022689251100616e-05,
"loss": 1.4539,
"step": 155
},
{
"epoch": 0.059518273970054866,
"grad_norm": 1.09740149974823,
"learning_rate": 3.500915721289888e-05,
"loss": 1.431,
"step": 160
},
{
"epoch": 0.06137822003161908,
"grad_norm": 1.1276105642318726,
"learning_rate": 3.499443161106944e-05,
"loss": 1.3862,
"step": 165
},
{
"epoch": 0.06323816609318329,
"grad_norm": 1.1806763410568237,
"learning_rate": 3.497851345208764e-05,
"loss": 1.4269,
"step": 170
},
{
"epoch": 0.0650981121547475,
"grad_norm": 1.079424500465393,
"learning_rate": 3.496140382393849e-05,
"loss": 1.3912,
"step": 175
},
{
"epoch": 0.06695805821631173,
"grad_norm": 1.0723962783813477,
"learning_rate": 3.4943103896042344e-05,
"loss": 1.3961,
"step": 180
},
{
"epoch": 0.06881800427787595,
"grad_norm": 1.1186383962631226,
"learning_rate": 3.492361491917497e-05,
"loss": 1.4213,
"step": 185
},
{
"epoch": 0.07067795033944016,
"grad_norm": 1.0343255996704102,
"learning_rate": 3.4902938225382055e-05,
"loss": 1.3989,
"step": 190
},
{
"epoch": 0.07253789640100437,
"grad_norm": 1.141801357269287,
"learning_rate": 3.488107522788814e-05,
"loss": 1.4074,
"step": 195
},
{
"epoch": 0.07439784246256859,
"grad_norm": 1.0875025987625122,
"learning_rate": 3.485802742100007e-05,
"loss": 1.4185,
"step": 200
},
{
"epoch": 0.07439784246256859,
"eval_loss": 1.4180645942687988,
"eval_runtime": 60.4683,
"eval_samples_per_second": 165.376,
"eval_steps_per_second": 5.176,
"step": 200
},
{
"epoch": 0.0762577885241328,
"grad_norm": 1.1400611400604248,
"learning_rate": 3.483379638000484e-05,
"loss": 1.4337,
"step": 205
},
{
"epoch": 0.07811773458569701,
"grad_norm": 1.0570169687271118,
"learning_rate": 3.480838376106189e-05,
"loss": 1.4042,
"step": 210
},
{
"epoch": 0.07997768064726123,
"grad_norm": 1.4329943656921387,
"learning_rate": 3.478179130108999e-05,
"loss": 1.3974,
"step": 215
},
{
"epoch": 0.08183762670882544,
"grad_norm": 1.1569150686264038,
"learning_rate": 3.475402081764844e-05,
"loss": 1.4402,
"step": 220
},
{
"epoch": 0.08369757277038965,
"grad_norm": 1.0719656944274902,
"learning_rate": 3.4725074208812906e-05,
"loss": 1.4071,
"step": 225
},
{
"epoch": 0.08555751883195388,
"grad_norm": 0.9809865951538086,
"learning_rate": 3.4694953453045645e-05,
"loss": 1.4349,
"step": 230
},
{
"epoch": 0.0874174648935181,
"grad_norm": 1.3142799139022827,
"learning_rate": 3.466366060906031e-05,
"loss": 1.3969,
"step": 235
},
{
"epoch": 0.0892774109550823,
"grad_norm": 1.1862435340881348,
"learning_rate": 3.463119781568121e-05,
"loss": 1.4122,
"step": 240
},
{
"epoch": 0.09113735701664652,
"grad_norm": 1.1050539016723633,
"learning_rate": 3.459756729169715e-05,
"loss": 1.3759,
"step": 245
},
{
"epoch": 0.09299730307821073,
"grad_norm": 1.0910768508911133,
"learning_rate": 3.456277133570978e-05,
"loss": 1.4006,
"step": 250
},
{
"epoch": 0.09299730307821073,
"eval_loss": 1.4099781513214111,
"eval_runtime": 60.4718,
"eval_samples_per_second": 165.366,
"eval_steps_per_second": 5.176,
"step": 250
},
{
"epoch": 0.09485724913977495,
"grad_norm": 1.0490922927856445,
"learning_rate": 3.452681232597646e-05,
"loss": 1.3863,
"step": 255
},
{
"epoch": 0.09671719520133916,
"grad_norm": 1.0440068244934082,
"learning_rate": 3.448969272024775e-05,
"loss": 1.4116,
"step": 260
},
{
"epoch": 0.09857714126290337,
"grad_norm": 1.1154160499572754,
"learning_rate": 3.4451415055599386e-05,
"loss": 1.4171,
"step": 265
},
{
"epoch": 0.10043708732446759,
"grad_norm": 1.091792106628418,
"learning_rate": 3.4411981948258904e-05,
"loss": 1.3739,
"step": 270
},
{
"epoch": 0.1022970333860318,
"grad_norm": 1.1216192245483398,
"learning_rate": 3.437139609342681e-05,
"loss": 1.3473,
"step": 275
},
{
"epoch": 0.10415697944759603,
"grad_norm": 1.0132431983947754,
"learning_rate": 3.4329660265092366e-05,
"loss": 1.4201,
"step": 280
},
{
"epoch": 0.10601692550916024,
"grad_norm": 1.0745493173599243,
"learning_rate": 3.4286777315844006e-05,
"loss": 1.3943,
"step": 285
},
{
"epoch": 0.10787687157072445,
"grad_norm": 1.173161268234253,
"learning_rate": 3.4242750176674336e-05,
"loss": 1.4077,
"step": 290
},
{
"epoch": 0.10973681763228867,
"grad_norm": 1.085132122039795,
"learning_rate": 3.419758185677985e-05,
"loss": 1.3981,
"step": 295
},
{
"epoch": 0.11159676369385288,
"grad_norm": 1.049797534942627,
"learning_rate": 3.41512754433552e-05,
"loss": 1.4143,
"step": 300
},
{
"epoch": 0.11159676369385288,
"eval_loss": 1.3975080251693726,
"eval_runtime": 60.4745,
"eval_samples_per_second": 165.359,
"eval_steps_per_second": 5.176,
"step": 300
},
{
"epoch": 0.11345670975541709,
"grad_norm": 1.0642030239105225,
"learning_rate": 3.4103834101382244e-05,
"loss": 1.4265,
"step": 305
},
{
"epoch": 0.1153166558169813,
"grad_norm": 1.074144721031189,
"learning_rate": 3.405526107341368e-05,
"loss": 1.3677,
"step": 310
},
{
"epoch": 0.11717660187854552,
"grad_norm": 1.2073023319244385,
"learning_rate": 3.4005559679351445e-05,
"loss": 1.3879,
"step": 315
},
{
"epoch": 0.11903654794010973,
"grad_norm": 1.0445431470870972,
"learning_rate": 3.395473331621981e-05,
"loss": 1.3625,
"step": 320
},
{
"epoch": 0.12089649400167395,
"grad_norm": 1.1690475940704346,
"learning_rate": 3.3902785457933166e-05,
"loss": 1.414,
"step": 325
},
{
"epoch": 0.12275644006323816,
"grad_norm": 0.9960464239120483,
"learning_rate": 3.3849719655058636e-05,
"loss": 1.386,
"step": 330
},
{
"epoch": 0.12461638612480239,
"grad_norm": 1.1535345315933228,
"learning_rate": 3.379553953457336e-05,
"loss": 1.3309,
"step": 335
},
{
"epoch": 0.12647633218636659,
"grad_norm": 1.1487047672271729,
"learning_rate": 3.3740248799616596e-05,
"loss": 1.4549,
"step": 340
},
{
"epoch": 0.1283362782479308,
"grad_norm": 1.0776844024658203,
"learning_rate": 3.368385122923663e-05,
"loss": 1.382,
"step": 345
},
{
"epoch": 0.130196224309495,
"grad_norm": 1.1013463735580444,
"learning_rate": 3.362635067813248e-05,
"loss": 1.4432,
"step": 350
},
{
"epoch": 0.130196224309495,
"eval_loss": 1.3882778882980347,
"eval_runtime": 60.4797,
"eval_samples_per_second": 165.345,
"eval_steps_per_second": 5.175,
"step": 350
},
{
"epoch": 0.13205617037105924,
"grad_norm": 1.1181257963180542,
"learning_rate": 3.356775107639044e-05,
"loss": 1.3779,
"step": 355
},
{
"epoch": 0.13391611643262347,
"grad_norm": 1.0939253568649292,
"learning_rate": 3.350805642921544e-05,
"loss": 1.3873,
"step": 360
},
{
"epoch": 0.13577606249418767,
"grad_norm": 1.0570895671844482,
"learning_rate": 3.3447270816657335e-05,
"loss": 1.3408,
"step": 365
},
{
"epoch": 0.1376360085557519,
"grad_norm": 1.0405073165893555,
"learning_rate": 3.338539839333198e-05,
"loss": 1.3577,
"step": 370
},
{
"epoch": 0.1394959546173161,
"grad_norm": 1.0169920921325684,
"learning_rate": 3.332244338813734e-05,
"loss": 1.3905,
"step": 375
},
{
"epoch": 0.14135590067888032,
"grad_norm": 0.9779713749885559,
"learning_rate": 3.325841010396438e-05,
"loss": 1.4239,
"step": 380
},
{
"epoch": 0.14321584674044452,
"grad_norm": 1.1131649017333984,
"learning_rate": 3.319330291740301e-05,
"loss": 1.3793,
"step": 385
},
{
"epoch": 0.14507579280200875,
"grad_norm": 1.091145396232605,
"learning_rate": 3.312712627844296e-05,
"loss": 1.3784,
"step": 390
},
{
"epoch": 0.14693573886357295,
"grad_norm": 0.997805118560791,
"learning_rate": 3.3059884710169595e-05,
"loss": 1.3277,
"step": 395
},
{
"epoch": 0.14879568492513717,
"grad_norm": 1.102238416671753,
"learning_rate": 3.299158280845478e-05,
"loss": 1.3698,
"step": 400
},
{
"epoch": 0.14879568492513717,
"eval_loss": 1.3824845552444458,
"eval_runtime": 60.5144,
"eval_samples_per_second": 165.25,
"eval_steps_per_second": 5.172,
"step": 400
},
{
"epoch": 0.15065563098670137,
"grad_norm": 1.0498243570327759,
"learning_rate": 3.292222524164277e-05,
"loss": 1.3825,
"step": 405
},
{
"epoch": 0.1525155770482656,
"grad_norm": 1.1161103248596191,
"learning_rate": 3.2851816750231135e-05,
"loss": 1.4245,
"step": 410
},
{
"epoch": 0.15437552310982983,
"grad_norm": 1.0065839290618896,
"learning_rate": 3.278036214654672e-05,
"loss": 1.3448,
"step": 415
},
{
"epoch": 0.15623546917139403,
"grad_norm": 1.109067440032959,
"learning_rate": 3.2707866314416786e-05,
"loss": 1.4031,
"step": 420
},
{
"epoch": 0.15809541523295825,
"grad_norm": 1.0229747295379639,
"learning_rate": 3.263433420883514e-05,
"loss": 1.3632,
"step": 425
},
{
"epoch": 0.15995536129452245,
"grad_norm": 0.9845519065856934,
"learning_rate": 3.255977085562354e-05,
"loss": 1.3494,
"step": 430
},
{
"epoch": 0.16181530735608668,
"grad_norm": 1.106702446937561,
"learning_rate": 3.248418135108813e-05,
"loss": 1.3682,
"step": 435
},
{
"epoch": 0.16367525341765088,
"grad_norm": 0.9956566095352173,
"learning_rate": 3.240757086167112e-05,
"loss": 1.3687,
"step": 440
},
{
"epoch": 0.1655351994792151,
"grad_norm": 1.006332516670227,
"learning_rate": 3.2329944623597715e-05,
"loss": 1.3089,
"step": 445
},
{
"epoch": 0.1673951455407793,
"grad_norm": 0.977312445640564,
"learning_rate": 3.2251307942518165e-05,
"loss": 1.3686,
"step": 450
},
{
"epoch": 0.1673951455407793,
"eval_loss": 1.3752514123916626,
"eval_runtime": 60.5893,
"eval_samples_per_second": 165.046,
"eval_steps_per_second": 5.166,
"step": 450
},
{
"epoch": 0.16925509160234353,
"grad_norm": 1.0094255208969116,
"learning_rate": 3.2171666193145165e-05,
"loss": 1.3994,
"step": 455
},
{
"epoch": 0.17111503766390776,
"grad_norm": 1.0625929832458496,
"learning_rate": 3.209102481888649e-05,
"loss": 1.404,
"step": 460
},
{
"epoch": 0.17297498372547196,
"grad_norm": 1.0466253757476807,
"learning_rate": 3.2009389331472956e-05,
"loss": 1.3521,
"step": 465
},
{
"epoch": 0.1748349297870362,
"grad_norm": 1.0407981872558594,
"learning_rate": 3.192676531058168e-05,
"loss": 1.3876,
"step": 470
},
{
"epoch": 0.17669487584860039,
"grad_norm": 1.0586848258972168,
"learning_rate": 3.184315840345474e-05,
"loss": 1.3607,
"step": 475
},
{
"epoch": 0.1785548219101646,
"grad_norm": 1.0073552131652832,
"learning_rate": 3.175857432451318e-05,
"loss": 1.3726,
"step": 480
},
{
"epoch": 0.1804147679717288,
"grad_norm": 1.0010629892349243,
"learning_rate": 3.167301885496645e-05,
"loss": 1.3581,
"step": 485
},
{
"epoch": 0.18227471403329304,
"grad_norm": 1.002271056175232,
"learning_rate": 3.158649784241722e-05,
"loss": 1.3611,
"step": 490
},
{
"epoch": 0.18413466009485724,
"grad_norm": 0.9680567979812622,
"learning_rate": 3.149901720046178e-05,
"loss": 1.363,
"step": 495
},
{
"epoch": 0.18599460615642147,
"grad_norm": 1.0661317110061646,
"learning_rate": 3.1410582908285814e-05,
"loss": 1.3698,
"step": 500
},
{
"epoch": 0.18599460615642147,
"eval_loss": 1.368044137954712,
"eval_runtime": 60.5679,
"eval_samples_per_second": 165.104,
"eval_steps_per_second": 5.168,
"step": 500
},
{
"epoch": 0.18785455221798567,
"grad_norm": 1.1560924053192139,
"learning_rate": 3.132120101025571e-05,
"loss": 1.401,
"step": 505
},
{
"epoch": 0.1897144982795499,
"grad_norm": 1.0191872119903564,
"learning_rate": 3.1230877615505466e-05,
"loss": 1.3452,
"step": 510
},
{
"epoch": 0.19157444434111412,
"grad_norm": 1.4533931016921997,
"learning_rate": 3.113961889751914e-05,
"loss": 1.3616,
"step": 515
},
{
"epoch": 0.19343439040267832,
"grad_norm": 0.9897879362106323,
"learning_rate": 3.104743109370887e-05,
"loss": 1.3048,
"step": 520
},
{
"epoch": 0.19529433646424255,
"grad_norm": 1.078536868095398,
"learning_rate": 3.09543205049886e-05,
"loss": 1.3072,
"step": 525
},
{
"epoch": 0.19715428252580675,
"grad_norm": 1.0671093463897705,
"learning_rate": 3.0860293495343384e-05,
"loss": 1.369,
"step": 530
},
{
"epoch": 0.19901422858737097,
"grad_norm": 1.077250599861145,
"learning_rate": 3.076535649139443e-05,
"loss": 1.3864,
"step": 535
},
{
"epoch": 0.20087417464893517,
"grad_norm": 1.0758692026138306,
"learning_rate": 3.0669515981959844e-05,
"loss": 1.328,
"step": 540
},
{
"epoch": 0.2027341207104994,
"grad_norm": 1.1256663799285889,
"learning_rate": 3.057277851761114e-05,
"loss": 1.3823,
"step": 545
},
{
"epoch": 0.2045940667720636,
"grad_norm": 1.2285213470458984,
"learning_rate": 3.0475150710225507e-05,
"loss": 1.3729,
"step": 550
},
{
"epoch": 0.2045940667720636,
"eval_loss": 1.3673256635665894,
"eval_runtime": 60.4936,
"eval_samples_per_second": 165.307,
"eval_steps_per_second": 5.174,
"step": 550
},
{
"epoch": 0.20645401283362783,
"grad_norm": 1.0815597772598267,
"learning_rate": 3.0376639232533898e-05,
"loss": 1.3791,
"step": 555
},
{
"epoch": 0.20831395889519205,
"grad_norm": 1.1154593229293823,
"learning_rate": 3.0277250817664945e-05,
"loss": 1.4125,
"step": 560
},
{
"epoch": 0.21017390495675625,
"grad_norm": 1.1166361570358276,
"learning_rate": 3.017699225868479e-05,
"loss": 1.3984,
"step": 565
},
{
"epoch": 0.21203385101832048,
"grad_norm": 1.0057345628738403,
"learning_rate": 3.007587040813276e-05,
"loss": 1.3763,
"step": 570
},
{
"epoch": 0.21389379707988468,
"grad_norm": 1.091741681098938,
"learning_rate": 2.9973892177553013e-05,
"loss": 1.3778,
"step": 575
},
{
"epoch": 0.2157537431414489,
"grad_norm": 1.0506634712219238,
"learning_rate": 2.987106453702215e-05,
"loss": 1.3768,
"step": 580
},
{
"epoch": 0.2176136892030131,
"grad_norm": 1.0511835813522339,
"learning_rate": 2.9767394514672807e-05,
"loss": 1.3113,
"step": 585
},
{
"epoch": 0.21947363526457733,
"grad_norm": 1.0758532285690308,
"learning_rate": 2.9662889196213302e-05,
"loss": 1.433,
"step": 590
},
{
"epoch": 0.22133358132614153,
"grad_norm": 1.0389028787612915,
"learning_rate": 2.955755572444333e-05,
"loss": 1.3394,
"step": 595
},
{
"epoch": 0.22319352738770576,
"grad_norm": 0.9888710379600525,
"learning_rate": 2.9451401298765766e-05,
"loss": 1.3657,
"step": 600
},
{
"epoch": 0.22319352738770576,
"eval_loss": 1.358090877532959,
"eval_runtime": 60.5018,
"eval_samples_per_second": 165.284,
"eval_steps_per_second": 5.173,
"step": 600
},
{
"epoch": 0.22505347344926996,
"grad_norm": 1.0805813074111938,
"learning_rate": 2.9344433174694606e-05,
"loss": 1.3697,
"step": 605
},
{
"epoch": 0.22691341951083419,
"grad_norm": 0.9579132199287415,
"learning_rate": 2.9236658663359032e-05,
"loss": 1.3599,
"step": 610
},
{
"epoch": 0.2287733655723984,
"grad_norm": 1.103060007095337,
"learning_rate": 2.912808513100373e-05,
"loss": 1.3386,
"step": 615
},
{
"epoch": 0.2306333116339626,
"grad_norm": 0.9141831994056702,
"learning_rate": 2.901871999848541e-05,
"loss": 1.3891,
"step": 620
},
{
"epoch": 0.23249325769552684,
"grad_norm": 0.9142250418663025,
"learning_rate": 2.8908570740765607e-05,
"loss": 1.3781,
"step": 625
},
{
"epoch": 0.23435320375709104,
"grad_norm": 0.952091634273529,
"learning_rate": 2.8797644886399776e-05,
"loss": 1.3449,
"step": 630
},
{
"epoch": 0.23621314981865527,
"grad_norm": 1.228947401046753,
"learning_rate": 2.8685950017022712e-05,
"loss": 1.3673,
"step": 635
},
{
"epoch": 0.23807309588021947,
"grad_norm": 0.9899388551712036,
"learning_rate": 2.857349376683036e-05,
"loss": 1.3736,
"step": 640
},
{
"epoch": 0.2399330419417837,
"grad_norm": 1.0279674530029297,
"learning_rate": 2.8460283822058048e-05,
"loss": 1.3377,
"step": 645
},
{
"epoch": 0.2417929880033479,
"grad_norm": 1.1311445236206055,
"learning_rate": 2.8346327920455112e-05,
"loss": 1.4113,
"step": 650
},
{
"epoch": 0.2417929880033479,
"eval_loss": 1.3546315431594849,
"eval_runtime": 60.4585,
"eval_samples_per_second": 165.403,
"eval_steps_per_second": 5.177,
"step": 650
},
{
"epoch": 0.24365293406491212,
"grad_norm": 1.0192480087280273,
"learning_rate": 2.8231633850756056e-05,
"loss": 1.4002,
"step": 655
},
{
"epoch": 0.24551288012647632,
"grad_norm": 0.9501408338546753,
"learning_rate": 2.8116209452148195e-05,
"loss": 1.3385,
"step": 660
},
{
"epoch": 0.24737282618804055,
"grad_norm": 0.9931688904762268,
"learning_rate": 2.800006261373584e-05,
"loss": 1.3638,
"step": 665
},
{
"epoch": 0.24923277224960477,
"grad_norm": 1.0014399290084839,
"learning_rate": 2.7883201274001122e-05,
"loss": 1.4008,
"step": 670
},
{
"epoch": 0.25109271831116897,
"grad_norm": 0.9573982357978821,
"learning_rate": 2.7765633420261374e-05,
"loss": 1.3607,
"step": 675
},
{
"epoch": 0.25295266437273317,
"grad_norm": 1.4360772371292114,
"learning_rate": 2.7647367088123233e-05,
"loss": 1.3409,
"step": 680
},
{
"epoch": 0.2548126104342974,
"grad_norm": 0.9750449061393738,
"learning_rate": 2.7528410360933393e-05,
"loss": 1.3392,
"step": 685
},
{
"epoch": 0.2566725564958616,
"grad_norm": 1.0708160400390625,
"learning_rate": 2.740877136922615e-05,
"loss": 1.3225,
"step": 690
},
{
"epoch": 0.2585325025574258,
"grad_norm": 0.9025946259498596,
"learning_rate": 2.728845829016766e-05,
"loss": 1.3537,
"step": 695
},
{
"epoch": 0.26039244861899,
"grad_norm": 0.9979115128517151,
"learning_rate": 2.7167479346997062e-05,
"loss": 1.3367,
"step": 700
},
{
"epoch": 0.26039244861899,
"eval_loss": 1.350304126739502,
"eval_runtime": 60.5478,
"eval_samples_per_second": 165.159,
"eval_steps_per_second": 5.169,
"step": 700
},
{
"epoch": 0.2622523946805543,
"grad_norm": 0.9901404976844788,
"learning_rate": 2.7045842808464416e-05,
"loss": 1.3596,
"step": 705
},
{
"epoch": 0.2641123407421185,
"grad_norm": 1.103663444519043,
"learning_rate": 2.692355698826556e-05,
"loss": 1.3654,
"step": 710
},
{
"epoch": 0.2659722868036827,
"grad_norm": 0.9586177468299866,
"learning_rate": 2.680063024447386e-05,
"loss": 1.3472,
"step": 715
},
{
"epoch": 0.26783223286524693,
"grad_norm": 0.9534765481948853,
"learning_rate": 2.6677070978968968e-05,
"loss": 1.3306,
"step": 720
},
{
"epoch": 0.26969217892681113,
"grad_norm": 1.0054278373718262,
"learning_rate": 2.655288763686255e-05,
"loss": 1.392,
"step": 725
},
{
"epoch": 0.27155212498837533,
"grad_norm": 1.0037821531295776,
"learning_rate": 2.642808870592108e-05,
"loss": 1.3624,
"step": 730
},
{
"epoch": 0.27341207104993953,
"grad_norm": 1.0708328485488892,
"learning_rate": 2.6302682715985714e-05,
"loss": 1.3319,
"step": 735
},
{
"epoch": 0.2752720171115038,
"grad_norm": 1.0329240560531616,
"learning_rate": 2.617667823838928e-05,
"loss": 1.3502,
"step": 740
},
{
"epoch": 0.277131963173068,
"grad_norm": 0.9648259878158569,
"learning_rate": 2.6050083885370444e-05,
"loss": 1.358,
"step": 745
},
{
"epoch": 0.2789919092346322,
"grad_norm": 1.043942928314209,
"learning_rate": 2.592290830948507e-05,
"loss": 1.3254,
"step": 750
},
{
"epoch": 0.2789919092346322,
"eval_loss": 1.3463472127914429,
"eval_runtime": 60.4768,
"eval_samples_per_second": 165.353,
"eval_steps_per_second": 5.176,
"step": 750
},
{
"epoch": 0.2808518552961964,
"grad_norm": 1.074987530708313,
"learning_rate": 2.579516020301484e-05,
"loss": 1.3848,
"step": 755
},
{
"epoch": 0.28271180135776064,
"grad_norm": 1.0375388860702515,
"learning_rate": 2.5666848297373133e-05,
"loss": 1.4132,
"step": 760
},
{
"epoch": 0.28457174741932484,
"grad_norm": 0.9844212532043457,
"learning_rate": 2.553798136250826e-05,
"loss": 1.3713,
"step": 765
},
{
"epoch": 0.28643169348088904,
"grad_norm": 1.021017074584961,
"learning_rate": 2.540856820630404e-05,
"loss": 1.3627,
"step": 770
},
{
"epoch": 0.2882916395424533,
"grad_norm": 1.0512734651565552,
"learning_rate": 2.5278617673977793e-05,
"loss": 1.3959,
"step": 775
},
{
"epoch": 0.2901515856040175,
"grad_norm": 1.0537786483764648,
"learning_rate": 2.514813864747578e-05,
"loss": 1.3661,
"step": 780
},
{
"epoch": 0.2920115316655817,
"grad_norm": 1.0074138641357422,
"learning_rate": 2.5017140044866143e-05,
"loss": 1.3438,
"step": 785
},
{
"epoch": 0.2938714777271459,
"grad_norm": 0.9245646595954895,
"learning_rate": 2.488563081972936e-05,
"loss": 1.3232,
"step": 790
},
{
"epoch": 0.29573142378871015,
"grad_norm": 1.0348320007324219,
"learning_rate": 2.4753619960546277e-05,
"loss": 1.3259,
"step": 795
},
{
"epoch": 0.29759136985027435,
"grad_norm": 1.0290751457214355,
"learning_rate": 2.4621116490083764e-05,
"loss": 1.3328,
"step": 800
},
{
"epoch": 0.29759136985027435,
"eval_loss": 1.3428822755813599,
"eval_runtime": 60.4759,
"eval_samples_per_second": 165.355,
"eval_steps_per_second": 5.176,
"step": 800
},
{
"epoch": 0.29945131591183854,
"grad_norm": 0.9886801838874817,
"learning_rate": 2.4488129464778016e-05,
"loss": 1.3618,
"step": 805
},
{
"epoch": 0.30131126197340274,
"grad_norm": 1.0141544342041016,
"learning_rate": 2.4354667974115556e-05,
"loss": 1.2996,
"step": 810
},
{
"epoch": 0.303171208034967,
"grad_norm": 1.0118813514709473,
"learning_rate": 2.4220741140011997e-05,
"loss": 1.3632,
"step": 815
},
{
"epoch": 0.3050311540965312,
"grad_norm": 1.0116174221038818,
"learning_rate": 2.4086358116188535e-05,
"loss": 1.3208,
"step": 820
},
{
"epoch": 0.3068911001580954,
"grad_norm": 0.9581089019775391,
"learning_rate": 2.395152808754635e-05,
"loss": 1.3489,
"step": 825
},
{
"epoch": 0.30875104621965965,
"grad_norm": 1.0313812494277954,
"learning_rate": 2.3816260269538798e-05,
"loss": 1.3304,
"step": 830
},
{
"epoch": 0.31061099228122385,
"grad_norm": 0.9710265398025513,
"learning_rate": 2.368056390754155e-05,
"loss": 1.32,
"step": 835
},
{
"epoch": 0.31247093834278805,
"grad_norm": 0.9445378184318542,
"learning_rate": 2.35444482762207e-05,
"loss": 1.3755,
"step": 840
},
{
"epoch": 0.31433088440435225,
"grad_norm": 0.9779049158096313,
"learning_rate": 2.340792267889885e-05,
"loss": 1.3321,
"step": 845
},
{
"epoch": 0.3161908304659165,
"grad_norm": 1.0178042650222778,
"learning_rate": 2.3270996446919208e-05,
"loss": 1.3845,
"step": 850
},
{
"epoch": 0.3161908304659165,
"eval_loss": 1.3410464525222778,
"eval_runtime": 60.4741,
"eval_samples_per_second": 165.36,
"eval_steps_per_second": 5.176,
"step": 850
},
{
"epoch": 0.3180507765274807,
"grad_norm": 0.938162624835968,
"learning_rate": 2.313367893900785e-05,
"loss": 1.3115,
"step": 855
},
{
"epoch": 0.3199107225890449,
"grad_norm": 0.9968370199203491,
"learning_rate": 2.2995979540634033e-05,
"loss": 1.3254,
"step": 860
},
{
"epoch": 0.32177066865060916,
"grad_norm": 1.0070589780807495,
"learning_rate": 2.2857907663368726e-05,
"loss": 1.4028,
"step": 865
},
{
"epoch": 0.32363061471217336,
"grad_norm": 1.0026954412460327,
"learning_rate": 2.2719472744241337e-05,
"loss": 1.3736,
"step": 870
},
{
"epoch": 0.32549056077373756,
"grad_norm": 0.923169732093811,
"learning_rate": 2.258068424509469e-05,
"loss": 1.3654,
"step": 875
},
{
"epoch": 0.32735050683530176,
"grad_norm": 0.9882062077522278,
"learning_rate": 2.244155165193835e-05,
"loss": 1.3451,
"step": 880
},
{
"epoch": 0.329210452896866,
"grad_norm": 1.023770809173584,
"learning_rate": 2.2302084474300236e-05,
"loss": 1.3042,
"step": 885
},
{
"epoch": 0.3310703989584302,
"grad_norm": 0.9432891011238098,
"learning_rate": 2.2162292244576682e-05,
"loss": 1.3351,
"step": 890
},
{
"epoch": 0.3329303450199944,
"grad_norm": 1.0601178407669067,
"learning_rate": 2.202218451738089e-05,
"loss": 1.4111,
"step": 895
},
{
"epoch": 0.3347902910815586,
"grad_norm": 1.7586909532546997,
"learning_rate": 2.1881770868889913e-05,
"loss": 1.3708,
"step": 900
},
{
"epoch": 0.3347902910815586,
"eval_loss": 1.336167812347412,
"eval_runtime": 60.4785,
"eval_samples_per_second": 165.348,
"eval_steps_per_second": 5.175,
"step": 900
},
{
"epoch": 0.33665023714312287,
"grad_norm": 1.055015206336975,
"learning_rate": 2.1741060896190096e-05,
"loss": 1.3273,
"step": 905
},
{
"epoch": 0.33851018320468707,
"grad_norm": 1.1004860401153564,
"learning_rate": 2.160006421662117e-05,
"loss": 1.3965,
"step": 910
},
{
"epoch": 0.34037012926625126,
"grad_norm": 0.9952622056007385,
"learning_rate": 2.1458790467118895e-05,
"loss": 1.3419,
"step": 915
},
{
"epoch": 0.3422300753278155,
"grad_norm": 1.0129551887512207,
"learning_rate": 2.131724930355637e-05,
"loss": 1.3443,
"step": 920
},
{
"epoch": 0.3440900213893797,
"grad_norm": 1.0409715175628662,
"learning_rate": 2.117545040008412e-05,
"loss": 1.3601,
"step": 925
},
{
"epoch": 0.3459499674509439,
"grad_norm": 1.0580016374588013,
"learning_rate": 2.1033403448468844e-05,
"loss": 1.3321,
"step": 930
},
{
"epoch": 0.3478099135125081,
"grad_norm": 1.0712617635726929,
"learning_rate": 2.089111815743099e-05,
"loss": 1.3727,
"step": 935
},
{
"epoch": 0.3496698595740724,
"grad_norm": 0.9753930568695068,
"learning_rate": 2.074860425198119e-05,
"loss": 1.3014,
"step": 940
},
{
"epoch": 0.35152980563563657,
"grad_norm": 0.9435998201370239,
"learning_rate": 2.0605871472755586e-05,
"loss": 1.3518,
"step": 945
},
{
"epoch": 0.35338975169720077,
"grad_norm": 1.0743812322616577,
"learning_rate": 2.046292957535004e-05,
"loss": 1.3012,
"step": 950
},
{
"epoch": 0.35338975169720077,
"eval_loss": 1.3357957601547241,
"eval_runtime": 60.4881,
"eval_samples_per_second": 165.322,
"eval_steps_per_second": 5.175,
"step": 950
},
{
"epoch": 0.35524969775876497,
"grad_norm": 0.9649530649185181,
"learning_rate": 2.0319788329653343e-05,
"loss": 1.3176,
"step": 955
},
{
"epoch": 0.3571096438203292,
"grad_norm": 1.0349267721176147,
"learning_rate": 2.0176457519179516e-05,
"loss": 1.3752,
"step": 960
},
{
"epoch": 0.3589695898818934,
"grad_norm": 1.021322250366211,
"learning_rate": 2.0032946940399056e-05,
"loss": 1.3736,
"step": 965
},
{
"epoch": 0.3608295359434576,
"grad_norm": 1.0290499925613403,
"learning_rate": 1.9889266402069386e-05,
"loss": 1.3687,
"step": 970
},
{
"epoch": 0.3626894820050219,
"grad_norm": 1.0723743438720703,
"learning_rate": 1.974542572456445e-05,
"loss": 1.3196,
"step": 975
},
{
"epoch": 0.3645494280665861,
"grad_norm": 0.9752843379974365,
"learning_rate": 1.9601434739203483e-05,
"loss": 1.3734,
"step": 980
},
{
"epoch": 0.3664093741281503,
"grad_norm": 1.0339841842651367,
"learning_rate": 1.945730328757906e-05,
"loss": 1.3315,
"step": 985
},
{
"epoch": 0.3682693201897145,
"grad_norm": 1.0285077095031738,
"learning_rate": 1.9313041220884443e-05,
"loss": 1.3427,
"step": 990
},
{
"epoch": 0.37012926625127873,
"grad_norm": 1.0121210813522339,
"learning_rate": 1.9168658399240265e-05,
"loss": 1.3379,
"step": 995
},
{
"epoch": 0.37198921231284293,
"grad_norm": 0.9975342154502869,
"learning_rate": 1.9024164691020593e-05,
"loss": 1.3238,
"step": 1000
},
{
"epoch": 0.37198921231284293,
"eval_loss": 1.330836296081543,
"eval_runtime": 60.528,
"eval_samples_per_second": 165.213,
"eval_steps_per_second": 5.171,
"step": 1000
},
{
"epoch": 0.37384915837440713,
"grad_norm": 1.1997116804122925,
"learning_rate": 1.8879569972178443e-05,
"loss": 1.3633,
"step": 1005
},
{
"epoch": 0.37570910443597133,
"grad_norm": 0.9928303360939026,
"learning_rate": 1.8734884125570776e-05,
"loss": 1.367,
"step": 1010
},
{
"epoch": 0.3775690504975356,
"grad_norm": 1.1708087921142578,
"learning_rate": 1.859011704028302e-05,
"loss": 1.3382,
"step": 1015
},
{
"epoch": 0.3794289965590998,
"grad_norm": 1.06816565990448,
"learning_rate": 1.8445278610953146e-05,
"loss": 1.3605,
"step": 1020
},
{
"epoch": 0.381288942620664,
"grad_norm": 1.0174709558486938,
"learning_rate": 1.8300378737095408e-05,
"loss": 1.3561,
"step": 1025
},
{
"epoch": 0.38314888868222824,
"grad_norm": 1.0002470016479492,
"learning_rate": 1.8155427322423704e-05,
"loss": 1.3546,
"step": 1030
},
{
"epoch": 0.38500883474379244,
"grad_norm": 1.0101186037063599,
"learning_rate": 1.8010434274174678e-05,
"loss": 1.3915,
"step": 1035
},
{
"epoch": 0.38686878080535664,
"grad_norm": 1.0798920392990112,
"learning_rate": 1.786540950243058e-05,
"loss": 1.3456,
"step": 1040
},
{
"epoch": 0.38872872686692084,
"grad_norm": 1.0295590162277222,
"learning_rate": 1.772036291944191e-05,
"loss": 1.2886,
"step": 1045
},
{
"epoch": 0.3905886729284851,
"grad_norm": 1.3735235929489136,
"learning_rate": 1.7575304438949958e-05,
"loss": 1.3724,
"step": 1050
},
{
"epoch": 0.3905886729284851,
"eval_loss": 1.3277840614318848,
"eval_runtime": 60.4768,
"eval_samples_per_second": 165.353,
"eval_steps_per_second": 5.176,
"step": 1050
},
{
"epoch": 0.3924486189900493,
"grad_norm": 1.078456997871399,
"learning_rate": 1.743024397550916e-05,
"loss": 1.3861,
"step": 1055
},
{
"epoch": 0.3943085650516135,
"grad_norm": 1.0125305652618408,
"learning_rate": 1.7285191443809507e-05,
"loss": 1.3671,
"step": 1060
},
{
"epoch": 0.3961685111131777,
"grad_norm": 0.9944830536842346,
"learning_rate": 1.714015675799886e-05,
"loss": 1.3546,
"step": 1065
},
{
"epoch": 0.39802845717474195,
"grad_norm": 1.0055299997329712,
"learning_rate": 1.699514983100534e-05,
"loss": 1.318,
"step": 1070
},
{
"epoch": 0.39988840323630614,
"grad_norm": 0.9434425830841064,
"learning_rate": 1.6850180573859786e-05,
"loss": 1.3336,
"step": 1075
},
{
"epoch": 0.40174834929787034,
"grad_norm": 1.0011420249938965,
"learning_rate": 1.6705258895018352e-05,
"loss": 1.3028,
"step": 1080
},
{
"epoch": 0.4036082953594346,
"grad_norm": 1.0826334953308105,
"learning_rate": 1.6560394699685283e-05,
"loss": 1.3033,
"step": 1085
},
{
"epoch": 0.4054682414209988,
"grad_norm": 0.9963647723197937,
"learning_rate": 1.6415597889135897e-05,
"loss": 1.3299,
"step": 1090
},
{
"epoch": 0.407328187482563,
"grad_norm": 0.9412882924079895,
"learning_rate": 1.6270878360039855e-05,
"loss": 1.3982,
"step": 1095
},
{
"epoch": 0.4091881335441272,
"grad_norm": 1.145696759223938,
"learning_rate": 1.6126246003784744e-05,
"loss": 1.3216,
"step": 1100
},
{
"epoch": 0.4091881335441272,
"eval_loss": 1.3285400867462158,
"eval_runtime": 60.8136,
"eval_samples_per_second": 164.437,
"eval_steps_per_second": 5.147,
"step": 1100
},
{
"epoch": 0.41104807960569145,
"grad_norm": 0.9899356961250305,
"learning_rate": 1.598171070579997e-05,
"loss": 1.3185,
"step": 1105
},
{
"epoch": 0.41290802566725565,
"grad_norm": 1.0239601135253906,
"learning_rate": 1.583728234488117e-05,
"loss": 1.3312,
"step": 1110
},
{
"epoch": 0.41476797172881985,
"grad_norm": 1.0883080959320068,
"learning_rate": 1.569297079251496e-05,
"loss": 1.251,
"step": 1115
},
{
"epoch": 0.4166279177903841,
"grad_norm": 0.9925746321678162,
"learning_rate": 1.5548785912204247e-05,
"loss": 1.3591,
"step": 1120
},
{
"epoch": 0.4184878638519483,
"grad_norm": 1.1945092678070068,
"learning_rate": 1.5404737558794072e-05,
"loss": 1.3444,
"step": 1125
},
{
"epoch": 0.4203478099135125,
"grad_norm": 1.1621458530426025,
"learning_rate": 1.526083557779805e-05,
"loss": 1.3419,
"step": 1130
},
{
"epoch": 0.4222077559750767,
"grad_norm": 1.0666788816452026,
"learning_rate": 1.511708980472542e-05,
"loss": 1.3351,
"step": 1135
},
{
"epoch": 0.42406770203664096,
"grad_norm": 0.9614555835723877,
"learning_rate": 1.4973510064408831e-05,
"loss": 1.2855,
"step": 1140
},
{
"epoch": 0.42592764809820516,
"grad_norm": 1.0012832880020142,
"learning_rate": 1.4830106170332813e-05,
"loss": 1.4136,
"step": 1145
},
{
"epoch": 0.42778759415976936,
"grad_norm": 1.2083728313446045,
"learning_rate": 1.4686887923963032e-05,
"loss": 1.3486,
"step": 1150
},
{
"epoch": 0.42778759415976936,
"eval_loss": 1.3236939907073975,
"eval_runtime": 60.4717,
"eval_samples_per_second": 165.367,
"eval_steps_per_second": 5.176,
"step": 1150
},
{
"epoch": 0.42964754022133356,
"grad_norm": 0.9932472705841064,
"learning_rate": 1.4543865114076387e-05,
"loss": 1.2785,
"step": 1155
},
{
"epoch": 0.4315074862828978,
"grad_norm": 1.0794252157211304,
"learning_rate": 1.4401047516091949e-05,
"loss": 1.3629,
"step": 1160
},
{
"epoch": 0.433367432344462,
"grad_norm": 1.0356013774871826,
"learning_rate": 1.4258444891402823e-05,
"loss": 1.3503,
"step": 1165
},
{
"epoch": 0.4352273784060262,
"grad_norm": 1.0389708280563354,
"learning_rate": 1.4116066986708994e-05,
"loss": 1.3919,
"step": 1170
},
{
"epoch": 0.43708732446759047,
"grad_norm": 1.0182055234909058,
"learning_rate": 1.3973923533351102e-05,
"loss": 1.3258,
"step": 1175
},
{
"epoch": 0.43894727052915467,
"grad_norm": 0.9431582689285278,
"learning_rate": 1.3832024246645377e-05,
"loss": 1.3329,
"step": 1180
},
{
"epoch": 0.44080721659071886,
"grad_norm": 1.014979600906372,
"learning_rate": 1.3690378825219572e-05,
"loss": 1.3719,
"step": 1185
},
{
"epoch": 0.44266716265228306,
"grad_norm": 1.0422216653823853,
"learning_rate": 1.354899695035009e-05,
"loss": 1.3107,
"step": 1190
},
{
"epoch": 0.4445271087138473,
"grad_norm": 1.0085375308990479,
"learning_rate": 1.340788828530027e-05,
"loss": 1.3542,
"step": 1195
},
{
"epoch": 0.4463870547754115,
"grad_norm": 2.4849801063537598,
"learning_rate": 1.326706247465993e-05,
"loss": 1.3224,
"step": 1200
},
{
"epoch": 0.4463870547754115,
"eval_loss": 1.3226845264434814,
"eval_runtime": 60.4716,
"eval_samples_per_second": 165.367,
"eval_steps_per_second": 5.176,
"step": 1200
},
{
"epoch": 0.4482470008369757,
"grad_norm": 1.0490351915359497,
"learning_rate": 1.3126529143686158e-05,
"loss": 1.3868,
"step": 1205
},
{
"epoch": 0.4501069468985399,
"grad_norm": 0.9706118702888489,
"learning_rate": 1.2986297897645448e-05,
"loss": 1.2929,
"step": 1210
},
{
"epoch": 0.45196689296010417,
"grad_norm": 0.9651924967765808,
"learning_rate": 1.2846378321157197e-05,
"loss": 1.3361,
"step": 1215
},
{
"epoch": 0.45382683902166837,
"grad_norm": 0.9976927638053894,
"learning_rate": 1.270677997753859e-05,
"loss": 1.3169,
"step": 1220
},
{
"epoch": 0.45568678508323257,
"grad_norm": 1.0347989797592163,
"learning_rate": 1.256751240815098e-05,
"loss": 1.3577,
"step": 1225
},
{
"epoch": 0.4575467311447968,
"grad_norm": 1.0190398693084717,
"learning_rate": 1.242858513174774e-05,
"loss": 1.3067,
"step": 1230
},
{
"epoch": 0.459406677206361,
"grad_norm": 1.1658672094345093,
"learning_rate": 1.2290007643823672e-05,
"loss": 1.3642,
"step": 1235
},
{
"epoch": 0.4612666232679252,
"grad_norm": 0.9859827756881714,
"learning_rate": 1.2151789415965982e-05,
"loss": 1.3451,
"step": 1240
},
{
"epoch": 0.4631265693294894,
"grad_norm": 1.0628533363342285,
"learning_rate": 1.2013939895206955e-05,
"loss": 1.3878,
"step": 1245
},
{
"epoch": 0.4649865153910537,
"grad_norm": 1.0261117219924927,
"learning_rate": 1.187646850337822e-05,
"loss": 1.3263,
"step": 1250
},
{
"epoch": 0.4649865153910537,
"eval_loss": 1.319972276687622,
"eval_runtime": 60.4945,
"eval_samples_per_second": 165.304,
"eval_steps_per_second": 5.174,
"step": 1250
},
{
"epoch": 0.4668464614526179,
"grad_norm": 0.9808030128479004,
"learning_rate": 1.1739384636466793e-05,
"loss": 1.334,
"step": 1255
},
{
"epoch": 0.4687064075141821,
"grad_norm": 0.9631521105766296,
"learning_rate": 1.160269766397289e-05,
"loss": 1.393,
"step": 1260
},
{
"epoch": 0.4705663535757463,
"grad_norm": 1.0527286529541016,
"learning_rate": 1.146641692826951e-05,
"loss": 1.3541,
"step": 1265
},
{
"epoch": 0.47242629963731053,
"grad_norm": 0.9961625933647156,
"learning_rate": 1.1330551743963907e-05,
"loss": 1.3919,
"step": 1270
},
{
"epoch": 0.47428624569887473,
"grad_norm": 1.033630132675171,
"learning_rate": 1.1195111397260953e-05,
"loss": 1.324,
"step": 1275
},
{
"epoch": 0.47614619176043893,
"grad_norm": 0.9427865147590637,
"learning_rate": 1.1060105145328438e-05,
"loss": 1.3016,
"step": 1280
},
{
"epoch": 0.4780061378220032,
"grad_norm": 1.0001837015151978,
"learning_rate": 1.0925542215664338e-05,
"loss": 1.3122,
"step": 1285
},
{
"epoch": 0.4798660838835674,
"grad_norm": 0.9850189089775085,
"learning_rate": 1.0791431805466157e-05,
"loss": 1.3301,
"step": 1290
},
{
"epoch": 0.4817260299451316,
"grad_norm": 1.0945813655853271,
"learning_rate": 1.065778308100228e-05,
"loss": 1.3268,
"step": 1295
},
{
"epoch": 0.4835859760066958,
"grad_norm": 1.0691360235214233,
"learning_rate": 1.0524605176985496e-05,
"loss": 1.3294,
"step": 1300
},
{
"epoch": 0.4835859760066958,
"eval_loss": 1.3182451725006104,
"eval_runtime": 60.5291,
"eval_samples_per_second": 165.21,
"eval_steps_per_second": 5.171,
"step": 1300
},
{
"epoch": 0.48544592206826004,
"grad_norm": 1.0112754106521606,
"learning_rate": 1.0391907195948643e-05,
"loss": 1.2971,
"step": 1305
},
{
"epoch": 0.48730586812982424,
"grad_norm": 1.0204293727874756,
"learning_rate": 1.0259698207622443e-05,
"loss": 1.3557,
"step": 1310
},
{
"epoch": 0.48916581419138844,
"grad_norm": 0.9734729528427124,
"learning_rate": 1.0127987248315628e-05,
"loss": 1.2949,
"step": 1315
},
{
"epoch": 0.49102576025295264,
"grad_norm": 1.0576294660568237,
"learning_rate": 9.996783320297322e-06,
"loss": 1.3116,
"step": 1320
},
{
"epoch": 0.4928857063145169,
"grad_norm": 1.0640610456466675,
"learning_rate": 9.866095391181714e-06,
"loss": 1.3427,
"step": 1325
},
{
"epoch": 0.4947456523760811,
"grad_norm": 1.0691922903060913,
"learning_rate": 9.735932393315157e-06,
"loss": 1.3017,
"step": 1330
},
{
"epoch": 0.4966055984376453,
"grad_norm": 1.0166995525360107,
"learning_rate": 9.606303223165656e-06,
"loss": 1.3303,
"step": 1335
},
{
"epoch": 0.49846554449920955,
"grad_norm": 1.110524296760559,
"learning_rate": 9.477216740714798e-06,
"loss": 1.321,
"step": 1340
},
{
"epoch": 0.5003254905607737,
"grad_norm": 1.0372258424758911,
"learning_rate": 9.348681768852186e-06,
"loss": 1.3231,
"step": 1345
},
{
"epoch": 0.5021854366223379,
"grad_norm": 1.021887183189392,
"learning_rate": 9.220707092772407e-06,
"loss": 1.3761,
"step": 1350
},
{
"epoch": 0.5021854366223379,
"eval_loss": 1.3183324337005615,
"eval_runtime": 60.5168,
"eval_samples_per_second": 165.243,
"eval_steps_per_second": 5.172,
"step": 1350
},
{
"epoch": 0.5040453826839022,
"grad_norm": 1.0530022382736206,
"learning_rate": 9.093301459374576e-06,
"loss": 1.3057,
"step": 1355
},
{
"epoch": 0.5059053287454663,
"grad_norm": 1.042121171951294,
"learning_rate": 8.966473576664499e-06,
"loss": 1.2922,
"step": 1360
},
{
"epoch": 0.5077652748070306,
"grad_norm": 1.0158429145812988,
"learning_rate": 8.840232113159481e-06,
"loss": 1.3242,
"step": 1365
},
{
"epoch": 0.5096252208685949,
"grad_norm": 1.0740450620651245,
"learning_rate": 8.714585697295876e-06,
"loss": 1.3677,
"step": 1370
},
{
"epoch": 0.511485166930159,
"grad_norm": 1.0379616022109985,
"learning_rate": 8.589542916839287e-06,
"loss": 1.3317,
"step": 1375
},
{
"epoch": 0.5133451129917233,
"grad_norm": 1.1563701629638672,
"learning_rate": 8.465112318297662e-06,
"loss": 1.3267,
"step": 1380
},
{
"epoch": 0.5152050590532875,
"grad_norm": 0.9955624938011169,
"learning_rate": 8.34130240633713e-06,
"loss": 1.3064,
"step": 1385
},
{
"epoch": 0.5170650051148517,
"grad_norm": 0.9720199108123779,
"learning_rate": 8.218121643200707e-06,
"loss": 1.2534,
"step": 1390
},
{
"epoch": 0.5189249511764159,
"grad_norm": 1.0836621522903442,
"learning_rate": 8.095578448129925e-06,
"loss": 1.3382,
"step": 1395
},
{
"epoch": 0.52078489723798,
"grad_norm": 1.0937702655792236,
"learning_rate": 7.973681196789392e-06,
"loss": 1.379,
"step": 1400
},
{
"epoch": 0.52078489723798,
"eval_loss": 1.3146955966949463,
"eval_runtime": 60.6087,
"eval_samples_per_second": 164.993,
"eval_steps_per_second": 5.164,
"step": 1400
},
{
"epoch": 0.5226448432995443,
"grad_norm": 1.0201658010482788,
"learning_rate": 7.85243822069431e-06,
"loss": 1.3525,
"step": 1405
},
{
"epoch": 0.5245047893611086,
"grad_norm": 1.0252045392990112,
"learning_rate": 7.731857806641046e-06,
"loss": 1.3257,
"step": 1410
},
{
"epoch": 0.5263647354226727,
"grad_norm": 0.9879313707351685,
"learning_rate": 7.611948196140724e-06,
"loss": 1.3926,
"step": 1415
},
{
"epoch": 0.528224681484237,
"grad_norm": 1.0046051740646362,
"learning_rate": 7.492717584855942e-06,
"loss": 1.352,
"step": 1420
},
{
"epoch": 0.5300846275458012,
"grad_norm": 1.0227553844451904,
"learning_rate": 7.3741741220406e-06,
"loss": 1.3079,
"step": 1425
},
{
"epoch": 0.5319445736073654,
"grad_norm": 1.0626540184020996,
"learning_rate": 7.2563259099829175e-06,
"loss": 1.3412,
"step": 1430
},
{
"epoch": 0.5338045196689296,
"grad_norm": 1.0018831491470337,
"learning_rate": 7.1391810034516405e-06,
"loss": 1.3367,
"step": 1435
},
{
"epoch": 0.5356644657304939,
"grad_norm": 0.9997648000717163,
"learning_rate": 7.022747409145532e-06,
"loss": 1.314,
"step": 1440
},
{
"epoch": 0.537524411792058,
"grad_norm": 1.0522111654281616,
"learning_rate": 6.907033085146082e-06,
"loss": 1.3755,
"step": 1445
},
{
"epoch": 0.5393843578536223,
"grad_norm": 0.9937166571617126,
"learning_rate": 6.792045940373635e-06,
"loss": 1.328,
"step": 1450
},
{
"epoch": 0.5393843578536223,
"eval_loss": 1.3144030570983887,
"eval_runtime": 60.5569,
"eval_samples_per_second": 165.134,
"eval_steps_per_second": 5.169,
"step": 1450
},
{
"epoch": 0.5412443039151864,
"grad_norm": 1.0255683660507202,
"learning_rate": 6.677793834046793e-06,
"loss": 1.3236,
"step": 1455
},
{
"epoch": 0.5431042499767507,
"grad_norm": 0.9852287769317627,
"learning_rate": 6.564284575145255e-06,
"loss": 1.3339,
"step": 1460
},
{
"epoch": 0.5449641960383149,
"grad_norm": 1.0695271492004395,
"learning_rate": 6.451525921876091e-06,
"loss": 1.3292,
"step": 1465
},
{
"epoch": 0.5468241420998791,
"grad_norm": 1.1841660737991333,
"learning_rate": 6.339525581143464e-06,
"loss": 1.2773,
"step": 1470
},
{
"epoch": 0.5486840881614433,
"grad_norm": 1.0714248418807983,
"learning_rate": 6.2282912080218895e-06,
"loss": 1.3186,
"step": 1475
},
{
"epoch": 0.5505440342230076,
"grad_norm": 1.1358453035354614,
"learning_rate": 6.1178304052330156e-06,
"loss": 1.3164,
"step": 1480
},
{
"epoch": 0.5524039802845717,
"grad_norm": 1.0477732419967651,
"learning_rate": 6.008150722625978e-06,
"loss": 1.3006,
"step": 1485
},
{
"epoch": 0.554263926346136,
"grad_norm": 0.9401105642318726,
"learning_rate": 5.899259656661391e-06,
"loss": 1.3668,
"step": 1490
},
{
"epoch": 0.5561238724077002,
"grad_norm": 1.0811374187469482,
"learning_rate": 5.791164649898969e-06,
"loss": 1.2621,
"step": 1495
},
{
"epoch": 0.5579838184692644,
"grad_norm": 1.046583890914917,
"learning_rate": 5.683873090488836e-06,
"loss": 1.3526,
"step": 1500
},
{
"epoch": 0.5579838184692644,
"eval_loss": 1.310549020767212,
"eval_runtime": 60.5321,
"eval_samples_per_second": 165.202,
"eval_steps_per_second": 5.171,
"step": 1500
},
{
"epoch": 0.5598437645308286,
"grad_norm": 1.042725920677185,
"learning_rate": 5.577392311666558e-06,
"loss": 1.3255,
"step": 1505
},
{
"epoch": 0.5617037105923928,
"grad_norm": 1.0929292440414429,
"learning_rate": 5.471729591251926e-06,
"loss": 1.3171,
"step": 1510
},
{
"epoch": 0.563563656653957,
"grad_norm": 1.0681570768356323,
"learning_rate": 5.366892151151515e-06,
"loss": 1.3551,
"step": 1515
},
{
"epoch": 0.5654236027155213,
"grad_norm": 0.9610005617141724,
"learning_rate": 5.262887156865101e-06,
"loss": 1.3463,
"step": 1520
},
{
"epoch": 0.5672835487770854,
"grad_norm": 1.028685450553894,
"learning_rate": 5.159721716995887e-06,
"loss": 1.331,
"step": 1525
},
{
"epoch": 0.5691434948386497,
"grad_norm": 1.0659209489822388,
"learning_rate": 5.0574028827646464e-06,
"loss": 1.3453,
"step": 1530
},
{
"epoch": 0.5710034409002139,
"grad_norm": 1.0256808996200562,
"learning_rate": 4.955937647527789e-06,
"loss": 1.3523,
"step": 1535
},
{
"epoch": 0.5728633869617781,
"grad_norm": 1.0253880023956299,
"learning_rate": 4.855332946299358e-06,
"loss": 1.3594,
"step": 1540
},
{
"epoch": 0.5747233330233423,
"grad_norm": 0.9797042608261108,
"learning_rate": 4.755595655277047e-06,
"loss": 1.3301,
"step": 1545
},
{
"epoch": 0.5765832790849066,
"grad_norm": 1.0288350582122803,
"learning_rate": 4.656732591372208e-06,
"loss": 1.2953,
"step": 1550
},
{
"epoch": 0.5765832790849066,
"eval_loss": 1.3121490478515625,
"eval_runtime": 60.5529,
"eval_samples_per_second": 165.145,
"eval_steps_per_second": 5.169,
"step": 1550
},
{
"epoch": 0.5784432251464707,
"grad_norm": 1.0119401216506958,
"learning_rate": 4.558750511743937e-06,
"loss": 1.348,
"step": 1555
},
{
"epoch": 0.580303171208035,
"grad_norm": 1.0771074295043945,
"learning_rate": 4.461656113337223e-06,
"loss": 1.3214,
"step": 1560
},
{
"epoch": 0.5821631172695991,
"grad_norm": 1.0131157636642456,
"learning_rate": 4.365456032425219e-06,
"loss": 1.2896,
"step": 1565
},
{
"epoch": 0.5840230633311634,
"grad_norm": 0.9793763160705566,
"learning_rate": 4.270156844155667e-06,
"loss": 1.3405,
"step": 1570
},
{
"epoch": 0.5858830093927276,
"grad_norm": 1.0757414102554321,
"learning_rate": 4.175765062101498e-06,
"loss": 1.3704,
"step": 1575
},
{
"epoch": 0.5877429554542918,
"grad_norm": 1.182327151298523,
"learning_rate": 4.082287137815629e-06,
"loss": 1.3274,
"step": 1580
},
{
"epoch": 0.589602901515856,
"grad_norm": 1.0652798414230347,
"learning_rate": 3.989729460390014e-06,
"loss": 1.3884,
"step": 1585
},
{
"epoch": 0.5914628475774203,
"grad_norm": 1.0110604763031006,
"learning_rate": 3.8980983560189544e-06,
"loss": 1.2986,
"step": 1590
},
{
"epoch": 0.5933227936389844,
"grad_norm": 1.0287728309631348,
"learning_rate": 3.8074000875667173e-06,
"loss": 1.3227,
"step": 1595
},
{
"epoch": 0.5951827397005487,
"grad_norm": 1.0629034042358398,
"learning_rate": 3.7176408541394724e-06,
"loss": 1.3454,
"step": 1600
},
{
"epoch": 0.5951827397005487,
"eval_loss": 1.3092882633209229,
"eval_runtime": 60.6125,
"eval_samples_per_second": 164.983,
"eval_steps_per_second": 5.164,
"step": 1600
},
{
"epoch": 0.597042685762113,
"grad_norm": 1.0497337579727173,
"learning_rate": 3.6288267906615927e-06,
"loss": 1.2941,
"step": 1605
},
{
"epoch": 0.5989026318236771,
"grad_norm": 1.0062696933746338,
"learning_rate": 3.5409639674563414e-06,
"loss": 1.3119,
"step": 1610
},
{
"epoch": 0.6007625778852413,
"grad_norm": 0.9828023314476013,
"learning_rate": 3.4540583898309718e-06,
"loss": 1.299,
"step": 1615
},
{
"epoch": 0.6026225239468055,
"grad_norm": 0.9954901337623596,
"learning_rate": 3.3681159976662705e-06,
"loss": 1.3478,
"step": 1620
},
{
"epoch": 0.6044824700083697,
"grad_norm": 1.0962938070297241,
"learning_rate": 3.2831426650105854e-06,
"loss": 1.3267,
"step": 1625
},
{
"epoch": 0.606342416069934,
"grad_norm": 1.0761761665344238,
"learning_rate": 3.199144199678326e-06,
"loss": 1.3335,
"step": 1630
},
{
"epoch": 0.6082023621314981,
"grad_norm": 1.0441513061523438,
"learning_rate": 3.11612634285302e-06,
"loss": 1.3116,
"step": 1635
},
{
"epoch": 0.6100623081930624,
"grad_norm": 1.0409022569656372,
"learning_rate": 3.034094768694904e-06,
"loss": 1.3156,
"step": 1640
},
{
"epoch": 0.6119222542546267,
"grad_norm": 0.9353120923042297,
"learning_rate": 2.95305508395311e-06,
"loss": 1.2779,
"step": 1645
},
{
"epoch": 0.6137822003161908,
"grad_norm": 1.8940584659576416,
"learning_rate": 2.8730128275824325e-06,
"loss": 1.2904,
"step": 1650
},
{
"epoch": 0.6137822003161908,
"eval_loss": 1.3097484111785889,
"eval_runtime": 60.6389,
"eval_samples_per_second": 164.911,
"eval_steps_per_second": 5.162,
"step": 1650
},
{
"epoch": 0.615642146377755,
"grad_norm": 1.0142489671707153,
"learning_rate": 2.7939734703647734e-06,
"loss": 1.3105,
"step": 1655
},
{
"epoch": 0.6175020924393193,
"grad_norm": 0.9673045873641968,
"learning_rate": 2.7159424145352063e-06,
"loss": 1.3624,
"step": 1660
},
{
"epoch": 0.6193620385008834,
"grad_norm": 1.3291678428649902,
"learning_rate": 2.6389249934127475e-06,
"loss": 1.3605,
"step": 1665
},
{
"epoch": 0.6212219845624477,
"grad_norm": 1.0622340440750122,
"learning_rate": 2.5629264710358236e-06,
"loss": 1.3195,
"step": 1670
},
{
"epoch": 0.6230819306240118,
"grad_norm": 1.07737135887146,
"learning_rate": 2.4879520418024855e-06,
"loss": 1.3621,
"step": 1675
},
{
"epoch": 0.6249418766855761,
"grad_norm": 1.076094627380371,
"learning_rate": 2.4140068301153783e-06,
"loss": 1.3352,
"step": 1680
},
{
"epoch": 0.6268018227471404,
"grad_norm": 1.1228044033050537,
"learning_rate": 2.3410958900314987e-06,
"loss": 1.3171,
"step": 1685
},
{
"epoch": 0.6286617688087045,
"grad_norm": 1.0142266750335693,
"learning_rate": 2.2692242049167475e-06,
"loss": 1.3062,
"step": 1690
},
{
"epoch": 0.6305217148702688,
"grad_norm": 1.1721749305725098,
"learning_rate": 2.1983966871053323e-06,
"loss": 1.3482,
"step": 1695
},
{
"epoch": 0.632381660931833,
"grad_norm": 1.1963858604431152,
"learning_rate": 2.1286181775640126e-06,
"loss": 1.3803,
"step": 1700
},
{
"epoch": 0.632381660931833,
"eval_loss": 1.3107084035873413,
"eval_runtime": 60.7317,
"eval_samples_per_second": 164.659,
"eval_steps_per_second": 5.154,
"step": 1700
},
{
"epoch": 0.6342416069933972,
"grad_norm": 1.0231841802597046,
"learning_rate": 2.059893445561226e-06,
"loss": 1.313,
"step": 1705
},
{
"epoch": 0.6361015530549614,
"grad_norm": 1.0954822301864624,
"learning_rate": 1.9922271883411143e-06,
"loss": 1.3398,
"step": 1710
},
{
"epoch": 0.6379614991165257,
"grad_norm": 1.1705200672149658,
"learning_rate": 1.925624030802471e-06,
"loss": 1.3195,
"step": 1715
},
{
"epoch": 0.6398214451780898,
"grad_norm": 1.1075305938720703,
"learning_rate": 1.8600885251826436e-06,
"loss": 1.3313,
"step": 1720
},
{
"epoch": 0.6416813912396541,
"grad_norm": 1.020974040031433,
"learning_rate": 1.7956251507463883e-06,
"loss": 1.3352,
"step": 1725
},
{
"epoch": 0.6435413373012183,
"grad_norm": 1.1549155712127686,
"learning_rate": 1.7322383134797149e-06,
"loss": 1.361,
"step": 1730
},
{
"epoch": 0.6454012833627825,
"grad_norm": 1.1022095680236816,
"learning_rate": 1.6699323457887554e-06,
"loss": 1.3802,
"step": 1735
},
{
"epoch": 0.6472612294243467,
"grad_norm": 1.0258790254592896,
"learning_rate": 1.6087115062036328e-06,
"loss": 1.2892,
"step": 1740
},
{
"epoch": 0.6491211754859109,
"grad_norm": 1.0042780637741089,
"learning_rate": 1.5485799790874115e-06,
"loss": 1.35,
"step": 1745
},
{
"epoch": 0.6509811215474751,
"grad_norm": 1.0090981721878052,
"learning_rate": 1.4895418743500954e-06,
"loss": 1.3185,
"step": 1750
},
{
"epoch": 0.6509811215474751,
"eval_loss": 1.308115839958191,
"eval_runtime": 60.6449,
"eval_samples_per_second": 164.894,
"eval_steps_per_second": 5.161,
"step": 1750
},
{
"epoch": 0.6528410676090394,
"grad_norm": 1.059403657913208,
"learning_rate": 1.431601227167719e-06,
"loss": 1.3153,
"step": 1755
},
{
"epoch": 0.6547010136706035,
"grad_norm": 1.0298171043395996,
"learning_rate": 1.3747619977065534e-06,
"loss": 1.3642,
"step": 1760
},
{
"epoch": 0.6565609597321678,
"grad_norm": 1.0219703912734985,
"learning_rate": 1.3190280708524274e-06,
"loss": 1.356,
"step": 1765
},
{
"epoch": 0.658420905793732,
"grad_norm": 1.0099432468414307,
"learning_rate": 1.2644032559452095e-06,
"loss": 1.2847,
"step": 1770
},
{
"epoch": 0.6602808518552962,
"grad_norm": 1.0368894338607788,
"learning_rate": 1.2108912865184372e-06,
"loss": 1.3282,
"step": 1775
},
{
"epoch": 0.6621407979168604,
"grad_norm": 1.1284433603286743,
"learning_rate": 1.1584958200441366e-06,
"loss": 1.3546,
"step": 1780
},
{
"epoch": 0.6640007439784247,
"grad_norm": 1.024755597114563,
"learning_rate": 1.107220437682845e-06,
"loss": 1.3606,
"step": 1785
},
{
"epoch": 0.6658606900399888,
"grad_norm": 0.9858983159065247,
"learning_rate": 1.0570686440388318e-06,
"loss": 1.2959,
"step": 1790
},
{
"epoch": 0.6677206361015531,
"grad_norm": 1.0345178842544556,
"learning_rate": 1.0080438669205757e-06,
"loss": 1.3119,
"step": 1795
},
{
"epoch": 0.6695805821631172,
"grad_norm": 1.125440001487732,
"learning_rate": 9.601494571064706e-07,
"loss": 1.3397,
"step": 1800
},
{
"epoch": 0.6695805821631172,
"eval_loss": 1.308713674545288,
"eval_runtime": 60.5955,
"eval_samples_per_second": 165.029,
"eval_steps_per_second": 5.165,
"step": 1800
},
{
"epoch": 0.6714405282246815,
"grad_norm": 0.9926828742027283,
"learning_rate": 9.133886881158041e-07,
"loss": 1.3156,
"step": 1805
},
{
"epoch": 0.6733004742862457,
"grad_norm": 2.1787917613983154,
"learning_rate": 8.677647559850251e-07,
"loss": 1.3884,
"step": 1810
},
{
"epoch": 0.6751604203478099,
"grad_norm": 1.1709134578704834,
"learning_rate": 8.232807790492901e-07,
"loss": 1.3408,
"step": 1815
},
{
"epoch": 0.6770203664093741,
"grad_norm": 1.049757480621338,
"learning_rate": 7.799397977293321e-07,
"loss": 1.3641,
"step": 1820
},
{
"epoch": 0.6788803124709384,
"grad_norm": 1.0234988927841187,
"learning_rate": 7.377447743236496e-07,
"loss": 1.3226,
"step": 1825
},
{
"epoch": 0.6807402585325025,
"grad_norm": 0.9746132493019104,
"learning_rate": 6.966985928060477e-07,
"loss": 1.3427,
"step": 1830
},
{
"epoch": 0.6826002045940668,
"grad_norm": 3.015550374984741,
"learning_rate": 6.568040586285049e-07,
"loss": 1.3353,
"step": 1835
},
{
"epoch": 0.684460150655631,
"grad_norm": 1.0778768062591553,
"learning_rate": 6.180638985294406e-07,
"loss": 1.3186,
"step": 1840
},
{
"epoch": 0.6863200967171952,
"grad_norm": 1.1300712823867798,
"learning_rate": 5.804807603473371e-07,
"loss": 1.3452,
"step": 1845
},
{
"epoch": 0.6881800427787594,
"grad_norm": 1.057513952255249,
"learning_rate": 5.44057212839764e-07,
"loss": 1.2665,
"step": 1850
},
{
"epoch": 0.6881800427787594,
"eval_loss": 1.3100022077560425,
"eval_runtime": 60.5956,
"eval_samples_per_second": 165.028,
"eval_steps_per_second": 5.165,
"step": 1850
},
{
"epoch": 0.6900399888403236,
"grad_norm": 1.076949954032898,
"learning_rate": 5.08795745507812e-07,
"loss": 1.3476,
"step": 1855
},
{
"epoch": 0.6918999349018878,
"grad_norm": 0.9384456276893616,
"learning_rate": 4.746987684259339e-07,
"loss": 1.2809,
"step": 1860
},
{
"epoch": 0.6937598809634521,
"grad_norm": 0.9707076549530029,
"learning_rate": 4.417686120772182e-07,
"loss": 1.3307,
"step": 1865
},
{
"epoch": 0.6956198270250162,
"grad_norm": 1.1225857734680176,
"learning_rate": 4.100075271941094e-07,
"loss": 1.3795,
"step": 1870
},
{
"epoch": 0.6974797730865805,
"grad_norm": 1.0616652965545654,
"learning_rate": 3.794176846045729e-07,
"loss": 1.3137,
"step": 1875
},
{
"epoch": 0.6993397191481447,
"grad_norm": 1.0068029165267944,
"learning_rate": 3.500011750837112e-07,
"loss": 1.3503,
"step": 1880
},
{
"epoch": 0.7011996652097089,
"grad_norm": 1.0520777702331543,
"learning_rate": 3.21760009210876e-07,
"loss": 1.3875,
"step": 1885
},
{
"epoch": 0.7030596112712731,
"grad_norm": 1.0416043996810913,
"learning_rate": 2.946961172322425e-07,
"loss": 1.3565,
"step": 1890
},
{
"epoch": 0.7049195573328374,
"grad_norm": 1.0348520278930664,
"learning_rate": 2.6881134892887327e-07,
"loss": 1.307,
"step": 1895
},
{
"epoch": 0.7067795033944015,
"grad_norm": 1.0482661724090576,
"learning_rate": 2.441074734903027e-07,
"loss": 1.4315,
"step": 1900
},
{
"epoch": 0.7067795033944015,
"eval_loss": 1.3073337078094482,
"eval_runtime": 60.5911,
"eval_samples_per_second": 165.041,
"eval_steps_per_second": 5.166,
"step": 1900
}
],
"logging_steps": 5,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0001
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.700258141162177e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}