{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 54000,
"global_step": 14319,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028284098051539912,
"grad_norm": 0.04601588801994645,
"learning_rate": 9.42737430167598e-06,
"loss": 0.1706,
"step": 135
},
{
"epoch": 0.056568196103079824,
"grad_norm": 0.08599903987598387,
"learning_rate": 1.885474860335196e-05,
"loss": 0.1407,
"step": 270
},
{
"epoch": 0.08485229415461974,
"grad_norm": 0.07811249522270598,
"learning_rate": 2.8282122905027936e-05,
"loss": 0.133,
"step": 405
},
{
"epoch": 0.11313639220615965,
"grad_norm": 0.08106903455078629,
"learning_rate": 3.770949720670392e-05,
"loss": 0.1302,
"step": 540
},
{
"epoch": 0.14142049025769957,
"grad_norm": 0.07762084811197388,
"learning_rate": 4.713687150837989e-05,
"loss": 0.128,
"step": 675
},
{
"epoch": 0.16970458830923948,
"grad_norm": 0.06702784826347409,
"learning_rate": 5.656424581005587e-05,
"loss": 0.1269,
"step": 810
},
{
"epoch": 0.19798868636077938,
"grad_norm": 0.06832201155426712,
"learning_rate": 6.599162011173185e-05,
"loss": 0.1258,
"step": 945
},
{
"epoch": 0.2262727844123193,
"grad_norm": 0.07124429027326014,
"learning_rate": 7.541899441340783e-05,
"loss": 0.1251,
"step": 1080
},
{
"epoch": 0.2545568824638592,
"grad_norm": 0.059836090071897996,
"learning_rate": 8.48463687150838e-05,
"loss": 0.125,
"step": 1215
},
{
"epoch": 0.28284098051539913,
"grad_norm": 0.06065917805260472,
"learning_rate": 9.427374301675978e-05,
"loss": 0.1244,
"step": 1350
},
{
"epoch": 0.31112507856693905,
"grad_norm": 0.05448515282733843,
"learning_rate": 9.999582667896216e-05,
"loss": 0.1241,
"step": 1485
},
{
"epoch": 0.33940917661847897,
"grad_norm": 0.05195941534715265,
"learning_rate": 9.994749800860066e-05,
"loss": 0.1236,
"step": 1620
},
{
"epoch": 0.36769327467001883,
"grad_norm": 0.0533796560627076,
"learning_rate": 9.984507669983246e-05,
"loss": 0.123,
"step": 1755
},
{
"epoch": 0.39597737272155875,
"grad_norm": 0.05673889785368016,
"learning_rate": 9.968867367390571e-05,
"loss": 0.1222,
"step": 1890
},
{
"epoch": 0.42426147077309867,
"grad_norm": 0.052072201102599504,
"learning_rate": 9.947845831372577e-05,
"loss": 0.1217,
"step": 2025
},
{
"epoch": 0.4525455688246386,
"grad_norm": 0.054891014733737455,
"learning_rate": 9.921465828041518e-05,
"loss": 0.1218,
"step": 2160
},
{
"epoch": 0.4808296668761785,
"grad_norm": 0.05168215652940432,
"learning_rate": 9.889755926675904e-05,
"loss": 0.121,
"step": 2295
},
{
"epoch": 0.5091137649277184,
"grad_norm": 0.051281710130397264,
"learning_rate": 9.85275046878025e-05,
"loss": 0.121,
"step": 2430
},
{
"epoch": 0.5373978629792583,
"grad_norm": 0.05193059833614012,
"learning_rate": 9.810489530893578e-05,
"loss": 0.1205,
"step": 2565
},
{
"epoch": 0.5656819610307983,
"grad_norm": 0.052032831215777654,
"learning_rate": 9.763018881186927e-05,
"loss": 0.1195,
"step": 2700
},
{
"epoch": 0.5939660590823381,
"grad_norm": 0.055670900157506434,
"learning_rate": 9.710389929896887e-05,
"loss": 0.1197,
"step": 2835
},
{
"epoch": 0.6222501571338781,
"grad_norm": 0.05191679567872077,
"learning_rate": 9.652659673648816e-05,
"loss": 0.1191,
"step": 2970
},
{
"epoch": 0.650534255185418,
"grad_norm": 0.053751440231836235,
"learning_rate": 9.589890633730087e-05,
"loss": 0.1185,
"step": 3105
},
{
"epoch": 0.6788183532369579,
"grad_norm": 0.05742218717400969,
"learning_rate": 9.522150788380149e-05,
"loss": 0.1181,
"step": 3240
},
{
"epoch": 0.7071024512884978,
"grad_norm": 0.05641344503893988,
"learning_rate": 9.449513499170775e-05,
"loss": 0.118,
"step": 3375
},
{
"epoch": 0.7353865493400377,
"grad_norm": 0.05184155469686776,
"learning_rate": 9.372057431556227e-05,
"loss": 0.1177,
"step": 3510
},
{
"epoch": 0.7636706473915776,
"grad_norm": 0.04878199937516276,
"learning_rate": 9.289866469679355e-05,
"loss": 0.1175,
"step": 3645
},
{
"epoch": 0.7919547454431175,
"grad_norm": 0.05310159239349626,
"learning_rate": 9.203029625525912e-05,
"loss": 0.1169,
"step": 3780
},
{
"epoch": 0.8202388434946575,
"grad_norm": 0.0534469001920457,
"learning_rate": 9.111640942525466e-05,
"loss": 0.1175,
"step": 3915
},
{
"epoch": 0.8485229415461973,
"grad_norm": 0.05052535987329732,
"learning_rate": 9.015799393703315e-05,
"loss": 0.1169,
"step": 4050
},
{
"epoch": 0.8768070395977373,
"grad_norm": 0.05309924206712465,
"learning_rate": 8.915608774493695e-05,
"loss": 0.1166,
"step": 4185
},
{
"epoch": 0.9050911376492772,
"grad_norm": 0.05471927034944372,
"learning_rate": 8.811177590330367e-05,
"loss": 0.1158,
"step": 4320
},
{
"epoch": 0.933375235700817,
"grad_norm": 0.051718680639674705,
"learning_rate": 8.702618939136322e-05,
"loss": 0.1156,
"step": 4455
},
{
"epoch": 0.961659333752357,
"grad_norm": 0.055160448975554825,
"learning_rate": 8.590050388839863e-05,
"loss": 0.1155,
"step": 4590
},
{
"epoch": 0.9899434318038969,
"grad_norm": 0.05086520628842916,
"learning_rate": 8.473593850049731e-05,
"loss": 0.1155,
"step": 4725
},
{
"epoch": 1.0182275298554369,
"grad_norm": 0.05154998381506173,
"learning_rate": 8.353375444027128e-05,
"loss": 0.1066,
"step": 4860
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.05189766725554911,
"learning_rate": 8.22952536609767e-05,
"loss": 0.1021,
"step": 4995
},
{
"epoch": 1.0747957259585166,
"grad_norm": 0.052617917188326715,
"learning_rate": 8.102177744651149e-05,
"loss": 0.1024,
"step": 5130
},
{
"epoch": 1.1030798240100566,
"grad_norm": 0.05214363158387452,
"learning_rate": 7.971470495881836e-05,
"loss": 0.1025,
"step": 5265
},
{
"epoch": 1.1313639220615965,
"grad_norm": 0.054295844912421495,
"learning_rate": 7.837545174426639e-05,
"loss": 0.1023,
"step": 5400
},
{
"epoch": 1.1596480201131363,
"grad_norm": 0.05197457231465077,
"learning_rate": 7.700546820062839e-05,
"loss": 0.1025,
"step": 5535
},
{
"epoch": 1.1879321181646763,
"grad_norm": 0.056484265602417545,
"learning_rate": 7.560623800631472e-05,
"loss": 0.1023,
"step": 5670
},
{
"epoch": 1.2162162162162162,
"grad_norm": 0.052111946846749885,
"learning_rate": 7.417927651356462e-05,
"loss": 0.1024,
"step": 5805
},
{
"epoch": 1.2445003142677562,
"grad_norm": 0.05566575920944282,
"learning_rate": 7.272612910733475e-05,
"loss": 0.1017,
"step": 5940
},
{
"epoch": 1.2727844123192962,
"grad_norm": 0.05811617199922452,
"learning_rate": 7.124836953166298e-05,
"loss": 0.1019,
"step": 6075
},
{
"epoch": 1.301068510370836,
"grad_norm": 0.061671271903986996,
"learning_rate": 6.974759818531935e-05,
"loss": 0.1019,
"step": 6210
},
{
"epoch": 1.329352608422376,
"grad_norm": 0.05466121007796382,
"learning_rate": 6.822544038859025e-05,
"loss": 0.1016,
"step": 6345
},
{
"epoch": 1.3576367064739157,
"grad_norm": 0.05425557430418602,
"learning_rate": 6.668354462307296e-05,
"loss": 0.1011,
"step": 6480
},
{
"epoch": 1.3859208045254556,
"grad_norm": 0.05874672603708157,
"learning_rate": 6.512358074638657e-05,
"loss": 0.1012,
"step": 6615
},
{
"epoch": 1.4142049025769956,
"grad_norm": 0.05823531384414933,
"learning_rate": 6.354723818373301e-05,
"loss": 0.1008,
"step": 6750
},
{
"epoch": 1.4424890006285356,
"grad_norm": 0.05608262826782312,
"learning_rate": 6.195622409826653e-05,
"loss": 0.1007,
"step": 6885
},
{
"epoch": 1.4707730986800756,
"grad_norm": 0.05408687704162592,
"learning_rate": 6.035226154225313e-05,
"loss": 0.1002,
"step": 7020
},
{
"epoch": 1.4990571967316153,
"grad_norm": 0.054980789427209784,
"learning_rate": 5.8737087591022275e-05,
"loss": 0.1004,
"step": 7155
},
{
"epoch": 1.5273412947831553,
"grad_norm": 0.05745041355343903,
"learning_rate": 5.7112451461731854e-05,
"loss": 0.0999,
"step": 7290
},
{
"epoch": 1.555625392834695,
"grad_norm": 0.05803905423764401,
"learning_rate": 5.5480112618983404e-05,
"loss": 0.0995,
"step": 7425
},
{
"epoch": 1.583909490886235,
"grad_norm": 0.056971104280436516,
"learning_rate": 5.384183886933983e-05,
"loss": 0.0997,
"step": 7560
},
{
"epoch": 1.612193588937775,
"grad_norm": 0.056543402577003486,
"learning_rate": 5.2199404446808475e-05,
"loss": 0.0988,
"step": 7695
},
{
"epoch": 1.640477686989315,
"grad_norm": 0.05414144319537392,
"learning_rate": 5.0554588091363683e-05,
"loss": 0.0988,
"step": 7830
},
{
"epoch": 1.668761785040855,
"grad_norm": 0.05756652870031753,
"learning_rate": 4.890917112258916e-05,
"loss": 0.0988,
"step": 7965
},
{
"epoch": 1.6970458830923947,
"grad_norm": 0.054317396460825465,
"learning_rate": 4.726493551052682e-05,
"loss": 0.0985,
"step": 8100
},
{
"epoch": 1.7253299811439347,
"grad_norm": 0.05780938044176143,
"learning_rate": 4.562366194582113e-05,
"loss": 0.0979,
"step": 8235
},
{
"epoch": 1.7536140791954744,
"grad_norm": 0.05615442700243257,
"learning_rate": 4.398712791124905e-05,
"loss": 0.0976,
"step": 8370
},
{
"epoch": 1.7818981772470144,
"grad_norm": 0.0550653325962579,
"learning_rate": 4.235710575672401e-05,
"loss": 0.0975,
"step": 8505
},
{
"epoch": 1.8101822752985544,
"grad_norm": 0.055514099512198385,
"learning_rate": 4.073536077985884e-05,
"loss": 0.0974,
"step": 8640
},
{
"epoch": 1.8384663733500943,
"grad_norm": 0.05542114420833896,
"learning_rate": 3.9123649314166065e-05,
"loss": 0.0968,
"step": 8775
},
{
"epoch": 1.8667504714016343,
"grad_norm": 0.05466973411282308,
"learning_rate": 3.752371682696652e-05,
"loss": 0.0966,
"step": 8910
},
{
"epoch": 1.895034569453174,
"grad_norm": 0.05615670182195563,
"learning_rate": 3.5937296029065625e-05,
"loss": 0.0967,
"step": 9045
},
{
"epoch": 1.923318667504714,
"grad_norm": 0.05525832187066413,
"learning_rate": 3.4366104998245154e-05,
"loss": 0.096,
"step": 9180
},
{
"epoch": 1.9516027655562538,
"grad_norm": 0.05493108241819906,
"learning_rate": 3.28118453186021e-05,
"loss": 0.0957,
"step": 9315
},
{
"epoch": 1.9798868636077938,
"grad_norm": 0.05505518318771863,
"learning_rate": 3.1276200237750355e-05,
"loss": 0.0955,
"step": 9450
},
{
"epoch": 2.0081709616593337,
"grad_norm": 0.05306378687028771,
"learning_rate": 2.976083284388031e-05,
"loss": 0.0906,
"step": 9585
},
{
"epoch": 2.0364550597108737,
"grad_norm": 0.05130179788880526,
"learning_rate": 2.8267384264651188e-05,
"loss": 0.0784,
"step": 9720
},
{
"epoch": 2.0647391577624137,
"grad_norm": 0.05335109388781897,
"learning_rate": 2.679747188986622e-05,
"loss": 0.0782,
"step": 9855
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.0513888628362633,
"learning_rate": 2.53526876198557e-05,
"loss": 0.0778,
"step": 9990
},
{
"epoch": 2.121307353865493,
"grad_norm": 0.055136967997034346,
"learning_rate": 2.3934596141465028e-05,
"loss": 0.0778,
"step": 10125
},
{
"epoch": 2.149591451917033,
"grad_norm": 0.05620080185680983,
"learning_rate": 2.254473323351446e-05,
"loss": 0.0777,
"step": 10260
},
{
"epoch": 2.177875549968573,
"grad_norm": 0.05464669812207657,
"learning_rate": 2.1184604103566198e-05,
"loss": 0.0774,
"step": 10395
},
{
"epoch": 2.206159648020113,
"grad_norm": 0.05398354993752342,
"learning_rate": 1.9855681757799664e-05,
"loss": 0.0774,
"step": 10530
},
{
"epoch": 2.234443746071653,
"grad_norm": 0.055897809997969714,
"learning_rate": 1.8559405405760584e-05,
"loss": 0.0772,
"step": 10665
},
{
"epoch": 2.262727844123193,
"grad_norm": 0.05732086913703312,
"learning_rate": 1.729717890171157e-05,
"loss": 0.0767,
"step": 10800
},
{
"epoch": 2.291011942174733,
"grad_norm": 0.0565406180756469,
"learning_rate": 1.607036922427203e-05,
"loss": 0.0765,
"step": 10935
},
{
"epoch": 2.3192960402262726,
"grad_norm": 0.057074660751889154,
"learning_rate": 1.4880304995994099e-05,
"loss": 0.0765,
"step": 11070
},
{
"epoch": 2.3475801382778125,
"grad_norm": 0.05609805473293312,
"learning_rate": 1.3728275044477673e-05,
"loss": 0.0762,
"step": 11205
},
{
"epoch": 2.3758642363293525,
"grad_norm": 0.05516011004588835,
"learning_rate": 1.2615527006583178e-05,
"loss": 0.0763,
"step": 11340
},
{
"epoch": 2.4041483343808925,
"grad_norm": 0.05722853773387842,
"learning_rate": 1.1543265977253332e-05,
"loss": 0.0762,
"step": 11475
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.0587961606821977,
"learning_rate": 1.0512653204407463e-05,
"loss": 0.0757,
"step": 11610
},
{
"epoch": 2.4607165304839724,
"grad_norm": 0.0569771841520313,
"learning_rate": 9.524804831321604e-06,
"loss": 0.0759,
"step": 11745
},
{
"epoch": 2.4890006285355124,
"grad_norm": 0.059353282770618576,
"learning_rate": 8.580790687856661e-06,
"loss": 0.0756,
"step": 11880
},
{
"epoch": 2.517284726587052,
"grad_norm": 0.05454050877182314,
"learning_rate": 7.68163313184333e-06,
"loss": 0.0754,
"step": 12015
},
{
"epoch": 2.5455688246385924,
"grad_norm": 0.05815652945221045,
"learning_rate": 6.828305941878904e-06,
"loss": 0.0752,
"step": 12150
},
{
"epoch": 2.573852922690132,
"grad_norm": 0.057648681945551325,
"learning_rate": 6.021733262734758e-06,
"loss": 0.075,
"step": 12285
},
{
"epoch": 2.602137020741672,
"grad_norm": 0.05540948539481964,
"learning_rate": 5.262788604516944e-06,
"loss": 0.075,
"step": 12420
},
{
"epoch": 2.630421118793212,
"grad_norm": 0.06718456332465579,
"learning_rate": 4.552293896663451e-06,
"loss": 0.075,
"step": 12555
},
{
"epoch": 2.658705216844752,
"grad_norm": 0.07105890518243332,
"learning_rate": 3.8910185978029314e-06,
"loss": 0.0748,
"step": 12690
},
{
"epoch": 2.686989314896292,
"grad_norm": 0.05567353880578137,
"learning_rate": 3.2796788624387066e-06,
"loss": 0.0748,
"step": 12825
},
{
"epoch": 2.7152734129478313,
"grad_norm": 0.05612203710501599,
"learning_rate": 2.71893676536063e-06,
"loss": 0.0749,
"step": 12960
},
{
"epoch": 2.7435575109993717,
"grad_norm": 0.05673870284934767,
"learning_rate": 2.209399584624794e-06,
"loss": 0.0748,
"step": 13095
},
{
"epoch": 2.7718416090509113,
"grad_norm": 0.05761628274356501,
"learning_rate": 1.7516191438774588e-06,
"loss": 0.0747,
"step": 13230
},
{
"epoch": 2.8001257071024512,
"grad_norm": 0.054886010400605305,
"learning_rate": 1.3460912147355787e-06,
"loss": 0.0746,
"step": 13365
},
{
"epoch": 2.828409805153991,
"grad_norm": 0.05340561070795086,
"learning_rate": 9.932549798711443e-07,
"loss": 0.0746,
"step": 13500
},
{
"epoch": 2.856693903205531,
"grad_norm": 0.059674346591722494,
"learning_rate": 6.934925573807704e-07,
"loss": 0.0747,
"step": 13635
},
{
"epoch": 2.884978001257071,
"grad_norm": 0.058825608417044505,
"learning_rate": 4.4712858695560856e-07,
"loss": 0.0742,
"step": 13770
},
{
"epoch": 2.9132620993086107,
"grad_norm": 0.05775417712295375,
"learning_rate": 2.5442987829985556e-07,
"loss": 0.0745,
"step": 13905
},
{
"epoch": 2.941546197360151,
"grad_norm": 0.056228990397294835,
"learning_rate": 1.1560512217849707e-07,
"loss": 0.0747,
"step": 14040
},
{
"epoch": 2.9698302954116906,
"grad_norm": 0.056212393072501816,
"learning_rate": 3.080466440732455e-08,
"loss": 0.0743,
"step": 14175
},
{
"epoch": 2.9981143934632306,
"grad_norm": 0.056538060201727615,
"learning_rate": 1.2034302991903445e-10,
"loss": 0.0745,
"step": 14310
}
],
"logging_steps": 135,
"max_steps": 14319,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 54000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.413317465141412e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}