codek-qwen2.5-coder-7b-lora-v2 / trainer_state.json
mechramc's picture
Upload folder using huggingface_hub
659a4df verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.276707530647986,
"eval_steps": 100,
"global_step": 1300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017513134851138354,
"grad_norm": 0.4135197103023529,
"learning_rate": 3.6e-05,
"loss": 0.8109177589416504,
"step": 10
},
{
"epoch": 0.03502626970227671,
"grad_norm": 0.5954136252403259,
"learning_rate": 7.6e-05,
"loss": 0.6212304115295411,
"step": 20
},
{
"epoch": 0.05253940455341506,
"grad_norm": 0.4027167856693268,
"learning_rate": 0.000116,
"loss": 0.44783411026000974,
"step": 30
},
{
"epoch": 0.07005253940455342,
"grad_norm": 0.47371360659599304,
"learning_rate": 0.00015600000000000002,
"loss": 0.3630207538604736,
"step": 40
},
{
"epoch": 0.08756567425569177,
"grad_norm": 0.48840901255607605,
"learning_rate": 0.000196,
"loss": 0.32424685955047605,
"step": 50
},
{
"epoch": 0.10507880910683012,
"grad_norm": 0.5532234311103821,
"learning_rate": 0.0001989176187612748,
"loss": 0.2953991413116455,
"step": 60
},
{
"epoch": 0.12259194395796848,
"grad_norm": 0.5430059432983398,
"learning_rate": 0.00019771497294046903,
"loss": 0.26429708003997804,
"step": 70
},
{
"epoch": 0.14010507880910683,
"grad_norm": 0.5477070212364197,
"learning_rate": 0.00019651232711966328,
"loss": 0.2550451040267944,
"step": 80
},
{
"epoch": 0.15761821366024517,
"grad_norm": 0.37017086148262024,
"learning_rate": 0.00019530968129885748,
"loss": 0.23371753692626954,
"step": 90
},
{
"epoch": 0.17513134851138354,
"grad_norm": 0.38276150822639465,
"learning_rate": 0.0001941070354780517,
"loss": 0.2195589542388916,
"step": 100
},
{
"epoch": 0.17513134851138354,
"eval_loss": 0.23231205344200134,
"eval_runtime": 169.8531,
"eval_samples_per_second": 2.991,
"eval_steps_per_second": 0.748,
"step": 100
},
{
"epoch": 0.19264448336252188,
"grad_norm": 0.406323105096817,
"learning_rate": 0.00019290438965724596,
"loss": 0.2108442783355713,
"step": 110
},
{
"epoch": 0.21015761821366025,
"grad_norm": 0.47465822100639343,
"learning_rate": 0.00019170174383644018,
"loss": 0.2249575138092041,
"step": 120
},
{
"epoch": 0.2276707530647986,
"grad_norm": 0.35268914699554443,
"learning_rate": 0.0001904990980156344,
"loss": 0.16998076438903809,
"step": 130
},
{
"epoch": 0.24518388791593695,
"grad_norm": 0.31479501724243164,
"learning_rate": 0.00018929645219482863,
"loss": 0.1623205780982971,
"step": 140
},
{
"epoch": 0.2626970227670753,
"grad_norm": 0.3868594467639923,
"learning_rate": 0.00018809380637402286,
"loss": 0.16868008375167848,
"step": 150
},
{
"epoch": 0.28021015761821366,
"grad_norm": 0.4887761175632477,
"learning_rate": 0.00018689116055321708,
"loss": 0.1882340431213379,
"step": 160
},
{
"epoch": 0.29772329246935203,
"grad_norm": 0.39412927627563477,
"learning_rate": 0.0001856885147324113,
"loss": 0.15920686721801758,
"step": 170
},
{
"epoch": 0.31523642732049034,
"grad_norm": 0.41622865200042725,
"learning_rate": 0.00018448586891160553,
"loss": 0.16607775688171386,
"step": 180
},
{
"epoch": 0.3327495621716287,
"grad_norm": 0.4045696258544922,
"learning_rate": 0.00018328322309079978,
"loss": 0.158127498626709,
"step": 190
},
{
"epoch": 0.3502626970227671,
"grad_norm": 0.3789847493171692,
"learning_rate": 0.00018208057726999398,
"loss": 0.14632443189620972,
"step": 200
},
{
"epoch": 0.3502626970227671,
"eval_loss": 0.1353635936975479,
"eval_runtime": 171.8534,
"eval_samples_per_second": 2.956,
"eval_steps_per_second": 0.739,
"step": 200
},
{
"epoch": 0.36777583187390545,
"grad_norm": 0.41194388270378113,
"learning_rate": 0.00018087793144918823,
"loss": 0.1293831706047058,
"step": 210
},
{
"epoch": 0.38528896672504376,
"grad_norm": 0.35434651374816895,
"learning_rate": 0.00017967528562838245,
"loss": 0.13147668838500975,
"step": 220
},
{
"epoch": 0.4028021015761821,
"grad_norm": 0.3050230145454407,
"learning_rate": 0.00017847263980757668,
"loss": 0.12810969352722168,
"step": 230
},
{
"epoch": 0.4203152364273205,
"grad_norm": 0.29852065443992615,
"learning_rate": 0.0001772699939867709,
"loss": 0.13389307260513306,
"step": 240
},
{
"epoch": 0.43782837127845886,
"grad_norm": 0.3992239832878113,
"learning_rate": 0.00017606734816596513,
"loss": 0.11474900245666504,
"step": 250
},
{
"epoch": 0.4553415061295972,
"grad_norm": 0.323345422744751,
"learning_rate": 0.00017486470234515935,
"loss": 0.11180757284164429,
"step": 260
},
{
"epoch": 0.47285464098073554,
"grad_norm": 0.3820851147174835,
"learning_rate": 0.00017366205652435358,
"loss": 0.10637552738189697,
"step": 270
},
{
"epoch": 0.4903677758318739,
"grad_norm": 0.3785695433616638,
"learning_rate": 0.0001724594107035478,
"loss": 0.11243565082550049,
"step": 280
},
{
"epoch": 0.5078809106830122,
"grad_norm": 0.34767481684684753,
"learning_rate": 0.00017125676488274205,
"loss": 0.11057982444763184,
"step": 290
},
{
"epoch": 0.5253940455341506,
"grad_norm": 0.32242536544799805,
"learning_rate": 0.00017005411906193628,
"loss": 0.09878214001655579,
"step": 300
},
{
"epoch": 0.5253940455341506,
"eval_loss": 0.10276732593774796,
"eval_runtime": 170.0789,
"eval_samples_per_second": 2.987,
"eval_steps_per_second": 0.747,
"step": 300
},
{
"epoch": 0.542907180385289,
"grad_norm": 0.3188435435295105,
"learning_rate": 0.00016885147324113047,
"loss": 0.08771577477455139,
"step": 310
},
{
"epoch": 0.5604203152364273,
"grad_norm": 0.2941615879535675,
"learning_rate": 0.00016764882742032473,
"loss": 0.08557047247886658,
"step": 320
},
{
"epoch": 0.5779334500875657,
"grad_norm": 0.2936120927333832,
"learning_rate": 0.00016644618159951895,
"loss": 0.08636216521263122,
"step": 330
},
{
"epoch": 0.5954465849387041,
"grad_norm": 0.21349965035915375,
"learning_rate": 0.0001652435357787132,
"loss": 0.08149101734161376,
"step": 340
},
{
"epoch": 0.6129597197898424,
"grad_norm": 0.2442740797996521,
"learning_rate": 0.0001640408899579074,
"loss": 0.08436259627342224,
"step": 350
},
{
"epoch": 0.6304728546409807,
"grad_norm": 0.3144635856151581,
"learning_rate": 0.00016283824413710162,
"loss": 0.0912843644618988,
"step": 360
},
{
"epoch": 0.647985989492119,
"grad_norm": 0.18774041533470154,
"learning_rate": 0.00016163559831629587,
"loss": 0.08484984040260315,
"step": 370
},
{
"epoch": 0.6654991243432574,
"grad_norm": 0.3200187385082245,
"learning_rate": 0.0001604329524954901,
"loss": 0.08420997262001037,
"step": 380
},
{
"epoch": 0.6830122591943958,
"grad_norm": 0.20744681358337402,
"learning_rate": 0.0001592303066746843,
"loss": 0.07883568406105042,
"step": 390
},
{
"epoch": 0.7005253940455342,
"grad_norm": 0.49990326166152954,
"learning_rate": 0.00015802766085387855,
"loss": 0.07491461634635925,
"step": 400
},
{
"epoch": 0.7005253940455342,
"eval_loss": 0.08543122559785843,
"eval_runtime": 169.5964,
"eval_samples_per_second": 2.995,
"eval_steps_per_second": 0.749,
"step": 400
},
{
"epoch": 0.7180385288966725,
"grad_norm": 0.21963991224765778,
"learning_rate": 0.00015682501503307277,
"loss": 0.07940490245819092,
"step": 410
},
{
"epoch": 0.7355516637478109,
"grad_norm": 0.282270610332489,
"learning_rate": 0.000155622369212267,
"loss": 0.08389427065849304,
"step": 420
},
{
"epoch": 0.7530647985989493,
"grad_norm": 0.19522342085838318,
"learning_rate": 0.00015441972339146122,
"loss": 0.07796943187713623,
"step": 430
},
{
"epoch": 0.7705779334500875,
"grad_norm": 0.20144295692443848,
"learning_rate": 0.00015321707757065545,
"loss": 0.08569519519805908,
"step": 440
},
{
"epoch": 0.7880910683012259,
"grad_norm": 0.31299343705177307,
"learning_rate": 0.0001520144317498497,
"loss": 0.07234247326850891,
"step": 450
},
{
"epoch": 0.8056042031523643,
"grad_norm": 0.22233198583126068,
"learning_rate": 0.0001508117859290439,
"loss": 0.06918607354164123,
"step": 460
},
{
"epoch": 0.8231173380035026,
"grad_norm": 0.3281087577342987,
"learning_rate": 0.00014960914010823812,
"loss": 0.06424351334571839,
"step": 470
},
{
"epoch": 0.840630472854641,
"grad_norm": 0.23634330928325653,
"learning_rate": 0.00014840649428743237,
"loss": 0.07089964151382447,
"step": 480
},
{
"epoch": 0.8581436077057794,
"grad_norm": 0.24085308611392975,
"learning_rate": 0.0001472038484666266,
"loss": 0.07725317478179931,
"step": 490
},
{
"epoch": 0.8756567425569177,
"grad_norm": 0.2506239712238312,
"learning_rate": 0.00014600120264582082,
"loss": 0.07955536246299744,
"step": 500
},
{
"epoch": 0.8756567425569177,
"eval_loss": 0.07601634413003922,
"eval_runtime": 170.4186,
"eval_samples_per_second": 2.981,
"eval_steps_per_second": 0.745,
"step": 500
},
{
"epoch": 0.8931698774080561,
"grad_norm": 0.30001509189605713,
"learning_rate": 0.00014479855682501504,
"loss": 0.06071768999099732,
"step": 510
},
{
"epoch": 0.9106830122591943,
"grad_norm": 0.1644354909658432,
"learning_rate": 0.00014359591100420927,
"loss": 0.07156956791877747,
"step": 520
},
{
"epoch": 0.9281961471103327,
"grad_norm": 0.2289579063653946,
"learning_rate": 0.0001423932651834035,
"loss": 0.07050368785858155,
"step": 530
},
{
"epoch": 0.9457092819614711,
"grad_norm": 0.3195700943470001,
"learning_rate": 0.00014119061936259772,
"loss": 0.06230233311653137,
"step": 540
},
{
"epoch": 0.9632224168126094,
"grad_norm": 0.15884605050086975,
"learning_rate": 0.00013998797354179194,
"loss": 0.06492781639099121,
"step": 550
},
{
"epoch": 0.9807355516637478,
"grad_norm": 0.17338015139102936,
"learning_rate": 0.0001387853277209862,
"loss": 0.07274928689002991,
"step": 560
},
{
"epoch": 0.9982486865148862,
"grad_norm": 0.18797871470451355,
"learning_rate": 0.0001375826819001804,
"loss": 0.07553291320800781,
"step": 570
},
{
"epoch": 1.0157618213660244,
"grad_norm": 0.14001163840293884,
"learning_rate": 0.00013638003607937464,
"loss": 0.04513072073459625,
"step": 580
},
{
"epoch": 1.0332749562171628,
"grad_norm": 0.25820890069007874,
"learning_rate": 0.00013517739025856887,
"loss": 0.05151134729385376,
"step": 590
},
{
"epoch": 1.0507880910683012,
"grad_norm": 0.2387373149394989,
"learning_rate": 0.0001339747444377631,
"loss": 0.05233837962150574,
"step": 600
},
{
"epoch": 1.0507880910683012,
"eval_loss": 0.07496609538793564,
"eval_runtime": 169.9129,
"eval_samples_per_second": 2.99,
"eval_steps_per_second": 0.747,
"step": 600
},
{
"epoch": 1.0683012259194395,
"grad_norm": 0.21280422806739807,
"learning_rate": 0.00013277209861695731,
"loss": 0.04595586657524109,
"step": 610
},
{
"epoch": 1.085814360770578,
"grad_norm": 0.2865266799926758,
"learning_rate": 0.00013156945279615154,
"loss": 0.04963254630565643,
"step": 620
},
{
"epoch": 1.1033274956217163,
"grad_norm": 0.19880151748657227,
"learning_rate": 0.00013036680697534576,
"loss": 0.05288234353065491,
"step": 630
},
{
"epoch": 1.1208406304728546,
"grad_norm": 0.25318190455436707,
"learning_rate": 0.00012916416115454,
"loss": 0.04070430099964142,
"step": 640
},
{
"epoch": 1.138353765323993,
"grad_norm": 0.2229541689157486,
"learning_rate": 0.0001279615153337342,
"loss": 0.04462625682353973,
"step": 650
},
{
"epoch": 1.1558669001751314,
"grad_norm": 0.15195652842521667,
"learning_rate": 0.00012675886951292846,
"loss": 0.04568430483341217,
"step": 660
},
{
"epoch": 1.1733800350262698,
"grad_norm": 0.2872307300567627,
"learning_rate": 0.0001255562236921227,
"loss": 0.04056203365325928,
"step": 670
},
{
"epoch": 1.1908931698774081,
"grad_norm": 0.30495700240135193,
"learning_rate": 0.00012435357787131689,
"loss": 0.047316303849220274,
"step": 680
},
{
"epoch": 1.2084063047285465,
"grad_norm": 0.1586247980594635,
"learning_rate": 0.00012315093205051114,
"loss": 0.044099316000938416,
"step": 690
},
{
"epoch": 1.2259194395796849,
"grad_norm": 0.19665417075157166,
"learning_rate": 0.00012194828622970536,
"loss": 0.04525145888328552,
"step": 700
},
{
"epoch": 1.2259194395796849,
"eval_loss": 0.07472622394561768,
"eval_runtime": 169.568,
"eval_samples_per_second": 2.996,
"eval_steps_per_second": 0.749,
"step": 700
},
{
"epoch": 1.2434325744308232,
"grad_norm": 0.21693575382232666,
"learning_rate": 0.00012074564040889957,
"loss": 0.04104744493961334,
"step": 710
},
{
"epoch": 1.2609457092819616,
"grad_norm": 0.24825339019298553,
"learning_rate": 0.00011954299458809381,
"loss": 0.0438425600528717,
"step": 720
},
{
"epoch": 1.2784588441331,
"grad_norm": 0.18047627806663513,
"learning_rate": 0.00011834034876728803,
"loss": 0.047738096117973326,
"step": 730
},
{
"epoch": 1.295971978984238,
"grad_norm": 0.19772164523601532,
"learning_rate": 0.00011713770294648227,
"loss": 0.04714350998401642,
"step": 740
},
{
"epoch": 1.3134851138353765,
"grad_norm": 0.22316114604473114,
"learning_rate": 0.0001159350571256765,
"loss": 0.04388459920883179,
"step": 750
},
{
"epoch": 1.3309982486865148,
"grad_norm": 0.1677238643169403,
"learning_rate": 0.00011473241130487071,
"loss": 0.04296576082706451,
"step": 760
},
{
"epoch": 1.3485113835376532,
"grad_norm": 0.2544882595539093,
"learning_rate": 0.00011352976548406496,
"loss": 0.037767985463142396,
"step": 770
},
{
"epoch": 1.3660245183887916,
"grad_norm": 0.17373642325401306,
"learning_rate": 0.00011232711966325917,
"loss": 0.04673008918762207,
"step": 780
},
{
"epoch": 1.38353765323993,
"grad_norm": 0.23099961876869202,
"learning_rate": 0.00011112447384245341,
"loss": 0.04906592071056366,
"step": 790
},
{
"epoch": 1.4010507880910683,
"grad_norm": 0.2572455406188965,
"learning_rate": 0.00010992182802164763,
"loss": 0.04228177070617676,
"step": 800
},
{
"epoch": 1.4010507880910683,
"eval_loss": 0.07377293705940247,
"eval_runtime": 169.6978,
"eval_samples_per_second": 2.994,
"eval_steps_per_second": 0.748,
"step": 800
},
{
"epoch": 1.4185639229422067,
"grad_norm": 0.1933060735464096,
"learning_rate": 0.00010871918220084186,
"loss": 0.039757218956947324,
"step": 810
},
{
"epoch": 1.436077057793345,
"grad_norm": 0.21861182153224945,
"learning_rate": 0.0001075165363800361,
"loss": 0.04450837075710297,
"step": 820
},
{
"epoch": 1.4535901926444834,
"grad_norm": 0.27015894651412964,
"learning_rate": 0.0001063138905592303,
"loss": 0.04501202404499054,
"step": 830
},
{
"epoch": 1.4711033274956218,
"grad_norm": 0.15882235765457153,
"learning_rate": 0.00010511124473842453,
"loss": 0.040595722198486325,
"step": 840
},
{
"epoch": 1.4886164623467601,
"grad_norm": 0.22079160809516907,
"learning_rate": 0.00010390859891761877,
"loss": 0.04613872766494751,
"step": 850
},
{
"epoch": 1.5061295971978983,
"grad_norm": 0.26043882966041565,
"learning_rate": 0.00010270595309681299,
"loss": 0.052975207567214966,
"step": 860
},
{
"epoch": 1.5236427320490367,
"grad_norm": 0.1896980255842209,
"learning_rate": 0.00010150330727600723,
"loss": 0.04145742654800415,
"step": 870
},
{
"epoch": 1.541155866900175,
"grad_norm": 0.17354312539100647,
"learning_rate": 0.00010030066145520146,
"loss": 0.04943464994430542,
"step": 880
},
{
"epoch": 1.5586690017513134,
"grad_norm": 0.14007078111171722,
"learning_rate": 9.909801563439568e-05,
"loss": 0.04217578768730164,
"step": 890
},
{
"epoch": 1.5761821366024518,
"grad_norm": 0.20131802558898926,
"learning_rate": 9.78953698135899e-05,
"loss": 0.041672542691230774,
"step": 900
},
{
"epoch": 1.5761821366024518,
"eval_loss": 0.07051914185285568,
"eval_runtime": 169.9735,
"eval_samples_per_second": 2.989,
"eval_steps_per_second": 0.747,
"step": 900
},
{
"epoch": 1.5936952714535901,
"grad_norm": 0.22193501889705658,
"learning_rate": 9.669272399278413e-05,
"loss": 0.04524196684360504,
"step": 910
},
{
"epoch": 1.6112084063047285,
"grad_norm": 0.23595920205116272,
"learning_rate": 9.549007817197835e-05,
"loss": 0.04126276075839996,
"step": 920
},
{
"epoch": 1.6287215411558669,
"grad_norm": 0.2922545373439789,
"learning_rate": 9.428743235117259e-05,
"loss": 0.04022812843322754,
"step": 930
},
{
"epoch": 1.6462346760070052,
"grad_norm": 0.23278813064098358,
"learning_rate": 9.30847865303668e-05,
"loss": 0.04213928878307342,
"step": 940
},
{
"epoch": 1.6637478108581436,
"grad_norm": 0.14974910020828247,
"learning_rate": 9.188214070956104e-05,
"loss": 0.0363939642906189,
"step": 950
},
{
"epoch": 1.681260945709282,
"grad_norm": 0.1183304563164711,
"learning_rate": 9.067949488875526e-05,
"loss": 0.04207303524017334,
"step": 960
},
{
"epoch": 1.6987740805604203,
"grad_norm": 0.23170360922813416,
"learning_rate": 8.94768490679495e-05,
"loss": 0.042323988676071164,
"step": 970
},
{
"epoch": 1.7162872154115587,
"grad_norm": 0.14556758105754852,
"learning_rate": 8.827420324714371e-05,
"loss": 0.042339283227920535,
"step": 980
},
{
"epoch": 1.733800350262697,
"grad_norm": 0.1421191394329071,
"learning_rate": 8.707155742633795e-05,
"loss": 0.04450683891773224,
"step": 990
},
{
"epoch": 1.7513134851138354,
"grad_norm": 0.31845614314079285,
"learning_rate": 8.586891160553218e-05,
"loss": 0.042928069829940796,
"step": 1000
},
{
"epoch": 1.7513134851138354,
"eval_loss": 0.0688522532582283,
"eval_runtime": 169.5678,
"eval_samples_per_second": 2.996,
"eval_steps_per_second": 0.749,
"step": 1000
},
{
"epoch": 1.7688266199649738,
"grad_norm": 0.1398610770702362,
"learning_rate": 8.46662657847264e-05,
"loss": 0.042378559708595276,
"step": 1010
},
{
"epoch": 1.7863397548161122,
"grad_norm": 0.18888983130455017,
"learning_rate": 8.346361996392062e-05,
"loss": 0.044092172384262086,
"step": 1020
},
{
"epoch": 1.8038528896672505,
"grad_norm": 0.192138671875,
"learning_rate": 8.226097414311485e-05,
"loss": 0.03955377042293549,
"step": 1030
},
{
"epoch": 1.821366024518389,
"grad_norm": 0.2001374512910843,
"learning_rate": 8.105832832230909e-05,
"loss": 0.04774285852909088,
"step": 1040
},
{
"epoch": 1.8388791593695273,
"grad_norm": 0.24916240572929382,
"learning_rate": 7.985568250150331e-05,
"loss": 0.044192954897880554,
"step": 1050
},
{
"epoch": 1.8563922942206657,
"grad_norm": 0.21104031801223755,
"learning_rate": 7.865303668069754e-05,
"loss": 0.0387516975402832,
"step": 1060
},
{
"epoch": 1.873905429071804,
"grad_norm": 0.27948206663131714,
"learning_rate": 7.745039085989176e-05,
"loss": 0.042763397097587585,
"step": 1070
},
{
"epoch": 1.8914185639229422,
"grad_norm": 0.21115849912166595,
"learning_rate": 7.6247745039086e-05,
"loss": 0.03943166434764862,
"step": 1080
},
{
"epoch": 1.9089316987740805,
"grad_norm": 0.24164821207523346,
"learning_rate": 7.504509921828022e-05,
"loss": 0.04395500421524048,
"step": 1090
},
{
"epoch": 1.926444833625219,
"grad_norm": 0.14232757687568665,
"learning_rate": 7.384245339747445e-05,
"loss": 0.03802197575569153,
"step": 1100
},
{
"epoch": 1.926444833625219,
"eval_loss": 0.0663708746433258,
"eval_runtime": 170.0427,
"eval_samples_per_second": 2.987,
"eval_steps_per_second": 0.747,
"step": 1100
},
{
"epoch": 1.9439579684763573,
"grad_norm": 0.20456406474113464,
"learning_rate": 7.263980757666867e-05,
"loss": 0.04351660311222076,
"step": 1110
},
{
"epoch": 1.9614711033274956,
"grad_norm": 0.28461146354675293,
"learning_rate": 7.14371617558629e-05,
"loss": 0.04411421418190002,
"step": 1120
},
{
"epoch": 1.978984238178634,
"grad_norm": 0.33428093791007996,
"learning_rate": 7.023451593505713e-05,
"loss": 0.04533115029335022,
"step": 1130
},
{
"epoch": 1.9964973730297724,
"grad_norm": 0.2965065538883209,
"learning_rate": 6.903187011425134e-05,
"loss": 0.04683744609355926,
"step": 1140
},
{
"epoch": 2.0140105078809105,
"grad_norm": 0.13189074397087097,
"learning_rate": 6.782922429344558e-05,
"loss": 0.024469637870788576,
"step": 1150
},
{
"epoch": 2.031523642732049,
"grad_norm": 0.26192790269851685,
"learning_rate": 6.662657847263981e-05,
"loss": 0.020343032479286195,
"step": 1160
},
{
"epoch": 2.0490367775831873,
"grad_norm": 0.17017051577568054,
"learning_rate": 6.542393265183405e-05,
"loss": 0.023167347908020018,
"step": 1170
},
{
"epoch": 2.0665499124343256,
"grad_norm": 0.23270311951637268,
"learning_rate": 6.422128683102826e-05,
"loss": 0.019265547394752502,
"step": 1180
},
{
"epoch": 2.084063047285464,
"grad_norm": 0.17566721141338348,
"learning_rate": 6.30186410102225e-05,
"loss": 0.020077353715896605,
"step": 1190
},
{
"epoch": 2.1015761821366024,
"grad_norm": 0.21460862457752228,
"learning_rate": 6.181599518941672e-05,
"loss": 0.020433691143989564,
"step": 1200
},
{
"epoch": 2.1015761821366024,
"eval_loss": 0.0755230188369751,
"eval_runtime": 169.6234,
"eval_samples_per_second": 2.995,
"eval_steps_per_second": 0.749,
"step": 1200
},
{
"epoch": 2.1190893169877407,
"grad_norm": 0.19966909289360046,
"learning_rate": 6.061334936861095e-05,
"loss": 0.019319312274456026,
"step": 1210
},
{
"epoch": 2.136602451838879,
"grad_norm": 0.19373339414596558,
"learning_rate": 5.941070354780517e-05,
"loss": 0.022010722756385805,
"step": 1220
},
{
"epoch": 2.1541155866900175,
"grad_norm": 0.19323857128620148,
"learning_rate": 5.82080577269994e-05,
"loss": 0.021162202954292296,
"step": 1230
},
{
"epoch": 2.171628721541156,
"grad_norm": 0.16135787963867188,
"learning_rate": 5.700541190619363e-05,
"loss": 0.02209024876356125,
"step": 1240
},
{
"epoch": 2.189141856392294,
"grad_norm": 0.1409604251384735,
"learning_rate": 5.580276608538786e-05,
"loss": 0.020828820765018463,
"step": 1250
},
{
"epoch": 2.2066549912434326,
"grad_norm": 0.15199248492717743,
"learning_rate": 5.460012026458209e-05,
"loss": 0.019746646285057068,
"step": 1260
},
{
"epoch": 2.224168126094571,
"grad_norm": 0.1164596751332283,
"learning_rate": 5.339747444377631e-05,
"loss": 0.02107318639755249,
"step": 1270
},
{
"epoch": 2.2416812609457093,
"grad_norm": 0.14257144927978516,
"learning_rate": 5.219482862297054e-05,
"loss": 0.018259820342063905,
"step": 1280
},
{
"epoch": 2.2591943957968477,
"grad_norm": 0.1540592759847641,
"learning_rate": 5.0992182802164765e-05,
"loss": 0.0190964937210083,
"step": 1290
},
{
"epoch": 2.276707530647986,
"grad_norm": 0.2179027795791626,
"learning_rate": 4.978953698135899e-05,
"loss": 0.020862923562526704,
"step": 1300
},
{
"epoch": 2.276707530647986,
"eval_loss": 0.0765165463089943,
"eval_runtime": 170.3828,
"eval_samples_per_second": 2.982,
"eval_steps_per_second": 0.745,
"step": 1300
}
],
"logging_steps": 10,
"max_steps": 1713,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.0067417630582374e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}