BRlkl's picture
Push best model used for final benchmarks
c668209 verified
{
"best_global_step": 6586,
"best_metric": 0.847490661036219,
"best_model_checkpoint": "outputs/final-run/checkpoint-6586",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 6586,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015186028853454821,
"grad_norm": 2.3480491638183594,
"learning_rate": 9.999658552822536e-06,
"loss": 0.6629,
"step": 50
},
{
"epoch": 0.030372057706909643,
"grad_norm": 3.3713912963867188,
"learning_rate": 9.998606244733398e-06,
"loss": 0.5702,
"step": 100
},
{
"epoch": 0.04555808656036447,
"grad_norm": 5.710732460021973,
"learning_rate": 9.996843083169648e-06,
"loss": 0.5245,
"step": 150
},
{
"epoch": 0.060744115413819286,
"grad_norm": 5.55316686630249,
"learning_rate": 9.994369318871088e-06,
"loss": 0.4684,
"step": 200
},
{
"epoch": 0.07593014426727411,
"grad_norm": 8.149177551269531,
"learning_rate": 9.991185303632574e-06,
"loss": 0.4783,
"step": 250
},
{
"epoch": 0.09111617312072894,
"grad_norm": 7.209503650665283,
"learning_rate": 9.987291490253976e-06,
"loss": 0.4589,
"step": 300
},
{
"epoch": 0.10630220197418375,
"grad_norm": 7.983670234680176,
"learning_rate": 9.98268843247581e-06,
"loss": 0.4623,
"step": 350
},
{
"epoch": 0.12148823082763857,
"grad_norm": 4.2466206550598145,
"learning_rate": 9.977376784900465e-06,
"loss": 0.4694,
"step": 400
},
{
"epoch": 0.1366742596810934,
"grad_norm": 6.11989164352417,
"learning_rate": 9.971357302899133e-06,
"loss": 0.4965,
"step": 450
},
{
"epoch": 0.15186028853454822,
"grad_norm": 4.606278896331787,
"learning_rate": 9.964630842504372e-06,
"loss": 0.4919,
"step": 500
},
{
"epoch": 0.16704631738800305,
"grad_norm": 5.91365909576416,
"learning_rate": 9.957198360288374e-06,
"loss": 0.4536,
"step": 550
},
{
"epoch": 0.18223234624145787,
"grad_norm": 5.541100025177002,
"learning_rate": 9.949060913226936e-06,
"loss": 0.4719,
"step": 600
},
{
"epoch": 0.19741837509491267,
"grad_norm": 4.289076805114746,
"learning_rate": 9.94021965854914e-06,
"loss": 0.4136,
"step": 650
},
{
"epoch": 0.2126044039483675,
"grad_norm": 14.322737693786621,
"learning_rate": 9.930675853572787e-06,
"loss": 0.4705,
"step": 700
},
{
"epoch": 0.22779043280182232,
"grad_norm": 6.375570297241211,
"learning_rate": 9.920430855525589e-06,
"loss": 0.4701,
"step": 750
},
{
"epoch": 0.24297646165527714,
"grad_norm": 3.261223316192627,
"learning_rate": 9.909486121352163e-06,
"loss": 0.4528,
"step": 800
},
{
"epoch": 0.25816249050873197,
"grad_norm": 3.854436159133911,
"learning_rate": 9.89784320750684e-06,
"loss": 0.4265,
"step": 850
},
{
"epoch": 0.2733485193621868,
"grad_norm": 10.977453231811523,
"learning_rate": 9.885503769732304e-06,
"loss": 0.4329,
"step": 900
},
{
"epoch": 0.2885345482156416,
"grad_norm": 6.016301155090332,
"learning_rate": 9.872469562824157e-06,
"loss": 0.4147,
"step": 950
},
{
"epoch": 0.30372057706909644,
"grad_norm": 5.806178569793701,
"learning_rate": 9.858742440381343e-06,
"loss": 0.4718,
"step": 1000
},
{
"epoch": 0.31890660592255127,
"grad_norm": 6.394021987915039,
"learning_rate": 9.844324354542558e-06,
"loss": 0.3912,
"step": 1050
},
{
"epoch": 0.3340926347760061,
"grad_norm": 7.948565483093262,
"learning_rate": 9.82921735570864e-06,
"loss": 0.4223,
"step": 1100
},
{
"epoch": 0.3492786636294609,
"grad_norm": 5.151625156402588,
"learning_rate": 9.813423592250969e-06,
"loss": 0.4079,
"step": 1150
},
{
"epoch": 0.36446469248291574,
"grad_norm": 6.724940299987793,
"learning_rate": 9.796945310205958e-06,
"loss": 0.4306,
"step": 1200
},
{
"epoch": 0.37965072133637057,
"grad_norm": 3.623936176300049,
"learning_rate": 9.779784852955636e-06,
"loss": 0.438,
"step": 1250
},
{
"epoch": 0.39483675018982534,
"grad_norm": 6.648692607879639,
"learning_rate": 9.761944660894397e-06,
"loss": 0.4515,
"step": 1300
},
{
"epoch": 0.41002277904328016,
"grad_norm": 5.989009380340576,
"learning_rate": 9.743427271081954e-06,
"loss": 0.3911,
"step": 1350
},
{
"epoch": 0.425208807896735,
"grad_norm": 3.3359053134918213,
"learning_rate": 9.724235316882537e-06,
"loss": 0.4454,
"step": 1400
},
{
"epoch": 0.4403948367501898,
"grad_norm": 11.017061233520508,
"learning_rate": 9.704371527590404e-06,
"loss": 0.4022,
"step": 1450
},
{
"epoch": 0.45558086560364464,
"grad_norm": 7.159852981567383,
"learning_rate": 9.68383872804171e-06,
"loss": 0.4464,
"step": 1500
},
{
"epoch": 0.47076689445709946,
"grad_norm": 4.259323596954346,
"learning_rate": 9.662639838212781e-06,
"loss": 0.3829,
"step": 1550
},
{
"epoch": 0.4859529233105543,
"grad_norm": 7.917912483215332,
"learning_rate": 9.640777872804868e-06,
"loss": 0.4186,
"step": 1600
},
{
"epoch": 0.5011389521640092,
"grad_norm": 8.7236967086792,
"learning_rate": 9.61825594081542e-06,
"loss": 0.3766,
"step": 1650
},
{
"epoch": 0.5163249810174639,
"grad_norm": 4.761518478393555,
"learning_rate": 9.595077245095959e-06,
"loss": 0.4057,
"step": 1700
},
{
"epoch": 0.5315110098709187,
"grad_norm": 2.5256729125976562,
"learning_rate": 9.571245081896594e-06,
"loss": 0.4321,
"step": 1750
},
{
"epoch": 0.5466970387243736,
"grad_norm": 9.82975959777832,
"learning_rate": 9.546762840397268e-06,
"loss": 0.4067,
"step": 1800
},
{
"epoch": 0.5618830675778284,
"grad_norm": 4.607714653015137,
"learning_rate": 9.521634002225774e-06,
"loss": 0.3834,
"step": 1850
},
{
"epoch": 0.5770690964312832,
"grad_norm": 8.330415725708008,
"learning_rate": 9.495862140962638e-06,
"loss": 0.374,
"step": 1900
},
{
"epoch": 0.592255125284738,
"grad_norm": 5.7992634773254395,
"learning_rate": 9.469450921632912e-06,
"loss": 0.3852,
"step": 1950
},
{
"epoch": 0.6074411541381929,
"grad_norm": 5.298435211181641,
"learning_rate": 9.44240410018498e-06,
"loss": 0.4345,
"step": 2000
},
{
"epoch": 0.6226271829916477,
"grad_norm": 6.483381271362305,
"learning_rate": 9.414725522956414e-06,
"loss": 0.407,
"step": 2050
},
{
"epoch": 0.6378132118451025,
"grad_norm": 5.179783821105957,
"learning_rate": 9.386419126126983e-06,
"loss": 0.432,
"step": 2100
},
{
"epoch": 0.6529992406985573,
"grad_norm": 5.316011428833008,
"learning_rate": 9.357488935158897e-06,
"loss": 0.4071,
"step": 2150
},
{
"epoch": 0.6681852695520122,
"grad_norm": 10.58410930633545,
"learning_rate": 9.327939064224346e-06,
"loss": 0.3772,
"step": 2200
},
{
"epoch": 0.683371298405467,
"grad_norm": 4.013734817504883,
"learning_rate": 9.297773715620406e-06,
"loss": 0.4064,
"step": 2250
},
{
"epoch": 0.6985573272589218,
"grad_norm": 9.252372741699219,
"learning_rate": 9.266997179171442e-06,
"loss": 0.3911,
"step": 2300
},
{
"epoch": 0.7137433561123766,
"grad_norm": 8.192291259765625,
"learning_rate": 9.235613831619052e-06,
"loss": 0.3816,
"step": 2350
},
{
"epoch": 0.7289293849658315,
"grad_norm": 4.068896770477295,
"learning_rate": 9.203628135999643e-06,
"loss": 0.4304,
"step": 2400
},
{
"epoch": 0.7441154138192863,
"grad_norm": 2.9444737434387207,
"learning_rate": 9.171044641009741e-06,
"loss": 0.4231,
"step": 2450
},
{
"epoch": 0.7593014426727411,
"grad_norm": 4.700106620788574,
"learning_rate": 9.137867980359126e-06,
"loss": 0.3982,
"step": 2500
},
{
"epoch": 0.7744874715261959,
"grad_norm": 14.975322723388672,
"learning_rate": 9.104102872111858e-06,
"loss": 0.4241,
"step": 2550
},
{
"epoch": 0.7896735003796507,
"grad_norm": 4.325404644012451,
"learning_rate": 9.069754118015339e-06,
"loss": 0.3725,
"step": 2600
},
{
"epoch": 0.8048595292331056,
"grad_norm": 3.829643964767456,
"learning_rate": 9.034826602817433e-06,
"loss": 0.4048,
"step": 2650
},
{
"epoch": 0.8200455580865603,
"grad_norm": 6.086367607116699,
"learning_rate": 8.99932529357182e-06,
"loss": 0.4333,
"step": 2700
},
{
"epoch": 0.8352315869400152,
"grad_norm": 4.058459758758545,
"learning_rate": 8.963255238931623e-06,
"loss": 0.4004,
"step": 2750
},
{
"epoch": 0.85041761579347,
"grad_norm": 4.049592971801758,
"learning_rate": 8.926621568431442e-06,
"loss": 0.4126,
"step": 2800
},
{
"epoch": 0.8656036446469249,
"grad_norm": 3.434569835662842,
"learning_rate": 8.889429491757872e-06,
"loss": 0.4134,
"step": 2850
},
{
"epoch": 0.8807896735003796,
"grad_norm": 5.300995349884033,
"learning_rate": 8.851684298008642e-06,
"loss": 0.4224,
"step": 2900
},
{
"epoch": 0.8959757023538345,
"grad_norm": 8.158344268798828,
"learning_rate": 8.813391354940445e-06,
"loss": 0.3538,
"step": 2950
},
{
"epoch": 0.9111617312072893,
"grad_norm": 6.747292518615723,
"learning_rate": 8.77455610820559e-06,
"loss": 0.3907,
"step": 3000
},
{
"epoch": 0.9263477600607442,
"grad_norm": 6.279948711395264,
"learning_rate": 8.735184080577569e-06,
"loss": 0.4344,
"step": 3050
},
{
"epoch": 0.9415337889141989,
"grad_norm": 4.355826377868652,
"learning_rate": 8.69528087116567e-06,
"loss": 0.4082,
"step": 3100
},
{
"epoch": 0.9567198177676538,
"grad_norm": 6.685491561889648,
"learning_rate": 8.65485215461872e-06,
"loss": 0.3851,
"step": 3150
},
{
"epoch": 0.9719058466211086,
"grad_norm": 5.933023452758789,
"learning_rate": 8.61390368031809e-06,
"loss": 0.3734,
"step": 3200
},
{
"epoch": 0.9870918754745635,
"grad_norm": 9.179722785949707,
"learning_rate": 8.572441271560077e-06,
"loss": 0.3934,
"step": 3250
},
{
"epoch": 1.0,
"eval_f1": 0.8445935154128733,
"eval_loss": 0.37980714440345764,
"eval_runtime": 7.8494,
"eval_samples_per_second": 745.663,
"eval_steps_per_second": 23.314,
"step": 3293
},
{
"epoch": 1.0021260440394837,
"grad_norm": 1.5981299877166748,
"learning_rate": 8.53047082472777e-06,
"loss": 0.3967,
"step": 3300
},
{
"epoch": 1.0173120728929386,
"grad_norm": 5.159671783447266,
"learning_rate": 8.487998308452525e-06,
"loss": 0.3125,
"step": 3350
},
{
"epoch": 1.0324981017463932,
"grad_norm": 8.904830932617188,
"learning_rate": 8.445029762765159e-06,
"loss": 0.3201,
"step": 3400
},
{
"epoch": 1.047684130599848,
"grad_norm": 4.215548992156982,
"learning_rate": 8.401571298237e-06,
"loss": 0.3043,
"step": 3450
},
{
"epoch": 1.062870159453303,
"grad_norm": 2.9603254795074463,
"learning_rate": 8.357629095110906e-06,
"loss": 0.307,
"step": 3500
},
{
"epoch": 1.0780561883067579,
"grad_norm": 8.665258407592773,
"learning_rate": 8.313209402422348e-06,
"loss": 0.3081,
"step": 3550
},
{
"epoch": 1.0932422171602125,
"grad_norm": 7.101922512054443,
"learning_rate": 8.268318537110762e-06,
"loss": 0.3536,
"step": 3600
},
{
"epoch": 1.1084282460136674,
"grad_norm": 9.113100051879883,
"learning_rate": 8.222962883121196e-06,
"loss": 0.3557,
"step": 3650
},
{
"epoch": 1.1236142748671223,
"grad_norm": 3.427243947982788,
"learning_rate": 8.177148890496452e-06,
"loss": 0.2984,
"step": 3700
},
{
"epoch": 1.138800303720577,
"grad_norm": 6.6492695808410645,
"learning_rate": 8.130883074459823e-06,
"loss": 0.3407,
"step": 3750
},
{
"epoch": 1.1539863325740318,
"grad_norm": 9.254618644714355,
"learning_rate": 8.084172014488564e-06,
"loss": 0.3487,
"step": 3800
},
{
"epoch": 1.1691723614274867,
"grad_norm": 3.8507754802703857,
"learning_rate": 8.037022353378218e-06,
"loss": 0.3374,
"step": 3850
},
{
"epoch": 1.1843583902809416,
"grad_norm": 18.62590217590332,
"learning_rate": 7.989440796297943e-06,
"loss": 0.3269,
"step": 3900
},
{
"epoch": 1.1995444191343965,
"grad_norm": 14.359010696411133,
"learning_rate": 7.941434109836968e-06,
"loss": 0.3219,
"step": 3950
},
{
"epoch": 1.2147304479878511,
"grad_norm": 8.173829078674316,
"learning_rate": 7.893009121042314e-06,
"loss": 0.2944,
"step": 4000
},
{
"epoch": 1.229916476841306,
"grad_norm": 6.0913591384887695,
"learning_rate": 7.844172716447918e-06,
"loss": 0.366,
"step": 4050
},
{
"epoch": 1.2451025056947609,
"grad_norm": 8.989174842834473,
"learning_rate": 7.794931841095297e-06,
"loss": 0.3223,
"step": 4100
},
{
"epoch": 1.2602885345482155,
"grad_norm": 4.618454456329346,
"learning_rate": 7.745293497545892e-06,
"loss": 0.3718,
"step": 4150
},
{
"epoch": 1.2754745634016704,
"grad_norm": 6.966646194458008,
"learning_rate": 7.695264744885225e-06,
"loss": 0.34,
"step": 4200
},
{
"epoch": 1.2906605922551253,
"grad_norm": 8.476325988769531,
"learning_rate": 7.64485269771903e-06,
"loss": 0.309,
"step": 4250
},
{
"epoch": 1.3058466211085802,
"grad_norm": 3.3412492275238037,
"learning_rate": 7.594064525161487e-06,
"loss": 0.3491,
"step": 4300
},
{
"epoch": 1.321032649962035,
"grad_norm": 9.971606254577637,
"learning_rate": 7.54290744981569e-06,
"loss": 0.3097,
"step": 4350
},
{
"epoch": 1.3362186788154897,
"grad_norm": 7.083515167236328,
"learning_rate": 7.491388746746522e-06,
"loss": 0.3446,
"step": 4400
},
{
"epoch": 1.3514047076689446,
"grad_norm": 5.6028361320495605,
"learning_rate": 7.439515742446065e-06,
"loss": 0.3229,
"step": 4450
},
{
"epoch": 1.3665907365223995,
"grad_norm": 9.373847961425781,
"learning_rate": 7.387295813791705e-06,
"loss": 0.3022,
"step": 4500
},
{
"epoch": 1.3817767653758541,
"grad_norm": 5.378981590270996,
"learning_rate": 7.334736386997049e-06,
"loss": 0.2955,
"step": 4550
},
{
"epoch": 1.396962794229309,
"grad_norm": 9.248358726501465,
"learning_rate": 7.281844936555853e-06,
"loss": 0.3562,
"step": 4600
},
{
"epoch": 1.412148823082764,
"grad_norm": 6.579871654510498,
"learning_rate": 7.228628984179068e-06,
"loss": 0.3436,
"step": 4650
},
{
"epoch": 1.4273348519362186,
"grad_norm": 2.5316176414489746,
"learning_rate": 7.175096097725169e-06,
"loss": 0.3464,
"step": 4700
},
{
"epoch": 1.4425208807896734,
"grad_norm": 12.828206062316895,
"learning_rate": 7.121253890123941e-06,
"loss": 0.3333,
"step": 4750
},
{
"epoch": 1.4577069096431283,
"grad_norm": 8.807774543762207,
"learning_rate": 7.067110018293828e-06,
"loss": 0.2955,
"step": 4800
},
{
"epoch": 1.4728929384965832,
"grad_norm": 10.35312557220459,
"learning_rate": 7.012672182053043e-06,
"loss": 0.3321,
"step": 4850
},
{
"epoch": 1.488078967350038,
"grad_norm": 2.2814652919769287,
"learning_rate": 6.9579481230245835e-06,
"loss": 0.3466,
"step": 4900
},
{
"epoch": 1.5032649962034927,
"grad_norm": 5.442550182342529,
"learning_rate": 6.9029456235352795e-06,
"loss": 0.3321,
"step": 4950
},
{
"epoch": 1.5184510250569476,
"grad_norm": 12.557025909423828,
"learning_rate": 6.847672505509079e-06,
"loss": 0.3429,
"step": 5000
},
{
"epoch": 1.5336370539104025,
"grad_norm": 4.002285480499268,
"learning_rate": 6.792136629354677e-06,
"loss": 0.3274,
"step": 5050
},
{
"epoch": 1.5488230827638572,
"grad_norm": 17.179048538208008,
"learning_rate": 6.736345892847691e-06,
"loss": 0.3472,
"step": 5100
},
{
"epoch": 1.564009111617312,
"grad_norm": 8.354110717773438,
"learning_rate": 6.680308230007521e-06,
"loss": 0.3282,
"step": 5150
},
{
"epoch": 1.579195140470767,
"grad_norm": 5.1743035316467285,
"learning_rate": 6.624031609969036e-06,
"loss": 0.3443,
"step": 5200
},
{
"epoch": 1.5943811693242216,
"grad_norm": 6.959432601928711,
"learning_rate": 6.567524035849293e-06,
"loss": 0.35,
"step": 5250
},
{
"epoch": 1.6095671981776767,
"grad_norm": 20.55417823791504,
"learning_rate": 6.5107935436094076e-06,
"loss": 0.3158,
"step": 5300
},
{
"epoch": 1.6247532270311313,
"grad_norm": 14.025495529174805,
"learning_rate": 6.453848200911752e-06,
"loss": 0.3287,
"step": 5350
},
{
"epoch": 1.6399392558845862,
"grad_norm": 12.094548225402832,
"learning_rate": 6.396696105972655e-06,
"loss": 0.3448,
"step": 5400
},
{
"epoch": 1.655125284738041,
"grad_norm": 3.596747398376465,
"learning_rate": 6.339345386410756e-06,
"loss": 0.3544,
"step": 5450
},
{
"epoch": 1.6703113135914958,
"grad_norm": 4.897212505340576,
"learning_rate": 6.2818041980911635e-06,
"loss": 0.3363,
"step": 5500
},
{
"epoch": 1.6854973424449506,
"grad_norm": 2.7992074489593506,
"learning_rate": 6.224080723965616e-06,
"loss": 0.3405,
"step": 5550
},
{
"epoch": 1.7006833712984055,
"grad_norm": 8.647635459899902,
"learning_rate": 6.1661831729087705e-06,
"loss": 0.3218,
"step": 5600
},
{
"epoch": 1.7158694001518602,
"grad_norm": 16.2703800201416,
"learning_rate": 6.1081197785508335e-06,
"loss": 0.3569,
"step": 5650
},
{
"epoch": 1.731055429005315,
"grad_norm": 9.62259578704834,
"learning_rate": 6.049898798106636e-06,
"loss": 0.3181,
"step": 5700
},
{
"epoch": 1.74624145785877,
"grad_norm": 10.183274269104004,
"learning_rate": 5.991528511201382e-06,
"loss": 0.3191,
"step": 5750
},
{
"epoch": 1.7614274867122246,
"grad_norm": 20.28440284729004,
"learning_rate": 5.933017218693193e-06,
"loss": 0.3162,
"step": 5800
},
{
"epoch": 1.7766135155656797,
"grad_norm": 18.231319427490234,
"learning_rate": 5.874373241492651e-06,
"loss": 0.3788,
"step": 5850
},
{
"epoch": 1.7917995444191344,
"grad_norm": 14.682201385498047,
"learning_rate": 5.815604919379472e-06,
"loss": 0.3242,
"step": 5900
},
{
"epoch": 1.8069855732725892,
"grad_norm": 6.563547611236572,
"learning_rate": 5.7567206098164965e-06,
"loss": 0.3377,
"step": 5950
},
{
"epoch": 1.8221716021260441,
"grad_norm": 8.406890869140625,
"learning_rate": 5.697728686761189e-06,
"loss": 0.3222,
"step": 6000
},
{
"epoch": 1.8373576309794988,
"grad_norm": 5.706462860107422,
"learning_rate": 5.638637539474758e-06,
"loss": 0.3169,
"step": 6050
},
{
"epoch": 1.8525436598329537,
"grad_norm": 3.566732883453369,
"learning_rate": 5.579455571329128e-06,
"loss": 0.2993,
"step": 6100
},
{
"epoch": 1.8677296886864085,
"grad_norm": 21.842191696166992,
"learning_rate": 5.520191198611883e-06,
"loss": 0.3411,
"step": 6150
},
{
"epoch": 1.8829157175398632,
"grad_norm": 7.155375957489014,
"learning_rate": 5.460852849329394e-06,
"loss": 0.3168,
"step": 6200
},
{
"epoch": 1.8981017463933183,
"grad_norm": 5.166109085083008,
"learning_rate": 5.401448962008262e-06,
"loss": 0.3526,
"step": 6250
},
{
"epoch": 1.913287775246773,
"grad_norm": 10.691755294799805,
"learning_rate": 5.341987984495275e-06,
"loss": 0.334,
"step": 6300
},
{
"epoch": 1.9284738041002278,
"grad_norm": 1.8157846927642822,
"learning_rate": 5.282478372756036e-06,
"loss": 0.2981,
"step": 6350
},
{
"epoch": 1.9436598329536827,
"grad_norm": 6.267528057098389,
"learning_rate": 5.222928589672436e-06,
"loss": 0.3443,
"step": 6400
},
{
"epoch": 1.9588458618071374,
"grad_norm": 8.20384407043457,
"learning_rate": 5.163347103839149e-06,
"loss": 0.3196,
"step": 6450
},
{
"epoch": 1.9740318906605923,
"grad_norm": 6.2834882736206055,
"learning_rate": 5.10374238835931e-06,
"loss": 0.3176,
"step": 6500
},
{
"epoch": 1.9892179195140471,
"grad_norm": 7.512860298156738,
"learning_rate": 5.0441229196395416e-06,
"loss": 0.3216,
"step": 6550
},
{
"epoch": 2.0,
"eval_f1": 0.847490661036219,
"eval_loss": 0.39481809735298157,
"eval_runtime": 7.8513,
"eval_samples_per_second": 745.486,
"eval_steps_per_second": 23.308,
"step": 6586
}
],
"logging_steps": 50,
"max_steps": 13172,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.771769723795456e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}