{
"best_metric": 0.029243575409054756,
"best_model_checkpoint": "saves/chess/no_explain/checkpoint-4000",
"epoch": 9.992390869042852,
"eval_steps": 1000,
"global_step": 12480,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08009611533840609,
"grad_norm": 0.8625897724596373,
"learning_rate": 4.006410256410257e-07,
"loss": 1.3897,
"step": 100
},
{
"epoch": 0.16019223067681218,
"grad_norm": 0.8895947937892531,
"learning_rate": 8.012820512820515e-07,
"loss": 0.0598,
"step": 200
},
{
"epoch": 0.24028834601521826,
"grad_norm": 0.5221246844134636,
"learning_rate": 1.201923076923077e-06,
"loss": 0.0551,
"step": 300
},
{
"epoch": 0.32038446135362436,
"grad_norm": 0.5590357289952654,
"learning_rate": 1.602564102564103e-06,
"loss": 0.0516,
"step": 400
},
{
"epoch": 0.4004805766920304,
"grad_norm": 0.36991974174438536,
"learning_rate": 2.0032051282051286e-06,
"loss": 0.0501,
"step": 500
},
{
"epoch": 0.4805766920304365,
"grad_norm": 0.6389443947236714,
"learning_rate": 2.403846153846154e-06,
"loss": 0.0486,
"step": 600
},
{
"epoch": 0.5606728073688426,
"grad_norm": 0.44563280571067243,
"learning_rate": 2.8044871794871797e-06,
"loss": 0.0463,
"step": 700
},
{
"epoch": 0.6407689227072487,
"grad_norm": 0.44266380357676305,
"learning_rate": 3.205128205128206e-06,
"loss": 0.0447,
"step": 800
},
{
"epoch": 0.7208650380456548,
"grad_norm": 0.585654631503778,
"learning_rate": 3.605769230769231e-06,
"loss": 0.0441,
"step": 900
},
{
"epoch": 0.8009611533840608,
"grad_norm": 0.600751877456253,
"learning_rate": 4.006410256410257e-06,
"loss": 0.0429,
"step": 1000
},
{
"epoch": 0.8009611533840608,
"eval_loss": 0.042210426181554794,
"eval_runtime": 97.133,
"eval_samples_per_second": 1462.17,
"eval_steps_per_second": 2.862,
"step": 1000
},
{
"epoch": 0.8810572687224669,
"grad_norm": 0.2641551118831142,
"learning_rate": 4.4070512820512826e-06,
"loss": 0.0414,
"step": 1100
},
{
"epoch": 0.961153384060873,
"grad_norm": 0.29049561928975876,
"learning_rate": 4.807692307692308e-06,
"loss": 0.0402,
"step": 1200
},
{
"epoch": 1.0408490188225872,
"grad_norm": 0.5344113116420023,
"learning_rate": 4.999735579817769e-06,
"loss": 0.0386,
"step": 1300
},
{
"epoch": 1.1209451341609933,
"grad_norm": 0.31257482202449377,
"learning_rate": 4.997740994288484e-06,
"loss": 0.0373,
"step": 1400
},
{
"epoch": 1.2010412494993994,
"grad_norm": 0.4593106982622164,
"learning_rate": 4.993792498360407e-06,
"loss": 0.0366,
"step": 1500
},
{
"epoch": 1.2811373648378055,
"grad_norm": 0.2012883704449717,
"learning_rate": 4.9878931808274796e-06,
"loss": 0.0357,
"step": 1600
},
{
"epoch": 1.3612334801762114,
"grad_norm": 0.22908626001592647,
"learning_rate": 4.980047656554856e-06,
"loss": 0.0352,
"step": 1700
},
{
"epoch": 1.4413295955146175,
"grad_norm": 0.3169879320183415,
"learning_rate": 4.970262062868821e-06,
"loss": 0.0346,
"step": 1800
},
{
"epoch": 1.5214257108530236,
"grad_norm": 0.2078878255601618,
"learning_rate": 4.958544054755741e-06,
"loss": 0.0336,
"step": 1900
},
{
"epoch": 1.6015218261914297,
"grad_norm": 0.2978110993331312,
"learning_rate": 4.944902798873794e-06,
"loss": 0.0329,
"step": 2000
},
{
"epoch": 1.6015218261914297,
"eval_loss": 0.03361953794956207,
"eval_runtime": 97.2876,
"eval_samples_per_second": 1459.847,
"eval_steps_per_second": 2.858,
"step": 2000
},
{
"epoch": 1.6816179415298358,
"grad_norm": 0.16678424956102253,
"learning_rate": 4.92934896638215e-06,
"loss": 0.0328,
"step": 2100
},
{
"epoch": 1.761714056868242,
"grad_norm": 0.19029664571581045,
"learning_rate": 4.91189472459324e-06,
"loss": 0.0316,
"step": 2200
},
{
"epoch": 1.841810172206648,
"grad_norm": 0.2388908631462674,
"learning_rate": 4.892553727454616e-06,
"loss": 0.0317,
"step": 2300
},
{
"epoch": 1.921906287545054,
"grad_norm": 0.15794270702360638,
"learning_rate": 4.8713411048678635e-06,
"loss": 0.0309,
"step": 2400
},
{
"epoch": 2.0016019223067683,
"grad_norm": 0.2103115075663395,
"learning_rate": 4.848273450852921e-06,
"loss": 0.0305,
"step": 2500
},
{
"epoch": 2.0816980376451744,
"grad_norm": 0.28601246983481904,
"learning_rate": 4.823368810567056e-06,
"loss": 0.0268,
"step": 2600
},
{
"epoch": 2.1617941529835805,
"grad_norm": 0.25522616878445004,
"learning_rate": 4.796646666188663e-06,
"loss": 0.0268,
"step": 2700
},
{
"epoch": 2.2418902683219866,
"grad_norm": 0.2343538332348778,
"learning_rate": 4.768127921676916e-06,
"loss": 0.0272,
"step": 2800
},
{
"epoch": 2.3219863836603922,
"grad_norm": 0.22903658893889398,
"learning_rate": 4.737834886419217e-06,
"loss": 0.0297,
"step": 2900
},
{
"epoch": 2.4020824989987988,
"grad_norm": 0.19855668130980528,
"learning_rate": 4.705791257779196e-06,
"loss": 0.0275,
"step": 3000
},
{
"epoch": 2.4020824989987988,
"eval_loss": 0.029653793200850487,
"eval_runtime": 97.2179,
"eval_samples_per_second": 1460.893,
"eval_steps_per_second": 2.86,
"step": 3000
},
{
"epoch": 2.4821786143372044,
"grad_norm": 0.1868527106405498,
"learning_rate": 4.672022102558958e-06,
"loss": 0.0269,
"step": 3100
},
{
"epoch": 2.562274729675611,
"grad_norm": 0.1985255713449175,
"learning_rate": 4.636553837390051e-06,
"loss": 0.0269,
"step": 3200
},
{
"epoch": 2.6423708450140166,
"grad_norm": 0.17528235376425527,
"learning_rate": 4.5994142080684956e-06,
"loss": 0.026,
"step": 3300
},
{
"epoch": 2.7224669603524227,
"grad_norm": 0.20238382028782428,
"learning_rate": 4.560632267850054e-06,
"loss": 0.026,
"step": 3400
},
{
"epoch": 2.802563075690829,
"grad_norm": 0.20789525240306345,
"learning_rate": 4.5202383547227134e-06,
"loss": 0.0257,
"step": 3500
},
{
"epoch": 2.882659191029235,
"grad_norm": 0.2849074845845128,
"learning_rate": 4.478264067674155e-06,
"loss": 0.0256,
"step": 3600
},
{
"epoch": 2.962755306367641,
"grad_norm": 0.1826392119567578,
"learning_rate": 4.43474224197278e-06,
"loss": 0.0255,
"step": 3700
},
{
"epoch": 3.0424509411293554,
"grad_norm": 0.3254043272458406,
"learning_rate": 4.389706923481633e-06,
"loss": 0.0224,
"step": 3800
},
{
"epoch": 3.122547056467761,
"grad_norm": 0.2695456046362865,
"learning_rate": 4.34319334202531e-06,
"loss": 0.0198,
"step": 3900
},
{
"epoch": 3.202643171806167,
"grad_norm": 0.24345073976828904,
"learning_rate": 4.2952378838306855e-06,
"loss": 0.0202,
"step": 4000
},
{
"epoch": 3.202643171806167,
"eval_loss": 0.029243575409054756,
"eval_runtime": 97.6159,
"eval_samples_per_second": 1454.937,
"eval_steps_per_second": 2.848,
"step": 4000
},
{
"epoch": 3.2827392871445733,
"grad_norm": 0.3753413906545954,
"learning_rate": 4.245878063063022e-06,
"loss": 0.0205,
"step": 4100
},
{
"epoch": 3.3628354024829794,
"grad_norm": 0.2460926534460345,
"learning_rate": 4.195152492479727e-06,
"loss": 0.0205,
"step": 4200
},
{
"epoch": 3.4429315178213855,
"grad_norm": 0.2704381094416959,
"learning_rate": 4.143100853224714e-06,
"loss": 0.0204,
"step": 4300
},
{
"epoch": 3.5230276331597916,
"grad_norm": 0.32177852781904165,
"learning_rate": 4.089763863786988e-06,
"loss": 0.0204,
"step": 4400
},
{
"epoch": 3.6031237484981977,
"grad_norm": 0.24794031349246146,
"learning_rate": 4.035183248147752e-06,
"loss": 0.0201,
"step": 4500
},
{
"epoch": 3.683219863836604,
"grad_norm": 0.2548491545100107,
"learning_rate": 3.979401703140955e-06,
"loss": 0.02,
"step": 4600
},
{
"epoch": 3.76331597917501,
"grad_norm": 0.28339343421860097,
"learning_rate": 3.922462865052782e-06,
"loss": 0.0206,
"step": 4700
},
{
"epoch": 3.843412094513416,
"grad_norm": 0.25858197249007897,
"learning_rate": 3.8644112754862614e-06,
"loss": 0.0199,
"step": 4800
},
{
"epoch": 3.923508209851822,
"grad_norm": 0.25917676686664276,
"learning_rate": 3.805292346517659e-06,
"loss": 0.0199,
"step": 4900
},
{
"epoch": 4.0032038446135365,
"grad_norm": 0.205217434085613,
"learning_rate": 3.745152325171921e-06,
"loss": 0.0194,
"step": 5000
},
{
"epoch": 4.0032038446135365,
"eval_loss": 0.02939535118639469,
"eval_runtime": 97.4425,
"eval_samples_per_second": 1457.526,
"eval_steps_per_second": 2.853,
"step": 5000
},
{
"epoch": 4.083299959951942,
"grad_norm": 0.3449210512817333,
"learning_rate": 3.6840382572449733e-06,
"loss": 0.0109,
"step": 5100
},
{
"epoch": 4.163396075290349,
"grad_norm": 0.35702251381560834,
"learning_rate": 3.621997950501156e-06,
"loss": 0.0116,
"step": 5200
},
{
"epoch": 4.243492190628754,
"grad_norm": 0.3503016584030036,
"learning_rate": 3.5590799372745915e-06,
"loss": 0.0119,
"step": 5300
},
{
"epoch": 4.323588305967161,
"grad_norm": 0.32698876302828034,
"learning_rate": 3.495333436503753e-06,
"loss": 0.0125,
"step": 5400
},
{
"epoch": 4.403684421305567,
"grad_norm": 0.29218555867917617,
"learning_rate": 3.4308083152289073e-06,
"loss": 0.0122,
"step": 5500
},
{
"epoch": 4.483780536643973,
"grad_norm": 0.42870050776267266,
"learning_rate": 3.3655550495825824e-06,
"loss": 0.0121,
"step": 5600
},
{
"epoch": 4.563876651982379,
"grad_norm": 0.3841200097431653,
"learning_rate": 3.2996246853035417e-06,
"loss": 0.0123,
"step": 5700
},
{
"epoch": 4.6439727673207845,
"grad_norm": 0.27276776968480937,
"learning_rate": 3.233068797805194e-06,
"loss": 0.0121,
"step": 5800
},
{
"epoch": 4.724068882659191,
"grad_norm": 0.37618566324117403,
"learning_rate": 3.1659394518296303e-06,
"loss": 0.0121,
"step": 5900
},
{
"epoch": 4.8041649979975976,
"grad_norm": 0.3053361427605705,
"learning_rate": 3.0982891607188948e-06,
"loss": 0.0119,
"step": 6000
},
{
"epoch": 4.8041649979975976,
"eval_loss": 0.031129568815231323,
"eval_runtime": 97.1562,
"eval_samples_per_second": 1461.821,
"eval_steps_per_second": 2.861,
"step": 6000
},
{
"epoch": 4.884261113336003,
"grad_norm": 0.24611176483050773,
"learning_rate": 3.0301708453353118e-06,
"loss": 0.0121,
"step": 6100
},
{
"epoch": 4.964357228674409,
"grad_norm": 0.30724706018820913,
"learning_rate": 2.961637792663032e-06,
"loss": 0.0114,
"step": 6200
},
{
"epoch": 5.044052863436123,
"grad_norm": 0.38244952925905945,
"learning_rate": 2.8927436141231695e-06,
"loss": 0.0075,
"step": 6300
},
{
"epoch": 5.12414897877453,
"grad_norm": 0.20603254501695356,
"learning_rate": 2.8235422036351384e-06,
"loss": 0.0044,
"step": 6400
},
{
"epoch": 5.2042450941129355,
"grad_norm": 0.2637357295160275,
"learning_rate": 2.754087695457005e-06,
"loss": 0.0043,
"step": 6500
},
{
"epoch": 5.284341209451342,
"grad_norm": 0.5274953505653177,
"learning_rate": 2.684434421837821e-06,
"loss": 0.0045,
"step": 6600
},
{
"epoch": 5.364437324789748,
"grad_norm": 0.4238975113115418,
"learning_rate": 2.6146368705150854e-06,
"loss": 0.0048,
"step": 6700
},
{
"epoch": 5.444533440128154,
"grad_norm": 0.37315897649626995,
"learning_rate": 2.5447496420905608e-06,
"loss": 0.0045,
"step": 6800
},
{
"epoch": 5.52462955546656,
"grad_norm": 0.33573760401057196,
"learning_rate": 2.4748274073178114e-06,
"loss": 0.0045,
"step": 6900
},
{
"epoch": 5.6047256708049655,
"grad_norm": 0.4205706583224986,
"learning_rate": 2.4049248643348512e-06,
"loss": 0.0048,
"step": 7000
},
{
"epoch": 5.6047256708049655,
"eval_loss": 0.04388193413615227,
"eval_runtime": 97.237,
"eval_samples_per_second": 1460.607,
"eval_steps_per_second": 2.859,
"step": 7000
},
{
"epoch": 5.684821786143372,
"grad_norm": 0.3352151910327754,
"learning_rate": 2.3350966958753766e-06,
"loss": 0.0043,
"step": 7100
},
{
"epoch": 5.764917901481779,
"grad_norm": 0.30090375390642815,
"learning_rate": 2.265397526492052e-06,
"loss": 0.0043,
"step": 7200
},
{
"epoch": 5.845014016820184,
"grad_norm": 0.3649497532401096,
"learning_rate": 2.195881879825301e-06,
"loss": 0.0043,
"step": 7300
},
{
"epoch": 5.92511013215859,
"grad_norm": 0.26280545277109674,
"learning_rate": 2.1266041359510456e-06,
"loss": 0.0043,
"step": 7400
},
{
"epoch": 6.004805766920304,
"grad_norm": 0.13356592430041458,
"learning_rate": 2.057618488840745e-06,
"loss": 0.004,
"step": 7500
},
{
"epoch": 6.084901882258711,
"grad_norm": 0.5145664187486052,
"learning_rate": 1.9889789039670276e-06,
"loss": 0.0014,
"step": 7600
},
{
"epoch": 6.1649979975971165,
"grad_norm": 0.4699723392536862,
"learning_rate": 1.9207390760880605e-06,
"loss": 0.0014,
"step": 7700
},
{
"epoch": 6.245094112935522,
"grad_norm": 0.4374296333529995,
"learning_rate": 1.852952387243698e-06,
"loss": 0.0013,
"step": 7800
},
{
"epoch": 6.325190228273929,
"grad_norm": 0.42961763489773475,
"learning_rate": 1.7856718649962606e-06,
"loss": 0.0013,
"step": 7900
},
{
"epoch": 6.405286343612334,
"grad_norm": 0.2569945673800255,
"learning_rate": 1.7189501409486061e-06,
"loss": 0.0013,
"step": 8000
},
{
"epoch": 6.405286343612334,
"eval_loss": 0.053785648196935654,
"eval_runtime": 97.6698,
"eval_samples_per_second": 1454.134,
"eval_steps_per_second": 2.846,
"step": 8000
},
{
"epoch": 6.485382458950741,
"grad_norm": 0.36869037970588475,
"learning_rate": 1.6528394095719558e-06,
"loss": 0.0013,
"step": 8100
},
{
"epoch": 6.565478574289147,
"grad_norm": 0.2187928895873153,
"learning_rate": 1.587391387375669e-06,
"loss": 0.0014,
"step": 8200
},
{
"epoch": 6.645574689627553,
"grad_norm": 0.3346302824445088,
"learning_rate": 1.522657272450917e-06,
"loss": 0.0013,
"step": 8300
},
{
"epoch": 6.725670804965959,
"grad_norm": 0.22583442175391086,
"learning_rate": 1.4586877044199015e-06,
"loss": 0.0014,
"step": 8400
},
{
"epoch": 6.805766920304365,
"grad_norm": 0.24275774632690653,
"learning_rate": 1.3955327248219438e-06,
"loss": 0.0014,
"step": 8500
},
{
"epoch": 6.885863035642771,
"grad_norm": 0.322120144658376,
"learning_rate": 1.3332417379674426e-06,
"loss": 0.0012,
"step": 8600
},
{
"epoch": 6.965959150981178,
"grad_norm": 0.2971892796613953,
"learning_rate": 1.2718634722903073e-06,
"loss": 0.0013,
"step": 8700
},
{
"epoch": 7.045654785742891,
"grad_norm": 0.18481532134302478,
"learning_rate": 1.2114459422291205e-06,
"loss": 0.0007,
"step": 8800
},
{
"epoch": 7.125750901081298,
"grad_norm": 0.02287021398890685,
"learning_rate": 1.1520364106668342e-06,
"loss": 0.0003,
"step": 8900
},
{
"epoch": 7.205847016419703,
"grad_norm": 0.2801253618567114,
"learning_rate": 1.093681351958383e-06,
"loss": 0.0004,
"step": 9000
},
{
"epoch": 7.205847016419703,
"eval_loss": 0.06704169511795044,
"eval_runtime": 97.5519,
"eval_samples_per_second": 1455.892,
"eval_steps_per_second": 2.85,
"step": 9000
},
{
"epoch": 7.28594313175811,
"grad_norm": 0.05323383136377585,
"learning_rate": 1.0364264155751489e-06,
"loss": 0.0004,
"step": 9100
},
{
"epoch": 7.3660392470965155,
"grad_norm": 0.10232274475527954,
"learning_rate": 9.803163903946952e-07,
"loss": 0.0004,
"step": 9200
},
{
"epoch": 7.446135362434922,
"grad_norm": 0.1820198743026229,
"learning_rate": 9.253951696637311e-07,
"loss": 0.0004,
"step": 9300
},
{
"epoch": 7.526231477773328,
"grad_norm": 0.3077085461325738,
"learning_rate": 8.717057166616926e-07,
"loss": 0.0003,
"step": 9400
},
{
"epoch": 7.606327593111734,
"grad_norm": 0.02788433448517323,
"learning_rate": 8.192900310918206e-07,
"loss": 0.0004,
"step": 9500
},
{
"epoch": 7.68642370845014,
"grad_norm": 0.454935336405101,
"learning_rate": 7.681891162260016e-07,
"loss": 0.0004,
"step": 9600
},
{
"epoch": 7.766519823788546,
"grad_norm": 0.404101132597737,
"learning_rate": 7.184429468291023e-07,
"loss": 0.0003,
"step": 9700
},
{
"epoch": 7.846615939126952,
"grad_norm": 0.006797483493599147,
"learning_rate": 6.700904378878675e-07,
"loss": 0.0004,
"step": 9800
},
{
"epoch": 7.926712054465359,
"grad_norm": 0.253219681405225,
"learning_rate": 6.231694141688535e-07,
"loss": 0.0003,
"step": 9900
},
{
"epoch": 8.006407689227073,
"grad_norm": 0.01676261471498421,
"learning_rate": 5.777165806292109e-07,
"loss": 0.0003,
"step": 10000
},
{
"epoch": 8.006407689227073,
"eval_loss": 0.06978683918714523,
"eval_runtime": 97.4682,
"eval_samples_per_second": 1457.142,
"eval_steps_per_second": 2.852,
"step": 10000
},
{
"epoch": 8.086503804565478,
"grad_norm": 0.023667739210693765,
"learning_rate": 5.337674937034581e-07,
"loss": 0.0001,
"step": 10100
},
{
"epoch": 8.166599919903884,
"grad_norm": 0.19018699551662502,
"learning_rate": 4.913565334887135e-07,
"loss": 0.0001,
"step": 10200
},
{
"epoch": 8.246696035242291,
"grad_norm": 0.15520052508971907,
"learning_rate": 4.505168768501431e-07,
"loss": 0.0001,
"step": 10300
},
{
"epoch": 8.326792150580697,
"grad_norm": 0.002693072772497186,
"learning_rate": 4.1128047146765936e-07,
"loss": 0.0001,
"step": 10400
},
{
"epoch": 8.406888265919102,
"grad_norm": 0.07912436909277526,
"learning_rate": 3.736780108441762e-07,
"loss": 0.0001,
"step": 10500
},
{
"epoch": 8.486984381257509,
"grad_norm": 0.0704194063541305,
"learning_rate": 3.3773891029497326e-07,
"loss": 0.0001,
"step": 10600
},
{
"epoch": 8.567080496595915,
"grad_norm": 0.0031901574938772484,
"learning_rate": 3.034912839369447e-07,
"loss": 0.0,
"step": 10700
},
{
"epoch": 8.647176611934322,
"grad_norm": 0.3276934498065665,
"learning_rate": 2.70961922695743e-07,
"loss": 0.0001,
"step": 10800
},
{
"epoch": 8.727272727272727,
"grad_norm": 0.0018329070981180388,
"learning_rate": 2.401762733480115e-07,
"loss": 0.0,
"step": 10900
},
{
"epoch": 8.807368842611133,
"grad_norm": 0.002408780413295549,
"learning_rate": 2.1115841861510945e-07,
"loss": 0.0,
"step": 11000
},
{
"epoch": 8.807368842611133,
"eval_loss": 0.08943355828523636,
"eval_runtime": 97.2207,
"eval_samples_per_second": 1460.852,
"eval_steps_per_second": 2.859,
"step": 11000
},
{
"epoch": 8.88746495794954,
"grad_norm": 0.005344361337035522,
"learning_rate": 1.8393105832389791e-07,
"loss": 0.0,
"step": 11100
},
{
"epoch": 8.967561073287946,
"grad_norm": 0.008809607265012539,
"learning_rate": 1.5851549164932118e-07,
"loss": 0.0001,
"step": 11200
},
{
"epoch": 9.047256708049659,
"grad_norm": 0.004351746843250683,
"learning_rate": 1.349316004526824e-07,
"loss": 0.0,
"step": 11300
},
{
"epoch": 9.127352823388065,
"grad_norm": 0.0008942462172532464,
"learning_rate": 1.1319783372863601e-07,
"loss": 0.0,
"step": 11400
},
{
"epoch": 9.207448938726472,
"grad_norm": 0.0009999088005623051,
"learning_rate": 9.333119317307598e-08,
"loss": 0.0,
"step": 11500
},
{
"epoch": 9.287545054064879,
"grad_norm": 0.006833873365903121,
"learning_rate": 7.534721988320143e-08,
"loss": 0.0,
"step": 11600
},
{
"epoch": 9.367641169403283,
"grad_norm": 0.001580786758369194,
"learning_rate": 5.92599822001666e-08,
"loss": 0.0,
"step": 11700
},
{
"epoch": 9.44773728474169,
"grad_norm": 0.08223063305947663,
"learning_rate": 4.508206470382554e-08,
"loss": 0.0,
"step": 11800
},
{
"epoch": 9.527833400080096,
"grad_norm": 0.0003265712066290809,
"learning_rate": 3.2824558368179384e-08,
"loss": 0.0,
"step": 11900
},
{
"epoch": 9.607929515418503,
"grad_norm": 0.0005479447690907845,
"learning_rate": 2.2497051885228825e-08,
"loss": 0.0,
"step": 12000
},
{
"epoch": 9.607929515418503,
"eval_loss": 0.09308738261461258,
"eval_runtime": 97.3361,
"eval_samples_per_second": 1459.119,
"eval_steps_per_second": 2.856,
"step": 12000
},
{
"epoch": 9.688025630756908,
"grad_norm": 0.01973266591029808,
"learning_rate": 1.4107624164019229e-08,
"loss": 0.0,
"step": 12100
},
{
"epoch": 9.768121746095314,
"grad_norm": 0.0007774042502156854,
"learning_rate": 7.662838010742413e-09,
"loss": 0.0,
"step": 12200
},
{
"epoch": 9.84821786143372,
"grad_norm": 0.0003574216553306887,
"learning_rate": 3.1677349948461277e-09,
"loss": 0.0,
"step": 12300
},
{
"epoch": 9.928313976772127,
"grad_norm": 0.0005660328857731791,
"learning_rate": 6.258315051568819e-10,
"loss": 0.0,
"step": 12400
},
{
"epoch": 9.992390869042852,
"step": 12480,
"total_flos": 2784163811819520.0,
"train_loss": 0.025371345406674895,
"train_runtime": 36703.9164,
"train_samples_per_second": 348.252,
"train_steps_per_second": 0.34
}
],
"logging_steps": 100,
"max_steps": 12480,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2784163811819520.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}