{
"best_metric": 0.17886345088481903,
"best_model_checkpoint": "saves/chess/generate_strategy/checkpoint-19208",
"epoch": 9.996042491147678,
"eval_steps": 500,
"global_step": 24000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0416579879191835,
"grad_norm": 8.262849587594042,
"learning_rate": 2.0833333333333333e-07,
"loss": 3.9539,
"step": 100
},
{
"epoch": 0.083315975838367,
"grad_norm": 2.1815007336055197,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.4086,
"step": 200
},
{
"epoch": 0.12497396375755052,
"grad_norm": 1.094766614987478,
"learning_rate": 6.25e-07,
"loss": 0.2144,
"step": 300
},
{
"epoch": 0.166631951676734,
"grad_norm": 1.015902700288932,
"learning_rate": 8.333333333333333e-07,
"loss": 0.2103,
"step": 400
},
{
"epoch": 0.20828993959591752,
"grad_norm": 1.083927107302103,
"learning_rate": 1.0416666666666667e-06,
"loss": 0.2075,
"step": 500
},
{
"epoch": 0.24994792751510103,
"grad_norm": 0.8787980351861964,
"learning_rate": 1.25e-06,
"loss": 0.2049,
"step": 600
},
{
"epoch": 0.29160591543428455,
"grad_norm": 0.5454433660253264,
"learning_rate": 1.4583333333333335e-06,
"loss": 0.2001,
"step": 700
},
{
"epoch": 0.333263903353468,
"grad_norm": 0.6745519185509095,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.1916,
"step": 800
},
{
"epoch": 0.3749218912726515,
"grad_norm": 0.3263511819812891,
"learning_rate": 1.8750000000000003e-06,
"loss": 0.1849,
"step": 900
},
{
"epoch": 0.41657987919183503,
"grad_norm": 3.199309878765134,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.1847,
"step": 1000
},
{
"epoch": 0.45823786711101855,
"grad_norm": 0.4060106618321982,
"learning_rate": 2.2916666666666666e-06,
"loss": 0.1845,
"step": 1100
},
{
"epoch": 0.49989585503020206,
"grad_norm": 0.36591848729629267,
"learning_rate": 2.5e-06,
"loss": 0.1818,
"step": 1200
},
{
"epoch": 0.5415538429493856,
"grad_norm": 0.35361804320631923,
"learning_rate": 2.7083333333333334e-06,
"loss": 0.1807,
"step": 1300
},
{
"epoch": 0.5832118308685691,
"grad_norm": 0.35892337648275896,
"learning_rate": 2.916666666666667e-06,
"loss": 0.1806,
"step": 1400
},
{
"epoch": 0.6248698187877526,
"grad_norm": 0.2820867931414937,
"learning_rate": 3.125e-06,
"loss": 0.1806,
"step": 1500
},
{
"epoch": 0.666527806706936,
"grad_norm": 0.3098924570604735,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1808,
"step": 1600
},
{
"epoch": 0.7081857946261195,
"grad_norm": 0.29714949257038253,
"learning_rate": 3.5416666666666673e-06,
"loss": 0.1803,
"step": 1700
},
{
"epoch": 0.749843782545303,
"grad_norm": 0.302226244442205,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1805,
"step": 1800
},
{
"epoch": 0.7915017704644866,
"grad_norm": 0.3329180855942572,
"learning_rate": 3.958333333333333e-06,
"loss": 0.1833,
"step": 1900
},
{
"epoch": 0.8331597583836701,
"grad_norm": 0.28770265809452183,
"learning_rate": 4.166666666666667e-06,
"loss": 0.1807,
"step": 2000
},
{
"epoch": 0.8748177463028536,
"grad_norm": 0.3308819875323557,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.1804,
"step": 2100
},
{
"epoch": 0.9164757342220371,
"grad_norm": 0.3163212399640271,
"learning_rate": 4.583333333333333e-06,
"loss": 0.1805,
"step": 2200
},
{
"epoch": 0.9581337221412206,
"grad_norm": 0.3898310274135571,
"learning_rate": 4.791666666666668e-06,
"loss": 0.1803,
"step": 2300
},
{
"epoch": 0.9997917100604041,
"grad_norm": 0.27784332983216586,
"learning_rate": 5e-06,
"loss": 0.1808,
"step": 2400
},
{
"epoch": 1.0,
"eval_loss": 0.18106774985790253,
"eval_runtime": 196.6682,
"eval_samples_per_second": 1388.623,
"eval_steps_per_second": 2.715,
"step": 2401
},
{
"epoch": 1.0412414080399917,
"grad_norm": 0.2936543487056633,
"learning_rate": 4.999735579817769e-06,
"loss": 0.1807,
"step": 2500
},
{
"epoch": 1.082899395959175,
"grad_norm": 0.2809875255295402,
"learning_rate": 4.998942375205502e-06,
"loss": 0.1801,
"step": 2600
},
{
"epoch": 1.1245573838783587,
"grad_norm": 0.2760622198201079,
"learning_rate": 4.997620553954645e-06,
"loss": 0.1801,
"step": 2700
},
{
"epoch": 1.166215371797542,
"grad_norm": 0.2710350326429577,
"learning_rate": 4.995770395678171e-06,
"loss": 0.1803,
"step": 2800
},
{
"epoch": 1.2078733597167257,
"grad_norm": 0.20931696168572392,
"learning_rate": 4.993392291751431e-06,
"loss": 0.1803,
"step": 2900
},
{
"epoch": 1.2495313476359091,
"grad_norm": 0.24323887106839603,
"learning_rate": 4.990486745229364e-06,
"loss": 0.1799,
"step": 3000
},
{
"epoch": 1.2911893355550927,
"grad_norm": 0.2815796357302052,
"learning_rate": 4.9870543707400835e-06,
"loss": 0.1798,
"step": 3100
},
{
"epoch": 1.3328473234742761,
"grad_norm": 0.23664820561946712,
"learning_rate": 4.983095894354858e-06,
"loss": 0.1801,
"step": 3200
},
{
"epoch": 1.3745053113934598,
"grad_norm": 0.3083911955290968,
"learning_rate": 4.978612153434527e-06,
"loss": 0.1801,
"step": 3300
},
{
"epoch": 1.4161632993126432,
"grad_norm": 0.24337206279187154,
"learning_rate": 4.973604096452361e-06,
"loss": 0.1799,
"step": 3400
},
{
"epoch": 1.4578212872318268,
"grad_norm": 0.2691338598173961,
"learning_rate": 4.968072782793436e-06,
"loss": 0.1798,
"step": 3500
},
{
"epoch": 1.4994792751510102,
"grad_norm": 0.1859964729302664,
"learning_rate": 4.962019382530521e-06,
"loss": 0.18,
"step": 3600
},
{
"epoch": 1.5411372630701936,
"grad_norm": 0.29588302582709847,
"learning_rate": 4.955445176176577e-06,
"loss": 0.18,
"step": 3700
},
{
"epoch": 1.5827952509893772,
"grad_norm": 0.24224751463035848,
"learning_rate": 4.948351554413879e-06,
"loss": 0.1993,
"step": 3800
},
{
"epoch": 1.6244532389085609,
"grad_norm": 0.24926986804364754,
"learning_rate": 4.9407400177998335e-06,
"loss": 0.1799,
"step": 3900
},
{
"epoch": 1.6661112268277443,
"grad_norm": 0.26907499271712193,
"learning_rate": 4.93261217644956e-06,
"loss": 0.1796,
"step": 4000
},
{
"epoch": 1.7077692147469277,
"grad_norm": 0.24652167596434857,
"learning_rate": 4.9239697496952904e-06,
"loss": 0.1797,
"step": 4100
},
{
"epoch": 1.7494272026661113,
"grad_norm": 0.26360641338937,
"learning_rate": 4.914814565722671e-06,
"loss": 0.1797,
"step": 4200
},
{
"epoch": 1.7910851905852947,
"grad_norm": 0.21211424396568565,
"learning_rate": 4.905148561184033e-06,
"loss": 0.1798,
"step": 4300
},
{
"epoch": 1.832743178504478,
"grad_norm": 0.23174306094818595,
"learning_rate": 4.894973780788722e-06,
"loss": 0.1798,
"step": 4400
},
{
"epoch": 1.8744011664236617,
"grad_norm": 0.20239856810705756,
"learning_rate": 4.884292376870567e-06,
"loss": 0.1797,
"step": 4500
},
{
"epoch": 1.9160591543428453,
"grad_norm": 0.20895880362963307,
"learning_rate": 4.873106608932585e-06,
"loss": 0.1796,
"step": 4600
},
{
"epoch": 1.9577171422620288,
"grad_norm": 0.2341875351736524,
"learning_rate": 4.861418843169012e-06,
"loss": 0.1797,
"step": 4700
},
{
"epoch": 1.9993751301812122,
"grad_norm": 0.20045835157915606,
"learning_rate": 4.849231551964771e-06,
"loss": 0.1796,
"step": 4800
},
{
"epoch": 2.0,
"eval_loss": 0.17972978949546814,
"eval_runtime": 196.3636,
"eval_samples_per_second": 1390.777,
"eval_steps_per_second": 2.719,
"step": 4802
},
{
"epoch": 2.0408248281607997,
"grad_norm": 0.21309941078379252,
"learning_rate": 4.836547313372472e-06,
"loss": 0.1795,
"step": 4900
},
{
"epoch": 2.0824828160799833,
"grad_norm": 0.19717578427183138,
"learning_rate": 4.823368810567056e-06,
"loss": 0.1794,
"step": 5000
},
{
"epoch": 2.124140803999167,
"grad_norm": 0.23023011075724995,
"learning_rate": 4.809698831278217e-06,
"loss": 0.1802,
"step": 5100
},
{
"epoch": 2.16579879191835,
"grad_norm": 0.21578484379978355,
"learning_rate": 4.7955402672006855e-06,
"loss": 0.18,
"step": 5200
},
{
"epoch": 2.2074567798375337,
"grad_norm": 0.21410225528440446,
"learning_rate": 4.780896113382536e-06,
"loss": 0.1798,
"step": 5300
},
{
"epoch": 2.2491147677567174,
"grad_norm": 0.24923656549560563,
"learning_rate": 4.765769467591626e-06,
"loss": 0.1796,
"step": 5400
},
{
"epoch": 2.290772755675901,
"grad_norm": 0.27043973727195314,
"learning_rate": 4.750163529660303e-06,
"loss": 0.1799,
"step": 5500
},
{
"epoch": 2.332430743595084,
"grad_norm": 0.20084508849747548,
"learning_rate": 4.734081600808531e-06,
"loss": 0.1796,
"step": 5600
},
{
"epoch": 2.374088731514268,
"grad_norm": 0.17037675166345598,
"learning_rate": 4.717527082945555e-06,
"loss": 0.1797,
"step": 5700
},
{
"epoch": 2.4157467194334514,
"grad_norm": 0.20792174660657012,
"learning_rate": 4.700503477950278e-06,
"loss": 0.1797,
"step": 5800
},
{
"epoch": 2.457404707352635,
"grad_norm": 0.20444912332175158,
"learning_rate": 4.6830143869304904e-06,
"loss": 0.1799,
"step": 5900
},
{
"epoch": 2.4990626952718182,
"grad_norm": 0.2160441899332462,
"learning_rate": 4.665063509461098e-06,
"loss": 0.1797,
"step": 6000
},
{
"epoch": 2.540720683191002,
"grad_norm": 0.25556787549882387,
"learning_rate": 4.646654642801533e-06,
"loss": 0.1794,
"step": 6100
},
{
"epoch": 2.5823786711101855,
"grad_norm": 0.22198410769602075,
"learning_rate": 4.627791681092499e-06,
"loss": 0.1794,
"step": 6200
},
{
"epoch": 2.624036659029369,
"grad_norm": 0.19549701905963526,
"learning_rate": 4.608478614532215e-06,
"loss": 0.1795,
"step": 6300
},
{
"epoch": 2.6656946469485523,
"grad_norm": 0.24454736703986502,
"learning_rate": 4.588719528532342e-06,
"loss": 0.1797,
"step": 6400
},
{
"epoch": 2.707352634867736,
"grad_norm": 0.20111965276500102,
"learning_rate": 4.568518602853776e-06,
"loss": 0.1797,
"step": 6500
},
{
"epoch": 2.7490106227869195,
"grad_norm": 0.2155615827433472,
"learning_rate": 4.54788011072248e-06,
"loss": 0.1796,
"step": 6600
},
{
"epoch": 2.7906686107061027,
"grad_norm": 0.23518049751986453,
"learning_rate": 4.526808417925531e-06,
"loss": 0.1796,
"step": 6700
},
{
"epoch": 2.8323265986252864,
"grad_norm": 0.2088881277827675,
"learning_rate": 4.50530798188761e-06,
"loss": 0.1795,
"step": 6800
},
{
"epoch": 2.87398458654447,
"grad_norm": 0.22027451607755855,
"learning_rate": 4.4833833507280884e-06,
"loss": 0.1794,
"step": 6900
},
{
"epoch": 2.9156425744636536,
"grad_norm": 0.20366425013850817,
"learning_rate": 4.46103916229894e-06,
"loss": 0.1793,
"step": 7000
},
{
"epoch": 2.957300562382837,
"grad_norm": 0.2718663681076218,
"learning_rate": 4.438280143203665e-06,
"loss": 0.1796,
"step": 7100
},
{
"epoch": 2.9989585503020204,
"grad_norm": 0.19182709064421555,
"learning_rate": 4.415111107797445e-06,
"loss": 0.1794,
"step": 7200
},
{
"epoch": 3.0,
"eval_loss": 0.1794959157705307,
"eval_runtime": 196.4289,
"eval_samples_per_second": 1390.315,
"eval_steps_per_second": 2.719,
"step": 7203
},
{
"epoch": 3.040408248281608,
"grad_norm": 0.195058367609666,
"learning_rate": 4.391536957168733e-06,
"loss": 0.1798,
"step": 7300
},
{
"epoch": 3.0820662362007916,
"grad_norm": 0.2256357073328012,
"learning_rate": 4.367562678102491e-06,
"loss": 0.1795,
"step": 7400
},
{
"epoch": 3.123724224119975,
"grad_norm": 0.2129481809880029,
"learning_rate": 4.34319334202531e-06,
"loss": 0.1795,
"step": 7500
},
{
"epoch": 3.1653822120391584,
"grad_norm": 0.1689665633552094,
"learning_rate": 4.318434103932622e-06,
"loss": 0.1795,
"step": 7600
},
{
"epoch": 3.207040199958342,
"grad_norm": 0.18434140023135,
"learning_rate": 4.293290201298224e-06,
"loss": 0.1796,
"step": 7700
},
{
"epoch": 3.2486981878775256,
"grad_norm": 0.2103528683280332,
"learning_rate": 4.267766952966369e-06,
"loss": 0.1793,
"step": 7800
},
{
"epoch": 3.290356175796709,
"grad_norm": 0.16087446181904855,
"learning_rate": 4.241869758026638e-06,
"loss": 0.1794,
"step": 7900
},
{
"epoch": 3.3320141637158924,
"grad_norm": 0.22569144057534085,
"learning_rate": 4.215604094671835e-06,
"loss": 0.1792,
"step": 8000
},
{
"epoch": 3.373672151635076,
"grad_norm": 0.19990473196998446,
"learning_rate": 4.188975519039151e-06,
"loss": 0.1794,
"step": 8100
},
{
"epoch": 3.4153301395542597,
"grad_norm": 0.1902243355455867,
"learning_rate": 4.161989664034844e-06,
"loss": 0.1794,
"step": 8200
},
{
"epoch": 3.456988127473443,
"grad_norm": 0.18824118604006632,
"learning_rate": 4.134652238142674e-06,
"loss": 0.1794,
"step": 8300
},
{
"epoch": 3.4986461153926265,
"grad_norm": 0.19597204875441573,
"learning_rate": 4.106969024216348e-06,
"loss": 0.1794,
"step": 8400
},
{
"epoch": 3.54030410331181,
"grad_norm": 0.17674897479656335,
"learning_rate": 4.078945878256244e-06,
"loss": 0.1793,
"step": 8500
},
{
"epoch": 3.5819620912309933,
"grad_norm": 0.19658906636767987,
"learning_rate": 4.0505887281706505e-06,
"loss": 0.1794,
"step": 8600
},
{
"epoch": 3.623620079150177,
"grad_norm": 0.1607909455989355,
"learning_rate": 4.021903572521802e-06,
"loss": 0.1794,
"step": 8700
},
{
"epoch": 3.6652780670693605,
"grad_norm": 0.18982136425367155,
"learning_rate": 3.992896479256966e-06,
"loss": 0.1793,
"step": 8800
},
{
"epoch": 3.706936054988544,
"grad_norm": 0.18212426964310202,
"learning_rate": 3.963573584424852e-06,
"loss": 0.1794,
"step": 8900
},
{
"epoch": 3.748594042907728,
"grad_norm": 0.18731109638030716,
"learning_rate": 3.933941090877615e-06,
"loss": 0.1799,
"step": 9000
},
{
"epoch": 3.790252030826911,
"grad_norm": 0.2243920924541318,
"learning_rate": 3.9040052669587325e-06,
"loss": 0.1863,
"step": 9100
},
{
"epoch": 3.8319100187460946,
"grad_norm": 0.19665494095424324,
"learning_rate": 3.8737724451770155e-06,
"loss": 0.1793,
"step": 9200
},
{
"epoch": 3.8735680066652782,
"grad_norm": 0.1709097835399287,
"learning_rate": 3.8432490208670605e-06,
"loss": 0.1792,
"step": 9300
},
{
"epoch": 3.9152259945844614,
"grad_norm": 0.1519558310026607,
"learning_rate": 3.8124414508364005e-06,
"loss": 0.1792,
"step": 9400
},
{
"epoch": 3.956883982503645,
"grad_norm": 0.18615584510557248,
"learning_rate": 3.7813562519996633e-06,
"loss": 0.1791,
"step": 9500
},
{
"epoch": 3.9985419704228287,
"grad_norm": 0.14216906700933155,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1792,
"step": 9600
},
{
"epoch": 4.0,
"eval_loss": 0.17919312417507172,
"eval_runtime": 196.5199,
"eval_samples_per_second": 1389.671,
"eval_steps_per_second": 2.717,
"step": 9604
},
{
"epoch": 4.039991668402416,
"grad_norm": 0.1981373334933009,
"learning_rate": 3.7183793278181063e-06,
"loss": 0.1793,
"step": 9700
},
{
"epoch": 4.081649656321599,
"grad_norm": 0.1796707844873524,
"learning_rate": 3.6865009243691015e-06,
"loss": 0.1791,
"step": 9800
},
{
"epoch": 4.123307644240783,
"grad_norm": 0.21582792834146144,
"learning_rate": 3.654371533087586e-06,
"loss": 0.1792,
"step": 9900
},
{
"epoch": 4.164965632159967,
"grad_norm": 0.22285894509633086,
"learning_rate": 3.621997950501156e-06,
"loss": 0.179,
"step": 10000
},
{
"epoch": 4.20662362007915,
"grad_norm": 0.1947839176316504,
"learning_rate": 3.5893870247926986e-06,
"loss": 0.1792,
"step": 10100
},
{
"epoch": 4.248281607998334,
"grad_norm": 0.18044045004936568,
"learning_rate": 3.556545654351749e-06,
"loss": 0.1791,
"step": 10200
},
{
"epoch": 4.2899395959175175,
"grad_norm": 0.21629122720481903,
"learning_rate": 3.5234807863152316e-06,
"loss": 0.1793,
"step": 10300
},
{
"epoch": 4.3315975838367,
"grad_norm": 0.15404290423986947,
"learning_rate": 3.4901994150978926e-06,
"loss": 0.1791,
"step": 10400
},
{
"epoch": 4.373255571755884,
"grad_norm": 0.16032922618842949,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.1791,
"step": 10500
},
{
"epoch": 4.4149135596750675,
"grad_norm": 0.1495191719599753,
"learning_rate": 3.4230153682817112e-06,
"loss": 0.1791,
"step": 10600
},
{
"epoch": 4.456571547594251,
"grad_norm": 0.19697439856186114,
"learning_rate": 3.389126904537192e-06,
"loss": 0.1791,
"step": 10700
},
{
"epoch": 4.498229535513435,
"grad_norm": 0.17156322418134476,
"learning_rate": 3.3550503583141726e-06,
"loss": 0.1791,
"step": 10800
},
{
"epoch": 4.539887523432618,
"grad_norm": 0.1561878142062692,
"learning_rate": 3.3207929380339034e-06,
"loss": 0.1792,
"step": 10900
},
{
"epoch": 4.581545511351802,
"grad_norm": 0.1828679685381653,
"learning_rate": 3.2863618903790346e-06,
"loss": 0.1791,
"step": 11000
},
{
"epoch": 4.623203499270986,
"grad_norm": 0.1802733896031037,
"learning_rate": 3.2517644987606827e-06,
"loss": 0.1792,
"step": 11100
},
{
"epoch": 4.664861487190168,
"grad_norm": 0.15579534435978112,
"learning_rate": 3.217008081777726e-06,
"loss": 0.1791,
"step": 11200
},
{
"epoch": 4.706519475109352,
"grad_norm": 0.16638908065693153,
"learning_rate": 3.182099991668653e-06,
"loss": 0.1791,
"step": 11300
},
{
"epoch": 4.748177463028536,
"grad_norm": 0.18397163828033228,
"learning_rate": 3.147047612756302e-06,
"loss": 0.1792,
"step": 11400
},
{
"epoch": 4.789835450947719,
"grad_norm": 0.17751483450519995,
"learning_rate": 3.1118583598858097e-06,
"loss": 0.179,
"step": 11500
},
{
"epoch": 4.831493438866903,
"grad_norm": 0.1808778224251496,
"learning_rate": 3.0765396768561005e-06,
"loss": 0.179,
"step": 11600
},
{
"epoch": 4.8731514267860865,
"grad_norm": 0.17593346330767928,
"learning_rate": 3.0410990348452572e-06,
"loss": 0.1793,
"step": 11700
},
{
"epoch": 4.91480941470527,
"grad_norm": 0.15824861181745342,
"learning_rate": 3.0055439308300954e-06,
"loss": 0.1791,
"step": 11800
},
{
"epoch": 4.956467402624453,
"grad_norm": 0.21055777806239853,
"learning_rate": 2.96988188600028e-06,
"loss": 0.1792,
"step": 11900
},
{
"epoch": 4.9981253905436365,
"grad_norm": 0.15352806003656314,
"learning_rate": 2.9341204441673267e-06,
"loss": 0.1791,
"step": 12000
},
{
"epoch": 5.0,
"eval_loss": 0.17911389470100403,
"eval_runtime": 196.4564,
"eval_samples_per_second": 1390.12,
"eval_steps_per_second": 2.718,
"step": 12005
},
{
"epoch": 5.0395750885232244,
"grad_norm": 0.1891820592041876,
"learning_rate": 2.898267170168807e-06,
"loss": 0.1791,
"step": 12100
},
{
"epoch": 5.081233076442408,
"grad_norm": 0.14302405130068518,
"learning_rate": 2.862329648268117e-06,
"loss": 0.1789,
"step": 12200
},
{
"epoch": 5.122891064361592,
"grad_norm": 0.2215960599158716,
"learning_rate": 2.82631548055013e-06,
"loss": 0.1792,
"step": 12300
},
{
"epoch": 5.164549052280774,
"grad_norm": 0.1566593937408507,
"learning_rate": 2.7902322853130758e-06,
"loss": 0.179,
"step": 12400
},
{
"epoch": 5.206207040199958,
"grad_norm": 0.15513379693358573,
"learning_rate": 2.754087695457005e-06,
"loss": 0.1791,
"step": 12500
},
{
"epoch": 5.247865028119142,
"grad_norm": 0.14968722299942713,
"learning_rate": 2.717889356869146e-06,
"loss": 0.179,
"step": 12600
},
{
"epoch": 5.289523016038325,
"grad_norm": 0.2097123380235341,
"learning_rate": 2.681644926806527e-06,
"loss": 0.179,
"step": 12700
},
{
"epoch": 5.331181003957509,
"grad_norm": 0.19315969222642626,
"learning_rate": 2.6453620722761897e-06,
"loss": 0.179,
"step": 12800
},
{
"epoch": 5.372838991876693,
"grad_norm": 0.2209634744371871,
"learning_rate": 2.6090484684133406e-06,
"loss": 0.1791,
"step": 12900
},
{
"epoch": 5.414496979795876,
"grad_norm": 0.20430693758591473,
"learning_rate": 2.572711796857779e-06,
"loss": 0.179,
"step": 13000
},
{
"epoch": 5.45615496771506,
"grad_norm": 0.18903967369853375,
"learning_rate": 2.5363597441289574e-06,
"loss": 0.179,
"step": 13100
},
{
"epoch": 5.4978129556342425,
"grad_norm": 0.15616083753477006,
"learning_rate": 2.5e-06,
"loss": 0.179,
"step": 13200
},
{
"epoch": 5.539470943553426,
"grad_norm": 0.1507559008561688,
"learning_rate": 2.4636402558710434e-06,
"loss": 0.1791,
"step": 13300
},
{
"epoch": 5.58112893147261,
"grad_norm": 0.16640062646644058,
"learning_rate": 2.4272882031422216e-06,
"loss": 0.179,
"step": 13400
},
{
"epoch": 5.622786919391793,
"grad_norm": 0.1824434916593794,
"learning_rate": 2.3909515315866606e-06,
"loss": 0.1791,
"step": 13500
},
{
"epoch": 5.664444907310977,
"grad_norm": 0.2004975100759413,
"learning_rate": 2.3546379277238107e-06,
"loss": 0.179,
"step": 13600
},
{
"epoch": 5.706102895230161,
"grad_norm": 0.17154522514366766,
"learning_rate": 2.318355073193474e-06,
"loss": 0.1791,
"step": 13700
},
{
"epoch": 5.747760883149343,
"grad_norm": 0.13248550006328844,
"learning_rate": 2.2821106431308546e-06,
"loss": 0.179,
"step": 13800
},
{
"epoch": 5.789418871068527,
"grad_norm": 0.1915171020600886,
"learning_rate": 2.2459123045429953e-06,
"loss": 0.1792,
"step": 13900
},
{
"epoch": 5.831076858987711,
"grad_norm": 0.16235356856597902,
"learning_rate": 2.2097677146869242e-06,
"loss": 0.1791,
"step": 14000
},
{
"epoch": 5.872734846906894,
"grad_norm": 0.1627140490119954,
"learning_rate": 2.173684519449872e-06,
"loss": 0.1789,
"step": 14100
},
{
"epoch": 5.914392834826078,
"grad_norm": 0.16466884224746445,
"learning_rate": 2.1376703517318835e-06,
"loss": 0.179,
"step": 14200
},
{
"epoch": 5.9560508227452615,
"grad_norm": 0.20611687756993843,
"learning_rate": 2.101732829831194e-06,
"loss": 0.179,
"step": 14300
},
{
"epoch": 5.997708810664445,
"grad_norm": 0.16559158144998481,
"learning_rate": 2.0658795558326745e-06,
"loss": 0.179,
"step": 14400
},
{
"epoch": 6.0,
"eval_loss": 0.17907121777534485,
"eval_runtime": 196.4273,
"eval_samples_per_second": 1390.326,
"eval_steps_per_second": 2.719,
"step": 14406
},
{
"epoch": 6.039158508644032,
"grad_norm": 0.16927649861039284,
"learning_rate": 2.0301181139997206e-06,
"loss": 0.1789,
"step": 14500
},
{
"epoch": 6.080816496563216,
"grad_norm": 0.1752142512252337,
"learning_rate": 1.994456069169906e-06,
"loss": 0.179,
"step": 14600
},
{
"epoch": 6.1224744844823995,
"grad_norm": 0.21170178196900302,
"learning_rate": 1.958900965154743e-06,
"loss": 0.1789,
"step": 14700
},
{
"epoch": 6.164132472401583,
"grad_norm": 0.21884267966966597,
"learning_rate": 1.9234603231439e-06,
"loss": 0.1788,
"step": 14800
},
{
"epoch": 6.205790460320767,
"grad_norm": 0.17106948371146288,
"learning_rate": 1.8881416401141905e-06,
"loss": 0.1788,
"step": 14900
},
{
"epoch": 6.24744844823995,
"grad_norm": 0.174097273230219,
"learning_rate": 1.852952387243698e-06,
"loss": 0.1788,
"step": 15000
},
{
"epoch": 6.289106436159133,
"grad_norm": 0.20862365699110258,
"learning_rate": 1.8179000083313483e-06,
"loss": 0.1788,
"step": 15100
},
{
"epoch": 6.330764424078317,
"grad_norm": 0.17885797151549512,
"learning_rate": 1.7829919182222752e-06,
"loss": 0.1788,
"step": 15200
},
{
"epoch": 6.3724224119975,
"grad_norm": 0.19498914359958716,
"learning_rate": 1.7482355012393177e-06,
"loss": 0.1789,
"step": 15300
},
{
"epoch": 6.414080399916684,
"grad_norm": 0.1389966716220221,
"learning_rate": 1.7136381096209665e-06,
"loss": 0.179,
"step": 15400
},
{
"epoch": 6.455738387835868,
"grad_norm": 0.1786092324697337,
"learning_rate": 1.6792070619660977e-06,
"loss": 0.179,
"step": 15500
},
{
"epoch": 6.497396375755051,
"grad_norm": 0.19161758807721282,
"learning_rate": 1.6449496416858285e-06,
"loss": 0.1788,
"step": 15600
},
{
"epoch": 6.539054363674235,
"grad_norm": 0.19197303954060144,
"learning_rate": 1.6108730954628093e-06,
"loss": 0.1788,
"step": 15700
},
{
"epoch": 6.580712351593418,
"grad_norm": 0.16743828588501417,
"learning_rate": 1.5769846317182894e-06,
"loss": 0.1787,
"step": 15800
},
{
"epoch": 6.622370339512601,
"grad_norm": 0.16492318029574304,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.1788,
"step": 15900
},
{
"epoch": 6.664028327431785,
"grad_norm": 0.15440438163304784,
"learning_rate": 1.509800584902108e-06,
"loss": 0.1789,
"step": 16000
},
{
"epoch": 6.7056863153509685,
"grad_norm": 0.17667275704806315,
"learning_rate": 1.4765192136847686e-06,
"loss": 0.1789,
"step": 16100
},
{
"epoch": 6.747344303270152,
"grad_norm": 0.17904015323124156,
"learning_rate": 1.443454345648252e-06,
"loss": 0.1789,
"step": 16200
},
{
"epoch": 6.789002291189336,
"grad_norm": 0.16736730033822061,
"learning_rate": 1.4106129752073023e-06,
"loss": 0.179,
"step": 16300
},
{
"epoch": 6.830660279108519,
"grad_norm": 0.16038102753372047,
"learning_rate": 1.3780020494988447e-06,
"loss": 0.179,
"step": 16400
},
{
"epoch": 6.872318267027703,
"grad_norm": 0.15315299560909978,
"learning_rate": 1.3456284669124159e-06,
"loss": 0.1786,
"step": 16500
},
{
"epoch": 6.913976254946886,
"grad_norm": 0.1430660492396621,
"learning_rate": 1.313499075630899e-06,
"loss": 0.179,
"step": 16600
},
{
"epoch": 6.955634242866069,
"grad_norm": 0.17326024703322063,
"learning_rate": 1.2816206721818944e-06,
"loss": 0.1789,
"step": 16700
},
{
"epoch": 6.997292230785253,
"grad_norm": 0.14987232796770428,
"learning_rate": 1.2500000000000007e-06,
"loss": 0.1787,
"step": 16800
},
{
"epoch": 7.0,
"eval_loss": 0.17893224954605103,
"eval_runtime": 196.4121,
"eval_samples_per_second": 1390.434,
"eval_steps_per_second": 2.719,
"step": 16807
},
{
"epoch": 7.038741928764841,
"grad_norm": 0.1439804790666206,
"learning_rate": 1.218643748000337e-06,
"loss": 0.1787,
"step": 16900
},
{
"epoch": 7.080399916684025,
"grad_norm": 0.1820620837643405,
"learning_rate": 1.1875585491636e-06,
"loss": 0.1788,
"step": 17000
},
{
"epoch": 7.122057904603207,
"grad_norm": 0.1619570282327302,
"learning_rate": 1.1567509791329402e-06,
"loss": 0.1786,
"step": 17100
},
{
"epoch": 7.163715892522391,
"grad_norm": 0.2470491812569796,
"learning_rate": 1.1262275548229852e-06,
"loss": 0.1791,
"step": 17200
},
{
"epoch": 7.205373880441575,
"grad_norm": 0.18058952670407366,
"learning_rate": 1.0959947330412681e-06,
"loss": 0.1789,
"step": 17300
},
{
"epoch": 7.247031868360758,
"grad_norm": 0.20589528394837478,
"learning_rate": 1.0660589091223854e-06,
"loss": 0.1786,
"step": 17400
},
{
"epoch": 7.288689856279942,
"grad_norm": 0.13562633767825757,
"learning_rate": 1.0364264155751489e-06,
"loss": 0.1786,
"step": 17500
},
{
"epoch": 7.330347844199125,
"grad_norm": 0.194696644563295,
"learning_rate": 1.0071035207430352e-06,
"loss": 0.1787,
"step": 17600
},
{
"epoch": 7.372005832118309,
"grad_norm": 0.19213496981753242,
"learning_rate": 9.780964274781984e-07,
"loss": 0.1786,
"step": 17700
},
{
"epoch": 7.413663820037492,
"grad_norm": 0.19876379595232896,
"learning_rate": 9.494112718293503e-07,
"loss": 0.1787,
"step": 17800
},
{
"epoch": 7.455321807956675,
"grad_norm": 0.1684329683430977,
"learning_rate": 9.210541217437566e-07,
"loss": 0.1787,
"step": 17900
},
{
"epoch": 7.496979795875859,
"grad_norm": 0.1823625942631362,
"learning_rate": 8.930309757836517e-07,
"loss": 0.1785,
"step": 18000
},
{
"epoch": 7.538637783795043,
"grad_norm": 0.18725762365246973,
"learning_rate": 8.653477618573261e-07,
"loss": 0.1786,
"step": 18100
},
{
"epoch": 7.580295771714226,
"grad_norm": 0.1507247392992477,
"learning_rate": 8.380103359651554e-07,
"loss": 0.1787,
"step": 18200
},
{
"epoch": 7.62195375963341,
"grad_norm": 0.18505299719524845,
"learning_rate": 8.110244809608494e-07,
"loss": 0.1786,
"step": 18300
},
{
"epoch": 7.663611747552594,
"grad_norm": 0.12101506184025812,
"learning_rate": 7.843959053281663e-07,
"loss": 0.1786,
"step": 18400
},
{
"epoch": 7.705269735471777,
"grad_norm": 0.16939344528667466,
"learning_rate": 7.581302419733633e-07,
"loss": 0.1785,
"step": 18500
},
{
"epoch": 7.74692772339096,
"grad_norm": 0.13840737012325652,
"learning_rate": 7.322330470336314e-07,
"loss": 0.1785,
"step": 18600
},
{
"epoch": 7.7885857113101435,
"grad_norm": 0.16859264286478876,
"learning_rate": 7.067097987017762e-07,
"loss": 0.1787,
"step": 18700
},
{
"epoch": 7.830243699229327,
"grad_norm": 0.1897535110592711,
"learning_rate": 6.815658960673782e-07,
"loss": 0.1785,
"step": 18800
},
{
"epoch": 7.871901687148511,
"grad_norm": 0.18368265058091485,
"learning_rate": 6.568066579746901e-07,
"loss": 0.1785,
"step": 18900
},
{
"epoch": 7.913559675067694,
"grad_norm": 0.13696515467419504,
"learning_rate": 6.324373218975105e-07,
"loss": 0.1786,
"step": 19000
},
{
"epoch": 7.955217662986878,
"grad_norm": 0.14354515830035847,
"learning_rate": 6.084630428312679e-07,
"loss": 0.1785,
"step": 19100
},
{
"epoch": 7.996875650906061,
"grad_norm": 0.15165778139105265,
"learning_rate": 5.848888922025553e-07,
"loss": 0.1786,
"step": 19200
},
{
"epoch": 8.0,
"eval_loss": 0.17886345088481903,
"eval_runtime": 196.5554,
"eval_samples_per_second": 1389.42,
"eval_steps_per_second": 2.717,
"step": 19208
},
{
"epoch": 8.03832534888565,
"grad_norm": 0.15763312404128105,
"learning_rate": 5.617198567963353e-07,
"loss": 0.1783,
"step": 19300
},
{
"epoch": 8.079983336804831,
"grad_norm": 0.1720429493205497,
"learning_rate": 5.389608377010608e-07,
"loss": 0.1783,
"step": 19400
},
{
"epoch": 8.121641324724015,
"grad_norm": 0.1690726413308925,
"learning_rate": 5.166166492719124e-07,
"loss": 0.1783,
"step": 19500
},
{
"epoch": 8.163299312643199,
"grad_norm": 0.17909925356768044,
"learning_rate": 4.946920181123904e-07,
"loss": 0.1782,
"step": 19600
},
{
"epoch": 8.204957300562382,
"grad_norm": 0.22116088190481087,
"learning_rate": 4.7319158207446953e-07,
"loss": 0.1782,
"step": 19700
},
{
"epoch": 8.246615288481566,
"grad_norm": 0.16383363990929287,
"learning_rate": 4.5211988927752026e-07,
"loss": 0.1782,
"step": 19800
},
{
"epoch": 8.28827327640075,
"grad_norm": 0.18255215192836688,
"learning_rate": 4.3148139714622365e-07,
"loss": 0.1782,
"step": 19900
},
{
"epoch": 8.329931264319933,
"grad_norm": 0.19783668808521335,
"learning_rate": 4.1128047146765936e-07,
"loss": 0.1781,
"step": 20000
},
{
"epoch": 8.371589252239117,
"grad_norm": 0.1828620345488146,
"learning_rate": 3.915213854677863e-07,
"loss": 0.1781,
"step": 20100
},
{
"epoch": 8.4132472401583,
"grad_norm": 0.1461266269903454,
"learning_rate": 3.722083189075007e-07,
"loss": 0.1782,
"step": 20200
},
{
"epoch": 8.454905228077484,
"grad_norm": 0.19063937525748337,
"learning_rate": 3.5334535719846767e-07,
"loss": 0.1781,
"step": 20300
},
{
"epoch": 8.496563215996668,
"grad_norm": 0.12678778363904367,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.1781,
"step": 20400
},
{
"epoch": 8.538221203915851,
"grad_norm": 0.15880039262804566,
"learning_rate": 3.1698561306951065e-07,
"loss": 0.1782,
"step": 20500
},
{
"epoch": 8.579879191835035,
"grad_norm": 0.18763241075198428,
"learning_rate": 2.9949652204972257e-07,
"loss": 0.178,
"step": 20600
},
{
"epoch": 8.621537179754219,
"grad_norm": 0.1582482612527278,
"learning_rate": 2.8247291705444575e-07,
"loss": 0.1778,
"step": 20700
},
{
"epoch": 8.6631951676734,
"grad_norm": 0.181992432758085,
"learning_rate": 2.6591839919146963e-07,
"loss": 0.178,
"step": 20800
},
{
"epoch": 8.704853155592584,
"grad_norm": 0.1463913122272469,
"learning_rate": 2.4983647033969714e-07,
"loss": 0.1783,
"step": 20900
},
{
"epoch": 8.746511143511768,
"grad_norm": 0.15649171707147957,
"learning_rate": 2.3423053240837518e-07,
"loss": 0.1781,
"step": 21000
},
{
"epoch": 8.788169131430951,
"grad_norm": 0.16428482803404829,
"learning_rate": 2.1910388661746495e-07,
"loss": 0.1782,
"step": 21100
},
{
"epoch": 8.829827119350135,
"grad_norm": 0.19349382720192548,
"learning_rate": 2.044597327993153e-07,
"loss": 0.1781,
"step": 21200
},
{
"epoch": 8.871485107269319,
"grad_norm": 0.1678737628788564,
"learning_rate": 1.9030116872178317e-07,
"loss": 0.1781,
"step": 21300
},
{
"epoch": 8.913143095188502,
"grad_norm": 0.187501462753097,
"learning_rate": 1.7663118943294367e-07,
"loss": 0.1781,
"step": 21400
},
{
"epoch": 8.954801083107686,
"grad_norm": 0.17102799413092362,
"learning_rate": 1.6345268662752904e-07,
"loss": 0.1781,
"step": 21500
},
{
"epoch": 8.99645907102687,
"grad_norm": 0.14591121551272715,
"learning_rate": 1.507684480352292e-07,
"loss": 0.1781,
"step": 21600
},
{
"epoch": 9.0,
"eval_loss": 0.17907947301864624,
"eval_runtime": 196.3329,
"eval_samples_per_second": 1390.995,
"eval_steps_per_second": 2.72,
"step": 21609
},
{
"epoch": 9.037908769006457,
"grad_norm": 0.1816902644971728,
"learning_rate": 1.3858115683098832e-07,
"loss": 0.177,
"step": 21700
},
{
"epoch": 9.07956675692564,
"grad_norm": 0.18741449385017522,
"learning_rate": 1.2689339106741529e-07,
"loss": 0.1767,
"step": 21800
},
{
"epoch": 9.121224744844824,
"grad_norm": 0.20197534473429568,
"learning_rate": 1.1570762312943295e-07,
"loss": 0.1768,
"step": 21900
},
{
"epoch": 9.162882732764007,
"grad_norm": 0.21639195747399645,
"learning_rate": 1.0502621921127776e-07,
"loss": 0.1767,
"step": 22000
},
{
"epoch": 9.204540720683191,
"grad_norm": 0.18933606645836426,
"learning_rate": 9.485143881596715e-08,
"loss": 0.1768,
"step": 22100
},
{
"epoch": 9.246198708602375,
"grad_norm": 0.1960648079791721,
"learning_rate": 8.518543427732951e-08,
"loss": 0.1767,
"step": 22200
},
{
"epoch": 9.287856696521558,
"grad_norm": 0.18056583891057434,
"learning_rate": 7.603025030471001e-08,
"loss": 0.1766,
"step": 22300
},
{
"epoch": 9.329514684440742,
"grad_norm": 0.18480124722464905,
"learning_rate": 6.738782355044048e-08,
"loss": 0.1769,
"step": 22400
},
{
"epoch": 9.371172672359926,
"grad_norm": 0.22786425388668805,
"learning_rate": 5.92599822001666e-08,
"loss": 0.1767,
"step": 22500
},
{
"epoch": 9.41283066027911,
"grad_norm": 0.2205541920741548,
"learning_rate": 5.164844558612131e-08,
"loss": 0.1766,
"step": 22600
},
{
"epoch": 9.454488648198293,
"grad_norm": 0.2134938008984885,
"learning_rate": 4.455482382342336e-08,
"loss": 0.1767,
"step": 22700
},
{
"epoch": 9.496146636117475,
"grad_norm": 0.23030736326238382,
"learning_rate": 3.798061746947995e-08,
"loss": 0.1767,
"step": 22800
},
{
"epoch": 9.537804624036658,
"grad_norm": 0.2214355490299709,
"learning_rate": 3.1927217206564884e-08,
"loss": 0.1767,
"step": 22900
},
{
"epoch": 9.579462611955842,
"grad_norm": 0.2291392443441154,
"learning_rate": 2.6395903547638825e-08,
"loss": 0.1765,
"step": 23000
},
{
"epoch": 9.621120599875026,
"grad_norm": 0.22120778210484332,
"learning_rate": 2.1387846565474047e-08,
"loss": 0.1765,
"step": 23100
},
{
"epoch": 9.66277858779421,
"grad_norm": 0.1927066727358843,
"learning_rate": 1.6904105645142443e-08,
"loss": 0.1765,
"step": 23200
},
{
"epoch": 9.704436575713393,
"grad_norm": 0.2369391538896648,
"learning_rate": 1.2945629259917547e-08,
"loss": 0.1766,
"step": 23300
},
{
"epoch": 9.746094563632576,
"grad_norm": 0.21269587694232558,
"learning_rate": 9.513254770636138e-09,
"loss": 0.1767,
"step": 23400
},
{
"epoch": 9.78775255155176,
"grad_norm": 0.20767475535201343,
"learning_rate": 6.607708248569378e-09,
"loss": 0.1766,
"step": 23500
},
{
"epoch": 9.829410539470944,
"grad_norm": 0.21058981271348698,
"learning_rate": 4.229604321829561e-09,
"loss": 0.1766,
"step": 23600
},
{
"epoch": 9.871068527390127,
"grad_norm": 0.18917603463369678,
"learning_rate": 2.3794460453555046e-09,
"loss": 0.1766,
"step": 23700
},
{
"epoch": 9.912726515309311,
"grad_norm": 0.18145195315540197,
"learning_rate": 1.0576247944985018e-09,
"loss": 0.1767,
"step": 23800
},
{
"epoch": 9.954384503228495,
"grad_norm": 0.22385123601872012,
"learning_rate": 2.6442018223132857e-10,
"loss": 0.1766,
"step": 23900
},
{
"epoch": 9.996042491147678,
"grad_norm": 0.22063368359660335,
"learning_rate": 0.0,
"loss": 0.1766,
"step": 24000
},
{
"epoch": 9.996042491147678,
"eval_loss": 0.18023133277893066,
"eval_runtime": 196.0313,
"eval_samples_per_second": 1393.135,
"eval_steps_per_second": 2.724,
"step": 24000
},
{
"epoch": 9.996042491147678,
"step": 24000,
"total_flos": 5485114750402560.0,
"train_loss": 0.19645737719535827,
"train_runtime": 70712.6152,
"train_samples_per_second": 347.587,
"train_steps_per_second": 0.339
}
],
"logging_steps": 100,
"max_steps": 24000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5485114750402560.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}