{ "best_metric": 0.17886345088481903, "best_model_checkpoint": "saves/chess/generate_strategy/checkpoint-19208", "epoch": 9.996042491147678, "eval_steps": 500, "global_step": 24000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0416579879191835, "grad_norm": 8.262849587594042, "learning_rate": 2.0833333333333333e-07, "loss": 3.9539, "step": 100 }, { "epoch": 0.083315975838367, "grad_norm": 2.1815007336055197, "learning_rate": 4.1666666666666667e-07, "loss": 0.4086, "step": 200 }, { "epoch": 0.12497396375755052, "grad_norm": 1.094766614987478, "learning_rate": 6.25e-07, "loss": 0.2144, "step": 300 }, { "epoch": 0.166631951676734, "grad_norm": 1.015902700288932, "learning_rate": 8.333333333333333e-07, "loss": 0.2103, "step": 400 }, { "epoch": 0.20828993959591752, "grad_norm": 1.083927107302103, "learning_rate": 1.0416666666666667e-06, "loss": 0.2075, "step": 500 }, { "epoch": 0.24994792751510103, "grad_norm": 0.8787980351861964, "learning_rate": 1.25e-06, "loss": 0.2049, "step": 600 }, { "epoch": 0.29160591543428455, "grad_norm": 0.5454433660253264, "learning_rate": 1.4583333333333335e-06, "loss": 0.2001, "step": 700 }, { "epoch": 0.333263903353468, "grad_norm": 0.6745519185509095, "learning_rate": 1.6666666666666667e-06, "loss": 0.1916, "step": 800 }, { "epoch": 0.3749218912726515, "grad_norm": 0.3263511819812891, "learning_rate": 1.8750000000000003e-06, "loss": 0.1849, "step": 900 }, { "epoch": 0.41657987919183503, "grad_norm": 3.199309878765134, "learning_rate": 2.0833333333333334e-06, "loss": 0.1847, "step": 1000 }, { "epoch": 0.45823786711101855, "grad_norm": 0.4060106618321982, "learning_rate": 2.2916666666666666e-06, "loss": 0.1845, "step": 1100 }, { "epoch": 0.49989585503020206, "grad_norm": 0.36591848729629267, "learning_rate": 2.5e-06, "loss": 0.1818, "step": 1200 }, { "epoch": 0.5415538429493856, "grad_norm": 0.35361804320631923, "learning_rate": 2.7083333333333334e-06, "loss": 0.1807, "step": 1300 }, { "epoch": 0.5832118308685691, "grad_norm": 0.35892337648275896, "learning_rate": 2.916666666666667e-06, "loss": 0.1806, "step": 1400 }, { "epoch": 0.6248698187877526, "grad_norm": 0.2820867931414937, "learning_rate": 3.125e-06, "loss": 0.1806, "step": 1500 }, { "epoch": 0.666527806706936, "grad_norm": 0.3098924570604735, "learning_rate": 3.3333333333333333e-06, "loss": 0.1808, "step": 1600 }, { "epoch": 0.7081857946261195, "grad_norm": 0.29714949257038253, "learning_rate": 3.5416666666666673e-06, "loss": 0.1803, "step": 1700 }, { "epoch": 0.749843782545303, "grad_norm": 0.302226244442205, "learning_rate": 3.7500000000000005e-06, "loss": 0.1805, "step": 1800 }, { "epoch": 0.7915017704644866, "grad_norm": 0.3329180855942572, "learning_rate": 3.958333333333333e-06, "loss": 0.1833, "step": 1900 }, { "epoch": 0.8331597583836701, "grad_norm": 0.28770265809452183, "learning_rate": 4.166666666666667e-06, "loss": 0.1807, "step": 2000 }, { "epoch": 0.8748177463028536, "grad_norm": 0.3308819875323557, "learning_rate": 4.3750000000000005e-06, "loss": 0.1804, "step": 2100 }, { "epoch": 0.9164757342220371, "grad_norm": 0.3163212399640271, "learning_rate": 4.583333333333333e-06, "loss": 0.1805, "step": 2200 }, { "epoch": 0.9581337221412206, "grad_norm": 0.3898310274135571, "learning_rate": 4.791666666666668e-06, "loss": 0.1803, "step": 2300 }, { "epoch": 0.9997917100604041, "grad_norm": 0.27784332983216586, "learning_rate": 5e-06, "loss": 0.1808, "step": 2400 }, { "epoch": 1.0, "eval_loss": 0.18106774985790253, "eval_runtime": 196.6682, "eval_samples_per_second": 1388.623, "eval_steps_per_second": 2.715, "step": 2401 }, { "epoch": 1.0412414080399917, "grad_norm": 0.2936543487056633, "learning_rate": 4.999735579817769e-06, "loss": 0.1807, "step": 2500 }, { "epoch": 1.082899395959175, "grad_norm": 0.2809875255295402, "learning_rate": 4.998942375205502e-06, "loss": 0.1801, "step": 2600 }, { "epoch": 1.1245573838783587, "grad_norm": 0.2760622198201079, "learning_rate": 4.997620553954645e-06, "loss": 0.1801, "step": 2700 }, { "epoch": 1.166215371797542, "grad_norm": 0.2710350326429577, "learning_rate": 4.995770395678171e-06, "loss": 0.1803, "step": 2800 }, { "epoch": 1.2078733597167257, "grad_norm": 0.20931696168572392, "learning_rate": 4.993392291751431e-06, "loss": 0.1803, "step": 2900 }, { "epoch": 1.2495313476359091, "grad_norm": 0.24323887106839603, "learning_rate": 4.990486745229364e-06, "loss": 0.1799, "step": 3000 }, { "epoch": 1.2911893355550927, "grad_norm": 0.2815796357302052, "learning_rate": 4.9870543707400835e-06, "loss": 0.1798, "step": 3100 }, { "epoch": 1.3328473234742761, "grad_norm": 0.23664820561946712, "learning_rate": 4.983095894354858e-06, "loss": 0.1801, "step": 3200 }, { "epoch": 1.3745053113934598, "grad_norm": 0.3083911955290968, "learning_rate": 4.978612153434527e-06, "loss": 0.1801, "step": 3300 }, { "epoch": 1.4161632993126432, "grad_norm": 0.24337206279187154, "learning_rate": 4.973604096452361e-06, "loss": 0.1799, "step": 3400 }, { "epoch": 1.4578212872318268, "grad_norm": 0.2691338598173961, "learning_rate": 4.968072782793436e-06, "loss": 0.1798, "step": 3500 }, { "epoch": 1.4994792751510102, "grad_norm": 0.1859964729302664, "learning_rate": 4.962019382530521e-06, "loss": 0.18, "step": 3600 }, { "epoch": 1.5411372630701936, "grad_norm": 0.29588302582709847, "learning_rate": 4.955445176176577e-06, "loss": 0.18, "step": 3700 }, { "epoch": 1.5827952509893772, "grad_norm": 0.24224751463035848, "learning_rate": 4.948351554413879e-06, "loss": 0.1993, "step": 3800 }, { "epoch": 1.6244532389085609, "grad_norm": 0.24926986804364754, "learning_rate": 4.9407400177998335e-06, "loss": 0.1799, "step": 3900 }, { "epoch": 1.6661112268277443, "grad_norm": 0.26907499271712193, "learning_rate": 4.93261217644956e-06, "loss": 0.1796, "step": 4000 }, { "epoch": 1.7077692147469277, "grad_norm": 0.24652167596434857, "learning_rate": 4.9239697496952904e-06, "loss": 0.1797, "step": 4100 }, { "epoch": 1.7494272026661113, "grad_norm": 0.26360641338937, "learning_rate": 4.914814565722671e-06, "loss": 0.1797, "step": 4200 }, { "epoch": 1.7910851905852947, "grad_norm": 0.21211424396568565, "learning_rate": 4.905148561184033e-06, "loss": 0.1798, "step": 4300 }, { "epoch": 1.832743178504478, "grad_norm": 0.23174306094818595, "learning_rate": 4.894973780788722e-06, "loss": 0.1798, "step": 4400 }, { "epoch": 1.8744011664236617, "grad_norm": 0.20239856810705756, "learning_rate": 4.884292376870567e-06, "loss": 0.1797, "step": 4500 }, { "epoch": 1.9160591543428453, "grad_norm": 0.20895880362963307, "learning_rate": 4.873106608932585e-06, "loss": 0.1796, "step": 4600 }, { "epoch": 1.9577171422620288, "grad_norm": 0.2341875351736524, "learning_rate": 4.861418843169012e-06, "loss": 0.1797, "step": 4700 }, { "epoch": 1.9993751301812122, "grad_norm": 0.20045835157915606, "learning_rate": 4.849231551964771e-06, "loss": 0.1796, "step": 4800 }, { "epoch": 2.0, "eval_loss": 0.17972978949546814, "eval_runtime": 196.3636, "eval_samples_per_second": 1390.777, "eval_steps_per_second": 2.719, "step": 4802 }, { "epoch": 2.0408248281607997, "grad_norm": 0.21309941078379252, "learning_rate": 4.836547313372472e-06, "loss": 0.1795, "step": 4900 }, { "epoch": 2.0824828160799833, "grad_norm": 0.19717578427183138, "learning_rate": 4.823368810567056e-06, "loss": 0.1794, "step": 5000 }, { "epoch": 2.124140803999167, "grad_norm": 0.23023011075724995, "learning_rate": 4.809698831278217e-06, "loss": 0.1802, "step": 5100 }, { "epoch": 2.16579879191835, "grad_norm": 0.21578484379978355, "learning_rate": 4.7955402672006855e-06, "loss": 0.18, "step": 5200 }, { "epoch": 2.2074567798375337, "grad_norm": 0.21410225528440446, "learning_rate": 4.780896113382536e-06, "loss": 0.1798, "step": 5300 }, { "epoch": 2.2491147677567174, "grad_norm": 0.24923656549560563, "learning_rate": 4.765769467591626e-06, "loss": 0.1796, "step": 5400 }, { "epoch": 2.290772755675901, "grad_norm": 0.27043973727195314, "learning_rate": 4.750163529660303e-06, "loss": 0.1799, "step": 5500 }, { "epoch": 2.332430743595084, "grad_norm": 0.20084508849747548, "learning_rate": 4.734081600808531e-06, "loss": 0.1796, "step": 5600 }, { "epoch": 2.374088731514268, "grad_norm": 0.17037675166345598, "learning_rate": 4.717527082945555e-06, "loss": 0.1797, "step": 5700 }, { "epoch": 2.4157467194334514, "grad_norm": 0.20792174660657012, "learning_rate": 4.700503477950278e-06, "loss": 0.1797, "step": 5800 }, { "epoch": 2.457404707352635, "grad_norm": 0.20444912332175158, "learning_rate": 4.6830143869304904e-06, "loss": 0.1799, "step": 5900 }, { "epoch": 2.4990626952718182, "grad_norm": 0.2160441899332462, "learning_rate": 4.665063509461098e-06, "loss": 0.1797, "step": 6000 }, { "epoch": 2.540720683191002, "grad_norm": 0.25556787549882387, "learning_rate": 4.646654642801533e-06, "loss": 0.1794, "step": 6100 }, { "epoch": 2.5823786711101855, "grad_norm": 0.22198410769602075, "learning_rate": 4.627791681092499e-06, "loss": 0.1794, "step": 6200 }, { "epoch": 2.624036659029369, "grad_norm": 0.19549701905963526, "learning_rate": 4.608478614532215e-06, "loss": 0.1795, "step": 6300 }, { "epoch": 2.6656946469485523, "grad_norm": 0.24454736703986502, "learning_rate": 4.588719528532342e-06, "loss": 0.1797, "step": 6400 }, { "epoch": 2.707352634867736, "grad_norm": 0.20111965276500102, "learning_rate": 4.568518602853776e-06, "loss": 0.1797, "step": 6500 }, { "epoch": 2.7490106227869195, "grad_norm": 0.2155615827433472, "learning_rate": 4.54788011072248e-06, "loss": 0.1796, "step": 6600 }, { "epoch": 2.7906686107061027, "grad_norm": 0.23518049751986453, "learning_rate": 4.526808417925531e-06, "loss": 0.1796, "step": 6700 }, { "epoch": 2.8323265986252864, "grad_norm": 0.2088881277827675, "learning_rate": 4.50530798188761e-06, "loss": 0.1795, "step": 6800 }, { "epoch": 2.87398458654447, "grad_norm": 0.22027451607755855, "learning_rate": 4.4833833507280884e-06, "loss": 0.1794, "step": 6900 }, { "epoch": 2.9156425744636536, "grad_norm": 0.20366425013850817, "learning_rate": 4.46103916229894e-06, "loss": 0.1793, "step": 7000 }, { "epoch": 2.957300562382837, "grad_norm": 0.2718663681076218, "learning_rate": 4.438280143203665e-06, "loss": 0.1796, "step": 7100 }, { "epoch": 2.9989585503020204, "grad_norm": 0.19182709064421555, "learning_rate": 4.415111107797445e-06, "loss": 0.1794, "step": 7200 }, { "epoch": 3.0, "eval_loss": 0.1794959157705307, "eval_runtime": 196.4289, "eval_samples_per_second": 1390.315, "eval_steps_per_second": 2.719, "step": 7203 }, { "epoch": 3.040408248281608, "grad_norm": 0.195058367609666, "learning_rate": 4.391536957168733e-06, "loss": 0.1798, "step": 7300 }, { "epoch": 3.0820662362007916, "grad_norm": 0.2256357073328012, "learning_rate": 4.367562678102491e-06, "loss": 0.1795, "step": 7400 }, { "epoch": 3.123724224119975, "grad_norm": 0.2129481809880029, "learning_rate": 4.34319334202531e-06, "loss": 0.1795, "step": 7500 }, { "epoch": 3.1653822120391584, "grad_norm": 0.1689665633552094, "learning_rate": 4.318434103932622e-06, "loss": 0.1795, "step": 7600 }, { "epoch": 3.207040199958342, "grad_norm": 0.18434140023135, "learning_rate": 4.293290201298224e-06, "loss": 0.1796, "step": 7700 }, { "epoch": 3.2486981878775256, "grad_norm": 0.2103528683280332, "learning_rate": 4.267766952966369e-06, "loss": 0.1793, "step": 7800 }, { "epoch": 3.290356175796709, "grad_norm": 0.16087446181904855, "learning_rate": 4.241869758026638e-06, "loss": 0.1794, "step": 7900 }, { "epoch": 3.3320141637158924, "grad_norm": 0.22569144057534085, "learning_rate": 4.215604094671835e-06, "loss": 0.1792, "step": 8000 }, { "epoch": 3.373672151635076, "grad_norm": 0.19990473196998446, "learning_rate": 4.188975519039151e-06, "loss": 0.1794, "step": 8100 }, { "epoch": 3.4153301395542597, "grad_norm": 0.1902243355455867, "learning_rate": 4.161989664034844e-06, "loss": 0.1794, "step": 8200 }, { "epoch": 3.456988127473443, "grad_norm": 0.18824118604006632, "learning_rate": 4.134652238142674e-06, "loss": 0.1794, "step": 8300 }, { "epoch": 3.4986461153926265, "grad_norm": 0.19597204875441573, "learning_rate": 4.106969024216348e-06, "loss": 0.1794, "step": 8400 }, { "epoch": 3.54030410331181, "grad_norm": 0.17674897479656335, "learning_rate": 4.078945878256244e-06, "loss": 0.1793, "step": 8500 }, { "epoch": 3.5819620912309933, "grad_norm": 0.19658906636767987, "learning_rate": 4.0505887281706505e-06, "loss": 0.1794, "step": 8600 }, { "epoch": 3.623620079150177, "grad_norm": 0.1607909455989355, "learning_rate": 4.021903572521802e-06, "loss": 0.1794, "step": 8700 }, { "epoch": 3.6652780670693605, "grad_norm": 0.18982136425367155, "learning_rate": 3.992896479256966e-06, "loss": 0.1793, "step": 8800 }, { "epoch": 3.706936054988544, "grad_norm": 0.18212426964310202, "learning_rate": 3.963573584424852e-06, "loss": 0.1794, "step": 8900 }, { "epoch": 3.748594042907728, "grad_norm": 0.18731109638030716, "learning_rate": 3.933941090877615e-06, "loss": 0.1799, "step": 9000 }, { "epoch": 3.790252030826911, "grad_norm": 0.2243920924541318, "learning_rate": 3.9040052669587325e-06, "loss": 0.1863, "step": 9100 }, { "epoch": 3.8319100187460946, "grad_norm": 0.19665494095424324, "learning_rate": 3.8737724451770155e-06, "loss": 0.1793, "step": 9200 }, { "epoch": 3.8735680066652782, "grad_norm": 0.1709097835399287, "learning_rate": 3.8432490208670605e-06, "loss": 0.1792, "step": 9300 }, { "epoch": 3.9152259945844614, "grad_norm": 0.1519558310026607, "learning_rate": 3.8124414508364005e-06, "loss": 0.1792, "step": 9400 }, { "epoch": 3.956883982503645, "grad_norm": 0.18615584510557248, "learning_rate": 3.7813562519996633e-06, "loss": 0.1791, "step": 9500 }, { "epoch": 3.9985419704228287, "grad_norm": 0.14216906700933155, "learning_rate": 3.7500000000000005e-06, "loss": 0.1792, "step": 9600 }, { "epoch": 4.0, "eval_loss": 0.17919312417507172, "eval_runtime": 196.5199, "eval_samples_per_second": 1389.671, "eval_steps_per_second": 2.717, "step": 9604 }, { "epoch": 4.039991668402416, "grad_norm": 0.1981373334933009, "learning_rate": 3.7183793278181063e-06, "loss": 0.1793, "step": 9700 }, { "epoch": 4.081649656321599, "grad_norm": 0.1796707844873524, "learning_rate": 3.6865009243691015e-06, "loss": 0.1791, "step": 9800 }, { "epoch": 4.123307644240783, "grad_norm": 0.21582792834146144, "learning_rate": 3.654371533087586e-06, "loss": 0.1792, "step": 9900 }, { "epoch": 4.164965632159967, "grad_norm": 0.22285894509633086, "learning_rate": 3.621997950501156e-06, "loss": 0.179, "step": 10000 }, { "epoch": 4.20662362007915, "grad_norm": 0.1947839176316504, "learning_rate": 3.5893870247926986e-06, "loss": 0.1792, "step": 10100 }, { "epoch": 4.248281607998334, "grad_norm": 0.18044045004936568, "learning_rate": 3.556545654351749e-06, "loss": 0.1791, "step": 10200 }, { "epoch": 4.2899395959175175, "grad_norm": 0.21629122720481903, "learning_rate": 3.5234807863152316e-06, "loss": 0.1793, "step": 10300 }, { "epoch": 4.3315975838367, "grad_norm": 0.15404290423986947, "learning_rate": 3.4901994150978926e-06, "loss": 0.1791, "step": 10400 }, { "epoch": 4.373255571755884, "grad_norm": 0.16032922618842949, "learning_rate": 3.4567085809127247e-06, "loss": 0.1791, "step": 10500 }, { "epoch": 4.4149135596750675, "grad_norm": 0.1495191719599753, "learning_rate": 3.4230153682817112e-06, "loss": 0.1791, "step": 10600 }, { "epoch": 4.456571547594251, "grad_norm": 0.19697439856186114, "learning_rate": 3.389126904537192e-06, "loss": 0.1791, "step": 10700 }, { "epoch": 4.498229535513435, "grad_norm": 0.17156322418134476, "learning_rate": 3.3550503583141726e-06, "loss": 0.1791, "step": 10800 }, { "epoch": 4.539887523432618, "grad_norm": 0.1561878142062692, "learning_rate": 3.3207929380339034e-06, "loss": 0.1792, "step": 10900 }, { "epoch": 4.581545511351802, "grad_norm": 0.1828679685381653, "learning_rate": 3.2863618903790346e-06, "loss": 0.1791, "step": 11000 }, { "epoch": 4.623203499270986, "grad_norm": 0.1802733896031037, "learning_rate": 3.2517644987606827e-06, "loss": 0.1792, "step": 11100 }, { "epoch": 4.664861487190168, "grad_norm": 0.15579534435978112, "learning_rate": 3.217008081777726e-06, "loss": 0.1791, "step": 11200 }, { "epoch": 4.706519475109352, "grad_norm": 0.16638908065693153, "learning_rate": 3.182099991668653e-06, "loss": 0.1791, "step": 11300 }, { "epoch": 4.748177463028536, "grad_norm": 0.18397163828033228, "learning_rate": 3.147047612756302e-06, "loss": 0.1792, "step": 11400 }, { "epoch": 4.789835450947719, "grad_norm": 0.17751483450519995, "learning_rate": 3.1118583598858097e-06, "loss": 0.179, "step": 11500 }, { "epoch": 4.831493438866903, "grad_norm": 0.1808778224251496, "learning_rate": 3.0765396768561005e-06, "loss": 0.179, "step": 11600 }, { "epoch": 4.8731514267860865, "grad_norm": 0.17593346330767928, "learning_rate": 3.0410990348452572e-06, "loss": 0.1793, "step": 11700 }, { "epoch": 4.91480941470527, "grad_norm": 0.15824861181745342, "learning_rate": 3.0055439308300954e-06, "loss": 0.1791, "step": 11800 }, { "epoch": 4.956467402624453, "grad_norm": 0.21055777806239853, "learning_rate": 2.96988188600028e-06, "loss": 0.1792, "step": 11900 }, { "epoch": 4.9981253905436365, "grad_norm": 0.15352806003656314, "learning_rate": 2.9341204441673267e-06, "loss": 0.1791, "step": 12000 }, { "epoch": 5.0, "eval_loss": 0.17911389470100403, "eval_runtime": 196.4564, "eval_samples_per_second": 1390.12, "eval_steps_per_second": 2.718, "step": 12005 }, { "epoch": 5.0395750885232244, "grad_norm": 0.1891820592041876, "learning_rate": 2.898267170168807e-06, "loss": 0.1791, "step": 12100 }, { "epoch": 5.081233076442408, "grad_norm": 0.14302405130068518, "learning_rate": 2.862329648268117e-06, "loss": 0.1789, "step": 12200 }, { "epoch": 5.122891064361592, "grad_norm": 0.2215960599158716, "learning_rate": 2.82631548055013e-06, "loss": 0.1792, "step": 12300 }, { "epoch": 5.164549052280774, "grad_norm": 0.1566593937408507, "learning_rate": 2.7902322853130758e-06, "loss": 0.179, "step": 12400 }, { "epoch": 5.206207040199958, "grad_norm": 0.15513379693358573, "learning_rate": 2.754087695457005e-06, "loss": 0.1791, "step": 12500 }, { "epoch": 5.247865028119142, "grad_norm": 0.14968722299942713, "learning_rate": 2.717889356869146e-06, "loss": 0.179, "step": 12600 }, { "epoch": 5.289523016038325, "grad_norm": 0.2097123380235341, "learning_rate": 2.681644926806527e-06, "loss": 0.179, "step": 12700 }, { "epoch": 5.331181003957509, "grad_norm": 0.19315969222642626, "learning_rate": 2.6453620722761897e-06, "loss": 0.179, "step": 12800 }, { "epoch": 5.372838991876693, "grad_norm": 0.2209634744371871, "learning_rate": 2.6090484684133406e-06, "loss": 0.1791, "step": 12900 }, { "epoch": 5.414496979795876, "grad_norm": 0.20430693758591473, "learning_rate": 2.572711796857779e-06, "loss": 0.179, "step": 13000 }, { "epoch": 5.45615496771506, "grad_norm": 0.18903967369853375, "learning_rate": 2.5363597441289574e-06, "loss": 0.179, "step": 13100 }, { "epoch": 5.4978129556342425, "grad_norm": 0.15616083753477006, "learning_rate": 2.5e-06, "loss": 0.179, "step": 13200 }, { "epoch": 5.539470943553426, "grad_norm": 0.1507559008561688, "learning_rate": 2.4636402558710434e-06, "loss": 0.1791, "step": 13300 }, { "epoch": 5.58112893147261, "grad_norm": 0.16640062646644058, "learning_rate": 2.4272882031422216e-06, "loss": 0.179, "step": 13400 }, { "epoch": 5.622786919391793, "grad_norm": 0.1824434916593794, "learning_rate": 2.3909515315866606e-06, "loss": 0.1791, "step": 13500 }, { "epoch": 5.664444907310977, "grad_norm": 0.2004975100759413, "learning_rate": 2.3546379277238107e-06, "loss": 0.179, "step": 13600 }, { "epoch": 5.706102895230161, "grad_norm": 0.17154522514366766, "learning_rate": 2.318355073193474e-06, "loss": 0.1791, "step": 13700 }, { "epoch": 5.747760883149343, "grad_norm": 0.13248550006328844, "learning_rate": 2.2821106431308546e-06, "loss": 0.179, "step": 13800 }, { "epoch": 5.789418871068527, "grad_norm": 0.1915171020600886, "learning_rate": 2.2459123045429953e-06, "loss": 0.1792, "step": 13900 }, { "epoch": 5.831076858987711, "grad_norm": 0.16235356856597902, "learning_rate": 2.2097677146869242e-06, "loss": 0.1791, "step": 14000 }, { "epoch": 5.872734846906894, "grad_norm": 0.1627140490119954, "learning_rate": 2.173684519449872e-06, "loss": 0.1789, "step": 14100 }, { "epoch": 5.914392834826078, "grad_norm": 0.16466884224746445, "learning_rate": 2.1376703517318835e-06, "loss": 0.179, "step": 14200 }, { "epoch": 5.9560508227452615, "grad_norm": 0.20611687756993843, "learning_rate": 2.101732829831194e-06, "loss": 0.179, "step": 14300 }, { "epoch": 5.997708810664445, "grad_norm": 0.16559158144998481, "learning_rate": 2.0658795558326745e-06, "loss": 0.179, "step": 14400 }, { "epoch": 6.0, "eval_loss": 0.17907121777534485, "eval_runtime": 196.4273, "eval_samples_per_second": 1390.326, "eval_steps_per_second": 2.719, "step": 14406 }, { "epoch": 6.039158508644032, "grad_norm": 0.16927649861039284, "learning_rate": 2.0301181139997206e-06, "loss": 0.1789, "step": 14500 }, { "epoch": 6.080816496563216, "grad_norm": 0.1752142512252337, "learning_rate": 1.994456069169906e-06, "loss": 0.179, "step": 14600 }, { "epoch": 6.1224744844823995, "grad_norm": 0.21170178196900302, "learning_rate": 1.958900965154743e-06, "loss": 0.1789, "step": 14700 }, { "epoch": 6.164132472401583, "grad_norm": 0.21884267966966597, "learning_rate": 1.9234603231439e-06, "loss": 0.1788, "step": 14800 }, { "epoch": 6.205790460320767, "grad_norm": 0.17106948371146288, "learning_rate": 1.8881416401141905e-06, "loss": 0.1788, "step": 14900 }, { "epoch": 6.24744844823995, "grad_norm": 0.174097273230219, "learning_rate": 1.852952387243698e-06, "loss": 0.1788, "step": 15000 }, { "epoch": 6.289106436159133, "grad_norm": 0.20862365699110258, "learning_rate": 1.8179000083313483e-06, "loss": 0.1788, "step": 15100 }, { "epoch": 6.330764424078317, "grad_norm": 0.17885797151549512, "learning_rate": 1.7829919182222752e-06, "loss": 0.1788, "step": 15200 }, { "epoch": 6.3724224119975, "grad_norm": 0.19498914359958716, "learning_rate": 1.7482355012393177e-06, "loss": 0.1789, "step": 15300 }, { "epoch": 6.414080399916684, "grad_norm": 0.1389966716220221, "learning_rate": 1.7136381096209665e-06, "loss": 0.179, "step": 15400 }, { "epoch": 6.455738387835868, "grad_norm": 0.1786092324697337, "learning_rate": 1.6792070619660977e-06, "loss": 0.179, "step": 15500 }, { "epoch": 6.497396375755051, "grad_norm": 0.19161758807721282, "learning_rate": 1.6449496416858285e-06, "loss": 0.1788, "step": 15600 }, { "epoch": 6.539054363674235, "grad_norm": 0.19197303954060144, "learning_rate": 1.6108730954628093e-06, "loss": 0.1788, "step": 15700 }, { "epoch": 6.580712351593418, "grad_norm": 0.16743828588501417, "learning_rate": 1.5769846317182894e-06, "loss": 0.1787, "step": 15800 }, { "epoch": 6.622370339512601, "grad_norm": 0.16492318029574304, "learning_rate": 1.5432914190872757e-06, "loss": 0.1788, "step": 15900 }, { "epoch": 6.664028327431785, "grad_norm": 0.15440438163304784, "learning_rate": 1.509800584902108e-06, "loss": 0.1789, "step": 16000 }, { "epoch": 6.7056863153509685, "grad_norm": 0.17667275704806315, "learning_rate": 1.4765192136847686e-06, "loss": 0.1789, "step": 16100 }, { "epoch": 6.747344303270152, "grad_norm": 0.17904015323124156, "learning_rate": 1.443454345648252e-06, "loss": 0.1789, "step": 16200 }, { "epoch": 6.789002291189336, "grad_norm": 0.16736730033822061, "learning_rate": 1.4106129752073023e-06, "loss": 0.179, "step": 16300 }, { "epoch": 6.830660279108519, "grad_norm": 0.16038102753372047, "learning_rate": 1.3780020494988447e-06, "loss": 0.179, "step": 16400 }, { "epoch": 6.872318267027703, "grad_norm": 0.15315299560909978, "learning_rate": 1.3456284669124159e-06, "loss": 0.1786, "step": 16500 }, { "epoch": 6.913976254946886, "grad_norm": 0.1430660492396621, "learning_rate": 1.313499075630899e-06, "loss": 0.179, "step": 16600 }, { "epoch": 6.955634242866069, "grad_norm": 0.17326024703322063, "learning_rate": 1.2816206721818944e-06, "loss": 0.1789, "step": 16700 }, { "epoch": 6.997292230785253, "grad_norm": 0.14987232796770428, "learning_rate": 1.2500000000000007e-06, "loss": 0.1787, "step": 16800 }, { "epoch": 7.0, "eval_loss": 0.17893224954605103, "eval_runtime": 196.4121, "eval_samples_per_second": 1390.434, "eval_steps_per_second": 2.719, "step": 16807 }, { "epoch": 7.038741928764841, "grad_norm": 0.1439804790666206, "learning_rate": 1.218643748000337e-06, "loss": 0.1787, "step": 16900 }, { "epoch": 7.080399916684025, "grad_norm": 0.1820620837643405, "learning_rate": 1.1875585491636e-06, "loss": 0.1788, "step": 17000 }, { "epoch": 7.122057904603207, "grad_norm": 0.1619570282327302, "learning_rate": 1.1567509791329402e-06, "loss": 0.1786, "step": 17100 }, { "epoch": 7.163715892522391, "grad_norm": 0.2470491812569796, "learning_rate": 1.1262275548229852e-06, "loss": 0.1791, "step": 17200 }, { "epoch": 7.205373880441575, "grad_norm": 0.18058952670407366, "learning_rate": 1.0959947330412681e-06, "loss": 0.1789, "step": 17300 }, { "epoch": 7.247031868360758, "grad_norm": 0.20589528394837478, "learning_rate": 1.0660589091223854e-06, "loss": 0.1786, "step": 17400 }, { "epoch": 7.288689856279942, "grad_norm": 0.13562633767825757, "learning_rate": 1.0364264155751489e-06, "loss": 0.1786, "step": 17500 }, { "epoch": 7.330347844199125, "grad_norm": 0.194696644563295, "learning_rate": 1.0071035207430352e-06, "loss": 0.1787, "step": 17600 }, { "epoch": 7.372005832118309, "grad_norm": 0.19213496981753242, "learning_rate": 9.780964274781984e-07, "loss": 0.1786, "step": 17700 }, { "epoch": 7.413663820037492, "grad_norm": 0.19876379595232896, "learning_rate": 9.494112718293503e-07, "loss": 0.1787, "step": 17800 }, { "epoch": 7.455321807956675, "grad_norm": 0.1684329683430977, "learning_rate": 9.210541217437566e-07, "loss": 0.1787, "step": 17900 }, { "epoch": 7.496979795875859, "grad_norm": 0.1823625942631362, "learning_rate": 8.930309757836517e-07, "loss": 0.1785, "step": 18000 }, { "epoch": 7.538637783795043, "grad_norm": 0.18725762365246973, "learning_rate": 8.653477618573261e-07, "loss": 0.1786, "step": 18100 }, { "epoch": 7.580295771714226, "grad_norm": 0.1507247392992477, "learning_rate": 8.380103359651554e-07, "loss": 0.1787, "step": 18200 }, { "epoch": 7.62195375963341, "grad_norm": 0.18505299719524845, "learning_rate": 8.110244809608494e-07, "loss": 0.1786, "step": 18300 }, { "epoch": 7.663611747552594, "grad_norm": 0.12101506184025812, "learning_rate": 7.843959053281663e-07, "loss": 0.1786, "step": 18400 }, { "epoch": 7.705269735471777, "grad_norm": 0.16939344528667466, "learning_rate": 7.581302419733633e-07, "loss": 0.1785, "step": 18500 }, { "epoch": 7.74692772339096, "grad_norm": 0.13840737012325652, "learning_rate": 7.322330470336314e-07, "loss": 0.1785, "step": 18600 }, { "epoch": 7.7885857113101435, "grad_norm": 0.16859264286478876, "learning_rate": 7.067097987017762e-07, "loss": 0.1787, "step": 18700 }, { "epoch": 7.830243699229327, "grad_norm": 0.1897535110592711, "learning_rate": 6.815658960673782e-07, "loss": 0.1785, "step": 18800 }, { "epoch": 7.871901687148511, "grad_norm": 0.18368265058091485, "learning_rate": 6.568066579746901e-07, "loss": 0.1785, "step": 18900 }, { "epoch": 7.913559675067694, "grad_norm": 0.13696515467419504, "learning_rate": 6.324373218975105e-07, "loss": 0.1786, "step": 19000 }, { "epoch": 7.955217662986878, "grad_norm": 0.14354515830035847, "learning_rate": 6.084630428312679e-07, "loss": 0.1785, "step": 19100 }, { "epoch": 7.996875650906061, "grad_norm": 0.15165778139105265, "learning_rate": 5.848888922025553e-07, "loss": 0.1786, "step": 19200 }, { "epoch": 8.0, "eval_loss": 0.17886345088481903, "eval_runtime": 196.5554, "eval_samples_per_second": 1389.42, "eval_steps_per_second": 2.717, "step": 19208 }, { "epoch": 8.03832534888565, "grad_norm": 0.15763312404128105, "learning_rate": 5.617198567963353e-07, "loss": 0.1783, "step": 19300 }, { "epoch": 8.079983336804831, "grad_norm": 0.1720429493205497, "learning_rate": 5.389608377010608e-07, "loss": 0.1783, "step": 19400 }, { "epoch": 8.121641324724015, "grad_norm": 0.1690726413308925, "learning_rate": 5.166166492719124e-07, "loss": 0.1783, "step": 19500 }, { "epoch": 8.163299312643199, "grad_norm": 0.17909925356768044, "learning_rate": 4.946920181123904e-07, "loss": 0.1782, "step": 19600 }, { "epoch": 8.204957300562382, "grad_norm": 0.22116088190481087, "learning_rate": 4.7319158207446953e-07, "loss": 0.1782, "step": 19700 }, { "epoch": 8.246615288481566, "grad_norm": 0.16383363990929287, "learning_rate": 4.5211988927752026e-07, "loss": 0.1782, "step": 19800 }, { "epoch": 8.28827327640075, "grad_norm": 0.18255215192836688, "learning_rate": 4.3148139714622365e-07, "loss": 0.1782, "step": 19900 }, { "epoch": 8.329931264319933, "grad_norm": 0.19783668808521335, "learning_rate": 4.1128047146765936e-07, "loss": 0.1781, "step": 20000 }, { "epoch": 8.371589252239117, "grad_norm": 0.1828620345488146, "learning_rate": 3.915213854677863e-07, "loss": 0.1781, "step": 20100 }, { "epoch": 8.4132472401583, "grad_norm": 0.1461266269903454, "learning_rate": 3.722083189075007e-07, "loss": 0.1782, "step": 20200 }, { "epoch": 8.454905228077484, "grad_norm": 0.19063937525748337, "learning_rate": 3.5334535719846767e-07, "loss": 0.1781, "step": 20300 }, { "epoch": 8.496563215996668, "grad_norm": 0.12678778363904367, "learning_rate": 3.3493649053890325e-07, "loss": 0.1781, "step": 20400 }, { "epoch": 8.538221203915851, "grad_norm": 0.15880039262804566, "learning_rate": 3.1698561306951065e-07, "loss": 0.1782, "step": 20500 }, { "epoch": 8.579879191835035, "grad_norm": 0.18763241075198428, "learning_rate": 2.9949652204972257e-07, "loss": 0.178, "step": 20600 }, { "epoch": 8.621537179754219, "grad_norm": 0.1582482612527278, "learning_rate": 2.8247291705444575e-07, "loss": 0.1778, "step": 20700 }, { "epoch": 8.6631951676734, "grad_norm": 0.181992432758085, "learning_rate": 2.6591839919146963e-07, "loss": 0.178, "step": 20800 }, { "epoch": 8.704853155592584, "grad_norm": 0.1463913122272469, "learning_rate": 2.4983647033969714e-07, "loss": 0.1783, "step": 20900 }, { "epoch": 8.746511143511768, "grad_norm": 0.15649171707147957, "learning_rate": 2.3423053240837518e-07, "loss": 0.1781, "step": 21000 }, { "epoch": 8.788169131430951, "grad_norm": 0.16428482803404829, "learning_rate": 2.1910388661746495e-07, "loss": 0.1782, "step": 21100 }, { "epoch": 8.829827119350135, "grad_norm": 0.19349382720192548, "learning_rate": 2.044597327993153e-07, "loss": 0.1781, "step": 21200 }, { "epoch": 8.871485107269319, "grad_norm": 0.1678737628788564, "learning_rate": 1.9030116872178317e-07, "loss": 0.1781, "step": 21300 }, { "epoch": 8.913143095188502, "grad_norm": 0.187501462753097, "learning_rate": 1.7663118943294367e-07, "loss": 0.1781, "step": 21400 }, { "epoch": 8.954801083107686, "grad_norm": 0.17102799413092362, "learning_rate": 1.6345268662752904e-07, "loss": 0.1781, "step": 21500 }, { "epoch": 8.99645907102687, "grad_norm": 0.14591121551272715, "learning_rate": 1.507684480352292e-07, "loss": 0.1781, "step": 21600 }, { "epoch": 9.0, "eval_loss": 0.17907947301864624, "eval_runtime": 196.3329, "eval_samples_per_second": 1390.995, "eval_steps_per_second": 2.72, "step": 21609 }, { "epoch": 9.037908769006457, "grad_norm": 0.1816902644971728, "learning_rate": 1.3858115683098832e-07, "loss": 0.177, "step": 21700 }, { "epoch": 9.07956675692564, "grad_norm": 0.18741449385017522, "learning_rate": 1.2689339106741529e-07, "loss": 0.1767, "step": 21800 }, { "epoch": 9.121224744844824, "grad_norm": 0.20197534473429568, "learning_rate": 1.1570762312943295e-07, "loss": 0.1768, "step": 21900 }, { "epoch": 9.162882732764007, "grad_norm": 0.21639195747399645, "learning_rate": 1.0502621921127776e-07, "loss": 0.1767, "step": 22000 }, { "epoch": 9.204540720683191, "grad_norm": 0.18933606645836426, "learning_rate": 9.485143881596715e-08, "loss": 0.1768, "step": 22100 }, { "epoch": 9.246198708602375, "grad_norm": 0.1960648079791721, "learning_rate": 8.518543427732951e-08, "loss": 0.1767, "step": 22200 }, { "epoch": 9.287856696521558, "grad_norm": 0.18056583891057434, "learning_rate": 7.603025030471001e-08, "loss": 0.1766, "step": 22300 }, { "epoch": 9.329514684440742, "grad_norm": 0.18480124722464905, "learning_rate": 6.738782355044048e-08, "loss": 0.1769, "step": 22400 }, { "epoch": 9.371172672359926, "grad_norm": 0.22786425388668805, "learning_rate": 5.92599822001666e-08, "loss": 0.1767, "step": 22500 }, { "epoch": 9.41283066027911, "grad_norm": 0.2205541920741548, "learning_rate": 5.164844558612131e-08, "loss": 0.1766, "step": 22600 }, { "epoch": 9.454488648198293, "grad_norm": 0.2134938008984885, "learning_rate": 4.455482382342336e-08, "loss": 0.1767, "step": 22700 }, { "epoch": 9.496146636117475, "grad_norm": 0.23030736326238382, "learning_rate": 3.798061746947995e-08, "loss": 0.1767, "step": 22800 }, { "epoch": 9.537804624036658, "grad_norm": 0.2214355490299709, "learning_rate": 3.1927217206564884e-08, "loss": 0.1767, "step": 22900 }, { "epoch": 9.579462611955842, "grad_norm": 0.2291392443441154, "learning_rate": 2.6395903547638825e-08, "loss": 0.1765, "step": 23000 }, { "epoch": 9.621120599875026, "grad_norm": 0.22120778210484332, "learning_rate": 2.1387846565474047e-08, "loss": 0.1765, "step": 23100 }, { "epoch": 9.66277858779421, "grad_norm": 0.1927066727358843, "learning_rate": 1.6904105645142443e-08, "loss": 0.1765, "step": 23200 }, { "epoch": 9.704436575713393, "grad_norm": 0.2369391538896648, "learning_rate": 1.2945629259917547e-08, "loss": 0.1766, "step": 23300 }, { "epoch": 9.746094563632576, "grad_norm": 0.21269587694232558, "learning_rate": 9.513254770636138e-09, "loss": 0.1767, "step": 23400 }, { "epoch": 9.78775255155176, "grad_norm": 0.20767475535201343, "learning_rate": 6.607708248569378e-09, "loss": 0.1766, "step": 23500 }, { "epoch": 9.829410539470944, "grad_norm": 0.21058981271348698, "learning_rate": 4.229604321829561e-09, "loss": 0.1766, "step": 23600 }, { "epoch": 9.871068527390127, "grad_norm": 0.18917603463369678, "learning_rate": 2.3794460453555046e-09, "loss": 0.1766, "step": 23700 }, { "epoch": 9.912726515309311, "grad_norm": 0.18145195315540197, "learning_rate": 1.0576247944985018e-09, "loss": 0.1767, "step": 23800 }, { "epoch": 9.954384503228495, "grad_norm": 0.22385123601872012, "learning_rate": 2.6442018223132857e-10, "loss": 0.1766, "step": 23900 }, { "epoch": 9.996042491147678, "grad_norm": 0.22063368359660335, "learning_rate": 0.0, "loss": 0.1766, "step": 24000 }, { "epoch": 9.996042491147678, "eval_loss": 0.18023133277893066, "eval_runtime": 196.0313, "eval_samples_per_second": 1393.135, "eval_steps_per_second": 2.724, "step": 24000 }, { "epoch": 9.996042491147678, "step": 24000, "total_flos": 5485114750402560.0, "train_loss": 0.19645737719535827, "train_runtime": 70712.6152, "train_samples_per_second": 347.587, "train_steps_per_second": 0.339 } ], "logging_steps": 100, "max_steps": 24000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5485114750402560.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }