| { | |
| "best_metric": 0.17886345088481903, | |
| "best_model_checkpoint": "saves/chess/generate_strategy/checkpoint-19208", | |
| "epoch": 9.996042491147678, | |
| "eval_steps": 500, | |
| "global_step": 24000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0416579879191835, | |
| "grad_norm": 8.262849587594042, | |
| "learning_rate": 2.0833333333333333e-07, | |
| "loss": 3.9539, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.083315975838367, | |
| "grad_norm": 2.1815007336055197, | |
| "learning_rate": 4.1666666666666667e-07, | |
| "loss": 0.4086, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.12497396375755052, | |
| "grad_norm": 1.094766614987478, | |
| "learning_rate": 6.25e-07, | |
| "loss": 0.2144, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.166631951676734, | |
| "grad_norm": 1.015902700288932, | |
| "learning_rate": 8.333333333333333e-07, | |
| "loss": 0.2103, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.20828993959591752, | |
| "grad_norm": 1.083927107302103, | |
| "learning_rate": 1.0416666666666667e-06, | |
| "loss": 0.2075, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.24994792751510103, | |
| "grad_norm": 0.8787980351861964, | |
| "learning_rate": 1.25e-06, | |
| "loss": 0.2049, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.29160591543428455, | |
| "grad_norm": 0.5454433660253264, | |
| "learning_rate": 1.4583333333333335e-06, | |
| "loss": 0.2001, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.333263903353468, | |
| "grad_norm": 0.6745519185509095, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 0.1916, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3749218912726515, | |
| "grad_norm": 0.3263511819812891, | |
| "learning_rate": 1.8750000000000003e-06, | |
| "loss": 0.1849, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.41657987919183503, | |
| "grad_norm": 3.199309878765134, | |
| "learning_rate": 2.0833333333333334e-06, | |
| "loss": 0.1847, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.45823786711101855, | |
| "grad_norm": 0.4060106618321982, | |
| "learning_rate": 2.2916666666666666e-06, | |
| "loss": 0.1845, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.49989585503020206, | |
| "grad_norm": 0.36591848729629267, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.1818, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.5415538429493856, | |
| "grad_norm": 0.35361804320631923, | |
| "learning_rate": 2.7083333333333334e-06, | |
| "loss": 0.1807, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.5832118308685691, | |
| "grad_norm": 0.35892337648275896, | |
| "learning_rate": 2.916666666666667e-06, | |
| "loss": 0.1806, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.6248698187877526, | |
| "grad_norm": 0.2820867931414937, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.1806, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.666527806706936, | |
| "grad_norm": 0.3098924570604735, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.1808, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.7081857946261195, | |
| "grad_norm": 0.29714949257038253, | |
| "learning_rate": 3.5416666666666673e-06, | |
| "loss": 0.1803, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.749843782545303, | |
| "grad_norm": 0.302226244442205, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.1805, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.7915017704644866, | |
| "grad_norm": 0.3329180855942572, | |
| "learning_rate": 3.958333333333333e-06, | |
| "loss": 0.1833, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.8331597583836701, | |
| "grad_norm": 0.28770265809452183, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 0.1807, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.8748177463028536, | |
| "grad_norm": 0.3308819875323557, | |
| "learning_rate": 4.3750000000000005e-06, | |
| "loss": 0.1804, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.9164757342220371, | |
| "grad_norm": 0.3163212399640271, | |
| "learning_rate": 4.583333333333333e-06, | |
| "loss": 0.1805, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.9581337221412206, | |
| "grad_norm": 0.3898310274135571, | |
| "learning_rate": 4.791666666666668e-06, | |
| "loss": 0.1803, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.9997917100604041, | |
| "grad_norm": 0.27784332983216586, | |
| "learning_rate": 5e-06, | |
| "loss": 0.1808, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.18106774985790253, | |
| "eval_runtime": 196.6682, | |
| "eval_samples_per_second": 1388.623, | |
| "eval_steps_per_second": 2.715, | |
| "step": 2401 | |
| }, | |
| { | |
| "epoch": 1.0412414080399917, | |
| "grad_norm": 0.2936543487056633, | |
| "learning_rate": 4.999735579817769e-06, | |
| "loss": 0.1807, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.082899395959175, | |
| "grad_norm": 0.2809875255295402, | |
| "learning_rate": 4.998942375205502e-06, | |
| "loss": 0.1801, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.1245573838783587, | |
| "grad_norm": 0.2760622198201079, | |
| "learning_rate": 4.997620553954645e-06, | |
| "loss": 0.1801, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.166215371797542, | |
| "grad_norm": 0.2710350326429577, | |
| "learning_rate": 4.995770395678171e-06, | |
| "loss": 0.1803, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.2078733597167257, | |
| "grad_norm": 0.20931696168572392, | |
| "learning_rate": 4.993392291751431e-06, | |
| "loss": 0.1803, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.2495313476359091, | |
| "grad_norm": 0.24323887106839603, | |
| "learning_rate": 4.990486745229364e-06, | |
| "loss": 0.1799, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.2911893355550927, | |
| "grad_norm": 0.2815796357302052, | |
| "learning_rate": 4.9870543707400835e-06, | |
| "loss": 0.1798, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.3328473234742761, | |
| "grad_norm": 0.23664820561946712, | |
| "learning_rate": 4.983095894354858e-06, | |
| "loss": 0.1801, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.3745053113934598, | |
| "grad_norm": 0.3083911955290968, | |
| "learning_rate": 4.978612153434527e-06, | |
| "loss": 0.1801, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.4161632993126432, | |
| "grad_norm": 0.24337206279187154, | |
| "learning_rate": 4.973604096452361e-06, | |
| "loss": 0.1799, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.4578212872318268, | |
| "grad_norm": 0.2691338598173961, | |
| "learning_rate": 4.968072782793436e-06, | |
| "loss": 0.1798, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.4994792751510102, | |
| "grad_norm": 0.1859964729302664, | |
| "learning_rate": 4.962019382530521e-06, | |
| "loss": 0.18, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.5411372630701936, | |
| "grad_norm": 0.29588302582709847, | |
| "learning_rate": 4.955445176176577e-06, | |
| "loss": 0.18, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.5827952509893772, | |
| "grad_norm": 0.24224751463035848, | |
| "learning_rate": 4.948351554413879e-06, | |
| "loss": 0.1993, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.6244532389085609, | |
| "grad_norm": 0.24926986804364754, | |
| "learning_rate": 4.9407400177998335e-06, | |
| "loss": 0.1799, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.6661112268277443, | |
| "grad_norm": 0.26907499271712193, | |
| "learning_rate": 4.93261217644956e-06, | |
| "loss": 0.1796, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.7077692147469277, | |
| "grad_norm": 0.24652167596434857, | |
| "learning_rate": 4.9239697496952904e-06, | |
| "loss": 0.1797, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.7494272026661113, | |
| "grad_norm": 0.26360641338937, | |
| "learning_rate": 4.914814565722671e-06, | |
| "loss": 0.1797, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.7910851905852947, | |
| "grad_norm": 0.21211424396568565, | |
| "learning_rate": 4.905148561184033e-06, | |
| "loss": 0.1798, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.832743178504478, | |
| "grad_norm": 0.23174306094818595, | |
| "learning_rate": 4.894973780788722e-06, | |
| "loss": 0.1798, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.8744011664236617, | |
| "grad_norm": 0.20239856810705756, | |
| "learning_rate": 4.884292376870567e-06, | |
| "loss": 0.1797, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.9160591543428453, | |
| "grad_norm": 0.20895880362963307, | |
| "learning_rate": 4.873106608932585e-06, | |
| "loss": 0.1796, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.9577171422620288, | |
| "grad_norm": 0.2341875351736524, | |
| "learning_rate": 4.861418843169012e-06, | |
| "loss": 0.1797, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.9993751301812122, | |
| "grad_norm": 0.20045835157915606, | |
| "learning_rate": 4.849231551964771e-06, | |
| "loss": 0.1796, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.17972978949546814, | |
| "eval_runtime": 196.3636, | |
| "eval_samples_per_second": 1390.777, | |
| "eval_steps_per_second": 2.719, | |
| "step": 4802 | |
| }, | |
| { | |
| "epoch": 2.0408248281607997, | |
| "grad_norm": 0.21309941078379252, | |
| "learning_rate": 4.836547313372472e-06, | |
| "loss": 0.1795, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 2.0824828160799833, | |
| "grad_norm": 0.19717578427183138, | |
| "learning_rate": 4.823368810567056e-06, | |
| "loss": 0.1794, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.124140803999167, | |
| "grad_norm": 0.23023011075724995, | |
| "learning_rate": 4.809698831278217e-06, | |
| "loss": 0.1802, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 2.16579879191835, | |
| "grad_norm": 0.21578484379978355, | |
| "learning_rate": 4.7955402672006855e-06, | |
| "loss": 0.18, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 2.2074567798375337, | |
| "grad_norm": 0.21410225528440446, | |
| "learning_rate": 4.780896113382536e-06, | |
| "loss": 0.1798, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 2.2491147677567174, | |
| "grad_norm": 0.24923656549560563, | |
| "learning_rate": 4.765769467591626e-06, | |
| "loss": 0.1796, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 2.290772755675901, | |
| "grad_norm": 0.27043973727195314, | |
| "learning_rate": 4.750163529660303e-06, | |
| "loss": 0.1799, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 2.332430743595084, | |
| "grad_norm": 0.20084508849747548, | |
| "learning_rate": 4.734081600808531e-06, | |
| "loss": 0.1796, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.374088731514268, | |
| "grad_norm": 0.17037675166345598, | |
| "learning_rate": 4.717527082945555e-06, | |
| "loss": 0.1797, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 2.4157467194334514, | |
| "grad_norm": 0.20792174660657012, | |
| "learning_rate": 4.700503477950278e-06, | |
| "loss": 0.1797, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.457404707352635, | |
| "grad_norm": 0.20444912332175158, | |
| "learning_rate": 4.6830143869304904e-06, | |
| "loss": 0.1799, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.4990626952718182, | |
| "grad_norm": 0.2160441899332462, | |
| "learning_rate": 4.665063509461098e-06, | |
| "loss": 0.1797, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.540720683191002, | |
| "grad_norm": 0.25556787549882387, | |
| "learning_rate": 4.646654642801533e-06, | |
| "loss": 0.1794, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.5823786711101855, | |
| "grad_norm": 0.22198410769602075, | |
| "learning_rate": 4.627791681092499e-06, | |
| "loss": 0.1794, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.624036659029369, | |
| "grad_norm": 0.19549701905963526, | |
| "learning_rate": 4.608478614532215e-06, | |
| "loss": 0.1795, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.6656946469485523, | |
| "grad_norm": 0.24454736703986502, | |
| "learning_rate": 4.588719528532342e-06, | |
| "loss": 0.1797, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.707352634867736, | |
| "grad_norm": 0.20111965276500102, | |
| "learning_rate": 4.568518602853776e-06, | |
| "loss": 0.1797, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.7490106227869195, | |
| "grad_norm": 0.2155615827433472, | |
| "learning_rate": 4.54788011072248e-06, | |
| "loss": 0.1796, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.7906686107061027, | |
| "grad_norm": 0.23518049751986453, | |
| "learning_rate": 4.526808417925531e-06, | |
| "loss": 0.1796, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.8323265986252864, | |
| "grad_norm": 0.2088881277827675, | |
| "learning_rate": 4.50530798188761e-06, | |
| "loss": 0.1795, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.87398458654447, | |
| "grad_norm": 0.22027451607755855, | |
| "learning_rate": 4.4833833507280884e-06, | |
| "loss": 0.1794, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.9156425744636536, | |
| "grad_norm": 0.20366425013850817, | |
| "learning_rate": 4.46103916229894e-06, | |
| "loss": 0.1793, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.957300562382837, | |
| "grad_norm": 0.2718663681076218, | |
| "learning_rate": 4.438280143203665e-06, | |
| "loss": 0.1796, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.9989585503020204, | |
| "grad_norm": 0.19182709064421555, | |
| "learning_rate": 4.415111107797445e-06, | |
| "loss": 0.1794, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.1794959157705307, | |
| "eval_runtime": 196.4289, | |
| "eval_samples_per_second": 1390.315, | |
| "eval_steps_per_second": 2.719, | |
| "step": 7203 | |
| }, | |
| { | |
| "epoch": 3.040408248281608, | |
| "grad_norm": 0.195058367609666, | |
| "learning_rate": 4.391536957168733e-06, | |
| "loss": 0.1798, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.0820662362007916, | |
| "grad_norm": 0.2256357073328012, | |
| "learning_rate": 4.367562678102491e-06, | |
| "loss": 0.1795, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.123724224119975, | |
| "grad_norm": 0.2129481809880029, | |
| "learning_rate": 4.34319334202531e-06, | |
| "loss": 0.1795, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.1653822120391584, | |
| "grad_norm": 0.1689665633552094, | |
| "learning_rate": 4.318434103932622e-06, | |
| "loss": 0.1795, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 3.207040199958342, | |
| "grad_norm": 0.18434140023135, | |
| "learning_rate": 4.293290201298224e-06, | |
| "loss": 0.1796, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 3.2486981878775256, | |
| "grad_norm": 0.2103528683280332, | |
| "learning_rate": 4.267766952966369e-06, | |
| "loss": 0.1793, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 3.290356175796709, | |
| "grad_norm": 0.16087446181904855, | |
| "learning_rate": 4.241869758026638e-06, | |
| "loss": 0.1794, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 3.3320141637158924, | |
| "grad_norm": 0.22569144057534085, | |
| "learning_rate": 4.215604094671835e-06, | |
| "loss": 0.1792, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.373672151635076, | |
| "grad_norm": 0.19990473196998446, | |
| "learning_rate": 4.188975519039151e-06, | |
| "loss": 0.1794, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 3.4153301395542597, | |
| "grad_norm": 0.1902243355455867, | |
| "learning_rate": 4.161989664034844e-06, | |
| "loss": 0.1794, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 3.456988127473443, | |
| "grad_norm": 0.18824118604006632, | |
| "learning_rate": 4.134652238142674e-06, | |
| "loss": 0.1794, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 3.4986461153926265, | |
| "grad_norm": 0.19597204875441573, | |
| "learning_rate": 4.106969024216348e-06, | |
| "loss": 0.1794, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 3.54030410331181, | |
| "grad_norm": 0.17674897479656335, | |
| "learning_rate": 4.078945878256244e-06, | |
| "loss": 0.1793, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 3.5819620912309933, | |
| "grad_norm": 0.19658906636767987, | |
| "learning_rate": 4.0505887281706505e-06, | |
| "loss": 0.1794, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 3.623620079150177, | |
| "grad_norm": 0.1607909455989355, | |
| "learning_rate": 4.021903572521802e-06, | |
| "loss": 0.1794, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 3.6652780670693605, | |
| "grad_norm": 0.18982136425367155, | |
| "learning_rate": 3.992896479256966e-06, | |
| "loss": 0.1793, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 3.706936054988544, | |
| "grad_norm": 0.18212426964310202, | |
| "learning_rate": 3.963573584424852e-06, | |
| "loss": 0.1794, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 3.748594042907728, | |
| "grad_norm": 0.18731109638030716, | |
| "learning_rate": 3.933941090877615e-06, | |
| "loss": 0.1799, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 3.790252030826911, | |
| "grad_norm": 0.2243920924541318, | |
| "learning_rate": 3.9040052669587325e-06, | |
| "loss": 0.1863, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 3.8319100187460946, | |
| "grad_norm": 0.19665494095424324, | |
| "learning_rate": 3.8737724451770155e-06, | |
| "loss": 0.1793, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 3.8735680066652782, | |
| "grad_norm": 0.1709097835399287, | |
| "learning_rate": 3.8432490208670605e-06, | |
| "loss": 0.1792, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 3.9152259945844614, | |
| "grad_norm": 0.1519558310026607, | |
| "learning_rate": 3.8124414508364005e-06, | |
| "loss": 0.1792, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 3.956883982503645, | |
| "grad_norm": 0.18615584510557248, | |
| "learning_rate": 3.7813562519996633e-06, | |
| "loss": 0.1791, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 3.9985419704228287, | |
| "grad_norm": 0.14216906700933155, | |
| "learning_rate": 3.7500000000000005e-06, | |
| "loss": 0.1792, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.17919312417507172, | |
| "eval_runtime": 196.5199, | |
| "eval_samples_per_second": 1389.671, | |
| "eval_steps_per_second": 2.717, | |
| "step": 9604 | |
| }, | |
| { | |
| "epoch": 4.039991668402416, | |
| "grad_norm": 0.1981373334933009, | |
| "learning_rate": 3.7183793278181063e-06, | |
| "loss": 0.1793, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 4.081649656321599, | |
| "grad_norm": 0.1796707844873524, | |
| "learning_rate": 3.6865009243691015e-06, | |
| "loss": 0.1791, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 4.123307644240783, | |
| "grad_norm": 0.21582792834146144, | |
| "learning_rate": 3.654371533087586e-06, | |
| "loss": 0.1792, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 4.164965632159967, | |
| "grad_norm": 0.22285894509633086, | |
| "learning_rate": 3.621997950501156e-06, | |
| "loss": 0.179, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.20662362007915, | |
| "grad_norm": 0.1947839176316504, | |
| "learning_rate": 3.5893870247926986e-06, | |
| "loss": 0.1792, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 4.248281607998334, | |
| "grad_norm": 0.18044045004936568, | |
| "learning_rate": 3.556545654351749e-06, | |
| "loss": 0.1791, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 4.2899395959175175, | |
| "grad_norm": 0.21629122720481903, | |
| "learning_rate": 3.5234807863152316e-06, | |
| "loss": 0.1793, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 4.3315975838367, | |
| "grad_norm": 0.15404290423986947, | |
| "learning_rate": 3.4901994150978926e-06, | |
| "loss": 0.1791, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 4.373255571755884, | |
| "grad_norm": 0.16032922618842949, | |
| "learning_rate": 3.4567085809127247e-06, | |
| "loss": 0.1791, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 4.4149135596750675, | |
| "grad_norm": 0.1495191719599753, | |
| "learning_rate": 3.4230153682817112e-06, | |
| "loss": 0.1791, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 4.456571547594251, | |
| "grad_norm": 0.19697439856186114, | |
| "learning_rate": 3.389126904537192e-06, | |
| "loss": 0.1791, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 4.498229535513435, | |
| "grad_norm": 0.17156322418134476, | |
| "learning_rate": 3.3550503583141726e-06, | |
| "loss": 0.1791, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 4.539887523432618, | |
| "grad_norm": 0.1561878142062692, | |
| "learning_rate": 3.3207929380339034e-06, | |
| "loss": 0.1792, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 4.581545511351802, | |
| "grad_norm": 0.1828679685381653, | |
| "learning_rate": 3.2863618903790346e-06, | |
| "loss": 0.1791, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 4.623203499270986, | |
| "grad_norm": 0.1802733896031037, | |
| "learning_rate": 3.2517644987606827e-06, | |
| "loss": 0.1792, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 4.664861487190168, | |
| "grad_norm": 0.15579534435978112, | |
| "learning_rate": 3.217008081777726e-06, | |
| "loss": 0.1791, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 4.706519475109352, | |
| "grad_norm": 0.16638908065693153, | |
| "learning_rate": 3.182099991668653e-06, | |
| "loss": 0.1791, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 4.748177463028536, | |
| "grad_norm": 0.18397163828033228, | |
| "learning_rate": 3.147047612756302e-06, | |
| "loss": 0.1792, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 4.789835450947719, | |
| "grad_norm": 0.17751483450519995, | |
| "learning_rate": 3.1118583598858097e-06, | |
| "loss": 0.179, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 4.831493438866903, | |
| "grad_norm": 0.1808778224251496, | |
| "learning_rate": 3.0765396768561005e-06, | |
| "loss": 0.179, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 4.8731514267860865, | |
| "grad_norm": 0.17593346330767928, | |
| "learning_rate": 3.0410990348452572e-06, | |
| "loss": 0.1793, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 4.91480941470527, | |
| "grad_norm": 0.15824861181745342, | |
| "learning_rate": 3.0055439308300954e-06, | |
| "loss": 0.1791, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 4.956467402624453, | |
| "grad_norm": 0.21055777806239853, | |
| "learning_rate": 2.96988188600028e-06, | |
| "loss": 0.1792, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 4.9981253905436365, | |
| "grad_norm": 0.15352806003656314, | |
| "learning_rate": 2.9341204441673267e-06, | |
| "loss": 0.1791, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.17911389470100403, | |
| "eval_runtime": 196.4564, | |
| "eval_samples_per_second": 1390.12, | |
| "eval_steps_per_second": 2.718, | |
| "step": 12005 | |
| }, | |
| { | |
| "epoch": 5.0395750885232244, | |
| "grad_norm": 0.1891820592041876, | |
| "learning_rate": 2.898267170168807e-06, | |
| "loss": 0.1791, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 5.081233076442408, | |
| "grad_norm": 0.14302405130068518, | |
| "learning_rate": 2.862329648268117e-06, | |
| "loss": 0.1789, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 5.122891064361592, | |
| "grad_norm": 0.2215960599158716, | |
| "learning_rate": 2.82631548055013e-06, | |
| "loss": 0.1792, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 5.164549052280774, | |
| "grad_norm": 0.1566593937408507, | |
| "learning_rate": 2.7902322853130758e-06, | |
| "loss": 0.179, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 5.206207040199958, | |
| "grad_norm": 0.15513379693358573, | |
| "learning_rate": 2.754087695457005e-06, | |
| "loss": 0.1791, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 5.247865028119142, | |
| "grad_norm": 0.14968722299942713, | |
| "learning_rate": 2.717889356869146e-06, | |
| "loss": 0.179, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 5.289523016038325, | |
| "grad_norm": 0.2097123380235341, | |
| "learning_rate": 2.681644926806527e-06, | |
| "loss": 0.179, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 5.331181003957509, | |
| "grad_norm": 0.19315969222642626, | |
| "learning_rate": 2.6453620722761897e-06, | |
| "loss": 0.179, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 5.372838991876693, | |
| "grad_norm": 0.2209634744371871, | |
| "learning_rate": 2.6090484684133406e-06, | |
| "loss": 0.1791, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 5.414496979795876, | |
| "grad_norm": 0.20430693758591473, | |
| "learning_rate": 2.572711796857779e-06, | |
| "loss": 0.179, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 5.45615496771506, | |
| "grad_norm": 0.18903967369853375, | |
| "learning_rate": 2.5363597441289574e-06, | |
| "loss": 0.179, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 5.4978129556342425, | |
| "grad_norm": 0.15616083753477006, | |
| "learning_rate": 2.5e-06, | |
| "loss": 0.179, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 5.539470943553426, | |
| "grad_norm": 0.1507559008561688, | |
| "learning_rate": 2.4636402558710434e-06, | |
| "loss": 0.1791, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 5.58112893147261, | |
| "grad_norm": 0.16640062646644058, | |
| "learning_rate": 2.4272882031422216e-06, | |
| "loss": 0.179, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 5.622786919391793, | |
| "grad_norm": 0.1824434916593794, | |
| "learning_rate": 2.3909515315866606e-06, | |
| "loss": 0.1791, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 5.664444907310977, | |
| "grad_norm": 0.2004975100759413, | |
| "learning_rate": 2.3546379277238107e-06, | |
| "loss": 0.179, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 5.706102895230161, | |
| "grad_norm": 0.17154522514366766, | |
| "learning_rate": 2.318355073193474e-06, | |
| "loss": 0.1791, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 5.747760883149343, | |
| "grad_norm": 0.13248550006328844, | |
| "learning_rate": 2.2821106431308546e-06, | |
| "loss": 0.179, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 5.789418871068527, | |
| "grad_norm": 0.1915171020600886, | |
| "learning_rate": 2.2459123045429953e-06, | |
| "loss": 0.1792, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 5.831076858987711, | |
| "grad_norm": 0.16235356856597902, | |
| "learning_rate": 2.2097677146869242e-06, | |
| "loss": 0.1791, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 5.872734846906894, | |
| "grad_norm": 0.1627140490119954, | |
| "learning_rate": 2.173684519449872e-06, | |
| "loss": 0.1789, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 5.914392834826078, | |
| "grad_norm": 0.16466884224746445, | |
| "learning_rate": 2.1376703517318835e-06, | |
| "loss": 0.179, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 5.9560508227452615, | |
| "grad_norm": 0.20611687756993843, | |
| "learning_rate": 2.101732829831194e-06, | |
| "loss": 0.179, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 5.997708810664445, | |
| "grad_norm": 0.16559158144998481, | |
| "learning_rate": 2.0658795558326745e-06, | |
| "loss": 0.179, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.17907121777534485, | |
| "eval_runtime": 196.4273, | |
| "eval_samples_per_second": 1390.326, | |
| "eval_steps_per_second": 2.719, | |
| "step": 14406 | |
| }, | |
| { | |
| "epoch": 6.039158508644032, | |
| "grad_norm": 0.16927649861039284, | |
| "learning_rate": 2.0301181139997206e-06, | |
| "loss": 0.1789, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 6.080816496563216, | |
| "grad_norm": 0.1752142512252337, | |
| "learning_rate": 1.994456069169906e-06, | |
| "loss": 0.179, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 6.1224744844823995, | |
| "grad_norm": 0.21170178196900302, | |
| "learning_rate": 1.958900965154743e-06, | |
| "loss": 0.1789, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 6.164132472401583, | |
| "grad_norm": 0.21884267966966597, | |
| "learning_rate": 1.9234603231439e-06, | |
| "loss": 0.1788, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 6.205790460320767, | |
| "grad_norm": 0.17106948371146288, | |
| "learning_rate": 1.8881416401141905e-06, | |
| "loss": 0.1788, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 6.24744844823995, | |
| "grad_norm": 0.174097273230219, | |
| "learning_rate": 1.852952387243698e-06, | |
| "loss": 0.1788, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 6.289106436159133, | |
| "grad_norm": 0.20862365699110258, | |
| "learning_rate": 1.8179000083313483e-06, | |
| "loss": 0.1788, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 6.330764424078317, | |
| "grad_norm": 0.17885797151549512, | |
| "learning_rate": 1.7829919182222752e-06, | |
| "loss": 0.1788, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 6.3724224119975, | |
| "grad_norm": 0.19498914359958716, | |
| "learning_rate": 1.7482355012393177e-06, | |
| "loss": 0.1789, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 6.414080399916684, | |
| "grad_norm": 0.1389966716220221, | |
| "learning_rate": 1.7136381096209665e-06, | |
| "loss": 0.179, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 6.455738387835868, | |
| "grad_norm": 0.1786092324697337, | |
| "learning_rate": 1.6792070619660977e-06, | |
| "loss": 0.179, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 6.497396375755051, | |
| "grad_norm": 0.19161758807721282, | |
| "learning_rate": 1.6449496416858285e-06, | |
| "loss": 0.1788, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 6.539054363674235, | |
| "grad_norm": 0.19197303954060144, | |
| "learning_rate": 1.6108730954628093e-06, | |
| "loss": 0.1788, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 6.580712351593418, | |
| "grad_norm": 0.16743828588501417, | |
| "learning_rate": 1.5769846317182894e-06, | |
| "loss": 0.1787, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 6.622370339512601, | |
| "grad_norm": 0.16492318029574304, | |
| "learning_rate": 1.5432914190872757e-06, | |
| "loss": 0.1788, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 6.664028327431785, | |
| "grad_norm": 0.15440438163304784, | |
| "learning_rate": 1.509800584902108e-06, | |
| "loss": 0.1789, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 6.7056863153509685, | |
| "grad_norm": 0.17667275704806315, | |
| "learning_rate": 1.4765192136847686e-06, | |
| "loss": 0.1789, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 6.747344303270152, | |
| "grad_norm": 0.17904015323124156, | |
| "learning_rate": 1.443454345648252e-06, | |
| "loss": 0.1789, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 6.789002291189336, | |
| "grad_norm": 0.16736730033822061, | |
| "learning_rate": 1.4106129752073023e-06, | |
| "loss": 0.179, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 6.830660279108519, | |
| "grad_norm": 0.16038102753372047, | |
| "learning_rate": 1.3780020494988447e-06, | |
| "loss": 0.179, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 6.872318267027703, | |
| "grad_norm": 0.15315299560909978, | |
| "learning_rate": 1.3456284669124159e-06, | |
| "loss": 0.1786, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 6.913976254946886, | |
| "grad_norm": 0.1430660492396621, | |
| "learning_rate": 1.313499075630899e-06, | |
| "loss": 0.179, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 6.955634242866069, | |
| "grad_norm": 0.17326024703322063, | |
| "learning_rate": 1.2816206721818944e-06, | |
| "loss": 0.1789, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 6.997292230785253, | |
| "grad_norm": 0.14987232796770428, | |
| "learning_rate": 1.2500000000000007e-06, | |
| "loss": 0.1787, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.17893224954605103, | |
| "eval_runtime": 196.4121, | |
| "eval_samples_per_second": 1390.434, | |
| "eval_steps_per_second": 2.719, | |
| "step": 16807 | |
| }, | |
| { | |
| "epoch": 7.038741928764841, | |
| "grad_norm": 0.1439804790666206, | |
| "learning_rate": 1.218643748000337e-06, | |
| "loss": 0.1787, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 7.080399916684025, | |
| "grad_norm": 0.1820620837643405, | |
| "learning_rate": 1.1875585491636e-06, | |
| "loss": 0.1788, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 7.122057904603207, | |
| "grad_norm": 0.1619570282327302, | |
| "learning_rate": 1.1567509791329402e-06, | |
| "loss": 0.1786, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 7.163715892522391, | |
| "grad_norm": 0.2470491812569796, | |
| "learning_rate": 1.1262275548229852e-06, | |
| "loss": 0.1791, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 7.205373880441575, | |
| "grad_norm": 0.18058952670407366, | |
| "learning_rate": 1.0959947330412681e-06, | |
| "loss": 0.1789, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 7.247031868360758, | |
| "grad_norm": 0.20589528394837478, | |
| "learning_rate": 1.0660589091223854e-06, | |
| "loss": 0.1786, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 7.288689856279942, | |
| "grad_norm": 0.13562633767825757, | |
| "learning_rate": 1.0364264155751489e-06, | |
| "loss": 0.1786, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 7.330347844199125, | |
| "grad_norm": 0.194696644563295, | |
| "learning_rate": 1.0071035207430352e-06, | |
| "loss": 0.1787, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 7.372005832118309, | |
| "grad_norm": 0.19213496981753242, | |
| "learning_rate": 9.780964274781984e-07, | |
| "loss": 0.1786, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 7.413663820037492, | |
| "grad_norm": 0.19876379595232896, | |
| "learning_rate": 9.494112718293503e-07, | |
| "loss": 0.1787, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 7.455321807956675, | |
| "grad_norm": 0.1684329683430977, | |
| "learning_rate": 9.210541217437566e-07, | |
| "loss": 0.1787, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 7.496979795875859, | |
| "grad_norm": 0.1823625942631362, | |
| "learning_rate": 8.930309757836517e-07, | |
| "loss": 0.1785, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 7.538637783795043, | |
| "grad_norm": 0.18725762365246973, | |
| "learning_rate": 8.653477618573261e-07, | |
| "loss": 0.1786, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 7.580295771714226, | |
| "grad_norm": 0.1507247392992477, | |
| "learning_rate": 8.380103359651554e-07, | |
| "loss": 0.1787, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 7.62195375963341, | |
| "grad_norm": 0.18505299719524845, | |
| "learning_rate": 8.110244809608494e-07, | |
| "loss": 0.1786, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 7.663611747552594, | |
| "grad_norm": 0.12101506184025812, | |
| "learning_rate": 7.843959053281663e-07, | |
| "loss": 0.1786, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 7.705269735471777, | |
| "grad_norm": 0.16939344528667466, | |
| "learning_rate": 7.581302419733633e-07, | |
| "loss": 0.1785, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 7.74692772339096, | |
| "grad_norm": 0.13840737012325652, | |
| "learning_rate": 7.322330470336314e-07, | |
| "loss": 0.1785, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 7.7885857113101435, | |
| "grad_norm": 0.16859264286478876, | |
| "learning_rate": 7.067097987017762e-07, | |
| "loss": 0.1787, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 7.830243699229327, | |
| "grad_norm": 0.1897535110592711, | |
| "learning_rate": 6.815658960673782e-07, | |
| "loss": 0.1785, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 7.871901687148511, | |
| "grad_norm": 0.18368265058091485, | |
| "learning_rate": 6.568066579746901e-07, | |
| "loss": 0.1785, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 7.913559675067694, | |
| "grad_norm": 0.13696515467419504, | |
| "learning_rate": 6.324373218975105e-07, | |
| "loss": 0.1786, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 7.955217662986878, | |
| "grad_norm": 0.14354515830035847, | |
| "learning_rate": 6.084630428312679e-07, | |
| "loss": 0.1785, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 7.996875650906061, | |
| "grad_norm": 0.15165778139105265, | |
| "learning_rate": 5.848888922025553e-07, | |
| "loss": 0.1786, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.17886345088481903, | |
| "eval_runtime": 196.5554, | |
| "eval_samples_per_second": 1389.42, | |
| "eval_steps_per_second": 2.717, | |
| "step": 19208 | |
| }, | |
| { | |
| "epoch": 8.03832534888565, | |
| "grad_norm": 0.15763312404128105, | |
| "learning_rate": 5.617198567963353e-07, | |
| "loss": 0.1783, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 8.079983336804831, | |
| "grad_norm": 0.1720429493205497, | |
| "learning_rate": 5.389608377010608e-07, | |
| "loss": 0.1783, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 8.121641324724015, | |
| "grad_norm": 0.1690726413308925, | |
| "learning_rate": 5.166166492719124e-07, | |
| "loss": 0.1783, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 8.163299312643199, | |
| "grad_norm": 0.17909925356768044, | |
| "learning_rate": 4.946920181123904e-07, | |
| "loss": 0.1782, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 8.204957300562382, | |
| "grad_norm": 0.22116088190481087, | |
| "learning_rate": 4.7319158207446953e-07, | |
| "loss": 0.1782, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 8.246615288481566, | |
| "grad_norm": 0.16383363990929287, | |
| "learning_rate": 4.5211988927752026e-07, | |
| "loss": 0.1782, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 8.28827327640075, | |
| "grad_norm": 0.18255215192836688, | |
| "learning_rate": 4.3148139714622365e-07, | |
| "loss": 0.1782, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 8.329931264319933, | |
| "grad_norm": 0.19783668808521335, | |
| "learning_rate": 4.1128047146765936e-07, | |
| "loss": 0.1781, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 8.371589252239117, | |
| "grad_norm": 0.1828620345488146, | |
| "learning_rate": 3.915213854677863e-07, | |
| "loss": 0.1781, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 8.4132472401583, | |
| "grad_norm": 0.1461266269903454, | |
| "learning_rate": 3.722083189075007e-07, | |
| "loss": 0.1782, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 8.454905228077484, | |
| "grad_norm": 0.19063937525748337, | |
| "learning_rate": 3.5334535719846767e-07, | |
| "loss": 0.1781, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 8.496563215996668, | |
| "grad_norm": 0.12678778363904367, | |
| "learning_rate": 3.3493649053890325e-07, | |
| "loss": 0.1781, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 8.538221203915851, | |
| "grad_norm": 0.15880039262804566, | |
| "learning_rate": 3.1698561306951065e-07, | |
| "loss": 0.1782, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 8.579879191835035, | |
| "grad_norm": 0.18763241075198428, | |
| "learning_rate": 2.9949652204972257e-07, | |
| "loss": 0.178, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 8.621537179754219, | |
| "grad_norm": 0.1582482612527278, | |
| "learning_rate": 2.8247291705444575e-07, | |
| "loss": 0.1778, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 8.6631951676734, | |
| "grad_norm": 0.181992432758085, | |
| "learning_rate": 2.6591839919146963e-07, | |
| "loss": 0.178, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 8.704853155592584, | |
| "grad_norm": 0.1463913122272469, | |
| "learning_rate": 2.4983647033969714e-07, | |
| "loss": 0.1783, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 8.746511143511768, | |
| "grad_norm": 0.15649171707147957, | |
| "learning_rate": 2.3423053240837518e-07, | |
| "loss": 0.1781, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 8.788169131430951, | |
| "grad_norm": 0.16428482803404829, | |
| "learning_rate": 2.1910388661746495e-07, | |
| "loss": 0.1782, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 8.829827119350135, | |
| "grad_norm": 0.19349382720192548, | |
| "learning_rate": 2.044597327993153e-07, | |
| "loss": 0.1781, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 8.871485107269319, | |
| "grad_norm": 0.1678737628788564, | |
| "learning_rate": 1.9030116872178317e-07, | |
| "loss": 0.1781, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 8.913143095188502, | |
| "grad_norm": 0.187501462753097, | |
| "learning_rate": 1.7663118943294367e-07, | |
| "loss": 0.1781, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 8.954801083107686, | |
| "grad_norm": 0.17102799413092362, | |
| "learning_rate": 1.6345268662752904e-07, | |
| "loss": 0.1781, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 8.99645907102687, | |
| "grad_norm": 0.14591121551272715, | |
| "learning_rate": 1.507684480352292e-07, | |
| "loss": 0.1781, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.17907947301864624, | |
| "eval_runtime": 196.3329, | |
| "eval_samples_per_second": 1390.995, | |
| "eval_steps_per_second": 2.72, | |
| "step": 21609 | |
| }, | |
| { | |
| "epoch": 9.037908769006457, | |
| "grad_norm": 0.1816902644971728, | |
| "learning_rate": 1.3858115683098832e-07, | |
| "loss": 0.177, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 9.07956675692564, | |
| "grad_norm": 0.18741449385017522, | |
| "learning_rate": 1.2689339106741529e-07, | |
| "loss": 0.1767, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 9.121224744844824, | |
| "grad_norm": 0.20197534473429568, | |
| "learning_rate": 1.1570762312943295e-07, | |
| "loss": 0.1768, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 9.162882732764007, | |
| "grad_norm": 0.21639195747399645, | |
| "learning_rate": 1.0502621921127776e-07, | |
| "loss": 0.1767, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 9.204540720683191, | |
| "grad_norm": 0.18933606645836426, | |
| "learning_rate": 9.485143881596715e-08, | |
| "loss": 0.1768, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 9.246198708602375, | |
| "grad_norm": 0.1960648079791721, | |
| "learning_rate": 8.518543427732951e-08, | |
| "loss": 0.1767, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 9.287856696521558, | |
| "grad_norm": 0.18056583891057434, | |
| "learning_rate": 7.603025030471001e-08, | |
| "loss": 0.1766, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 9.329514684440742, | |
| "grad_norm": 0.18480124722464905, | |
| "learning_rate": 6.738782355044048e-08, | |
| "loss": 0.1769, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 9.371172672359926, | |
| "grad_norm": 0.22786425388668805, | |
| "learning_rate": 5.92599822001666e-08, | |
| "loss": 0.1767, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 9.41283066027911, | |
| "grad_norm": 0.2205541920741548, | |
| "learning_rate": 5.164844558612131e-08, | |
| "loss": 0.1766, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 9.454488648198293, | |
| "grad_norm": 0.2134938008984885, | |
| "learning_rate": 4.455482382342336e-08, | |
| "loss": 0.1767, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 9.496146636117475, | |
| "grad_norm": 0.23030736326238382, | |
| "learning_rate": 3.798061746947995e-08, | |
| "loss": 0.1767, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 9.537804624036658, | |
| "grad_norm": 0.2214355490299709, | |
| "learning_rate": 3.1927217206564884e-08, | |
| "loss": 0.1767, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 9.579462611955842, | |
| "grad_norm": 0.2291392443441154, | |
| "learning_rate": 2.6395903547638825e-08, | |
| "loss": 0.1765, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 9.621120599875026, | |
| "grad_norm": 0.22120778210484332, | |
| "learning_rate": 2.1387846565474047e-08, | |
| "loss": 0.1765, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 9.66277858779421, | |
| "grad_norm": 0.1927066727358843, | |
| "learning_rate": 1.6904105645142443e-08, | |
| "loss": 0.1765, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 9.704436575713393, | |
| "grad_norm": 0.2369391538896648, | |
| "learning_rate": 1.2945629259917547e-08, | |
| "loss": 0.1766, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 9.746094563632576, | |
| "grad_norm": 0.21269587694232558, | |
| "learning_rate": 9.513254770636138e-09, | |
| "loss": 0.1767, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 9.78775255155176, | |
| "grad_norm": 0.20767475535201343, | |
| "learning_rate": 6.607708248569378e-09, | |
| "loss": 0.1766, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 9.829410539470944, | |
| "grad_norm": 0.21058981271348698, | |
| "learning_rate": 4.229604321829561e-09, | |
| "loss": 0.1766, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 9.871068527390127, | |
| "grad_norm": 0.18917603463369678, | |
| "learning_rate": 2.3794460453555046e-09, | |
| "loss": 0.1766, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 9.912726515309311, | |
| "grad_norm": 0.18145195315540197, | |
| "learning_rate": 1.0576247944985018e-09, | |
| "loss": 0.1767, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 9.954384503228495, | |
| "grad_norm": 0.22385123601872012, | |
| "learning_rate": 2.6442018223132857e-10, | |
| "loss": 0.1766, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 9.996042491147678, | |
| "grad_norm": 0.22063368359660335, | |
| "learning_rate": 0.0, | |
| "loss": 0.1766, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.996042491147678, | |
| "eval_loss": 0.18023133277893066, | |
| "eval_runtime": 196.0313, | |
| "eval_samples_per_second": 1393.135, | |
| "eval_steps_per_second": 2.724, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 9.996042491147678, | |
| "step": 24000, | |
| "total_flos": 5485114750402560.0, | |
| "train_loss": 0.19645737719535827, | |
| "train_runtime": 70712.6152, | |
| "train_samples_per_second": 347.587, | |
| "train_steps_per_second": 0.339 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 24000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5485114750402560.0, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |