| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.9372115487817965, |
| "eval_steps": 500, |
| "global_step": 46000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05366534292154127, |
| "grad_norm": 0.3935315012931824, |
| "learning_rate": 4.9463346570784594e-05, |
| "loss": 0.7966, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.10733068584308254, |
| "grad_norm": 0.26197516918182373, |
| "learning_rate": 4.892669314156917e-05, |
| "loss": 0.2163, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1609960287646238, |
| "grad_norm": 0.34691885113716125, |
| "learning_rate": 4.839003971235376e-05, |
| "loss": 0.2089, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.2146613716861651, |
| "grad_norm": 0.239683598279953, |
| "learning_rate": 4.7853386283138354e-05, |
| "loss": 0.1973, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.26832671460770635, |
| "grad_norm": 0.3232818841934204, |
| "learning_rate": 4.731673285392294e-05, |
| "loss": 0.2035, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.3219920575292476, |
| "grad_norm": 0.3142314553260803, |
| "learning_rate": 4.678007942470752e-05, |
| "loss": 0.197, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3756574004507889, |
| "grad_norm": 0.2924354076385498, |
| "learning_rate": 4.6243425995492114e-05, |
| "loss": 0.1913, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.4293227433723302, |
| "grad_norm": 0.4022532105445862, |
| "learning_rate": 4.57067725662767e-05, |
| "loss": 0.1931, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4829880862938714, |
| "grad_norm": 0.2730918228626251, |
| "learning_rate": 4.517011913706129e-05, |
| "loss": 0.189, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.5366534292154127, |
| "grad_norm": 0.2549761235713959, |
| "learning_rate": 4.4633465707845874e-05, |
| "loss": 0.1861, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5903187721369539, |
| "grad_norm": 0.28634166717529297, |
| "learning_rate": 4.409681227863046e-05, |
| "loss": 0.1853, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.6439841150584952, |
| "grad_norm": 0.267437607049942, |
| "learning_rate": 4.356015884941505e-05, |
| "loss": 0.1888, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6976494579800365, |
| "grad_norm": 0.24703972041606903, |
| "learning_rate": 4.302350542019964e-05, |
| "loss": 0.1863, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7513148009015778, |
| "grad_norm": 0.20265009999275208, |
| "learning_rate": 4.2486851990984225e-05, |
| "loss": 0.1854, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.804980143823119, |
| "grad_norm": 0.3746808171272278, |
| "learning_rate": 4.195019856176881e-05, |
| "loss": 0.1838, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8586454867446603, |
| "grad_norm": 0.2632112503051758, |
| "learning_rate": 4.14135451325534e-05, |
| "loss": 0.1827, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.9123108296662016, |
| "grad_norm": 0.3191295564174652, |
| "learning_rate": 4.0876891703337986e-05, |
| "loss": 0.18, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.9659761725877428, |
| "grad_norm": 0.2685633599758148, |
| "learning_rate": 4.034023827412257e-05, |
| "loss": 0.1783, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.18574130535125732, |
| "eval_runtime": 121.9059, |
| "eval_samples_per_second": 178.105, |
| "eval_steps_per_second": 11.132, |
| "step": 9317 |
| }, |
| { |
| "epoch": 1.0196415155092842, |
| "grad_norm": 0.20498144626617432, |
| "learning_rate": 3.980358484490716e-05, |
| "loss": 0.1754, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0733068584308254, |
| "grad_norm": 0.2528730034828186, |
| "learning_rate": 3.9266931415691746e-05, |
| "loss": 0.1755, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.1269722013523666, |
| "grad_norm": 0.28406932950019836, |
| "learning_rate": 3.873027798647634e-05, |
| "loss": 0.1724, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.1806375442739079, |
| "grad_norm": 0.20382213592529297, |
| "learning_rate": 3.819362455726092e-05, |
| "loss": 0.1729, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.2343028871954491, |
| "grad_norm": 0.22263003885746002, |
| "learning_rate": 3.765697112804551e-05, |
| "loss": 0.1791, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.2879682301169906, |
| "grad_norm": 0.2563818395137787, |
| "learning_rate": 3.71203176988301e-05, |
| "loss": 0.1758, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.3416335730385316, |
| "grad_norm": 0.2475346475839615, |
| "learning_rate": 3.658366426961469e-05, |
| "loss": 0.1798, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.395298915960073, |
| "grad_norm": 0.246457040309906, |
| "learning_rate": 3.604701084039927e-05, |
| "loss": 0.1752, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4489642588816143, |
| "grad_norm": 0.18007038533687592, |
| "learning_rate": 3.551035741118386e-05, |
| "loss": 0.1708, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.5026296018031555, |
| "grad_norm": 0.3777049779891968, |
| "learning_rate": 3.497370398196845e-05, |
| "loss": 0.174, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5562949447246968, |
| "grad_norm": 0.29228395223617554, |
| "learning_rate": 3.443705055275303e-05, |
| "loss": 0.1723, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.609960287646238, |
| "grad_norm": 0.2725663483142853, |
| "learning_rate": 3.390039712353762e-05, |
| "loss": 0.173, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.6636256305677795, |
| "grad_norm": 0.2582469880580902, |
| "learning_rate": 3.336374369432221e-05, |
| "loss": 0.1739, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.7172909734893205, |
| "grad_norm": 0.3346666395664215, |
| "learning_rate": 3.28270902651068e-05, |
| "loss": 0.1723, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.770956316410862, |
| "grad_norm": 0.3053486943244934, |
| "learning_rate": 3.2290436835891384e-05, |
| "loss": 0.1702, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.8246216593324032, |
| "grad_norm": 0.231419175863266, |
| "learning_rate": 3.175378340667597e-05, |
| "loss": 0.1721, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8782870022539444, |
| "grad_norm": 0.29271528124809265, |
| "learning_rate": 3.121712997746056e-05, |
| "loss": 0.1724, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.9319523451754856, |
| "grad_norm": 0.19697044789791107, |
| "learning_rate": 3.0680476548245145e-05, |
| "loss": 0.1737, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9856176880970269, |
| "grad_norm": 0.19517184793949127, |
| "learning_rate": 3.014382311902973e-05, |
| "loss": 0.1762, |
| "step": 18500 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.18117234110832214, |
| "eval_runtime": 121.7819, |
| "eval_samples_per_second": 178.286, |
| "eval_steps_per_second": 11.143, |
| "step": 18634 |
| }, |
| { |
| "epoch": 2.0392830310185683, |
| "grad_norm": 0.38678213953971863, |
| "learning_rate": 2.9607169689814317e-05, |
| "loss": 0.1706, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.0929483739401094, |
| "grad_norm": 0.37432995438575745, |
| "learning_rate": 2.9070516260598908e-05, |
| "loss": 0.1676, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.146613716861651, |
| "grad_norm": 0.230524942278862, |
| "learning_rate": 2.8533862831383496e-05, |
| "loss": 0.17, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.200279059783192, |
| "grad_norm": 0.28219106793403625, |
| "learning_rate": 2.7997209402168077e-05, |
| "loss": 0.1701, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.2539444027047333, |
| "grad_norm": 0.31730708479881287, |
| "learning_rate": 2.7460555972952668e-05, |
| "loss": 0.1694, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.3076097456262747, |
| "grad_norm": 0.20662575960159302, |
| "learning_rate": 2.6923902543737256e-05, |
| "loss": 0.1679, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.3612750885478158, |
| "grad_norm": 0.22145278751850128, |
| "learning_rate": 2.6387249114521844e-05, |
| "loss": 0.1697, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.414940431469357, |
| "grad_norm": 0.17506997287273407, |
| "learning_rate": 2.585059568530643e-05, |
| "loss": 0.1657, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.4686057743908982, |
| "grad_norm": 0.22657690942287445, |
| "learning_rate": 2.5313942256091016e-05, |
| "loss": 0.1682, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.5222711173124397, |
| "grad_norm": 0.2509589195251465, |
| "learning_rate": 2.4777288826875604e-05, |
| "loss": 0.1642, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.575936460233981, |
| "grad_norm": 0.1847866326570511, |
| "learning_rate": 2.4240635397660192e-05, |
| "loss": 0.1712, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.629601803155522, |
| "grad_norm": 0.33803707361221313, |
| "learning_rate": 2.370398196844478e-05, |
| "loss": 0.1688, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.683267146077063, |
| "grad_norm": 0.16116875410079956, |
| "learning_rate": 2.3167328539229368e-05, |
| "loss": 0.169, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.7369324889986046, |
| "grad_norm": 0.21728037297725677, |
| "learning_rate": 2.2630675110013955e-05, |
| "loss": 0.1673, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.790597831920146, |
| "grad_norm": 0.26892897486686707, |
| "learning_rate": 2.209402168079854e-05, |
| "loss": 0.167, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.844263174841687, |
| "grad_norm": 0.24722512066364288, |
| "learning_rate": 2.1557368251583128e-05, |
| "loss": 0.1702, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.8979285177632286, |
| "grad_norm": 0.26736003160476685, |
| "learning_rate": 2.1020714822367716e-05, |
| "loss": 0.1646, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9515938606847696, |
| "grad_norm": 0.32340237498283386, |
| "learning_rate": 2.0484061393152303e-05, |
| "loss": 0.167, |
| "step": 27500 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.17867647111415863, |
| "eval_runtime": 120.9369, |
| "eval_samples_per_second": 179.532, |
| "eval_steps_per_second": 11.221, |
| "step": 27951 |
| }, |
| { |
| "epoch": 3.005259203606311, |
| "grad_norm": 0.20410487055778503, |
| "learning_rate": 1.994740796393689e-05, |
| "loss": 0.1616, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0589245465278525, |
| "grad_norm": 0.2781909704208374, |
| "learning_rate": 1.941075453472148e-05, |
| "loss": 0.1622, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.1125898894493935, |
| "grad_norm": 0.23636963963508606, |
| "learning_rate": 1.8874101105506064e-05, |
| "loss": 0.1671, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.166255232370935, |
| "grad_norm": 0.17557688057422638, |
| "learning_rate": 1.8337447676290655e-05, |
| "loss": 0.1637, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.219920575292476, |
| "grad_norm": 0.15285401046276093, |
| "learning_rate": 1.780079424707524e-05, |
| "loss": 0.1625, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.2735859182140175, |
| "grad_norm": 0.18128257989883423, |
| "learning_rate": 1.7264140817859827e-05, |
| "loss": 0.1634, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.3272512611355585, |
| "grad_norm": 0.2326362580060959, |
| "learning_rate": 1.6727487388644415e-05, |
| "loss": 0.1642, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3809166040571, |
| "grad_norm": 0.28395962715148926, |
| "learning_rate": 1.6190833959429003e-05, |
| "loss": 0.1644, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.4345819469786414, |
| "grad_norm": 0.22677470743656158, |
| "learning_rate": 1.5654180530213587e-05, |
| "loss": 0.1648, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.4882472899001824, |
| "grad_norm": 0.27061572670936584, |
| "learning_rate": 1.5117527100998177e-05, |
| "loss": 0.1647, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.541912632821724, |
| "grad_norm": 0.23157773911952972, |
| "learning_rate": 1.4580873671782763e-05, |
| "loss": 0.1642, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.595577975743265, |
| "grad_norm": 0.22143514454364777, |
| "learning_rate": 1.4044220242567353e-05, |
| "loss": 0.1668, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.6492433186648063, |
| "grad_norm": 0.2169780731201172, |
| "learning_rate": 1.3507566813351939e-05, |
| "loss": 0.1658, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.7029086615863473, |
| "grad_norm": 0.3290941119194031, |
| "learning_rate": 1.2970913384136527e-05, |
| "loss": 0.1647, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.756574004507889, |
| "grad_norm": 0.29469916224479675, |
| "learning_rate": 1.2434259954921113e-05, |
| "loss": 0.1638, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.8102393474294303, |
| "grad_norm": 0.29265907406806946, |
| "learning_rate": 1.18976065257057e-05, |
| "loss": 0.1618, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.8639046903509713, |
| "grad_norm": 0.192903533577919, |
| "learning_rate": 1.1360953096490287e-05, |
| "loss": 0.1673, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.9175700332725127, |
| "grad_norm": 0.29646775126457214, |
| "learning_rate": 1.0824299667274874e-05, |
| "loss": 0.1631, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.9712353761940538, |
| "grad_norm": 0.2786768674850464, |
| "learning_rate": 1.0287646238059462e-05, |
| "loss": 0.1634, |
| "step": 37000 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.17870686948299408, |
| "eval_runtime": 120.9355, |
| "eval_samples_per_second": 179.534, |
| "eval_steps_per_second": 11.221, |
| "step": 37268 |
| }, |
| { |
| "epoch": 4.024900719115595, |
| "grad_norm": 0.18881458044052124, |
| "learning_rate": 9.750992808844048e-06, |
| "loss": 0.1587, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.078566062037137, |
| "grad_norm": 0.2428637146949768, |
| "learning_rate": 9.214339379628636e-06, |
| "loss": 0.1664, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.132231404958677, |
| "grad_norm": 0.2574012279510498, |
| "learning_rate": 8.677685950413224e-06, |
| "loss": 0.1602, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.185896747880219, |
| "grad_norm": 0.2370443195104599, |
| "learning_rate": 8.14103252119781e-06, |
| "loss": 0.1619, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.23956209080176, |
| "grad_norm": 0.2101278007030487, |
| "learning_rate": 7.604379091982398e-06, |
| "loss": 0.1636, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.293227433723302, |
| "grad_norm": 0.1884097009897232, |
| "learning_rate": 7.067725662766986e-06, |
| "loss": 0.1613, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.346892776644843, |
| "grad_norm": 0.2265803962945938, |
| "learning_rate": 6.531072233551573e-06, |
| "loss": 0.1657, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.400558119566384, |
| "grad_norm": 0.2761909067630768, |
| "learning_rate": 5.99441880433616e-06, |
| "loss": 0.1641, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.454223462487925, |
| "grad_norm": 0.2776241898536682, |
| "learning_rate": 5.457765375120747e-06, |
| "loss": 0.1611, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.507888805409467, |
| "grad_norm": 0.17776153981685638, |
| "learning_rate": 4.921111945905334e-06, |
| "loss": 0.1641, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.561554148331008, |
| "grad_norm": 0.29574069380760193, |
| "learning_rate": 4.384458516689922e-06, |
| "loss": 0.1619, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.6152194912525495, |
| "grad_norm": 0.25601324439048767, |
| "learning_rate": 3.847805087474509e-06, |
| "loss": 0.1598, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.66888483417409, |
| "grad_norm": 0.2692703306674957, |
| "learning_rate": 3.3111516582590963e-06, |
| "loss": 0.1604, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.7225501770956315, |
| "grad_norm": 0.24300901591777802, |
| "learning_rate": 2.7744982290436837e-06, |
| "loss": 0.1631, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.776215520017173, |
| "grad_norm": 0.2995280623435974, |
| "learning_rate": 2.237844799828271e-06, |
| "loss": 0.1647, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.829880862938714, |
| "grad_norm": 0.26150044798851013, |
| "learning_rate": 1.7011913706128583e-06, |
| "loss": 0.1619, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.883546205860256, |
| "grad_norm": 0.22888287901878357, |
| "learning_rate": 1.1645379413974456e-06, |
| "loss": 0.1607, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.9372115487817965, |
| "grad_norm": 0.3330378234386444, |
| "learning_rate": 6.278845121820329e-07, |
| "loss": 0.1607, |
| "step": 46000 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 46585, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.490275612904653e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|