{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.0,
  "eval_steps": 500,
  "global_step": 14728,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09505703422053231,
      "grad_norm": 0.9250678420066833,
      "learning_rate": 3.95882818685669e-05,
      "loss": 6.6663,
      "step": 100
    },
    {
      "epoch": 0.19011406844106463,
      "grad_norm": 1.117205262184143,
      "learning_rate": 7.91765637371338e-05,
      "loss": 4.145,
      "step": 200
    },
    {
      "epoch": 0.28517110266159695,
      "grad_norm": 1.3325448036193848,
      "learning_rate": 0.00011876484560570071,
      "loss": 3.8608,
      "step": 300
    },
    {
      "epoch": 0.38022813688212925,
      "grad_norm": 0.9613128900527954,
      "learning_rate": 0.0001583531274742676,
      "loss": 3.768,
      "step": 400
    },
    {
      "epoch": 0.4752851711026616,
      "grad_norm": 1.1198444366455078,
      "learning_rate": 0.00019794140934283454,
      "loss": 3.7093,
      "step": 500
    },
    {
      "epoch": 0.5703422053231939,
      "grad_norm": 1.0210366249084473,
      "learning_rate": 0.00023752969121140142,
      "loss": 3.6664,
      "step": 600
    },
    {
      "epoch": 0.6653992395437263,
      "grad_norm": 0.9687440395355225,
      "learning_rate": 0.00027711797307996834,
      "loss": 3.5409,
      "step": 700
    },
    {
      "epoch": 0.7604562737642585,
      "grad_norm": 1.4981633424758911,
      "learning_rate": 0.0003167062549485352,
      "loss": 3.3992,
      "step": 800
    },
    {
      "epoch": 0.8555133079847909,
      "grad_norm": 0.8627603650093079,
      "learning_rate": 0.00035629453681710216,
      "loss": 3.1727,
      "step": 900
    },
    {
      "epoch": 0.9505703422053232,
      "grad_norm": 0.9925593733787537,
      "learning_rate": 0.0003958828186856691,
      "loss": 2.9724,
      "step": 1000
    },
    {
      "epoch": 1.0,
      "eval_loss": 2.6960370540618896,
      "eval_runtime": 3.7345,
      "eval_samples_per_second": 1897.152,
      "eval_steps_per_second": 118.622,
      "step": 1052
    },
    {
      "epoch": 1.0456273764258555,
      "grad_norm": 0.9267619848251343,
      "learning_rate": 0.00043547110055423594,
      "loss": 2.7022,
      "step": 1100
    },
    {
      "epoch": 1.1406844106463878,
      "grad_norm": 0.7666485905647278,
      "learning_rate": 0.00047505938242280285,
      "loss": 2.5641,
      "step": 1200
    },
    {
      "epoch": 1.2357414448669202,
      "grad_norm": 0.5969619154930115,
      "learning_rate": 0.0004990645699549983,
      "loss": 2.5058,
      "step": 1300
    },
    {
      "epoch": 1.3307984790874525,
      "grad_norm": 0.7782655358314514,
      "learning_rate": 0.0004965363806441826,
      "loss": 2.4665,
      "step": 1400
    },
    {
      "epoch": 1.4258555133079849,
      "grad_norm": 0.8928040266036987,
      "learning_rate": 0.000494008191333367,
      "loss": 2.4311,
      "step": 1500
    },
    {
      "epoch": 1.5209125475285172,
      "grad_norm": 0.8687949180603027,
      "learning_rate": 0.0004914800020225515,
      "loss": 2.3964,
      "step": 1600
    },
    {
      "epoch": 1.6159695817490496,
      "grad_norm": 0.6245518922805786,
      "learning_rate": 0.0004889518127117359,
      "loss": 2.374,
      "step": 1700
    },
    {
      "epoch": 1.7110266159695817,
      "grad_norm": 0.6903976202011108,
      "learning_rate": 0.0004864236234009203,
      "loss": 2.3606,
      "step": 1800
    },
    {
      "epoch": 1.806083650190114,
      "grad_norm": 0.8996257781982422,
      "learning_rate": 0.00048389543409010466,
      "loss": 2.3376,
      "step": 1900
    },
    {
      "epoch": 1.9011406844106464,
      "grad_norm": 0.734466016292572,
      "learning_rate": 0.0004813672447792891,
      "loss": 2.3226,
      "step": 2000
    },
    {
      "epoch": 1.9961977186311786,
      "grad_norm": 0.6836825013160706,
      "learning_rate": 0.0004788390554684735,
      "loss": 2.3108,
      "step": 2100
    },
    {
      "epoch": 2.0,
      "eval_loss": 2.285733461380005,
      "eval_runtime": 3.623,
      "eval_samples_per_second": 1955.579,
      "eval_steps_per_second": 122.275,
      "step": 2104
    },
    {
      "epoch": 2.091254752851711,
      "grad_norm": 0.5974160432815552,
      "learning_rate": 0.0004763108661576579,
      "loss": 2.2585,
      "step": 2200
    },
    {
      "epoch": 2.1863117870722433,
      "grad_norm": 0.788093626499176,
      "learning_rate": 0.0004737826768468423,
      "loss": 2.264,
      "step": 2300
    },
    {
      "epoch": 2.2813688212927756,
      "grad_norm": 0.7451100945472717,
      "learning_rate": 0.00047125448753602674,
      "loss": 2.2504,
      "step": 2400
    },
    {
      "epoch": 2.376425855513308,
      "grad_norm": 0.6724629998207092,
      "learning_rate": 0.0004687262982252111,
      "loss": 2.2358,
      "step": 2500
    },
    {
      "epoch": 2.4714828897338403,
      "grad_norm": 0.6606141924858093,
      "learning_rate": 0.00046619810891439554,
      "loss": 2.2301,
      "step": 2600
    },
    {
      "epoch": 2.5665399239543727,
      "grad_norm": 0.6599621772766113,
      "learning_rate": 0.0004636699196035799,
      "loss": 2.2268,
      "step": 2700
    },
    {
      "epoch": 2.661596958174905,
      "grad_norm": 0.6633493304252625,
      "learning_rate": 0.00046114173029276434,
      "loss": 2.2247,
      "step": 2800
    },
    {
      "epoch": 2.7566539923954374,
      "grad_norm": 0.6308265328407288,
      "learning_rate": 0.00045861354098194877,
      "loss": 2.2221,
      "step": 2900
    },
    {
      "epoch": 2.8517110266159698,
      "grad_norm": 0.6383451223373413,
      "learning_rate": 0.00045608535167113314,
      "loss": 2.2274,
      "step": 3000
    },
    {
      "epoch": 2.9467680608365017,
      "grad_norm": 0.61512291431427,
      "learning_rate": 0.00045355716236031757,
      "loss": 2.2067,
      "step": 3100
    },
    {
      "epoch": 3.0,
      "eval_loss": 2.2008087635040283,
      "eval_runtime": 3.5613,
      "eval_samples_per_second": 1989.445,
      "eval_steps_per_second": 124.393,
      "step": 3156
    },
    {
      "epoch": 3.041825095057034,
      "grad_norm": 0.7461186647415161,
      "learning_rate": 0.00045102897304950194,
      "loss": 2.1882,
      "step": 3200
    },
    {
      "epoch": 3.1368821292775664,
      "grad_norm": 0.6590662598609924,
      "learning_rate": 0.00044850078373868637,
      "loss": 2.1662,
      "step": 3300
    },
    {
      "epoch": 3.2319391634980987,
      "grad_norm": 0.5832785964012146,
      "learning_rate": 0.00044597259442787074,
      "loss": 2.1603,
      "step": 3400
    },
    {
      "epoch": 3.326996197718631,
      "grad_norm": 0.6356543898582458,
      "learning_rate": 0.00044344440511705517,
      "loss": 2.1601,
      "step": 3500
    },
    {
      "epoch": 3.4220532319391634,
      "grad_norm": 0.7197031378746033,
      "learning_rate": 0.0004409162158062396,
      "loss": 2.1567,
      "step": 3600
    },
    {
      "epoch": 3.517110266159696,
      "grad_norm": 0.5856086611747742,
      "learning_rate": 0.00043838802649542397,
      "loss": 2.1588,
      "step": 3700
    },
    {
      "epoch": 3.612167300380228,
      "grad_norm": 0.6212655305862427,
      "learning_rate": 0.00043585983718460834,
      "loss": 2.1565,
      "step": 3800
    },
    {
      "epoch": 3.7072243346007605,
      "grad_norm": 0.6765671968460083,
      "learning_rate": 0.0004333316478737928,
      "loss": 2.1667,
      "step": 3900
    },
    {
      "epoch": 3.802281368821293,
      "grad_norm": 0.6720090508460999,
      "learning_rate": 0.0004308034585629772,
      "loss": 2.1675,
      "step": 4000
    },
    {
      "epoch": 3.897338403041825,
      "grad_norm": 0.7150991559028625,
      "learning_rate": 0.00042827526925216157,
      "loss": 2.1474,
      "step": 4100
    },
    {
      "epoch": 3.9923954372623576,
      "grad_norm": 0.5831249356269836,
      "learning_rate": 0.00042574707994134605,
      "loss": 2.1485,
      "step": 4200
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.15364408493042,
      "eval_runtime": 3.644,
      "eval_samples_per_second": 1944.292,
      "eval_steps_per_second": 121.57,
      "step": 4208
    },
    {
      "epoch": 4.08745247148289,
      "grad_norm": 0.6653150916099548,
      "learning_rate": 0.0004232188906305304,
      "loss": 2.0899,
      "step": 4300
    },
    {
      "epoch": 4.182509505703422,
      "grad_norm": 0.7235066294670105,
      "learning_rate": 0.0004206907013197148,
      "loss": 2.0982,
      "step": 4400
    },
    {
      "epoch": 4.277566539923955,
      "grad_norm": 0.7326545715332031,
      "learning_rate": 0.0004181625120088992,
      "loss": 2.1007,
      "step": 4500
    },
    {
      "epoch": 4.3726235741444865,
      "grad_norm": 0.6236776113510132,
      "learning_rate": 0.00041563432269808365,
      "loss": 2.1031,
      "step": 4600
    },
    {
      "epoch": 4.467680608365019,
      "grad_norm": 0.5669475197792053,
      "learning_rate": 0.000413106133387268,
      "loss": 2.1087,
      "step": 4700
    },
    {
      "epoch": 4.562737642585551,
      "grad_norm": 0.5483006834983826,
      "learning_rate": 0.00041057794407645245,
      "loss": 2.1034,
      "step": 4800
    },
    {
      "epoch": 4.657794676806084,
      "grad_norm": 0.5456926822662354,
      "learning_rate": 0.0004080497547656369,
      "loss": 2.1065,
      "step": 4900
    },
    {
      "epoch": 4.752851711026616,
      "grad_norm": 0.9545803666114807,
      "learning_rate": 0.00040552156545482125,
      "loss": 2.1168,
      "step": 5000
    },
    {
      "epoch": 4.847908745247148,
      "grad_norm": 0.5378767251968384,
      "learning_rate": 0.0004029933761440057,
      "loss": 2.1107,
      "step": 5100
    },
    {
      "epoch": 4.942965779467681,
      "grad_norm": 0.629880964756012,
      "learning_rate": 0.00040046518683319005,
      "loss": 2.0983,
      "step": 5200
    },
    {
      "epoch": 5.0,
      "eval_loss": 2.132718801498413,
      "eval_runtime": 3.6373,
      "eval_samples_per_second": 1947.857,
      "eval_steps_per_second": 121.793,
      "step": 5260
    },
    {
      "epoch": 5.038022813688213,
      "grad_norm": 0.5900342464447021,
      "learning_rate": 0.0003979369975223745,
      "loss": 2.0758,
      "step": 5300
    },
    {
      "epoch": 5.133079847908745,
      "grad_norm": 0.6181082129478455,
      "learning_rate": 0.0003954088082115589,
      "loss": 2.041,
      "step": 5400
    },
    {
      "epoch": 5.228136882129277,
      "grad_norm": 0.6756412386894226,
      "learning_rate": 0.0003928806189007433,
      "loss": 2.0548,
      "step": 5500
    },
    {
      "epoch": 5.32319391634981,
      "grad_norm": 0.6649320125579834,
      "learning_rate": 0.0003903524295899277,
      "loss": 2.0438,
      "step": 5600
    },
    {
      "epoch": 5.418250950570342,
      "grad_norm": 0.5628513693809509,
      "learning_rate": 0.00038782424027911214,
      "loss": 2.0485,
      "step": 5700
    },
    {
      "epoch": 5.513307984790875,
      "grad_norm": 0.6923677921295166,
      "learning_rate": 0.0003852960509682965,
      "loss": 2.063,
      "step": 5800
    },
    {
      "epoch": 5.608365019011407,
      "grad_norm": 0.6819363236427307,
      "learning_rate": 0.0003827678616574809,
      "loss": 2.0618,
      "step": 5900
    },
    {
      "epoch": 5.7034220532319395,
      "grad_norm": 0.6446284055709839,
      "learning_rate": 0.00038023967234666537,
      "loss": 2.0674,
      "step": 6000
    },
    {
      "epoch": 5.798479087452471,
      "grad_norm": 0.6319680213928223,
      "learning_rate": 0.00037771148303584974,
      "loss": 2.061,
      "step": 6100
    },
    {
      "epoch": 5.893536121673003,
      "grad_norm": 0.6318814754486084,
      "learning_rate": 0.0003751832937250341,
      "loss": 2.0656,
      "step": 6200
    },
    {
      "epoch": 5.988593155893536,
      "grad_norm": 0.6261875033378601,
      "learning_rate": 0.0003726551044142186,
      "loss": 2.0663,
      "step": 6300
    },
    {
      "epoch": 6.0,
      "eval_loss": 2.1098814010620117,
      "eval_runtime": 3.698,
      "eval_samples_per_second": 1915.889,
      "eval_steps_per_second": 119.794,
      "step": 6312
    },
    {
      "epoch": 6.083650190114068,
      "grad_norm": 0.6620230674743652,
      "learning_rate": 0.00037012691510340297,
      "loss": 1.9996,
      "step": 6400
    },
    {
      "epoch": 6.178707224334601,
      "grad_norm": 1.0794607400894165,
      "learning_rate": 0.00036759872579258734,
      "loss": 2.0018,
      "step": 6500
    },
    {
      "epoch": 6.273764258555133,
      "grad_norm": 1.372861385345459,
      "learning_rate": 0.00036507053648177177,
      "loss": 2.0059,
      "step": 6600
    },
    {
      "epoch": 6.3688212927756656,
      "grad_norm": 0.5926664471626282,
      "learning_rate": 0.0003625423471709562,
      "loss": 2.012,
      "step": 6700
    },
    {
      "epoch": 6.4638783269961975,
      "grad_norm": 0.7855852246284485,
      "learning_rate": 0.00036001415786014057,
      "loss": 2.0128,
      "step": 6800
    },
    {
      "epoch": 6.55893536121673,
      "grad_norm": 0.6684075593948364,
      "learning_rate": 0.000357485968549325,
      "loss": 2.0221,
      "step": 6900
    },
    {
      "epoch": 6.653992395437262,
      "grad_norm": 0.628013014793396,
      "learning_rate": 0.00035495777923850937,
      "loss": 2.0159,
      "step": 7000
    },
    {
      "epoch": 6.749049429657795,
      "grad_norm": 0.7943947911262512,
      "learning_rate": 0.0003524295899276938,
      "loss": 2.0223,
      "step": 7100
    },
    {
      "epoch": 6.844106463878327,
      "grad_norm": 0.645799994468689,
      "learning_rate": 0.0003499014006168782,
      "loss": 2.0206,
      "step": 7200
    },
    {
      "epoch": 6.93916349809886,
      "grad_norm": 0.6603648066520691,
      "learning_rate": 0.0003473732113060626,
      "loss": 2.0304,
      "step": 7300
    },
    {
      "epoch": 7.0,
      "eval_loss": 2.099062919616699,
      "eval_runtime": 3.631,
      "eval_samples_per_second": 1951.251,
      "eval_steps_per_second": 122.005,
      "step": 7364
    },
    {
      "epoch": 7.034220532319392,
      "grad_norm": 0.6082973480224609,
      "learning_rate": 0.000344845021995247,
      "loss": 2.0039,
      "step": 7400
    },
    {
      "epoch": 7.129277566539924,
      "grad_norm": 0.673995852470398,
      "learning_rate": 0.0003423168326844314,
      "loss": 1.9663,
      "step": 7500
    },
    {
      "epoch": 7.224334600760456,
      "grad_norm": 0.675037682056427,
      "learning_rate": 0.0003397886433736158,
      "loss": 1.9696,
      "step": 7600
    },
    {
      "epoch": 7.319391634980988,
      "grad_norm": 0.6488978266716003,
      "learning_rate": 0.0003372604540628002,
      "loss": 1.9701,
      "step": 7700
    },
    {
      "epoch": 7.414448669201521,
      "grad_norm": 0.8255399465560913,
      "learning_rate": 0.0003347322647519846,
      "loss": 1.9654,
      "step": 7800
    },
    {
      "epoch": 7.509505703422053,
      "grad_norm": 1.2661654949188232,
      "learning_rate": 0.00033220407544116905,
      "loss": 1.9736,
      "step": 7900
    },
    {
      "epoch": 7.604562737642586,
      "grad_norm": 0.6545805335044861,
      "learning_rate": 0.0003296758861303534,
      "loss": 1.9783,
      "step": 8000
    },
    {
      "epoch": 7.699619771863118,
      "grad_norm": 0.8890361189842224,
      "learning_rate": 0.00032714769681953785,
      "loss": 1.9807,
      "step": 8100
    },
    {
      "epoch": 7.79467680608365,
      "grad_norm": 0.6547899842262268,
      "learning_rate": 0.0003246195075087223,
      "loss": 1.9723,
      "step": 8200
    },
    {
      "epoch": 7.889733840304182,
      "grad_norm": 1.1239402294158936,
      "learning_rate": 0.00032209131819790665,
      "loss": 1.9734,
      "step": 8300
    },
    {
      "epoch": 7.984790874524715,
      "grad_norm": 0.6624830961227417,
      "learning_rate": 0.000319563128887091,
      "loss": 1.9869,
      "step": 8400
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.1034328937530518,
      "eval_runtime": 3.6013,
      "eval_samples_per_second": 1967.337,
      "eval_steps_per_second": 123.011,
      "step": 8416
    },
    {
      "epoch": 8.079847908745247,
      "grad_norm": 0.6550971269607544,
      "learning_rate": 0.0003170349395762755,
      "loss": 1.9223,
      "step": 8500
    },
    {
      "epoch": 8.17490494296578,
      "grad_norm": 0.660987138748169,
      "learning_rate": 0.0003145067502654599,
      "loss": 1.9245,
      "step": 8600
    },
    {
      "epoch": 8.269961977186313,
      "grad_norm": 0.759884774684906,
      "learning_rate": 0.00031197856095464425,
      "loss": 1.9235,
      "step": 8700
    },
    {
      "epoch": 8.365019011406844,
      "grad_norm": 0.9319919347763062,
      "learning_rate": 0.00030945037164382874,
      "loss": 1.9239,
      "step": 8800
    },
    {
      "epoch": 8.460076045627376,
      "grad_norm": 0.6610597968101501,
      "learning_rate": 0.0003069221823330131,
      "loss": 1.928,
      "step": 8900
    },
    {
      "epoch": 8.55513307984791,
      "grad_norm": 0.7076143622398376,
      "learning_rate": 0.0003043939930221975,
      "loss": 1.9289,
      "step": 9000
    },
    {
      "epoch": 8.65019011406844,
      "grad_norm": 0.6368849873542786,
      "learning_rate": 0.0003018658037113819,
      "loss": 1.932,
      "step": 9100
    },
    {
      "epoch": 8.745247148288973,
      "grad_norm": 0.7639185786247253,
      "learning_rate": 0.00029933761440056634,
      "loss": 1.9485,
      "step": 9200
    },
    {
      "epoch": 8.840304182509506,
      "grad_norm": 1.0823330879211426,
      "learning_rate": 0.0002968094250897507,
      "loss": 1.9447,
      "step": 9300
    },
    {
      "epoch": 8.935361216730039,
      "grad_norm": 0.8542035222053528,
      "learning_rate": 0.00029428123577893514,
      "loss": 1.942,
      "step": 9400
    },
    {
      "epoch": 9.0,
      "eval_loss": 2.0947535037994385,
      "eval_runtime": 3.6147,
      "eval_samples_per_second": 1960.063,
      "eval_steps_per_second": 122.556,
      "step": 9468
    },
    {
      "epoch": 9.03041825095057,
      "grad_norm": 0.7601971626281738,
      "learning_rate": 0.00029175304646811956,
      "loss": 1.9243,
      "step": 9500
    },
    {
      "epoch": 9.125475285171103,
      "grad_norm": 0.7461040019989014,
      "learning_rate": 0.00028922485715730394,
      "loss": 1.8704,
      "step": 9600
    },
    {
      "epoch": 9.220532319391635,
      "grad_norm": 0.7719326019287109,
      "learning_rate": 0.00028669666784648836,
      "loss": 1.8832,
      "step": 9700
    },
    {
      "epoch": 9.315589353612168,
      "grad_norm": 0.716136634349823,
      "learning_rate": 0.00028416847853567274,
      "loss": 1.8787,
      "step": 9800
    },
    {
      "epoch": 9.4106463878327,
      "grad_norm": 0.6928532123565674,
      "learning_rate": 0.00028164028922485717,
      "loss": 1.8855,
      "step": 9900
    },
    {
      "epoch": 9.505703422053232,
      "grad_norm": 0.7696681618690491,
      "learning_rate": 0.0002791120999140416,
      "loss": 1.8855,
      "step": 10000
    },
    {
      "epoch": 9.600760456273765,
      "grad_norm": 0.8969391584396362,
      "learning_rate": 0.00027658391060322597,
      "loss": 1.9034,
      "step": 10100
    },
    {
      "epoch": 9.695817490494296,
      "grad_norm": 0.8469530940055847,
      "learning_rate": 0.00027405572129241034,
      "loss": 1.8965,
      "step": 10200
    },
    {
      "epoch": 9.790874524714829,
      "grad_norm": 0.7956866025924683,
      "learning_rate": 0.0002715275319815948,
      "loss": 1.9087,
      "step": 10300
    },
    {
      "epoch": 9.885931558935361,
      "grad_norm": 0.8293343782424927,
      "learning_rate": 0.0002689993426707792,
      "loss": 1.9177,
      "step": 10400
    },
    {
      "epoch": 9.980988593155894,
      "grad_norm": 0.7472631931304932,
      "learning_rate": 0.00026647115335996357,
      "loss": 1.9082,
      "step": 10500
    },
    {
      "epoch": 10.0,
      "eval_loss": 2.097904920578003,
      "eval_runtime": 3.5592,
      "eval_samples_per_second": 1990.641,
      "eval_steps_per_second": 124.468,
      "step": 10520
    },
    {
      "epoch": 10.076045627376425,
      "grad_norm": 0.7787309288978577,
      "learning_rate": 0.00026394296404914805,
      "loss": 1.8393,
      "step": 10600
    },
    {
      "epoch": 10.171102661596958,
      "grad_norm": 1.3328174352645874,
      "learning_rate": 0.0002614147747383324,
      "loss": 1.8283,
      "step": 10700
    },
    {
      "epoch": 10.26615969581749,
      "grad_norm": 0.7740694284439087,
      "learning_rate": 0.0002588865854275168,
      "loss": 1.8422,
      "step": 10800
    },
    {
      "epoch": 10.361216730038024,
      "grad_norm": 0.828940749168396,
      "learning_rate": 0.0002563583961167012,
      "loss": 1.8516,
      "step": 10900
    },
    {
      "epoch": 10.456273764258555,
      "grad_norm": 0.751752495765686,
      "learning_rate": 0.00025383020680588565,
      "loss": 1.8624,
      "step": 11000
    },
    {
      "epoch": 10.551330798479087,
      "grad_norm": 0.9940192103385925,
      "learning_rate": 0.00025130201749507,
      "loss": 1.8599,
      "step": 11100
    },
    {
      "epoch": 10.64638783269962,
      "grad_norm": 0.8591569066047668,
      "learning_rate": 0.00024877382818425445,
      "loss": 1.8581,
      "step": 11200
    },
    {
      "epoch": 10.741444866920151,
      "grad_norm": 0.7676281332969666,
      "learning_rate": 0.0002462456388734388,
      "loss": 1.8637,
      "step": 11300
    },
    {
      "epoch": 10.836501901140684,
      "grad_norm": 0.7896871566772461,
      "learning_rate": 0.00024371744956262325,
      "loss": 1.8606,
      "step": 11400
    },
    {
      "epoch": 10.931558935361217,
      "grad_norm": 0.8302274942398071,
      "learning_rate": 0.00024118926025180765,
      "loss": 1.8656,
      "step": 11500
    },
    {
      "epoch": 11.0,
      "eval_loss": 2.0961618423461914,
      "eval_runtime": 3.6362,
      "eval_samples_per_second": 1948.473,
      "eval_steps_per_second": 121.831,
      "step": 11572
    },
    {
      "epoch": 11.02661596958175,
      "grad_norm": 0.8891871571540833,
      "learning_rate": 0.00023866107094099208,
      "loss": 1.8522,
      "step": 11600
    },
    {
      "epoch": 11.12167300380228,
      "grad_norm": 0.7549653649330139,
      "learning_rate": 0.00023613288163017645,
      "loss": 1.7913,
      "step": 11700
    },
    {
      "epoch": 11.216730038022813,
      "grad_norm": 0.8127674460411072,
      "learning_rate": 0.00023360469231936088,
      "loss": 1.8102,
      "step": 11800
    },
    {
      "epoch": 11.311787072243346,
      "grad_norm": 0.841659426689148,
      "learning_rate": 0.0002310765030085453,
      "loss": 1.803,
      "step": 11900
    },
    {
      "epoch": 11.406844106463879,
      "grad_norm": 0.8460645079612732,
      "learning_rate": 0.00022854831369772968,
      "loss": 1.8201,
      "step": 12000
    },
    {
      "epoch": 11.50190114068441,
      "grad_norm": 0.7932580709457397,
      "learning_rate": 0.0002260201243869141,
      "loss": 1.811,
      "step": 12100
    },
    {
      "epoch": 11.596958174904943,
      "grad_norm": 0.8419378399848938,
      "learning_rate": 0.0002234919350760985,
      "loss": 1.8145,
      "step": 12200
    },
    {
      "epoch": 11.692015209125476,
      "grad_norm": 0.8346748352050781,
      "learning_rate": 0.0002209637457652829,
      "loss": 1.8328,
      "step": 12300
    },
    {
      "epoch": 11.787072243346007,
      "grad_norm": 1.019510269165039,
      "learning_rate": 0.0002184355564544673,
      "loss": 1.8257,
      "step": 12400
    },
    {
      "epoch": 11.88212927756654,
      "grad_norm": 0.8175719976425171,
      "learning_rate": 0.00021590736714365173,
      "loss": 1.8274,
      "step": 12500
    },
    {
      "epoch": 11.977186311787072,
      "grad_norm": 0.7476153373718262,
      "learning_rate": 0.00021337917783283614,
      "loss": 1.8361,
      "step": 12600
    },
    {
      "epoch": 12.0,
      "eval_loss": 2.1029505729675293,
      "eval_runtime": 3.5932,
      "eval_samples_per_second": 1971.782,
      "eval_steps_per_second": 123.289,
      "step": 12624
    },
    {
      "epoch": 12.072243346007605,
      "grad_norm": 0.8637651205062866,
      "learning_rate": 0.00021085098852202054,
      "loss": 1.7684,
      "step": 12700
    },
    {
      "epoch": 12.167300380228136,
      "grad_norm": 0.80800461769104,
      "learning_rate": 0.00020832279921120496,
      "loss": 1.7703,
      "step": 12800
    },
    {
      "epoch": 12.262357414448669,
      "grad_norm": 1.0111021995544434,
      "learning_rate": 0.00020579460990038934,
      "loss": 1.7809,
      "step": 12900
    },
    {
      "epoch": 12.357414448669202,
      "grad_norm": 0.8477798700332642,
      "learning_rate": 0.00020326642058957376,
      "loss": 1.7795,
      "step": 13000
    },
    {
      "epoch": 12.452471482889734,
      "grad_norm": 0.8284028172492981,
      "learning_rate": 0.00020073823127875814,
      "loss": 1.7803,
      "step": 13100
    },
    {
      "epoch": 12.547528517110266,
      "grad_norm": 0.7752136588096619,
      "learning_rate": 0.00019821004196794256,
      "loss": 1.7836,
      "step": 13200
    },
    {
      "epoch": 12.642585551330798,
      "grad_norm": 0.8929184675216675,
      "learning_rate": 0.00019568185265712696,
      "loss": 1.7724,
      "step": 13300
    },
    {
      "epoch": 12.737642585551331,
      "grad_norm": 0.8475900888442993,
      "learning_rate": 0.00019315366334631136,
      "loss": 1.7891,
      "step": 13400
    },
    {
      "epoch": 12.832699619771864,
      "grad_norm": 0.9029939770698547,
      "learning_rate": 0.0001906254740354958,
      "loss": 1.7888,
      "step": 13500
    },
    {
      "epoch": 12.927756653992395,
      "grad_norm": 0.841206967830658,
      "learning_rate": 0.0001880972847246802,
      "loss": 1.8005,
      "step": 13600
    },
    {
      "epoch": 13.0,
      "eval_loss": 2.1176211833953857,
      "eval_runtime": 3.6226,
      "eval_samples_per_second": 1955.796,
      "eval_steps_per_second": 122.289,
      "step": 13676
    },
    {
      "epoch": 13.022813688212928,
      "grad_norm": 0.786509096622467,
      "learning_rate": 0.0001855690954138646,
      "loss": 1.7784,
      "step": 13700
    },
    {
      "epoch": 13.11787072243346,
      "grad_norm": 0.8644747734069824,
      "learning_rate": 0.000183040906103049,
      "loss": 1.7234,
      "step": 13800
    },
    {
      "epoch": 13.212927756653992,
      "grad_norm": 0.8760172128677368,
      "learning_rate": 0.00018051271679223342,
      "loss": 1.7308,
      "step": 13900
    },
    {
      "epoch": 13.307984790874524,
      "grad_norm": 0.7858941555023193,
      "learning_rate": 0.0001779845274814178,
      "loss": 1.7318,
      "step": 14000
    },
    {
      "epoch": 13.403041825095057,
      "grad_norm": 0.8771238327026367,
      "learning_rate": 0.00017545633817060222,
      "loss": 1.7473,
      "step": 14100
    },
    {
      "epoch": 13.49809885931559,
      "grad_norm": 0.8886803984642029,
      "learning_rate": 0.00017292814885978665,
      "loss": 1.7491,
      "step": 14200
    },
    {
      "epoch": 13.593155893536121,
      "grad_norm": 0.8704127669334412,
      "learning_rate": 0.00017039995954897102,
      "loss": 1.7548,
      "step": 14300
    },
    {
      "epoch": 13.688212927756654,
      "grad_norm": 1.2635705471038818,
      "learning_rate": 0.00016787177023815545,
      "loss": 1.7532,
      "step": 14400
    },
    {
      "epoch": 13.783269961977187,
      "grad_norm": 0.9218750596046448,
      "learning_rate": 0.00016534358092733985,
      "loss": 1.7531,
      "step": 14500
    },
    {
      "epoch": 13.87832699619772,
      "grad_norm": 0.9513919353485107,
      "learning_rate": 0.00016281539161652425,
      "loss": 1.7618,
      "step": 14600
    },
    {
      "epoch": 13.97338403041825,
      "grad_norm": 1.010962963104248,
      "learning_rate": 0.00016028720230570865,
      "loss": 1.7646,
      "step": 14700
    },
    {
      "epoch": 14.0,
      "eval_loss": 2.130631923675537,
      "eval_runtime": 3.6539,
      "eval_samples_per_second": 1938.998,
      "eval_steps_per_second": 121.239,
      "step": 14728
    }
  ],
  "logging_steps": 100,
  "max_steps": 21040,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.6215157665850184e+16,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}
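This is the `trainer_state.json` that the Hugging Face `Trainer` writes into each checkpoint directory. Note that `best_metric` and `best_model_checkpoint` are `null` here, which is what happens when `metric_for_best_model` / `load_best_model_at_end` are not configured, so recovering the best epoch has to be done from `log_history` itself. Below is a minimal sketch of that, assuming the file is available locally as `trainer_state.json` (the path is an assumption; adjust it to your checkpoint directory, e.g. `output_dir/checkpoint-14728/trainer_state.json`):

```python
import json

# Assumed local path to the state file shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation records are the log_history entries carrying an "eval_loss" key;
# the others are per-100-step training logs (loss, grad_norm, learning_rate).
evals = [e for e in state["log_history"] if "eval_loss" in e]
for e in evals:
    print(f"epoch {e['epoch']:>4.1f}  step {e['step']:>5d}  eval_loss {e['eval_loss']:.4f}")

# In this run the minimum falls at epoch 9 (eval_loss ~2.0948); eval loss
# drifts upward afterward while train loss keeps falling, the usual sign of
# overfitting and a reason to stop early or restore that checkpoint.
best = min(evals, key=lambda e: e["eval_loss"])
print(f"best: epoch {best['epoch']}, step {best['step']}, eval_loss {best['eval_loss']:.4f}")
```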