{
  "best_global_step": 7500,
  "best_metric": 0.7491397857666016,
  "best_model_checkpoint": "./results/checkpoint-7500",
  "epoch": 4.997752808988764,
  "eval_steps": 250,
  "global_step": 9455,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.026437541308658295,
      "grad_norm": 1.9868909120559692,
      "learning_rate": 0.00034625958983852136,
      "loss": 1.3782,
      "mean_token_accuracy": 0.7697586753964424,
      "num_tokens": 1638400.0,
      "step": 50
    },
    {
      "epoch": 0.05287508261731659,
      "grad_norm": 0.9880499839782715,
      "learning_rate": 0.000407611186724682,
      "loss": 1.2966,
      "mean_token_accuracy": 0.7892405907809734,
      "num_tokens": 3276800.0,
      "step": 100
    },
    {
      "epoch": 0.07931262392597488,
      "grad_norm": 0.6774707436561584,
      "learning_rate": 0.0004434995702624468,
      "loss": 0.939,
      "mean_token_accuracy": 0.8182881197333336,
      "num_tokens": 4915200.0,
      "step": 150
    },
    {
      "epoch": 0.10575016523463318,
      "grad_norm": 0.84303218126297,
      "learning_rate": 0.0004689627836108426,
      "loss": 0.9239,
      "mean_token_accuracy": 0.8214302301406861,
      "num_tokens": 6553600.0,
      "step": 200
    },
    {
      "epoch": 0.13218770654329148,
      "grad_norm": 2.8259122371673584,
      "learning_rate": 0.0004887135863147016,
      "loss": 1.0103,
      "step": 250
    },
    {
      "epoch": 0.13218770654329148,
      "eval_loss": 1.3388922214508057,
      "eval_mean_token_accuracy": 0.767919923740008,
      "eval_num_tokens": 8192000.0,
      "eval_runtime": 1597.0105,
      "eval_samples_per_second": 4.737,
      "eval_steps_per_second": 0.592,
      "step": 250
    },
    {
      "epoch": 0.15862524785194976,
      "grad_norm": 6.46172571182251,
      "learning_rate": 0.0004991822047759241,
      "loss": 1.6239,
      "mean_token_accuracy": 0.767223238646984,
      "num_tokens": 9830400.0,
      "step": 300
    },
    {
      "epoch": 0.18506278916060806,
      "grad_norm": 17.279727935791016,
      "learning_rate": 0.0004964562206956711,
      "loss": 1.3753,
      "mean_token_accuracy": 0.7646566477417945,
      "num_tokens": 11468800.0,
      "step": 350
    },
    {
      "epoch": 0.21150033046926636,
      "grad_norm": 9.562877655029297,
      "learning_rate": 0.0004937302366154182,
      "loss": 1.7307,
      "mean_token_accuracy": 0.7092528110742569,
      "num_tokens": 13107200.0,
      "step": 400
    },
    {
      "epoch": 0.23793787177792466,
      "grad_norm": 57.86962890625,
      "learning_rate": 0.0004910042525351653,
      "loss": 3.8636,
      "mean_token_accuracy": 0.44049181263893844,
      "num_tokens": 14745600.0,
      "step": 450
    },
    {
      "epoch": 0.26437541308658297,
      "grad_norm": 2.1134138107299805,
      "learning_rate": 0.00048827826845491225,
      "loss": 3.3381,
      "step": 500
    },
    {
      "epoch": 0.26437541308658297,
      "eval_loss": 3.0370047092437744,
      "eval_mean_token_accuracy": 0.5421812577177052,
      "eval_num_tokens": 16384000.0,
      "eval_runtime": 1593.4634,
      "eval_samples_per_second": 4.748,
      "eval_steps_per_second": 0.594,
      "step": 500
    },
    {
      "epoch": 0.29081295439524124,
      "grad_norm": 2.805110454559326,
      "learning_rate": 0.0004855522843746593,
      "loss": 2.8023,
      "mean_token_accuracy": 0.5313697456568479,
      "num_tokens": 18022400.0,
      "step": 550
    },
    {
      "epoch": 0.3172504957038995,
      "grad_norm": 2.2337939739227295,
      "learning_rate": 0.00048282630029440626,
      "loss": 2.3643,
      "mean_token_accuracy": 0.6257539093494415,
      "num_tokens": 19660800.0,
      "step": 600
    },
    {
      "epoch": 0.34368803701255785,
      "grad_norm": 1.3661304712295532,
      "learning_rate": 0.00048010031621415335,
      "loss": 2.2117,
      "mean_token_accuracy": 0.6438269788026809,
      "num_tokens": 21299200.0,
      "step": 650
    },
    {
      "epoch": 0.3701255783212161,
      "grad_norm": 1.319533109664917,
      "learning_rate": 0.0004773743321339004,
      "loss": 2.0377,
      "mean_token_accuracy": 0.6656103357672691,
      "num_tokens": 22937600.0,
      "step": 700
    },
    {
      "epoch": 0.3965631196298744,
      "grad_norm": 0.9889560341835022,
      "learning_rate": 0.00047464834805364736,
      "loss": 1.9612,
      "step": 750
    },
    {
      "epoch": 0.3965631196298744,
      "eval_loss": 1.9166280031204224,
      "eval_mean_token_accuracy": 0.6814079303570076,
      "eval_num_tokens": 24576000.0,
      "eval_runtime": 1597.3511,
      "eval_samples_per_second": 4.736,
      "eval_steps_per_second": 0.592,
      "step": 750
    },
    {
      "epoch": 0.4230006609385327,
      "grad_norm": 0.9154905676841736,
      "learning_rate": 0.0004719223639733944,
      "loss": 1.895,
      "mean_token_accuracy": 0.6779982282221317,
      "num_tokens": 26214400.0,
      "step": 800
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 1.1073395013809204,
      "learning_rate": 0.0004691963798931415,
      "loss": 1.8532,
      "mean_token_accuracy": 0.6910386118292808,
      "num_tokens": 27852800.0,
      "step": 850
    },
    {
      "epoch": 0.47587574355584933,
      "grad_norm": 0.7696494460105896,
      "learning_rate": 0.00046647039581288846,
      "loss": 1.7828,
      "mean_token_accuracy": 0.7011858496069908,
      "num_tokens": 29491200.0,
      "step": 900
    },
    {
      "epoch": 0.5023132848645075,
      "grad_norm": 0.8735769987106323,
      "learning_rate": 0.0004637444117326355,
      "loss": 1.7661,
      "mean_token_accuracy": 0.7030816239118576,
      "num_tokens": 31129600.0,
      "step": 950
    },
    {
      "epoch": 0.5287508261731659,
      "grad_norm": 0.8184662461280823,
      "learning_rate": 0.0004610184276523825,
      "loss": 1.7303,
      "step": 1000
    },
    {
      "epoch": 0.5287508261731659,
      "eval_loss": 1.732823133468628,
      "eval_mean_token_accuracy": 0.7076210416774669,
      "eval_num_tokens": 32768000.0,
      "eval_runtime": 1598.294,
      "eval_samples_per_second": 4.733,
      "eval_steps_per_second": 0.592,
      "step": 1000
    },
    {
      "epoch": 0.5551883674818242,
      "grad_norm": 0.805102527141571,
      "learning_rate": 0.00045829244357212956,
      "loss": 1.7074,
      "mean_token_accuracy": 0.7088553883135319,
      "num_tokens": 34406400.0,
      "step": 1050
    },
    {
      "epoch": 0.5816259087904825,
      "grad_norm": 1.1714200973510742,
      "learning_rate": 0.0004555664594918766,
      "loss": 1.6578,
      "mean_token_accuracy": 0.7183681574463844,
      "num_tokens": 36044800.0,
      "step": 1100
    },
    {
      "epoch": 0.6080634500991408,
      "grad_norm": 0.801713228225708,
      "learning_rate": 0.0004528404754116236,
      "loss": 1.639,
      "mean_token_accuracy": 0.7187829902768135,
      "num_tokens": 37683200.0,
      "step": 1150
    },
    {
      "epoch": 0.634500991407799,
      "grad_norm": 0.776907205581665,
      "learning_rate": 0.0004501144913313706,
      "loss": 1.6155,
      "mean_token_accuracy": 0.7220100191235542,
      "num_tokens": 39321600.0,
      "step": 1200
    },
    {
      "epoch": 0.6609385327164574,
      "grad_norm": 0.5754767656326294,
      "learning_rate": 0.0004473885072511177,
      "loss": 1.5987,
      "step": 1250
    },
    {
      "epoch": 0.6609385327164574,
      "eval_loss": 1.5740511417388916,
      "eval_mean_token_accuracy": 0.7283713231001041,
      "eval_num_tokens": 40960000.0,
      "eval_runtime": 1599.1655,
      "eval_samples_per_second": 4.731,
      "eval_steps_per_second": 0.592,
      "step": 1250
    },
    {
      "epoch": 0.6873760740251157,
      "grad_norm": 0.5739009976387024,
      "learning_rate": 0.0004446625231708647,
      "loss": 1.5626,
      "mean_token_accuracy": 0.7280584080517292,
      "num_tokens": 42598400.0,
      "step": 1300
    },
    {
      "epoch": 0.713813615333774,
      "grad_norm": 0.56184983253479,
      "learning_rate": 0.0004419365390906117,
      "loss": 1.5383,
      "mean_token_accuracy": 0.7336229240894317,
      "num_tokens": 44236800.0,
      "step": 1350
    },
    {
      "epoch": 0.7402511566424322,
      "grad_norm": 0.7078151106834412,
      "learning_rate": 0.00043921055501035873,
      "loss": 1.4998,
      "mean_token_accuracy": 0.7408009549975395,
      "num_tokens": 45875200.0,
      "step": 1400
    },
    {
      "epoch": 0.7666886979510905,
      "grad_norm": 0.6344922184944153,
      "learning_rate": 0.0004364845709301058,
      "loss": 1.4751,
      "mean_token_accuracy": 0.74332783639431,
      "num_tokens": 47513600.0,
      "step": 1450
    },
    {
      "epoch": 0.7931262392597488,
      "grad_norm": 0.4986041486263275,
      "learning_rate": 0.0004337585868498528,
      "loss": 1.4716,
      "step": 1500
    },
    {
      "epoch": 0.7931262392597488,
      "eval_loss": 1.4890165328979492,
      "eval_mean_token_accuracy": 0.7397930833174361,
      "eval_num_tokens": 49152000.0,
      "eval_runtime": 1599.8275,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 1500
    },
    {
      "epoch": 0.8195637805684072,
      "grad_norm": 0.508725643157959,
      "learning_rate": 0.00043103260276959983,
      "loss": 1.4734,
      "mean_token_accuracy": 0.7423943056166172,
      "num_tokens": 50790400.0,
      "step": 1550
    },
    {
      "epoch": 0.8460013218770654,
      "grad_norm": 0.5139680504798889,
      "learning_rate": 0.0004283066186893468,
      "loss": 1.4513,
      "mean_token_accuracy": 0.7446151030063629,
      "num_tokens": 52428800.0,
      "step": 1600
    },
    {
      "epoch": 0.8724388631857237,
      "grad_norm": 0.5360570549964905,
      "learning_rate": 0.0004255806346090939,
      "loss": 1.4587,
      "mean_token_accuracy": 0.7414125129580498,
      "num_tokens": 54067200.0,
      "step": 1650
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 0.46601545810699463,
      "learning_rate": 0.00042285465052884093,
      "loss": 1.4468,
      "mean_token_accuracy": 0.7438508039712906,
      "num_tokens": 55705600.0,
      "step": 1700
    },
    {
      "epoch": 0.9253139458030403,
      "grad_norm": 0.4491994380950928,
      "learning_rate": 0.0004201286664485879,
      "loss": 1.4234,
      "step": 1750
    },
    {
      "epoch": 0.9253139458030403,
      "eval_loss": 1.405325174331665,
      "eval_mean_token_accuracy": 0.7498895639975025,
      "eval_num_tokens": 57344000.0,
      "eval_runtime": 1600.1713,
      "eval_samples_per_second": 4.728,
      "eval_steps_per_second": 0.591,
      "step": 1750
    },
    {
      "epoch": 0.9517514871116987,
      "grad_norm": 0.40710222721099854,
      "learning_rate": 0.00041740268236833495,
      "loss": 1.3835,
      "mean_token_accuracy": 0.7491015987098217,
      "num_tokens": 58982400.0,
      "step": 1800
    },
    {
      "epoch": 0.9781890284203569,
      "grad_norm": 0.4693559408187866,
      "learning_rate": 0.00041467669828808203,
      "loss": 1.3498,
      "mean_token_accuracy": 0.7580449655652046,
      "num_tokens": 60620800.0,
      "step": 1850
    },
    {
      "epoch": 1.0042300066093852,
      "grad_norm": 0.4662095606327057,
      "learning_rate": 0.00041200523388943414,
      "loss": 1.3744,
      "mean_token_accuracy": 0.7523588169044649,
      "num_tokens": 62234624.0,
      "step": 1900
    },
    {
      "epoch": 1.0306675479180436,
      "grad_norm": 0.39532390236854553,
      "learning_rate": 0.0004092792498091811,
      "loss": 1.3057,
      "mean_token_accuracy": 0.7592387574911118,
      "num_tokens": 63873024.0,
      "step": 1950
    },
    {
      "epoch": 1.057105089226702,
      "grad_norm": 0.4154648780822754,
      "learning_rate": 0.00040655326572892816,
      "loss": 1.3248,
      "step": 2000
    },
    {
      "epoch": 1.057105089226702,
      "eval_loss": 1.339290976524353,
      "eval_mean_token_accuracy": 0.7579027240422513,
      "eval_num_tokens": 65511424.0,
      "eval_runtime": 1599.5937,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 2000
    },
    {
      "epoch": 1.0835426305353602,
      "grad_norm": 0.3895817697048187,
      "learning_rate": 0.00040382728164867513,
      "loss": 1.2956,
      "mean_token_accuracy": 0.7590417274832726,
      "num_tokens": 67149824.0,
      "step": 2050
    },
    {
      "epoch": 1.1099801718440185,
      "grad_norm": 0.39260634779930115,
      "learning_rate": 0.0004011012975684222,
      "loss": 1.3223,
      "mean_token_accuracy": 0.756996577680111,
      "num_tokens": 68788224.0,
      "step": 2100
    },
    {
      "epoch": 1.1364177131526767,
      "grad_norm": 0.3638737201690674,
      "learning_rate": 0.00039837531348816925,
      "loss": 1.268,
      "mean_token_accuracy": 0.7646328181028366,
      "num_tokens": 70426624.0,
      "step": 2150
    },
    {
      "epoch": 1.162855254461335,
      "grad_norm": 0.3186447322368622,
      "learning_rate": 0.00039564932940791623,
      "loss": 1.2705,
      "mean_token_accuracy": 0.7647727259993553,
      "num_tokens": 72065024.0,
      "step": 2200
    },
    {
      "epoch": 1.1892927957699935,
      "grad_norm": 0.37439003586769104,
      "learning_rate": 0.00039292334532766327,
      "loss": 1.2631,
      "step": 2250
    },
    {
      "epoch": 1.1892927957699935,
      "eval_loss": 1.278181791305542,
      "eval_mean_token_accuracy": 0.7655498584531578,
      "eval_num_tokens": 73703424.0,
      "eval_runtime": 1599.9443,
      "eval_samples_per_second": 4.728,
      "eval_steps_per_second": 0.591,
      "step": 2250
    },
    {
      "epoch": 1.2157303370786516,
      "grad_norm": 0.36414834856987,
      "learning_rate": 0.00039019736124741035,
      "loss": 1.2556,
      "mean_token_accuracy": 0.7657642959058285,
      "num_tokens": 75341824.0,
      "step": 2300
    },
    {
      "epoch": 1.24216787838731,
      "grad_norm": 0.38630911707878113,
      "learning_rate": 0.00038747137716715733,
      "loss": 1.2453,
      "mean_token_accuracy": 0.7686192587018013,
      "num_tokens": 76980224.0,
      "step": 2350
    },
    {
      "epoch": 1.2686054196959682,
      "grad_norm": 0.34793412685394287,
      "learning_rate": 0.00038474539308690437,
      "loss": 1.2113,
      "mean_token_accuracy": 0.7736504143476486,
      "num_tokens": 78618624.0,
      "step": 2400
    },
    {
      "epoch": 1.2950429610046266,
      "grad_norm": 0.3544578552246094,
      "learning_rate": 0.0003820194090066514,
      "loss": 1.1924,
      "mean_token_accuracy": 0.7755889534950257,
      "num_tokens": 80257024.0,
      "step": 2450
    },
    {
      "epoch": 1.321480502313285,
      "grad_norm": 0.30794623494148254,
      "learning_rate": 0.00037929342492639843,
      "loss": 1.1748,
      "step": 2500
    },
    {
      "epoch": 1.321480502313285,
      "eval_loss": 1.198885440826416,
      "eval_mean_token_accuracy": 0.776488389197666,
      "eval_num_tokens": 81895424.0,
      "eval_runtime": 1599.7215,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 2500
    },
    {
      "epoch": 1.3479180436219431,
      "grad_norm": 0.2978927493095398,
      "learning_rate": 0.00037656744084614547,
      "loss": 1.1624,
      "mean_token_accuracy": 0.7798365721106529,
      "num_tokens": 83533824.0,
      "step": 2550
    },
    {
      "epoch": 1.3743555849306015,
      "grad_norm": 0.3153753876686096,
      "learning_rate": 0.0003738414567658925,
      "loss": 1.1462,
      "mean_token_accuracy": 0.7827309390902519,
      "num_tokens": 85172224.0,
      "step": 2600
    },
    {
      "epoch": 1.4007931262392597,
      "grad_norm": 0.31813791394233704,
      "learning_rate": 0.0003711154726856395,
      "loss": 1.1274,
      "mean_token_accuracy": 0.7860245615243912,
      "num_tokens": 86810624.0,
      "step": 2650
    },
    {
      "epoch": 1.427230667547918,
      "grad_norm": 0.30844855308532715,
      "learning_rate": 0.00036838948860538656,
      "loss": 1.118,
      "mean_token_accuracy": 0.7876050838828087,
      "num_tokens": 88449024.0,
      "step": 2700
    },
    {
      "epoch": 1.4536682088565764,
      "grad_norm": 0.3054572343826294,
      "learning_rate": 0.0003656635045251336,
      "loss": 1.1336,
      "step": 2750
    },
    {
      "epoch": 1.4536682088565764,
      "eval_loss": 1.1190927028656006,
      "eval_mean_token_accuracy": 0.7886134508926319,
      "eval_num_tokens": 90087424.0,
      "eval_runtime": 1599.6779,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 2750
    },
    {
      "epoch": 1.4801057501652346,
      "grad_norm": 0.28417208790779114,
      "learning_rate": 0.0003629375204448806,
      "loss": 1.1039,
      "mean_token_accuracy": 0.7864858260750771,
      "num_tokens": 91725824.0,
      "step": 2800
    },
    {
      "epoch": 1.5065432914738928,
      "grad_norm": 0.307099312543869,
      "learning_rate": 0.0003602115363646276,
      "loss": 1.0909,
      "mean_token_accuracy": 0.7900452110171318,
      "num_tokens": 93364224.0,
      "step": 2850
    },
    {
      "epoch": 1.5329808327825512,
      "grad_norm": 0.30008023977279663,
      "learning_rate": 0.0003574855522843747,
      "loss": 1.0824,
      "mean_token_accuracy": 0.7928054749965667,
      "num_tokens": 95002624.0,
      "step": 2900
    },
    {
      "epoch": 1.5594183740912095,
      "grad_norm": 0.27622541785240173,
      "learning_rate": 0.0003547595682041217,
      "loss": 1.055,
      "mean_token_accuracy": 0.7961241453886032,
      "num_tokens": 96641024.0,
      "step": 2950
    },
    {
      "epoch": 1.585855915399868,
      "grad_norm": 0.2670520544052124,
      "learning_rate": 0.0003520335841238687,
      "loss": 1.0466,
      "step": 3000
    },
    {
      "epoch": 1.585855915399868,
      "eval_loss": 1.0528658628463745,
      "eval_mean_token_accuracy": 0.7982593539149262,
      "eval_num_tokens": 98279424.0,
      "eval_runtime": 1599.5737,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 3000
    },
    {
      "epoch": 1.612293456708526,
      "grad_norm": 0.26690635085105896,
      "learning_rate": 0.00034930760004361574,
      "loss": 1.0354,
      "mean_token_accuracy": 0.7988346171379089,
      "num_tokens": 99917824.0,
      "step": 3050
    },
    {
      "epoch": 1.6387309980171842,
      "grad_norm": 0.27989307045936584,
      "learning_rate": 0.0003465816159633628,
      "loss": 1.0225,
      "mean_token_accuracy": 0.8015902996063232,
      "num_tokens": 101556224.0,
      "step": 3100
    },
    {
      "epoch": 1.6651685393258426,
      "grad_norm": 0.21368129551410675,
      "learning_rate": 0.0003438556318831098,
      "loss": 1.0197,
      "mean_token_accuracy": 0.8019238775968551,
      "num_tokens": 103194624.0,
      "step": 3150
    },
    {
      "epoch": 1.691606080634501,
      "grad_norm": 0.288343220949173,
      "learning_rate": 0.00034112964780285684,
      "loss": 1.0174,
      "mean_token_accuracy": 0.8012603887915611,
      "num_tokens": 104833024.0,
      "step": 3200
    },
    {
      "epoch": 1.7180436219431594,
      "grad_norm": 0.245047464966774,
      "learning_rate": 0.0003384036637226039,
      "loss": 0.9922,
      "step": 3250
    },
    {
      "epoch": 1.7180436219431594,
      "eval_loss": 1.0065803527832031,
      "eval_mean_token_accuracy": 0.8048020523773943,
      "eval_num_tokens": 106471424.0,
      "eval_runtime": 1599.1563,
      "eval_samples_per_second": 4.731,
      "eval_steps_per_second": 0.592,
      "step": 3250
    },
    {
      "epoch": 1.7444811632518176,
      "grad_norm": 0.23827126622200012,
      "learning_rate": 0.0003356776796423509,
      "loss": 0.9838,
      "mean_token_accuracy": 0.8066547532379628,
      "num_tokens": 108109824.0,
      "step": 3300
    },
    {
      "epoch": 1.7709187045604757,
      "grad_norm": 0.22703391313552856,
      "learning_rate": 0.00033295169556209794,
      "loss": 0.9587,
      "mean_token_accuracy": 0.8113178130984307,
      "num_tokens": 109748224.0,
      "step": 3350
    },
    {
      "epoch": 1.7973562458691341,
      "grad_norm": 0.25331422686576843,
      "learning_rate": 0.0003302257114818449,
      "loss": 0.9697,
      "mean_token_accuracy": 0.8093206259608269,
      "num_tokens": 111386624.0,
      "step": 3400
    },
    {
      "epoch": 1.8237937871777925,
      "grad_norm": 0.264260470867157,
      "learning_rate": 0.000327499727401592,
      "loss": 0.956,
      "mean_token_accuracy": 0.8123435971140861,
      "num_tokens": 113025024.0,
      "step": 3450
    },
    {
      "epoch": 1.8502313284864509,
      "grad_norm": 0.2458537220954895,
      "learning_rate": 0.00032477374332133904,
      "loss": 0.9539,
      "step": 3500
    },
    {
      "epoch": 1.8502313284864509,
      "eval_loss": 0.9670175909996033,
      "eval_mean_token_accuracy": 0.8104942284729214,
      "eval_num_tokens": 114663424.0,
      "eval_runtime": 1599.1718,
      "eval_samples_per_second": 4.731,
      "eval_steps_per_second": 0.592,
      "step": 3500
    },
    {
      "epoch": 1.876668869795109,
      "grad_norm": 0.20451125502586365,
      "learning_rate": 0.000322047759241086,
      "loss": 0.9479,
      "mean_token_accuracy": 0.8118679732084274,
      "num_tokens": 116301824.0,
      "step": 3550
    },
    {
      "epoch": 1.9031064111037672,
      "grad_norm": 0.22584660351276398,
      "learning_rate": 0.00031932177516083305,
      "loss": 0.9688,
      "mean_token_accuracy": 0.8094049346446991,
      "num_tokens": 117940224.0,
      "step": 3600
    },
    {
      "epoch": 1.9295439524124256,
      "grad_norm": 0.2119428962469101,
      "learning_rate": 0.00031659579108058014,
      "loss": 0.9229,
      "mean_token_accuracy": 0.816270771920681,
      "num_tokens": 119578624.0,
      "step": 3650
    },
    {
      "epoch": 1.955981493721084,
      "grad_norm": 0.2210853099822998,
      "learning_rate": 0.0003138698070003271,
      "loss": 0.9341,
      "mean_token_accuracy": 0.814192325770855,
      "num_tokens": 121217024.0,
      "step": 3700
    },
    {
      "epoch": 1.9824190350297424,
      "grad_norm": 0.1966710090637207,
      "learning_rate": 0.00031114382292007415,
      "loss": 0.9283,
      "step": 3750
    },
    {
      "epoch": 1.9824190350297424,
      "eval_loss": 0.9337447881698608,
      "eval_mean_token_accuracy": 0.8153352634725308,
      "eval_num_tokens": 122855424.0,
      "eval_runtime": 1599.5384,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 3750
    },
    {
      "epoch": 2.0084600132187704,
      "grad_norm": 0.20168109238147736,
      "learning_rate": 0.00030847235852142626,
      "loss": 0.8927,
      "mean_token_accuracy": 0.8173428005175266,
      "num_tokens": 124469248.0,
      "step": 3800
    },
    {
      "epoch": 2.034897554527429,
      "grad_norm": 0.1905670017004013,
      "learning_rate": 0.00030574637444117324,
      "loss": 0.8427,
      "mean_token_accuracy": 0.8238172018527985,
      "num_tokens": 126107648.0,
      "step": 3850
    },
    {
      "epoch": 2.061335095836087,
      "grad_norm": 0.19004780054092407,
      "learning_rate": 0.0003030203903609203,
      "loss": 0.8536,
      "mean_token_accuracy": 0.8214613863825798,
      "num_tokens": 127746048.0,
      "step": 3900
    },
    {
      "epoch": 2.0877726371447456,
      "grad_norm": 0.21092021465301514,
      "learning_rate": 0.00030029440628066736,
      "loss": 0.8347,
      "mean_token_accuracy": 0.8248136582970619,
      "num_tokens": 129384448.0,
      "step": 3950
    },
    {
      "epoch": 2.114210178453404,
      "grad_norm": 0.2002408355474472,
      "learning_rate": 0.00029756842220041434,
      "loss": 0.8409,
      "step": 4000
    },
    {
      "epoch": 2.114210178453404,
      "eval_loss": 0.913899838924408,
      "eval_mean_token_accuracy": 0.8182373826271888,
      "eval_num_tokens": 131022848.0,
      "eval_runtime": 1599.607,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 4000
    },
    {
      "epoch": 2.140647719762062,
      "grad_norm": 0.22307777404785156,
      "learning_rate": 0.0002948424381201614,
      "loss": 0.8428,
      "mean_token_accuracy": 0.8242137080430985,
      "num_tokens": 132661248.0,
      "step": 4050
    },
    {
      "epoch": 2.1670852610707203,
      "grad_norm": 0.1873617023229599,
      "learning_rate": 0.0002921164540399084,
      "loss": 0.8439,
      "mean_token_accuracy": 0.8233803743124009,
      "num_tokens": 134299648.0,
      "step": 4100
    },
    {
      "epoch": 2.1935228023793787,
      "grad_norm": 0.1888233870267868,
      "learning_rate": 0.00028939046995965544,
      "loss": 0.8406,
      "mean_token_accuracy": 0.8241153433918953,
      "num_tokens": 135938048.0,
      "step": 4150
    },
    {
      "epoch": 2.219960343688037,
      "grad_norm": 0.19996315240859985,
      "learning_rate": 0.00028666448587940247,
      "loss": 0.8337,
      "mean_token_accuracy": 0.8248002156615257,
      "num_tokens": 137576448.0,
      "step": 4200
    },
    {
      "epoch": 2.2463978849966955,
      "grad_norm": 0.21117758750915527,
      "learning_rate": 0.0002839385017991495,
      "loss": 0.8411,
      "step": 4250
    },
    {
      "epoch": 2.2463978849966955,
      "eval_loss": 0.893865704536438,
      "eval_mean_token_accuracy": 0.8210062818093733,
      "eval_num_tokens": 139214848.0,
      "eval_runtime": 1599.858,
      "eval_samples_per_second": 4.729,
      "eval_steps_per_second": 0.591,
      "step": 4250
    },
    {
      "epoch": 2.2728354263053534,
      "grad_norm": 0.20331983268260956,
      "learning_rate": 0.00028121251771889654,
      "loss": 0.8389,
      "mean_token_accuracy": 0.824597994685173,
      "num_tokens": 140853248.0,
      "step": 4300
    },
    {
      "epoch": 2.299272967614012,
      "grad_norm": 0.19736993312835693,
      "learning_rate": 0.00027848653363864357,
      "loss": 0.8168,
      "mean_token_accuracy": 0.8279356023669243,
      "num_tokens": 142491648.0,
      "step": 4350
    },
    {
      "epoch": 2.32571050892267,
      "grad_norm": 0.1942383050918579,
      "learning_rate": 0.0002757605495583906,
      "loss": 0.8158,
      "mean_token_accuracy": 0.8288569149374961,
      "num_tokens": 144130048.0,
      "step": 4400
    },
    {
      "epoch": 2.3521480502313286,
      "grad_norm": 0.18327121436595917,
      "learning_rate": 0.0002730345654781376,
      "loss": 0.8097,
      "mean_token_accuracy": 0.8300702553987503,
      "num_tokens": 145768448.0,
      "step": 4450
    },
    {
      "epoch": 2.378585591539987,
      "grad_norm": 0.17920152842998505,
      "learning_rate": 0.00027030858139788467,
      "loss": 0.8017,
      "step": 4500
    },
    {
      "epoch": 2.378585591539987,
      "eval_loss": 0.874257504940033,
      "eval_mean_token_accuracy": 0.823767999828996,
      "eval_num_tokens": 147406848.0,
      "eval_runtime": 1599.5025,
      "eval_samples_per_second": 4.73,
      "eval_steps_per_second": 0.591,
      "step": 4500
    },
    {
      "epoch": 2.405023132848645,
      "grad_norm": 0.18811027705669403,
      "learning_rate": 0.0002675825973176317,
      "loss": 0.8215,
      "mean_token_accuracy": 0.8293267333507538,
      "num_tokens": 149045248.0,
      "step": 4550
    },
    {
      "epoch": 2.4314606741573033,
      "grad_norm": 0.20340368151664734,
      "learning_rate": 0.0002648566132373787,
      "loss": 0.8249,
      "mean_token_accuracy": 0.8268548348546028,
      "num_tokens": 150683648.0,
      "step": 4600
    },
    {
      "epoch": 2.4578982154659617,
      "grad_norm": 0.18492697179317474,
      "learning_rate": 0.0002621306291571257,
      "loss": 0.7914,
      "mean_token_accuracy": 0.832571476995945,
      "num_tokens": 152322048.0,
      "step": 4650
    },
    {
      "epoch": 2.48433575677462,
      "grad_norm": 0.19855117797851562,
      "learning_rate": 0.0002594046450768728,
      "loss": 0.8077,
      "mean_token_accuracy": 0.8298674210906029,
      "num_tokens": 153960448.0,
      "step": 4700
    },
    {
      "epoch": 2.5107732980832784,
      "grad_norm": 0.1997339129447937,
      "learning_rate": 0.0002566786609966198,
      "loss": 0.809,
      "step": 4750
    },
    {
      "epoch": 2.5107732980832784,
      "eval_loss": 0.8553281426429749,
      "eval_mean_token_accuracy": 0.8265610535729511,
      "eval_num_tokens": 155598848.0,
      "eval_runtime": 1599.9059,
      "eval_samples_per_second": 4.728,
      "eval_steps_per_second": 0.591,
      "step": 4750
    },
    {
      "epoch": 2.5372108393919364,
      "grad_norm": 0.19008329510688782,
      "learning_rate": 0.0002539526769163668,
      "loss": 0.797,
      "mean_token_accuracy": 0.8298222103714943,
      "num_tokens": 157237248.0,
      "step": 4800
    },
    {
      "epoch": 2.5636483807005948,
      "grad_norm": 0.18476171791553497,
      "learning_rate": 0.00025122669283611385,
      "loss": 0.7987,
      "mean_token_accuracy": 0.8304337722063064,
      "num_tokens": 158875648.0,
      "step": 4850
    },
    {
      "epoch": 2.590085922009253,
      "grad_norm": 0.18693213164806366,
      "learning_rate": 0.0002485007087558609,
      "loss": 0.8042,
      "mean_token_accuracy": 0.8297446221113205,
      "num_tokens": 160514048.0,
      "step": 4900
    },
    {
      "epoch": 2.6165234633179115,
      "grad_norm": 0.19470660388469696,
      "learning_rate": 0.0002457747246756079,
      "loss": 0.8024,
      "mean_token_accuracy": 0.8308174461126328,
      "num_tokens": 162152448.0,
      "step": 4950
    },
    {
      "epoch": 2.64296100462657,
      "grad_norm": 0.23168876767158508,
      "learning_rate": 0.00024304874059535492,
      "loss": 0.7903,
      "step": 5000
    },
    {
      "epoch": 2.64296100462657,
      "eval_loss": 0.8376234769821167,
      "eval_mean_token_accuracy": 0.828871109394896,
      "eval_num_tokens": 163790848.0,
      "eval_runtime": 1600.0988,
      "eval_samples_per_second": 4.728,
      "eval_steps_per_second": 0.591,
      "step": 5000
    },
    {
      "epoch": 2.669398545935228,
      "grad_norm": 0.15908803045749664,
      "learning_rate": 0.00024032275651510195,
      "loss": 0.7967,
      "mean_token_accuracy": 0.8314005956053734,
      "num_tokens": 165429248.0,
      "step": 5050
    },
    {
      "epoch": 2.6958360872438862,
      "grad_norm": 0.1805862933397293,
      "learning_rate": 0.000237596772434849,
      "loss": 0.7774,
      "mean_token_accuracy": 0.8344085997343064,
      "num_tokens": 167067648.0,
      "step": 5100
    },
    {
      "epoch": 2.7222736285525446,
      "grad_norm": 0.17997150123119354,
      "learning_rate": 0.00023487078835459602,
      "loss": 0.7851,
      "mean_token_accuracy": 0.8325213807821273,
      "num_tokens": 168706048.0,
      "step": 5150
    },
    {
      "epoch": 2.748711169861203,
      "grad_norm": 0.18113110959529877,
      "learning_rate": 0.00023214480427434303,
      "loss": 0.776,
      "mean_token_accuracy": 0.8346639758348465,
      "num_tokens": 170344448.0,
      "step": 5200
    },
    {
      "epoch": 2.7751487111698614,
      "grad_norm": 0.18302254378795624,
      "learning_rate": 0.00022941882019409009,
      "loss": 0.7854,
      "step": 5250
    },
    {
      "epoch": 2.7751487111698614,
      "eval_loss": 0.8233165144920349,
      "eval_mean_token_accuracy": 0.830954508725987,
      "eval_num_tokens": 171982848.0,
      "eval_runtime": 1599.9718,
      "eval_samples_per_second": 4.728,
      "eval_steps_per_second": 0.591,
      "step": 5250
    },
    {
      "epoch": 2.8015862524785193,
      "grad_norm": 0.1922728568315506,
      "learning_rate": 0.0002266928361138371,
      "loss": 0.7936,
      "mean_token_accuracy": 0.8322769993543625,
      "num_tokens": 173621248.0,
      "step": 5300
    },
    {
      "epoch": 2.8280237937871777,
      "grad_norm": 0.1617008000612259,
      "learning_rate": 0.00022396685203358413,
      "loss": 0.7738,
      "mean_token_accuracy": 0.8344037118554115,
      "num_tokens": 175259648.0,
      "step": 5350
    },
    {
      "epoch": 2.854461335095836,
      "grad_norm": 0.17171062529087067,
      "learning_rate": 0.00022124086795333116,
      "loss": 0.7697,
      "mean_token_accuracy": 0.8351166906952858,
      "num_tokens": 176898048.0,
      "step": 5400
    },
    {
      "epoch": 2.8808988764044945,
      "grad_norm": 0.1803775280714035,
      "learning_rate": 0.0002185148838730782,
      "loss": 0.7735,
      "mean_token_accuracy": 0.8350091609358787,
      "num_tokens": 178536448.0,
      "step": 5450
    },
    {
      "epoch": 2.907336417713153,
      "grad_norm": 0.17305733263492584,
      "learning_rate": 0.0002157888997928252,
      "loss": 0.7716,
      "step": 5500
    },
    {
      "epoch": 2.907336417713153,
      "eval_loss": 0.8076795339584351,
      "eval_mean_token_accuracy": 0.8331229730841977,
      "eval_num_tokens": 180174848.0,
      "eval_runtime": 1600.6859,
      "eval_samples_per_second": 4.726,
      "eval_steps_per_second": 0.591,
      "step": 5500
    },
    {
      "epoch": 2.933773959021811,
      "grad_norm": 0.17064611613750458,
| "learning_rate": 0.00021306291571257226, | |
| "loss": 0.7713, | |
| "mean_token_accuracy": 0.8356136959791184, | |
| "num_tokens": 181813248.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 2.960211500330469, | |
| "grad_norm": 0.18137440085411072, | |
| "learning_rate": 0.00021033693163231926, | |
| "loss": 0.7667, | |
| "mean_token_accuracy": 0.8351374611258506, | |
| "num_tokens": 183451648.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 2.9866490416391276, | |
| "grad_norm": 0.17405763268470764, | |
| "learning_rate": 0.0002076109475520663, | |
| "loss": 0.7495, | |
| "mean_token_accuracy": 0.8385416662693024, | |
| "num_tokens": 185090048.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.012690019828156, | |
| "grad_norm": 0.17279721796512604, | |
| "learning_rate": 0.0002049394831534184, | |
| "loss": 0.7159, | |
| "mean_token_accuracy": 0.8417613173499325, | |
| "num_tokens": 186703872.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.0391275611368145, | |
| "grad_norm": 0.19387085735797882, | |
| "learning_rate": 0.0002022134990731654, | |
| "loss": 0.666, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.0391275611368145, | |
| "eval_loss": 0.8053749799728394, | |
| "eval_mean_token_accuracy": 0.8340540434416959, | |
| "eval_num_tokens": 188342272.0, | |
| "eval_runtime": 1600.205, | |
| "eval_samples_per_second": 4.728, | |
| "eval_steps_per_second": 0.591, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.0655651024454724, | |
| "grad_norm": 0.18193645775318146, | |
| "learning_rate": 0.00019948751499291245, | |
| "loss": 0.6644, | |
| "mean_token_accuracy": 0.8480684906244278, | |
| "num_tokens": 189980672.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.092002643754131, | |
| "grad_norm": 0.16633963584899902, | |
| "learning_rate": 0.00019676153091265948, | |
| "loss": 0.6691, | |
| "mean_token_accuracy": 0.847120603621006, | |
| "num_tokens": 191619072.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.118440185062789, | |
| "grad_norm": 0.17585037648677826, | |
| "learning_rate": 0.0001940355468324065, | |
| "loss": 0.6636, | |
| "mean_token_accuracy": 0.84809934258461, | |
| "num_tokens": 193257472.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.1448777263714476, | |
| "grad_norm": 0.1676415503025055, | |
| "learning_rate": 0.00019130956275215352, | |
| "loss": 0.6672, | |
| "mean_token_accuracy": 0.8475995865464211, | |
| "num_tokens": 194895872.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 3.1713152676801055, | |
| "grad_norm": 0.18070462346076965, | |
| "learning_rate": 0.00018858357867190058, | |
| "loss": 0.6627, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.1713152676801055, | |
| "eval_loss": 0.801948070526123, | |
| "eval_mean_token_accuracy": 0.8345137661909704, | |
| "eval_num_tokens": 196534272.0, | |
| "eval_runtime": 1599.8066, | |
| "eval_samples_per_second": 4.729, | |
| "eval_steps_per_second": 0.591, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.197752808988764, | |
| "grad_norm": 0.16841137409210205, | |
| "learning_rate": 0.00018585759459164758, | |
| "loss": 0.6569, | |
| "mean_token_accuracy": 0.8492378443479538, | |
| "num_tokens": 198172672.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 3.2241903502974223, | |
| "grad_norm": 0.18084491789340973, | |
| "learning_rate": 0.00018313161051139462, | |
| "loss": 0.6678, | |
| "mean_token_accuracy": 0.8477779817581177, | |
| "num_tokens": 199811072.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.2506278916060807, | |
| "grad_norm": 0.17532089352607727, | |
| "learning_rate": 0.00018040562643114165, | |
| "loss": 0.6693, | |
| "mean_token_accuracy": 0.8475476580858231, | |
| "num_tokens": 201449472.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 3.277065432914739, | |
| "grad_norm": 0.17762629687786102, | |
| "learning_rate": 0.00017767964235088868, | |
| "loss": 0.6568, | |
| "mean_token_accuracy": 0.8500018376111984, | |
| "num_tokens": 203087872.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.303502974223397, | |
| "grad_norm": 0.17803572118282318, | |
| "learning_rate": 0.0001749536582706357, | |
| "loss": 0.6664, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.303502974223397, | |
| "eval_loss": 0.7924287915229797, | |
| "eval_mean_token_accuracy": 0.8360363316838384, | |
| "eval_num_tokens": 204726272.0, | |
| "eval_runtime": 1600.5314, | |
| "eval_samples_per_second": 4.727, | |
| "eval_steps_per_second": 0.591, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.3299405155320554, | |
| "grad_norm": 0.1736496537923813, | |
| "learning_rate": 0.00017222767419038275, | |
| "loss": 0.6626, | |
| "mean_token_accuracy": 0.8480022014677524, | |
| "num_tokens": 206364672.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 3.3563780568407138, | |
| "grad_norm": 0.1790972799062729, | |
| "learning_rate": 0.00016950169011012976, | |
| "loss": 0.666, | |
| "mean_token_accuracy": 0.8478036442399025, | |
| "num_tokens": 208003072.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 3.382815598149372, | |
| "grad_norm": 0.17161910235881805, | |
| "learning_rate": 0.0001667757060298768, | |
| "loss": 0.6635, | |
| "mean_token_accuracy": 0.8481677681207657, | |
| "num_tokens": 209641472.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 3.4092531394580305, | |
| "grad_norm": 0.17608526349067688, | |
| "learning_rate": 0.00016404972194962382, | |
| "loss": 0.6483, | |
| "mean_token_accuracy": 0.8513996881246567, | |
| "num_tokens": 211279872.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 3.4356906807666885, | |
| "grad_norm": 0.17622597515583038, | |
| "learning_rate": 0.00016132373786937086, | |
| "loss": 0.6562, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.4356906807666885, | |
| "eval_loss": 0.7829640507698059, | |
| "eval_mean_token_accuracy": 0.8375042610768284, | |
| "eval_num_tokens": 212918272.0, | |
| "eval_runtime": 1600.7277, | |
| "eval_samples_per_second": 4.726, | |
| "eval_steps_per_second": 0.591, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 3.462128222075347, | |
| "grad_norm": 0.18006405234336853, | |
| "learning_rate": 0.00015859775378911786, | |
| "loss": 0.6498, | |
| "mean_token_accuracy": 0.8504380528628827, | |
| "num_tokens": 214556672.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 3.4885657633840053, | |
| "grad_norm": 0.16343793272972107, | |
| "learning_rate": 0.0001558717697088649, | |
| "loss": 0.6519, | |
| "mean_token_accuracy": 0.850884655714035, | |
| "num_tokens": 216195072.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 3.5150033046926636, | |
| "grad_norm": 0.16798467934131622, | |
| "learning_rate": 0.00015314578562861193, | |
| "loss": 0.6648, | |
| "mean_token_accuracy": 0.8490127098560333, | |
| "num_tokens": 217833472.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 3.541440846001322, | |
| "grad_norm": 0.15794213116168976, | |
| "learning_rate": 0.00015041980154835896, | |
| "loss": 0.6471, | |
| "mean_token_accuracy": 0.8517173796892166, | |
| "num_tokens": 219471872.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 3.56787838730998, | |
| "grad_norm": 0.1636921763420105, | |
| "learning_rate": 0.00014769381746810597, | |
| "loss": 0.6424, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.56787838730998, | |
| "eval_loss": 0.773522138595581, | |
| "eval_mean_token_accuracy": 0.8390711046928583, | |
| "eval_num_tokens": 221110272.0, | |
| "eval_runtime": 1600.7216, | |
| "eval_samples_per_second": 4.726, | |
| "eval_steps_per_second": 0.591, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 3.5943159286186384, | |
| "grad_norm": 0.15980064868927002, | |
| "learning_rate": 0.00014496783338785303, | |
| "loss": 0.6571, | |
| "mean_token_accuracy": 0.851312015503645, | |
| "num_tokens": 222748672.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 3.6207534699272967, | |
| "grad_norm": 0.1708955615758896, | |
| "learning_rate": 0.00014224184930760003, | |
| "loss": 0.6484, | |
| "mean_token_accuracy": 0.8513654717803001, | |
| "num_tokens": 224387072.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 3.647191011235955, | |
| "grad_norm": 0.16906002163887024, | |
| "learning_rate": 0.00013951586522734707, | |
| "loss": 0.6517, | |
| "mean_token_accuracy": 0.8500537672638893, | |
| "num_tokens": 226025472.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 3.6736285525446135, | |
| "grad_norm": 0.16365185379981995, | |
| "learning_rate": 0.0001367898811470941, | |
| "loss": 0.6372, | |
| "mean_token_accuracy": 0.8536284250020981, | |
| "num_tokens": 227663872.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 3.7000660938532715, | |
| "grad_norm": 0.17780087888240814, | |
| "learning_rate": 0.00013406389706684113, | |
| "loss": 0.6501, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.7000660938532715, | |
| "eval_loss": 0.7657620906829834, | |
| "eval_mean_token_accuracy": 0.8401046276848614, | |
| "eval_num_tokens": 229302272.0, | |
| "eval_runtime": 1600.1467, | |
| "eval_samples_per_second": 4.728, | |
| "eval_steps_per_second": 0.591, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.72650363516193, | |
| "grad_norm": 0.17722897231578827, | |
| "learning_rate": 0.00013133791298658814, | |
| "loss": 0.6527, | |
| "mean_token_accuracy": 0.8508571648597717, | |
| "num_tokens": 230940672.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 3.7529411764705882, | |
| "grad_norm": 0.16244906187057495, | |
| "learning_rate": 0.0001286119289063352, | |
| "loss": 0.6356, | |
| "mean_token_accuracy": 0.8537634432315826, | |
| "num_tokens": 232579072.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 3.7793787177792466, | |
| "grad_norm": 0.15864387154579163, | |
| "learning_rate": 0.0001258859448260822, | |
| "loss": 0.6452, | |
| "mean_token_accuracy": 0.8518102434277535, | |
| "num_tokens": 234217472.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 3.805816259087905, | |
| "grad_norm": 0.16620229184627533, | |
| "learning_rate": 0.00012315996074582924, | |
| "loss": 0.6418, | |
| "mean_token_accuracy": 0.8521817001700401, | |
| "num_tokens": 235855872.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 3.832253800396563, | |
| "grad_norm": 0.1765565574169159, | |
| "learning_rate": 0.00012043397666557627, | |
| "loss": 0.6387, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.832253800396563, | |
| "eval_loss": 0.7578161358833313, | |
| "eval_mean_token_accuracy": 0.8413670561404359, | |
| "eval_num_tokens": 237494272.0, | |
| "eval_runtime": 1602.8152, | |
| "eval_samples_per_second": 4.72, | |
| "eval_steps_per_second": 0.59, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 3.8586913417052213, | |
| "grad_norm": 0.15968503057956696, | |
| "learning_rate": 0.0001177079925853233, | |
| "loss": 0.6365, | |
| "mean_token_accuracy": 0.8533528861403465, | |
| "num_tokens": 239132672.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 3.8851288830138797, | |
| "grad_norm": 0.15743543207645416, | |
| "learning_rate": 0.00011498200850507034, | |
| "loss": 0.6486, | |
| "mean_token_accuracy": 0.8513813573122024, | |
| "num_tokens": 240771072.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 3.911566424322538, | |
| "grad_norm": 0.18122394382953644, | |
| "learning_rate": 0.00011225602442481736, | |
| "loss": 0.6384, | |
| "mean_token_accuracy": 0.8533547213673591, | |
| "num_tokens": 242409472.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 3.9380039656311965, | |
| "grad_norm": 0.15892641246318817, | |
| "learning_rate": 0.00010953004034456439, | |
| "loss": 0.6338, | |
| "mean_token_accuracy": 0.8538844108581543, | |
| "num_tokens": 244047872.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 3.9644415069398544, | |
| "grad_norm": 0.16563069820404053, | |
| "learning_rate": 0.00010680405626431142, | |
| "loss": 0.6256, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.9644415069398544, | |
| "eval_loss": 0.7491397857666016, | |
| "eval_mean_token_accuracy": 0.8425506683535102, | |
| "eval_num_tokens": 245686272.0, | |
| "eval_runtime": 1603.9187, | |
| "eval_samples_per_second": 4.717, | |
| "eval_steps_per_second": 0.59, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 3.990879048248513, | |
| "grad_norm": 0.1561686098575592, | |
| "learning_rate": 0.00010407807218405844, | |
| "loss": 0.6398, | |
| "mean_token_accuracy": 0.8541101579368114, | |
| "num_tokens": 247324672.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 4.016920026437541, | |
| "grad_norm": 0.17185606062412262, | |
| "learning_rate": 0.00010135208810380548, | |
| "loss": 0.5504, | |
| "mean_token_accuracy": 0.8682107241625713, | |
| "num_tokens": 248938496.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.0433575677462, | |
| "grad_norm": 0.17470529675483704, | |
| "learning_rate": 9.86261040235525e-05, | |
| "loss": 0.5029, | |
| "mean_token_accuracy": 0.8748790314793586, | |
| "num_tokens": 250576896.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 4.069795109054858, | |
| "grad_norm": 0.1801612824201584, | |
| "learning_rate": 9.590011994329953e-05, | |
| "loss": 0.5043, | |
| "mean_token_accuracy": 0.8748985821008682, | |
| "num_tokens": 252215296.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.0962326503635165, | |
| "grad_norm": 0.16825653612613678, | |
| "learning_rate": 9.317413586304656e-05, | |
| "loss": 0.4967, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.0962326503635165, | |
| "eval_loss": 0.7883051037788391, | |
| "eval_mean_token_accuracy": 0.8408105385983973, | |
| "eval_num_tokens": 253853696.0, | |
| "eval_runtime": 1607.5856, | |
| "eval_samples_per_second": 4.706, | |
| "eval_steps_per_second": 0.588, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.122670191672174, | |
| "grad_norm": 0.17985741794109344, | |
| "learning_rate": 9.044815178279358e-05, | |
| "loss": 0.5031, | |
| "mean_token_accuracy": 0.875472262352705, | |
| "num_tokens": 255492096.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.149107732980832, | |
| "grad_norm": 0.17613214254379272, | |
| "learning_rate": 8.772216770254061e-05, | |
| "loss": 0.4969, | |
| "mean_token_accuracy": 0.8762671053409576, | |
| "num_tokens": 257130496.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 4.175545274289491, | |
| "grad_norm": 0.17405198514461517, | |
| "learning_rate": 8.499618362228765e-05, | |
| "loss": 0.5095, | |
| "mean_token_accuracy": 0.8734744620323182, | |
| "num_tokens": 258768896.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 4.201982815598149, | |
| "grad_norm": 0.17185764014720917, | |
| "learning_rate": 8.227019954203467e-05, | |
| "loss": 0.5074, | |
| "mean_token_accuracy": 0.8739729967713356, | |
| "num_tokens": 260407296.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 4.228420356906808, | |
| "grad_norm": 0.17758677899837494, | |
| "learning_rate": 7.95442154617817e-05, | |
| "loss": 0.5085, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.228420356906808, | |
| "eval_loss": 0.7870664000511169, | |
| "eval_mean_token_accuracy": 0.8414596145929292, | |
| "eval_num_tokens": 262045696.0, | |
| "eval_runtime": 1607.3849, | |
| "eval_samples_per_second": 4.706, | |
| "eval_steps_per_second": 0.589, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.254857898215466, | |
| "grad_norm": 0.16629241406917572, | |
| "learning_rate": 7.681823138152873e-05, | |
| "loss": 0.5032, | |
| "mean_token_accuracy": 0.8741639178991317, | |
| "num_tokens": 263684096.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 4.281295439524124, | |
| "grad_norm": 0.173508420586586, | |
| "learning_rate": 7.409224730127575e-05, | |
| "loss": 0.4909, | |
| "mean_token_accuracy": 0.8775629255175591, | |
| "num_tokens": 265322496.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 4.307732980832783, | |
| "grad_norm": 0.1713671237230301, | |
| "learning_rate": 7.136626322102279e-05, | |
| "loss": 0.4923, | |
| "mean_token_accuracy": 0.8772788345813751, | |
| "num_tokens": 266960896.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 4.334170522141441, | |
| "grad_norm": 0.17122632265090942, | |
| "learning_rate": 6.864027914076983e-05, | |
| "loss": 0.5, | |
| "mean_token_accuracy": 0.8755180832743644, | |
| "num_tokens": 268599296.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 4.360608063450099, | |
| "grad_norm": 0.17359545826911926, | |
| "learning_rate": 6.591429506051685e-05, | |
| "loss": 0.4943, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 4.360608063450099, | |
| "eval_loss": 0.7823996543884277, | |
| "eval_mean_token_accuracy": 0.8421699439370355, | |
| "eval_num_tokens": 270237696.0, | |
| "eval_runtime": 1607.7567, | |
| "eval_samples_per_second": 4.705, | |
| "eval_steps_per_second": 0.588, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 4.387045604758757, | |
| "grad_norm": 0.17702388763427734, | |
| "learning_rate": 6.318831098026388e-05, | |
| "loss": 0.4904, | |
| "mean_token_accuracy": 0.8775449013710022, | |
| "num_tokens": 271876096.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 4.413483146067415, | |
| "grad_norm": 0.18663644790649414, | |
| "learning_rate": 6.0462326900010904e-05, | |
| "loss": 0.4959, | |
| "mean_token_accuracy": 0.8762383911013604, | |
| "num_tokens": 273514496.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 4.439920687376074, | |
| "grad_norm": 0.1880512684583664, | |
| "learning_rate": 5.773634281975793e-05, | |
| "loss": 0.4931, | |
| "mean_token_accuracy": 0.8767839661240577, | |
| "num_tokens": 275152896.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 4.466358228684732, | |
| "grad_norm": 0.18527589738368988, | |
| "learning_rate": 5.5010358739504963e-05, | |
| "loss": 0.4877, | |
| "mean_token_accuracy": 0.87819525629282, | |
| "num_tokens": 276791296.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 4.492795769993391, | |
| "grad_norm": 0.19010977447032928, | |
| "learning_rate": 5.228437465925199e-05, | |
| "loss": 0.4894, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.492795769993391, | |
| "eval_loss": 0.7803131341934204, | |
| "eval_mean_token_accuracy": 0.8430041650637008, | |
| "eval_num_tokens": 278429696.0, | |
| "eval_runtime": 1610.9854, | |
| "eval_samples_per_second": 4.696, | |
| "eval_steps_per_second": 0.587, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 4.519233311302049, | |
| "grad_norm": 0.17016442120075226, | |
| "learning_rate": 4.9558390578999016e-05, | |
| "loss": 0.4847, | |
| "mean_token_accuracy": 0.8786284182965756, | |
| "num_tokens": 280068096.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 4.545670852610707, | |
| "grad_norm": 0.1719425618648529, | |
| "learning_rate": 4.683240649874604e-05, | |
| "loss": 0.4875, | |
| "mean_token_accuracy": 0.8785123375058174, | |
| "num_tokens": 281706496.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 4.572108393919366, | |
| "grad_norm": 0.17816464602947235, | |
| "learning_rate": 4.4106422418493076e-05, | |
| "loss": 0.4863, | |
| "mean_token_accuracy": 0.8782337459921837, | |
| "num_tokens": 283344896.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 4.598545935228024, | |
| "grad_norm": 0.1728549599647522, | |
| "learning_rate": 4.138043833824011e-05, | |
| "loss": 0.4879, | |
| "mean_token_accuracy": 0.8787457209825515, | |
| "num_tokens": 284983296.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 4.624983476536682, | |
| "grad_norm": 0.18577666580677032, | |
| "learning_rate": 3.8654454257987135e-05, | |
| "loss": 0.4914, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.624983476536682, | |
| "eval_loss": 0.7784421443939209, | |
| "eval_mean_token_accuracy": 0.8436387255000262, | |
| "eval_num_tokens": 286621696.0, | |
| "eval_runtime": 1611.4393, | |
| "eval_samples_per_second": 4.695, | |
| "eval_steps_per_second": 0.587, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 4.65142101784534, | |
| "grad_norm": 0.16825436055660248, | |
| "learning_rate": 3.592847017773417e-05, | |
| "loss": 0.4756, | |
| "mean_token_accuracy": 0.8792506690323353, | |
| "num_tokens": 288260096.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 4.677858559153998, | |
| "grad_norm": 0.18510740995407104, | |
| "learning_rate": 3.3202486097481194e-05, | |
| "loss": 0.4788, | |
| "mean_token_accuracy": 0.8801001918315887, | |
| "num_tokens": 289898496.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 4.704296100462657, | |
| "grad_norm": 0.18907974660396576, | |
| "learning_rate": 3.0476502017228217e-05, | |
| "loss": 0.4837, | |
| "mean_token_accuracy": 0.8794446450471878, | |
| "num_tokens": 291536896.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 4.730733641771315, | |
| "grad_norm": 0.1798245906829834, | |
| "learning_rate": 2.775051793697525e-05, | |
| "loss": 0.4883, | |
| "mean_token_accuracy": 0.8778897827863693, | |
| "num_tokens": 293175296.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 4.757171183079974, | |
| "grad_norm": 0.17980748414993286, | |
| "learning_rate": 2.502453385672228e-05, | |
| "loss": 0.475, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.757171183079974, | |
| "eval_loss": 0.7753015756607056, | |
| "eval_mean_token_accuracy": 0.8443130426754659, | |
| "eval_num_tokens": 294813696.0, | |
| "eval_runtime": 1611.452, | |
| "eval_samples_per_second": 4.695, | |
| "eval_steps_per_second": 0.587, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.783608724388632, | |
| "grad_norm": 0.17731408774852753, | |
| "learning_rate": 2.2298549776469306e-05, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.8821294555068016, | |
| "num_tokens": 296452096.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 4.81004626569729, | |
| "grad_norm": 0.19258248805999756, | |
| "learning_rate": 1.9572565696216336e-05, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8807239699363708, | |
| "num_tokens": 298090496.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 4.836483807005949, | |
| "grad_norm": 0.17705880105495453, | |
| "learning_rate": 1.6846581615963362e-05, | |
| "loss": 0.476, | |
| "mean_token_accuracy": 0.8808369943499565, | |
| "num_tokens": 299728896.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 4.8629213483146065, | |
| "grad_norm": 0.1794816255569458, | |
| "learning_rate": 1.4120597535710392e-05, | |
| "loss": 0.4742, | |
| "mean_token_accuracy": 0.8813196429610253, | |
| "num_tokens": 301367296.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 4.889358889623265, | |
| "grad_norm": 0.17823387682437897, | |
| "learning_rate": 1.139461345545742e-05, | |
| "loss": 0.4719, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.889358889623265, | |
| "eval_loss": 0.7754274010658264, | |
| "eval_mean_token_accuracy": 0.844791491931387, | |
| "eval_num_tokens": 303005696.0, | |
| "eval_runtime": 1610.8654, | |
| "eval_samples_per_second": 4.696, | |
| "eval_steps_per_second": 0.587, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 4.915796430931923, | |
| "grad_norm": 0.16834519803524017, | |
| "learning_rate": 8.668629375204448e-06, | |
| "loss": 0.4653, | |
| "mean_token_accuracy": 0.8821077673137188, | |
| "num_tokens": 304644096.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 4.942233972240581, | |
| "grad_norm": 0.17272663116455078, | |
| "learning_rate": 5.942645294951477e-06, | |
| "loss": 0.4783, | |
| "mean_token_accuracy": 0.8806390488147735, | |
| "num_tokens": 306282496.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 4.96867151354924, | |
| "grad_norm": 0.17334023118019104, | |
| "learning_rate": 3.2166612146985063e-06, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.8807239702343941, | |
| "num_tokens": 307920896.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 4.995109054857898, | |
| "grad_norm": 0.17255398631095886, | |
| "learning_rate": 4.906771344455349e-07, | |
| "loss": 0.4793, | |
| "mean_token_accuracy": 0.8803439608216286, | |
| "num_tokens": 309559296.0, | |
| "step": 9450 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 9455, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 619390244487168.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |