{ "best_global_step": 7500, "best_metric": 0.7491397857666016, "best_model_checkpoint": "./results/checkpoint-7500", "epoch": 4.997752808988764, "eval_steps": 250, "global_step": 9455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026437541308658295, "grad_norm": 1.9868909120559692, "learning_rate": 0.00034625958983852136, "loss": 1.3782, "mean_token_accuracy": 0.7697586753964424, "num_tokens": 1638400.0, "step": 50 }, { "epoch": 0.05287508261731659, "grad_norm": 0.9880499839782715, "learning_rate": 0.000407611186724682, "loss": 1.2966, "mean_token_accuracy": 0.7892405907809734, "num_tokens": 3276800.0, "step": 100 }, { "epoch": 0.07931262392597488, "grad_norm": 0.6774707436561584, "learning_rate": 0.0004434995702624468, "loss": 0.939, "mean_token_accuracy": 0.8182881197333336, "num_tokens": 4915200.0, "step": 150 }, { "epoch": 0.10575016523463318, "grad_norm": 0.84303218126297, "learning_rate": 0.0004689627836108426, "loss": 0.9239, "mean_token_accuracy": 0.8214302301406861, "num_tokens": 6553600.0, "step": 200 }, { "epoch": 0.13218770654329148, "grad_norm": 2.8259122371673584, "learning_rate": 0.0004887135863147016, "loss": 1.0103, "step": 250 }, { "epoch": 0.13218770654329148, "eval_loss": 1.3388922214508057, "eval_mean_token_accuracy": 0.767919923740008, "eval_num_tokens": 8192000.0, "eval_runtime": 1597.0105, "eval_samples_per_second": 4.737, "eval_steps_per_second": 0.592, "step": 250 }, { "epoch": 0.15862524785194976, "grad_norm": 6.46172571182251, "learning_rate": 0.0004991822047759241, "loss": 1.6239, "mean_token_accuracy": 0.767223238646984, "num_tokens": 9830400.0, "step": 300 }, { "epoch": 0.18506278916060806, "grad_norm": 17.279727935791016, "learning_rate": 0.0004964562206956711, "loss": 1.3753, "mean_token_accuracy": 0.7646566477417945, "num_tokens": 11468800.0, "step": 350 }, { "epoch": 0.21150033046926636, "grad_norm": 9.562877655029297, "learning_rate": 0.0004937302366154182, "loss": 1.7307, "mean_token_accuracy": 0.7092528110742569, "num_tokens": 13107200.0, "step": 400 }, { "epoch": 0.23793787177792466, "grad_norm": 57.86962890625, "learning_rate": 0.0004910042525351653, "loss": 3.8636, "mean_token_accuracy": 0.44049181263893844, "num_tokens": 14745600.0, "step": 450 }, { "epoch": 0.26437541308658297, "grad_norm": 2.1134138107299805, "learning_rate": 0.00048827826845491225, "loss": 3.3381, "step": 500 }, { "epoch": 0.26437541308658297, "eval_loss": 3.0370047092437744, "eval_mean_token_accuracy": 0.5421812577177052, "eval_num_tokens": 16384000.0, "eval_runtime": 1593.4634, "eval_samples_per_second": 4.748, "eval_steps_per_second": 0.594, "step": 500 }, { "epoch": 0.29081295439524124, "grad_norm": 2.805110454559326, "learning_rate": 0.0004855522843746593, "loss": 2.8023, "mean_token_accuracy": 0.5313697456568479, "num_tokens": 18022400.0, "step": 550 }, { "epoch": 0.3172504957038995, "grad_norm": 2.2337939739227295, "learning_rate": 0.00048282630029440626, "loss": 2.3643, "mean_token_accuracy": 0.6257539093494415, "num_tokens": 19660800.0, "step": 600 }, { "epoch": 0.34368803701255785, "grad_norm": 1.3661304712295532, "learning_rate": 0.00048010031621415335, "loss": 2.2117, "mean_token_accuracy": 0.6438269788026809, "num_tokens": 21299200.0, "step": 650 }, { "epoch": 0.3701255783212161, "grad_norm": 1.319533109664917, "learning_rate": 0.0004773743321339004, "loss": 2.0377, "mean_token_accuracy": 0.6656103357672691, "num_tokens": 22937600.0, "step": 700 }, { "epoch": 0.3965631196298744, "grad_norm": 0.9889560341835022, "learning_rate": 0.00047464834805364736, "loss": 1.9612, "step": 750 }, { "epoch": 0.3965631196298744, "eval_loss": 1.9166280031204224, "eval_mean_token_accuracy": 0.6814079303570076, "eval_num_tokens": 24576000.0, "eval_runtime": 1597.3511, "eval_samples_per_second": 4.736, "eval_steps_per_second": 0.592, "step": 750 }, { "epoch": 0.4230006609385327, "grad_norm": 0.9154905676841736, "learning_rate": 0.0004719223639733944, "loss": 1.895, "mean_token_accuracy": 0.6779982282221317, "num_tokens": 26214400.0, "step": 800 }, { "epoch": 0.449438202247191, "grad_norm": 1.1073395013809204, "learning_rate": 0.0004691963798931415, "loss": 1.8532, "mean_token_accuracy": 0.6910386118292808, "num_tokens": 27852800.0, "step": 850 }, { "epoch": 0.47587574355584933, "grad_norm": 0.7696494460105896, "learning_rate": 0.00046647039581288846, "loss": 1.7828, "mean_token_accuracy": 0.7011858496069908, "num_tokens": 29491200.0, "step": 900 }, { "epoch": 0.5023132848645075, "grad_norm": 0.8735769987106323, "learning_rate": 0.0004637444117326355, "loss": 1.7661, "mean_token_accuracy": 0.7030816239118576, "num_tokens": 31129600.0, "step": 950 }, { "epoch": 0.5287508261731659, "grad_norm": 0.8184662461280823, "learning_rate": 0.0004610184276523825, "loss": 1.7303, "step": 1000 }, { "epoch": 0.5287508261731659, "eval_loss": 1.732823133468628, "eval_mean_token_accuracy": 0.7076210416774669, "eval_num_tokens": 32768000.0, "eval_runtime": 1598.294, "eval_samples_per_second": 4.733, "eval_steps_per_second": 0.592, "step": 1000 }, { "epoch": 0.5551883674818242, "grad_norm": 0.805102527141571, "learning_rate": 0.00045829244357212956, "loss": 1.7074, "mean_token_accuracy": 0.7088553883135319, "num_tokens": 34406400.0, "step": 1050 }, { "epoch": 0.5816259087904825, "grad_norm": 1.1714200973510742, "learning_rate": 0.0004555664594918766, "loss": 1.6578, "mean_token_accuracy": 0.7183681574463844, "num_tokens": 36044800.0, "step": 1100 }, { "epoch": 0.6080634500991408, "grad_norm": 0.801713228225708, "learning_rate": 0.0004528404754116236, "loss": 1.639, "mean_token_accuracy": 0.7187829902768135, "num_tokens": 37683200.0, "step": 1150 }, { "epoch": 0.634500991407799, "grad_norm": 0.776907205581665, "learning_rate": 0.0004501144913313706, "loss": 1.6155, "mean_token_accuracy": 0.7220100191235542, "num_tokens": 39321600.0, "step": 1200 }, { "epoch": 0.6609385327164574, "grad_norm": 0.5754767656326294, "learning_rate": 0.0004473885072511177, "loss": 1.5987, "step": 1250 }, { "epoch": 0.6609385327164574, "eval_loss": 1.5740511417388916, "eval_mean_token_accuracy": 0.7283713231001041, "eval_num_tokens": 40960000.0, "eval_runtime": 1599.1655, "eval_samples_per_second": 4.731, "eval_steps_per_second": 0.592, "step": 1250 }, { "epoch": 0.6873760740251157, "grad_norm": 0.5739009976387024, "learning_rate": 0.0004446625231708647, "loss": 1.5626, "mean_token_accuracy": 0.7280584080517292, "num_tokens": 42598400.0, "step": 1300 }, { "epoch": 0.713813615333774, "grad_norm": 0.56184983253479, "learning_rate": 0.0004419365390906117, "loss": 1.5383, "mean_token_accuracy": 0.7336229240894317, "num_tokens": 44236800.0, "step": 1350 }, { "epoch": 0.7402511566424322, "grad_norm": 0.7078151106834412, "learning_rate": 0.00043921055501035873, "loss": 1.4998, "mean_token_accuracy": 0.7408009549975395, "num_tokens": 45875200.0, "step": 1400 }, { "epoch": 0.7666886979510905, "grad_norm": 0.6344922184944153, "learning_rate": 0.0004364845709301058, "loss": 1.4751, "mean_token_accuracy": 0.74332783639431, "num_tokens": 47513600.0, "step": 1450 }, { "epoch": 0.7931262392597488, "grad_norm": 0.4986041486263275, "learning_rate": 0.0004337585868498528, "loss": 1.4716, "step": 1500 }, { "epoch": 0.7931262392597488, "eval_loss": 1.4890165328979492, "eval_mean_token_accuracy": 0.7397930833174361, "eval_num_tokens": 49152000.0, "eval_runtime": 1599.8275, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 1500 }, { "epoch": 0.8195637805684072, "grad_norm": 0.508725643157959, "learning_rate": 0.00043103260276959983, "loss": 1.4734, "mean_token_accuracy": 0.7423943056166172, "num_tokens": 50790400.0, "step": 1550 }, { "epoch": 0.8460013218770654, "grad_norm": 0.5139680504798889, "learning_rate": 0.0004283066186893468, "loss": 1.4513, "mean_token_accuracy": 0.7446151030063629, "num_tokens": 52428800.0, "step": 1600 }, { "epoch": 0.8724388631857237, "grad_norm": 0.5360570549964905, "learning_rate": 0.0004255806346090939, "loss": 1.4587, "mean_token_accuracy": 0.7414125129580498, "num_tokens": 54067200.0, "step": 1650 }, { "epoch": 0.898876404494382, "grad_norm": 0.46601545810699463, "learning_rate": 0.00042285465052884093, "loss": 1.4468, "mean_token_accuracy": 0.7438508039712906, "num_tokens": 55705600.0, "step": 1700 }, { "epoch": 0.9253139458030403, "grad_norm": 0.4491994380950928, "learning_rate": 0.0004201286664485879, "loss": 1.4234, "step": 1750 }, { "epoch": 0.9253139458030403, "eval_loss": 1.405325174331665, "eval_mean_token_accuracy": 0.7498895639975025, "eval_num_tokens": 57344000.0, "eval_runtime": 1600.1713, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 1750 }, { "epoch": 0.9517514871116987, "grad_norm": 0.40710222721099854, "learning_rate": 0.00041740268236833495, "loss": 1.3835, "mean_token_accuracy": 0.7491015987098217, "num_tokens": 58982400.0, "step": 1800 }, { "epoch": 0.9781890284203569, "grad_norm": 0.4693559408187866, "learning_rate": 0.00041467669828808203, "loss": 1.3498, "mean_token_accuracy": 0.7580449655652046, "num_tokens": 60620800.0, "step": 1850 }, { "epoch": 1.0042300066093852, "grad_norm": 0.4662095606327057, "learning_rate": 0.00041200523388943414, "loss": 1.3744, "mean_token_accuracy": 0.7523588169044649, "num_tokens": 62234624.0, "step": 1900 }, { "epoch": 1.0306675479180436, "grad_norm": 0.39532390236854553, "learning_rate": 0.0004092792498091811, "loss": 1.3057, "mean_token_accuracy": 0.7592387574911118, "num_tokens": 63873024.0, "step": 1950 }, { "epoch": 1.057105089226702, "grad_norm": 0.4154648780822754, "learning_rate": 0.00040655326572892816, "loss": 1.3248, "step": 2000 }, { "epoch": 1.057105089226702, "eval_loss": 1.339290976524353, "eval_mean_token_accuracy": 0.7579027240422513, "eval_num_tokens": 65511424.0, "eval_runtime": 1599.5937, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 2000 }, { "epoch": 1.0835426305353602, "grad_norm": 0.3895817697048187, "learning_rate": 0.00040382728164867513, "loss": 1.2956, "mean_token_accuracy": 0.7590417274832726, "num_tokens": 67149824.0, "step": 2050 }, { "epoch": 1.1099801718440185, "grad_norm": 0.39260634779930115, "learning_rate": 0.0004011012975684222, "loss": 1.3223, "mean_token_accuracy": 0.756996577680111, "num_tokens": 68788224.0, "step": 2100 }, { "epoch": 1.1364177131526767, "grad_norm": 0.3638737201690674, "learning_rate": 0.00039837531348816925, "loss": 1.268, "mean_token_accuracy": 0.7646328181028366, "num_tokens": 70426624.0, "step": 2150 }, { "epoch": 1.162855254461335, "grad_norm": 0.3186447322368622, "learning_rate": 0.00039564932940791623, "loss": 1.2705, "mean_token_accuracy": 0.7647727259993553, "num_tokens": 72065024.0, "step": 2200 }, { "epoch": 1.1892927957699935, "grad_norm": 0.37439003586769104, "learning_rate": 0.00039292334532766327, "loss": 1.2631, "step": 2250 }, { "epoch": 1.1892927957699935, "eval_loss": 1.278181791305542, "eval_mean_token_accuracy": 0.7655498584531578, "eval_num_tokens": 73703424.0, "eval_runtime": 1599.9443, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 2250 }, { "epoch": 1.2157303370786516, "grad_norm": 0.36414834856987, "learning_rate": 0.00039019736124741035, "loss": 1.2556, "mean_token_accuracy": 0.7657642959058285, "num_tokens": 75341824.0, "step": 2300 }, { "epoch": 1.24216787838731, "grad_norm": 0.38630911707878113, "learning_rate": 0.00038747137716715733, "loss": 1.2453, "mean_token_accuracy": 0.7686192587018013, "num_tokens": 76980224.0, "step": 2350 }, { "epoch": 1.2686054196959682, "grad_norm": 0.34793412685394287, "learning_rate": 0.00038474539308690437, "loss": 1.2113, "mean_token_accuracy": 0.7736504143476486, "num_tokens": 78618624.0, "step": 2400 }, { "epoch": 1.2950429610046266, "grad_norm": 0.3544578552246094, "learning_rate": 0.0003820194090066514, "loss": 1.1924, "mean_token_accuracy": 0.7755889534950257, "num_tokens": 80257024.0, "step": 2450 }, { "epoch": 1.321480502313285, "grad_norm": 0.30794623494148254, "learning_rate": 0.00037929342492639843, "loss": 1.1748, "step": 2500 }, { "epoch": 1.321480502313285, "eval_loss": 1.198885440826416, "eval_mean_token_accuracy": 0.776488389197666, "eval_num_tokens": 81895424.0, "eval_runtime": 1599.7215, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 2500 }, { "epoch": 1.3479180436219431, "grad_norm": 0.2978927493095398, "learning_rate": 0.00037656744084614547, "loss": 1.1624, "mean_token_accuracy": 0.7798365721106529, "num_tokens": 83533824.0, "step": 2550 }, { "epoch": 1.3743555849306015, "grad_norm": 0.3153753876686096, "learning_rate": 0.0003738414567658925, "loss": 1.1462, "mean_token_accuracy": 0.7827309390902519, "num_tokens": 85172224.0, "step": 2600 }, { "epoch": 1.4007931262392597, "grad_norm": 0.31813791394233704, "learning_rate": 0.0003711154726856395, "loss": 1.1274, "mean_token_accuracy": 0.7860245615243912, "num_tokens": 86810624.0, "step": 2650 }, { "epoch": 1.427230667547918, "grad_norm": 0.30844855308532715, "learning_rate": 0.00036838948860538656, "loss": 1.118, "mean_token_accuracy": 0.7876050838828087, "num_tokens": 88449024.0, "step": 2700 }, { "epoch": 1.4536682088565764, "grad_norm": 0.3054572343826294, "learning_rate": 0.0003656635045251336, "loss": 1.1336, "step": 2750 }, { "epoch": 1.4536682088565764, "eval_loss": 1.1190927028656006, "eval_mean_token_accuracy": 0.7886134508926319, "eval_num_tokens": 90087424.0, "eval_runtime": 1599.6779, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 2750 }, { "epoch": 1.4801057501652346, "grad_norm": 0.28417208790779114, "learning_rate": 0.0003629375204448806, "loss": 1.1039, "mean_token_accuracy": 0.7864858260750771, "num_tokens": 91725824.0, "step": 2800 }, { "epoch": 1.5065432914738928, "grad_norm": 0.307099312543869, "learning_rate": 0.0003602115363646276, "loss": 1.0909, "mean_token_accuracy": 0.7900452110171318, "num_tokens": 93364224.0, "step": 2850 }, { "epoch": 1.5329808327825512, "grad_norm": 0.30008023977279663, "learning_rate": 0.0003574855522843747, "loss": 1.0824, "mean_token_accuracy": 0.7928054749965667, "num_tokens": 95002624.0, "step": 2900 }, { "epoch": 1.5594183740912095, "grad_norm": 0.27622541785240173, "learning_rate": 0.0003547595682041217, "loss": 1.055, "mean_token_accuracy": 0.7961241453886032, "num_tokens": 96641024.0, "step": 2950 }, { "epoch": 1.585855915399868, "grad_norm": 0.2670520544052124, "learning_rate": 0.0003520335841238687, "loss": 1.0466, "step": 3000 }, { "epoch": 1.585855915399868, "eval_loss": 1.0528658628463745, "eval_mean_token_accuracy": 0.7982593539149262, "eval_num_tokens": 98279424.0, "eval_runtime": 1599.5737, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 3000 }, { "epoch": 1.612293456708526, "grad_norm": 0.26690635085105896, "learning_rate": 0.00034930760004361574, "loss": 1.0354, "mean_token_accuracy": 0.7988346171379089, "num_tokens": 99917824.0, "step": 3050 }, { "epoch": 1.6387309980171842, "grad_norm": 0.27989307045936584, "learning_rate": 0.0003465816159633628, "loss": 1.0225, "mean_token_accuracy": 0.8015902996063232, "num_tokens": 101556224.0, "step": 3100 }, { "epoch": 1.6651685393258426, "grad_norm": 0.21368129551410675, "learning_rate": 0.0003438556318831098, "loss": 1.0197, "mean_token_accuracy": 0.8019238775968551, "num_tokens": 103194624.0, "step": 3150 }, { "epoch": 1.691606080634501, "grad_norm": 0.288343220949173, "learning_rate": 0.00034112964780285684, "loss": 1.0174, "mean_token_accuracy": 0.8012603887915611, "num_tokens": 104833024.0, "step": 3200 }, { "epoch": 1.7180436219431594, "grad_norm": 0.245047464966774, "learning_rate": 0.0003384036637226039, "loss": 0.9922, "step": 3250 }, { "epoch": 1.7180436219431594, "eval_loss": 1.0065803527832031, "eval_mean_token_accuracy": 0.8048020523773943, "eval_num_tokens": 106471424.0, "eval_runtime": 1599.1563, "eval_samples_per_second": 4.731, "eval_steps_per_second": 0.592, "step": 3250 }, { "epoch": 1.7444811632518176, "grad_norm": 0.23827126622200012, "learning_rate": 0.0003356776796423509, "loss": 0.9838, "mean_token_accuracy": 0.8066547532379628, "num_tokens": 108109824.0, "step": 3300 }, { "epoch": 1.7709187045604757, "grad_norm": 0.22703391313552856, "learning_rate": 0.00033295169556209794, "loss": 0.9587, "mean_token_accuracy": 0.8113178130984307, "num_tokens": 109748224.0, "step": 3350 }, { "epoch": 1.7973562458691341, "grad_norm": 0.25331422686576843, "learning_rate": 0.0003302257114818449, "loss": 0.9697, "mean_token_accuracy": 0.8093206259608269, "num_tokens": 111386624.0, "step": 3400 }, { "epoch": 1.8237937871777925, "grad_norm": 0.264260470867157, "learning_rate": 0.000327499727401592, "loss": 0.956, "mean_token_accuracy": 0.8123435971140861, "num_tokens": 113025024.0, "step": 3450 }, { "epoch": 1.8502313284864509, "grad_norm": 0.2458537220954895, "learning_rate": 0.00032477374332133904, "loss": 0.9539, "step": 3500 }, { "epoch": 1.8502313284864509, "eval_loss": 0.9670175909996033, "eval_mean_token_accuracy": 0.8104942284729214, "eval_num_tokens": 114663424.0, "eval_runtime": 1599.1718, "eval_samples_per_second": 4.731, "eval_steps_per_second": 0.592, "step": 3500 }, { "epoch": 1.876668869795109, "grad_norm": 0.20451125502586365, "learning_rate": 0.000322047759241086, "loss": 0.9479, "mean_token_accuracy": 0.8118679732084274, "num_tokens": 116301824.0, "step": 3550 }, { "epoch": 1.9031064111037672, "grad_norm": 0.22584660351276398, "learning_rate": 0.00031932177516083305, "loss": 0.9688, "mean_token_accuracy": 0.8094049346446991, "num_tokens": 117940224.0, "step": 3600 }, { "epoch": 1.9295439524124256, "grad_norm": 0.2119428962469101, "learning_rate": 0.00031659579108058014, "loss": 0.9229, "mean_token_accuracy": 0.816270771920681, "num_tokens": 119578624.0, "step": 3650 }, { "epoch": 1.955981493721084, "grad_norm": 0.2210853099822998, "learning_rate": 0.0003138698070003271, "loss": 0.9341, "mean_token_accuracy": 0.814192325770855, "num_tokens": 121217024.0, "step": 3700 }, { "epoch": 1.9824190350297424, "grad_norm": 0.1966710090637207, "learning_rate": 0.00031114382292007415, "loss": 0.9283, "step": 3750 }, { "epoch": 1.9824190350297424, "eval_loss": 0.9337447881698608, "eval_mean_token_accuracy": 0.8153352634725308, "eval_num_tokens": 122855424.0, "eval_runtime": 1599.5384, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 3750 }, { "epoch": 2.0084600132187704, "grad_norm": 0.20168109238147736, "learning_rate": 0.00030847235852142626, "loss": 0.8927, "mean_token_accuracy": 0.8173428005175266, "num_tokens": 124469248.0, "step": 3800 }, { "epoch": 2.034897554527429, "grad_norm": 0.1905670017004013, "learning_rate": 0.00030574637444117324, "loss": 0.8427, "mean_token_accuracy": 0.8238172018527985, "num_tokens": 126107648.0, "step": 3850 }, { "epoch": 2.061335095836087, "grad_norm": 0.19004780054092407, "learning_rate": 0.0003030203903609203, "loss": 0.8536, "mean_token_accuracy": 0.8214613863825798, "num_tokens": 127746048.0, "step": 3900 }, { "epoch": 2.0877726371447456, "grad_norm": 0.21092021465301514, "learning_rate": 0.00030029440628066736, "loss": 0.8347, "mean_token_accuracy": 0.8248136582970619, "num_tokens": 129384448.0, "step": 3950 }, { "epoch": 2.114210178453404, "grad_norm": 0.2002408355474472, "learning_rate": 0.00029756842220041434, "loss": 0.8409, "step": 4000 }, { "epoch": 2.114210178453404, "eval_loss": 0.913899838924408, "eval_mean_token_accuracy": 0.8182373826271888, "eval_num_tokens": 131022848.0, "eval_runtime": 1599.607, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 4000 }, { "epoch": 2.140647719762062, "grad_norm": 0.22307777404785156, "learning_rate": 0.0002948424381201614, "loss": 0.8428, "mean_token_accuracy": 0.8242137080430985, "num_tokens": 132661248.0, "step": 4050 }, { "epoch": 2.1670852610707203, "grad_norm": 0.1873617023229599, "learning_rate": 0.0002921164540399084, "loss": 0.8439, "mean_token_accuracy": 0.8233803743124009, "num_tokens": 134299648.0, "step": 4100 }, { "epoch": 2.1935228023793787, "grad_norm": 0.1888233870267868, "learning_rate": 0.00028939046995965544, "loss": 0.8406, "mean_token_accuracy": 0.8241153433918953, "num_tokens": 135938048.0, "step": 4150 }, { "epoch": 2.219960343688037, "grad_norm": 0.19996315240859985, "learning_rate": 0.00028666448587940247, "loss": 0.8337, "mean_token_accuracy": 0.8248002156615257, "num_tokens": 137576448.0, "step": 4200 }, { "epoch": 2.2463978849966955, "grad_norm": 0.21117758750915527, "learning_rate": 0.0002839385017991495, "loss": 0.8411, "step": 4250 }, { "epoch": 2.2463978849966955, "eval_loss": 0.893865704536438, "eval_mean_token_accuracy": 0.8210062818093733, "eval_num_tokens": 139214848.0, "eval_runtime": 1599.858, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 4250 }, { "epoch": 2.2728354263053534, "grad_norm": 0.20331983268260956, "learning_rate": 0.00028121251771889654, "loss": 0.8389, "mean_token_accuracy": 0.824597994685173, "num_tokens": 140853248.0, "step": 4300 }, { "epoch": 2.299272967614012, "grad_norm": 0.19736993312835693, "learning_rate": 0.00027848653363864357, "loss": 0.8168, "mean_token_accuracy": 0.8279356023669243, "num_tokens": 142491648.0, "step": 4350 }, { "epoch": 2.32571050892267, "grad_norm": 0.1942383050918579, "learning_rate": 0.0002757605495583906, "loss": 0.8158, "mean_token_accuracy": 0.8288569149374961, "num_tokens": 144130048.0, "step": 4400 }, { "epoch": 2.3521480502313286, "grad_norm": 0.18327121436595917, "learning_rate": 0.0002730345654781376, "loss": 0.8097, "mean_token_accuracy": 0.8300702553987503, "num_tokens": 145768448.0, "step": 4450 }, { "epoch": 2.378585591539987, "grad_norm": 0.17920152842998505, "learning_rate": 0.00027030858139788467, "loss": 0.8017, "step": 4500 }, { "epoch": 2.378585591539987, "eval_loss": 0.874257504940033, "eval_mean_token_accuracy": 0.823767999828996, "eval_num_tokens": 147406848.0, "eval_runtime": 1599.5025, "eval_samples_per_second": 4.73, "eval_steps_per_second": 0.591, "step": 4500 }, { "epoch": 2.405023132848645, "grad_norm": 0.18811027705669403, "learning_rate": 0.0002675825973176317, "loss": 0.8215, "mean_token_accuracy": 0.8293267333507538, "num_tokens": 149045248.0, "step": 4550 }, { "epoch": 2.4314606741573033, "grad_norm": 0.20340368151664734, "learning_rate": 0.0002648566132373787, "loss": 0.8249, "mean_token_accuracy": 0.8268548348546028, "num_tokens": 150683648.0, "step": 4600 }, { "epoch": 2.4578982154659617, "grad_norm": 0.18492697179317474, "learning_rate": 0.0002621306291571257, "loss": 0.7914, "mean_token_accuracy": 0.832571476995945, "num_tokens": 152322048.0, "step": 4650 }, { "epoch": 2.48433575677462, "grad_norm": 0.19855117797851562, "learning_rate": 0.0002594046450768728, "loss": 0.8077, "mean_token_accuracy": 0.8298674210906029, "num_tokens": 153960448.0, "step": 4700 }, { "epoch": 2.5107732980832784, "grad_norm": 0.1997339129447937, "learning_rate": 0.0002566786609966198, "loss": 0.809, "step": 4750 }, { "epoch": 2.5107732980832784, "eval_loss": 0.8553281426429749, "eval_mean_token_accuracy": 0.8265610535729511, "eval_num_tokens": 155598848.0, "eval_runtime": 1599.9059, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 4750 }, { "epoch": 2.5372108393919364, "grad_norm": 0.19008329510688782, "learning_rate": 0.0002539526769163668, "loss": 0.797, "mean_token_accuracy": 0.8298222103714943, "num_tokens": 157237248.0, "step": 4800 }, { "epoch": 2.5636483807005948, "grad_norm": 0.18476171791553497, "learning_rate": 0.00025122669283611385, "loss": 0.7987, "mean_token_accuracy": 0.8304337722063064, "num_tokens": 158875648.0, "step": 4850 }, { "epoch": 2.590085922009253, "grad_norm": 0.18693213164806366, "learning_rate": 0.0002485007087558609, "loss": 0.8042, "mean_token_accuracy": 0.8297446221113205, "num_tokens": 160514048.0, "step": 4900 }, { "epoch": 2.6165234633179115, "grad_norm": 0.19470660388469696, "learning_rate": 0.0002457747246756079, "loss": 0.8024, "mean_token_accuracy": 0.8308174461126328, "num_tokens": 162152448.0, "step": 4950 }, { "epoch": 2.64296100462657, "grad_norm": 0.23168876767158508, "learning_rate": 0.00024304874059535492, "loss": 0.7903, "step": 5000 }, { "epoch": 2.64296100462657, "eval_loss": 0.8376234769821167, "eval_mean_token_accuracy": 0.828871109394896, "eval_num_tokens": 163790848.0, "eval_runtime": 1600.0988, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 5000 }, { "epoch": 2.669398545935228, "grad_norm": 0.15908803045749664, "learning_rate": 0.00024032275651510195, "loss": 0.7967, "mean_token_accuracy": 0.8314005956053734, "num_tokens": 165429248.0, "step": 5050 }, { "epoch": 2.6958360872438862, "grad_norm": 0.1805862933397293, "learning_rate": 0.000237596772434849, "loss": 0.7774, "mean_token_accuracy": 0.8344085997343064, "num_tokens": 167067648.0, "step": 5100 }, { "epoch": 2.7222736285525446, "grad_norm": 0.17997150123119354, "learning_rate": 0.00023487078835459602, "loss": 0.7851, "mean_token_accuracy": 0.8325213807821273, "num_tokens": 168706048.0, "step": 5150 }, { "epoch": 2.748711169861203, "grad_norm": 0.18113110959529877, "learning_rate": 0.00023214480427434303, "loss": 0.776, "mean_token_accuracy": 0.8346639758348465, "num_tokens": 170344448.0, "step": 5200 }, { "epoch": 2.7751487111698614, "grad_norm": 0.18302254378795624, "learning_rate": 0.00022941882019409009, "loss": 0.7854, "step": 5250 }, { "epoch": 2.7751487111698614, "eval_loss": 0.8233165144920349, "eval_mean_token_accuracy": 0.830954508725987, "eval_num_tokens": 171982848.0, "eval_runtime": 1599.9718, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 5250 }, { "epoch": 2.8015862524785193, "grad_norm": 0.1922728568315506, "learning_rate": 0.0002266928361138371, "loss": 0.7936, "mean_token_accuracy": 0.8322769993543625, "num_tokens": 173621248.0, "step": 5300 }, { "epoch": 2.8280237937871777, "grad_norm": 0.1617008000612259, "learning_rate": 0.00022396685203358413, "loss": 0.7738, "mean_token_accuracy": 0.8344037118554115, "num_tokens": 175259648.0, "step": 5350 }, { "epoch": 2.854461335095836, "grad_norm": 0.17171062529087067, "learning_rate": 0.00022124086795333116, "loss": 0.7697, "mean_token_accuracy": 0.8351166906952858, "num_tokens": 176898048.0, "step": 5400 }, { "epoch": 2.8808988764044945, "grad_norm": 0.1803775280714035, "learning_rate": 0.0002185148838730782, "loss": 0.7735, "mean_token_accuracy": 0.8350091609358787, "num_tokens": 178536448.0, "step": 5450 }, { "epoch": 2.907336417713153, "grad_norm": 0.17305733263492584, "learning_rate": 0.0002157888997928252, "loss": 0.7716, "step": 5500 }, { "epoch": 2.907336417713153, "eval_loss": 0.8076795339584351, "eval_mean_token_accuracy": 0.8331229730841977, "eval_num_tokens": 180174848.0, "eval_runtime": 1600.6859, "eval_samples_per_second": 4.726, "eval_steps_per_second": 0.591, "step": 5500 }, { "epoch": 2.933773959021811, "grad_norm": 0.17064611613750458, "learning_rate": 0.00021306291571257226, "loss": 0.7713, "mean_token_accuracy": 0.8356136959791184, "num_tokens": 181813248.0, "step": 5550 }, { "epoch": 2.960211500330469, "grad_norm": 0.18137440085411072, "learning_rate": 0.00021033693163231926, "loss": 0.7667, "mean_token_accuracy": 0.8351374611258506, "num_tokens": 183451648.0, "step": 5600 }, { "epoch": 2.9866490416391276, "grad_norm": 0.17405763268470764, "learning_rate": 0.0002076109475520663, "loss": 0.7495, "mean_token_accuracy": 0.8385416662693024, "num_tokens": 185090048.0, "step": 5650 }, { "epoch": 3.012690019828156, "grad_norm": 0.17279721796512604, "learning_rate": 0.0002049394831534184, "loss": 0.7159, "mean_token_accuracy": 0.8417613173499325, "num_tokens": 186703872.0, "step": 5700 }, { "epoch": 3.0391275611368145, "grad_norm": 0.19387085735797882, "learning_rate": 0.0002022134990731654, "loss": 0.666, "step": 5750 }, { "epoch": 3.0391275611368145, "eval_loss": 0.8053749799728394, "eval_mean_token_accuracy": 0.8340540434416959, "eval_num_tokens": 188342272.0, "eval_runtime": 1600.205, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 5750 }, { "epoch": 3.0655651024454724, "grad_norm": 0.18193645775318146, "learning_rate": 0.00019948751499291245, "loss": 0.6644, "mean_token_accuracy": 0.8480684906244278, "num_tokens": 189980672.0, "step": 5800 }, { "epoch": 3.092002643754131, "grad_norm": 0.16633963584899902, "learning_rate": 0.00019676153091265948, "loss": 0.6691, "mean_token_accuracy": 0.847120603621006, "num_tokens": 191619072.0, "step": 5850 }, { "epoch": 3.118440185062789, "grad_norm": 0.17585037648677826, "learning_rate": 0.0001940355468324065, "loss": 0.6636, "mean_token_accuracy": 0.84809934258461, "num_tokens": 193257472.0, "step": 5900 }, { "epoch": 3.1448777263714476, "grad_norm": 0.1676415503025055, "learning_rate": 0.00019130956275215352, "loss": 0.6672, "mean_token_accuracy": 0.8475995865464211, "num_tokens": 194895872.0, "step": 5950 }, { "epoch": 3.1713152676801055, "grad_norm": 0.18070462346076965, "learning_rate": 0.00018858357867190058, "loss": 0.6627, "step": 6000 }, { "epoch": 3.1713152676801055, "eval_loss": 0.801948070526123, "eval_mean_token_accuracy": 0.8345137661909704, "eval_num_tokens": 196534272.0, "eval_runtime": 1599.8066, "eval_samples_per_second": 4.729, "eval_steps_per_second": 0.591, "step": 6000 }, { "epoch": 3.197752808988764, "grad_norm": 0.16841137409210205, "learning_rate": 0.00018585759459164758, "loss": 0.6569, "mean_token_accuracy": 0.8492378443479538, "num_tokens": 198172672.0, "step": 6050 }, { "epoch": 3.2241903502974223, "grad_norm": 0.18084491789340973, "learning_rate": 0.00018313161051139462, "loss": 0.6678, "mean_token_accuracy": 0.8477779817581177, "num_tokens": 199811072.0, "step": 6100 }, { "epoch": 3.2506278916060807, "grad_norm": 0.17532089352607727, "learning_rate": 0.00018040562643114165, "loss": 0.6693, "mean_token_accuracy": 0.8475476580858231, "num_tokens": 201449472.0, "step": 6150 }, { "epoch": 3.277065432914739, "grad_norm": 0.17762629687786102, "learning_rate": 0.00017767964235088868, "loss": 0.6568, "mean_token_accuracy": 0.8500018376111984, "num_tokens": 203087872.0, "step": 6200 }, { "epoch": 3.303502974223397, "grad_norm": 0.17803572118282318, "learning_rate": 0.0001749536582706357, "loss": 0.6664, "step": 6250 }, { "epoch": 3.303502974223397, "eval_loss": 0.7924287915229797, "eval_mean_token_accuracy": 0.8360363316838384, "eval_num_tokens": 204726272.0, "eval_runtime": 1600.5314, "eval_samples_per_second": 4.727, "eval_steps_per_second": 0.591, "step": 6250 }, { "epoch": 3.3299405155320554, "grad_norm": 0.1736496537923813, "learning_rate": 0.00017222767419038275, "loss": 0.6626, "mean_token_accuracy": 0.8480022014677524, "num_tokens": 206364672.0, "step": 6300 }, { "epoch": 3.3563780568407138, "grad_norm": 0.1790972799062729, "learning_rate": 0.00016950169011012976, "loss": 0.666, "mean_token_accuracy": 0.8478036442399025, "num_tokens": 208003072.0, "step": 6350 }, { "epoch": 3.382815598149372, "grad_norm": 0.17161910235881805, "learning_rate": 0.0001667757060298768, "loss": 0.6635, "mean_token_accuracy": 0.8481677681207657, "num_tokens": 209641472.0, "step": 6400 }, { "epoch": 3.4092531394580305, "grad_norm": 0.17608526349067688, "learning_rate": 0.00016404972194962382, "loss": 0.6483, "mean_token_accuracy": 0.8513996881246567, "num_tokens": 211279872.0, "step": 6450 }, { "epoch": 3.4356906807666885, "grad_norm": 0.17622597515583038, "learning_rate": 0.00016132373786937086, "loss": 0.6562, "step": 6500 }, { "epoch": 3.4356906807666885, "eval_loss": 0.7829640507698059, "eval_mean_token_accuracy": 0.8375042610768284, "eval_num_tokens": 212918272.0, "eval_runtime": 1600.7277, "eval_samples_per_second": 4.726, "eval_steps_per_second": 0.591, "step": 6500 }, { "epoch": 3.462128222075347, "grad_norm": 0.18006405234336853, "learning_rate": 0.00015859775378911786, "loss": 0.6498, "mean_token_accuracy": 0.8504380528628827, "num_tokens": 214556672.0, "step": 6550 }, { "epoch": 3.4885657633840053, "grad_norm": 0.16343793272972107, "learning_rate": 0.0001558717697088649, "loss": 0.6519, "mean_token_accuracy": 0.850884655714035, "num_tokens": 216195072.0, "step": 6600 }, { "epoch": 3.5150033046926636, "grad_norm": 0.16798467934131622, "learning_rate": 0.00015314578562861193, "loss": 0.6648, "mean_token_accuracy": 0.8490127098560333, "num_tokens": 217833472.0, "step": 6650 }, { "epoch": 3.541440846001322, "grad_norm": 0.15794213116168976, "learning_rate": 0.00015041980154835896, "loss": 0.6471, "mean_token_accuracy": 0.8517173796892166, "num_tokens": 219471872.0, "step": 6700 }, { "epoch": 3.56787838730998, "grad_norm": 0.1636921763420105, "learning_rate": 0.00014769381746810597, "loss": 0.6424, "step": 6750 }, { "epoch": 3.56787838730998, "eval_loss": 0.773522138595581, "eval_mean_token_accuracy": 0.8390711046928583, "eval_num_tokens": 221110272.0, "eval_runtime": 1600.7216, "eval_samples_per_second": 4.726, "eval_steps_per_second": 0.591, "step": 6750 }, { "epoch": 3.5943159286186384, "grad_norm": 0.15980064868927002, "learning_rate": 0.00014496783338785303, "loss": 0.6571, "mean_token_accuracy": 0.851312015503645, "num_tokens": 222748672.0, "step": 6800 }, { "epoch": 3.6207534699272967, "grad_norm": 0.1708955615758896, "learning_rate": 0.00014224184930760003, "loss": 0.6484, "mean_token_accuracy": 0.8513654717803001, "num_tokens": 224387072.0, "step": 6850 }, { "epoch": 3.647191011235955, "grad_norm": 0.16906002163887024, "learning_rate": 0.00013951586522734707, "loss": 0.6517, "mean_token_accuracy": 0.8500537672638893, "num_tokens": 226025472.0, "step": 6900 }, { "epoch": 3.6736285525446135, "grad_norm": 0.16365185379981995, "learning_rate": 0.0001367898811470941, "loss": 0.6372, "mean_token_accuracy": 0.8536284250020981, "num_tokens": 227663872.0, "step": 6950 }, { "epoch": 3.7000660938532715, "grad_norm": 0.17780087888240814, "learning_rate": 0.00013406389706684113, "loss": 0.6501, "step": 7000 }, { "epoch": 3.7000660938532715, "eval_loss": 0.7657620906829834, "eval_mean_token_accuracy": 0.8401046276848614, "eval_num_tokens": 229302272.0, "eval_runtime": 1600.1467, "eval_samples_per_second": 4.728, "eval_steps_per_second": 0.591, "step": 7000 }, { "epoch": 3.72650363516193, "grad_norm": 0.17722897231578827, "learning_rate": 0.00013133791298658814, "loss": 0.6527, "mean_token_accuracy": 0.8508571648597717, "num_tokens": 230940672.0, "step": 7050 }, { "epoch": 3.7529411764705882, "grad_norm": 0.16244906187057495, "learning_rate": 0.0001286119289063352, "loss": 0.6356, "mean_token_accuracy": 0.8537634432315826, "num_tokens": 232579072.0, "step": 7100 }, { "epoch": 3.7793787177792466, "grad_norm": 0.15864387154579163, "learning_rate": 0.0001258859448260822, "loss": 0.6452, "mean_token_accuracy": 0.8518102434277535, "num_tokens": 234217472.0, "step": 7150 }, { "epoch": 3.805816259087905, "grad_norm": 0.16620229184627533, "learning_rate": 0.00012315996074582924, "loss": 0.6418, "mean_token_accuracy": 0.8521817001700401, "num_tokens": 235855872.0, "step": 7200 }, { "epoch": 3.832253800396563, "grad_norm": 0.1765565574169159, "learning_rate": 0.00012043397666557627, "loss": 0.6387, "step": 7250 }, { "epoch": 3.832253800396563, "eval_loss": 0.7578161358833313, "eval_mean_token_accuracy": 0.8413670561404359, "eval_num_tokens": 237494272.0, "eval_runtime": 1602.8152, "eval_samples_per_second": 4.72, "eval_steps_per_second": 0.59, "step": 7250 }, { "epoch": 3.8586913417052213, "grad_norm": 0.15968503057956696, "learning_rate": 0.0001177079925853233, "loss": 0.6365, "mean_token_accuracy": 0.8533528861403465, "num_tokens": 239132672.0, "step": 7300 }, { "epoch": 3.8851288830138797, "grad_norm": 0.15743543207645416, "learning_rate": 0.00011498200850507034, "loss": 0.6486, "mean_token_accuracy": 0.8513813573122024, "num_tokens": 240771072.0, "step": 7350 }, { "epoch": 3.911566424322538, "grad_norm": 0.18122394382953644, "learning_rate": 0.00011225602442481736, "loss": 0.6384, "mean_token_accuracy": 0.8533547213673591, "num_tokens": 242409472.0, "step": 7400 }, { "epoch": 3.9380039656311965, "grad_norm": 0.15892641246318817, "learning_rate": 0.00010953004034456439, "loss": 0.6338, "mean_token_accuracy": 0.8538844108581543, "num_tokens": 244047872.0, "step": 7450 }, { "epoch": 3.9644415069398544, "grad_norm": 0.16563069820404053, "learning_rate": 0.00010680405626431142, "loss": 0.6256, "step": 7500 }, { "epoch": 3.9644415069398544, "eval_loss": 0.7491397857666016, "eval_mean_token_accuracy": 0.8425506683535102, "eval_num_tokens": 245686272.0, "eval_runtime": 1603.9187, "eval_samples_per_second": 4.717, "eval_steps_per_second": 0.59, "step": 7500 }, { "epoch": 3.990879048248513, "grad_norm": 0.1561686098575592, "learning_rate": 0.00010407807218405844, "loss": 0.6398, "mean_token_accuracy": 0.8541101579368114, "num_tokens": 247324672.0, "step": 7550 }, { "epoch": 4.016920026437541, "grad_norm": 0.17185606062412262, "learning_rate": 0.00010135208810380548, "loss": 0.5504, "mean_token_accuracy": 0.8682107241625713, "num_tokens": 248938496.0, "step": 7600 }, { "epoch": 4.0433575677462, "grad_norm": 0.17470529675483704, "learning_rate": 9.86261040235525e-05, "loss": 0.5029, "mean_token_accuracy": 0.8748790314793586, "num_tokens": 250576896.0, "step": 7650 }, { "epoch": 4.069795109054858, "grad_norm": 0.1801612824201584, "learning_rate": 9.590011994329953e-05, "loss": 0.5043, "mean_token_accuracy": 0.8748985821008682, "num_tokens": 252215296.0, "step": 7700 }, { "epoch": 4.0962326503635165, "grad_norm": 0.16825653612613678, "learning_rate": 9.317413586304656e-05, "loss": 0.4967, "step": 7750 }, { "epoch": 4.0962326503635165, "eval_loss": 0.7883051037788391, "eval_mean_token_accuracy": 0.8408105385983973, "eval_num_tokens": 253853696.0, "eval_runtime": 1607.5856, "eval_samples_per_second": 4.706, "eval_steps_per_second": 0.588, "step": 7750 }, { "epoch": 4.122670191672174, "grad_norm": 0.17985741794109344, "learning_rate": 9.044815178279358e-05, "loss": 0.5031, "mean_token_accuracy": 0.875472262352705, "num_tokens": 255492096.0, "step": 7800 }, { "epoch": 4.149107732980832, "grad_norm": 0.17613214254379272, "learning_rate": 8.772216770254061e-05, "loss": 0.4969, "mean_token_accuracy": 0.8762671053409576, "num_tokens": 257130496.0, "step": 7850 }, { "epoch": 4.175545274289491, "grad_norm": 0.17405198514461517, "learning_rate": 8.499618362228765e-05, "loss": 0.5095, "mean_token_accuracy": 0.8734744620323182, "num_tokens": 258768896.0, "step": 7900 }, { "epoch": 4.201982815598149, "grad_norm": 0.17185764014720917, "learning_rate": 8.227019954203467e-05, "loss": 0.5074, "mean_token_accuracy": 0.8739729967713356, "num_tokens": 260407296.0, "step": 7950 }, { "epoch": 4.228420356906808, "grad_norm": 0.17758677899837494, "learning_rate": 7.95442154617817e-05, "loss": 0.5085, "step": 8000 }, { "epoch": 4.228420356906808, "eval_loss": 0.7870664000511169, "eval_mean_token_accuracy": 0.8414596145929292, "eval_num_tokens": 262045696.0, "eval_runtime": 1607.3849, "eval_samples_per_second": 4.706, "eval_steps_per_second": 0.589, "step": 8000 }, { "epoch": 4.254857898215466, "grad_norm": 0.16629241406917572, "learning_rate": 7.681823138152873e-05, "loss": 0.5032, "mean_token_accuracy": 0.8741639178991317, "num_tokens": 263684096.0, "step": 8050 }, { "epoch": 4.281295439524124, "grad_norm": 0.173508420586586, "learning_rate": 7.409224730127575e-05, "loss": 0.4909, "mean_token_accuracy": 0.8775629255175591, "num_tokens": 265322496.0, "step": 8100 }, { "epoch": 4.307732980832783, "grad_norm": 0.1713671237230301, "learning_rate": 7.136626322102279e-05, "loss": 0.4923, "mean_token_accuracy": 0.8772788345813751, "num_tokens": 266960896.0, "step": 8150 }, { "epoch": 4.334170522141441, "grad_norm": 0.17122632265090942, "learning_rate": 6.864027914076983e-05, "loss": 0.5, "mean_token_accuracy": 0.8755180832743644, "num_tokens": 268599296.0, "step": 8200 }, { "epoch": 4.360608063450099, "grad_norm": 0.17359545826911926, "learning_rate": 6.591429506051685e-05, "loss": 0.4943, "step": 8250 }, { "epoch": 4.360608063450099, "eval_loss": 0.7823996543884277, "eval_mean_token_accuracy": 0.8421699439370355, "eval_num_tokens": 270237696.0, "eval_runtime": 1607.7567, "eval_samples_per_second": 4.705, "eval_steps_per_second": 0.588, "step": 8250 }, { "epoch": 4.387045604758757, "grad_norm": 0.17702388763427734, "learning_rate": 6.318831098026388e-05, "loss": 0.4904, "mean_token_accuracy": 0.8775449013710022, "num_tokens": 271876096.0, "step": 8300 }, { "epoch": 4.413483146067415, "grad_norm": 0.18663644790649414, "learning_rate": 6.0462326900010904e-05, "loss": 0.4959, "mean_token_accuracy": 0.8762383911013604, "num_tokens": 273514496.0, "step": 8350 }, { "epoch": 4.439920687376074, "grad_norm": 0.1880512684583664, "learning_rate": 5.773634281975793e-05, "loss": 0.4931, "mean_token_accuracy": 0.8767839661240577, "num_tokens": 275152896.0, "step": 8400 }, { "epoch": 4.466358228684732, "grad_norm": 0.18527589738368988, "learning_rate": 5.5010358739504963e-05, "loss": 0.4877, "mean_token_accuracy": 0.87819525629282, "num_tokens": 276791296.0, "step": 8450 }, { "epoch": 4.492795769993391, "grad_norm": 0.19010977447032928, "learning_rate": 5.228437465925199e-05, "loss": 0.4894, "step": 8500 }, { "epoch": 4.492795769993391, "eval_loss": 0.7803131341934204, "eval_mean_token_accuracy": 0.8430041650637008, "eval_num_tokens": 278429696.0, "eval_runtime": 1610.9854, "eval_samples_per_second": 4.696, "eval_steps_per_second": 0.587, "step": 8500 }, { "epoch": 4.519233311302049, "grad_norm": 0.17016442120075226, "learning_rate": 4.9558390578999016e-05, "loss": 0.4847, "mean_token_accuracy": 0.8786284182965756, "num_tokens": 280068096.0, "step": 8550 }, { "epoch": 4.545670852610707, "grad_norm": 0.1719425618648529, "learning_rate": 4.683240649874604e-05, "loss": 0.4875, "mean_token_accuracy": 0.8785123375058174, "num_tokens": 281706496.0, "step": 8600 }, { "epoch": 4.572108393919366, "grad_norm": 0.17816464602947235, "learning_rate": 4.4106422418493076e-05, "loss": 0.4863, "mean_token_accuracy": 0.8782337459921837, "num_tokens": 283344896.0, "step": 8650 }, { "epoch": 4.598545935228024, "grad_norm": 0.1728549599647522, "learning_rate": 4.138043833824011e-05, "loss": 0.4879, "mean_token_accuracy": 0.8787457209825515, "num_tokens": 284983296.0, "step": 8700 }, { "epoch": 4.624983476536682, "grad_norm": 0.18577666580677032, "learning_rate": 3.8654454257987135e-05, "loss": 0.4914, "step": 8750 }, { "epoch": 4.624983476536682, "eval_loss": 0.7784421443939209, "eval_mean_token_accuracy": 0.8436387255000262, "eval_num_tokens": 286621696.0, "eval_runtime": 1611.4393, "eval_samples_per_second": 4.695, "eval_steps_per_second": 0.587, "step": 8750 }, { "epoch": 4.65142101784534, "grad_norm": 0.16825436055660248, "learning_rate": 3.592847017773417e-05, "loss": 0.4756, "mean_token_accuracy": 0.8792506690323353, "num_tokens": 288260096.0, "step": 8800 }, { "epoch": 4.677858559153998, "grad_norm": 0.18510740995407104, "learning_rate": 3.3202486097481194e-05, "loss": 0.4788, "mean_token_accuracy": 0.8801001918315887, "num_tokens": 289898496.0, "step": 8850 }, { "epoch": 4.704296100462657, "grad_norm": 0.18907974660396576, "learning_rate": 3.0476502017228217e-05, "loss": 0.4837, "mean_token_accuracy": 0.8794446450471878, "num_tokens": 291536896.0, "step": 8900 }, { "epoch": 4.730733641771315, "grad_norm": 0.1798245906829834, "learning_rate": 2.775051793697525e-05, "loss": 0.4883, "mean_token_accuracy": 0.8778897827863693, "num_tokens": 293175296.0, "step": 8950 }, { "epoch": 4.757171183079974, "grad_norm": 0.17980748414993286, "learning_rate": 2.502453385672228e-05, "loss": 0.475, "step": 9000 }, { "epoch": 4.757171183079974, "eval_loss": 0.7753015756607056, "eval_mean_token_accuracy": 0.8443130426754659, "eval_num_tokens": 294813696.0, "eval_runtime": 1611.452, "eval_samples_per_second": 4.695, "eval_steps_per_second": 0.587, "step": 9000 }, { "epoch": 4.783608724388632, "grad_norm": 0.17731408774852753, "learning_rate": 2.2298549776469306e-05, "loss": 0.4657, "mean_token_accuracy": 0.8821294555068016, "num_tokens": 296452096.0, "step": 9050 }, { "epoch": 4.81004626569729, "grad_norm": 0.19258248805999756, "learning_rate": 1.9572565696216336e-05, "loss": 0.4779, "mean_token_accuracy": 0.8807239699363708, "num_tokens": 298090496.0, "step": 9100 }, { "epoch": 4.836483807005949, "grad_norm": 0.17705880105495453, "learning_rate": 1.6846581615963362e-05, "loss": 0.476, "mean_token_accuracy": 0.8808369943499565, "num_tokens": 299728896.0, "step": 9150 }, { "epoch": 4.8629213483146065, "grad_norm": 0.1794816255569458, "learning_rate": 1.4120597535710392e-05, "loss": 0.4742, "mean_token_accuracy": 0.8813196429610253, "num_tokens": 301367296.0, "step": 9200 }, { "epoch": 4.889358889623265, "grad_norm": 0.17823387682437897, "learning_rate": 1.139461345545742e-05, "loss": 0.4719, "step": 9250 }, { "epoch": 4.889358889623265, "eval_loss": 0.7754274010658264, "eval_mean_token_accuracy": 0.844791491931387, "eval_num_tokens": 303005696.0, "eval_runtime": 1610.8654, "eval_samples_per_second": 4.696, "eval_steps_per_second": 0.587, "step": 9250 }, { "epoch": 4.915796430931923, "grad_norm": 0.16834519803524017, "learning_rate": 8.668629375204448e-06, "loss": 0.4653, "mean_token_accuracy": 0.8821077673137188, "num_tokens": 304644096.0, "step": 9300 }, { "epoch": 4.942233972240581, "grad_norm": 0.17272663116455078, "learning_rate": 5.942645294951477e-06, "loss": 0.4783, "mean_token_accuracy": 0.8806390488147735, "num_tokens": 306282496.0, "step": 9350 }, { "epoch": 4.96867151354924, "grad_norm": 0.17334023118019104, "learning_rate": 3.2166612146985063e-06, "loss": 0.4794, "mean_token_accuracy": 0.8807239702343941, "num_tokens": 307920896.0, "step": 9400 }, { "epoch": 4.995109054857898, "grad_norm": 0.17255398631095886, "learning_rate": 4.906771344455349e-07, "loss": 0.4793, "mean_token_accuracy": 0.8803439608216286, "num_tokens": 309559296.0, "step": 9450 } ], "logging_steps": 50, "max_steps": 9455, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 619390244487168.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }