{ "best_global_step": 9000, "best_metric": 0.9195617437362671, "best_model_checkpoint": "./results/checkpoint-9000", "epoch": 4.757171183079974, "eval_steps": 250, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026437541308658295, "grad_norm": 43.99717712402344, "learning_rate": 0.00034625958983852136, "loss": 9.1282, "mean_token_accuracy": 0.3046248774370179, "num_tokens": 1638400.0, "step": 50 }, { "epoch": 0.05287508261731659, "grad_norm": 6.144142150878906, "learning_rate": 0.000407611186724682, "loss": 4.215, "mean_token_accuracy": 0.4404753165692091, "num_tokens": 3276800.0, "step": 100 }, { "epoch": 0.07931262392597488, "grad_norm": 2.289468288421631, "learning_rate": 0.0004434995702624468, "loss": 3.3619, "mean_token_accuracy": 0.4969379284977913, "num_tokens": 4915200.0, "step": 150 }, { "epoch": 0.10575016523463318, "grad_norm": 1.4314253330230713, "learning_rate": 0.0004689627836108426, "loss": 2.3482, "mean_token_accuracy": 0.6427779817581176, "num_tokens": 6553600.0, "step": 200 }, { "epoch": 0.13218770654329148, "grad_norm": 1.0555275678634644, "learning_rate": 0.0004887135863147016, "loss": 2.0093, "step": 250 }, { "epoch": 0.13218770654329148, "eval_loss": 1.941351056098938, "eval_mean_token_accuracy": 0.6896467157826616, "eval_num_tokens": 8192000.0, "eval_runtime": 1710.841, "eval_samples_per_second": 4.422, "eval_steps_per_second": 0.553, "step": 250 }, { "epoch": 0.15862524785194976, "grad_norm": 1.0403132438659668, "learning_rate": 0.0004991822047759241, "loss": 1.8613, "mean_token_accuracy": 0.6918438403308391, "num_tokens": 9830400.0, "step": 300 }, { "epoch": 0.18506278916060806, "grad_norm": 1.0463595390319824, "learning_rate": 0.0004964562206956711, "loss": 1.7529, "mean_token_accuracy": 0.7160025656223297, "num_tokens": 11468800.0, "step": 350 }, { "epoch": 0.21150033046926636, "grad_norm": 0.8012986183166504, "learning_rate": 0.0004937302366154182, "loss": 1.642, "mean_token_accuracy": 0.731451002061367, "num_tokens": 13107200.0, "step": 400 }, { "epoch": 0.23793787177792466, "grad_norm": 0.9207384586334229, "learning_rate": 0.0004910042525351653, "loss": 1.5717, "mean_token_accuracy": 0.7387188410758972, "num_tokens": 14745600.0, "step": 450 }, { "epoch": 0.26437541308658297, "grad_norm": 0.6695268750190735, "learning_rate": 0.00048827826845491225, "loss": 1.5342, "step": 500 }, { "epoch": 0.26437541308658297, "eval_loss": 1.520922064781189, "eval_mean_token_accuracy": 0.7428315233982643, "eval_num_tokens": 16384000.0, "eval_runtime": 1711.3798, "eval_samples_per_second": 4.42, "eval_steps_per_second": 0.553, "step": 500 }, { "epoch": 0.29081295439524124, "grad_norm": 0.7166313529014587, "learning_rate": 0.0004855522843746593, "loss": 1.5115, "mean_token_accuracy": 0.7436290304362774, "num_tokens": 18022400.0, "step": 550 }, { "epoch": 0.3172504957038995, "grad_norm": 0.7090346217155457, "learning_rate": 0.00048282630029440626, "loss": 1.4558, "mean_token_accuracy": 0.7508443316817284, "num_tokens": 19660800.0, "step": 600 }, { "epoch": 0.34368803701255785, "grad_norm": 0.8063613176345825, "learning_rate": 0.00048010031621415335, "loss": 1.4464, "mean_token_accuracy": 0.7509249743819236, "num_tokens": 21299200.0, "step": 650 }, { "epoch": 0.3701255783212161, "grad_norm": 1.142205834388733, "learning_rate": 0.0004773743321339004, "loss": 1.3984, "mean_token_accuracy": 0.756721040904522, "num_tokens": 22937600.0, "step": 700 }, { "epoch": 0.3965631196298744, "grad_norm": 0.8467270135879517, "learning_rate": 0.00047464834805364736, "loss": 1.3935, "step": 750 }, { "epoch": 0.3965631196298744, "eval_loss": 1.3857780694961548, "eval_mean_token_accuracy": 0.7577579059888142, "eval_num_tokens": 24576000.0, "eval_runtime": 1712.2261, "eval_samples_per_second": 4.418, "eval_steps_per_second": 0.552, "step": 750 }, { "epoch": 0.4230006609385327, "grad_norm": 0.7939649820327759, "learning_rate": 0.0004719223639733944, "loss": 1.3699, "mean_token_accuracy": 0.7578213591873646, "num_tokens": 26214400.0, "step": 800 }, { "epoch": 0.449438202247191, "grad_norm": 0.7455005645751953, "learning_rate": 0.0004691963798931415, "loss": 1.3693, "mean_token_accuracy": 0.7589296215772628, "num_tokens": 27852800.0, "step": 850 }, { "epoch": 0.47587574355584933, "grad_norm": 0.7528285980224609, "learning_rate": 0.00046647039581288846, "loss": 1.3242, "mean_token_accuracy": 0.7669953557848931, "num_tokens": 29491200.0, "step": 900 }, { "epoch": 0.5023132848645075, "grad_norm": 0.8056386113166809, "learning_rate": 0.0004637444117326355, "loss": 1.335, "mean_token_accuracy": 0.7646432068943977, "num_tokens": 31129600.0, "step": 950 }, { "epoch": 0.5287508261731659, "grad_norm": 0.7774543166160583, "learning_rate": 0.0004610184276523825, "loss": 1.3286, "step": 1000 }, { "epoch": 0.5287508261731659, "eval_loss": 1.3367868661880493, "eval_mean_token_accuracy": 0.7645143720362706, "eval_num_tokens": 32768000.0, "eval_runtime": 1713.3241, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 1000 }, { "epoch": 0.5551883674818242, "grad_norm": 1.0932977199554443, "learning_rate": 0.00045829244357212956, "loss": 1.3357, "mean_token_accuracy": 0.7644849714636802, "num_tokens": 34406400.0, "step": 1050 }, { "epoch": 0.5816259087904825, "grad_norm": 0.9064853191375732, "learning_rate": 0.0004555664594918766, "loss": 1.3309, "mean_token_accuracy": 0.7655669602751732, "num_tokens": 36044800.0, "step": 1100 }, { "epoch": 0.6080634500991408, "grad_norm": 1.1222511529922485, "learning_rate": 0.0004528404754116236, "loss": 1.3088, "mean_token_accuracy": 0.7684097030758857, "num_tokens": 37683200.0, "step": 1150 }, { "epoch": 0.634500991407799, "grad_norm": 0.8550713658332825, "learning_rate": 0.0004501144913313706, "loss": 1.3121, "mean_token_accuracy": 0.7672879993915558, "num_tokens": 39321600.0, "step": 1200 }, { "epoch": 0.6609385327164574, "grad_norm": 1.1444681882858276, "learning_rate": 0.0004473885072511177, "loss": 1.3124, "step": 1250 }, { "epoch": 0.6609385327164574, "eval_loss": 1.3140060901641846, "eval_mean_token_accuracy": 0.7666200130988882, "eval_num_tokens": 40960000.0, "eval_runtime": 1711.9779, "eval_samples_per_second": 4.419, "eval_steps_per_second": 0.553, "step": 1250 }, { "epoch": 0.6873760740251157, "grad_norm": 0.9385507702827454, "learning_rate": 0.0004446625231708647, "loss": 1.299, "mean_token_accuracy": 0.7683782380819321, "num_tokens": 42598400.0, "step": 1300 }, { "epoch": 0.713813615333774, "grad_norm": 0.9909027218818665, "learning_rate": 0.0004419365390906117, "loss": 1.2776, "mean_token_accuracy": 0.7721034941077233, "num_tokens": 44236800.0, "step": 1350 }, { "epoch": 0.7402511566424322, "grad_norm": 1.0495429039001465, "learning_rate": 0.00043921055501035873, "loss": 1.2533, "mean_token_accuracy": 0.7770307904481888, "num_tokens": 45875200.0, "step": 1400 }, { "epoch": 0.7666886979510905, "grad_norm": 0.9478822350502014, "learning_rate": 0.0004364845709301058, "loss": 1.2386, "mean_token_accuracy": 0.7782954525947571, "num_tokens": 47513600.0, "step": 1450 }, { "epoch": 0.7931262392597488, "grad_norm": 0.9904903173446655, "learning_rate": 0.0004337585868498528, "loss": 1.2542, "step": 1500 }, { "epoch": 0.7931262392597488, "eval_loss": 1.2644829750061035, "eval_mean_token_accuracy": 0.7736368500027042, "eval_num_tokens": 49152000.0, "eval_runtime": 1711.2255, "eval_samples_per_second": 4.421, "eval_steps_per_second": 0.553, "step": 1500 }, { "epoch": 0.8195637805684072, "grad_norm": 0.797590970993042, "learning_rate": 0.00043103260276959983, "loss": 1.2613, "mean_token_accuracy": 0.7753057803213597, "num_tokens": 50790400.0, "step": 1550 }, { "epoch": 0.8460013218770654, "grad_norm": 1.1253471374511719, "learning_rate": 0.0004283066186893468, "loss": 1.2626, "mean_token_accuracy": 0.7741856071352958, "num_tokens": 52428800.0, "step": 1600 }, { "epoch": 0.8724388631857237, "grad_norm": 0.9880785346031189, "learning_rate": 0.0004255806346090939, "loss": 1.2731, "mean_token_accuracy": 0.7714717736840249, "num_tokens": 54067200.0, "step": 1650 }, { "epoch": 0.898876404494382, "grad_norm": 0.7777257561683655, "learning_rate": 0.00042285465052884093, "loss": 1.2649, "mean_token_accuracy": 0.7737677192687988, "num_tokens": 55705600.0, "step": 1700 }, { "epoch": 0.9253139458030403, "grad_norm": 0.8958262801170349, "learning_rate": 0.0004201286664485879, "loss": 1.2498, "step": 1750 }, { "epoch": 0.9253139458030403, "eval_loss": 1.2289273738861084, "eval_mean_token_accuracy": 0.7793115381836639, "eval_num_tokens": 57344000.0, "eval_runtime": 1711.9208, "eval_samples_per_second": 4.419, "eval_steps_per_second": 0.553, "step": 1750 }, { "epoch": 0.9517514871116987, "grad_norm": 0.9251750111579895, "learning_rate": 0.00041740268236833495, "loss": 1.2212, "mean_token_accuracy": 0.7776866452395916, "num_tokens": 58982400.0, "step": 1800 }, { "epoch": 0.9781890284203569, "grad_norm": 0.8101162910461426, "learning_rate": 0.00041467669828808203, "loss": 1.2025, "mean_token_accuracy": 0.7838043755292893, "num_tokens": 60620800.0, "step": 1850 }, { "epoch": 1.0042300066093852, "grad_norm": 1.5328075885772705, "learning_rate": 0.00041200523388943414, "loss": 1.2204, "mean_token_accuracy": 0.7796021451804843, "num_tokens": 62234624.0, "step": 1900 }, { "epoch": 1.0306675479180436, "grad_norm": 0.9993696212768555, "learning_rate": 0.0004092792498091811, "loss": 1.1622, "mean_token_accuracy": 0.7850250500440598, "num_tokens": 63873024.0, "step": 1950 }, { "epoch": 1.057105089226702, "grad_norm": 0.9176653027534485, "learning_rate": 0.00040655326572892816, "loss": 1.1807, "step": 2000 }, { "epoch": 1.057105089226702, "eval_loss": 1.2058873176574707, "eval_mean_token_accuracy": 0.7832688034588893, "eval_num_tokens": 65511424.0, "eval_runtime": 1712.3284, "eval_samples_per_second": 4.418, "eval_steps_per_second": 0.552, "step": 2000 }, { "epoch": 1.0835426305353602, "grad_norm": 1.060483455657959, "learning_rate": 0.00040382728164867513, "loss": 1.1635, "mean_token_accuracy": 0.7839937689900398, "num_tokens": 67149824.0, "step": 2050 }, { "epoch": 1.1099801718440185, "grad_norm": 1.3085092306137085, "learning_rate": 0.0004011012975684222, "loss": 1.1928, "mean_token_accuracy": 0.7807007575035095, "num_tokens": 68788224.0, "step": 2100 }, { "epoch": 1.1364177131526767, "grad_norm": 0.9808939695358276, "learning_rate": 0.00039837531348816925, "loss": 1.1454, "mean_token_accuracy": 0.7877211648225785, "num_tokens": 70426624.0, "step": 2150 }, { "epoch": 1.162855254461335, "grad_norm": 0.8935715556144714, "learning_rate": 0.00039564932940791623, "loss": 1.1514, "mean_token_accuracy": 0.7872568437457085, "num_tokens": 72065024.0, "step": 2200 }, { "epoch": 1.1892927957699935, "grad_norm": 0.9606215357780457, "learning_rate": 0.00039292334532766327, "loss": 1.1569, "step": 2250 }, { "epoch": 1.1892927957699935, "eval_loss": 1.1869500875473022, "eval_mean_token_accuracy": 0.7853513486037547, "eval_num_tokens": 73703424.0, "eval_runtime": 1713.6219, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 2250 }, { "epoch": 1.2157303370786516, "grad_norm": 1.2232052087783813, "learning_rate": 0.00039019736124741035, "loss": 1.1651, "mean_token_accuracy": 0.785599644035101, "num_tokens": 75341824.0, "step": 2300 }, { "epoch": 1.24216787838731, "grad_norm": 1.2846604585647583, "learning_rate": 0.00038747137716715733, "loss": 1.1691, "mean_token_accuracy": 0.7854020059108734, "num_tokens": 76980224.0, "step": 2350 }, { "epoch": 1.2686054196959682, "grad_norm": 1.0899465084075928, "learning_rate": 0.00038474539308690437, "loss": 1.1416, "mean_token_accuracy": 0.7897036933898925, "num_tokens": 78618624.0, "step": 2400 }, { "epoch": 1.2950429610046266, "grad_norm": 1.0662829875946045, "learning_rate": 0.0003820194090066514, "loss": 1.1435, "mean_token_accuracy": 0.7887658843398094, "num_tokens": 80257024.0, "step": 2450 }, { "epoch": 1.321480502313285, "grad_norm": 0.9844885468482971, "learning_rate": 0.00037929342492639843, "loss": 1.1374, "step": 2500 }, { "epoch": 1.321480502313285, "eval_loss": 1.1827911138534546, "eval_mean_token_accuracy": 0.7856577540930936, "eval_num_tokens": 81895424.0, "eval_runtime": 1711.6622, "eval_samples_per_second": 4.42, "eval_steps_per_second": 0.553, "step": 2500 }, { "epoch": 1.3479180436219431, "grad_norm": 0.8380193710327148, "learning_rate": 0.00037656744084614547, "loss": 1.1353, "mean_token_accuracy": 0.790248963534832, "num_tokens": 83533824.0, "step": 2550 }, { "epoch": 1.3743555849306015, "grad_norm": 0.7707766890525818, "learning_rate": 0.0003738414567658925, "loss": 1.1309, "mean_token_accuracy": 0.7910679399967193, "num_tokens": 85172224.0, "step": 2600 }, { "epoch": 1.4007931262392597, "grad_norm": 1.3123962879180908, "learning_rate": 0.0003711154726856395, "loss": 1.119, "mean_token_accuracy": 0.7927541556954384, "num_tokens": 86810624.0, "step": 2650 }, { "epoch": 1.427230667547918, "grad_norm": 0.9434394836425781, "learning_rate": 0.00036838948860538656, "loss": 1.1233, "mean_token_accuracy": 0.7917754176259041, "num_tokens": 88449024.0, "step": 2700 }, { "epoch": 1.4536682088565764, "grad_norm": 1.0501152276992798, "learning_rate": 0.0003656635045251336, "loss": 1.1463, "step": 2750 }, { "epoch": 1.4536682088565764, "eval_loss": 1.1497071981430054, "eval_mean_token_accuracy": 0.7899814840981119, "eval_num_tokens": 90087424.0, "eval_runtime": 1712.288, "eval_samples_per_second": 4.418, "eval_steps_per_second": 0.552, "step": 2750 }, { "epoch": 1.4801057501652346, "grad_norm": 1.1930551528930664, "learning_rate": 0.0003629375204448806, "loss": 1.1309, "mean_token_accuracy": 0.7891275675594807, "num_tokens": 91725824.0, "step": 2800 }, { "epoch": 1.5065432914738928, "grad_norm": 1.3507503271102905, "learning_rate": 0.0003602115363646276, "loss": 1.1303, "mean_token_accuracy": 0.7900684276223182, "num_tokens": 93364224.0, "step": 2850 }, { "epoch": 1.5329808327825512, "grad_norm": 0.935932993888855, "learning_rate": 0.0003574855522843747, "loss": 1.124, "mean_token_accuracy": 0.7918371230363845, "num_tokens": 95002624.0, "step": 2900 }, { "epoch": 1.5594183740912095, "grad_norm": 1.3527334928512573, "learning_rate": 0.0003547595682041217, "loss": 1.1103, "mean_token_accuracy": 0.7935648819804192, "num_tokens": 96641024.0, "step": 2950 }, { "epoch": 1.585855915399868, "grad_norm": 0.8783284425735474, "learning_rate": 0.0003520335841238687, "loss": 1.113, "step": 3000 }, { "epoch": 1.585855915399868, "eval_loss": 1.1312052011489868, "eval_mean_token_accuracy": 0.7928862137597913, "eval_num_tokens": 98279424.0, "eval_runtime": 1712.2465, "eval_samples_per_second": 4.418, "eval_steps_per_second": 0.552, "step": 3000 }, { "epoch": 1.612293456708526, "grad_norm": 1.079725980758667, "learning_rate": 0.00034930760004361574, "loss": 1.107, "mean_token_accuracy": 0.7933507452905179, "num_tokens": 99917824.0, "step": 3050 }, { "epoch": 1.6387309980171842, "grad_norm": 1.188661813735962, "learning_rate": 0.0003465816159633628, "loss": 1.1139, "mean_token_accuracy": 0.7925458225607872, "num_tokens": 101556224.0, "step": 3100 }, { "epoch": 1.6651685393258426, "grad_norm": 0.983051061630249, "learning_rate": 0.0003438556318831098, "loss": 1.1198, "mean_token_accuracy": 0.7924914485216141, "num_tokens": 103194624.0, "step": 3150 }, { "epoch": 1.691606080634501, "grad_norm": 0.9756836891174316, "learning_rate": 0.00034112964780285684, "loss": 1.1175, "mean_token_accuracy": 0.7917662528157234, "num_tokens": 104833024.0, "step": 3200 }, { "epoch": 1.7180436219431594, "grad_norm": 1.1230757236480713, "learning_rate": 0.0003384036637226039, "loss": 1.0988, "step": 3250 }, { "epoch": 1.7180436219431594, "eval_loss": 1.1241850852966309, "eval_mean_token_accuracy": 0.793406566215116, "eval_num_tokens": 106471424.0, "eval_runtime": 1712.6064, "eval_samples_per_second": 4.417, "eval_steps_per_second": 0.552, "step": 3250 }, { "epoch": 1.7444811632518176, "grad_norm": 1.3361942768096924, "learning_rate": 0.0003356776796423509, "loss": 1.0957, "mean_token_accuracy": 0.7953433538973331, "num_tokens": 108109824.0, "step": 3300 }, { "epoch": 1.7709187045604757, "grad_norm": 0.8606221079826355, "learning_rate": 0.00033295169556209794, "loss": 1.072, "mean_token_accuracy": 0.7988990727066994, "num_tokens": 109748224.0, "step": 3350 }, { "epoch": 1.7973562458691341, "grad_norm": 1.1985405683517456, "learning_rate": 0.0003302257114818449, "loss": 1.0964, "mean_token_accuracy": 0.7953415229916573, "num_tokens": 111386624.0, "step": 3400 }, { "epoch": 1.8237937871777925, "grad_norm": 1.0375052690505981, "learning_rate": 0.000327499727401592, "loss": 1.0821, "mean_token_accuracy": 0.7982869046926498, "num_tokens": 113025024.0, "step": 3450 }, { "epoch": 1.8502313284864509, "grad_norm": 1.037645936012268, "learning_rate": 0.00032477374332133904, "loss": 1.0854, "step": 3500 }, { "epoch": 1.8502313284864509, "eval_loss": 1.1053054332733154, "eval_mean_token_accuracy": 0.7964076004119051, "eval_num_tokens": 114663424.0, "eval_runtime": 1713.2036, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 3500 }, { "epoch": 1.876668869795109, "grad_norm": 0.8609442114830017, "learning_rate": 0.000322047759241086, "loss": 1.0789, "mean_token_accuracy": 0.7976255512237549, "num_tokens": 116301824.0, "step": 3550 }, { "epoch": 1.9031064111037672, "grad_norm": 0.7845131158828735, "learning_rate": 0.00031932177516083305, "loss": 1.1106, "mean_token_accuracy": 0.7933571606874465, "num_tokens": 117940224.0, "step": 3600 }, { "epoch": 1.9295439524124256, "grad_norm": 0.9003056287765503, "learning_rate": 0.00031659579108058014, "loss": 1.0636, "mean_token_accuracy": 0.7998753663897514, "num_tokens": 119578624.0, "step": 3650 }, { "epoch": 1.955981493721084, "grad_norm": 1.202172040939331, "learning_rate": 0.0003138698070003271, "loss": 1.0809, "mean_token_accuracy": 0.7975763711333275, "num_tokens": 121217024.0, "step": 3700 }, { "epoch": 1.9824190350297424, "grad_norm": 0.6643933653831482, "learning_rate": 0.00031114382292007415, "loss": 1.0808, "step": 3750 }, { "epoch": 1.9824190350297424, "eval_loss": 1.0950915813446045, "eval_mean_token_accuracy": 0.7972050871833939, "eval_num_tokens": 122855424.0, "eval_runtime": 1713.9729, "eval_samples_per_second": 4.414, "eval_steps_per_second": 0.552, "step": 3750 }, { "epoch": 2.0084600132187704, "grad_norm": 1.0089055299758911, "learning_rate": 0.00030847235852142626, "loss": 1.0434, "mean_token_accuracy": 0.7995400524860065, "num_tokens": 124469248.0, "step": 3800 }, { "epoch": 2.034897554527429, "grad_norm": 1.8495018482208252, "learning_rate": 0.00030574637444117324, "loss": 1.0069, "mean_token_accuracy": 0.8041293996572495, "num_tokens": 126107648.0, "step": 3850 }, { "epoch": 2.061335095836087, "grad_norm": 0.9792631268501282, "learning_rate": 0.0003030203903609203, "loss": 1.0248, "mean_token_accuracy": 0.800488149523735, "num_tokens": 127746048.0, "step": 3900 }, { "epoch": 2.0877726371447456, "grad_norm": 1.0454398393630981, "learning_rate": 0.00030029440628066736, "loss": 1.0032, "mean_token_accuracy": 0.8048930823802948, "num_tokens": 129384448.0, "step": 3950 }, { "epoch": 2.114210178453404, "grad_norm": 1.124090313911438, "learning_rate": 0.00029756842220041434, "loss": 1.0141, "step": 4000 }, { "epoch": 2.114210178453404, "eval_loss": 1.0894951820373535, "eval_mean_token_accuracy": 0.7976183624146604, "eval_num_tokens": 131022848.0, "eval_runtime": 1713.146, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 4000 }, { "epoch": 2.140647719762062, "grad_norm": 1.068744421005249, "learning_rate": 0.0002948424381201614, "loss": 1.017, "mean_token_accuracy": 0.8032655183970928, "num_tokens": 132661248.0, "step": 4050 }, { "epoch": 2.1670852610707203, "grad_norm": 0.6453216671943665, "learning_rate": 0.0002921164540399084, "loss": 1.0173, "mean_token_accuracy": 0.8023881965875626, "num_tokens": 134299648.0, "step": 4100 }, { "epoch": 2.1935228023793787, "grad_norm": 0.8488343954086304, "learning_rate": 0.00028939046995965544, "loss": 1.0192, "mean_token_accuracy": 0.8028500735759735, "num_tokens": 135938048.0, "step": 4150 }, { "epoch": 2.219960343688037, "grad_norm": 1.107086181640625, "learning_rate": 0.00028666448587940247, "loss": 1.0167, "mean_token_accuracy": 0.8023680368065834, "num_tokens": 137576448.0, "step": 4200 }, { "epoch": 2.2463978849966955, "grad_norm": 0.9816263914108276, "learning_rate": 0.0002839385017991495, "loss": 1.026, "step": 4250 }, { "epoch": 2.2463978849966955, "eval_loss": 1.070574402809143, "eval_mean_token_accuracy": 0.8008053159738948, "eval_num_tokens": 139214848.0, "eval_runtime": 1713.5677, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 4250 }, { "epoch": 2.2728354263053534, "grad_norm": 1.0816267728805542, "learning_rate": 0.00028121251771889654, "loss": 1.0271, "mean_token_accuracy": 0.8018050470948219, "num_tokens": 140853248.0, "step": 4300 }, { "epoch": 2.299272967614012, "grad_norm": 1.1499203443527222, "learning_rate": 0.00027848653363864357, "loss": 1.0008, "mean_token_accuracy": 0.8052902013063431, "num_tokens": 142491648.0, "step": 4350 }, { "epoch": 2.32571050892267, "grad_norm": 0.9058449864387512, "learning_rate": 0.0002757605495583906, "loss": 1.002, "mean_token_accuracy": 0.806196848154068, "num_tokens": 144130048.0, "step": 4400 }, { "epoch": 2.3521480502313286, "grad_norm": 0.8086408972740173, "learning_rate": 0.0002730345654781376, "loss": 0.9968, "mean_token_accuracy": 0.8068004646897315, "num_tokens": 145768448.0, "step": 4450 }, { "epoch": 2.378585591539987, "grad_norm": 0.9422939419746399, "learning_rate": 0.00027030858139788467, "loss": 0.9925, "step": 4500 }, { "epoch": 2.378585591539987, "eval_loss": 1.0708719491958618, "eval_mean_token_accuracy": 0.8005553823570872, "eval_num_tokens": 147406848.0, "eval_runtime": 1714.6525, "eval_samples_per_second": 4.412, "eval_steps_per_second": 0.552, "step": 4500 }, { "epoch": 2.405023132848645, "grad_norm": 0.9562957882881165, "learning_rate": 0.0002675825973176317, "loss": 1.0125, "mean_token_accuracy": 0.8054704304039478, "num_tokens": 149045248.0, "step": 4550 }, { "epoch": 2.4314606741573033, "grad_norm": 0.8408384919166565, "learning_rate": 0.0002648566132373787, "loss": 1.0194, "mean_token_accuracy": 0.8027529340982437, "num_tokens": 150683648.0, "step": 4600 }, { "epoch": 2.4578982154659617, "grad_norm": 0.9756256341934204, "learning_rate": 0.0002621306291571257, "loss": 0.9828, "mean_token_accuracy": 0.8083504402637481, "num_tokens": 152322048.0, "step": 4650 }, { "epoch": 2.48433575677462, "grad_norm": 1.2137442827224731, "learning_rate": 0.0002594046450768728, "loss": 1.0021, "mean_token_accuracy": 0.8056860953569412, "num_tokens": 153960448.0, "step": 4700 }, { "epoch": 2.5107732980832784, "grad_norm": 0.7544079422950745, "learning_rate": 0.0002566786609966198, "loss": 1.0089, "step": 4750 }, { "epoch": 2.5107732980832784, "eval_loss": 1.0549876689910889, "eval_mean_token_accuracy": 0.8026504306158102, "eval_num_tokens": 155598848.0, "eval_runtime": 1712.6528, "eval_samples_per_second": 4.417, "eval_steps_per_second": 0.552, "step": 4750 }, { "epoch": 2.5372108393919364, "grad_norm": 1.3571584224700928, "learning_rate": 0.0002539526769163668, "loss": 0.9947, "mean_token_accuracy": 0.8055629892647267, "num_tokens": 157237248.0, "step": 4800 }, { "epoch": 2.5636483807005948, "grad_norm": 0.9661728739738464, "learning_rate": 0.00025122669283611385, "loss": 0.9934, "mean_token_accuracy": 0.8067454797029495, "num_tokens": 158875648.0, "step": 4850 }, { "epoch": 2.590085922009253, "grad_norm": 0.8960219025611877, "learning_rate": 0.0002485007087558609, "loss": 1.0084, "mean_token_accuracy": 0.8042240959405899, "num_tokens": 160514048.0, "step": 4900 }, { "epoch": 2.6165234633179115, "grad_norm": 0.8337807059288025, "learning_rate": 0.0002457747246756079, "loss": 1.0028, "mean_token_accuracy": 0.8062023460865021, "num_tokens": 162152448.0, "step": 4950 }, { "epoch": 2.64296100462657, "grad_norm": 1.2237184047698975, "learning_rate": 0.00024304874059535492, "loss": 0.9907, "step": 5000 }, { "epoch": 2.64296100462657, "eval_loss": 1.0443217754364014, "eval_mean_token_accuracy": 0.8038508863706165, "eval_num_tokens": 163790848.0, "eval_runtime": 1716.2002, "eval_samples_per_second": 4.408, "eval_steps_per_second": 0.551, "step": 5000 }, { "epoch": 2.669398545935228, "grad_norm": 0.7289232015609741, "learning_rate": 0.00024032275651510195, "loss": 1.0012, "mean_token_accuracy": 0.8063395051658153, "num_tokens": 165429248.0, "step": 5050 }, { "epoch": 2.6958360872438862, "grad_norm": 1.0250189304351807, "learning_rate": 0.000237596772434849, "loss": 0.976, "mean_token_accuracy": 0.8098240447044373, "num_tokens": 167067648.0, "step": 5100 }, { "epoch": 2.7222736285525446, "grad_norm": 1.0034643411636353, "learning_rate": 0.00023487078835459602, "loss": 0.9848, "mean_token_accuracy": 0.8076087480783463, "num_tokens": 168706048.0, "step": 5150 }, { "epoch": 2.748711169861203, "grad_norm": 0.9291382431983948, "learning_rate": 0.00023214480427434303, "loss": 0.9757, "mean_token_accuracy": 0.8101533487439155, "num_tokens": 170344448.0, "step": 5200 }, { "epoch": 2.7751487111698614, "grad_norm": 0.6038099527359009, "learning_rate": 0.00022941882019409009, "loss": 0.989, "step": 5250 }, { "epoch": 2.7751487111698614, "eval_loss": 1.0298680067062378, "eval_mean_token_accuracy": 0.8057682283584966, "eval_num_tokens": 171982848.0, "eval_runtime": 1715.5734, "eval_samples_per_second": 4.41, "eval_steps_per_second": 0.551, "step": 5250 }, { "epoch": 2.8015862524785193, "grad_norm": 0.8782141804695129, "learning_rate": 0.0002266928361138371, "loss": 1.0017, "mean_token_accuracy": 0.8068728642165661, "num_tokens": 173621248.0, "step": 5300 }, { "epoch": 2.8280237937871777, "grad_norm": 0.5077300667762756, "learning_rate": 0.00022396685203358413, "loss": 0.9759, "mean_token_accuracy": 0.8097360721230507, "num_tokens": 175259648.0, "step": 5350 }, { "epoch": 2.854461335095836, "grad_norm": 0.571225643157959, "learning_rate": 0.00022124086795333116, "loss": 0.9693, "mean_token_accuracy": 0.8103903934359551, "num_tokens": 176898048.0, "step": 5400 }, { "epoch": 2.8808988764044945, "grad_norm": 0.9907204508781433, "learning_rate": 0.0002185148838730782, "loss": 0.9783, "mean_token_accuracy": 0.8095271262526512, "num_tokens": 178536448.0, "step": 5450 }, { "epoch": 2.907336417713153, "grad_norm": 1.0461844205856323, "learning_rate": 0.0002157888997928252, "loss": 0.9796, "step": 5500 }, { "epoch": 2.907336417713153, "eval_loss": 1.0164023637771606, "eval_mean_token_accuracy": 0.8078667395462698, "eval_num_tokens": 180174848.0, "eval_runtime": 1713.163, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 5500 }, { "epoch": 2.933773959021811, "grad_norm": 0.5164626240730286, "learning_rate": 0.00021306291571257226, "loss": 0.9813, "mean_token_accuracy": 0.8092189015448094, "num_tokens": 181813248.0, "step": 5550 }, { "epoch": 2.960211500330469, "grad_norm": 0.9014139771461487, "learning_rate": 0.00021033693163231926, "loss": 0.9785, "mean_token_accuracy": 0.808426809310913, "num_tokens": 183451648.0, "step": 5600 }, { "epoch": 2.9866490416391276, "grad_norm": 1.2656482458114624, "learning_rate": 0.0002076109475520663, "loss": 0.9546, "mean_token_accuracy": 0.8128830647468567, "num_tokens": 185090048.0, "step": 5650 }, { "epoch": 3.012690019828156, "grad_norm": 0.6291442513465881, "learning_rate": 0.0002049394831534184, "loss": 0.9425, "mean_token_accuracy": 0.8123012103405095, "num_tokens": 186703872.0, "step": 5700 }, { "epoch": 3.0391275611368145, "grad_norm": 0.9149487614631653, "learning_rate": 0.0002022134990731654, "loss": 0.9036, "step": 5750 }, { "epoch": 3.0391275611368145, "eval_loss": 1.010271668434143, "eval_mean_token_accuracy": 0.8088774525463959, "eval_num_tokens": 188342272.0, "eval_runtime": 1714.8688, "eval_samples_per_second": 4.411, "eval_steps_per_second": 0.552, "step": 5750 }, { "epoch": 3.0655651024454724, "grad_norm": 0.7065662741661072, "learning_rate": 0.00019948751499291245, "loss": 0.9032, "mean_token_accuracy": 0.816273825019598, "num_tokens": 189980672.0, "step": 5800 }, { "epoch": 3.092002643754131, "grad_norm": 0.8670871257781982, "learning_rate": 0.00019676153091265948, "loss": 0.9084, "mean_token_accuracy": 0.8148881956934929, "num_tokens": 191619072.0, "step": 5850 }, { "epoch": 3.118440185062789, "grad_norm": 0.9667902588844299, "learning_rate": 0.0001940355468324065, "loss": 0.8968, "mean_token_accuracy": 0.8168530049920082, "num_tokens": 193257472.0, "step": 5900 }, { "epoch": 3.1448777263714476, "grad_norm": 0.6061888933181763, "learning_rate": 0.00019130956275215352, "loss": 0.9082, "mean_token_accuracy": 0.8162536644935607, "num_tokens": 194895872.0, "step": 5950 }, { "epoch": 3.1713152676801055, "grad_norm": 0.8645080924034119, "learning_rate": 0.00018858357867190058, "loss": 0.9014, "step": 6000 }, { "epoch": 3.1713152676801055, "eval_loss": 1.0060479640960693, "eval_mean_token_accuracy": 0.8095184696275134, "eval_num_tokens": 196534272.0, "eval_runtime": 1716.7924, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.551, "step": 6000 }, { "epoch": 3.197752808988764, "grad_norm": 0.9978011250495911, "learning_rate": 0.00018585759459164758, "loss": 0.8941, "mean_token_accuracy": 0.817662510573864, "num_tokens": 198172672.0, "step": 6050 }, { "epoch": 3.2241903502974223, "grad_norm": 0.610701322555542, "learning_rate": 0.00018313161051139462, "loss": 0.9088, "mean_token_accuracy": 0.8161278122663498, "num_tokens": 199811072.0, "step": 6100 }, { "epoch": 3.2506278916060807, "grad_norm": 0.592491626739502, "learning_rate": 0.00018040562643114165, "loss": 0.9111, "mean_token_accuracy": 0.8152193301916122, "num_tokens": 201449472.0, "step": 6150 }, { "epoch": 3.277065432914739, "grad_norm": 0.5505239367485046, "learning_rate": 0.00017767964235088868, "loss": 0.894, "mean_token_accuracy": 0.8186956241726875, "num_tokens": 203087872.0, "step": 6200 }, { "epoch": 3.303502974223397, "grad_norm": 0.6099046468734741, "learning_rate": 0.0001749536582706357, "loss": 0.9061, "step": 6250 }, { "epoch": 3.303502974223397, "eval_loss": 0.9953573942184448, "eval_mean_token_accuracy": 0.8111508759585294, "eval_num_tokens": 204726272.0, "eval_runtime": 1714.1014, "eval_samples_per_second": 4.413, "eval_steps_per_second": 0.552, "step": 6250 }, { "epoch": 3.3299405155320554, "grad_norm": 0.6866306066513062, "learning_rate": 0.00017222767419038275, "loss": 0.9023, "mean_token_accuracy": 0.8162371690571308, "num_tokens": 206364672.0, "step": 6300 }, { "epoch": 3.3563780568407138, "grad_norm": 0.6457993984222412, "learning_rate": 0.00016950169011012976, "loss": 0.9083, "mean_token_accuracy": 0.8154759269952774, "num_tokens": 208003072.0, "step": 6350 }, { "epoch": 3.382815598149372, "grad_norm": 0.5925601124763489, "learning_rate": 0.0001667757060298768, "loss": 0.9009, "mean_token_accuracy": 0.8165108740329743, "num_tokens": 209641472.0, "step": 6400 }, { "epoch": 3.4092531394580305, "grad_norm": 0.8631545901298523, "learning_rate": 0.00016404972194962382, "loss": 0.8779, "mean_token_accuracy": 0.8210025626420975, "num_tokens": 211279872.0, "step": 6450 }, { "epoch": 3.4356906807666885, "grad_norm": 0.6113960146903992, "learning_rate": 0.00016132373786937086, "loss": 0.8894, "step": 6500 }, { "epoch": 3.4356906807666885, "eval_loss": 0.9821568131446838, "eval_mean_token_accuracy": 0.8126549717613809, "eval_num_tokens": 212918272.0, "eval_runtime": 1714.991, "eval_samples_per_second": 4.411, "eval_steps_per_second": 0.552, "step": 6500 }, { "epoch": 3.462128222075347, "grad_norm": 0.6569721698760986, "learning_rate": 0.00015859775378911786, "loss": 0.8828, "mean_token_accuracy": 0.8195891354978084, "num_tokens": 214556672.0, "step": 6550 }, { "epoch": 3.4885657633840053, "grad_norm": 0.548383891582489, "learning_rate": 0.0001558717697088649, "loss": 0.8922, "mean_token_accuracy": 0.8191379508376122, "num_tokens": 216195072.0, "step": 6600 }, { "epoch": 3.5150033046926636, "grad_norm": 0.4519716799259186, "learning_rate": 0.00015314578562861193, "loss": 0.9047, "mean_token_accuracy": 0.8174547863006592, "num_tokens": 217833472.0, "step": 6650 }, { "epoch": 3.541440846001322, "grad_norm": 0.4486851692199707, "learning_rate": 0.00015041980154835896, "loss": 0.8812, "mean_token_accuracy": 0.8206359946727753, "num_tokens": 219471872.0, "step": 6700 }, { "epoch": 3.56787838730998, "grad_norm": 0.511616587638855, "learning_rate": 0.00014769381746810597, "loss": 0.8754, "step": 6750 }, { "epoch": 3.56787838730998, "eval_loss": 0.9736062288284302, "eval_mean_token_accuracy": 0.8140331042997428, "eval_num_tokens": 221110272.0, "eval_runtime": 1716.1615, "eval_samples_per_second": 4.408, "eval_steps_per_second": 0.551, "step": 6750 }, { "epoch": 3.5943159286186384, "grad_norm": 0.4776919186115265, "learning_rate": 0.00014496783338785303, "loss": 0.8959, "mean_token_accuracy": 0.8200723953545094, "num_tokens": 222748672.0, "step": 6800 }, { "epoch": 3.6207534699272967, "grad_norm": 0.6696462035179138, "learning_rate": 0.00014224184930760003, "loss": 0.8818, "mean_token_accuracy": 0.8205584043264389, "num_tokens": 224387072.0, "step": 6850 }, { "epoch": 3.647191011235955, "grad_norm": 0.6305286884307861, "learning_rate": 0.00013951586522734707, "loss": 0.8918, "mean_token_accuracy": 0.8183388301730156, "num_tokens": 226025472.0, "step": 6900 }, { "epoch": 3.6736285525446135, "grad_norm": 0.4481205344200134, "learning_rate": 0.0001367898811470941, "loss": 0.8689, "mean_token_accuracy": 0.822255617082119, "num_tokens": 227663872.0, "step": 6950 }, { "epoch": 3.7000660938532715, "grad_norm": 0.5297748446464539, "learning_rate": 0.00013406389706684113, "loss": 0.891, "step": 7000 }, { "epoch": 3.7000660938532715, "eval_loss": 0.9645546078681946, "eval_mean_token_accuracy": 0.8154595721725681, "eval_num_tokens": 229302272.0, "eval_runtime": 1714.7844, "eval_samples_per_second": 4.412, "eval_steps_per_second": 0.552, "step": 7000 }, { "epoch": 3.72650363516193, "grad_norm": 0.5066333413124084, "learning_rate": 0.00013133791298658814, "loss": 0.8888, "mean_token_accuracy": 0.8194779419898987, "num_tokens": 230940672.0, "step": 7050 }, { "epoch": 3.7529411764705882, "grad_norm": 0.5374875068664551, "learning_rate": 0.0001286119289063352, "loss": 0.8668, "mean_token_accuracy": 0.8228415179252625, "num_tokens": 232579072.0, "step": 7100 }, { "epoch": 3.7793787177792466, "grad_norm": 0.45081761479377747, "learning_rate": 0.0001258859448260822, "loss": 0.8793, "mean_token_accuracy": 0.8205449622869492, "num_tokens": 234217472.0, "step": 7150 }, { "epoch": 3.805816259087905, "grad_norm": 0.4918268620967865, "learning_rate": 0.00012315996074582924, "loss": 0.8733, "mean_token_accuracy": 0.8207551288604736, "num_tokens": 235855872.0, "step": 7200 }, { "epoch": 3.832253800396563, "grad_norm": 0.5663712024688721, "learning_rate": 0.00012043397666557627, "loss": 0.8701, "step": 7250 }, { "epoch": 3.832253800396563, "eval_loss": 0.9548874497413635, "eval_mean_token_accuracy": 0.8167319189418446, "eval_num_tokens": 237494272.0, "eval_runtime": 1714.4512, "eval_samples_per_second": 4.412, "eval_steps_per_second": 0.552, "step": 7250 }, { "epoch": 3.8586913417052213, "grad_norm": 1.051850438117981, "learning_rate": 0.0001177079925853233, "loss": 0.8694, "mean_token_accuracy": 0.8223982758820056, "num_tokens": 239132672.0, "step": 7300 }, { "epoch": 3.8851288830138797, "grad_norm": 0.4363590478897095, "learning_rate": 0.00011498200850507034, "loss": 0.8864, "mean_token_accuracy": 0.8197085753083229, "num_tokens": 240771072.0, "step": 7350 }, { "epoch": 3.911566424322538, "grad_norm": 1.5718705654144287, "learning_rate": 0.00011225602442481736, "loss": 0.8714, "mean_token_accuracy": 0.8220063516497612, "num_tokens": 242409472.0, "step": 7400 }, { "epoch": 3.9380039656311965, "grad_norm": 2.0182573795318604, "learning_rate": 0.00010953004034456439, "loss": 0.8666, "mean_token_accuracy": 0.8229606547951698, "num_tokens": 244047872.0, "step": 7450 }, { "epoch": 3.9644415069398544, "grad_norm": 0.5944796800613403, "learning_rate": 0.00010680405626431142, "loss": 0.8553, "step": 7500 }, { "epoch": 3.9644415069398544, "eval_loss": 0.9463370442390442, "eval_mean_token_accuracy": 0.8175775335026594, "eval_num_tokens": 245686272.0, "eval_runtime": 1716.9243, "eval_samples_per_second": 4.406, "eval_steps_per_second": 0.551, "step": 7500 }, { "epoch": 3.990879048248513, "grad_norm": 0.39286720752716064, "learning_rate": 0.00010407807218405844, "loss": 0.8742, "mean_token_accuracy": 0.8231118628382683, "num_tokens": 247324672.0, "step": 7550 }, { "epoch": 4.016920026437541, "grad_norm": 0.485441118478775, "learning_rate": 0.00010135208810380548, "loss": 0.8161, "mean_token_accuracy": 0.8290703360199323, "num_tokens": 248938496.0, "step": 7600 }, { "epoch": 4.0433575677462, "grad_norm": 0.6562045216560364, "learning_rate": 9.86261040235525e-05, "loss": 0.7963, "mean_token_accuracy": 0.829828929901123, "num_tokens": 250576896.0, "step": 7650 }, { "epoch": 4.069795109054858, "grad_norm": 0.4619589149951935, "learning_rate": 9.590011994329953e-05, "loss": 0.7968, "mean_token_accuracy": 0.829875974059105, "num_tokens": 252215296.0, "step": 7700 }, { "epoch": 4.0962326503635165, "grad_norm": 0.5542292594909668, "learning_rate": 9.317413586304656e-05, "loss": 0.7897, "step": 7750 }, { "epoch": 4.0962326503635165, "eval_loss": 0.9498882293701172, "eval_mean_token_accuracy": 0.8178022539136778, "eval_num_tokens": 253853696.0, "eval_runtime": 1713.64, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 7750 }, { "epoch": 4.122670191672174, "grad_norm": 0.40135377645492554, "learning_rate": 9.044815178279358e-05, "loss": 0.7963, "mean_token_accuracy": 0.8303173841536045, "num_tokens": 255492096.0, "step": 7800 }, { "epoch": 4.149107732980832, "grad_norm": 0.39157313108444214, "learning_rate": 8.772216770254061e-05, "loss": 0.7846, "mean_token_accuracy": 0.8317717489600182, "num_tokens": 257130496.0, "step": 7850 }, { "epoch": 4.175545274289491, "grad_norm": 0.4963982105255127, "learning_rate": 8.499618362228765e-05, "loss": 0.8069, "mean_token_accuracy": 0.8283883157372475, "num_tokens": 258768896.0, "step": 7900 }, { "epoch": 4.201982815598149, "grad_norm": 0.3942487835884094, "learning_rate": 8.227019954203467e-05, "loss": 0.803, "mean_token_accuracy": 0.8284738489985466, "num_tokens": 260407296.0, "step": 7950 }, { "epoch": 4.228420356906808, "grad_norm": 0.37145310640335083, "learning_rate": 7.95442154617817e-05, "loss": 0.805, "step": 8000 }, { "epoch": 4.228420356906808, "eval_loss": 0.9421485066413879, "eval_mean_token_accuracy": 0.8191204303540841, "eval_num_tokens": 262045696.0, "eval_runtime": 1716.291, "eval_samples_per_second": 4.408, "eval_steps_per_second": 0.551, "step": 8000 }, { "epoch": 4.254857898215466, "grad_norm": 0.3017653524875641, "learning_rate": 7.681823138152873e-05, "loss": 0.796, "mean_token_accuracy": 0.8288511091470718, "num_tokens": 263684096.0, "step": 8050 }, { "epoch": 4.281295439524124, "grad_norm": 0.4065409004688263, "learning_rate": 7.409224730127575e-05, "loss": 0.7812, "mean_token_accuracy": 0.8326673975586891, "num_tokens": 265322496.0, "step": 8100 }, { "epoch": 4.307732980832783, "grad_norm": 0.33838245272636414, "learning_rate": 7.136626322102279e-05, "loss": 0.7868, "mean_token_accuracy": 0.8319782489538192, "num_tokens": 266960896.0, "step": 8150 }, { "epoch": 4.334170522141441, "grad_norm": 0.39351001381874084, "learning_rate": 6.864027914076983e-05, "loss": 0.7891, "mean_token_accuracy": 0.8313220903277397, "num_tokens": 268599296.0, "step": 8200 }, { "epoch": 4.360608063450099, "grad_norm": 0.3555977940559387, "learning_rate": 6.591429506051685e-05, "loss": 0.7858, "step": 8250 }, { "epoch": 4.360608063450099, "eval_loss": 0.9359485507011414, "eval_mean_token_accuracy": 0.8200258982483983, "eval_num_tokens": 270237696.0, "eval_runtime": 1713.1543, "eval_samples_per_second": 4.416, "eval_steps_per_second": 0.552, "step": 8250 }, { "epoch": 4.387045604758757, "grad_norm": 0.34437137842178345, "learning_rate": 6.318831098026388e-05, "loss": 0.7825, "mean_token_accuracy": 0.8322449275851249, "num_tokens": 271876096.0, "step": 8300 }, { "epoch": 4.413483146067415, "grad_norm": 0.37046581506729126, "learning_rate": 6.0462326900010904e-05, "loss": 0.7936, "mean_token_accuracy": 0.8298429843783378, "num_tokens": 273514496.0, "step": 8350 }, { "epoch": 4.439920687376074, "grad_norm": 0.44303834438323975, "learning_rate": 5.773634281975793e-05, "loss": 0.7891, "mean_token_accuracy": 0.8311491903662681, "num_tokens": 275152896.0, "step": 8400 }, { "epoch": 4.466358228684732, "grad_norm": 0.382201611995697, "learning_rate": 5.5010358739504963e-05, "loss": 0.7781, "mean_token_accuracy": 0.8330895602703094, "num_tokens": 276791296.0, "step": 8450 }, { "epoch": 4.492795769993391, "grad_norm": 0.39989522099494934, "learning_rate": 5.228437465925199e-05, "loss": 0.7877, "step": 8500 }, { "epoch": 4.492795769993391, "eval_loss": 0.9300816059112549, "eval_mean_token_accuracy": 0.8209756191890789, "eval_num_tokens": 278429696.0, "eval_runtime": 1713.4373, "eval_samples_per_second": 4.415, "eval_steps_per_second": 0.552, "step": 8500 }, { "epoch": 4.519233311302049, "grad_norm": 0.3832317590713501, "learning_rate": 4.9558390578999016e-05, "loss": 0.7782, "mean_token_accuracy": 0.8327266594767571, "num_tokens": 280068096.0, "step": 8550 }, { "epoch": 4.545670852610707, "grad_norm": 0.4667583703994751, "learning_rate": 4.683240649874604e-05, "loss": 0.7849, "mean_token_accuracy": 0.8322770014405251, "num_tokens": 281706496.0, "step": 8600 }, { "epoch": 4.572108393919366, "grad_norm": 0.3977579176425934, "learning_rate": 4.4106422418493076e-05, "loss": 0.786, "mean_token_accuracy": 0.83141067892313, "num_tokens": 283344896.0, "step": 8650 }, { "epoch": 4.598545935228024, "grad_norm": 0.38620129227638245, "learning_rate": 4.138043833824011e-05, "loss": 0.7865, "mean_token_accuracy": 0.8317729702591896, "num_tokens": 284983296.0, "step": 8700 }, { "epoch": 4.624983476536682, "grad_norm": 0.3608716130256653, "learning_rate": 3.8654454257987135e-05, "loss": 0.7905, "step": 8750 }, { "epoch": 4.624983476536682, "eval_loss": 0.9239566922187805, "eval_mean_token_accuracy": 0.8218199411607948, "eval_num_tokens": 286621696.0, "eval_runtime": 1715.401, "eval_samples_per_second": 4.41, "eval_steps_per_second": 0.551, "step": 8750 }, { "epoch": 4.65142101784534, "grad_norm": 0.30198875069618225, "learning_rate": 3.592847017773417e-05, "loss": 0.7672, "mean_token_accuracy": 0.8327672865986824, "num_tokens": 288260096.0, "step": 8800 }, { "epoch": 4.677858559153998, "grad_norm": 0.3867688477039337, "learning_rate": 3.3202486097481194e-05, "loss": 0.7728, "mean_token_accuracy": 0.8339509972929955, "num_tokens": 289898496.0, "step": 8850 }, { "epoch": 4.704296100462657, "grad_norm": 0.38331055641174316, "learning_rate": 3.0476502017228217e-05, "loss": 0.7801, "mean_token_accuracy": 0.8332563516497612, "num_tokens": 291536896.0, "step": 8900 }, { "epoch": 4.730733641771315, "grad_norm": 0.32032325863838196, "learning_rate": 2.775051793697525e-05, "loss": 0.7896, "mean_token_accuracy": 0.8309176415205002, "num_tokens": 293175296.0, "step": 8950 }, { "epoch": 4.757171183079974, "grad_norm": 0.41111549735069275, "learning_rate": 2.502453385672228e-05, "loss": 0.7691, "step": 9000 }, { "epoch": 4.757171183079974, "eval_loss": 0.9195617437362671, "eval_mean_token_accuracy": 0.8225174557583025, "eval_num_tokens": 294813696.0, "eval_runtime": 1712.3817, "eval_samples_per_second": 4.418, "eval_steps_per_second": 0.552, "step": 9000 } ], "logging_steps": 50, "max_steps": 9455, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 470934104309760.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }