| { |
| "best_global_step": null, |
| "best_metric": 0.8265531063079834, |
| "best_model_checkpoint": null, |
| "epoch": 3.7608367478973475, |
| "eval_steps": 1, |
| "global_step": 244, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015527280569333621, |
| "grad_norm": 388.60345458984375, |
| "learning_rate": 0.0002, |
| "loss": 235.3557, |
| "mean_token_accuracy": 0.6456618151730962, |
| "num_tokens": 206114.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.015527280569333621, |
| "eval_accuracy": 0.8821729421615601, |
| "eval_loss": 1.297731637954712, |
| "eval_mean_token_accuracy": 0.6854841719239445, |
| "eval_num_tokens": 206114.0, |
| "eval_runtime": 517.9367, |
| "eval_samples_per_second": 0.226, |
| "eval_self_calculate_token_accuracy": 0.6851710081100464, |
| "eval_steps_per_second": 0.114, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.031054561138667242, |
| "grad_norm": 244.1171875, |
| "learning_rate": 0.0002, |
| "loss": 196.5053, |
| "mean_token_accuracy": 0.6811893607179323, |
| "num_tokens": 413337.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.031054561138667242, |
| "eval_accuracy": 0.8948897123336792, |
| "eval_loss": 1.1358916759490967, |
| "eval_mean_token_accuracy": 0.7127781692197768, |
| "eval_num_tokens": 413337.0, |
| "eval_runtime": 478.3922, |
| "eval_samples_per_second": 0.245, |
| "eval_self_calculate_token_accuracy": 0.7122454047203064, |
| "eval_steps_per_second": 0.123, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.046581841708000865, |
| "grad_norm": 184.6853790283203, |
| "learning_rate": 0.0002, |
| "loss": 166.5832, |
| "mean_token_accuracy": 0.7129071710838212, |
| "num_tokens": 620427.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.046581841708000865, |
| "eval_accuracy": 0.8886107206344604, |
| "eval_loss": 1.0528955459594727, |
| "eval_mean_token_accuracy": 0.726955561314599, |
| "eval_num_tokens": 620427.0, |
| "eval_runtime": 512.5693, |
| "eval_samples_per_second": 0.228, |
| "eval_self_calculate_token_accuracy": 0.7270154356956482, |
| "eval_steps_per_second": 0.115, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.062109122277334484, |
| "grad_norm": 226.86062622070312, |
| "learning_rate": 0.0002, |
| "loss": 151.0703, |
| "mean_token_accuracy": 0.7337513185209699, |
| "num_tokens": 828830.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.062109122277334484, |
| "eval_accuracy": 0.8933614492416382, |
| "eval_loss": 0.9952864646911621, |
| "eval_mean_token_accuracy": 0.7376000901400033, |
| "eval_num_tokens": 828830.0, |
| "eval_runtime": 529.3297, |
| "eval_samples_per_second": 0.221, |
| "eval_self_calculate_token_accuracy": 0.7374235987663269, |
| "eval_steps_per_second": 0.111, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0776364028466681, |
| "grad_norm": 185.3592071533203, |
| "learning_rate": 0.0002, |
| "loss": 138.0182, |
| "mean_token_accuracy": 0.748477922545539, |
| "num_tokens": 1036062.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0776364028466681, |
| "eval_accuracy": 0.8940512537956238, |
| "eval_loss": 0.9435210824012756, |
| "eval_mean_token_accuracy": 0.7477744332814621, |
| "eval_num_tokens": 1036062.0, |
| "eval_runtime": 536.6396, |
| "eval_samples_per_second": 0.218, |
| "eval_self_calculate_token_accuracy": 0.7476144433021545, |
| "eval_steps_per_second": 0.11, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.09316368341600173, |
| "grad_norm": 133.414794921875, |
| "learning_rate": 0.0002, |
| "loss": 129.527, |
| "mean_token_accuracy": 0.7577708040674528, |
| "num_tokens": 1242430.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.09316368341600173, |
| "eval_accuracy": 0.8943483233451843, |
| "eval_loss": 0.902610719203949, |
| "eval_mean_token_accuracy": 0.7560684579913899, |
| "eval_num_tokens": 1242430.0, |
| "eval_runtime": 514.2372, |
| "eval_samples_per_second": 0.228, |
| "eval_self_calculate_token_accuracy": 0.7558857202529907, |
| "eval_steps_per_second": 0.115, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.10869096398533534, |
| "grad_norm": 129.9571990966797, |
| "learning_rate": 0.0002, |
| "loss": 123.6458, |
| "mean_token_accuracy": 0.767652340233326, |
| "num_tokens": 1450526.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.10869096398533534, |
| "eval_accuracy": 0.8973140716552734, |
| "eval_loss": 0.8669535517692566, |
| "eval_mean_token_accuracy": 0.7622208665993254, |
| "eval_num_tokens": 1450526.0, |
| "eval_runtime": 472.6248, |
| "eval_samples_per_second": 0.248, |
| "eval_self_calculate_token_accuracy": 0.7623353600502014, |
| "eval_steps_per_second": 0.125, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.12421824455466897, |
| "grad_norm": 115.03675842285156, |
| "learning_rate": 0.0002, |
| "loss": 115.3756, |
| "mean_token_accuracy": 0.7793781318598323, |
| "num_tokens": 1658442.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.12421824455466897, |
| "eval_accuracy": 0.8980473279953003, |
| "eval_loss": 0.8377227783203125, |
| "eval_mean_token_accuracy": 0.7683570415286695, |
| "eval_num_tokens": 1658442.0, |
| "eval_runtime": 572.1487, |
| "eval_samples_per_second": 0.204, |
| "eval_self_calculate_token_accuracy": 0.7683823108673096, |
| "eval_steps_per_second": 0.103, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.1397455251240026, |
| "grad_norm": 134.6886444091797, |
| "learning_rate": 0.0002, |
| "loss": 114.2794, |
| "mean_token_accuracy": 0.778469916847017, |
| "num_tokens": 1865586.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.1397455251240026, |
| "eval_accuracy": 0.8952139616012573, |
| "eval_loss": 0.8179730176925659, |
| "eval_mean_token_accuracy": 0.7713285038026713, |
| "eval_num_tokens": 1865586.0, |
| "eval_runtime": 513.0781, |
| "eval_samples_per_second": 0.228, |
| "eval_self_calculate_token_accuracy": 0.7713114619255066, |
| "eval_steps_per_second": 0.115, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.1552728056933362, |
| "grad_norm": 108.02084350585938, |
| "learning_rate": 0.0002, |
| "loss": 110.487, |
| "mean_token_accuracy": 0.7797972386082014, |
| "num_tokens": 2072774.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.1552728056933362, |
| "eval_accuracy": 0.8961661458015442, |
| "eval_loss": 0.8034513592720032, |
| "eval_mean_token_accuracy": 0.7742107955075926, |
| "eval_num_tokens": 2072774.0, |
| "eval_runtime": 553.1262, |
| "eval_samples_per_second": 0.212, |
| "eval_self_calculate_token_accuracy": 0.7741386890411377, |
| "eval_steps_per_second": 0.107, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.17080008626266982, |
| "grad_norm": 102.06855773925781, |
| "learning_rate": 0.0002, |
| "loss": 104.7562, |
| "mean_token_accuracy": 0.7928152316146426, |
| "num_tokens": 2279763.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.17080008626266982, |
| "eval_accuracy": 0.901638925075531, |
| "eval_loss": 0.7883971333503723, |
| "eval_mean_token_accuracy": 0.7770755493034751, |
| "eval_num_tokens": 2279763.0, |
| "eval_runtime": 496.6288, |
| "eval_samples_per_second": 0.236, |
| "eval_self_calculate_token_accuracy": 0.7770423293113708, |
| "eval_steps_per_second": 0.119, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.18632736683200346, |
| "grad_norm": 94.77603912353516, |
| "learning_rate": 0.0002, |
| "loss": 106.2583, |
| "mean_token_accuracy": 0.7874543368816376, |
| "num_tokens": 2486212.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.18632736683200346, |
| "eval_accuracy": 0.8989700675010681, |
| "eval_loss": 0.775090754032135, |
| "eval_mean_token_accuracy": 0.7787025055642856, |
| "eval_num_tokens": 2486212.0, |
| "eval_runtime": 524.0398, |
| "eval_samples_per_second": 0.223, |
| "eval_self_calculate_token_accuracy": 0.7788378000259399, |
| "eval_steps_per_second": 0.113, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.20185464740133707, |
| "grad_norm": 87.69193267822266, |
| "learning_rate": 0.0002, |
| "loss": 99.7778, |
| "mean_token_accuracy": 0.8001320113738378, |
| "num_tokens": 2693227.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.20185464740133707, |
| "eval_accuracy": 0.9033994674682617, |
| "eval_loss": 0.7654978036880493, |
| "eval_mean_token_accuracy": 0.7799155146388684, |
| "eval_num_tokens": 2693227.0, |
| "eval_runtime": 525.2025, |
| "eval_samples_per_second": 0.223, |
| "eval_self_calculate_token_accuracy": 0.7803354859352112, |
| "eval_steps_per_second": 0.112, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.21738192797067069, |
| "grad_norm": 84.79354858398438, |
| "learning_rate": 0.0002, |
| "loss": 99.6628, |
| "mean_token_accuracy": 0.7965838453835912, |
| "num_tokens": 2900199.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.21738192797067069, |
| "eval_accuracy": 0.9020536541938782, |
| "eval_loss": 0.7549988031387329, |
| "eval_mean_token_accuracy": 0.7810374003345684, |
| "eval_num_tokens": 2900199.0, |
| "eval_runtime": 547.9186, |
| "eval_samples_per_second": 0.214, |
| "eval_self_calculate_token_accuracy": 0.781562089920044, |
| "eval_steps_per_second": 0.108, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.23290920854000433, |
| "grad_norm": 80.01195526123047, |
| "learning_rate": 0.0002, |
| "loss": 95.2465, |
| "mean_token_accuracy": 0.8037065060602294, |
| "num_tokens": 3107590.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.23290920854000433, |
| "eval_accuracy": 0.905937135219574, |
| "eval_loss": 0.7444906830787659, |
| "eval_mean_token_accuracy": 0.7849317597130597, |
| "eval_num_tokens": 3107590.0, |
| "eval_runtime": 537.9629, |
| "eval_samples_per_second": 0.217, |
| "eval_self_calculate_token_accuracy": 0.7851163148880005, |
| "eval_steps_per_second": 0.11, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.24843648910933794, |
| "grad_norm": 92.70500946044922, |
| "learning_rate": 0.0002, |
| "loss": 93.99, |
| "mean_token_accuracy": 0.8052933307157623, |
| "num_tokens": 3314109.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.24843648910933794, |
| "eval_accuracy": 0.9052315354347229, |
| "eval_loss": 0.735614538192749, |
| "eval_mean_token_accuracy": 0.7869909334990938, |
| "eval_num_tokens": 3314109.0, |
| "eval_runtime": 543.538, |
| "eval_samples_per_second": 0.215, |
| "eval_self_calculate_token_accuracy": 0.7872579097747803, |
| "eval_steps_per_second": 0.109, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2639637696786716, |
| "grad_norm": 81.36389923095703, |
| "learning_rate": 0.0002, |
| "loss": 96.111, |
| "mean_token_accuracy": 0.8035745413766967, |
| "num_tokens": 3521909.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.2639637696786716, |
| "eval_accuracy": 0.9038390517234802, |
| "eval_loss": 0.7300280332565308, |
| "eval_mean_token_accuracy": 0.7879955313973508, |
| "eval_num_tokens": 3521909.0, |
| "eval_runtime": 525.4894, |
| "eval_samples_per_second": 0.223, |
| "eval_self_calculate_token_accuracy": 0.7884399890899658, |
| "eval_steps_per_second": 0.112, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.2794910502480052, |
| "grad_norm": 79.87276458740234, |
| "learning_rate": 0.0002, |
| "loss": 91.7171, |
| "mean_token_accuracy": 0.808049873345428, |
| "num_tokens": 3728619.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2794910502480052, |
| "eval_accuracy": 0.908112645149231, |
| "eval_loss": 0.7197296023368835, |
| "eval_mean_token_accuracy": 0.7914570246712637, |
| "eval_num_tokens": 3728619.0, |
| "eval_runtime": 485.4509, |
| "eval_samples_per_second": 0.241, |
| "eval_self_calculate_token_accuracy": 0.791652262210846, |
| "eval_steps_per_second": 0.122, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.2950183308173388, |
| "grad_norm": 73.84920501708984, |
| "learning_rate": 0.0002, |
| "loss": 92.7451, |
| "mean_token_accuracy": 0.8076880541112688, |
| "num_tokens": 3935669.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.2950183308173388, |
| "eval_accuracy": 0.9082204699516296, |
| "eval_loss": 0.7118967771530151, |
| "eval_mean_token_accuracy": 0.7932763655306929, |
| "eval_num_tokens": 3935669.0, |
| "eval_runtime": 419.5804, |
| "eval_samples_per_second": 0.279, |
| "eval_self_calculate_token_accuracy": 0.7933873534202576, |
| "eval_steps_per_second": 0.141, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3105456113866724, |
| "grad_norm": 74.23236846923828, |
| "learning_rate": 0.0002, |
| "loss": 88.3248, |
| "mean_token_accuracy": 0.815578955743048, |
| "num_tokens": 4142234.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.3105456113866724, |
| "eval_accuracy": 0.9096711277961731, |
| "eval_loss": 0.7069401144981384, |
| "eval_mean_token_accuracy": 0.7935693213495157, |
| "eval_num_tokens": 4142234.0, |
| "eval_runtime": 494.6358, |
| "eval_samples_per_second": 0.237, |
| "eval_self_calculate_token_accuracy": 0.7936540842056274, |
| "eval_steps_per_second": 0.119, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.32607289195600603, |
| "grad_norm": 68.7576904296875, |
| "learning_rate": 0.0002, |
| "loss": 92.0735, |
| "mean_token_accuracy": 0.8079203971558147, |
| "num_tokens": 4349997.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.32607289195600603, |
| "eval_accuracy": 0.9083203077316284, |
| "eval_loss": 0.7048131227493286, |
| "eval_mean_token_accuracy": 0.7935696102805057, |
| "eval_num_tokens": 4349997.0, |
| "eval_runtime": 379.1616, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.7936612367630005, |
| "eval_steps_per_second": 0.156, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.34160017252533964, |
| "grad_norm": 88.72570037841797, |
| "learning_rate": 0.0002, |
| "loss": 85.4505, |
| "mean_token_accuracy": 0.8195419386029243, |
| "num_tokens": 4556703.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.34160017252533964, |
| "eval_accuracy": 0.9117487072944641, |
| "eval_loss": 0.69442218542099, |
| "eval_mean_token_accuracy": 0.7948849878068698, |
| "eval_num_tokens": 4556703.0, |
| "eval_runtime": 466.9576, |
| "eval_samples_per_second": 0.251, |
| "eval_self_calculate_token_accuracy": 0.7950271964073181, |
| "eval_steps_per_second": 0.126, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.35712745309467325, |
| "grad_norm": 74.33305358886719, |
| "learning_rate": 0.0002, |
| "loss": 83.5596, |
| "mean_token_accuracy": 0.8205890291266971, |
| "num_tokens": 4764298.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.35712745309467325, |
| "eval_accuracy": 0.9097867012023926, |
| "eval_loss": 0.6916980147361755, |
| "eval_mean_token_accuracy": 0.7952647532446909, |
| "eval_num_tokens": 4764298.0, |
| "eval_runtime": 380.0077, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.7953569293022156, |
| "eval_steps_per_second": 0.155, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.3726547336640069, |
| "grad_norm": 76.8690185546875, |
| "learning_rate": 0.0002, |
| "loss": 85.3574, |
| "mean_token_accuracy": 0.8177222915821605, |
| "num_tokens": 4972160.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.3726547336640069, |
| "eval_accuracy": 0.9135264754295349, |
| "eval_loss": 0.6932325959205627, |
| "eval_mean_token_accuracy": 0.79371998370704, |
| "eval_num_tokens": 4972160.0, |
| "eval_runtime": 378.4894, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.7938908338546753, |
| "eval_steps_per_second": 0.156, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.38818201423334053, |
| "grad_norm": 75.3279037475586, |
| "learning_rate": 0.0002, |
| "loss": 87.6029, |
| "mean_token_accuracy": 0.8149439800116751, |
| "num_tokens": 5179076.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.38818201423334053, |
| "eval_accuracy": 0.914771556854248, |
| "eval_loss": 0.6928118467330933, |
| "eval_mean_token_accuracy": 0.7945833317304062, |
| "eval_num_tokens": 5179076.0, |
| "eval_runtime": 477.2773, |
| "eval_samples_per_second": 0.245, |
| "eval_self_calculate_token_accuracy": 0.7949104905128479, |
| "eval_steps_per_second": 0.124, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.40370929480267415, |
| "grad_norm": 75.90360260009766, |
| "learning_rate": 0.0002, |
| "loss": 87.4446, |
| "mean_token_accuracy": 0.8141942860351669, |
| "num_tokens": 5387348.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.40370929480267415, |
| "eval_accuracy": 0.915110170841217, |
| "eval_loss": 0.6823413968086243, |
| "eval_mean_token_accuracy": 0.7983418076725329, |
| "eval_num_tokens": 5387348.0, |
| "eval_runtime": 399.7904, |
| "eval_samples_per_second": 0.293, |
| "eval_self_calculate_token_accuracy": 0.798822820186615, |
| "eval_steps_per_second": 0.148, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.41923657537200776, |
| "grad_norm": 64.5274658203125, |
| "learning_rate": 0.0002, |
| "loss": 83.6138, |
| "mean_token_accuracy": 0.8214054538144006, |
| "num_tokens": 5594491.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.41923657537200776, |
| "eval_accuracy": 0.9166258573532104, |
| "eval_loss": 0.6738327741622925, |
| "eval_mean_token_accuracy": 0.7991620433532586, |
| "eval_num_tokens": 5594491.0, |
| "eval_runtime": 377.1645, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.7997329831123352, |
| "eval_steps_per_second": 0.156, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.43476385594134137, |
| "grad_norm": 76.20535278320312, |
| "learning_rate": 0.0002, |
| "loss": 84.517, |
| "mean_token_accuracy": 0.8191360524959035, |
| "num_tokens": 5801548.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.43476385594134137, |
| "eval_accuracy": 0.9158521294593811, |
| "eval_loss": 0.6709964871406555, |
| "eval_mean_token_accuracy": 0.7998738773798538, |
| "eval_num_tokens": 5801548.0, |
| "eval_runtime": 534.2722, |
| "eval_samples_per_second": 0.219, |
| "eval_self_calculate_token_accuracy": 0.8004175424575806, |
| "eval_steps_per_second": 0.11, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.450291136510675, |
| "grad_norm": 68.24972534179688, |
| "learning_rate": 0.0002, |
| "loss": 82.3459, |
| "mean_token_accuracy": 0.820239733490679, |
| "num_tokens": 6008761.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.450291136510675, |
| "eval_accuracy": 0.9137104153633118, |
| "eval_loss": 0.669684648513794, |
| "eval_mean_token_accuracy": 0.8005983486013898, |
| "eval_num_tokens": 6008761.0, |
| "eval_runtime": 401.1997, |
| "eval_samples_per_second": 0.292, |
| "eval_self_calculate_token_accuracy": 0.8010259866714478, |
| "eval_steps_per_second": 0.147, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.46581841708000865, |
| "grad_norm": 66.7740249633789, |
| "learning_rate": 0.0002, |
| "loss": 80.8085, |
| "mean_token_accuracy": 0.8230568650696013, |
| "num_tokens": 6216345.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.46581841708000865, |
| "eval_accuracy": 0.911504864692688, |
| "eval_loss": 0.6657170057296753, |
| "eval_mean_token_accuracy": 0.800199227817988, |
| "eval_num_tokens": 6216345.0, |
| "eval_runtime": 417.3271, |
| "eval_samples_per_second": 0.28, |
| "eval_self_calculate_token_accuracy": 0.8006454706192017, |
| "eval_steps_per_second": 0.141, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.48134569764934226, |
| "grad_norm": 64.8695297241211, |
| "learning_rate": 0.0002, |
| "loss": 81.244, |
| "mean_token_accuracy": 0.8241648533278041, |
| "num_tokens": 6422649.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.48134569764934226, |
| "eval_accuracy": 0.9120697379112244, |
| "eval_loss": 0.6641549468040466, |
| "eval_mean_token_accuracy": 0.8009569008471602, |
| "eval_num_tokens": 6422649.0, |
| "eval_runtime": 448.2593, |
| "eval_samples_per_second": 0.261, |
| "eval_self_calculate_token_accuracy": 0.801353931427002, |
| "eval_steps_per_second": 0.132, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.4968729782186759, |
| "grad_norm": 69.34745788574219, |
| "learning_rate": 0.0002, |
| "loss": 81.9341, |
| "mean_token_accuracy": 0.8242515631847911, |
| "num_tokens": 6629671.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.4968729782186759, |
| "eval_accuracy": 0.9109794497489929, |
| "eval_loss": 0.6608865857124329, |
| "eval_mean_token_accuracy": 0.8022122938754195, |
| "eval_num_tokens": 6629671.0, |
| "eval_runtime": 454.8004, |
| "eval_samples_per_second": 0.257, |
| "eval_self_calculate_token_accuracy": 0.8026918768882751, |
| "eval_steps_per_second": 0.13, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.5124002587880095, |
| "grad_norm": 74.86833190917969, |
| "learning_rate": 0.0002, |
| "loss": 80.0734, |
| "mean_token_accuracy": 0.8248270741767354, |
| "num_tokens": 6836475.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.5124002587880095, |
| "eval_accuracy": 0.9121805429458618, |
| "eval_loss": 0.6564350724220276, |
| "eval_mean_token_accuracy": 0.8043287487353309, |
| "eval_num_tokens": 6836475.0, |
| "eval_runtime": 495.3559, |
| "eval_samples_per_second": 0.236, |
| "eval_self_calculate_token_accuracy": 0.8046467900276184, |
| "eval_steps_per_second": 0.119, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.5279275393573432, |
| "grad_norm": 65.61771392822266, |
| "learning_rate": 0.0002, |
| "loss": 79.5041, |
| "mean_token_accuracy": 0.8282077461481094, |
| "num_tokens": 7044809.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.5279275393573432, |
| "eval_accuracy": 0.9117639660835266, |
| "eval_loss": 0.6553068161010742, |
| "eval_mean_token_accuracy": 0.8040643940537663, |
| "eval_num_tokens": 7044809.0, |
| "eval_runtime": 494.081, |
| "eval_samples_per_second": 0.237, |
| "eval_self_calculate_token_accuracy": 0.8043231964111328, |
| "eval_steps_per_second": 0.119, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.5434548199266768, |
| "grad_norm": 67.1403579711914, |
| "learning_rate": 0.0002, |
| "loss": 79.2852, |
| "mean_token_accuracy": 0.8265329235129886, |
| "num_tokens": 7252918.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5434548199266768, |
| "eval_accuracy": 0.9122189283370972, |
| "eval_loss": 0.6555678248405457, |
| "eval_mean_token_accuracy": 0.803064175581528, |
| "eval_num_tokens": 7252918.0, |
| "eval_runtime": 527.5452, |
| "eval_samples_per_second": 0.222, |
| "eval_self_calculate_token_accuracy": 0.8032756447792053, |
| "eval_steps_per_second": 0.112, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5589821004960104, |
| "grad_norm": 66.02539825439453, |
| "learning_rate": 0.0002, |
| "loss": 82.1148, |
| "mean_token_accuracy": 0.8243914561139213, |
| "num_tokens": 7460461.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.5589821004960104, |
| "eval_accuracy": 0.9112619757652283, |
| "eval_loss": 0.6514995098114014, |
| "eval_mean_token_accuracy": 0.8033366799354553, |
| "eval_num_tokens": 7460461.0, |
| "eval_runtime": 378.9282, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8037459254264832, |
| "eval_steps_per_second": 0.156, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.574509381065344, |
| "grad_norm": 63.525108337402344, |
| "learning_rate": 0.0002, |
| "loss": 79.0236, |
| "mean_token_accuracy": 0.82805245452457, |
| "num_tokens": 7667020.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.574509381065344, |
| "eval_accuracy": 0.9103755354881287, |
| "eval_loss": 0.6434940695762634, |
| "eval_mean_token_accuracy": 0.8052172782057423, |
| "eval_num_tokens": 7667020.0, |
| "eval_runtime": 377.8911, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8055295348167419, |
| "eval_steps_per_second": 0.156, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.5900366616346776, |
| "grad_norm": 68.07242584228516, |
| "learning_rate": 0.0002, |
| "loss": 78.1246, |
| "mean_token_accuracy": 0.82818995995654, |
| "num_tokens": 7874188.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.5900366616346776, |
| "eval_accuracy": 0.913102924823761, |
| "eval_loss": 0.6365765333175659, |
| "eval_mean_token_accuracy": 0.8071239762387034, |
| "eval_num_tokens": 7874188.0, |
| "eval_runtime": 516.5165, |
| "eval_samples_per_second": 0.227, |
| "eval_self_calculate_token_accuracy": 0.8071991205215454, |
| "eval_steps_per_second": 0.114, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.6055639422040112, |
| "grad_norm": 65.70716857910156, |
| "learning_rate": 0.0002, |
| "loss": 77.4535, |
| "mean_token_accuracy": 0.8317671550644768, |
| "num_tokens": 8080785.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.6055639422040112, |
| "eval_accuracy": 0.9140548706054688, |
| "eval_loss": 0.6353747844696045, |
| "eval_mean_token_accuracy": 0.8078821796481892, |
| "eval_num_tokens": 8080785.0, |
| "eval_runtime": 442.1132, |
| "eval_samples_per_second": 0.265, |
| "eval_self_calculate_token_accuracy": 0.8079361915588379, |
| "eval_steps_per_second": 0.133, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.6210912227733448, |
| "grad_norm": 61.63087844848633, |
| "learning_rate": 0.0002, |
| "loss": 75.8081, |
| "mean_token_accuracy": 0.8337800262702836, |
| "num_tokens": 8287147.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.6210912227733448, |
| "eval_accuracy": 0.9147362112998962, |
| "eval_loss": 0.6424786448478699, |
| "eval_mean_token_accuracy": 0.806917532015655, |
| "eval_num_tokens": 8287147.0, |
| "eval_runtime": 423.4872, |
| "eval_samples_per_second": 0.276, |
| "eval_self_calculate_token_accuracy": 0.8070501685142517, |
| "eval_steps_per_second": 0.139, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.6366185033426784, |
| "grad_norm": 65.54632568359375, |
| "learning_rate": 0.0002, |
| "loss": 76.7829, |
| "mean_token_accuracy": 0.8310973097880682, |
| "num_tokens": 8495122.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.6366185033426784, |
| "eval_accuracy": 0.9148668646812439, |
| "eval_loss": 0.6443515419960022, |
| "eval_mean_token_accuracy": 0.8066114904516835, |
| "eval_num_tokens": 8495122.0, |
| "eval_runtime": 537.8913, |
| "eval_samples_per_second": 0.218, |
| "eval_self_calculate_token_accuracy": 0.8066722750663757, |
| "eval_steps_per_second": 0.11, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.6521457839120121, |
| "grad_norm": 72.80533599853516, |
| "learning_rate": 0.0002, |
| "loss": 79.4775, |
| "mean_token_accuracy": 0.8249161483512985, |
| "num_tokens": 8703201.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.6521457839120121, |
| "eval_accuracy": 0.9164402484893799, |
| "eval_loss": 0.6363280415534973, |
| "eval_mean_token_accuracy": 0.8063408045445458, |
| "eval_num_tokens": 8703201.0, |
| "eval_runtime": 518.7859, |
| "eval_samples_per_second": 0.226, |
| "eval_self_calculate_token_accuracy": 0.806464433670044, |
| "eval_steps_per_second": 0.114, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.6676730644813457, |
| "grad_norm": 66.6220932006836, |
| "learning_rate": 0.0002, |
| "loss": 75.4804, |
| "mean_token_accuracy": 0.83234320829312, |
| "num_tokens": 8910441.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6676730644813457, |
| "eval_accuracy": 0.9164731502532959, |
| "eval_loss": 0.6302393078804016, |
| "eval_mean_token_accuracy": 0.8075330227108325, |
| "eval_num_tokens": 8910441.0, |
| "eval_runtime": 542.2645, |
| "eval_samples_per_second": 0.216, |
| "eval_self_calculate_token_accuracy": 0.8077787160873413, |
| "eval_steps_per_second": 0.109, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6832003450506793, |
| "grad_norm": 68.69686889648438, |
| "learning_rate": 0.0002, |
| "loss": 77.4471, |
| "mean_token_accuracy": 0.8295381383763419, |
| "num_tokens": 9118633.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.6832003450506793, |
| "eval_accuracy": 0.9123803377151489, |
| "eval_loss": 0.6293103694915771, |
| "eval_mean_token_accuracy": 0.8084206035581686, |
| "eval_num_tokens": 9118633.0, |
| "eval_runtime": 497.187, |
| "eval_samples_per_second": 0.235, |
| "eval_self_calculate_token_accuracy": 0.808789849281311, |
| "eval_steps_per_second": 0.119, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.6987276256200129, |
| "grad_norm": 63.706947326660156, |
| "learning_rate": 0.0002, |
| "loss": 76.9918, |
| "mean_token_accuracy": 0.8314318160216013, |
| "num_tokens": 9325726.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.6987276256200129, |
| "eval_accuracy": 0.9107639789581299, |
| "eval_loss": 0.6324621438980103, |
| "eval_mean_token_accuracy": 0.8079288743310056, |
| "eval_num_tokens": 9325726.0, |
| "eval_runtime": 549.8271, |
| "eval_samples_per_second": 0.213, |
| "eval_self_calculate_token_accuracy": 0.8081849217414856, |
| "eval_steps_per_second": 0.107, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.7142549061893465, |
| "grad_norm": 73.7982406616211, |
| "learning_rate": 0.0002, |
| "loss": 78.1238, |
| "mean_token_accuracy": 0.8276747150553597, |
| "num_tokens": 9532458.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.7142549061893465, |
| "eval_accuracy": 0.9130527377128601, |
| "eval_loss": 0.6298496127128601, |
| "eval_mean_token_accuracy": 0.8091619398634312, |
| "eval_num_tokens": 9532458.0, |
| "eval_runtime": 511.5222, |
| "eval_samples_per_second": 0.229, |
| "eval_self_calculate_token_accuracy": 0.8093991279602051, |
| "eval_steps_per_second": 0.115, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.7297821867586802, |
| "grad_norm": 69.95299530029297, |
| "learning_rate": 0.0002, |
| "loss": 74.6968, |
| "mean_token_accuracy": 0.8334906034999423, |
| "num_tokens": 9739085.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.7297821867586802, |
| "eval_accuracy": 0.9140156507492065, |
| "eval_loss": 0.6267667412757874, |
| "eval_mean_token_accuracy": 0.8093710400290408, |
| "eval_num_tokens": 9739085.0, |
| "eval_runtime": 402.1958, |
| "eval_samples_per_second": 0.291, |
| "eval_self_calculate_token_accuracy": 0.8095532655715942, |
| "eval_steps_per_second": 0.147, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.7453094673280138, |
| "grad_norm": 69.93089294433594, |
| "learning_rate": 0.0002, |
| "loss": 76.9015, |
| "mean_token_accuracy": 0.8311376662717925, |
| "num_tokens": 9946613.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.7453094673280138, |
| "eval_accuracy": 0.9145108461380005, |
| "eval_loss": 0.6275688409805298, |
| "eval_mean_token_accuracy": 0.8099053764747361, |
| "eval_num_tokens": 9946613.0, |
| "eval_runtime": 505.8206, |
| "eval_samples_per_second": 0.231, |
| "eval_self_calculate_token_accuracy": 0.80998694896698, |
| "eval_steps_per_second": 0.117, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.7608367478973475, |
| "grad_norm": 71.89234161376953, |
| "learning_rate": 0.0002, |
| "loss": 73.4955, |
| "mean_token_accuracy": 0.8371797477205595, |
| "num_tokens": 10154200.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.7608367478973475, |
| "eval_accuracy": 0.9144572019577026, |
| "eval_loss": 0.6280543208122253, |
| "eval_mean_token_accuracy": 0.8103320103580669, |
| "eval_num_tokens": 10154200.0, |
| "eval_runtime": 469.999, |
| "eval_samples_per_second": 0.249, |
| "eval_self_calculate_token_accuracy": 0.810602068901062, |
| "eval_steps_per_second": 0.126, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.7763640284666811, |
| "grad_norm": 74.61000061035156, |
| "learning_rate": 0.0002, |
| "loss": 74.9314, |
| "mean_token_accuracy": 0.8344847096337212, |
| "num_tokens": 10361431.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.7763640284666811, |
| "eval_accuracy": 0.9124852418899536, |
| "eval_loss": 0.6263248920440674, |
| "eval_mean_token_accuracy": 0.8109216740575887, |
| "eval_num_tokens": 10361431.0, |
| "eval_runtime": 530.0937, |
| "eval_samples_per_second": 0.221, |
| "eval_self_calculate_token_accuracy": 0.8111692070960999, |
| "eval_steps_per_second": 0.111, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.7918913090360147, |
| "grad_norm": 65.09473419189453, |
| "learning_rate": 0.0002, |
| "loss": 74.7611, |
| "mean_token_accuracy": 0.8341691344976425, |
| "num_tokens": 10568096.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.7918913090360147, |
| "eval_accuracy": 0.915256917476654, |
| "eval_loss": 0.6230616569519043, |
| "eval_mean_token_accuracy": 0.8122233293824277, |
| "eval_num_tokens": 10568096.0, |
| "eval_runtime": 493.2208, |
| "eval_samples_per_second": 0.237, |
| "eval_self_calculate_token_accuracy": 0.8124744296073914, |
| "eval_steps_per_second": 0.12, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.8074185896053483, |
| "grad_norm": 66.29564666748047, |
| "learning_rate": 0.0002, |
| "loss": 75.1666, |
| "mean_token_accuracy": 0.8314917261401812, |
| "num_tokens": 10775230.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.8074185896053483, |
| "eval_accuracy": 0.9141833782196045, |
| "eval_loss": 0.6197871565818787, |
| "eval_mean_token_accuracy": 0.8130643044487905, |
| "eval_num_tokens": 10775230.0, |
| "eval_runtime": 471.5309, |
| "eval_samples_per_second": 0.248, |
| "eval_self_calculate_token_accuracy": 0.813408374786377, |
| "eval_steps_per_second": 0.125, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.8229458701746819, |
| "grad_norm": 58.1451530456543, |
| "learning_rate": 0.0002, |
| "loss": 73.3247, |
| "mean_token_accuracy": 0.8366643513242403, |
| "num_tokens": 10982552.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.8229458701746819, |
| "eval_accuracy": 0.9132152199745178, |
| "eval_loss": 0.6162331104278564, |
| "eval_mean_token_accuracy": 0.8125809107796621, |
| "eval_num_tokens": 10982552.0, |
| "eval_runtime": 438.801, |
| "eval_samples_per_second": 0.267, |
| "eval_self_calculate_token_accuracy": 0.812820315361023, |
| "eval_steps_per_second": 0.134, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.8384731507440155, |
| "grad_norm": 63.90974807739258, |
| "learning_rate": 0.0002, |
| "loss": 75.1934, |
| "mean_token_accuracy": 0.8360330785314242, |
| "num_tokens": 11190332.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.8384731507440155, |
| "eval_accuracy": 0.9135575890541077, |
| "eval_loss": 0.6138584017753601, |
| "eval_mean_token_accuracy": 0.8129710868253546, |
| "eval_num_tokens": 11190332.0, |
| "eval_runtime": 457.8778, |
| "eval_samples_per_second": 0.256, |
| "eval_self_calculate_token_accuracy": 0.8131245970726013, |
| "eval_steps_per_second": 0.129, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.8540004313133491, |
| "grad_norm": 61.70444869995117, |
| "learning_rate": 0.0002, |
| "loss": 72.6765, |
| "mean_token_accuracy": 0.838487446308136, |
| "num_tokens": 11396939.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.8540004313133491, |
| "eval_accuracy": 0.9117525219917297, |
| "eval_loss": 0.6168607473373413, |
| "eval_mean_token_accuracy": 0.8129422583822477, |
| "eval_num_tokens": 11396939.0, |
| "eval_runtime": 447.8985, |
| "eval_samples_per_second": 0.261, |
| "eval_self_calculate_token_accuracy": 0.8131344318389893, |
| "eval_steps_per_second": 0.132, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.8695277118826827, |
| "grad_norm": 56.66482162475586, |
| "learning_rate": 0.0002, |
| "loss": 73.0716, |
| "mean_token_accuracy": 0.8375749306546317, |
| "num_tokens": 11603403.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.8695277118826827, |
| "eval_accuracy": 0.9112023115158081, |
| "eval_loss": 0.6186460256576538, |
| "eval_mean_token_accuracy": 0.8130350405887022, |
| "eval_num_tokens": 11603403.0, |
| "eval_runtime": 531.9386, |
| "eval_samples_per_second": 0.22, |
| "eval_self_calculate_token_accuracy": 0.8132300972938538, |
| "eval_steps_per_second": 0.111, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.8850549924520164, |
| "grad_norm": 68.06394958496094, |
| "learning_rate": 0.0002, |
| "loss": 71.467, |
| "mean_token_accuracy": 0.8391321500142416, |
| "num_tokens": 11811134.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.8850549924520164, |
| "eval_accuracy": 0.9130308032035828, |
| "eval_loss": 0.6166709065437317, |
| "eval_mean_token_accuracy": 0.8137099167047921, |
| "eval_num_tokens": 11811134.0, |
| "eval_runtime": 517.4726, |
| "eval_samples_per_second": 0.226, |
| "eval_self_calculate_token_accuracy": 0.8139044046401978, |
| "eval_steps_per_second": 0.114, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.90058227302135, |
| "grad_norm": 59.669342041015625, |
| "learning_rate": 0.0002, |
| "loss": 72.9217, |
| "mean_token_accuracy": 0.8353930016358694, |
| "num_tokens": 12017995.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.90058227302135, |
| "eval_accuracy": 0.9136723875999451, |
| "eval_loss": 0.6148823499679565, |
| "eval_mean_token_accuracy": 0.814302422232547, |
| "eval_num_tokens": 12017995.0, |
| "eval_runtime": 519.3912, |
| "eval_samples_per_second": 0.225, |
| "eval_self_calculate_token_accuracy": 0.8144720792770386, |
| "eval_steps_per_second": 0.114, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.9161095535906836, |
| "grad_norm": 62.19975662231445, |
| "learning_rate": 0.0002, |
| "loss": 73.5524, |
| "mean_token_accuracy": 0.836793865594599, |
| "num_tokens": 12225206.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.9161095535906836, |
| "eval_accuracy": 0.9178274273872375, |
| "eval_loss": 0.613953173160553, |
| "eval_mean_token_accuracy": 0.8129242183798451, |
| "eval_num_tokens": 12225206.0, |
| "eval_runtime": 467.3657, |
| "eval_samples_per_second": 0.25, |
| "eval_self_calculate_token_accuracy": 0.8131629824638367, |
| "eval_steps_per_second": 0.126, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.9316368341600173, |
| "grad_norm": 57.011043548583984, |
| "learning_rate": 0.0002, |
| "loss": 74.0301, |
| "mean_token_accuracy": 0.8358530120717155, |
| "num_tokens": 12433270.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.9316368341600173, |
| "eval_accuracy": 0.9191554188728333, |
| "eval_loss": 0.6118597984313965, |
| "eval_mean_token_accuracy": 0.814022664296425, |
| "eval_num_tokens": 12433270.0, |
| "eval_runtime": 522.5148, |
| "eval_samples_per_second": 0.224, |
| "eval_self_calculate_token_accuracy": 0.8142619132995605, |
| "eval_steps_per_second": 0.113, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.9471641147293509, |
| "grad_norm": 66.91328430175781, |
| "learning_rate": 0.0002, |
| "loss": 72.5186, |
| "mean_token_accuracy": 0.8367332228355937, |
| "num_tokens": 12639780.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.9471641147293509, |
| "eval_accuracy": 0.9177585244178772, |
| "eval_loss": 0.6108775734901428, |
| "eval_mean_token_accuracy": 0.8138837632486375, |
| "eval_num_tokens": 12639780.0, |
| "eval_runtime": 422.2926, |
| "eval_samples_per_second": 0.277, |
| "eval_self_calculate_token_accuracy": 0.8141418099403381, |
| "eval_steps_per_second": 0.14, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.9626913952986845, |
| "grad_norm": 87.78324127197266, |
| "learning_rate": 0.0002, |
| "loss": 71.883, |
| "mean_token_accuracy": 0.840221087137858, |
| "num_tokens": 12847997.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.9626913952986845, |
| "eval_accuracy": 0.9160240292549133, |
| "eval_loss": 0.6120632886886597, |
| "eval_mean_token_accuracy": 0.8128126203003576, |
| "eval_num_tokens": 12847997.0, |
| "eval_runtime": 531.9323, |
| "eval_samples_per_second": 0.22, |
| "eval_self_calculate_token_accuracy": 0.8129583597183228, |
| "eval_steps_per_second": 0.111, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.9782186758680181, |
| "grad_norm": 59.98785400390625, |
| "learning_rate": 0.0002, |
| "loss": 71.2979, |
| "mean_token_accuracy": 0.8396406115757095, |
| "num_tokens": 13055628.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.9782186758680181, |
| "eval_accuracy": 0.914139449596405, |
| "eval_loss": 0.6138727068901062, |
| "eval_mean_token_accuracy": 0.8129084362822064, |
| "eval_num_tokens": 13055628.0, |
| "eval_runtime": 520.2873, |
| "eval_samples_per_second": 0.225, |
| "eval_self_calculate_token_accuracy": 0.8129828572273254, |
| "eval_steps_per_second": 0.113, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.9937459564373518, |
| "grad_norm": 72.21045684814453, |
| "learning_rate": 0.0002, |
| "loss": 71.4842, |
| "mean_token_accuracy": 0.8384962247477638, |
| "num_tokens": 13263657.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.9937459564373518, |
| "eval_accuracy": 0.9158999919891357, |
| "eval_loss": 0.6086071133613586, |
| "eval_mean_token_accuracy": 0.8151745139542272, |
| "eval_num_tokens": 13263657.0, |
| "eval_runtime": 502.3441, |
| "eval_samples_per_second": 0.233, |
| "eval_self_calculate_token_accuracy": 0.8152207732200623, |
| "eval_steps_per_second": 0.117, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 31.990901947021484, |
| "learning_rate": 0.0002, |
| "loss": 29.3306, |
| "mean_token_accuracy": 0.8379724251812902, |
| "num_tokens": 13346942.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_accuracy": 0.9162644147872925, |
| "eval_loss": 0.606562077999115, |
| "eval_mean_token_accuracy": 0.8147177352743634, |
| "eval_num_tokens": 13346942.0, |
| "eval_runtime": 488.3515, |
| "eval_samples_per_second": 0.24, |
| "eval_self_calculate_token_accuracy": 0.8147389888763428, |
| "eval_steps_per_second": 0.121, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.0155272805693336, |
| "grad_norm": 65.1650390625, |
| "learning_rate": 0.0002, |
| "loss": 68.3093, |
| "mean_token_accuracy": 0.8437272103296386, |
| "num_tokens": 13553821.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.0155272805693336, |
| "eval_accuracy": 0.914888322353363, |
| "eval_loss": 0.6078404188156128, |
| "eval_mean_token_accuracy": 0.8142804877232697, |
| "eval_num_tokens": 13553821.0, |
| "eval_runtime": 489.5883, |
| "eval_samples_per_second": 0.239, |
| "eval_self_calculate_token_accuracy": 0.8142405152320862, |
| "eval_steps_per_second": 0.121, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.0310545611386672, |
| "grad_norm": 64.63192749023438, |
| "learning_rate": 0.0002, |
| "loss": 68.4059, |
| "mean_token_accuracy": 0.8433061440785726, |
| "num_tokens": 13761533.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.0310545611386672, |
| "eval_accuracy": 0.9138510227203369, |
| "eval_loss": 0.6116654872894287, |
| "eval_mean_token_accuracy": 0.8124735476606983, |
| "eval_num_tokens": 13761533.0, |
| "eval_runtime": 522.566, |
| "eval_samples_per_second": 0.224, |
| "eval_self_calculate_token_accuracy": 0.8125846982002258, |
| "eval_steps_per_second": 0.113, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.0465818417080008, |
| "grad_norm": 65.7592544555664, |
| "learning_rate": 0.0002, |
| "loss": 67.2654, |
| "mean_token_accuracy": 0.8436924947632684, |
| "num_tokens": 13969599.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.0465818417080008, |
| "eval_accuracy": 0.914797306060791, |
| "eval_loss": 0.6094985008239746, |
| "eval_mean_token_accuracy": 0.813911483449451, |
| "eval_num_tokens": 13969599.0, |
| "eval_runtime": 492.1522, |
| "eval_samples_per_second": 0.238, |
| "eval_self_calculate_token_accuracy": 0.8138671517372131, |
| "eval_steps_per_second": 0.12, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.0621091222773344, |
| "grad_norm": 65.31761169433594, |
| "learning_rate": 0.0002, |
| "loss": 67.8058, |
| "mean_token_accuracy": 0.8444534300102128, |
| "num_tokens": 14176410.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.0621091222773344, |
| "eval_accuracy": 0.916292130947113, |
| "eval_loss": 0.6073565483093262, |
| "eval_mean_token_accuracy": 0.8138014462034581, |
| "eval_num_tokens": 14176410.0, |
| "eval_runtime": 502.5082, |
| "eval_samples_per_second": 0.233, |
| "eval_self_calculate_token_accuracy": 0.8138008713722229, |
| "eval_steps_per_second": 0.117, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.077636402846668, |
| "grad_norm": 65.64813232421875, |
| "learning_rate": 0.0002, |
| "loss": 68.0804, |
| "mean_token_accuracy": 0.8438257641262479, |
| "num_tokens": 14383823.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.077636402846668, |
| "eval_accuracy": 0.9173416495323181, |
| "eval_loss": 0.6070806384086609, |
| "eval_mean_token_accuracy": 0.8139629707498065, |
| "eval_num_tokens": 14383823.0, |
| "eval_runtime": 415.9045, |
| "eval_samples_per_second": 0.281, |
| "eval_self_calculate_token_accuracy": 0.8141365647315979, |
| "eval_steps_per_second": 0.142, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.0931636834160017, |
| "grad_norm": 80.39895629882812, |
| "learning_rate": 0.0002, |
| "loss": 66.9645, |
| "mean_token_accuracy": 0.8455751662453016, |
| "num_tokens": 14590215.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.0931636834160017, |
| "eval_accuracy": 0.9200232625007629, |
| "eval_loss": 0.6067765951156616, |
| "eval_mean_token_accuracy": 0.8152269741236153, |
| "eval_num_tokens": 14590215.0, |
| "eval_runtime": 537.3198, |
| "eval_samples_per_second": 0.218, |
| "eval_self_calculate_token_accuracy": 0.815378725528717, |
| "eval_steps_per_second": 0.11, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.1086909639853353, |
| "grad_norm": 65.61837005615234, |
| "learning_rate": 0.0002, |
| "loss": 67.5675, |
| "mean_token_accuracy": 0.8453471718562974, |
| "num_tokens": 14797976.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.1086909639853353, |
| "eval_accuracy": 0.9188646674156189, |
| "eval_loss": 0.6055653095245361, |
| "eval_mean_token_accuracy": 0.8155758542529608, |
| "eval_num_tokens": 14797976.0, |
| "eval_runtime": 464.2185, |
| "eval_samples_per_second": 0.252, |
| "eval_self_calculate_token_accuracy": 0.8157888650894165, |
| "eval_steps_per_second": 0.127, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.124218244554669, |
| "grad_norm": 65.88909912109375, |
| "learning_rate": 0.0002, |
| "loss": 66.8075, |
| "mean_token_accuracy": 0.848398340245088, |
| "num_tokens": 15005047.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.124218244554669, |
| "eval_accuracy": 0.918417751789093, |
| "eval_loss": 0.6034069061279297, |
| "eval_mean_token_accuracy": 0.8151159912852918, |
| "eval_num_tokens": 15005047.0, |
| "eval_runtime": 542.2454, |
| "eval_samples_per_second": 0.216, |
| "eval_self_calculate_token_accuracy": 0.8152737617492676, |
| "eval_steps_per_second": 0.109, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.1397455251240025, |
| "grad_norm": 60.47575378417969, |
| "learning_rate": 0.0002, |
| "loss": 67.3135, |
| "mean_token_accuracy": 0.8462995580501027, |
| "num_tokens": 15212921.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.1397455251240025, |
| "eval_accuracy": 0.9160763025283813, |
| "eval_loss": 0.6052250266075134, |
| "eval_mean_token_accuracy": 0.8142917681548555, |
| "eval_num_tokens": 15212921.0, |
| "eval_runtime": 409.7233, |
| "eval_samples_per_second": 0.286, |
| "eval_self_calculate_token_accuracy": 0.814273476600647, |
| "eval_steps_per_second": 0.144, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.1552728056933361, |
| "grad_norm": 61.43801498413086, |
| "learning_rate": 0.0002, |
| "loss": 69.5417, |
| "mean_token_accuracy": 0.8406405109498236, |
| "num_tokens": 15421572.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.1552728056933361, |
| "eval_accuracy": 0.916191577911377, |
| "eval_loss": 0.6057412624359131, |
| "eval_mean_token_accuracy": 0.8134275806152215, |
| "eval_num_tokens": 15421572.0, |
| "eval_runtime": 491.5379, |
| "eval_samples_per_second": 0.238, |
| "eval_self_calculate_token_accuracy": 0.8135518431663513, |
| "eval_steps_per_second": 0.12, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.1708000862626697, |
| "grad_norm": 67.05225372314453, |
| "learning_rate": 0.0002, |
| "loss": 67.9327, |
| "mean_token_accuracy": 0.8432049610548549, |
| "num_tokens": 15629689.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.1708000862626697, |
| "eval_accuracy": 0.9198341369628906, |
| "eval_loss": 0.6028507947921753, |
| "eval_mean_token_accuracy": 0.8151428881338088, |
| "eval_num_tokens": 15629689.0, |
| "eval_runtime": 531.4871, |
| "eval_samples_per_second": 0.22, |
| "eval_self_calculate_token_accuracy": 0.8151507377624512, |
| "eval_steps_per_second": 0.111, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.1863273668320033, |
| "grad_norm": 68.65874481201172, |
| "learning_rate": 0.0002, |
| "loss": 66.4302, |
| "mean_token_accuracy": 0.8477641658650504, |
| "num_tokens": 15836164.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.1863273668320033, |
| "eval_accuracy": 0.9225656390190125, |
| "eval_loss": 0.5992919206619263, |
| "eval_mean_token_accuracy": 0.8168946100493609, |
| "eval_num_tokens": 15836164.0, |
| "eval_runtime": 505.3021, |
| "eval_samples_per_second": 0.232, |
| "eval_self_calculate_token_accuracy": 0.8169541954994202, |
| "eval_steps_per_second": 0.117, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.201854647401337, |
| "grad_norm": 66.15119934082031, |
| "learning_rate": 0.0002, |
| "loss": 65.9276, |
| "mean_token_accuracy": 0.849105974038442, |
| "num_tokens": 16043516.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.201854647401337, |
| "eval_accuracy": 0.9204577207565308, |
| "eval_loss": 0.5995540618896484, |
| "eval_mean_token_accuracy": 0.81749800605289, |
| "eval_num_tokens": 16043516.0, |
| "eval_runtime": 418.04, |
| "eval_samples_per_second": 0.28, |
| "eval_self_calculate_token_accuracy": 0.8175397515296936, |
| "eval_steps_per_second": 0.141, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.2173819279706706, |
| "grad_norm": 68.40901947021484, |
| "learning_rate": 0.0002, |
| "loss": 67.7778, |
| "mean_token_accuracy": 0.844365533027384, |
| "num_tokens": 16251336.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.2173819279706706, |
| "eval_accuracy": 0.9171594381332397, |
| "eval_loss": 0.5996333956718445, |
| "eval_mean_token_accuracy": 0.8166605688757815, |
| "eval_num_tokens": 16251336.0, |
| "eval_runtime": 515.6233, |
| "eval_samples_per_second": 0.227, |
| "eval_self_calculate_token_accuracy": 0.8167204856872559, |
| "eval_steps_per_second": 0.114, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.2329092085400044, |
| "grad_norm": 66.2791976928711, |
| "learning_rate": 0.0002, |
| "loss": 67.7025, |
| "mean_token_accuracy": 0.8446261005269157, |
| "num_tokens": 16459129.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.2329092085400044, |
| "eval_accuracy": 0.9145519137382507, |
| "eval_loss": 0.6023092269897461, |
| "eval_mean_token_accuracy": 0.8154029684551691, |
| "eval_num_tokens": 16459129.0, |
| "eval_runtime": 544.4679, |
| "eval_samples_per_second": 0.215, |
| "eval_self_calculate_token_accuracy": 0.8155403137207031, |
| "eval_steps_per_second": 0.108, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.248436489109338, |
| "grad_norm": 66.77078247070312, |
| "learning_rate": 0.0002, |
| "loss": 67.3506, |
| "mean_token_accuracy": 0.8439030920465788, |
| "num_tokens": 16666300.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.248436489109338, |
| "eval_accuracy": 0.9149135947227478, |
| "eval_loss": 0.602145791053772, |
| "eval_mean_token_accuracy": 0.8151502296075983, |
| "eval_num_tokens": 16666300.0, |
| "eval_runtime": 444.8185, |
| "eval_samples_per_second": 0.263, |
| "eval_self_calculate_token_accuracy": 0.8153896331787109, |
| "eval_steps_per_second": 0.133, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.2639637696786716, |
| "grad_norm": 66.9771957397461, |
| "learning_rate": 0.0002, |
| "loss": 66.344, |
| "mean_token_accuracy": 0.8471889520684878, |
| "num_tokens": 16873117.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.2639637696786716, |
| "eval_accuracy": 0.9140865206718445, |
| "eval_loss": 0.6002114415168762, |
| "eval_mean_token_accuracy": 0.8161402425523532, |
| "eval_num_tokens": 16873117.0, |
| "eval_runtime": 541.6985, |
| "eval_samples_per_second": 0.216, |
| "eval_self_calculate_token_accuracy": 0.816150963306427, |
| "eval_steps_per_second": 0.109, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.2794910502480052, |
| "grad_norm": 61.13132858276367, |
| "learning_rate": 0.0002, |
| "loss": 67.3976, |
| "mean_token_accuracy": 0.846321165561676, |
| "num_tokens": 17080999.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.2794910502480052, |
| "eval_accuracy": 0.9156964421272278, |
| "eval_loss": 0.5988284945487976, |
| "eval_mean_token_accuracy": 0.8168253393496497, |
| "eval_num_tokens": 17080999.0, |
| "eval_runtime": 516.4794, |
| "eval_samples_per_second": 0.227, |
| "eval_self_calculate_token_accuracy": 0.8169106245040894, |
| "eval_steps_per_second": 0.114, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.2950183308173389, |
| "grad_norm": 64.78873443603516, |
| "learning_rate": 0.0002, |
| "loss": 67.4892, |
| "mean_token_accuracy": 0.8439187837971581, |
| "num_tokens": 17288446.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.2950183308173389, |
| "eval_accuracy": 0.9147992730140686, |
| "eval_loss": 0.5951412916183472, |
| "eval_mean_token_accuracy": 0.8168428196745404, |
| "eval_num_tokens": 17288446.0, |
| "eval_runtime": 514.3283, |
| "eval_samples_per_second": 0.227, |
| "eval_self_calculate_token_accuracy": 0.817070722579956, |
| "eval_steps_per_second": 0.115, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.3105456113866725, |
| "grad_norm": 65.98389434814453, |
| "learning_rate": 0.0002, |
| "loss": 67.0237, |
| "mean_token_accuracy": 0.8461024496290419, |
| "num_tokens": 17494825.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.3105456113866725, |
| "eval_accuracy": 0.9146599173545837, |
| "eval_loss": 0.595986008644104, |
| "eval_mean_token_accuracy": 0.8160961280434819, |
| "eval_num_tokens": 17494825.0, |
| "eval_runtime": 447.4616, |
| "eval_samples_per_second": 0.261, |
| "eval_self_calculate_token_accuracy": 0.8161975145339966, |
| "eval_steps_per_second": 0.132, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.326072891956006, |
| "grad_norm": 76.89059448242188, |
| "learning_rate": 0.0002, |
| "loss": 66.7919, |
| "mean_token_accuracy": 0.8466944462723203, |
| "num_tokens": 17701740.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.326072891956006, |
| "eval_accuracy": 0.915256679058075, |
| "eval_loss": 0.5967960953712463, |
| "eval_mean_token_accuracy": 0.8167503890344652, |
| "eval_num_tokens": 17701740.0, |
| "eval_runtime": 394.6212, |
| "eval_samples_per_second": 0.296, |
| "eval_self_calculate_token_accuracy": 0.8169445991516113, |
| "eval_steps_per_second": 0.15, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.3416001725253397, |
| "grad_norm": 65.26689910888672, |
| "learning_rate": 0.0002, |
| "loss": 65.425, |
| "mean_token_accuracy": 0.8489014920261171, |
| "num_tokens": 17909032.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.3416001725253397, |
| "eval_accuracy": 0.9146382808685303, |
| "eval_loss": 0.6012277603149414, |
| "eval_mean_token_accuracy": 0.814886310343015, |
| "eval_num_tokens": 17909032.0, |
| "eval_runtime": 528.4431, |
| "eval_samples_per_second": 0.221, |
| "eval_self_calculate_token_accuracy": 0.815308153629303, |
| "eval_steps_per_second": 0.112, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.3571274530946733, |
| "grad_norm": 70.5202865600586, |
| "learning_rate": 0.0002, |
| "loss": 68.8527, |
| "mean_token_accuracy": 0.8431123610999849, |
| "num_tokens": 18117897.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.3571274530946733, |
| "eval_accuracy": 0.9140195846557617, |
| "eval_loss": 0.6005353927612305, |
| "eval_mean_token_accuracy": 0.8159067469128107, |
| "eval_num_tokens": 18117897.0, |
| "eval_runtime": 419.0597, |
| "eval_samples_per_second": 0.279, |
| "eval_self_calculate_token_accuracy": 0.8161543011665344, |
| "eval_steps_per_second": 0.141, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.372654733664007, |
| "grad_norm": 63.33000564575195, |
| "learning_rate": 0.0002, |
| "loss": 66.2856, |
| "mean_token_accuracy": 0.8484865700205168, |
| "num_tokens": 18325117.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.372654733664007, |
| "eval_accuracy": 0.9145908951759338, |
| "eval_loss": 0.5968843698501587, |
| "eval_mean_token_accuracy": 0.817190273333404, |
| "eval_num_tokens": 18325117.0, |
| "eval_runtime": 572.6535, |
| "eval_samples_per_second": 0.204, |
| "eval_self_calculate_token_accuracy": 0.8174945712089539, |
| "eval_steps_per_second": 0.103, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.3881820142333405, |
| "grad_norm": 69.2812271118164, |
| "learning_rate": 0.0002, |
| "loss": 65.8645, |
| "mean_token_accuracy": 0.8487454420990415, |
| "num_tokens": 18531718.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.3881820142333405, |
| "eval_accuracy": 0.9164107441902161, |
| "eval_loss": 0.595005214214325, |
| "eval_mean_token_accuracy": 0.8179631404957529, |
| "eval_num_tokens": 18531718.0, |
| "eval_runtime": 502.1535, |
| "eval_samples_per_second": 0.233, |
| "eval_self_calculate_token_accuracy": 0.8182094097137451, |
| "eval_steps_per_second": 0.117, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.4037092948026741, |
| "grad_norm": 65.05613708496094, |
| "learning_rate": 0.0002, |
| "loss": 65.974, |
| "mean_token_accuracy": 0.8492478239867423, |
| "num_tokens": 18739671.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.4037092948026741, |
| "eval_accuracy": 0.9192290902137756, |
| "eval_loss": 0.5931403040885925, |
| "eval_mean_token_accuracy": 0.8182988843675387, |
| "eval_num_tokens": 18739671.0, |
| "eval_runtime": 533.7116, |
| "eval_samples_per_second": 0.219, |
| "eval_self_calculate_token_accuracy": 0.8187220096588135, |
| "eval_steps_per_second": 0.111, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.4192365753720078, |
| "grad_norm": 66.6202163696289, |
| "learning_rate": 0.0002, |
| "loss": 65.8561, |
| "mean_token_accuracy": 0.8461782650815116, |
| "num_tokens": 18946385.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.4192365753720078, |
| "eval_accuracy": 0.9190437197685242, |
| "eval_loss": 0.5920654535293579, |
| "eval_mean_token_accuracy": 0.8191608089511677, |
| "eval_num_tokens": 18946385.0, |
| "eval_runtime": 519.8171, |
| "eval_samples_per_second": 0.225, |
| "eval_self_calculate_token_accuracy": 0.819395124912262, |
| "eval_steps_per_second": 0.114, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.4347638559413414, |
| "grad_norm": 66.7843017578125, |
| "learning_rate": 0.0002, |
| "loss": 68.4857, |
| "mean_token_accuracy": 0.8438015497393079, |
| "num_tokens": 19154091.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.4347638559413414, |
| "eval_accuracy": 0.9160621762275696, |
| "eval_loss": 0.5893514156341553, |
| "eval_mean_token_accuracy": 0.8189115807161493, |
| "eval_num_tokens": 19154091.0, |
| "eval_runtime": 474.7134, |
| "eval_samples_per_second": 0.246, |
| "eval_self_calculate_token_accuracy": 0.8190383315086365, |
| "eval_steps_per_second": 0.124, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.450291136510675, |
| "grad_norm": 64.44615936279297, |
| "learning_rate": 0.0002, |
| "loss": 67.0471, |
| "mean_token_accuracy": 0.8478122287326388, |
| "num_tokens": 19362120.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.450291136510675, |
| "eval_accuracy": 0.9157365560531616, |
| "eval_loss": 0.5896974205970764, |
| "eval_mean_token_accuracy": 0.818356197769359, |
| "eval_num_tokens": 19362120.0, |
| "eval_runtime": 543.0012, |
| "eval_samples_per_second": 0.215, |
| "eval_self_calculate_token_accuracy": 0.8185704946517944, |
| "eval_steps_per_second": 0.109, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.4658184170800086, |
| "grad_norm": 64.31645965576172, |
| "learning_rate": 0.0002, |
| "loss": 66.0855, |
| "mean_token_accuracy": 0.8465136811137199, |
| "num_tokens": 19569780.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.4658184170800086, |
| "eval_accuracy": 0.915468156337738, |
| "eval_loss": 0.5925508737564087, |
| "eval_mean_token_accuracy": 0.817296746423689, |
| "eval_num_tokens": 19569780.0, |
| "eval_runtime": 531.3937, |
| "eval_samples_per_second": 0.22, |
| "eval_self_calculate_token_accuracy": 0.8174288272857666, |
| "eval_steps_per_second": 0.111, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.4813456976493422, |
| "grad_norm": 63.50798034667969, |
| "learning_rate": 0.0002, |
| "loss": 67.4297, |
| "mean_token_accuracy": 0.844856838385264, |
| "num_tokens": 19776839.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.4813456976493422, |
| "eval_accuracy": 0.9153802990913391, |
| "eval_loss": 0.5922942161560059, |
| "eval_mean_token_accuracy": 0.8179396572759596, |
| "eval_num_tokens": 19776839.0, |
| "eval_runtime": 523.4909, |
| "eval_samples_per_second": 0.223, |
| "eval_self_calculate_token_accuracy": 0.8179522156715393, |
| "eval_steps_per_second": 0.113, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.4968729782186758, |
| "grad_norm": 58.527183532714844, |
| "learning_rate": 0.0002, |
| "loss": 67.4937, |
| "mean_token_accuracy": 0.8452946411238776, |
| "num_tokens": 19984843.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.4968729782186758, |
| "eval_accuracy": 0.9151400327682495, |
| "eval_loss": 0.5896632671356201, |
| "eval_mean_token_accuracy": 0.819458726098982, |
| "eval_num_tokens": 19984843.0, |
| "eval_runtime": 476.9225, |
| "eval_samples_per_second": 0.245, |
| "eval_self_calculate_token_accuracy": 0.8193376660346985, |
| "eval_steps_per_second": 0.124, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.5124002587880097, |
| "grad_norm": 59.124046325683594, |
| "learning_rate": 0.0002, |
| "loss": 65.7654, |
| "mean_token_accuracy": 0.8481284040543768, |
| "num_tokens": 20191598.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.5124002587880097, |
| "eval_accuracy": 0.9174317717552185, |
| "eval_loss": 0.5869073271751404, |
| "eval_mean_token_accuracy": 0.8197130833641958, |
| "eval_num_tokens": 20191598.0, |
| "eval_runtime": 470.1847, |
| "eval_samples_per_second": 0.249, |
| "eval_self_calculate_token_accuracy": 0.8196244239807129, |
| "eval_steps_per_second": 0.125, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.5279275393573433, |
| "grad_norm": 66.8822021484375, |
| "learning_rate": 0.0002, |
| "loss": 65.3105, |
| "mean_token_accuracy": 0.8491861952675713, |
| "num_tokens": 20398084.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.5279275393573433, |
| "eval_accuracy": 0.9170477390289307, |
| "eval_loss": 0.5863754153251648, |
| "eval_mean_token_accuracy": 0.8206737354650335, |
| "eval_num_tokens": 20398084.0, |
| "eval_runtime": 477.6228, |
| "eval_samples_per_second": 0.245, |
| "eval_self_calculate_token_accuracy": 0.8207647204399109, |
| "eval_steps_per_second": 0.124, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.5434548199266769, |
| "grad_norm": 65.41436767578125, |
| "learning_rate": 0.0002, |
| "loss": 65.5053, |
| "mean_token_accuracy": 0.849404488172796, |
| "num_tokens": 20605069.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.5434548199266769, |
| "eval_accuracy": 0.9143133163452148, |
| "eval_loss": 0.5900213718414307, |
| "eval_mean_token_accuracy": 0.8198770961519015, |
| "eval_num_tokens": 20605069.0, |
| "eval_runtime": 526.6496, |
| "eval_samples_per_second": 0.222, |
| "eval_self_calculate_token_accuracy": 0.8199238181114197, |
| "eval_steps_per_second": 0.112, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.5589821004960105, |
| "grad_norm": 62.98171615600586, |
| "learning_rate": 0.0002, |
| "loss": 66.9568, |
| "mean_token_accuracy": 0.8463868324955305, |
| "num_tokens": 20811718.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.5589821004960105, |
| "eval_accuracy": 0.9129236340522766, |
| "eval_loss": 0.5908729434013367, |
| "eval_mean_token_accuracy": 0.8185009764412702, |
| "eval_num_tokens": 20811718.0, |
| "eval_runtime": 380.7957, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8186503052711487, |
| "eval_steps_per_second": 0.155, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.574509381065344, |
| "grad_norm": 64.95964050292969, |
| "learning_rate": 0.0002, |
| "loss": 63.6062, |
| "mean_token_accuracy": 0.8520710609025426, |
| "num_tokens": 21019827.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.574509381065344, |
| "eval_accuracy": 0.9128480553627014, |
| "eval_loss": 0.5888250470161438, |
| "eval_mean_token_accuracy": 0.8192389758966737, |
| "eval_num_tokens": 21019827.0, |
| "eval_runtime": 505.9759, |
| "eval_samples_per_second": 0.231, |
| "eval_self_calculate_token_accuracy": 0.8193753957748413, |
| "eval_steps_per_second": 0.117, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.5900366616346777, |
| "grad_norm": 61.53292465209961, |
| "learning_rate": 0.0002, |
| "loss": 66.1185, |
| "mean_token_accuracy": 0.8483043238520622, |
| "num_tokens": 21228566.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.5900366616346777, |
| "eval_accuracy": 0.9124256372451782, |
| "eval_loss": 0.5878067016601562, |
| "eval_mean_token_accuracy": 0.8198131838087308, |
| "eval_num_tokens": 21228566.0, |
| "eval_runtime": 392.2673, |
| "eval_samples_per_second": 0.298, |
| "eval_self_calculate_token_accuracy": 0.8198050260543823, |
| "eval_steps_per_second": 0.15, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.6055639422040113, |
| "grad_norm": 65.00923156738281, |
| "learning_rate": 0.0002, |
| "loss": 65.6947, |
| "mean_token_accuracy": 0.8476537043849627, |
| "num_tokens": 21435076.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.6055639422040113, |
| "eval_accuracy": 0.912020206451416, |
| "eval_loss": 0.5870491862297058, |
| "eval_mean_token_accuracy": 0.8199449939242864, |
| "eval_num_tokens": 21435076.0, |
| "eval_runtime": 563.2565, |
| "eval_samples_per_second": 0.208, |
| "eval_self_calculate_token_accuracy": 0.8200485110282898, |
| "eval_steps_per_second": 0.105, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.621091222773345, |
| "grad_norm": 62.851409912109375, |
| "learning_rate": 0.0002, |
| "loss": 65.1109, |
| "mean_token_accuracy": 0.8502412289381027, |
| "num_tokens": 21641505.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.621091222773345, |
| "eval_accuracy": 0.9128506779670715, |
| "eval_loss": 0.5865439772605896, |
| "eval_mean_token_accuracy": 0.8204073572562913, |
| "eval_num_tokens": 21641505.0, |
| "eval_runtime": 406.2865, |
| "eval_samples_per_second": 0.288, |
| "eval_self_calculate_token_accuracy": 0.8205317258834839, |
| "eval_steps_per_second": 0.145, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.6366185033426786, |
| "grad_norm": 63.905975341796875, |
| "learning_rate": 0.0002, |
| "loss": 63.3789, |
| "mean_token_accuracy": 0.8532266310519643, |
| "num_tokens": 21847979.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.6366185033426786, |
| "eval_accuracy": 0.9153233766555786, |
| "eval_loss": 0.5887019634246826, |
| "eval_mean_token_accuracy": 0.8191085538621676, |
| "eval_num_tokens": 21847979.0, |
| "eval_runtime": 472.7528, |
| "eval_samples_per_second": 0.247, |
| "eval_self_calculate_token_accuracy": 0.819199800491333, |
| "eval_steps_per_second": 0.125, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.6521457839120122, |
| "grad_norm": 68.42975616455078, |
| "learning_rate": 0.0002, |
| "loss": 66.3562, |
| "mean_token_accuracy": 0.8470784227053324, |
| "num_tokens": 22055159.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.6521457839120122, |
| "eval_accuracy": 0.9135716557502747, |
| "eval_loss": 0.5869589447975159, |
| "eval_mean_token_accuracy": 0.8184095924183473, |
| "eval_num_tokens": 22055159.0, |
| "eval_runtime": 431.7511, |
| "eval_samples_per_second": 0.271, |
| "eval_self_calculate_token_accuracy": 0.8186453580856323, |
| "eval_steps_per_second": 0.137, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.6676730644813458, |
| "grad_norm": 62.75449752807617, |
| "learning_rate": 0.0002, |
| "loss": 67.4327, |
| "mean_token_accuracy": 0.8462236060036553, |
| "num_tokens": 22261565.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.6676730644813458, |
| "eval_accuracy": 0.9142144918441772, |
| "eval_loss": 0.5851525664329529, |
| "eval_mean_token_accuracy": 0.8186417525097475, |
| "eval_num_tokens": 22261565.0, |
| "eval_runtime": 533.8615, |
| "eval_samples_per_second": 0.219, |
| "eval_self_calculate_token_accuracy": 0.8189521431922913, |
| "eval_steps_per_second": 0.111, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.6832003450506794, |
| "grad_norm": 64.80010986328125, |
| "learning_rate": 0.0002, |
| "loss": 65.9084, |
| "mean_token_accuracy": 0.8482257632745637, |
| "num_tokens": 22468913.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.6832003450506794, |
| "eval_accuracy": 0.9151884317398071, |
| "eval_loss": 0.5840141177177429, |
| "eval_mean_token_accuracy": 0.8188382821567988, |
| "eval_num_tokens": 22468913.0, |
| "eval_runtime": 468.6546, |
| "eval_samples_per_second": 0.25, |
| "eval_self_calculate_token_accuracy": 0.8191075325012207, |
| "eval_steps_per_second": 0.126, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.698727625620013, |
| "grad_norm": 68.90869140625, |
| "learning_rate": 0.0002, |
| "loss": 61.8267, |
| "mean_token_accuracy": 0.8545476893583933, |
| "num_tokens": 22675797.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.698727625620013, |
| "eval_accuracy": 0.9158926010131836, |
| "eval_loss": 0.58217853307724, |
| "eval_mean_token_accuracy": 0.8200747471744732, |
| "eval_num_tokens": 22675797.0, |
| "eval_runtime": 527.4158, |
| "eval_samples_per_second": 0.222, |
| "eval_self_calculate_token_accuracy": 0.8202611804008484, |
| "eval_steps_per_second": 0.112, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.7142549061893466, |
| "grad_norm": 61.65879821777344, |
| "learning_rate": 0.0002, |
| "loss": 66.5756, |
| "mean_token_accuracy": 0.8469109121296141, |
| "num_tokens": 22883658.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.7142549061893466, |
| "eval_accuracy": 0.9147453904151917, |
| "eval_loss": 0.583078145980835, |
| "eval_mean_token_accuracy": 0.8195210755881617, |
| "eval_num_tokens": 22883658.0, |
| "eval_runtime": 520.4296, |
| "eval_samples_per_second": 0.225, |
| "eval_self_calculate_token_accuracy": 0.8196620941162109, |
| "eval_steps_per_second": 0.113, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.7297821867586802, |
| "grad_norm": 67.2427978515625, |
| "learning_rate": 0.0002, |
| "loss": 64.482, |
| "mean_token_accuracy": 0.8511827654308743, |
| "num_tokens": 23090709.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.7297821867586802, |
| "eval_accuracy": 0.9146234393119812, |
| "eval_loss": 0.5832362174987793, |
| "eval_mean_token_accuracy": 0.819351655952001, |
| "eval_num_tokens": 23090709.0, |
| "eval_runtime": 499.0606, |
| "eval_samples_per_second": 0.234, |
| "eval_self_calculate_token_accuracy": 0.8194025754928589, |
| "eval_steps_per_second": 0.118, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.7453094673280138, |
| "grad_norm": 63.449119567871094, |
| "learning_rate": 0.0002, |
| "loss": 66.0045, |
| "mean_token_accuracy": 0.8464179154899385, |
| "num_tokens": 23298298.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.7453094673280138, |
| "eval_accuracy": 0.9143020510673523, |
| "eval_loss": 0.5855695009231567, |
| "eval_mean_token_accuracy": 0.818553710387925, |
| "eval_num_tokens": 23298298.0, |
| "eval_runtime": 555.7333, |
| "eval_samples_per_second": 0.211, |
| "eval_self_calculate_token_accuracy": 0.8186615705490112, |
| "eval_steps_per_second": 0.106, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.7608367478973475, |
| "grad_norm": 69.69196319580078, |
| "learning_rate": 0.0002, |
| "loss": 66.0712, |
| "mean_token_accuracy": 0.8473733961582184, |
| "num_tokens": 23506064.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.7608367478973475, |
| "eval_accuracy": 0.915688693523407, |
| "eval_loss": 0.5876982808113098, |
| "eval_mean_token_accuracy": 0.8189748363979792, |
| "eval_num_tokens": 23506064.0, |
| "eval_runtime": 538.2243, |
| "eval_samples_per_second": 0.217, |
| "eval_self_calculate_token_accuracy": 0.8190093636512756, |
| "eval_steps_per_second": 0.11, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.776364028466681, |
| "grad_norm": 70.64740753173828, |
| "learning_rate": 0.0002, |
| "loss": 66.2606, |
| "mean_token_accuracy": 0.8471579170889325, |
| "num_tokens": 23713554.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.776364028466681, |
| "eval_accuracy": 0.9138741493225098, |
| "eval_loss": 0.5907846689224243, |
| "eval_mean_token_accuracy": 0.8195642394534612, |
| "eval_num_tokens": 23713554.0, |
| "eval_runtime": 562.2138, |
| "eval_samples_per_second": 0.208, |
| "eval_self_calculate_token_accuracy": 0.8194622993469238, |
| "eval_steps_per_second": 0.105, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.7918913090360147, |
| "grad_norm": 65.11492156982422, |
| "learning_rate": 0.0002, |
| "loss": 64.3317, |
| "mean_token_accuracy": 0.8502314595712556, |
| "num_tokens": 23919389.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.7918913090360147, |
| "eval_accuracy": 0.9144482612609863, |
| "eval_loss": 0.5899173021316528, |
| "eval_mean_token_accuracy": 0.8197249558012364, |
| "eval_num_tokens": 23919389.0, |
| "eval_runtime": 549.2504, |
| "eval_samples_per_second": 0.213, |
| "eval_self_calculate_token_accuracy": 0.8195589780807495, |
| "eval_steps_per_second": 0.107, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.8074185896053483, |
| "grad_norm": 62.23531723022461, |
| "learning_rate": 0.0002, |
| "loss": 65.9011, |
| "mean_token_accuracy": 0.848718142343892, |
| "num_tokens": 24127157.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.8074185896053483, |
| "eval_accuracy": 0.9122486114501953, |
| "eval_loss": 0.5906705856323242, |
| "eval_mean_token_accuracy": 0.8188407562546811, |
| "eval_num_tokens": 24127157.0, |
| "eval_runtime": 526.1658, |
| "eval_samples_per_second": 0.222, |
| "eval_self_calculate_token_accuracy": 0.8186511397361755, |
| "eval_steps_per_second": 0.112, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.822945870174682, |
| "grad_norm": 67.0640640258789, |
| "learning_rate": 0.0002, |
| "loss": 64.9516, |
| "mean_token_accuracy": 0.848182227048609, |
| "num_tokens": 24334214.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.822945870174682, |
| "eval_accuracy": 0.9157512784004211, |
| "eval_loss": 0.5871382355690002, |
| "eval_mean_token_accuracy": 0.8193736197584767, |
| "eval_num_tokens": 24334214.0, |
| "eval_runtime": 521.1721, |
| "eval_samples_per_second": 0.224, |
| "eval_self_calculate_token_accuracy": 0.819322407245636, |
| "eval_steps_per_second": 0.113, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.8384731507440155, |
| "grad_norm": 62.31776809692383, |
| "learning_rate": 0.0002, |
| "loss": 65.2727, |
| "mean_token_accuracy": 0.8492811752690209, |
| "num_tokens": 24540926.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.8384731507440155, |
| "eval_accuracy": 0.9180989265441895, |
| "eval_loss": 0.5852234959602356, |
| "eval_mean_token_accuracy": 0.8209986585681721, |
| "eval_num_tokens": 24540926.0, |
| "eval_runtime": 532.7618, |
| "eval_samples_per_second": 0.22, |
| "eval_self_calculate_token_accuracy": 0.8209105134010315, |
| "eval_steps_per_second": 0.111, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.8540004313133491, |
| "grad_norm": 67.45236206054688, |
| "learning_rate": 0.0002, |
| "loss": 65.6287, |
| "mean_token_accuracy": 0.848878843916787, |
| "num_tokens": 24747991.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.8540004313133491, |
| "eval_accuracy": 0.9176638722419739, |
| "eval_loss": 0.5843996405601501, |
| "eval_mean_token_accuracy": 0.8210371803429167, |
| "eval_num_tokens": 24747991.0, |
| "eval_runtime": 432.1633, |
| "eval_samples_per_second": 0.271, |
| "eval_self_calculate_token_accuracy": 0.8208267688751221, |
| "eval_steps_per_second": 0.137, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.8695277118826827, |
| "grad_norm": 61.05096435546875, |
| "learning_rate": 0.0002, |
| "loss": 63.482, |
| "mean_token_accuracy": 0.8515734192397859, |
| "num_tokens": 24954557.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.8695277118826827, |
| "eval_accuracy": 0.916504979133606, |
| "eval_loss": 0.5848639011383057, |
| "eval_mean_token_accuracy": 0.8200793337013762, |
| "eval_num_tokens": 24954557.0, |
| "eval_runtime": 380.1122, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8200535774230957, |
| "eval_steps_per_second": 0.155, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.8850549924520164, |
| "grad_norm": 64.0988998413086, |
| "learning_rate": 0.0002, |
| "loss": 63.3617, |
| "mean_token_accuracy": 0.8512310625778304, |
| "num_tokens": 25161507.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.8850549924520164, |
| "eval_accuracy": 0.9166208505630493, |
| "eval_loss": 0.5825186371803284, |
| "eval_mean_token_accuracy": 0.8216992687370818, |
| "eval_num_tokens": 25161507.0, |
| "eval_runtime": 380.6605, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8216376304626465, |
| "eval_steps_per_second": 0.155, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.90058227302135, |
| "grad_norm": 60.96718978881836, |
| "learning_rate": 0.0002, |
| "loss": 64.7341, |
| "mean_token_accuracy": 0.848566547036171, |
| "num_tokens": 25367761.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.90058227302135, |
| "eval_accuracy": 0.9145371913909912, |
| "eval_loss": 0.5828627347946167, |
| "eval_mean_token_accuracy": 0.8225611751362428, |
| "eval_num_tokens": 25367761.0, |
| "eval_runtime": 381.057, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8225669860839844, |
| "eval_steps_per_second": 0.155, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.9161095535906836, |
| "grad_norm": 61.89659881591797, |
| "learning_rate": 0.0002, |
| "loss": 63.6013, |
| "mean_token_accuracy": 0.8511651348736551, |
| "num_tokens": 25574518.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.9161095535906836, |
| "eval_accuracy": 0.9145485162734985, |
| "eval_loss": 0.5841417908668518, |
| "eval_mean_token_accuracy": 0.8220337518190933, |
| "eval_num_tokens": 25574518.0, |
| "eval_runtime": 387.7067, |
| "eval_samples_per_second": 0.302, |
| "eval_self_calculate_token_accuracy": 0.8218501210212708, |
| "eval_steps_per_second": 0.152, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.9316368341600172, |
| "grad_norm": 62.36483383178711, |
| "learning_rate": 0.0002, |
| "loss": 64.6146, |
| "mean_token_accuracy": 0.8506108522415161, |
| "num_tokens": 25781426.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.9316368341600172, |
| "eval_accuracy": 0.9129160046577454, |
| "eval_loss": 0.5827949643135071, |
| "eval_mean_token_accuracy": 0.8214941671339132, |
| "eval_num_tokens": 25781426.0, |
| "eval_runtime": 381.8285, |
| "eval_samples_per_second": 0.306, |
| "eval_self_calculate_token_accuracy": 0.8213940858840942, |
| "eval_steps_per_second": 0.155, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.9471641147293508, |
| "grad_norm": 65.14798736572266, |
| "learning_rate": 0.0002, |
| "loss": 62.4968, |
| "mean_token_accuracy": 0.8529660478234291, |
| "num_tokens": 25988968.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.9471641147293508, |
| "eval_accuracy": 0.9111137390136719, |
| "eval_loss": 0.5808615684509277, |
| "eval_mean_token_accuracy": 0.8212564749232794, |
| "eval_num_tokens": 25988968.0, |
| "eval_runtime": 386.5921, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8212316036224365, |
| "eval_steps_per_second": 0.153, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.9626913952986844, |
| "grad_norm": 62.06315994262695, |
| "learning_rate": 0.0002, |
| "loss": 64.5604, |
| "mean_token_accuracy": 0.8502356194787555, |
| "num_tokens": 26196453.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.9626913952986844, |
| "eval_accuracy": 0.9112434983253479, |
| "eval_loss": 0.580268383026123, |
| "eval_mean_token_accuracy": 0.8204727688078153, |
| "eval_num_tokens": 26196453.0, |
| "eval_runtime": 382.1787, |
| "eval_samples_per_second": 0.306, |
| "eval_self_calculate_token_accuracy": 0.8204028010368347, |
| "eval_steps_per_second": 0.154, |
| "step": 127 |
| }, |
| { |
| "epoch": 1.978218675868018, |
| "grad_norm": 63.92466354370117, |
| "learning_rate": 0.0002, |
| "loss": 64.8758, |
| "mean_token_accuracy": 0.8495825851957003, |
| "num_tokens": 26403575.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.978218675868018, |
| "eval_accuracy": 0.911768913269043, |
| "eval_loss": 0.5803488492965698, |
| "eval_mean_token_accuracy": 0.8195409906112542, |
| "eval_num_tokens": 26403575.0, |
| "eval_runtime": 387.8159, |
| "eval_samples_per_second": 0.302, |
| "eval_self_calculate_token_accuracy": 0.8195681571960449, |
| "eval_steps_per_second": 0.152, |
| "step": 128 |
| }, |
| { |
| "epoch": 1.9937459564373516, |
| "grad_norm": 64.64863586425781, |
| "learning_rate": 0.0002, |
| "loss": 63.6354, |
| "mean_token_accuracy": 0.8531460654404428, |
| "num_tokens": 26610929.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 1.9937459564373516, |
| "eval_accuracy": 0.9138753414154053, |
| "eval_loss": 0.5813618302345276, |
| "eval_mean_token_accuracy": 0.8191980951923435, |
| "eval_num_tokens": 26610929.0, |
| "eval_runtime": 386.3754, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8194134831428528, |
| "eval_steps_per_second": 0.153, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 33.25372314453125, |
| "learning_rate": 0.0002, |
| "loss": 25.1176, |
| "mean_token_accuracy": 0.8498612950588095, |
| "num_tokens": 26693884.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_accuracy": 0.9129433631896973, |
| "eval_loss": 0.584043562412262, |
| "eval_mean_token_accuracy": 0.8190222756337311, |
| "eval_num_tokens": 26693884.0, |
| "eval_runtime": 384.6952, |
| "eval_samples_per_second": 0.304, |
| "eval_self_calculate_token_accuracy": 0.819307804107666, |
| "eval_steps_per_second": 0.153, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.0155272805693336, |
| "grad_norm": 69.01683807373047, |
| "learning_rate": 0.0002, |
| "loss": 61.0746, |
| "mean_token_accuracy": 0.8551189593142934, |
| "num_tokens": 26900278.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.0155272805693336, |
| "eval_accuracy": 0.912857711315155, |
| "eval_loss": 0.5841554999351501, |
| "eval_mean_token_accuracy": 0.820343726772373, |
| "eval_num_tokens": 26900278.0, |
| "eval_runtime": 389.0704, |
| "eval_samples_per_second": 0.301, |
| "eval_self_calculate_token_accuracy": 0.8206613659858704, |
| "eval_steps_per_second": 0.152, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.0310545611386672, |
| "grad_norm": 61.96529769897461, |
| "learning_rate": 0.0002, |
| "loss": 60.9193, |
| "mean_token_accuracy": 0.8563586961891916, |
| "num_tokens": 27106289.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.0310545611386672, |
| "eval_accuracy": 0.9134542942047119, |
| "eval_loss": 0.5875779390335083, |
| "eval_mean_token_accuracy": 0.8205456380116738, |
| "eval_num_tokens": 27106289.0, |
| "eval_runtime": 386.3978, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8206611275672913, |
| "eval_steps_per_second": 0.153, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.046581841708001, |
| "grad_norm": 68.82410430908203, |
| "learning_rate": 0.0002, |
| "loss": 60.1456, |
| "mean_token_accuracy": 0.8554905611607764, |
| "num_tokens": 27313156.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.046581841708001, |
| "eval_accuracy": 0.9131547212600708, |
| "eval_loss": 0.5893575549125671, |
| "eval_mean_token_accuracy": 0.819547031895589, |
| "eval_num_tokens": 27313156.0, |
| "eval_runtime": 384.4875, |
| "eval_samples_per_second": 0.304, |
| "eval_self_calculate_token_accuracy": 0.819791853427887, |
| "eval_steps_per_second": 0.153, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.0621091222773344, |
| "grad_norm": 59.988258361816406, |
| "learning_rate": 0.0002, |
| "loss": 61.8068, |
| "mean_token_accuracy": 0.8555878640876876, |
| "num_tokens": 27521182.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.0621091222773344, |
| "eval_accuracy": 0.9099174737930298, |
| "eval_loss": 0.5921615958213806, |
| "eval_mean_token_accuracy": 0.8174765948521889, |
| "eval_num_tokens": 27521182.0, |
| "eval_runtime": 387.3108, |
| "eval_samples_per_second": 0.302, |
| "eval_self_calculate_token_accuracy": 0.8175939917564392, |
| "eval_steps_per_second": 0.152, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.077636402846668, |
| "grad_norm": 75.8721923828125, |
| "learning_rate": 0.0002, |
| "loss": 59.6788, |
| "mean_token_accuracy": 0.8561115082767274, |
| "num_tokens": 27728221.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.077636402846668, |
| "eval_accuracy": 0.912885308265686, |
| "eval_loss": 0.586609423160553, |
| "eval_mean_token_accuracy": 0.8195660417362794, |
| "eval_num_tokens": 27728221.0, |
| "eval_runtime": 382.489, |
| "eval_samples_per_second": 0.306, |
| "eval_self_calculate_token_accuracy": 0.8196806907653809, |
| "eval_steps_per_second": 0.154, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.0931636834160017, |
| "grad_norm": 59.155696868896484, |
| "learning_rate": 0.0002, |
| "loss": 61.4073, |
| "mean_token_accuracy": 0.8552076510257192, |
| "num_tokens": 27936267.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.0931636834160017, |
| "eval_accuracy": 0.9157779216766357, |
| "eval_loss": 0.5844801068305969, |
| "eval_mean_token_accuracy": 0.8200439800650386, |
| "eval_num_tokens": 27936267.0, |
| "eval_runtime": 382.2755, |
| "eval_samples_per_second": 0.306, |
| "eval_self_calculate_token_accuracy": 0.8201448917388916, |
| "eval_steps_per_second": 0.154, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.1086909639853353, |
| "grad_norm": 72.3846206665039, |
| "learning_rate": 0.0002, |
| "loss": 59.9739, |
| "mean_token_accuracy": 0.8573056699501144, |
| "num_tokens": 28143277.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.1086909639853353, |
| "eval_accuracy": 0.916522741317749, |
| "eval_loss": 0.5837716460227966, |
| "eval_mean_token_accuracy": 0.821590631695117, |
| "eval_num_tokens": 28143277.0, |
| "eval_runtime": 384.8121, |
| "eval_samples_per_second": 0.304, |
| "eval_self_calculate_token_accuracy": 0.8216969966888428, |
| "eval_steps_per_second": 0.153, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.124218244554669, |
| "grad_norm": 58.57980728149414, |
| "learning_rate": 0.0002, |
| "loss": 59.7079, |
| "mean_token_accuracy": 0.8577245134446356, |
| "num_tokens": 28350686.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.124218244554669, |
| "eval_accuracy": 0.9165829420089722, |
| "eval_loss": 0.5855816602706909, |
| "eval_mean_token_accuracy": 0.8211737517583169, |
| "eval_num_tokens": 28350686.0, |
| "eval_runtime": 381.0734, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8212018013000488, |
| "eval_steps_per_second": 0.155, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.1397455251240025, |
| "grad_norm": 69.02935791015625, |
| "learning_rate": 0.0002, |
| "loss": 59.3487, |
| "mean_token_accuracy": 0.8584998357627127, |
| "num_tokens": 28558542.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.1397455251240025, |
| "eval_accuracy": 0.9140151143074036, |
| "eval_loss": 0.5840070843696594, |
| "eval_mean_token_accuracy": 0.8208096320346251, |
| "eval_num_tokens": 28558542.0, |
| "eval_runtime": 381.0211, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8210036158561707, |
| "eval_steps_per_second": 0.155, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.155272805693336, |
| "grad_norm": 64.4303970336914, |
| "learning_rate": 0.0002, |
| "loss": 59.2735, |
| "mean_token_accuracy": 0.8592962059709761, |
| "num_tokens": 28765212.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.155272805693336, |
| "eval_accuracy": 0.9148533940315247, |
| "eval_loss": 0.5841915607452393, |
| "eval_mean_token_accuracy": 0.8205250319788011, |
| "eval_num_tokens": 28765212.0, |
| "eval_runtime": 381.0415, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8208694458007812, |
| "eval_steps_per_second": 0.155, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.1708000862626697, |
| "grad_norm": 72.2681884765625, |
| "learning_rate": 0.0002, |
| "loss": 58.7764, |
| "mean_token_accuracy": 0.8609457860390345, |
| "num_tokens": 28971718.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.1708000862626697, |
| "eval_accuracy": 0.9138324856758118, |
| "eval_loss": 0.5843725204467773, |
| "eval_mean_token_accuracy": 0.8201145085237794, |
| "eval_num_tokens": 28971718.0, |
| "eval_runtime": 383.182, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8203620910644531, |
| "eval_steps_per_second": 0.154, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.1863273668320033, |
| "grad_norm": 60.753902435302734, |
| "learning_rate": 0.0002, |
| "loss": 59.8233, |
| "mean_token_accuracy": 0.8597390494412847, |
| "num_tokens": 29178600.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.1863273668320033, |
| "eval_accuracy": 0.9141193628311157, |
| "eval_loss": 0.5848951935768127, |
| "eval_mean_token_accuracy": 0.8205834819098651, |
| "eval_num_tokens": 29178600.0, |
| "eval_runtime": 378.6677, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.820806086063385, |
| "eval_steps_per_second": 0.156, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.201854647401337, |
| "grad_norm": 61.968971252441406, |
| "learning_rate": 0.0002, |
| "loss": 59.7337, |
| "mean_token_accuracy": 0.8571138555804888, |
| "num_tokens": 29386128.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.201854647401337, |
| "eval_accuracy": 0.913427472114563, |
| "eval_loss": 0.5853984951972961, |
| "eval_mean_token_accuracy": 0.8206147167642238, |
| "eval_num_tokens": 29386128.0, |
| "eval_runtime": 377.4229, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8208497166633606, |
| "eval_steps_per_second": 0.156, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.2173819279706706, |
| "grad_norm": 70.27307891845703, |
| "learning_rate": 0.0002, |
| "loss": 61.6454, |
| "mean_token_accuracy": 0.8538142335083749, |
| "num_tokens": 29594127.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.2173819279706706, |
| "eval_accuracy": 0.913574755191803, |
| "eval_loss": 0.583490788936615, |
| "eval_mean_token_accuracy": 0.8212900040513378, |
| "eval_num_tokens": 29594127.0, |
| "eval_runtime": 379.3007, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8213763236999512, |
| "eval_steps_per_second": 0.156, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.232909208540004, |
| "grad_norm": 62.13954544067383, |
| "learning_rate": 0.0002, |
| "loss": 59.1434, |
| "mean_token_accuracy": 0.8582116017738978, |
| "num_tokens": 29802343.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.232909208540004, |
| "eval_accuracy": 0.9137652516365051, |
| "eval_loss": 0.5858919620513916, |
| "eval_mean_token_accuracy": 0.8212083598314706, |
| "eval_num_tokens": 29802343.0, |
| "eval_runtime": 377.8806, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8213328123092651, |
| "eval_steps_per_second": 0.156, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.248436489109338, |
| "grad_norm": 66.23678588867188, |
| "learning_rate": 0.0002, |
| "loss": 61.116, |
| "mean_token_accuracy": 0.8561682742502954, |
| "num_tokens": 30009586.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.248436489109338, |
| "eval_accuracy": 0.9121713042259216, |
| "eval_loss": 0.5878020524978638, |
| "eval_mean_token_accuracy": 0.8216136956619005, |
| "eval_num_tokens": 30009586.0, |
| "eval_runtime": 376.8649, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8216351866722107, |
| "eval_steps_per_second": 0.157, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.2639637696786714, |
| "grad_norm": 60.76675033569336, |
| "learning_rate": 0.0002, |
| "loss": 60.841, |
| "mean_token_accuracy": 0.8568030926916335, |
| "num_tokens": 30217163.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.2639637696786714, |
| "eval_accuracy": 0.9137811660766602, |
| "eval_loss": 0.5845969915390015, |
| "eval_mean_token_accuracy": 0.8233292759475062, |
| "eval_num_tokens": 30217163.0, |
| "eval_runtime": 376.1264, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8235104084014893, |
| "eval_steps_per_second": 0.157, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.279491050248005, |
| "grad_norm": 63.56169128417969, |
| "learning_rate": 0.0002, |
| "loss": 62.1566, |
| "mean_token_accuracy": 0.8544662164317237, |
| "num_tokens": 30424795.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.279491050248005, |
| "eval_accuracy": 0.9153903126716614, |
| "eval_loss": 0.5789721608161926, |
| "eval_mean_token_accuracy": 0.8233087032528247, |
| "eval_num_tokens": 30424795.0, |
| "eval_runtime": 376.8632, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8236533999443054, |
| "eval_steps_per_second": 0.157, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.2950183308173386, |
| "grad_norm": 61.63734436035156, |
| "learning_rate": 0.0002, |
| "loss": 59.4999, |
| "mean_token_accuracy": 0.8565603908565309, |
| "num_tokens": 30631058.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.2950183308173386, |
| "eval_accuracy": 0.9179164171218872, |
| "eval_loss": 0.5795318484306335, |
| "eval_mean_token_accuracy": 0.8219599077257059, |
| "eval_num_tokens": 30631058.0, |
| "eval_runtime": 376.0542, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8221614360809326, |
| "eval_steps_per_second": 0.157, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.3105456113866722, |
| "grad_norm": 69.45445251464844, |
| "learning_rate": 0.0002, |
| "loss": 59.375, |
| "mean_token_accuracy": 0.8604613890250524, |
| "num_tokens": 30837776.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.3105456113866722, |
| "eval_accuracy": 0.9182299375534058, |
| "eval_loss": 0.5823805928230286, |
| "eval_mean_token_accuracy": 0.821157922179012, |
| "eval_num_tokens": 30837776.0, |
| "eval_runtime": 378.0329, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8211876153945923, |
| "eval_steps_per_second": 0.156, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.326072891956006, |
| "grad_norm": 66.84317779541016, |
| "learning_rate": 0.0002, |
| "loss": 59.1895, |
| "mean_token_accuracy": 0.8586011686258845, |
| "num_tokens": 31044311.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.326072891956006, |
| "eval_accuracy": 0.9164312481880188, |
| "eval_loss": 0.5855478644371033, |
| "eval_mean_token_accuracy": 0.8209437596595893, |
| "eval_num_tokens": 31044311.0, |
| "eval_runtime": 377.7454, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8210227489471436, |
| "eval_steps_per_second": 0.156, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.3416001725253395, |
| "grad_norm": 68.1597900390625, |
| "learning_rate": 0.0002, |
| "loss": 58.9604, |
| "mean_token_accuracy": 0.8597514041595988, |
| "num_tokens": 31250115.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.3416001725253395, |
| "eval_accuracy": 0.9156810641288757, |
| "eval_loss": 0.5829243659973145, |
| "eval_mean_token_accuracy": 0.8219761605990135, |
| "eval_num_tokens": 31250115.0, |
| "eval_runtime": 375.7945, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8221800327301025, |
| "eval_steps_per_second": 0.157, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.357127453094673, |
| "grad_norm": 59.72486114501953, |
| "learning_rate": 0.0002, |
| "loss": 60.7109, |
| "mean_token_accuracy": 0.8554418202903535, |
| "num_tokens": 31456983.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.357127453094673, |
| "eval_accuracy": 0.9161943197250366, |
| "eval_loss": 0.5806098580360413, |
| "eval_mean_token_accuracy": 0.8221783668307935, |
| "eval_num_tokens": 31456983.0, |
| "eval_runtime": 376.3052, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8223862051963806, |
| "eval_steps_per_second": 0.157, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.3726547336640067, |
| "grad_norm": 63.68416213989258, |
| "learning_rate": 0.0002, |
| "loss": 60.4995, |
| "mean_token_accuracy": 0.8567898546655973, |
| "num_tokens": 31663695.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.3726547336640067, |
| "eval_accuracy": 0.9144033193588257, |
| "eval_loss": 0.5808287858963013, |
| "eval_mean_token_accuracy": 0.8221847243228201, |
| "eval_num_tokens": 31663695.0, |
| "eval_runtime": 375.8537, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8224231600761414, |
| "eval_steps_per_second": 0.157, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.3881820142333403, |
| "grad_norm": 66.3195571899414, |
| "learning_rate": 0.0002, |
| "loss": 59.3467, |
| "mean_token_accuracy": 0.8589559578233295, |
| "num_tokens": 31871132.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.3881820142333403, |
| "eval_accuracy": 0.9142367243766785, |
| "eval_loss": 0.582304835319519, |
| "eval_mean_token_accuracy": 0.8218212117583065, |
| "eval_num_tokens": 31871132.0, |
| "eval_runtime": 378.5102, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.821976363658905, |
| "eval_steps_per_second": 0.156, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.403709294802674, |
| "grad_norm": 63.1912956237793, |
| "learning_rate": 0.0002, |
| "loss": 60.1586, |
| "mean_token_accuracy": 0.8586585571368536, |
| "num_tokens": 32078640.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.403709294802674, |
| "eval_accuracy": 0.9132453799247742, |
| "eval_loss": 0.5819085240364075, |
| "eval_mean_token_accuracy": 0.8220064892607221, |
| "eval_num_tokens": 32078640.0, |
| "eval_runtime": 376.863, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8221517205238342, |
| "eval_steps_per_second": 0.157, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.4192365753720075, |
| "grad_norm": 62.005027770996094, |
| "learning_rate": 0.0002, |
| "loss": 58.8058, |
| "mean_token_accuracy": 0.859383262693882, |
| "num_tokens": 32285763.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.4192365753720075, |
| "eval_accuracy": 0.9139454960823059, |
| "eval_loss": 0.5791954398155212, |
| "eval_mean_token_accuracy": 0.8221665156089654, |
| "eval_num_tokens": 32285763.0, |
| "eval_runtime": 374.5862, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.8223824501037598, |
| "eval_steps_per_second": 0.158, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.434763855941341, |
| "grad_norm": 60.02232360839844, |
| "learning_rate": 0.0002, |
| "loss": 59.8911, |
| "mean_token_accuracy": 0.8577538381020228, |
| "num_tokens": 32492600.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.434763855941341, |
| "eval_accuracy": 0.9135267734527588, |
| "eval_loss": 0.5797085165977478, |
| "eval_mean_token_accuracy": 0.8222226146924294, |
| "eval_num_tokens": 32492600.0, |
| "eval_runtime": 377.7332, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8223224878311157, |
| "eval_steps_per_second": 0.156, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.4502911365106748, |
| "grad_norm": 64.57367706298828, |
| "learning_rate": 0.0002, |
| "loss": 61.3605, |
| "mean_token_accuracy": 0.8551682407657305, |
| "num_tokens": 32700665.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.4502911365106748, |
| "eval_accuracy": 0.9132024645805359, |
| "eval_loss": 0.5817025303840637, |
| "eval_mean_token_accuracy": 0.8226864863250215, |
| "eval_num_tokens": 32700665.0, |
| "eval_runtime": 375.9636, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8228164911270142, |
| "eval_steps_per_second": 0.157, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.465818417080009, |
| "grad_norm": 63.6982307434082, |
| "learning_rate": 0.0002, |
| "loss": 60.9457, |
| "mean_token_accuracy": 0.8559256659613715, |
| "num_tokens": 32908536.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.465818417080009, |
| "eval_accuracy": 0.9130781292915344, |
| "eval_loss": 0.5840949416160583, |
| "eval_mean_token_accuracy": 0.8226226517709635, |
| "eval_num_tokens": 32908536.0, |
| "eval_runtime": 391.5379, |
| "eval_samples_per_second": 0.299, |
| "eval_self_calculate_token_accuracy": 0.8226993083953857, |
| "eval_steps_per_second": 0.151, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.4813456976493424, |
| "grad_norm": 61.44901657104492, |
| "learning_rate": 0.0002, |
| "loss": 60.7485, |
| "mean_token_accuracy": 0.857665989961889, |
| "num_tokens": 33116824.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.4813456976493424, |
| "eval_accuracy": 0.914076566696167, |
| "eval_loss": 0.5831388235092163, |
| "eval_mean_token_accuracy": 0.8227149096585936, |
| "eval_num_tokens": 33116824.0, |
| "eval_runtime": 378.1203, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8227185010910034, |
| "eval_steps_per_second": 0.156, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.496872978218676, |
| "grad_norm": 67.32522583007812, |
| "learning_rate": 0.0002, |
| "loss": 59.7881, |
| "mean_token_accuracy": 0.8572503535283936, |
| "num_tokens": 33322935.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.496872978218676, |
| "eval_accuracy": 0.9150428175926208, |
| "eval_loss": 0.579970121383667, |
| "eval_mean_token_accuracy": 0.8239261932292227, |
| "eval_num_tokens": 33322935.0, |
| "eval_runtime": 373.405, |
| "eval_samples_per_second": 0.313, |
| "eval_self_calculate_token_accuracy": 0.8238365054130554, |
| "eval_steps_per_second": 0.158, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.5124002587880097, |
| "grad_norm": 64.82595825195312, |
| "learning_rate": 0.0002, |
| "loss": 60.2268, |
| "mean_token_accuracy": 0.8578346810407109, |
| "num_tokens": 33530086.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.5124002587880097, |
| "eval_accuracy": 0.9133766293525696, |
| "eval_loss": 0.579236626625061, |
| "eval_mean_token_accuracy": 0.8231699001991143, |
| "eval_num_tokens": 33530086.0, |
| "eval_runtime": 375.1521, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.823078453540802, |
| "eval_steps_per_second": 0.157, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.5279275393573433, |
| "grad_norm": 59.321285247802734, |
| "learning_rate": 0.0002, |
| "loss": 61.0169, |
| "mean_token_accuracy": 0.8571203433805041, |
| "num_tokens": 33737219.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.5279275393573433, |
| "eval_accuracy": 0.9121658802032471, |
| "eval_loss": 0.5804930925369263, |
| "eval_mean_token_accuracy": 0.8226102519843538, |
| "eval_num_tokens": 33737219.0, |
| "eval_runtime": 376.2711, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.822685182094574, |
| "eval_steps_per_second": 0.157, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.543454819926677, |
| "grad_norm": 63.720611572265625, |
| "learning_rate": 0.0002, |
| "loss": 61.5548, |
| "mean_token_accuracy": 0.8551921190487014, |
| "num_tokens": 33944653.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.543454819926677, |
| "eval_accuracy": 0.915259838104248, |
| "eval_loss": 0.5791561603546143, |
| "eval_mean_token_accuracy": 0.8228190127065627, |
| "eval_num_tokens": 33944653.0, |
| "eval_runtime": 378.2553, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8230419754981995, |
| "eval_steps_per_second": 0.156, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.5589821004960105, |
| "grad_norm": 66.5975341796875, |
| "learning_rate": 0.0002, |
| "loss": 60.072, |
| "mean_token_accuracy": 0.857292029592726, |
| "num_tokens": 34151500.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.5589821004960105, |
| "eval_accuracy": 0.9157586693763733, |
| "eval_loss": 0.5754640102386475, |
| "eval_mean_token_accuracy": 0.8230840957770913, |
| "eval_num_tokens": 34151500.0, |
| "eval_runtime": 379.6285, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8231777548789978, |
| "eval_steps_per_second": 0.155, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.574509381065344, |
| "grad_norm": 63.1597785949707, |
| "learning_rate": 0.0002, |
| "loss": 59.5906, |
| "mean_token_accuracy": 0.8593710619542334, |
| "num_tokens": 34357326.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.574509381065344, |
| "eval_accuracy": 0.916456401348114, |
| "eval_loss": 0.5754194855690002, |
| "eval_mean_token_accuracy": 0.8231336615853391, |
| "eval_num_tokens": 34357326.0, |
| "eval_runtime": 377.6339, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8233343958854675, |
| "eval_steps_per_second": 0.156, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.5900366616346777, |
| "grad_norm": 64.11986541748047, |
| "learning_rate": 0.0002, |
| "loss": 59.0801, |
| "mean_token_accuracy": 0.8609710228112009, |
| "num_tokens": 34563987.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.5900366616346777, |
| "eval_accuracy": 0.9162663221359253, |
| "eval_loss": 0.5782102942466736, |
| "eval_mean_token_accuracy": 0.8221433859760479, |
| "eval_num_tokens": 34563987.0, |
| "eval_runtime": 385.5465, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8223274946212769, |
| "eval_steps_per_second": 0.153, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.6055639422040113, |
| "grad_norm": 64.44429016113281, |
| "learning_rate": 0.0002, |
| "loss": 58.6388, |
| "mean_token_accuracy": 0.8612651899456978, |
| "num_tokens": 34770635.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.6055639422040113, |
| "eval_accuracy": 0.9142144918441772, |
| "eval_loss": 0.5827234983444214, |
| "eval_mean_token_accuracy": 0.8217781176001339, |
| "eval_num_tokens": 34770635.0, |
| "eval_runtime": 387.7134, |
| "eval_samples_per_second": 0.302, |
| "eval_self_calculate_token_accuracy": 0.8219717144966125, |
| "eval_steps_per_second": 0.152, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.621091222773345, |
| "grad_norm": 63.094879150390625, |
| "learning_rate": 0.0002, |
| "loss": 59.0826, |
| "mean_token_accuracy": 0.8590661916467879, |
| "num_tokens": 34977595.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.621091222773345, |
| "eval_accuracy": 0.9146907329559326, |
| "eval_loss": 0.5826783776283264, |
| "eval_mean_token_accuracy": 0.8227395758790484, |
| "eval_num_tokens": 34977595.0, |
| "eval_runtime": 383.6212, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8227822780609131, |
| "eval_steps_per_second": 0.154, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.6366185033426786, |
| "grad_norm": 59.84916305541992, |
| "learning_rate": 0.0002, |
| "loss": 59.6609, |
| "mean_token_accuracy": 0.8595311087038782, |
| "num_tokens": 35185133.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.6366185033426786, |
| "eval_accuracy": 0.91529381275177, |
| "eval_loss": 0.579103946685791, |
| "eval_mean_token_accuracy": 0.8233776840112977, |
| "eval_num_tokens": 35185133.0, |
| "eval_runtime": 386.2325, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8234617114067078, |
| "eval_steps_per_second": 0.153, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.652145783912012, |
| "grad_norm": 67.7462158203125, |
| "learning_rate": 0.0002, |
| "loss": 59.3102, |
| "mean_token_accuracy": 0.8568058676189847, |
| "num_tokens": 35392759.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.652145783912012, |
| "eval_accuracy": 0.9156889319419861, |
| "eval_loss": 0.5762985348701477, |
| "eval_mean_token_accuracy": 0.8240575093334004, |
| "eval_num_tokens": 35392759.0, |
| "eval_runtime": 386.2716, |
| "eval_samples_per_second": 0.303, |
| "eval_self_calculate_token_accuracy": 0.8240380883216858, |
| "eval_steps_per_second": 0.153, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.667673064481346, |
| "grad_norm": 64.19216918945312, |
| "learning_rate": 0.0002, |
| "loss": 60.8611, |
| "mean_token_accuracy": 0.8562094701661004, |
| "num_tokens": 35600136.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.667673064481346, |
| "eval_accuracy": 0.9155153632164001, |
| "eval_loss": 0.5786460041999817, |
| "eval_mean_token_accuracy": 0.8234322758044227, |
| "eval_num_tokens": 35600136.0, |
| "eval_runtime": 389.1334, |
| "eval_samples_per_second": 0.301, |
| "eval_self_calculate_token_accuracy": 0.8234621286392212, |
| "eval_steps_per_second": 0.152, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.6832003450506794, |
| "grad_norm": 71.51544952392578, |
| "learning_rate": 0.0002, |
| "loss": 58.5023, |
| "mean_token_accuracy": 0.858416483634048, |
| "num_tokens": 35807174.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.6832003450506794, |
| "eval_accuracy": 0.9166020750999451, |
| "eval_loss": 0.5781487226486206, |
| "eval_mean_token_accuracy": 0.8228686158939943, |
| "eval_num_tokens": 35807174.0, |
| "eval_runtime": 384.7605, |
| "eval_samples_per_second": 0.304, |
| "eval_self_calculate_token_accuracy": 0.8228632807731628, |
| "eval_steps_per_second": 0.153, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.698727625620013, |
| "grad_norm": 67.23090362548828, |
| "learning_rate": 0.0002, |
| "loss": 61.9709, |
| "mean_token_accuracy": 0.8536673039197922, |
| "num_tokens": 36014599.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.698727625620013, |
| "eval_accuracy": 0.9179655909538269, |
| "eval_loss": 0.5757537484169006, |
| "eval_mean_token_accuracy": 0.8239026453535435, |
| "eval_num_tokens": 36014599.0, |
| "eval_runtime": 392.6572, |
| "eval_samples_per_second": 0.298, |
| "eval_self_calculate_token_accuracy": 0.8239191174507141, |
| "eval_steps_per_second": 0.15, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.7142549061893466, |
| "grad_norm": 61.88078689575195, |
| "learning_rate": 0.0002, |
| "loss": 60.0907, |
| "mean_token_accuracy": 0.8580549135804176, |
| "num_tokens": 36222379.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.7142549061893466, |
| "eval_accuracy": 0.9169703722000122, |
| "eval_loss": 0.5794158577919006, |
| "eval_mean_token_accuracy": 0.8231041673886574, |
| "eval_num_tokens": 36222379.0, |
| "eval_runtime": 390.4829, |
| "eval_samples_per_second": 0.3, |
| "eval_self_calculate_token_accuracy": 0.8231915235519409, |
| "eval_steps_per_second": 0.151, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.7297821867586802, |
| "grad_norm": 66.80476379394531, |
| "learning_rate": 0.0002, |
| "loss": 60.4112, |
| "mean_token_accuracy": 0.8569959965017107, |
| "num_tokens": 36428980.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.7297821867586802, |
| "eval_accuracy": 0.9168707728385925, |
| "eval_loss": 0.5812435150146484, |
| "eval_mean_token_accuracy": 0.8239551580558389, |
| "eval_num_tokens": 36428980.0, |
| "eval_runtime": 394.9578, |
| "eval_samples_per_second": 0.296, |
| "eval_self_calculate_token_accuracy": 0.8240401744842529, |
| "eval_steps_per_second": 0.149, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.745309467328014, |
| "grad_norm": 63.65299606323242, |
| "learning_rate": 0.0002, |
| "loss": 59.9688, |
| "mean_token_accuracy": 0.8576788521475263, |
| "num_tokens": 36636539.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.745309467328014, |
| "eval_accuracy": 0.9142547249794006, |
| "eval_loss": 0.5820722579956055, |
| "eval_mean_token_accuracy": 0.8220737940174038, |
| "eval_num_tokens": 36636539.0, |
| "eval_runtime": 383.4421, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8222988843917847, |
| "eval_steps_per_second": 0.154, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.7608367478973475, |
| "grad_norm": 69.63920593261719, |
| "learning_rate": 0.0002, |
| "loss": 61.1328, |
| "mean_token_accuracy": 0.8568937960598204, |
| "num_tokens": 36843790.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.7608367478973475, |
| "eval_accuracy": 0.9142366051673889, |
| "eval_loss": 0.5764155387878418, |
| "eval_mean_token_accuracy": 0.8223703220739202, |
| "eval_num_tokens": 36843790.0, |
| "eval_runtime": 378.2918, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8226000070571899, |
| "eval_steps_per_second": 0.156, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.776364028466681, |
| "grad_norm": 64.85514831542969, |
| "learning_rate": 0.0002, |
| "loss": 60.4489, |
| "mean_token_accuracy": 0.856029662821028, |
| "num_tokens": 37051501.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.776364028466681, |
| "eval_accuracy": 0.9159210920333862, |
| "eval_loss": 0.5710962414741516, |
| "eval_mean_token_accuracy": 0.8246322031748496, |
| "eval_num_tokens": 37051501.0, |
| "eval_runtime": 374.7976, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.8247190713882446, |
| "eval_steps_per_second": 0.157, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.7918913090360147, |
| "grad_norm": 59.2591552734375, |
| "learning_rate": 0.0002, |
| "loss": 61.0078, |
| "mean_token_accuracy": 0.8546331119206216, |
| "num_tokens": 37260018.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.7918913090360147, |
| "eval_accuracy": 0.9162110686302185, |
| "eval_loss": 0.5732917189598083, |
| "eval_mean_token_accuracy": 0.8244240011199045, |
| "eval_num_tokens": 37260018.0, |
| "eval_runtime": 376.4825, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8245187997817993, |
| "eval_steps_per_second": 0.157, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.8074185896053483, |
| "grad_norm": 67.02713775634766, |
| "learning_rate": 0.0002, |
| "loss": 59.6261, |
| "mean_token_accuracy": 0.8582972925570276, |
| "num_tokens": 37466409.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.8074185896053483, |
| "eval_accuracy": 0.914726972579956, |
| "eval_loss": 0.5753452777862549, |
| "eval_mean_token_accuracy": 0.8251387881020368, |
| "eval_num_tokens": 37466409.0, |
| "eval_runtime": 379.2114, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8251757025718689, |
| "eval_steps_per_second": 0.156, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.822945870174682, |
| "grad_norm": 61.20984649658203, |
| "learning_rate": 0.0002, |
| "loss": 61.67, |
| "mean_token_accuracy": 0.8533147366510497, |
| "num_tokens": 37673576.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.822945870174682, |
| "eval_accuracy": 0.9145516157150269, |
| "eval_loss": 0.5774813294410706, |
| "eval_mean_token_accuracy": 0.8243369621745611, |
| "eval_num_tokens": 37673576.0, |
| "eval_runtime": 376.3271, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.824415385723114, |
| "eval_steps_per_second": 0.157, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.8384731507440155, |
| "grad_norm": 63.579246520996094, |
| "learning_rate": 0.0002, |
| "loss": 59.0117, |
| "mean_token_accuracy": 0.8594357983933555, |
| "num_tokens": 37879601.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.8384731507440155, |
| "eval_accuracy": 0.9169211387634277, |
| "eval_loss": 0.5761664509773254, |
| "eval_mean_token_accuracy": 0.8249228788634478, |
| "eval_num_tokens": 37879601.0, |
| "eval_runtime": 377.5615, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.825006365776062, |
| "eval_steps_per_second": 0.156, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.854000431313349, |
| "grad_norm": 69.8446044921875, |
| "learning_rate": 0.0002, |
| "loss": 59.1087, |
| "mean_token_accuracy": 0.8599690844615301, |
| "num_tokens": 38087045.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.854000431313349, |
| "eval_accuracy": 0.916884183883667, |
| "eval_loss": 0.5756677985191345, |
| "eval_mean_token_accuracy": 0.8243987176377895, |
| "eval_num_tokens": 38087045.0, |
| "eval_runtime": 375.1714, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.8246213793754578, |
| "eval_steps_per_second": 0.157, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.8695277118826827, |
| "grad_norm": 60.62617874145508, |
| "learning_rate": 0.0002, |
| "loss": 60.4311, |
| "mean_token_accuracy": 0.8565576093064414, |
| "num_tokens": 38295558.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.8695277118826827, |
| "eval_accuracy": 0.9174717664718628, |
| "eval_loss": 0.577273964881897, |
| "eval_mean_token_accuracy": 0.8229941624706074, |
| "eval_num_tokens": 38295558.0, |
| "eval_runtime": 377.0068, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8231734037399292, |
| "eval_steps_per_second": 0.156, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.8850549924520164, |
| "grad_norm": 71.38862609863281, |
| "learning_rate": 0.0002, |
| "loss": 59.0786, |
| "mean_token_accuracy": 0.8596089821722772, |
| "num_tokens": 38504519.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.8850549924520164, |
| "eval_accuracy": 0.9161394834518433, |
| "eval_loss": 0.5789663195610046, |
| "eval_mean_token_accuracy": 0.8226313732438169, |
| "eval_num_tokens": 38504519.0, |
| "eval_runtime": 376.3835, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8226823210716248, |
| "eval_steps_per_second": 0.157, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.90058227302135, |
| "grad_norm": 60.55780029296875, |
| "learning_rate": 0.0002, |
| "loss": 60.4187, |
| "mean_token_accuracy": 0.8570658514897028, |
| "num_tokens": 38712335.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.90058227302135, |
| "eval_accuracy": 0.9142947196960449, |
| "eval_loss": 0.5783673524856567, |
| "eval_mean_token_accuracy": 0.8228776414515608, |
| "eval_num_tokens": 38712335.0, |
| "eval_runtime": 377.1852, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8231444358825684, |
| "eval_steps_per_second": 0.156, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.9161095535906836, |
| "grad_norm": 70.5000228881836, |
| "learning_rate": 0.0002, |
| "loss": 62.0128, |
| "mean_token_accuracy": 0.8542094528675079, |
| "num_tokens": 38920127.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.9161095535906836, |
| "eval_accuracy": 0.913066029548645, |
| "eval_loss": 0.5754489302635193, |
| "eval_mean_token_accuracy": 0.8232920381982448, |
| "eval_num_tokens": 38920127.0, |
| "eval_runtime": 376.8321, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8234686851501465, |
| "eval_steps_per_second": 0.157, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.931636834160017, |
| "grad_norm": 61.92482376098633, |
| "learning_rate": 0.0002, |
| "loss": 59.7356, |
| "mean_token_accuracy": 0.8573247475756539, |
| "num_tokens": 39127449.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.931636834160017, |
| "eval_accuracy": 0.9139809012413025, |
| "eval_loss": 0.5746923089027405, |
| "eval_mean_token_accuracy": 0.8236672767138077, |
| "eval_num_tokens": 39127449.0, |
| "eval_runtime": 382.9363, |
| "eval_samples_per_second": 0.306, |
| "eval_self_calculate_token_accuracy": 0.8237864971160889, |
| "eval_steps_per_second": 0.154, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.947164114729351, |
| "grad_norm": 65.38888549804688, |
| "learning_rate": 0.0002, |
| "loss": 59.9382, |
| "mean_token_accuracy": 0.8584683355357912, |
| "num_tokens": 39335671.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.947164114729351, |
| "eval_accuracy": 0.9120795726776123, |
| "eval_loss": 0.5764152407646179, |
| "eval_mean_token_accuracy": 0.8227895861965114, |
| "eval_num_tokens": 39335671.0, |
| "eval_runtime": 377.9027, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8228674530982971, |
| "eval_steps_per_second": 0.156, |
| "step": 191 |
| }, |
| { |
| "epoch": 2.9626913952986844, |
| "grad_norm": 60.89480972290039, |
| "learning_rate": 0.0002, |
| "loss": 59.4849, |
| "mean_token_accuracy": 0.8587185525231891, |
| "num_tokens": 39542878.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.9626913952986844, |
| "eval_accuracy": 0.9117822051048279, |
| "eval_loss": 0.5787470936775208, |
| "eval_mean_token_accuracy": 0.8223738872398765, |
| "eval_num_tokens": 39542878.0, |
| "eval_runtime": 379.5622, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8225799798965454, |
| "eval_steps_per_second": 0.155, |
| "step": 192 |
| }, |
| { |
| "epoch": 2.978218675868018, |
| "grad_norm": 71.10404968261719, |
| "learning_rate": 0.0002, |
| "loss": 59.1488, |
| "mean_token_accuracy": 0.8590359530515141, |
| "num_tokens": 39749784.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.978218675868018, |
| "eval_accuracy": 0.9117515087127686, |
| "eval_loss": 0.5786497592926025, |
| "eval_mean_token_accuracy": 0.8221206079095097, |
| "eval_num_tokens": 39749784.0, |
| "eval_runtime": 379.3419, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8222964406013489, |
| "eval_steps_per_second": 0.156, |
| "step": 193 |
| }, |
| { |
| "epoch": 2.9937459564373516, |
| "grad_norm": 73.1333236694336, |
| "learning_rate": 0.0002, |
| "loss": 59.3376, |
| "mean_token_accuracy": 0.8588751927018166, |
| "num_tokens": 39957634.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 2.9937459564373516, |
| "eval_accuracy": 0.9129020571708679, |
| "eval_loss": 0.5747383236885071, |
| "eval_mean_token_accuracy": 0.8235553846520892, |
| "eval_num_tokens": 39957634.0, |
| "eval_runtime": 380.9566, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8238747119903564, |
| "eval_steps_per_second": 0.155, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 33.0853385925293, |
| "learning_rate": 0.0002, |
| "loss": 22.9442, |
| "mean_token_accuracy": 0.8632007796188881, |
| "num_tokens": 40040826.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_accuracy": 0.9123663902282715, |
| "eval_loss": 0.573932945728302, |
| "eval_mean_token_accuracy": 0.8233134362657192, |
| "eval_num_tokens": 40040826.0, |
| "eval_runtime": 383.8543, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8235105872154236, |
| "eval_steps_per_second": 0.154, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.0155272805693336, |
| "grad_norm": 66.18367767333984, |
| "learning_rate": 0.0002, |
| "loss": 55.6517, |
| "mean_token_accuracy": 0.8654904382096397, |
| "num_tokens": 40248044.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.0155272805693336, |
| "eval_accuracy": 0.91349196434021, |
| "eval_loss": 0.5772708058357239, |
| "eval_mean_token_accuracy": 0.8231487900523816, |
| "eval_num_tokens": 40248044.0, |
| "eval_runtime": 378.4412, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.823326826095581, |
| "eval_steps_per_second": 0.156, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.0310545611386672, |
| "grad_norm": 73.5784912109375, |
| "learning_rate": 0.0002, |
| "loss": 54.8685, |
| "mean_token_accuracy": 0.866837765607569, |
| "num_tokens": 40454135.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.0310545611386672, |
| "eval_accuracy": 0.911002516746521, |
| "eval_loss": 0.5866868495941162, |
| "eval_mean_token_accuracy": 0.8210902123127953, |
| "eval_num_tokens": 40454135.0, |
| "eval_runtime": 395.6767, |
| "eval_samples_per_second": 0.296, |
| "eval_self_calculate_token_accuracy": 0.8213343024253845, |
| "eval_steps_per_second": 0.149, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.046581841708001, |
| "grad_norm": 63.88504409790039, |
| "learning_rate": 0.0002, |
| "loss": 55.6098, |
| "mean_token_accuracy": 0.8663951067460908, |
| "num_tokens": 40662194.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.046581841708001, |
| "eval_accuracy": 0.9095934629440308, |
| "eval_loss": 0.5961205363273621, |
| "eval_mean_token_accuracy": 0.8199389384964765, |
| "eval_num_tokens": 40662194.0, |
| "eval_runtime": 377.4134, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8201433420181274, |
| "eval_steps_per_second": 0.156, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.0621091222773344, |
| "grad_norm": 106.22503662109375, |
| "learning_rate": 0.0002, |
| "loss": 56.3476, |
| "mean_token_accuracy": 0.864356574912866, |
| "num_tokens": 40869393.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.0621091222773344, |
| "eval_accuracy": 0.9093540906906128, |
| "eval_loss": 0.5963947176933289, |
| "eval_mean_token_accuracy": 0.8198688303009939, |
| "eval_num_tokens": 40869393.0, |
| "eval_runtime": 377.3528, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8199730515480042, |
| "eval_steps_per_second": 0.156, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.077636402846668, |
| "grad_norm": 68.98870849609375, |
| "learning_rate": 0.0002, |
| "loss": 54.7474, |
| "mean_token_accuracy": 0.8671221493018998, |
| "num_tokens": 41077054.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.077636402846668, |
| "eval_accuracy": 0.9102476239204407, |
| "eval_loss": 0.5848191380500793, |
| "eval_mean_token_accuracy": 0.8221132896714292, |
| "eval_num_tokens": 41077054.0, |
| "eval_runtime": 376.1662, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8223957419395447, |
| "eval_steps_per_second": 0.157, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.0931636834160017, |
| "grad_norm": 58.32075500488281, |
| "learning_rate": 0.0002, |
| "loss": 56.2661, |
| "mean_token_accuracy": 0.8653301331732008, |
| "num_tokens": 41283987.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.0931636834160017, |
| "eval_accuracy": 0.9114280939102173, |
| "eval_loss": 0.5753390789031982, |
| "eval_mean_token_accuracy": 0.8232856887882039, |
| "eval_num_tokens": 41283987.0, |
| "eval_runtime": 378.8066, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8237543702125549, |
| "eval_steps_per_second": 0.156, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.1086909639853353, |
| "grad_norm": 65.81951141357422, |
| "learning_rate": 0.0002, |
| "loss": 56.3244, |
| "mean_token_accuracy": 0.8644906000958549, |
| "num_tokens": 41492407.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.1086909639853353, |
| "eval_accuracy": 0.9108811616897583, |
| "eval_loss": 0.5719010829925537, |
| "eval_mean_token_accuracy": 0.8240326440940469, |
| "eval_num_tokens": 41492407.0, |
| "eval_runtime": 378.9033, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8245309591293335, |
| "eval_steps_per_second": 0.156, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.124218244554669, |
| "grad_norm": 63.15560531616211, |
| "learning_rate": 0.0002, |
| "loss": 54.9811, |
| "mean_token_accuracy": 0.8657895475625992, |
| "num_tokens": 41700208.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.124218244554669, |
| "eval_accuracy": 0.9085500836372375, |
| "eval_loss": 0.5785896182060242, |
| "eval_mean_token_accuracy": 0.8225748205589036, |
| "eval_num_tokens": 41700208.0, |
| "eval_runtime": 379.0302, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8229347467422485, |
| "eval_steps_per_second": 0.156, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.1397455251240025, |
| "grad_norm": 62.2550048828125, |
| "learning_rate": 0.0002, |
| "loss": 57.5056, |
| "mean_token_accuracy": 0.8613045050038232, |
| "num_tokens": 41907585.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.1397455251240025, |
| "eval_accuracy": 0.9109401702880859, |
| "eval_loss": 0.583422839641571, |
| "eval_mean_token_accuracy": 0.8213994654558473, |
| "eval_num_tokens": 41907585.0, |
| "eval_runtime": 375.7943, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.821713924407959, |
| "eval_steps_per_second": 0.157, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.155272805693336, |
| "grad_norm": 59.995948791503906, |
| "learning_rate": 0.0002, |
| "loss": 55.0102, |
| "mean_token_accuracy": 0.8676944937970903, |
| "num_tokens": 42115843.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.155272805693336, |
| "eval_accuracy": 0.9110279083251953, |
| "eval_loss": 0.5866342782974243, |
| "eval_mean_token_accuracy": 0.8209789587279498, |
| "eval_num_tokens": 42115843.0, |
| "eval_runtime": 377.1177, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8212244510650635, |
| "eval_steps_per_second": 0.156, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.1708000862626697, |
| "grad_norm": 62.09001541137695, |
| "learning_rate": 0.0002, |
| "loss": 54.9099, |
| "mean_token_accuracy": 0.8654570579528809, |
| "num_tokens": 42322105.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.1708000862626697, |
| "eval_accuracy": 0.9133650660514832, |
| "eval_loss": 0.586047887802124, |
| "eval_mean_token_accuracy": 0.8217764355368533, |
| "eval_num_tokens": 42322105.0, |
| "eval_runtime": 378.4016, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8220511078834534, |
| "eval_steps_per_second": 0.156, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.1863273668320033, |
| "grad_norm": 70.84227752685547, |
| "learning_rate": 0.0002, |
| "loss": 56.3533, |
| "mean_token_accuracy": 0.8640556451347139, |
| "num_tokens": 42528905.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.1863273668320033, |
| "eval_accuracy": 0.9130482077598572, |
| "eval_loss": 0.5843234062194824, |
| "eval_mean_token_accuracy": 0.8230270036196304, |
| "eval_num_tokens": 42528905.0, |
| "eval_runtime": 383.8916, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8232707977294922, |
| "eval_steps_per_second": 0.154, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.201854647401337, |
| "grad_norm": 61.541683197021484, |
| "learning_rate": 0.0002, |
| "loss": 56.2426, |
| "mean_token_accuracy": 0.8643795359465811, |
| "num_tokens": 42736274.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.201854647401337, |
| "eval_accuracy": 0.9138363003730774, |
| "eval_loss": 0.5839865207672119, |
| "eval_mean_token_accuracy": 0.8230968410685912, |
| "eval_num_tokens": 42736274.0, |
| "eval_runtime": 379.6266, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8231610059738159, |
| "eval_steps_per_second": 0.155, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.2173819279706706, |
| "grad_norm": 63.05863952636719, |
| "learning_rate": 0.0002, |
| "loss": 56.2315, |
| "mean_token_accuracy": 0.8659966612855593, |
| "num_tokens": 42943915.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.2173819279706706, |
| "eval_accuracy": 0.9147468209266663, |
| "eval_loss": 0.581639289855957, |
| "eval_mean_token_accuracy": 0.8230537267054542, |
| "eval_num_tokens": 42943915.0, |
| "eval_runtime": 378.8718, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8232078552246094, |
| "eval_steps_per_second": 0.156, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.232909208540004, |
| "grad_norm": 63.38664245605469, |
| "learning_rate": 0.0002, |
| "loss": 56.475, |
| "mean_token_accuracy": 0.8632564470171928, |
| "num_tokens": 43151583.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.232909208540004, |
| "eval_accuracy": 0.9132506847381592, |
| "eval_loss": 0.579624593257904, |
| "eval_mean_token_accuracy": 0.8228688593638145, |
| "eval_num_tokens": 43151583.0, |
| "eval_runtime": 381.5815, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8229358196258545, |
| "eval_steps_per_second": 0.155, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.248436489109338, |
| "grad_norm": 62.078468322753906, |
| "learning_rate": 0.0002, |
| "loss": 56.5026, |
| "mean_token_accuracy": 0.8635616219706006, |
| "num_tokens": 43359652.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.248436489109338, |
| "eval_accuracy": 0.9131081700325012, |
| "eval_loss": 0.5795032978057861, |
| "eval_mean_token_accuracy": 0.8223043316501683, |
| "eval_num_tokens": 43359652.0, |
| "eval_runtime": 379.7072, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8224065899848938, |
| "eval_steps_per_second": 0.155, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.2639637696786714, |
| "grad_norm": 62.72001266479492, |
| "learning_rate": 0.0002, |
| "loss": 55.2883, |
| "mean_token_accuracy": 0.8654305471314324, |
| "num_tokens": 43567071.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.2639637696786714, |
| "eval_accuracy": 0.9126359224319458, |
| "eval_loss": 0.5812863707542419, |
| "eval_mean_token_accuracy": 0.8227268174543219, |
| "eval_num_tokens": 43567071.0, |
| "eval_runtime": 399.4547, |
| "eval_samples_per_second": 0.293, |
| "eval_self_calculate_token_accuracy": 0.8228524923324585, |
| "eval_steps_per_second": 0.148, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.279491050248005, |
| "grad_norm": 67.08480072021484, |
| "learning_rate": 0.0002, |
| "loss": 55.0584, |
| "mean_token_accuracy": 0.8671173809303178, |
| "num_tokens": 43774741.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.279491050248005, |
| "eval_accuracy": 0.9113554358482361, |
| "eval_loss": 0.5829761028289795, |
| "eval_mean_token_accuracy": 0.8231498770794626, |
| "eval_num_tokens": 43774741.0, |
| "eval_runtime": 392.8756, |
| "eval_samples_per_second": 0.298, |
| "eval_self_calculate_token_accuracy": 0.8231012225151062, |
| "eval_steps_per_second": 0.15, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.2950183308173386, |
| "grad_norm": 67.65885925292969, |
| "learning_rate": 0.0002, |
| "loss": 56.0351, |
| "mean_token_accuracy": 0.8633211511704657, |
| "num_tokens": 43981768.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.2950183308173386, |
| "eval_accuracy": 0.9130526781082153, |
| "eval_loss": 0.5833492875099182, |
| "eval_mean_token_accuracy": 0.8239475266408112, |
| "eval_num_tokens": 43981768.0, |
| "eval_runtime": 375.7834, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8239747285842896, |
| "eval_steps_per_second": 0.157, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.3105456113866722, |
| "grad_norm": 60.63387680053711, |
| "learning_rate": 0.0002, |
| "loss": 55.0683, |
| "mean_token_accuracy": 0.8653735121091207, |
| "num_tokens": 44189100.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.3105456113866722, |
| "eval_accuracy": 0.91328364610672, |
| "eval_loss": 0.5839062333106995, |
| "eval_mean_token_accuracy": 0.8237987110170267, |
| "eval_num_tokens": 44189100.0, |
| "eval_runtime": 381.4899, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8238835334777832, |
| "eval_steps_per_second": 0.155, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.326072891956006, |
| "grad_norm": 72.77201843261719, |
| "learning_rate": 0.0002, |
| "loss": 55.8453, |
| "mean_token_accuracy": 0.8656196859147813, |
| "num_tokens": 44397071.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.326072891956006, |
| "eval_accuracy": 0.9145507216453552, |
| "eval_loss": 0.583573043346405, |
| "eval_mean_token_accuracy": 0.8239296886880519, |
| "eval_num_tokens": 44397071.0, |
| "eval_runtime": 381.7093, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8241197466850281, |
| "eval_steps_per_second": 0.155, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.3416001725253395, |
| "grad_norm": 64.93408203125, |
| "learning_rate": 0.0002, |
| "loss": 56.5595, |
| "mean_token_accuracy": 0.8625055137607787, |
| "num_tokens": 44605119.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.3416001725253395, |
| "eval_accuracy": 0.9155197143554688, |
| "eval_loss": 0.5840860605239868, |
| "eval_mean_token_accuracy": 0.8220722836963201, |
| "eval_num_tokens": 44605119.0, |
| "eval_runtime": 378.3657, |
| "eval_samples_per_second": 0.309, |
| "eval_self_calculate_token_accuracy": 0.8221904635429382, |
| "eval_steps_per_second": 0.156, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.357127453094673, |
| "grad_norm": 62.84563446044922, |
| "learning_rate": 0.0002, |
| "loss": 56.7382, |
| "mean_token_accuracy": 0.8644265648391511, |
| "num_tokens": 44812624.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.357127453094673, |
| "eval_accuracy": 0.9162209630012512, |
| "eval_loss": 0.5839123129844666, |
| "eval_mean_token_accuracy": 0.8221392904297781, |
| "eval_num_tokens": 44812624.0, |
| "eval_runtime": 374.768, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.8222371935844421, |
| "eval_steps_per_second": 0.157, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.3726547336640067, |
| "grad_norm": 65.49542236328125, |
| "learning_rate": 0.0002, |
| "loss": 55.4435, |
| "mean_token_accuracy": 0.8653364992803998, |
| "num_tokens": 45019484.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.3726547336640067, |
| "eval_accuracy": 0.914725124835968, |
| "eval_loss": 0.5826436281204224, |
| "eval_mean_token_accuracy": 0.8221658781423407, |
| "eval_num_tokens": 45019484.0, |
| "eval_runtime": 376.8842, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8223792314529419, |
| "eval_steps_per_second": 0.157, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.3881820142333403, |
| "grad_norm": 60.367271423339844, |
| "learning_rate": 0.0002, |
| "loss": 56.8741, |
| "mean_token_accuracy": 0.8633375614881516, |
| "num_tokens": 45227985.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.3881820142333403, |
| "eval_accuracy": 0.9146124720573425, |
| "eval_loss": 0.5825509428977966, |
| "eval_mean_token_accuracy": 0.8229841973821995, |
| "eval_num_tokens": 45227985.0, |
| "eval_runtime": 381.6859, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8230305314064026, |
| "eval_steps_per_second": 0.155, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.403709294802674, |
| "grad_norm": 62.41489791870117, |
| "learning_rate": 0.0002, |
| "loss": 55.8382, |
| "mean_token_accuracy": 0.8645476698875427, |
| "num_tokens": 45435114.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.403709294802674, |
| "eval_accuracy": 0.9151422381401062, |
| "eval_loss": 0.5852887630462646, |
| "eval_mean_token_accuracy": 0.8225530143511497, |
| "eval_num_tokens": 45435114.0, |
| "eval_runtime": 380.395, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8226567506790161, |
| "eval_steps_per_second": 0.155, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.4192365753720075, |
| "grad_norm": 64.48603820800781, |
| "learning_rate": 0.0002, |
| "loss": 56.8116, |
| "mean_token_accuracy": 0.8632212297783958, |
| "num_tokens": 45641878.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.4192365753720075, |
| "eval_accuracy": 0.9151667356491089, |
| "eval_loss": 0.5857149958610535, |
| "eval_mean_token_accuracy": 0.8230835199356079, |
| "eval_num_tokens": 45641878.0, |
| "eval_runtime": 379.6408, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8233585953712463, |
| "eval_steps_per_second": 0.155, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.434763855941341, |
| "grad_norm": 64.93486785888672, |
| "learning_rate": 0.0002, |
| "loss": 54.6315, |
| "mean_token_accuracy": 0.8682016357779503, |
| "num_tokens": 45847805.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.434763855941341, |
| "eval_accuracy": 0.9161090850830078, |
| "eval_loss": 0.5799417495727539, |
| "eval_mean_token_accuracy": 0.8244221735808809, |
| "eval_num_tokens": 45847805.0, |
| "eval_runtime": 383.9296, |
| "eval_samples_per_second": 0.305, |
| "eval_self_calculate_token_accuracy": 0.8246127367019653, |
| "eval_steps_per_second": 0.154, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.4502911365106748, |
| "grad_norm": 58.22840881347656, |
| "learning_rate": 0.0002, |
| "loss": 54.6449, |
| "mean_token_accuracy": 0.867826946079731, |
| "num_tokens": 46054244.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.4502911365106748, |
| "eval_accuracy": 0.9181945323944092, |
| "eval_loss": 0.5751045942306519, |
| "eval_mean_token_accuracy": 0.8256005351826295, |
| "eval_num_tokens": 46054244.0, |
| "eval_runtime": 377.7859, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8256837129592896, |
| "eval_steps_per_second": 0.156, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.465818417080009, |
| "grad_norm": 63.41786575317383, |
| "learning_rate": 0.0002, |
| "loss": 55.6345, |
| "mean_token_accuracy": 0.8640015746156374, |
| "num_tokens": 46260868.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.465818417080009, |
| "eval_accuracy": 0.9175178408622742, |
| "eval_loss": 0.5766599774360657, |
| "eval_mean_token_accuracy": 0.8246506422252978, |
| "eval_num_tokens": 46260868.0, |
| "eval_runtime": 376.8092, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8247548341751099, |
| "eval_steps_per_second": 0.157, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.4813456976493424, |
| "grad_norm": 66.32717895507812, |
| "learning_rate": 0.0002, |
| "loss": 55.3015, |
| "mean_token_accuracy": 0.8670583665370941, |
| "num_tokens": 46467470.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.4813456976493424, |
| "eval_accuracy": 0.9153078198432922, |
| "eval_loss": 0.5829942226409912, |
| "eval_mean_token_accuracy": 0.8230041518049726, |
| "eval_num_tokens": 46467470.0, |
| "eval_runtime": 373.9182, |
| "eval_samples_per_second": 0.313, |
| "eval_self_calculate_token_accuracy": 0.8232405185699463, |
| "eval_steps_per_second": 0.158, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.496872978218676, |
| "grad_norm": 63.02161407470703, |
| "learning_rate": 0.0002, |
| "loss": 56.6047, |
| "mean_token_accuracy": 0.8638627363575829, |
| "num_tokens": 46674725.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.496872978218676, |
| "eval_accuracy": 0.9119344353675842, |
| "eval_loss": 0.5881844162940979, |
| "eval_mean_token_accuracy": 0.820993787151272, |
| "eval_num_tokens": 46674725.0, |
| "eval_runtime": 377.2287, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8212374448776245, |
| "eval_steps_per_second": 0.156, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.5124002587880097, |
| "grad_norm": 64.45556640625, |
| "learning_rate": 0.0002, |
| "loss": 56.3711, |
| "mean_token_accuracy": 0.8636790323588583, |
| "num_tokens": 46882089.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.5124002587880097, |
| "eval_accuracy": 0.911611020565033, |
| "eval_loss": 0.5878032445907593, |
| "eval_mean_token_accuracy": 0.8219126422526473, |
| "eval_num_tokens": 46882089.0, |
| "eval_runtime": 377.9309, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8222490549087524, |
| "eval_steps_per_second": 0.156, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.5279275393573433, |
| "grad_norm": 60.579341888427734, |
| "learning_rate": 0.0002, |
| "loss": 54.3181, |
| "mean_token_accuracy": 0.8668135288688872, |
| "num_tokens": 47089120.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.5279275393573433, |
| "eval_accuracy": 0.9109750390052795, |
| "eval_loss": 0.5834836363792419, |
| "eval_mean_token_accuracy": 0.8224863682763052, |
| "eval_num_tokens": 47089120.0, |
| "eval_runtime": 376.8053, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8227524161338806, |
| "eval_steps_per_second": 0.157, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.543454819926677, |
| "grad_norm": 61.627655029296875, |
| "learning_rate": 0.0002, |
| "loss": 55.1425, |
| "mean_token_accuracy": 0.8657807947860824, |
| "num_tokens": 47296264.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.543454819926677, |
| "eval_accuracy": 0.9130038022994995, |
| "eval_loss": 0.5790004134178162, |
| "eval_mean_token_accuracy": 0.8236208265110597, |
| "eval_num_tokens": 47296264.0, |
| "eval_runtime": 377.8866, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8238826394081116, |
| "eval_steps_per_second": 0.156, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.5589821004960105, |
| "grad_norm": 66.1114501953125, |
| "learning_rate": 0.0002, |
| "loss": 56.7701, |
| "mean_token_accuracy": 0.8640961630476846, |
| "num_tokens": 47503916.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.5589821004960105, |
| "eval_accuracy": 0.9122428894042969, |
| "eval_loss": 0.5779992341995239, |
| "eval_mean_token_accuracy": 0.8237776433007192, |
| "eval_num_tokens": 47503916.0, |
| "eval_runtime": 376.5964, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8238122463226318, |
| "eval_steps_per_second": 0.157, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.574509381065344, |
| "grad_norm": 63.54084014892578, |
| "learning_rate": 0.0002, |
| "loss": 57.4412, |
| "mean_token_accuracy": 0.8626347647772895, |
| "num_tokens": 47711271.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.574509381065344, |
| "eval_accuracy": 0.9128528833389282, |
| "eval_loss": 0.578497052192688, |
| "eval_mean_token_accuracy": 0.8236981024176387, |
| "eval_num_tokens": 47711271.0, |
| "eval_runtime": 377.4315, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8237510323524475, |
| "eval_steps_per_second": 0.156, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.5900366616346777, |
| "grad_norm": 60.96329116821289, |
| "learning_rate": 0.0002, |
| "loss": 55.1455, |
| "mean_token_accuracy": 0.8656069561839104, |
| "num_tokens": 47919227.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.5900366616346777, |
| "eval_accuracy": 0.9145339131355286, |
| "eval_loss": 0.5809924006462097, |
| "eval_mean_token_accuracy": 0.8235348473160954, |
| "eval_num_tokens": 47919227.0, |
| "eval_runtime": 373.4822, |
| "eval_samples_per_second": 0.313, |
| "eval_self_calculate_token_accuracy": 0.8235819339752197, |
| "eval_steps_per_second": 0.158, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.6055639422040113, |
| "grad_norm": 69.4461441040039, |
| "learning_rate": 0.0002, |
| "loss": 54.9709, |
| "mean_token_accuracy": 0.8667028173804283, |
| "num_tokens": 48126984.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.6055639422040113, |
| "eval_accuracy": 0.913782000541687, |
| "eval_loss": 0.5834547877311707, |
| "eval_mean_token_accuracy": 0.8238058080107479, |
| "eval_num_tokens": 48126984.0, |
| "eval_runtime": 377.6661, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8236949443817139, |
| "eval_steps_per_second": 0.156, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.621091222773345, |
| "grad_norm": 65.8593521118164, |
| "learning_rate": 0.0002, |
| "loss": 57.4017, |
| "mean_token_accuracy": 0.8616026639938354, |
| "num_tokens": 48334289.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 3.621091222773345, |
| "eval_accuracy": 0.913994550704956, |
| "eval_loss": 0.5811523199081421, |
| "eval_mean_token_accuracy": 0.8241685204586741, |
| "eval_num_tokens": 48334289.0, |
| "eval_runtime": 381.4805, |
| "eval_samples_per_second": 0.307, |
| "eval_self_calculate_token_accuracy": 0.8242776393890381, |
| "eval_steps_per_second": 0.155, |
| "step": 235 |
| }, |
| { |
| "epoch": 3.6366185033426786, |
| "grad_norm": 66.53460693359375, |
| "learning_rate": 0.0002, |
| "loss": 56.4709, |
| "mean_token_accuracy": 0.8644223743014865, |
| "num_tokens": 48541210.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.6366185033426786, |
| "eval_accuracy": 0.915371835231781, |
| "eval_loss": 0.5767515897750854, |
| "eval_mean_token_accuracy": 0.8238709892256785, |
| "eval_num_tokens": 48541210.0, |
| "eval_runtime": 377.9523, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.824101984500885, |
| "eval_steps_per_second": 0.156, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.652145783912012, |
| "grad_norm": 66.15536499023438, |
| "learning_rate": 0.0002, |
| "loss": 55.2557, |
| "mean_token_accuracy": 0.8666903782222006, |
| "num_tokens": 48747559.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.652145783912012, |
| "eval_accuracy": 0.9158809781074524, |
| "eval_loss": 0.573492705821991, |
| "eval_mean_token_accuracy": 0.8253442887532509, |
| "eval_num_tokens": 48747559.0, |
| "eval_runtime": 375.415, |
| "eval_samples_per_second": 0.312, |
| "eval_self_calculate_token_accuracy": 0.8254749178886414, |
| "eval_steps_per_second": 0.157, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.667673064481346, |
| "grad_norm": 65.72477722167969, |
| "learning_rate": 0.0002, |
| "loss": 55.4472, |
| "mean_token_accuracy": 0.864820105334123, |
| "num_tokens": 48954324.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.667673064481346, |
| "eval_accuracy": 0.9132111072540283, |
| "eval_loss": 0.5748416781425476, |
| "eval_mean_token_accuracy": 0.8250882019430904, |
| "eval_num_tokens": 48954324.0, |
| "eval_runtime": 377.473, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8251835107803345, |
| "eval_steps_per_second": 0.156, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.6832003450506794, |
| "grad_norm": 60.9412841796875, |
| "learning_rate": 0.0002, |
| "loss": 57.8154, |
| "mean_token_accuracy": 0.8602221765451961, |
| "num_tokens": 49162398.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.6832003450506794, |
| "eval_accuracy": 0.9132851362228394, |
| "eval_loss": 0.5770214796066284, |
| "eval_mean_token_accuracy": 0.8239435876830149, |
| "eval_num_tokens": 49162398.0, |
| "eval_runtime": 377.6526, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8239054679870605, |
| "eval_steps_per_second": 0.156, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.698727625620013, |
| "grad_norm": 65.60748291015625, |
| "learning_rate": 0.0002, |
| "loss": 55.8472, |
| "mean_token_accuracy": 0.8651416757040553, |
| "num_tokens": 49368850.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.698727625620013, |
| "eval_accuracy": 0.9126320481300354, |
| "eval_loss": 0.5814118981361389, |
| "eval_mean_token_accuracy": 0.8239793817875749, |
| "eval_num_tokens": 49368850.0, |
| "eval_runtime": 377.4118, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8236658573150635, |
| "eval_steps_per_second": 0.156, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.7142549061893466, |
| "grad_norm": 63.9013557434082, |
| "learning_rate": 0.0002, |
| "loss": 55.6165, |
| "mean_token_accuracy": 0.8644746591647466, |
| "num_tokens": 49576460.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.7142549061893466, |
| "eval_accuracy": 0.9126479625701904, |
| "eval_loss": 0.5817598700523376, |
| "eval_mean_token_accuracy": 0.8244078472509222, |
| "eval_num_tokens": 49576460.0, |
| "eval_runtime": 377.4055, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8241778016090393, |
| "eval_steps_per_second": 0.156, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.7297821867586802, |
| "grad_norm": 60.710838317871094, |
| "learning_rate": 0.0002, |
| "loss": 54.9657, |
| "mean_token_accuracy": 0.8671724564499326, |
| "num_tokens": 49783238.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.7297821867586802, |
| "eval_accuracy": 0.9142987728118896, |
| "eval_loss": 0.5792015790939331, |
| "eval_mean_token_accuracy": 0.8250044230687417, |
| "eval_num_tokens": 49783238.0, |
| "eval_runtime": 377.941, |
| "eval_samples_per_second": 0.31, |
| "eval_self_calculate_token_accuracy": 0.8248340487480164, |
| "eval_steps_per_second": 0.156, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.745309467328014, |
| "grad_norm": 57.816349029541016, |
| "learning_rate": 0.0002, |
| "loss": 56.0249, |
| "mean_token_accuracy": 0.8650954539577166, |
| "num_tokens": 49990489.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.745309467328014, |
| "eval_accuracy": 0.9163742065429688, |
| "eval_loss": 0.5786498188972473, |
| "eval_mean_token_accuracy": 0.8249868255550579, |
| "eval_num_tokens": 49990489.0, |
| "eval_runtime": 379.8638, |
| "eval_samples_per_second": 0.308, |
| "eval_self_calculate_token_accuracy": 0.8251599073410034, |
| "eval_steps_per_second": 0.155, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.7608367478973475, |
| "grad_norm": 64.30841827392578, |
| "learning_rate": 0.0002, |
| "loss": 55.9353, |
| "mean_token_accuracy": 0.8643580915199386, |
| "num_tokens": 50196929.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.7608367478973475, |
| "eval_accuracy": 0.9167692065238953, |
| "eval_loss": 0.5777366757392883, |
| "eval_mean_token_accuracy": 0.8263002735073284, |
| "eval_num_tokens": 50196929.0, |
| "eval_runtime": 376.7553, |
| "eval_samples_per_second": 0.311, |
| "eval_self_calculate_token_accuracy": 0.8265531063079834, |
| "eval_steps_per_second": 0.157, |
| "step": 244 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 260, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3052783760255222e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|