| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9994879672299027, |
| "eval_steps": 500, |
| "global_step": 1464, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.000682710360129715, |
| "grad_norm": 11.289746284484863, |
| "learning_rate": 3.4013605442176873e-08, |
| "loss": 1.1263, |
| "mean_token_accuracy": 0.723930612206459, |
| "num_tokens": 65264.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00136542072025943, |
| "grad_norm": 11.28925609588623, |
| "learning_rate": 6.802721088435375e-08, |
| "loss": 1.1255, |
| "mean_token_accuracy": 0.7249358594417572, |
| "num_tokens": 130800.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.002048131080389145, |
| "grad_norm": 11.120320320129395, |
| "learning_rate": 1.0204081632653061e-07, |
| "loss": 1.1779, |
| "mean_token_accuracy": 0.7114491611719131, |
| "num_tokens": 196336.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.00273084144051886, |
| "grad_norm": 11.073629379272461, |
| "learning_rate": 1.360544217687075e-07, |
| "loss": 1.1357, |
| "mean_token_accuracy": 0.722400426864624, |
| "num_tokens": 261872.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0034135518006485747, |
| "grad_norm": 11.142949104309082, |
| "learning_rate": 1.700680272108844e-07, |
| "loss": 1.1198, |
| "mean_token_accuracy": 0.7241874635219574, |
| "num_tokens": 327408.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00409626216077829, |
| "grad_norm": 10.996609687805176, |
| "learning_rate": 2.0408163265306121e-07, |
| "loss": 1.1234, |
| "mean_token_accuracy": 0.7269978076219559, |
| "num_tokens": 392944.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.0047789725209080045, |
| "grad_norm": 10.851889610290527, |
| "learning_rate": 2.3809523809523811e-07, |
| "loss": 1.135, |
| "mean_token_accuracy": 0.7213312536478043, |
| "num_tokens": 458480.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.00546168288103772, |
| "grad_norm": 10.883561134338379, |
| "learning_rate": 2.72108843537415e-07, |
| "loss": 1.0987, |
| "mean_token_accuracy": 0.7315308004617691, |
| "num_tokens": 523678.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.006144393241167435, |
| "grad_norm": 10.859150886535645, |
| "learning_rate": 3.0612244897959183e-07, |
| "loss": 1.1308, |
| "mean_token_accuracy": 0.7226448059082031, |
| "num_tokens": 589214.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.006827103601297149, |
| "grad_norm": 10.717151641845703, |
| "learning_rate": 3.401360544217688e-07, |
| "loss": 1.1209, |
| "mean_token_accuracy": 0.7246762067079544, |
| "num_tokens": 654750.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.007509813961426865, |
| "grad_norm": 10.239886283874512, |
| "learning_rate": 3.7414965986394563e-07, |
| "loss": 1.0971, |
| "mean_token_accuracy": 0.7295017838478088, |
| "num_tokens": 720282.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.00819252432155658, |
| "grad_norm": 10.038299560546875, |
| "learning_rate": 4.0816326530612243e-07, |
| "loss": 1.1444, |
| "mean_token_accuracy": 0.7180626839399338, |
| "num_tokens": 785818.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.008875234681686295, |
| "grad_norm": 8.35667896270752, |
| "learning_rate": 4.421768707482994e-07, |
| "loss": 1.0937, |
| "mean_token_accuracy": 0.732512354850769, |
| "num_tokens": 850288.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.009557945041816009, |
| "grad_norm": 8.884153366088867, |
| "learning_rate": 4.7619047619047623e-07, |
| "loss": 1.1475, |
| "mean_token_accuracy": 0.7174967974424362, |
| "num_tokens": 914746.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.010240655401945725, |
| "grad_norm": 8.479328155517578, |
| "learning_rate": 5.102040816326531e-07, |
| "loss": 1.1403, |
| "mean_token_accuracy": 0.7170087844133377, |
| "num_tokens": 980282.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.01092336576207544, |
| "grad_norm": 8.230568885803223, |
| "learning_rate": 5.4421768707483e-07, |
| "loss": 1.1216, |
| "mean_token_accuracy": 0.7216520011425018, |
| "num_tokens": 1045818.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.011606076122205154, |
| "grad_norm": 7.903264999389648, |
| "learning_rate": 5.782312925170068e-07, |
| "loss": 1.1004, |
| "mean_token_accuracy": 0.7233932018280029, |
| "num_tokens": 1111354.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.01228878648233487, |
| "grad_norm": 8.133345603942871, |
| "learning_rate": 6.122448979591837e-07, |
| "loss": 1.1465, |
| "mean_token_accuracy": 0.7130529135465622, |
| "num_tokens": 1176890.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.012971496842464585, |
| "grad_norm": 7.991074562072754, |
| "learning_rate": 6.462585034013606e-07, |
| "loss": 1.0987, |
| "mean_token_accuracy": 0.723148837685585, |
| "num_tokens": 1242426.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.013654207202594299, |
| "grad_norm": 7.945312023162842, |
| "learning_rate": 6.802721088435376e-07, |
| "loss": 1.0762, |
| "mean_token_accuracy": 0.7300067245960236, |
| "num_tokens": 1307962.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.014336917562724014, |
| "grad_norm": 7.9221367835998535, |
| "learning_rate": 7.142857142857143e-07, |
| "loss": 1.1439, |
| "mean_token_accuracy": 0.7103036493062973, |
| "num_tokens": 1373498.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.01501962792285373, |
| "grad_norm": 7.6807026863098145, |
| "learning_rate": 7.482993197278913e-07, |
| "loss": 1.0533, |
| "mean_token_accuracy": 0.732389435172081, |
| "num_tokens": 1439034.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.015702338282983445, |
| "grad_norm": 6.164129734039307, |
| "learning_rate": 7.823129251700681e-07, |
| "loss": 1.0217, |
| "mean_token_accuracy": 0.7375824898481369, |
| "num_tokens": 1504570.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.01638504864311316, |
| "grad_norm": 6.058833599090576, |
| "learning_rate": 8.163265306122449e-07, |
| "loss": 1.0362, |
| "mean_token_accuracy": 0.7366355210542679, |
| "num_tokens": 1570106.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.017067759003242873, |
| "grad_norm": 6.059955596923828, |
| "learning_rate": 8.503401360544218e-07, |
| "loss": 1.0173, |
| "mean_token_accuracy": 0.7385695725679398, |
| "num_tokens": 1635048.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.01775046936337259, |
| "grad_norm": 6.09025764465332, |
| "learning_rate": 8.843537414965988e-07, |
| "loss": 1.0653, |
| "mean_token_accuracy": 0.727337121963501, |
| "num_tokens": 1698944.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.018433179723502304, |
| "grad_norm": 5.8138747215271, |
| "learning_rate": 9.183673469387756e-07, |
| "loss": 1.0863, |
| "mean_token_accuracy": 0.7196179032325745, |
| "num_tokens": 1764382.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.019115890083632018, |
| "grad_norm": 5.593854904174805, |
| "learning_rate": 9.523809523809525e-07, |
| "loss": 1.0164, |
| "mean_token_accuracy": 0.7361662089824677, |
| "num_tokens": 1829763.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.019798600443761735, |
| "grad_norm": 5.434593677520752, |
| "learning_rate": 9.863945578231293e-07, |
| "loss": 1.0043, |
| "mean_token_accuracy": 0.7424242496490479, |
| "num_tokens": 1895299.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.02048131080389145, |
| "grad_norm": 5.440600395202637, |
| "learning_rate": 1.0204081632653063e-06, |
| "loss": 1.0275, |
| "mean_token_accuracy": 0.7329698354005814, |
| "num_tokens": 1960835.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.021164021164021163, |
| "grad_norm": 5.293028354644775, |
| "learning_rate": 1.0544217687074832e-06, |
| "loss": 0.9885, |
| "mean_token_accuracy": 0.7405608594417572, |
| "num_tokens": 2026371.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.02184673152415088, |
| "grad_norm": 5.160416603088379, |
| "learning_rate": 1.08843537414966e-06, |
| "loss": 0.9793, |
| "mean_token_accuracy": 0.7408407628536224, |
| "num_tokens": 2091861.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.022529441884280594, |
| "grad_norm": 5.16143798828125, |
| "learning_rate": 1.122448979591837e-06, |
| "loss": 0.9774, |
| "mean_token_accuracy": 0.7396446615457535, |
| "num_tokens": 2157349.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.023212152244410308, |
| "grad_norm": 5.192001819610596, |
| "learning_rate": 1.1564625850340136e-06, |
| "loss": 1.0125, |
| "mean_token_accuracy": 0.7285452634096146, |
| "num_tokens": 2222852.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.023894862604540025, |
| "grad_norm": 4.953834056854248, |
| "learning_rate": 1.1904761904761906e-06, |
| "loss": 0.948, |
| "mean_token_accuracy": 0.7469452619552612, |
| "num_tokens": 2288388.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.02457757296466974, |
| "grad_norm": 4.8762407302856445, |
| "learning_rate": 1.2244897959183673e-06, |
| "loss": 0.9768, |
| "mean_token_accuracy": 0.7382392585277557, |
| "num_tokens": 2353924.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.025260283324799453, |
| "grad_norm": 4.632943630218506, |
| "learning_rate": 1.2585034013605443e-06, |
| "loss": 0.9532, |
| "mean_token_accuracy": 0.7446745932102203, |
| "num_tokens": 2419362.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.02594299368492917, |
| "grad_norm": 4.591414928436279, |
| "learning_rate": 1.2925170068027212e-06, |
| "loss": 0.9509, |
| "mean_token_accuracy": 0.7436308562755585, |
| "num_tokens": 2484898.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.026625704045058884, |
| "grad_norm": 4.45168924331665, |
| "learning_rate": 1.3265306122448982e-06, |
| "loss": 0.9328, |
| "mean_token_accuracy": 0.7455859035253525, |
| "num_tokens": 2550434.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.027308414405188598, |
| "grad_norm": 4.439093112945557, |
| "learning_rate": 1.3605442176870751e-06, |
| "loss": 0.926, |
| "mean_token_accuracy": 0.7475508600473404, |
| "num_tokens": 2615867.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.027991124765318315, |
| "grad_norm": 4.357616424560547, |
| "learning_rate": 1.3945578231292517e-06, |
| "loss": 0.9152, |
| "mean_token_accuracy": 0.7535587698221207, |
| "num_tokens": 2681403.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.02867383512544803, |
| "grad_norm": 4.222606658935547, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.9855, |
| "mean_token_accuracy": 0.7323101311922073, |
| "num_tokens": 2746765.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.029356545485577742, |
| "grad_norm": 3.9405136108398438, |
| "learning_rate": 1.4625850340136056e-06, |
| "loss": 0.9199, |
| "mean_token_accuracy": 0.7474187463521957, |
| "num_tokens": 2812301.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.03003925584570746, |
| "grad_norm": 3.81093430519104, |
| "learning_rate": 1.4965986394557825e-06, |
| "loss": 0.8939, |
| "mean_token_accuracy": 0.7549030482769012, |
| "num_tokens": 2877395.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.030721966205837174, |
| "grad_norm": 3.7143189907073975, |
| "learning_rate": 1.5306122448979593e-06, |
| "loss": 0.8639, |
| "mean_token_accuracy": 0.7581400275230408, |
| "num_tokens": 2942730.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.03140467656596689, |
| "grad_norm": 3.47798490524292, |
| "learning_rate": 1.5646258503401362e-06, |
| "loss": 0.8984, |
| "mean_token_accuracy": 0.7516056448221207, |
| "num_tokens": 3007476.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.032087386926096605, |
| "grad_norm": 3.2375733852386475, |
| "learning_rate": 1.5986394557823132e-06, |
| "loss": 0.8831, |
| "mean_token_accuracy": 0.7556970864534378, |
| "num_tokens": 3073012.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.03277009728622632, |
| "grad_norm": 2.973647356033325, |
| "learning_rate": 1.6326530612244897e-06, |
| "loss": 0.8794, |
| "mean_token_accuracy": 0.7556320279836655, |
| "num_tokens": 3138364.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.03345280764635603, |
| "grad_norm": 2.5924108028411865, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.8928, |
| "mean_token_accuracy": 0.7511672079563141, |
| "num_tokens": 3203484.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.034135518006485746, |
| "grad_norm": 1.7866108417510986, |
| "learning_rate": 1.7006802721088436e-06, |
| "loss": 0.8587, |
| "mean_token_accuracy": 0.7600806355476379, |
| "num_tokens": 3269020.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.03481822836661547, |
| "grad_norm": 1.5599899291992188, |
| "learning_rate": 1.7346938775510206e-06, |
| "loss": 0.8653, |
| "mean_token_accuracy": 0.7569495290517807, |
| "num_tokens": 3334556.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.03550093872674518, |
| "grad_norm": 1.4173204898834229, |
| "learning_rate": 1.7687074829931975e-06, |
| "loss": 0.8721, |
| "mean_token_accuracy": 0.7553055435419083, |
| "num_tokens": 3399816.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.036183649086874894, |
| "grad_norm": 1.3384040594100952, |
| "learning_rate": 1.8027210884353743e-06, |
| "loss": 0.8411, |
| "mean_token_accuracy": 0.761073425412178, |
| "num_tokens": 3465352.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.03686635944700461, |
| "grad_norm": 1.294304370880127, |
| "learning_rate": 1.8367346938775512e-06, |
| "loss": 0.7959, |
| "mean_token_accuracy": 0.7744806259870529, |
| "num_tokens": 3530741.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.03754906980713432, |
| "grad_norm": 1.186849594116211, |
| "learning_rate": 1.8707482993197282e-06, |
| "loss": 0.7824, |
| "mean_token_accuracy": 0.7765350043773651, |
| "num_tokens": 3596088.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.038231780167264036, |
| "grad_norm": 1.154654860496521, |
| "learning_rate": 1.904761904761905e-06, |
| "loss": 0.8091, |
| "mean_token_accuracy": 0.7686949968338013, |
| "num_tokens": 3661624.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.03891449052739376, |
| "grad_norm": 1.1420106887817383, |
| "learning_rate": 1.938775510204082e-06, |
| "loss": 0.8476, |
| "mean_token_accuracy": 0.7588370442390442, |
| "num_tokens": 3726929.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.03959720088752347, |
| "grad_norm": 1.085784673690796, |
| "learning_rate": 1.9727891156462586e-06, |
| "loss": 0.7694, |
| "mean_token_accuracy": 0.7776759564876556, |
| "num_tokens": 3792465.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.040279911247653184, |
| "grad_norm": 1.073722243309021, |
| "learning_rate": 2.0068027210884353e-06, |
| "loss": 0.8398, |
| "mean_token_accuracy": 0.7589962035417557, |
| "num_tokens": 3858001.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0409626216077829, |
| "grad_norm": 1.0647130012512207, |
| "learning_rate": 2.0408163265306125e-06, |
| "loss": 0.8283, |
| "mean_token_accuracy": 0.7600959092378616, |
| "num_tokens": 3923537.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.04164533196791261, |
| "grad_norm": 0.9948044419288635, |
| "learning_rate": 2.0748299319727892e-06, |
| "loss": 0.8076, |
| "mean_token_accuracy": 0.7665566951036453, |
| "num_tokens": 3989073.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.042328042328042326, |
| "grad_norm": 1.0430023670196533, |
| "learning_rate": 2.1088435374149664e-06, |
| "loss": 0.7738, |
| "mean_token_accuracy": 0.7728800028562546, |
| "num_tokens": 4054609.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.043010752688172046, |
| "grad_norm": 0.9742470383644104, |
| "learning_rate": 2.1428571428571427e-06, |
| "loss": 0.7987, |
| "mean_token_accuracy": 0.7675991207361221, |
| "num_tokens": 4119840.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.04369346304830176, |
| "grad_norm": 0.9585213661193848, |
| "learning_rate": 2.17687074829932e-06, |
| "loss": 0.7975, |
| "mean_token_accuracy": 0.7663673758506775, |
| "num_tokens": 4185284.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.044376173408431474, |
| "grad_norm": 0.9440982341766357, |
| "learning_rate": 2.2108843537414966e-06, |
| "loss": 0.7675, |
| "mean_token_accuracy": 0.776309072971344, |
| "num_tokens": 4250735.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.04505888376856119, |
| "grad_norm": 1.0054755210876465, |
| "learning_rate": 2.244897959183674e-06, |
| "loss": 0.8071, |
| "mean_token_accuracy": 0.7647958248853683, |
| "num_tokens": 4316205.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0457415941286909, |
| "grad_norm": 0.9801008105278015, |
| "learning_rate": 2.2789115646258505e-06, |
| "loss": 0.7575, |
| "mean_token_accuracy": 0.7767521739006042, |
| "num_tokens": 4381221.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.046424304488820615, |
| "grad_norm": 0.9017577767372131, |
| "learning_rate": 2.3129251700680273e-06, |
| "loss": 0.7807, |
| "mean_token_accuracy": 0.7724828869104385, |
| "num_tokens": 4446757.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.047107014848950336, |
| "grad_norm": 0.8664342761039734, |
| "learning_rate": 2.3469387755102044e-06, |
| "loss": 0.7467, |
| "mean_token_accuracy": 0.780700147151947, |
| "num_tokens": 4512293.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.04778972520908005, |
| "grad_norm": 0.9046992659568787, |
| "learning_rate": 2.380952380952381e-06, |
| "loss": 0.7342, |
| "mean_token_accuracy": 0.7841294556856155, |
| "num_tokens": 4577297.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.048472435569209764, |
| "grad_norm": 0.8165750503540039, |
| "learning_rate": 2.414965986394558e-06, |
| "loss": 0.7255, |
| "mean_token_accuracy": 0.7828498333692551, |
| "num_tokens": 4642721.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.04915514592933948, |
| "grad_norm": 0.841571033000946, |
| "learning_rate": 2.4489795918367347e-06, |
| "loss": 0.7587, |
| "mean_token_accuracy": 0.7761638462543488, |
| "num_tokens": 4708257.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.04983785628946919, |
| "grad_norm": 0.8995031118392944, |
| "learning_rate": 2.482993197278912e-06, |
| "loss": 0.7815, |
| "mean_token_accuracy": 0.7689618766307831, |
| "num_tokens": 4773557.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.050520566649598905, |
| "grad_norm": 0.8214359879493713, |
| "learning_rate": 2.5170068027210886e-06, |
| "loss": 0.7447, |
| "mean_token_accuracy": 0.7801923453807831, |
| "num_tokens": 4838977.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.051203277009728626, |
| "grad_norm": 0.7673158049583435, |
| "learning_rate": 2.5510204081632657e-06, |
| "loss": 0.726, |
| "mean_token_accuracy": 0.7852364182472229, |
| "num_tokens": 4904513.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.05188598736985834, |
| "grad_norm": 0.7837581038475037, |
| "learning_rate": 2.5850340136054425e-06, |
| "loss": 0.7536, |
| "mean_token_accuracy": 0.7792949676513672, |
| "num_tokens": 4970049.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.052568697729988054, |
| "grad_norm": 0.7806072235107422, |
| "learning_rate": 2.6190476190476192e-06, |
| "loss": 0.7094, |
| "mean_token_accuracy": 0.7869623750448227, |
| "num_tokens": 5035585.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.05325140809011777, |
| "grad_norm": 0.8338956832885742, |
| "learning_rate": 2.6530612244897964e-06, |
| "loss": 0.7431, |
| "mean_token_accuracy": 0.7790174335241318, |
| "num_tokens": 5100407.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.05393411845024748, |
| "grad_norm": 0.857094407081604, |
| "learning_rate": 2.687074829931973e-06, |
| "loss": 0.7612, |
| "mean_token_accuracy": 0.7757337838411331, |
| "num_tokens": 5165420.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.054616828810377195, |
| "grad_norm": 0.8075940012931824, |
| "learning_rate": 2.7210884353741503e-06, |
| "loss": 0.71, |
| "mean_token_accuracy": 0.7867943644523621, |
| "num_tokens": 5230956.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.055299539170506916, |
| "grad_norm": 0.8337777853012085, |
| "learning_rate": 2.7551020408163266e-06, |
| "loss": 0.7418, |
| "mean_token_accuracy": 0.7804368883371353, |
| "num_tokens": 5296230.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.05598224953063663, |
| "grad_norm": 0.8036956787109375, |
| "learning_rate": 2.7891156462585034e-06, |
| "loss": 0.7116, |
| "mean_token_accuracy": 0.7864073514938354, |
| "num_tokens": 5361298.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.056664959890766343, |
| "grad_norm": 0.8216161131858826, |
| "learning_rate": 2.8231292517006805e-06, |
| "loss": 0.6875, |
| "mean_token_accuracy": 0.7953281402587891, |
| "num_tokens": 5426054.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.05734767025089606, |
| "grad_norm": 0.7968023419380188, |
| "learning_rate": 2.8571428571428573e-06, |
| "loss": 0.7463, |
| "mean_token_accuracy": 0.7788520306348801, |
| "num_tokens": 5491590.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.05803038061102577, |
| "grad_norm": 0.8076518177986145, |
| "learning_rate": 2.891156462585034e-06, |
| "loss": 0.7363, |
| "mean_token_accuracy": 0.7817998677492142, |
| "num_tokens": 5557126.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.058713090971155485, |
| "grad_norm": 0.8246069550514221, |
| "learning_rate": 2.925170068027211e-06, |
| "loss": 0.7219, |
| "mean_token_accuracy": 0.7867027223110199, |
| "num_tokens": 5622662.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.059395801331285206, |
| "grad_norm": 0.7641127705574036, |
| "learning_rate": 2.959183673469388e-06, |
| "loss": 0.7189, |
| "mean_token_accuracy": 0.7851627767086029, |
| "num_tokens": 5688140.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.06007851169141492, |
| "grad_norm": 0.8021376729011536, |
| "learning_rate": 2.993197278911565e-06, |
| "loss": 0.7412, |
| "mean_token_accuracy": 0.7815249264240265, |
| "num_tokens": 5753676.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.06076122205154463, |
| "grad_norm": 0.7843667268753052, |
| "learning_rate": 3.027210884353742e-06, |
| "loss": 0.7453, |
| "mean_token_accuracy": 0.7783620357513428, |
| "num_tokens": 5818498.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.06144393241167435, |
| "grad_norm": 0.8114190101623535, |
| "learning_rate": 3.0612244897959185e-06, |
| "loss": 0.6977, |
| "mean_token_accuracy": 0.7898796498775482, |
| "num_tokens": 5884034.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.06212664277180406, |
| "grad_norm": 0.8369579315185547, |
| "learning_rate": 3.0952380952380957e-06, |
| "loss": 0.7216, |
| "mean_token_accuracy": 0.7870012819766998, |
| "num_tokens": 5949370.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.06280935313193378, |
| "grad_norm": 0.7947114706039429, |
| "learning_rate": 3.1292517006802725e-06, |
| "loss": 0.7464, |
| "mean_token_accuracy": 0.7776340842247009, |
| "num_tokens": 6014764.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.06349206349206349, |
| "grad_norm": 0.8602317571640015, |
| "learning_rate": 3.1632653061224496e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.7892228811979294, |
| "num_tokens": 6080300.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.06417477385219321, |
| "grad_norm": 0.7978633046150208, |
| "learning_rate": 3.1972789115646264e-06, |
| "loss": 0.7218, |
| "mean_token_accuracy": 0.7852825820446014, |
| "num_tokens": 6145359.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.06485748421232292, |
| "grad_norm": 0.8335587978363037, |
| "learning_rate": 3.231292517006803e-06, |
| "loss": 0.7087, |
| "mean_token_accuracy": 0.7889785021543503, |
| "num_tokens": 6210895.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.06554019457245264, |
| "grad_norm": 0.7953450083732605, |
| "learning_rate": 3.2653061224489794e-06, |
| "loss": 0.7208, |
| "mean_token_accuracy": 0.7854044586420059, |
| "num_tokens": 6276431.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.06622290493258236, |
| "grad_norm": 0.7711222767829895, |
| "learning_rate": 3.2993197278911566e-06, |
| "loss": 0.7419, |
| "mean_token_accuracy": 0.7802113890647888, |
| "num_tokens": 6341967.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.06690561529271206, |
| "grad_norm": 0.8267924189567566, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.7365, |
| "mean_token_accuracy": 0.7791422307491302, |
| "num_tokens": 6407503.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.06758832565284179, |
| "grad_norm": 0.7593364715576172, |
| "learning_rate": 3.3673469387755105e-06, |
| "loss": 0.6944, |
| "mean_token_accuracy": 0.7914222925901413, |
| "num_tokens": 6473039.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.06827103601297149, |
| "grad_norm": 0.7679820656776428, |
| "learning_rate": 3.4013605442176872e-06, |
| "loss": 0.701, |
| "mean_token_accuracy": 0.7901545912027359, |
| "num_tokens": 6538575.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.06895374637310121, |
| "grad_norm": 0.788735568523407, |
| "learning_rate": 3.435374149659864e-06, |
| "loss": 0.7016, |
| "mean_token_accuracy": 0.7907655388116837, |
| "num_tokens": 6604111.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.06963645673323093, |
| "grad_norm": 0.8061610460281372, |
| "learning_rate": 3.469387755102041e-06, |
| "loss": 0.7805, |
| "mean_token_accuracy": 0.7686565071344376, |
| "num_tokens": 6669582.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.07031916709336064, |
| "grad_norm": 0.7788532972335815, |
| "learning_rate": 3.503401360544218e-06, |
| "loss": 0.706, |
| "mean_token_accuracy": 0.7864888906478882, |
| "num_tokens": 6735118.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.07100187745349036, |
| "grad_norm": 0.7763977646827698, |
| "learning_rate": 3.537414965986395e-06, |
| "loss": 0.7035, |
| "mean_token_accuracy": 0.7874220460653305, |
| "num_tokens": 6800487.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.07168458781362007, |
| "grad_norm": 0.8377651572227478, |
| "learning_rate": 3.5714285714285718e-06, |
| "loss": 0.7497, |
| "mean_token_accuracy": 0.7772635817527771, |
| "num_tokens": 6866023.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.07236729817374979, |
| "grad_norm": 0.8014208674430847, |
| "learning_rate": 3.6054421768707485e-06, |
| "loss": 0.726, |
| "mean_token_accuracy": 0.7824260890483856, |
| "num_tokens": 6931559.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.0730500085338795, |
| "grad_norm": 0.7863408923149109, |
| "learning_rate": 3.6394557823129257e-06, |
| "loss": 0.6997, |
| "mean_token_accuracy": 0.7897269278764725, |
| "num_tokens": 6997095.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.07373271889400922, |
| "grad_norm": 0.7708235383033752, |
| "learning_rate": 3.6734693877551024e-06, |
| "loss": 0.7234, |
| "mean_token_accuracy": 0.7831897437572479, |
| "num_tokens": 7062631.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.07441542925413894, |
| "grad_norm": 0.8085599541664124, |
| "learning_rate": 3.7074829931972796e-06, |
| "loss": 0.7218, |
| "mean_token_accuracy": 0.7822886258363724, |
| "num_tokens": 7128167.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.07509813961426864, |
| "grad_norm": 0.7934694886207581, |
| "learning_rate": 3.7414965986394563e-06, |
| "loss": 0.7195, |
| "mean_token_accuracy": 0.7839381843805313, |
| "num_tokens": 7193703.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.07578084997439836, |
| "grad_norm": 0.7413143515586853, |
| "learning_rate": 3.7755102040816327e-06, |
| "loss": 0.6853, |
| "mean_token_accuracy": 0.7936522662639618, |
| "num_tokens": 7259239.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.07646356033452807, |
| "grad_norm": 0.7616100311279297, |
| "learning_rate": 3.80952380952381e-06, |
| "loss": 0.686, |
| "mean_token_accuracy": 0.7947530299425125, |
| "num_tokens": 7323989.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.07714627069465779, |
| "grad_norm": 0.8442330956459045, |
| "learning_rate": 3.843537414965986e-06, |
| "loss": 0.7003, |
| "mean_token_accuracy": 0.7893252372741699, |
| "num_tokens": 7389177.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.07782898105478751, |
| "grad_norm": 0.7328389286994934, |
| "learning_rate": 3.877551020408164e-06, |
| "loss": 0.6991, |
| "mean_token_accuracy": 0.7894214391708374, |
| "num_tokens": 7454713.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.07851169141491722, |
| "grad_norm": 0.7462553977966309, |
| "learning_rate": 3.9115646258503405e-06, |
| "loss": 0.7192, |
| "mean_token_accuracy": 0.7846713215112686, |
| "num_tokens": 7520249.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.07919440177504694, |
| "grad_norm": 0.7286996245384216, |
| "learning_rate": 3.945578231292517e-06, |
| "loss": 0.7022, |
| "mean_token_accuracy": 0.788993775844574, |
| "num_tokens": 7585785.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.07987711213517665, |
| "grad_norm": 0.8258713483810425, |
| "learning_rate": 3.979591836734694e-06, |
| "loss": 0.7303, |
| "mean_token_accuracy": 0.7783785462379456, |
| "num_tokens": 7651321.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.08055982249530637, |
| "grad_norm": 0.7999934554100037, |
| "learning_rate": 4.013605442176871e-06, |
| "loss": 0.6826, |
| "mean_token_accuracy": 0.7930682301521301, |
| "num_tokens": 7716448.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.08124253285543608, |
| "grad_norm": 0.7555844187736511, |
| "learning_rate": 4.047619047619048e-06, |
| "loss": 0.6219, |
| "mean_token_accuracy": 0.8124694526195526, |
| "num_tokens": 7781984.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.0819252432155658, |
| "grad_norm": 0.7775622606277466, |
| "learning_rate": 4.081632653061225e-06, |
| "loss": 0.6954, |
| "mean_token_accuracy": 0.7905822545289993, |
| "num_tokens": 7847520.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.08260795357569552, |
| "grad_norm": 0.8294740915298462, |
| "learning_rate": 4.115646258503402e-06, |
| "loss": 0.7175, |
| "mean_token_accuracy": 0.7847935110330582, |
| "num_tokens": 7913056.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.08329066393582522, |
| "grad_norm": 0.7734805941581726, |
| "learning_rate": 4.1496598639455785e-06, |
| "loss": 0.6987, |
| "mean_token_accuracy": 0.7903053909540176, |
| "num_tokens": 7978127.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.08397337429595494, |
| "grad_norm": 0.7841943502426147, |
| "learning_rate": 4.183673469387755e-06, |
| "loss": 0.7017, |
| "mean_token_accuracy": 0.7904753237962723, |
| "num_tokens": 8043663.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.08465608465608465, |
| "grad_norm": 0.7548030614852905, |
| "learning_rate": 4.217687074829933e-06, |
| "loss": 0.6716, |
| "mean_token_accuracy": 0.7981427162885666, |
| "num_tokens": 8109199.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.08533879501621437, |
| "grad_norm": 0.7701455950737, |
| "learning_rate": 4.251700680272109e-06, |
| "loss": 0.7019, |
| "mean_token_accuracy": 0.7921554446220398, |
| "num_tokens": 8174735.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.08602150537634409, |
| "grad_norm": 0.7813974022865295, |
| "learning_rate": 4.2857142857142855e-06, |
| "loss": 0.716, |
| "mean_token_accuracy": 0.7849920690059662, |
| "num_tokens": 8240271.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.0867042157364738, |
| "grad_norm": 0.7576754689216614, |
| "learning_rate": 4.319727891156463e-06, |
| "loss": 0.6691, |
| "mean_token_accuracy": 0.7991696149110794, |
| "num_tokens": 8305531.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.08738692609660352, |
| "grad_norm": 0.8087888360023499, |
| "learning_rate": 4.35374149659864e-06, |
| "loss": 0.7109, |
| "mean_token_accuracy": 0.7871221750974655, |
| "num_tokens": 8370791.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.08806963645673323, |
| "grad_norm": 0.7525134682655334, |
| "learning_rate": 4.3877551020408165e-06, |
| "loss": 0.6671, |
| "mean_token_accuracy": 0.79823437333107, |
| "num_tokens": 8436327.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.08875234681686295, |
| "grad_norm": 0.7696347832679749, |
| "learning_rate": 4.421768707482993e-06, |
| "loss": 0.7001, |
| "mean_token_accuracy": 0.7891006916761398, |
| "num_tokens": 8501863.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.08943505717699266, |
| "grad_norm": 0.7710400223731995, |
| "learning_rate": 4.45578231292517e-06, |
| "loss": 0.7287, |
| "mean_token_accuracy": 0.7811278104782104, |
| "num_tokens": 8567399.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.09011776753712238, |
| "grad_norm": 0.7712966203689575, |
| "learning_rate": 4.489795918367348e-06, |
| "loss": 0.6576, |
| "mean_token_accuracy": 0.7996701002120972, |
| "num_tokens": 8632935.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.0908004778972521, |
| "grad_norm": 0.7195627689361572, |
| "learning_rate": 4.523809523809524e-06, |
| "loss": 0.6521, |
| "mean_token_accuracy": 0.8024804592132568, |
| "num_tokens": 8698471.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.0914831882573818, |
| "grad_norm": 0.8118554949760437, |
| "learning_rate": 4.557823129251701e-06, |
| "loss": 0.6879, |
| "mean_token_accuracy": 0.7921166270971298, |
| "num_tokens": 8763993.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.09216589861751152, |
| "grad_norm": 0.786746621131897, |
| "learning_rate": 4.591836734693878e-06, |
| "loss": 0.6991, |
| "mean_token_accuracy": 0.789543628692627, |
| "num_tokens": 8829529.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.09284860897764123, |
| "grad_norm": 0.7629210948944092, |
| "learning_rate": 4.6258503401360546e-06, |
| "loss": 0.6929, |
| "mean_token_accuracy": 0.7892262637615204, |
| "num_tokens": 8894503.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.09353131933777095, |
| "grad_norm": 0.7598711252212524, |
| "learning_rate": 4.659863945578232e-06, |
| "loss": 0.6607, |
| "mean_token_accuracy": 0.7976081520318985, |
| "num_tokens": 8960039.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.09421402969790067, |
| "grad_norm": 0.7624816298484802, |
| "learning_rate": 4.693877551020409e-06, |
| "loss": 0.6648, |
| "mean_token_accuracy": 0.7974095940589905, |
| "num_tokens": 9025575.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.09489674005803038, |
| "grad_norm": 0.7908929586410522, |
| "learning_rate": 4.727891156462586e-06, |
| "loss": 0.6839, |
| "mean_token_accuracy": 0.79278165102005, |
| "num_tokens": 9091111.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.0955794504181601, |
| "grad_norm": 0.7983325123786926, |
| "learning_rate": 4.761904761904762e-06, |
| "loss": 0.7252, |
| "mean_token_accuracy": 0.7833272218704224, |
| "num_tokens": 9156647.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.09626216077828981, |
| "grad_norm": 0.787777841091156, |
| "learning_rate": 4.795918367346939e-06, |
| "loss": 0.6854, |
| "mean_token_accuracy": 0.7925372868776321, |
| "num_tokens": 9222183.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.09694487113841953, |
| "grad_norm": 0.7467668056488037, |
| "learning_rate": 4.829931972789116e-06, |
| "loss": 0.6994, |
| "mean_token_accuracy": 0.7906127870082855, |
| "num_tokens": 9287719.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.09762758149854923, |
| "grad_norm": 0.777410089969635, |
| "learning_rate": 4.863945578231293e-06, |
| "loss": 0.6915, |
| "mean_token_accuracy": 0.7887188494205475, |
| "num_tokens": 9353255.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.09831029185867896, |
| "grad_norm": 0.7552134394645691, |
| "learning_rate": 4.897959183673469e-06, |
| "loss": 0.6787, |
| "mean_token_accuracy": 0.7940340936183929, |
| "num_tokens": 9418791.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.09899300221880868, |
| "grad_norm": 0.7737309336662292, |
| "learning_rate": 4.931972789115647e-06, |
| "loss": 0.696, |
| "mean_token_accuracy": 0.7886081039905548, |
| "num_tokens": 9484311.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.09967571257893838, |
| "grad_norm": 0.7639101147651672, |
| "learning_rate": 4.965986394557824e-06, |
| "loss": 0.6617, |
| "mean_token_accuracy": 0.7969208359718323, |
| "num_tokens": 9549847.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.1003584229390681, |
| "grad_norm": 0.7466997504234314, |
| "learning_rate": 5e-06, |
| "loss": 0.7199, |
| "mean_token_accuracy": 0.7817477583885193, |
| "num_tokens": 9615193.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.10104113329919781, |
| "grad_norm": 0.8260450959205627, |
| "learning_rate": 4.999992887242496e-06, |
| "loss": 0.6805, |
| "mean_token_accuracy": 0.7912542819976807, |
| "num_tokens": 9680729.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.10172384365932753, |
| "grad_norm": 0.7522003054618835, |
| "learning_rate": 4.999971549010455e-06, |
| "loss": 0.6651, |
| "mean_token_accuracy": 0.7988911420106888, |
| "num_tokens": 9746265.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.10240655401945725, |
| "grad_norm": 0.7995320558547974, |
| "learning_rate": 4.9999359854252975e-06, |
| "loss": 0.7125, |
| "mean_token_accuracy": 0.7878176867961884, |
| "num_tokens": 9811801.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.10308926437958696, |
| "grad_norm": 0.7423191666603088, |
| "learning_rate": 4.999886196689386e-06, |
| "loss": 0.6646, |
| "mean_token_accuracy": 0.7983260154724121, |
| "num_tokens": 9877337.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.10377197473971668, |
| "grad_norm": 0.7659249901771545, |
| "learning_rate": 4.99982218308603e-06, |
| "loss": 0.6188, |
| "mean_token_accuracy": 0.811354473233223, |
| "num_tokens": 9942873.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.10445468509984639, |
| "grad_norm": 0.7772722840309143, |
| "learning_rate": 4.999743944979481e-06, |
| "loss": 0.6389, |
| "mean_token_accuracy": 0.8042980134487152, |
| "num_tokens": 10008409.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.10513739545997611, |
| "grad_norm": 0.7917481064796448, |
| "learning_rate": 4.999651482814928e-06, |
| "loss": 0.6516, |
| "mean_token_accuracy": 0.8001801371574402, |
| "num_tokens": 10073870.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.10582010582010581, |
| "grad_norm": 0.7320284843444824, |
| "learning_rate": 4.9995447971185e-06, |
| "loss": 0.66, |
| "mean_token_accuracy": 0.7984074205160141, |
| "num_tokens": 10139280.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.10650281618023553, |
| "grad_norm": 0.7924143671989441, |
| "learning_rate": 4.999423888497261e-06, |
| "loss": 0.677, |
| "mean_token_accuracy": 0.7951185405254364, |
| "num_tokens": 10204816.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.10718552654036526, |
| "grad_norm": 0.8169876933097839, |
| "learning_rate": 4.999288757639206e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.7976234257221222, |
| "num_tokens": 10270352.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.10786823690049496, |
| "grad_norm": 0.7444480657577515, |
| "learning_rate": 4.999139405313257e-06, |
| "loss": 0.6446, |
| "mean_token_accuracy": 0.8019917011260986, |
| "num_tokens": 10335888.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.10855094726062468, |
| "grad_norm": 0.8082272410392761, |
| "learning_rate": 4.998975832369261e-06, |
| "loss": 0.722, |
| "mean_token_accuracy": 0.78294537961483, |
| "num_tokens": 10401424.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.10923365762075439, |
| "grad_norm": 0.7980340123176575, |
| "learning_rate": 4.99879803973798e-06, |
| "loss": 0.7303, |
| "mean_token_accuracy": 0.7807306796312332, |
| "num_tokens": 10466960.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.10991636798088411, |
| "grad_norm": 0.7741237878799438, |
| "learning_rate": 4.998606028431091e-06, |
| "loss": 0.6135, |
| "mean_token_accuracy": 0.811751589179039, |
| "num_tokens": 10532496.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.11059907834101383, |
| "grad_norm": 0.8673920631408691, |
| "learning_rate": 4.998399799541179e-06, |
| "loss": 0.6504, |
| "mean_token_accuracy": 0.80196113884449, |
| "num_tokens": 10598032.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.11128178870114354, |
| "grad_norm": 0.7713350653648376, |
| "learning_rate": 4.998179354241728e-06, |
| "loss": 0.6319, |
| "mean_token_accuracy": 0.8055504709482193, |
| "num_tokens": 10663568.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.11196449906127326, |
| "grad_norm": 0.7200196385383606, |
| "learning_rate": 4.997944693787117e-06, |
| "loss": 0.6266, |
| "mean_token_accuracy": 0.8064363449811935, |
| "num_tokens": 10729104.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.11264720942140297, |
| "grad_norm": 0.8603804111480713, |
| "learning_rate": 4.997695819512612e-06, |
| "loss": 0.6469, |
| "mean_token_accuracy": 0.8034121543169022, |
| "num_tokens": 10794640.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.11332991978153269, |
| "grad_norm": 0.7777139544487, |
| "learning_rate": 4.99743273283436e-06, |
| "loss": 0.6875, |
| "mean_token_accuracy": 0.7916819453239441, |
| "num_tokens": 10860176.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.1140126301416624, |
| "grad_norm": 0.7675216794013977, |
| "learning_rate": 4.997155435249379e-06, |
| "loss": 0.6617, |
| "mean_token_accuracy": 0.7968956828117371, |
| "num_tokens": 10925650.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.11469534050179211, |
| "grad_norm": 0.7635762691497803, |
| "learning_rate": 4.996863928335547e-06, |
| "loss": 0.6355, |
| "mean_token_accuracy": 0.8052615821361542, |
| "num_tokens": 10990622.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.11537805086192184, |
| "grad_norm": 0.7841809988021851, |
| "learning_rate": 4.9965582137516e-06, |
| "loss": 0.6615, |
| "mean_token_accuracy": 0.7980477660894394, |
| "num_tokens": 11056068.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.11606076122205154, |
| "grad_norm": 0.8226203322410583, |
| "learning_rate": 4.996238293237116e-06, |
| "loss": 0.675, |
| "mean_token_accuracy": 0.7942021191120148, |
| "num_tokens": 11121604.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.11674347158218126, |
| "grad_norm": 0.7752227783203125, |
| "learning_rate": 4.99590416861251e-06, |
| "loss": 0.6452, |
| "mean_token_accuracy": 0.802770659327507, |
| "num_tokens": 11187140.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.11742618194231097, |
| "grad_norm": 0.7512950897216797, |
| "learning_rate": 4.99555584177902e-06, |
| "loss": 0.6394, |
| "mean_token_accuracy": 0.802770659327507, |
| "num_tokens": 11252676.0, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.11810889230244069, |
| "grad_norm": 0.7564979791641235, |
| "learning_rate": 4.995193314718695e-06, |
| "loss": 0.6577, |
| "mean_token_accuracy": 0.7998839169740677, |
| "num_tokens": 11318212.0, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.11879160266257041, |
| "grad_norm": 0.7963838577270508, |
| "learning_rate": 4.994816589494391e-06, |
| "loss": 0.7155, |
| "mean_token_accuracy": 0.7836958467960358, |
| "num_tokens": 11383385.0, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.11947431302270012, |
| "grad_norm": 0.8138285875320435, |
| "learning_rate": 4.994425668249751e-06, |
| "loss": 0.6714, |
| "mean_token_accuracy": 0.7966306358575821, |
| "num_tokens": 11448921.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.12015702338282984, |
| "grad_norm": 0.8292612433433533, |
| "learning_rate": 4.994020553209199e-06, |
| "loss": 0.641, |
| "mean_token_accuracy": 0.804618775844574, |
| "num_tokens": 11514457.0, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.12083973374295955, |
| "grad_norm": 0.7372015118598938, |
| "learning_rate": 4.993601246677921e-06, |
| "loss": 0.6556, |
| "mean_token_accuracy": 0.7988605946302414, |
| "num_tokens": 11579993.0, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.12152244410308927, |
| "grad_norm": 0.8773394227027893, |
| "learning_rate": 4.993167751041858e-06, |
| "loss": 0.6926, |
| "mean_token_accuracy": 0.790887713432312, |
| "num_tokens": 11645529.0, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.12220515446321897, |
| "grad_norm": 0.7941958904266357, |
| "learning_rate": 4.9927200687676905e-06, |
| "loss": 0.6788, |
| "mean_token_accuracy": 0.7920080274343491, |
| "num_tokens": 11711045.0, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.1228878648233487, |
| "grad_norm": 0.7528053522109985, |
| "learning_rate": 4.992258202402822e-06, |
| "loss": 0.6856, |
| "mean_token_accuracy": 0.791987419128418, |
| "num_tokens": 11776581.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.12357057518347841, |
| "grad_norm": 0.8133561611175537, |
| "learning_rate": 4.991782154575368e-06, |
| "loss": 0.7112, |
| "mean_token_accuracy": 0.7850594073534012, |
| "num_tokens": 11841587.0, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.12425328554360812, |
| "grad_norm": 0.7941370010375977, |
| "learning_rate": 4.991291927994138e-06, |
| "loss": 0.6595, |
| "mean_token_accuracy": 0.7981490045785904, |
| "num_tokens": 11906559.0, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.12493599590373784, |
| "grad_norm": 0.7561349868774414, |
| "learning_rate": 4.990787525448623e-06, |
| "loss": 0.6884, |
| "mean_token_accuracy": 0.7897268980741501, |
| "num_tokens": 11972095.0, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.12561870626386756, |
| "grad_norm": 0.7951211333274841, |
| "learning_rate": 4.990268949808976e-06, |
| "loss": 0.645, |
| "mean_token_accuracy": 0.8034732490777969, |
| "num_tokens": 12037631.0, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.12630141662399727, |
| "grad_norm": 0.7793656587600708, |
| "learning_rate": 4.989736204026e-06, |
| "loss": 0.6483, |
| "mean_token_accuracy": 0.8013349175453186, |
| "num_tokens": 12103167.0, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.12698412698412698, |
| "grad_norm": 0.8032715916633606, |
| "learning_rate": 4.989189291131129e-06, |
| "loss": 0.6866, |
| "mean_token_accuracy": 0.7917736023664474, |
| "num_tokens": 12168703.0, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.1276668373442567, |
| "grad_norm": 0.7852079272270203, |
| "learning_rate": 4.988628214236409e-06, |
| "loss": 0.6442, |
| "mean_token_accuracy": 0.8026293814182281, |
| "num_tokens": 12234100.0, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.12834954770438642, |
| "grad_norm": 0.7962309122085571, |
| "learning_rate": 4.988052976534483e-06, |
| "loss": 0.6989, |
| "mean_token_accuracy": 0.7885660976171494, |
| "num_tokens": 12299636.0, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.12903225806451613, |
| "grad_norm": 0.774936854839325, |
| "learning_rate": 4.987463581298573e-06, |
| "loss": 0.6561, |
| "mean_token_accuracy": 0.7997016906738281, |
| "num_tokens": 12365042.0, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.12971496842464583, |
| "grad_norm": 0.7444131374359131, |
| "learning_rate": 4.986860031882459e-06, |
| "loss": 0.6164, |
| "mean_token_accuracy": 0.8107282519340515, |
| "num_tokens": 12430578.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.13039767878477557, |
| "grad_norm": 0.7779427766799927, |
| "learning_rate": 4.986242331720461e-06, |
| "loss": 0.6538, |
| "mean_token_accuracy": 0.7991146445274353, |
| "num_tokens": 12496036.0, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.13108038914490527, |
| "grad_norm": 0.7790391445159912, |
| "learning_rate": 4.985610484327421e-06, |
| "loss": 0.6263, |
| "mean_token_accuracy": 0.8059475868940353, |
| "num_tokens": 12561572.0, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.13176309950503498, |
| "grad_norm": 0.7643638849258423, |
| "learning_rate": 4.9849644932986795e-06, |
| "loss": 0.6947, |
| "mean_token_accuracy": 0.7892792373895645, |
| "num_tokens": 12627073.0, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.13244580986516472, |
| "grad_norm": 0.8144778609275818, |
| "learning_rate": 4.98430436231006e-06, |
| "loss": 0.6745, |
| "mean_token_accuracy": 0.793398305773735, |
| "num_tokens": 12692601.0, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.13312852022529442, |
| "grad_norm": 0.7241678833961487, |
| "learning_rate": 4.983630095117843e-06, |
| "loss": 0.6126, |
| "mean_token_accuracy": 0.8117210417985916, |
| "num_tokens": 12758137.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.13381123058542413, |
| "grad_norm": 0.730594277381897, |
| "learning_rate": 4.982941695558748e-06, |
| "loss": 0.6867, |
| "mean_token_accuracy": 0.7909946292638779, |
| "num_tokens": 12823673.0, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.13449394094555384, |
| "grad_norm": 0.753174901008606, |
| "learning_rate": 4.98223916754991e-06, |
| "loss": 0.6757, |
| "mean_token_accuracy": 0.7946603149175644, |
| "num_tokens": 12889209.0, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.13517665130568357, |
| "grad_norm": 0.7512592673301697, |
| "learning_rate": 4.981522515088858e-06, |
| "loss": 0.6531, |
| "mean_token_accuracy": 0.801472395658493, |
| "num_tokens": 12954745.0, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.13585936166581328, |
| "grad_norm": 0.7973717451095581, |
| "learning_rate": 4.980791742253493e-06, |
| "loss": 0.7005, |
| "mean_token_accuracy": 0.7849462330341339, |
| "num_tokens": 13020281.0, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.13654207202594298, |
| "grad_norm": 0.7488005757331848, |
| "learning_rate": 4.980046853202062e-06, |
| "loss": 0.6187, |
| "mean_token_accuracy": 0.8101655095815659, |
| "num_tokens": 13085725.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13722478238607272, |
| "grad_norm": 0.7275497913360596, |
| "learning_rate": 4.979287852173138e-06, |
| "loss": 0.6091, |
| "mean_token_accuracy": 0.8102089464664459, |
| "num_tokens": 13151261.0, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.13790749274620243, |
| "grad_norm": 0.7557955384254456, |
| "learning_rate": 4.978514743485593e-06, |
| "loss": 0.6613, |
| "mean_token_accuracy": 0.799455463886261, |
| "num_tokens": 13216711.0, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.13859020310633213, |
| "grad_norm": 0.7728015184402466, |
| "learning_rate": 4.977727531538573e-06, |
| "loss": 0.6621, |
| "mean_token_accuracy": 0.7968718856573105, |
| "num_tokens": 13281491.0, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.13927291346646187, |
| "grad_norm": 0.7593042254447937, |
| "learning_rate": 4.976926220811479e-06, |
| "loss": 0.694, |
| "mean_token_accuracy": 0.7916208654642105, |
| "num_tokens": 13347027.0, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.13995562382659157, |
| "grad_norm": 0.7546338438987732, |
| "learning_rate": 4.976110815863932e-06, |
| "loss": 0.6704, |
| "mean_token_accuracy": 0.796303778886795, |
| "num_tokens": 13412524.0, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.14063833418672128, |
| "grad_norm": 0.7690262794494629, |
| "learning_rate": 4.975281321335755e-06, |
| "loss": 0.6753, |
| "mean_token_accuracy": 0.7962145358324051, |
| "num_tokens": 13477914.0, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.141321044546851, |
| "grad_norm": 0.7461578845977783, |
| "learning_rate": 4.974437741946943e-06, |
| "loss": 0.6543, |
| "mean_token_accuracy": 0.7989522367715836, |
| "num_tokens": 13543450.0, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.14200375490698072, |
| "grad_norm": 0.7672309279441833, |
| "learning_rate": 4.973580082497636e-06, |
| "loss": 0.6676, |
| "mean_token_accuracy": 0.7950268983840942, |
| "num_tokens": 13608986.0, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.14268646526711043, |
| "grad_norm": 0.7266797423362732, |
| "learning_rate": 4.9727083478680925e-06, |
| "loss": 0.6327, |
| "mean_token_accuracy": 0.8044965714216232, |
| "num_tokens": 13674522.0, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.14336917562724014, |
| "grad_norm": 0.7649458050727844, |
| "learning_rate": 4.971822543018663e-06, |
| "loss": 0.6644, |
| "mean_token_accuracy": 0.7988300323486328, |
| "num_tokens": 13740058.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.14405188598736987, |
| "grad_norm": 0.7558519244194031, |
| "learning_rate": 4.970922672989759e-06, |
| "loss": 0.6387, |
| "mean_token_accuracy": 0.8038550913333893, |
| "num_tokens": 13805594.0, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.14473459634749958, |
| "grad_norm": 0.7533179521560669, |
| "learning_rate": 4.970008742901827e-06, |
| "loss": 0.6512, |
| "mean_token_accuracy": 0.7999603003263474, |
| "num_tokens": 13871130.0, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.14541730670762928, |
| "grad_norm": 0.7587620615959167, |
| "learning_rate": 4.969080757955317e-06, |
| "loss": 0.6682, |
| "mean_token_accuracy": 0.7954698204994202, |
| "num_tokens": 13936666.0, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.146100017067759, |
| "grad_norm": 0.7344343066215515, |
| "learning_rate": 4.968138723430654e-06, |
| "loss": 0.6331, |
| "mean_token_accuracy": 0.8062831312417984, |
| "num_tokens": 14001627.0, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.14678272742788873, |
| "grad_norm": 0.8171228170394897, |
| "learning_rate": 4.96718264468821e-06, |
| "loss": 0.6709, |
| "mean_token_accuracy": 0.7964931726455688, |
| "num_tokens": 14067163.0, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.14746543778801843, |
| "grad_norm": 0.7533463835716248, |
| "learning_rate": 4.966212527168268e-06, |
| "loss": 0.6755, |
| "mean_token_accuracy": 0.7955246716737747, |
| "num_tokens": 14132389.0, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.14814814814814814, |
| "grad_norm": 0.7684210538864136, |
| "learning_rate": 4.965228376390998e-06, |
| "loss": 0.6898, |
| "mean_token_accuracy": 0.7908294945955276, |
| "num_tokens": 14197709.0, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.14883085850827787, |
| "grad_norm": 0.7119394540786743, |
| "learning_rate": 4.964230197956421e-06, |
| "loss": 0.6306, |
| "mean_token_accuracy": 0.8053364753723145, |
| "num_tokens": 14263205.0, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.14951356886840758, |
| "grad_norm": 0.7983673810958862, |
| "learning_rate": 4.963217997544376e-06, |
| "loss": 0.6943, |
| "mean_token_accuracy": 0.79004767537117, |
| "num_tokens": 14328741.0, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.1501962792285373, |
| "grad_norm": 0.7425892353057861, |
| "learning_rate": 4.962191780914494e-06, |
| "loss": 0.6252, |
| "mean_token_accuracy": 0.8074288666248322, |
| "num_tokens": 14394170.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.150878989588667, |
| "grad_norm": 0.7484636902809143, |
| "learning_rate": 4.961151553906158e-06, |
| "loss": 0.6192, |
| "mean_token_accuracy": 0.810231164097786, |
| "num_tokens": 14459375.0, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.15156169994879673, |
| "grad_norm": 0.795856237411499, |
| "learning_rate": 4.960097322438474e-06, |
| "loss": 0.6494, |
| "mean_token_accuracy": 0.8015000522136688, |
| "num_tokens": 14524534.0, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.15224441030892644, |
| "grad_norm": 0.7697437405586243, |
| "learning_rate": 4.959029092510236e-06, |
| "loss": 0.615, |
| "mean_token_accuracy": 0.8114613890647888, |
| "num_tokens": 14590070.0, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.15292712066905614, |
| "grad_norm": 0.7738355994224548, |
| "learning_rate": 4.9579468701998926e-06, |
| "loss": 0.6611, |
| "mean_token_accuracy": 0.7977761626243591, |
| "num_tokens": 14655606.0, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.15360983102918588, |
| "grad_norm": 0.759848952293396, |
| "learning_rate": 4.9568506616655125e-06, |
| "loss": 0.6472, |
| "mean_token_accuracy": 0.8012253940105438, |
| "num_tokens": 14721099.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.15429254138931559, |
| "grad_norm": 0.7471370697021484, |
| "learning_rate": 4.955740473144746e-06, |
| "loss": 0.6849, |
| "mean_token_accuracy": 0.7934689670801163, |
| "num_tokens": 14786635.0, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.1549752517494453, |
| "grad_norm": 0.7679874897003174, |
| "learning_rate": 4.954616310954796e-06, |
| "loss": 0.6626, |
| "mean_token_accuracy": 0.7951722294092178, |
| "num_tokens": 14851243.0, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.15565796210957503, |
| "grad_norm": 0.7903568744659424, |
| "learning_rate": 4.953478181492377e-06, |
| "loss": 0.6891, |
| "mean_token_accuracy": 0.7914992719888687, |
| "num_tokens": 14916708.0, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.15634067246970473, |
| "grad_norm": 0.8032446503639221, |
| "learning_rate": 4.95232609123368e-06, |
| "loss": 0.6801, |
| "mean_token_accuracy": 0.7916972190141678, |
| "num_tokens": 14982244.0, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.15702338282983444, |
| "grad_norm": 0.7658280730247498, |
| "learning_rate": 4.9511600467343355e-06, |
| "loss": 0.6485, |
| "mean_token_accuracy": 0.8023097962141037, |
| "num_tokens": 15047779.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.15770609318996415, |
| "grad_norm": 0.7308084964752197, |
| "learning_rate": 4.9499800546293776e-06, |
| "loss": 0.6263, |
| "mean_token_accuracy": 0.8065585345029831, |
| "num_tokens": 15113315.0, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.15838880355009388, |
| "grad_norm": 0.7760301828384399, |
| "learning_rate": 4.948786121633204e-06, |
| "loss": 0.6715, |
| "mean_token_accuracy": 0.7964503765106201, |
| "num_tokens": 15178784.0, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.1590715139102236, |
| "grad_norm": 0.7470239400863647, |
| "learning_rate": 4.947578254539539e-06, |
| "loss": 0.6479, |
| "mean_token_accuracy": 0.8013196587562561, |
| "num_tokens": 15244320.0, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.1597542242703533, |
| "grad_norm": 0.8011210560798645, |
| "learning_rate": 4.946356460221396e-06, |
| "loss": 0.6536, |
| "mean_token_accuracy": 0.7990744262933731, |
| "num_tokens": 15309856.0, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.16043693463048303, |
| "grad_norm": 0.8322118520736694, |
| "learning_rate": 4.945120745631036e-06, |
| "loss": 0.6862, |
| "mean_token_accuracy": 0.7918499410152435, |
| "num_tokens": 15375392.0, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.16111964499061274, |
| "grad_norm": 0.7466151118278503, |
| "learning_rate": 4.943871117799929e-06, |
| "loss": 0.654, |
| "mean_token_accuracy": 0.7988605946302414, |
| "num_tokens": 15440928.0, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.16180235535074244, |
| "grad_norm": 0.7664824724197388, |
| "learning_rate": 4.942607583838715e-06, |
| "loss": 0.6263, |
| "mean_token_accuracy": 0.807658240199089, |
| "num_tokens": 15506464.0, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.16248506571087215, |
| "grad_norm": 0.7403832674026489, |
| "learning_rate": 4.941330150937164e-06, |
| "loss": 0.6117, |
| "mean_token_accuracy": 0.8102048337459564, |
| "num_tokens": 15571701.0, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.16316777607100189, |
| "grad_norm": 0.8409820199012756, |
| "learning_rate": 4.94003882636413e-06, |
| "loss": 0.6636, |
| "mean_token_accuracy": 0.7948121428489685, |
| "num_tokens": 15636714.0, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.1638504864311316, |
| "grad_norm": 0.7570234537124634, |
| "learning_rate": 4.938733617467517e-06, |
| "loss": 0.6386, |
| "mean_token_accuracy": 0.8039925545454025, |
| "num_tokens": 15702250.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.1645331967912613, |
| "grad_norm": 0.7913652062416077, |
| "learning_rate": 4.937414531674234e-06, |
| "loss": 0.6241, |
| "mean_token_accuracy": 0.8082087188959122, |
| "num_tokens": 15767776.0, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.16521590715139103, |
| "grad_norm": 0.7465484738349915, |
| "learning_rate": 4.9360815764901485e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.8118190169334412, |
| "num_tokens": 15832920.0, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.16589861751152074, |
| "grad_norm": 0.7312461733818054, |
| "learning_rate": 4.934734759500052e-06, |
| "loss": 0.6311, |
| "mean_token_accuracy": 0.8050464242696762, |
| "num_tokens": 15898456.0, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.16658132787165045, |
| "grad_norm": 0.7343611717224121, |
| "learning_rate": 4.933374088367608e-06, |
| "loss": 0.6303, |
| "mean_token_accuracy": 0.8073833137750626, |
| "num_tokens": 15963992.0, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.16726403823178015, |
| "grad_norm": 0.7481185793876648, |
| "learning_rate": 4.931999570835319e-06, |
| "loss": 0.6477, |
| "mean_token_accuracy": 0.8016556799411774, |
| "num_tokens": 16029528.0, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.1679467485919099, |
| "grad_norm": 0.7350367307662964, |
| "learning_rate": 4.93061121472447e-06, |
| "loss": 0.6316, |
| "mean_token_accuracy": 0.8050617128610611, |
| "num_tokens": 16095064.0, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.1686294589520396, |
| "grad_norm": 0.7721445560455322, |
| "learning_rate": 4.929209027935095e-06, |
| "loss": 0.649, |
| "mean_token_accuracy": 0.7996423244476318, |
| "num_tokens": 16160499.0, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.1693121693121693, |
| "grad_norm": 0.7480275630950928, |
| "learning_rate": 4.927793018445924e-06, |
| "loss": 0.6655, |
| "mean_token_accuracy": 0.7958669364452362, |
| "num_tokens": 16226035.0, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.16999487967229904, |
| "grad_norm": 0.7658473253250122, |
| "learning_rate": 4.926363194314345e-06, |
| "loss": 0.6314, |
| "mean_token_accuracy": 0.8055810183286667, |
| "num_tokens": 16291571.0, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.17067759003242874, |
| "grad_norm": 0.7760869264602661, |
| "learning_rate": 4.92491956367635e-06, |
| "loss": 0.6814, |
| "mean_token_accuracy": 0.7943701148033142, |
| "num_tokens": 16357107.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.17136030039255845, |
| "grad_norm": 0.7353349328041077, |
| "learning_rate": 4.9234621347464965e-06, |
| "loss": 0.6221, |
| "mean_token_accuracy": 0.8080248087644577, |
| "num_tokens": 16422643.0, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.17204301075268819, |
| "grad_norm": 0.775214433670044, |
| "learning_rate": 4.9219909158178525e-06, |
| "loss": 0.6353, |
| "mean_token_accuracy": 0.8042980283498764, |
| "num_tokens": 16488179.0, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.1727257211128179, |
| "grad_norm": 0.7820099592208862, |
| "learning_rate": 4.9205059152619595e-06, |
| "loss": 0.6736, |
| "mean_token_accuracy": 0.7939413487911224, |
| "num_tokens": 16552913.0, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.1734084314729476, |
| "grad_norm": 0.7453340291976929, |
| "learning_rate": 4.919007141528776e-06, |
| "loss": 0.6617, |
| "mean_token_accuracy": 0.7975623160600662, |
| "num_tokens": 16618449.0, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.1740911418330773, |
| "grad_norm": 0.7431179285049438, |
| "learning_rate": 4.917494603146632e-06, |
| "loss": 0.6419, |
| "mean_token_accuracy": 0.8054893761873245, |
| "num_tokens": 16683985.0, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.17477385219320704, |
| "grad_norm": 0.76502525806427, |
| "learning_rate": 4.9159683087221835e-06, |
| "loss": 0.6393, |
| "mean_token_accuracy": 0.8057032078504562, |
| "num_tokens": 16749521.0, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.17545656255333675, |
| "grad_norm": 0.793250322341919, |
| "learning_rate": 4.91442826694036e-06, |
| "loss": 0.7001, |
| "mean_token_accuracy": 0.7883522659540176, |
| "num_tokens": 16815057.0, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.17613927291346645, |
| "grad_norm": 0.8034051060676575, |
| "learning_rate": 4.912874486564317e-06, |
| "loss": 0.655, |
| "mean_token_accuracy": 0.7997875660657883, |
| "num_tokens": 16880373.0, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.1768219832735962, |
| "grad_norm": 0.7254968285560608, |
| "learning_rate": 4.911306976435384e-06, |
| "loss": 0.6268, |
| "mean_token_accuracy": 0.8077804297208786, |
| "num_tokens": 16945909.0, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.1775046936337259, |
| "grad_norm": 0.7864726781845093, |
| "learning_rate": 4.909725745473017e-06, |
| "loss": 0.6524, |
| "mean_token_accuracy": 0.7994562685489655, |
| "num_tokens": 17011445.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.1781874039938556, |
| "grad_norm": 0.7491886019706726, |
| "learning_rate": 4.908130802674747e-06, |
| "loss": 0.6405, |
| "mean_token_accuracy": 0.803121954202652, |
| "num_tokens": 17076981.0, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.1788701143539853, |
| "grad_norm": 0.7460026144981384, |
| "learning_rate": 4.906522157116125e-06, |
| "loss": 0.6518, |
| "mean_token_accuracy": 0.799975574016571, |
| "num_tokens": 17142517.0, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.17955282471411504, |
| "grad_norm": 0.7651973962783813, |
| "learning_rate": 4.904899817950677e-06, |
| "loss": 0.6911, |
| "mean_token_accuracy": 0.7881384491920471, |
| "num_tokens": 17208053.0, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.18023553507424475, |
| "grad_norm": 0.773990273475647, |
| "learning_rate": 4.903263794409846e-06, |
| "loss": 0.6426, |
| "mean_token_accuracy": 0.8015334904193878, |
| "num_tokens": 17273589.0, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.18091824543437446, |
| "grad_norm": 0.7365688681602478, |
| "learning_rate": 4.901614095802945e-06, |
| "loss": 0.6416, |
| "mean_token_accuracy": 0.8022055327892303, |
| "num_tokens": 17339125.0, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.1816009557945042, |
| "grad_norm": 0.7673572301864624, |
| "learning_rate": 4.8999507315170965e-06, |
| "loss": 0.6567, |
| "mean_token_accuracy": 0.7990744113922119, |
| "num_tokens": 17404661.0, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.1822836661546339, |
| "grad_norm": 0.751385509967804, |
| "learning_rate": 4.898273711017187e-06, |
| "loss": 0.6364, |
| "mean_token_accuracy": 0.8052991777658463, |
| "num_tokens": 17469831.0, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.1829663765147636, |
| "grad_norm": 0.744659423828125, |
| "learning_rate": 4.896583043845809e-06, |
| "loss": 0.6295, |
| "mean_token_accuracy": 0.8060995042324066, |
| "num_tokens": 17535046.0, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.1836490868748933, |
| "grad_norm": 0.708869993686676, |
| "learning_rate": 4.894878739623207e-06, |
| "loss": 0.5935, |
| "mean_token_accuracy": 0.8142259269952774, |
| "num_tokens": 17600582.0, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.18433179723502305, |
| "grad_norm": 0.7676243185997009, |
| "learning_rate": 4.893160808047222e-06, |
| "loss": 0.6736, |
| "mean_token_accuracy": 0.7934078723192215, |
| "num_tokens": 17666118.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.18501450759515276, |
| "grad_norm": 0.7718145251274109, |
| "learning_rate": 4.89142925889324e-06, |
| "loss": 0.6561, |
| "mean_token_accuracy": 0.7996843755245209, |
| "num_tokens": 17730784.0, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.18569721795528246, |
| "grad_norm": 0.7711499929428101, |
| "learning_rate": 4.889684102014132e-06, |
| "loss": 0.649, |
| "mean_token_accuracy": 0.800556480884552, |
| "num_tokens": 17796029.0, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.1863799283154122, |
| "grad_norm": 0.7592529654502869, |
| "learning_rate": 4.887925347340199e-06, |
| "loss": 0.6602, |
| "mean_token_accuracy": 0.797866091132164, |
| "num_tokens": 17860903.0, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.1870626386755419, |
| "grad_norm": 0.7690698504447937, |
| "learning_rate": 4.886153004879119e-06, |
| "loss": 0.6666, |
| "mean_token_accuracy": 0.7956989407539368, |
| "num_tokens": 17926439.0, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.1877453490356716, |
| "grad_norm": 0.7630760669708252, |
| "learning_rate": 4.884367084715884e-06, |
| "loss": 0.6189, |
| "mean_token_accuracy": 0.8093841671943665, |
| "num_tokens": 17991975.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.18842805939580134, |
| "grad_norm": 0.7848920822143555, |
| "learning_rate": 4.882567597012749e-06, |
| "loss": 0.6744, |
| "mean_token_accuracy": 0.7962527424097061, |
| "num_tokens": 18057034.0, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.18911076975593105, |
| "grad_norm": 0.7455816268920898, |
| "learning_rate": 4.88075455200917e-06, |
| "loss": 0.6357, |
| "mean_token_accuracy": 0.8023249208927155, |
| "num_tokens": 18122157.0, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.18979348011606076, |
| "grad_norm": 0.7486319541931152, |
| "learning_rate": 4.878927960021746e-06, |
| "loss": 0.6457, |
| "mean_token_accuracy": 0.8028607368469238, |
| "num_tokens": 18187425.0, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.19047619047619047, |
| "grad_norm": 0.7425917983055115, |
| "learning_rate": 4.87708783144416e-06, |
| "loss": 0.6426, |
| "mean_token_accuracy": 0.8033974319696426, |
| "num_tokens": 18252951.0, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.1911589008363202, |
| "grad_norm": 0.7807708978652954, |
| "learning_rate": 4.875234176747125e-06, |
| "loss": 0.7113, |
| "mean_token_accuracy": 0.7846617698669434, |
| "num_tokens": 18317646.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1918416111964499, |
| "grad_norm": 0.7756418585777283, |
| "learning_rate": 4.873367006478319e-06, |
| "loss": 0.6106, |
| "mean_token_accuracy": 0.8120723366737366, |
| "num_tokens": 18383182.0, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.19252432155657961, |
| "grad_norm": 0.7727638483047485, |
| "learning_rate": 4.871486331262322e-06, |
| "loss": 0.6201, |
| "mean_token_accuracy": 0.8101401478052139, |
| "num_tokens": 18448389.0, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.19320703191670935, |
| "grad_norm": 0.7593509554862976, |
| "learning_rate": 4.869592161800566e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8057032078504562, |
| "num_tokens": 18513925.0, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.19388974227683906, |
| "grad_norm": 0.7642320394515991, |
| "learning_rate": 4.867684508871264e-06, |
| "loss": 0.6609, |
| "mean_token_accuracy": 0.7985596507787704, |
| "num_tokens": 18579436.0, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.19457245263696876, |
| "grad_norm": 0.761231541633606, |
| "learning_rate": 4.865763383329356e-06, |
| "loss": 0.6115, |
| "mean_token_accuracy": 0.8105635046958923, |
| "num_tokens": 18644911.0, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.19525516299709847, |
| "grad_norm": 0.7838804125785828, |
| "learning_rate": 4.8638287961064405e-06, |
| "loss": 0.6728, |
| "mean_token_accuracy": 0.7939742207527161, |
| "num_tokens": 18710341.0, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.1959378733572282, |
| "grad_norm": 0.8204277157783508, |
| "learning_rate": 4.861880758210717e-06, |
| "loss": 0.6403, |
| "mean_token_accuracy": 0.8030597120523453, |
| "num_tokens": 18775099.0, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.1966205837173579, |
| "grad_norm": 0.7632973194122314, |
| "learning_rate": 4.859919280726925e-06, |
| "loss": 0.6266, |
| "mean_token_accuracy": 0.807135596871376, |
| "num_tokens": 18840451.0, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.19730329407748762, |
| "grad_norm": 0.7931602001190186, |
| "learning_rate": 4.857944374816272e-06, |
| "loss": 0.6502, |
| "mean_token_accuracy": 0.7989926338195801, |
| "num_tokens": 18905826.0, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.19798600443761735, |
| "grad_norm": 0.7971415519714355, |
| "learning_rate": 4.855956051716382e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.7989369630813599, |
| "num_tokens": 18971362.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.19866871479774706, |
| "grad_norm": 0.7802919149398804, |
| "learning_rate": 4.853954322741221e-06, |
| "loss": 0.6415, |
| "mean_token_accuracy": 0.8015654236078262, |
| "num_tokens": 19036550.0, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.19935142515787677, |
| "grad_norm": 0.7734397649765015, |
| "learning_rate": 4.851939199281042e-06, |
| "loss": 0.6578, |
| "mean_token_accuracy": 0.7977914214134216, |
| "num_tokens": 19102086.0, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.20003413551800647, |
| "grad_norm": 0.7715845704078674, |
| "learning_rate": 4.84991069280231e-06, |
| "loss": 0.6628, |
| "mean_token_accuracy": 0.7946704924106598, |
| "num_tokens": 19167541.0, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.2007168458781362, |
| "grad_norm": 0.7626240253448486, |
| "learning_rate": 4.847868814847646e-06, |
| "loss": 0.6318, |
| "mean_token_accuracy": 0.8061958402395248, |
| "num_tokens": 19232955.0, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.20139955623826591, |
| "grad_norm": 0.7462788820266724, |
| "learning_rate": 4.845813577035756e-06, |
| "loss": 0.6455, |
| "mean_token_accuracy": 0.8009988963603973, |
| "num_tokens": 19298491.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.20208226659839562, |
| "grad_norm": 0.7987794280052185, |
| "learning_rate": 4.843744991061366e-06, |
| "loss": 0.674, |
| "mean_token_accuracy": 0.7931847423315048, |
| "num_tokens": 19363418.0, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.20276497695852536, |
| "grad_norm": 0.7509877681732178, |
| "learning_rate": 4.841663068695157e-06, |
| "loss": 0.6494, |
| "mean_token_accuracy": 0.8006734549999237, |
| "num_tokens": 19428911.0, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.20344768731865506, |
| "grad_norm": 0.762054979801178, |
| "learning_rate": 4.839567821783696e-06, |
| "loss": 0.6296, |
| "mean_token_accuracy": 0.8052349090576172, |
| "num_tokens": 19494165.0, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.20413039767878477, |
| "grad_norm": 0.7115033268928528, |
| "learning_rate": 4.8374592622493696e-06, |
| "loss": 0.6046, |
| "mean_token_accuracy": 0.8131680637598038, |
| "num_tokens": 19559490.0, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.2048131080389145, |
| "grad_norm": 0.7701613306999207, |
| "learning_rate": 4.835337402090317e-06, |
| "loss": 0.6404, |
| "mean_token_accuracy": 0.8019764274358749, |
| "num_tokens": 19625026.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2054958183990442, |
| "grad_norm": 0.8328090310096741, |
| "learning_rate": 4.833202253380357e-06, |
| "loss": 0.6707, |
| "mean_token_accuracy": 0.7956531047821045, |
| "num_tokens": 19690562.0, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.20617852875917392, |
| "grad_norm": 0.7679016590118408, |
| "learning_rate": 4.831053828268927e-06, |
| "loss": 0.6237, |
| "mean_token_accuracy": 0.809225007891655, |
| "num_tokens": 19755943.0, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.20686123911930362, |
| "grad_norm": 0.7081552743911743, |
| "learning_rate": 4.828892138981009e-06, |
| "loss": 0.5685, |
| "mean_token_accuracy": 0.8225348144769669, |
| "num_tokens": 19821479.0, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.20754394947943336, |
| "grad_norm": 0.7495241761207581, |
| "learning_rate": 4.82671719781706e-06, |
| "loss": 0.6344, |
| "mean_token_accuracy": 0.8030302971601486, |
| "num_tokens": 19887015.0, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.20822665983956307, |
| "grad_norm": 0.7506179213523865, |
| "learning_rate": 4.824529017152943e-06, |
| "loss": 0.6156, |
| "mean_token_accuracy": 0.8103922307491302, |
| "num_tokens": 19952551.0, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.20890937019969277, |
| "grad_norm": 0.8004708886146545, |
| "learning_rate": 4.822327609439857e-06, |
| "loss": 0.6525, |
| "mean_token_accuracy": 0.7992118746042252, |
| "num_tokens": 20018087.0, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.2095920805598225, |
| "grad_norm": 0.7550822496414185, |
| "learning_rate": 4.820112987204265e-06, |
| "loss": 0.6152, |
| "mean_token_accuracy": 0.8115045875310898, |
| "num_tokens": 20083376.0, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.21027479091995221, |
| "grad_norm": 0.7593732476234436, |
| "learning_rate": 4.817885163047824e-06, |
| "loss": 0.6825, |
| "mean_token_accuracy": 0.7943395674228668, |
| "num_tokens": 20148912.0, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.21095750128008192, |
| "grad_norm": 0.7294323444366455, |
| "learning_rate": 4.815644149647313e-06, |
| "loss": 0.6264, |
| "mean_token_accuracy": 0.8054499328136444, |
| "num_tokens": 20214180.0, |
| "step": 309 |
| }, |
| { |
| "epoch": 0.21164021164021163, |
| "grad_norm": 0.8109260201454163, |
| "learning_rate": 4.813389959754559e-06, |
| "loss": 0.6476, |
| "mean_token_accuracy": 0.7988300323486328, |
| "num_tokens": 20279716.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.21232292200034136, |
| "grad_norm": 0.735640823841095, |
| "learning_rate": 4.811122606196367e-06, |
| "loss": 0.6329, |
| "mean_token_accuracy": 0.805917039513588, |
| "num_tokens": 20345252.0, |
| "step": 311 |
| }, |
| { |
| "epoch": 0.21300563236047107, |
| "grad_norm": 0.7742691040039062, |
| "learning_rate": 4.808842101874447e-06, |
| "loss": 0.654, |
| "mean_token_accuracy": 0.80037821829319, |
| "num_tokens": 20410686.0, |
| "step": 312 |
| }, |
| { |
| "epoch": 0.21368834272060078, |
| "grad_norm": 0.8276035785675049, |
| "learning_rate": 4.806548459765337e-06, |
| "loss": 0.6349, |
| "mean_token_accuracy": 0.8064994513988495, |
| "num_tokens": 20475985.0, |
| "step": 313 |
| }, |
| { |
| "epoch": 0.2143710530807305, |
| "grad_norm": 0.8679972290992737, |
| "learning_rate": 4.804241692920333e-06, |
| "loss": 0.656, |
| "mean_token_accuracy": 0.799239456653595, |
| "num_tokens": 20541000.0, |
| "step": 314 |
| }, |
| { |
| "epoch": 0.21505376344086022, |
| "grad_norm": 0.781409502029419, |
| "learning_rate": 4.8019218144654135e-06, |
| "loss": 0.7053, |
| "mean_token_accuracy": 0.7867943644523621, |
| "num_tokens": 20606536.0, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.21573647380098993, |
| "grad_norm": 0.7729482650756836, |
| "learning_rate": 4.799588837601167e-06, |
| "loss": 0.6525, |
| "mean_token_accuracy": 0.8019153326749802, |
| "num_tokens": 20672072.0, |
| "step": 316 |
| }, |
| { |
| "epoch": 0.21641918416111963, |
| "grad_norm": 0.8480416536331177, |
| "learning_rate": 4.79724277560271e-06, |
| "loss": 0.6216, |
| "mean_token_accuracy": 0.8093619346618652, |
| "num_tokens": 20736796.0, |
| "step": 317 |
| }, |
| { |
| "epoch": 0.21710189452124937, |
| "grad_norm": 0.7638158202171326, |
| "learning_rate": 4.794883641819619e-06, |
| "loss": 0.6586, |
| "mean_token_accuracy": 0.7982496321201324, |
| "num_tokens": 20802332.0, |
| "step": 318 |
| }, |
| { |
| "epoch": 0.21778460488137907, |
| "grad_norm": 0.7327334880828857, |
| "learning_rate": 4.792511449675852e-06, |
| "loss": 0.6338, |
| "mean_token_accuracy": 0.8040231019258499, |
| "num_tokens": 20867868.0, |
| "step": 319 |
| }, |
| { |
| "epoch": 0.21846731524150878, |
| "grad_norm": 0.8290518522262573, |
| "learning_rate": 4.79012621266967e-06, |
| "loss": 0.6651, |
| "mean_token_accuracy": 0.7965084463357925, |
| "num_tokens": 20933404.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.21915002560163852, |
| "grad_norm": 0.7668699026107788, |
| "learning_rate": 4.787727944373565e-06, |
| "loss": 0.6644, |
| "mean_token_accuracy": 0.7934842258691788, |
| "num_tokens": 20998940.0, |
| "step": 321 |
| }, |
| { |
| "epoch": 0.21983273596176822, |
| "grad_norm": 0.7874463796615601, |
| "learning_rate": 4.7853166584341745e-06, |
| "loss": 0.6177, |
| "mean_token_accuracy": 0.8053746223449707, |
| "num_tokens": 21064190.0, |
| "step": 322 |
| }, |
| { |
| "epoch": 0.22051544632189793, |
| "grad_norm": 0.7763647437095642, |
| "learning_rate": 4.7828923685722155e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.808009535074234, |
| "num_tokens": 21129726.0, |
| "step": 323 |
| }, |
| { |
| "epoch": 0.22119815668202766, |
| "grad_norm": 0.8179218173027039, |
| "learning_rate": 4.780455088582394e-06, |
| "loss": 0.645, |
| "mean_token_accuracy": 0.8021404594182968, |
| "num_tokens": 21195177.0, |
| "step": 324 |
| }, |
| { |
| "epoch": 0.22188086704215737, |
| "grad_norm": 0.7938908338546753, |
| "learning_rate": 4.778004832333337e-06, |
| "loss": 0.6616, |
| "mean_token_accuracy": 0.7988453060388565, |
| "num_tokens": 21260713.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.22256357740228708, |
| "grad_norm": 0.7682486176490784, |
| "learning_rate": 4.775541613767506e-06, |
| "loss": 0.6328, |
| "mean_token_accuracy": 0.8051348775625229, |
| "num_tokens": 21326248.0, |
| "step": 326 |
| }, |
| { |
| "epoch": 0.22324628776241678, |
| "grad_norm": 0.766139805316925, |
| "learning_rate": 4.773065446901123e-06, |
| "loss": 0.617, |
| "mean_token_accuracy": 0.8100256621837616, |
| "num_tokens": 21391784.0, |
| "step": 327 |
| }, |
| { |
| "epoch": 0.22392899812254652, |
| "grad_norm": 0.8150837421417236, |
| "learning_rate": 4.770576345824087e-06, |
| "loss": 0.6582, |
| "mean_token_accuracy": 0.7959434241056442, |
| "num_tokens": 21456337.0, |
| "step": 328 |
| }, |
| { |
| "epoch": 0.22461170848267623, |
| "grad_norm": 0.8115120530128479, |
| "learning_rate": 4.768074324699897e-06, |
| "loss": 0.6373, |
| "mean_token_accuracy": 0.8020833283662796, |
| "num_tokens": 21521873.0, |
| "step": 329 |
| }, |
| { |
| "epoch": 0.22529441884280593, |
| "grad_norm": 0.7324831485748291, |
| "learning_rate": 4.765559397765568e-06, |
| "loss": 0.618, |
| "mean_token_accuracy": 0.8072916716337204, |
| "num_tokens": 21587409.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.22597712920293567, |
| "grad_norm": 0.8572807908058167, |
| "learning_rate": 4.763031579331552e-06, |
| "loss": 0.6274, |
| "mean_token_accuracy": 0.8070014715194702, |
| "num_tokens": 21652945.0, |
| "step": 331 |
| }, |
| { |
| "epoch": 0.22665983956306537, |
| "grad_norm": 0.7536141276359558, |
| "learning_rate": 4.760490883781657e-06, |
| "loss": 0.6069, |
| "mean_token_accuracy": 0.8116141259670258, |
| "num_tokens": 21718481.0, |
| "step": 332 |
| }, |
| { |
| "epoch": 0.22734254992319508, |
| "grad_norm": 0.7357029318809509, |
| "learning_rate": 4.757937325572963e-06, |
| "loss": 0.6398, |
| "mean_token_accuracy": 0.8047867864370346, |
| "num_tokens": 21784017.0, |
| "step": 333 |
| }, |
| { |
| "epoch": 0.2280252602833248, |
| "grad_norm": 0.7707255482673645, |
| "learning_rate": 4.755370919235743e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8075971454381943, |
| "num_tokens": 21849553.0, |
| "step": 334 |
| }, |
| { |
| "epoch": 0.22870797064345452, |
| "grad_norm": 0.7792888879776001, |
| "learning_rate": 4.752791679373379e-06, |
| "loss": 0.6313, |
| "mean_token_accuracy": 0.8058997839689255, |
| "num_tokens": 21914714.0, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.22939068100358423, |
| "grad_norm": 0.8160707354545593, |
| "learning_rate": 4.750199620662276e-06, |
| "loss": 0.6464, |
| "mean_token_accuracy": 0.800752580165863, |
| "num_tokens": 21979602.0, |
| "step": 336 |
| }, |
| { |
| "epoch": 0.23007339136371394, |
| "grad_norm": 0.7280910015106201, |
| "learning_rate": 4.747594757851781e-06, |
| "loss": 0.6274, |
| "mean_token_accuracy": 0.8049395233392715, |
| "num_tokens": 22045138.0, |
| "step": 337 |
| }, |
| { |
| "epoch": 0.23075610172384367, |
| "grad_norm": 0.7372390627861023, |
| "learning_rate": 4.744977105764102e-06, |
| "loss": 0.6241, |
| "mean_token_accuracy": 0.8091856092214584, |
| "num_tokens": 22110674.0, |
| "step": 338 |
| }, |
| { |
| "epoch": 0.23143881208397338, |
| "grad_norm": 0.757652997970581, |
| "learning_rate": 4.742346679294218e-06, |
| "loss": 0.6618, |
| "mean_token_accuracy": 0.7985092997550964, |
| "num_tokens": 22176210.0, |
| "step": 339 |
| }, |
| { |
| "epoch": 0.23212152244410308, |
| "grad_norm": 0.7790650129318237, |
| "learning_rate": 4.739703493409797e-06, |
| "loss": 0.6201, |
| "mean_token_accuracy": 0.80793197453022, |
| "num_tokens": 22241702.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.2328042328042328, |
| "grad_norm": 0.7788594961166382, |
| "learning_rate": 4.73704756315111e-06, |
| "loss": 0.6437, |
| "mean_token_accuracy": 0.8009341210126877, |
| "num_tokens": 22307095.0, |
| "step": 341 |
| }, |
| { |
| "epoch": 0.23348694316436253, |
| "grad_norm": 0.7534693479537964, |
| "learning_rate": 4.73437890363095e-06, |
| "loss": 0.6437, |
| "mean_token_accuracy": 0.802969217300415, |
| "num_tokens": 22372631.0, |
| "step": 342 |
| }, |
| { |
| "epoch": 0.23416965352449223, |
| "grad_norm": 0.7882516384124756, |
| "learning_rate": 4.731697530034538e-06, |
| "loss": 0.655, |
| "mean_token_accuracy": 0.7998992055654526, |
| "num_tokens": 22438167.0, |
| "step": 343 |
| }, |
| { |
| "epoch": 0.23485236388462194, |
| "grad_norm": 0.7213677763938904, |
| "learning_rate": 4.729003457619441e-06, |
| "loss": 0.6046, |
| "mean_token_accuracy": 0.8131537586450577, |
| "num_tokens": 22503611.0, |
| "step": 344 |
| }, |
| { |
| "epoch": 0.23553507424475167, |
| "grad_norm": 0.767708957195282, |
| "learning_rate": 4.726296701715489e-06, |
| "loss": 0.6736, |
| "mean_token_accuracy": 0.794715091586113, |
| "num_tokens": 22569080.0, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.23621778460488138, |
| "grad_norm": 0.7923814058303833, |
| "learning_rate": 4.723577277724678e-06, |
| "loss": 0.6895, |
| "mean_token_accuracy": 0.7892992496490479, |
| "num_tokens": 22634616.0, |
| "step": 346 |
| }, |
| { |
| "epoch": 0.2369004949650111, |
| "grad_norm": 0.7433699369430542, |
| "learning_rate": 4.720845201121092e-06, |
| "loss": 0.5982, |
| "mean_token_accuracy": 0.8131873160600662, |
| "num_tokens": 22700152.0, |
| "step": 347 |
| }, |
| { |
| "epoch": 0.23758320532514082, |
| "grad_norm": 0.7465718984603882, |
| "learning_rate": 4.71810048745081e-06, |
| "loss": 0.6782, |
| "mean_token_accuracy": 0.7941065728664398, |
| "num_tokens": 22764843.0, |
| "step": 348 |
| }, |
| { |
| "epoch": 0.23826591568527053, |
| "grad_norm": 0.7546844482421875, |
| "learning_rate": 4.715343152331816e-06, |
| "loss": 0.6164, |
| "mean_token_accuracy": 0.8110382258892059, |
| "num_tokens": 22830057.0, |
| "step": 349 |
| }, |
| { |
| "epoch": 0.23894862604540024, |
| "grad_norm": 0.7519754767417908, |
| "learning_rate": 4.712573211453918e-06, |
| "loss": 0.6608, |
| "mean_token_accuracy": 0.7954239994287491, |
| "num_tokens": 22895593.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.23963133640552994, |
| "grad_norm": 0.7927505373954773, |
| "learning_rate": 4.7097906805786474e-06, |
| "loss": 0.6448, |
| "mean_token_accuracy": 0.7999225705862045, |
| "num_tokens": 22960640.0, |
| "step": 351 |
| }, |
| { |
| "epoch": 0.24031404676565968, |
| "grad_norm": 0.7735369205474854, |
| "learning_rate": 4.70699557553918e-06, |
| "loss": 0.6456, |
| "mean_token_accuracy": 0.8013502061367035, |
| "num_tokens": 23026176.0, |
| "step": 352 |
| }, |
| { |
| "epoch": 0.24099675712578938, |
| "grad_norm": 0.7586895823478699, |
| "learning_rate": 4.704187912240239e-06, |
| "loss": 0.6038, |
| "mean_token_accuracy": 0.8122097998857498, |
| "num_tokens": 23091712.0, |
| "step": 353 |
| }, |
| { |
| "epoch": 0.2416794674859191, |
| "grad_norm": 0.7558093667030334, |
| "learning_rate": 4.701367706658006e-06, |
| "loss": 0.6612, |
| "mean_token_accuracy": 0.7979543209075928, |
| "num_tokens": 23157067.0, |
| "step": 354 |
| }, |
| { |
| "epoch": 0.24236217784604883, |
| "grad_norm": 0.7904984951019287, |
| "learning_rate": 4.698534974840033e-06, |
| "loss": 0.6475, |
| "mean_token_accuracy": 0.8018562942743301, |
| "num_tokens": 23222084.0, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.24304488820617853, |
| "grad_norm": 0.7433775067329407, |
| "learning_rate": 4.6956897329051456e-06, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8106366097927094, |
| "num_tokens": 23287620.0, |
| "step": 356 |
| }, |
| { |
| "epoch": 0.24372759856630824, |
| "grad_norm": 0.766858696937561, |
| "learning_rate": 4.692831997043359e-06, |
| "loss": 0.6628, |
| "mean_token_accuracy": 0.7968552559614182, |
| "num_tokens": 23352620.0, |
| "step": 357 |
| }, |
| { |
| "epoch": 0.24441030892643795, |
| "grad_norm": 0.7720522880554199, |
| "learning_rate": 4.689961783515777e-06, |
| "loss": 0.6159, |
| "mean_token_accuracy": 0.8069179058074951, |
| "num_tokens": 23418065.0, |
| "step": 358 |
| }, |
| { |
| "epoch": 0.24509301928656768, |
| "grad_norm": 0.7662636637687683, |
| "learning_rate": 4.687079108654508e-06, |
| "loss": 0.6449, |
| "mean_token_accuracy": 0.8010782450437546, |
| "num_tokens": 23483557.0, |
| "step": 359 |
| }, |
| { |
| "epoch": 0.2457757296466974, |
| "grad_norm": 0.7301419377326965, |
| "learning_rate": 4.684183988862563e-06, |
| "loss": 0.6264, |
| "mean_token_accuracy": 0.8084524720907211, |
| "num_tokens": 23549093.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.2464584400068271, |
| "grad_norm": 0.746433675289154, |
| "learning_rate": 4.681276440613772e-06, |
| "loss": 0.6566, |
| "mean_token_accuracy": 0.7988911420106888, |
| "num_tokens": 23614629.0, |
| "step": 361 |
| }, |
| { |
| "epoch": 0.24714115036695683, |
| "grad_norm": 0.7949883341789246, |
| "learning_rate": 4.678356480452683e-06, |
| "loss": 0.6575, |
| "mean_token_accuracy": 0.7978372424840927, |
| "num_tokens": 23680165.0, |
| "step": 362 |
| }, |
| { |
| "epoch": 0.24782386072708654, |
| "grad_norm": 0.7822641730308533, |
| "learning_rate": 4.675424124994471e-06, |
| "loss": 0.6526, |
| "mean_token_accuracy": 0.7977532297372818, |
| "num_tokens": 23744937.0, |
| "step": 363 |
| }, |
| { |
| "epoch": 0.24850657108721624, |
| "grad_norm": 0.7230205535888672, |
| "learning_rate": 4.672479390924842e-06, |
| "loss": 0.6103, |
| "mean_token_accuracy": 0.8111226707696915, |
| "num_tokens": 23810143.0, |
| "step": 364 |
| }, |
| { |
| "epoch": 0.24918928144734595, |
| "grad_norm": 0.7075436115264893, |
| "learning_rate": 4.669522294999941e-06, |
| "loss": 0.6161, |
| "mean_token_accuracy": 0.8099951148033142, |
| "num_tokens": 23875679.0, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.24987199180747569, |
| "grad_norm": 0.7330030798912048, |
| "learning_rate": 4.66655285404625e-06, |
| "loss": 0.6507, |
| "mean_token_accuracy": 0.801278367638588, |
| "num_tokens": 23941169.0, |
| "step": 366 |
| }, |
| { |
| "epoch": 0.2505547021676054, |
| "grad_norm": 0.7502052783966064, |
| "learning_rate": 4.6635710849605034e-06, |
| "loss": 0.6205, |
| "mean_token_accuracy": 0.8079535663127899, |
| "num_tokens": 24006214.0, |
| "step": 367 |
| }, |
| { |
| "epoch": 0.2512374125277351, |
| "grad_norm": 0.755878210067749, |
| "learning_rate": 4.660577004709579e-06, |
| "loss": 0.6197, |
| "mean_token_accuracy": 0.8080706298351288, |
| "num_tokens": 24071750.0, |
| "step": 368 |
| }, |
| { |
| "epoch": 0.2519201228878648, |
| "grad_norm": 0.7299730777740479, |
| "learning_rate": 4.657570630330411e-06, |
| "loss": 0.6366, |
| "mean_token_accuracy": 0.8038407862186432, |
| "num_tokens": 24137117.0, |
| "step": 369 |
| }, |
| { |
| "epoch": 0.25260283324799454, |
| "grad_norm": 0.7870468497276306, |
| "learning_rate": 4.65455197892989e-06, |
| "loss": 0.6363, |
| "mean_token_accuracy": 0.8036412596702576, |
| "num_tokens": 24202653.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2532855436081243, |
| "grad_norm": 0.7470530271530151, |
| "learning_rate": 4.651521067684762e-06, |
| "loss": 0.668, |
| "mean_token_accuracy": 0.7951490730047226, |
| "num_tokens": 24268189.0, |
| "step": 371 |
| }, |
| { |
| "epoch": 0.25396825396825395, |
| "grad_norm": 0.7217208743095398, |
| "learning_rate": 4.6484779138415385e-06, |
| "loss": 0.5903, |
| "mean_token_accuracy": 0.8160587698221207, |
| "num_tokens": 24333725.0, |
| "step": 372 |
| }, |
| { |
| "epoch": 0.2546509643283837, |
| "grad_norm": 0.7956851720809937, |
| "learning_rate": 4.64542253471639e-06, |
| "loss": 0.6924, |
| "mean_token_accuracy": 0.7903948575258255, |
| "num_tokens": 24398857.0, |
| "step": 373 |
| }, |
| { |
| "epoch": 0.2553336746885134, |
| "grad_norm": 0.7602137923240662, |
| "learning_rate": 4.642354947695055e-06, |
| "loss": 0.6302, |
| "mean_token_accuracy": 0.8039003759622574, |
| "num_tokens": 24464251.0, |
| "step": 374 |
| }, |
| { |
| "epoch": 0.2560163850486431, |
| "grad_norm": 0.7951780557632446, |
| "learning_rate": 4.639275170232734e-06, |
| "loss": 0.6699, |
| "mean_token_accuracy": 0.7974366098642349, |
| "num_tokens": 24529439.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.25669909540877284, |
| "grad_norm": 0.8132842183113098, |
| "learning_rate": 4.636183219853996e-06, |
| "loss": 0.6646, |
| "mean_token_accuracy": 0.7951338142156601, |
| "num_tokens": 24594975.0, |
| "step": 376 |
| }, |
| { |
| "epoch": 0.2573818057689025, |
| "grad_norm": 0.7737603187561035, |
| "learning_rate": 4.633079114152676e-06, |
| "loss": 0.608, |
| "mean_token_accuracy": 0.811751589179039, |
| "num_tokens": 24660511.0, |
| "step": 377 |
| }, |
| { |
| "epoch": 0.25806451612903225, |
| "grad_norm": 0.804283082485199, |
| "learning_rate": 4.629962870791774e-06, |
| "loss": 0.6921, |
| "mean_token_accuracy": 0.7884904444217682, |
| "num_tokens": 24725671.0, |
| "step": 378 |
| }, |
| { |
| "epoch": 0.258747226489162, |
| "grad_norm": 0.7124019861221313, |
| "learning_rate": 4.626834507503357e-06, |
| "loss": 0.5663, |
| "mean_token_accuracy": 0.8226875513792038, |
| "num_tokens": 24791207.0, |
| "step": 379 |
| }, |
| { |
| "epoch": 0.25942993684929166, |
| "grad_norm": 0.7700932025909424, |
| "learning_rate": 4.623694042088457e-06, |
| "loss": 0.6398, |
| "mean_token_accuracy": 0.8026179224252701, |
| "num_tokens": 24856743.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2601126472094214, |
| "grad_norm": 0.792400598526001, |
| "learning_rate": 4.620541492416968e-06, |
| "loss": 0.6382, |
| "mean_token_accuracy": 0.8020680695772171, |
| "num_tokens": 24922279.0, |
| "step": 381 |
| }, |
| { |
| "epoch": 0.26079535756955113, |
| "grad_norm": 0.7475351691246033, |
| "learning_rate": 4.617376876427549e-06, |
| "loss": 0.6306, |
| "mean_token_accuracy": 0.8047256767749786, |
| "num_tokens": 24987815.0, |
| "step": 382 |
| }, |
| { |
| "epoch": 0.2614780679296808, |
| "grad_norm": 0.7587133049964905, |
| "learning_rate": 4.614200212127514e-06, |
| "loss": 0.6442, |
| "mean_token_accuracy": 0.8020219951868057, |
| "num_tokens": 25053170.0, |
| "step": 383 |
| }, |
| { |
| "epoch": 0.26216077828981055, |
| "grad_norm": 0.7837956547737122, |
| "learning_rate": 4.611011517592741e-06, |
| "loss": 0.598, |
| "mean_token_accuracy": 0.8139603137969971, |
| "num_tokens": 25117886.0, |
| "step": 384 |
| }, |
| { |
| "epoch": 0.2628434886499403, |
| "grad_norm": 0.7706968188285828, |
| "learning_rate": 4.607810810967556e-06, |
| "loss": 0.6207, |
| "mean_token_accuracy": 0.8087273985147476, |
| "num_tokens": 25183422.0, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.26352619901006996, |
| "grad_norm": 0.752069354057312, |
| "learning_rate": 4.604598110464639e-06, |
| "loss": 0.6318, |
| "mean_token_accuracy": 0.8054282814264297, |
| "num_tokens": 25248958.0, |
| "step": 386 |
| }, |
| { |
| "epoch": 0.2642089093701997, |
| "grad_norm": 0.7261558175086975, |
| "learning_rate": 4.60137343436492e-06, |
| "loss": 0.6412, |
| "mean_token_accuracy": 0.8022666275501251, |
| "num_tokens": 25314494.0, |
| "step": 387 |
| }, |
| { |
| "epoch": 0.26489161973032943, |
| "grad_norm": 0.7284137606620789, |
| "learning_rate": 4.5981368010174676e-06, |
| "loss": 0.606, |
| "mean_token_accuracy": 0.8140856623649597, |
| "num_tokens": 25379983.0, |
| "step": 388 |
| }, |
| { |
| "epoch": 0.2655743300904591, |
| "grad_norm": 0.7730989456176758, |
| "learning_rate": 4.5948882288393935e-06, |
| "loss": 0.6313, |
| "mean_token_accuracy": 0.8056421130895615, |
| "num_tokens": 25445519.0, |
| "step": 389 |
| }, |
| { |
| "epoch": 0.26625704045058884, |
| "grad_norm": 0.7879030108451843, |
| "learning_rate": 4.5916277363157434e-06, |
| "loss": 0.6762, |
| "mean_token_accuracy": 0.790292039513588, |
| "num_tokens": 25511055.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2669397508107186, |
| "grad_norm": 0.7832986116409302, |
| "learning_rate": 4.588355341999391e-06, |
| "loss": 0.663, |
| "mean_token_accuracy": 0.7950224727392197, |
| "num_tokens": 25576264.0, |
| "step": 391 |
| }, |
| { |
| "epoch": 0.26762246117084826, |
| "grad_norm": 0.7901932597160339, |
| "learning_rate": 4.585071064510933e-06, |
| "loss": 0.6107, |
| "mean_token_accuracy": 0.8095369040966034, |
| "num_tokens": 25641800.0, |
| "step": 392 |
| }, |
| { |
| "epoch": 0.268305171530978, |
| "grad_norm": 0.7595347762107849, |
| "learning_rate": 4.581774922538586e-06, |
| "loss": 0.642, |
| "mean_token_accuracy": 0.8009682595729828, |
| "num_tokens": 25707162.0, |
| "step": 393 |
| }, |
| { |
| "epoch": 0.26898788189110767, |
| "grad_norm": 0.7476745843887329, |
| "learning_rate": 4.578466934838076e-06, |
| "loss": 0.6396, |
| "mean_token_accuracy": 0.8006323426961899, |
| "num_tokens": 25772698.0, |
| "step": 394 |
| }, |
| { |
| "epoch": 0.2696705922512374, |
| "grad_norm": 0.7542417049407959, |
| "learning_rate": 4.575147120232536e-06, |
| "loss": 0.6125, |
| "mean_token_accuracy": 0.8105543851852417, |
| "num_tokens": 25838038.0, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.27035330261136714, |
| "grad_norm": 0.7183694243431091, |
| "learning_rate": 4.571815497612393e-06, |
| "loss": 0.6075, |
| "mean_token_accuracy": 0.8125783056020737, |
| "num_tokens": 25903529.0, |
| "step": 396 |
| }, |
| { |
| "epoch": 0.2710360129714968, |
| "grad_norm": 0.747693657875061, |
| "learning_rate": 4.568472085935267e-06, |
| "loss": 0.6624, |
| "mean_token_accuracy": 0.8001435697078705, |
| "num_tokens": 25969065.0, |
| "step": 397 |
| }, |
| { |
| "epoch": 0.27171872333162655, |
| "grad_norm": 0.8016436696052551, |
| "learning_rate": 4.5651169042258605e-06, |
| "loss": 0.6261, |
| "mean_token_accuracy": 0.8060239553451538, |
| "num_tokens": 26034601.0, |
| "step": 398 |
| }, |
| { |
| "epoch": 0.2724014336917563, |
| "grad_norm": 0.7097094655036926, |
| "learning_rate": 4.561749971575846e-06, |
| "loss": 0.6236, |
| "mean_token_accuracy": 0.808208093047142, |
| "num_tokens": 26100137.0, |
| "step": 399 |
| }, |
| { |
| "epoch": 0.27308414405188597, |
| "grad_norm": 0.7615159749984741, |
| "learning_rate": 4.558371307143766e-06, |
| "loss": 0.6452, |
| "mean_token_accuracy": 0.7992061525583267, |
| "num_tokens": 26165490.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.2737668544120157, |
| "grad_norm": 0.7655863761901855, |
| "learning_rate": 4.554980930154916e-06, |
| "loss": 0.5753, |
| "mean_token_accuracy": 0.8202753812074661, |
| "num_tokens": 26230866.0, |
| "step": 401 |
| }, |
| { |
| "epoch": 0.27444956477214544, |
| "grad_norm": 0.7693037986755371, |
| "learning_rate": 4.55157885990124e-06, |
| "loss": 0.6185, |
| "mean_token_accuracy": 0.8076163828372955, |
| "num_tokens": 26295675.0, |
| "step": 402 |
| }, |
| { |
| "epoch": 0.2751322751322751, |
| "grad_norm": 0.7439252734184265, |
| "learning_rate": 4.548165115741218e-06, |
| "loss": 0.6128, |
| "mean_token_accuracy": 0.8100256621837616, |
| "num_tokens": 26361211.0, |
| "step": 403 |
| }, |
| { |
| "epoch": 0.27581498549240485, |
| "grad_norm": 0.7341496348381042, |
| "learning_rate": 4.544739717099758e-06, |
| "loss": 0.6449, |
| "mean_token_accuracy": 0.8028012067079544, |
| "num_tokens": 26426747.0, |
| "step": 404 |
| }, |
| { |
| "epoch": 0.2764976958525346, |
| "grad_norm": 0.7716757655143738, |
| "learning_rate": 4.541302683468084e-06, |
| "loss": 0.6469, |
| "mean_token_accuracy": 0.8010788112878799, |
| "num_tokens": 26491980.0, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.27718040621266427, |
| "grad_norm": 0.7929121255874634, |
| "learning_rate": 4.537854034403626e-06, |
| "loss": 0.6995, |
| "mean_token_accuracy": 0.7865194380283356, |
| "num_tokens": 26557516.0, |
| "step": 406 |
| }, |
| { |
| "epoch": 0.277863116572794, |
| "grad_norm": 0.7356821894645691, |
| "learning_rate": 4.534393789529905e-06, |
| "loss": 0.6291, |
| "mean_token_accuracy": 0.8049547970294952, |
| "num_tokens": 26623052.0, |
| "step": 407 |
| }, |
| { |
| "epoch": 0.27854582693292373, |
| "grad_norm": 0.7607781887054443, |
| "learning_rate": 4.53092196853643e-06, |
| "loss": 0.6226, |
| "mean_token_accuracy": 0.8083969801664352, |
| "num_tokens": 26688234.0, |
| "step": 408 |
| }, |
| { |
| "epoch": 0.2792285372930534, |
| "grad_norm": 0.7681825757026672, |
| "learning_rate": 4.5274385911785765e-06, |
| "loss": 0.6866, |
| "mean_token_accuracy": 0.7905820459127426, |
| "num_tokens": 26753765.0, |
| "step": 409 |
| }, |
| { |
| "epoch": 0.27991124765318315, |
| "grad_norm": 0.7501204013824463, |
| "learning_rate": 4.52394367727748e-06, |
| "loss": 0.6388, |
| "mean_token_accuracy": 0.8037634491920471, |
| "num_tokens": 26819301.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.2805939580133128, |
| "grad_norm": 0.7248852849006653, |
| "learning_rate": 4.52043724671992e-06, |
| "loss": 0.6373, |
| "mean_token_accuracy": 0.8037328869104385, |
| "num_tokens": 26884837.0, |
| "step": 411 |
| }, |
| { |
| "epoch": 0.28127666837344256, |
| "grad_norm": 0.7588721513748169, |
| "learning_rate": 4.516919319458209e-06, |
| "loss": 0.6697, |
| "mean_token_accuracy": 0.7954139858484268, |
| "num_tokens": 26949905.0, |
| "step": 412 |
| }, |
| { |
| "epoch": 0.2819593787335723, |
| "grad_norm": 0.772433876991272, |
| "learning_rate": 4.513389915510075e-06, |
| "loss": 0.6095, |
| "mean_token_accuracy": 0.8109726309776306, |
| "num_tokens": 27015441.0, |
| "step": 413 |
| }, |
| { |
| "epoch": 0.282642089093702, |
| "grad_norm": 0.741692841053009, |
| "learning_rate": 4.509849054958559e-06, |
| "loss": 0.5835, |
| "mean_token_accuracy": 0.8177823275327682, |
| "num_tokens": 27080627.0, |
| "step": 414 |
| }, |
| { |
| "epoch": 0.2833247994538317, |
| "grad_norm": 0.743645966053009, |
| "learning_rate": 4.506296757951883e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8136608004570007, |
| "num_tokens": 27146163.0, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.28400750981396145, |
| "grad_norm": 0.7747307419776917, |
| "learning_rate": 4.50273304470335e-06, |
| "loss": 0.6248, |
| "mean_token_accuracy": 0.8071649372577667, |
| "num_tokens": 27211646.0, |
| "step": 416 |
| }, |
| { |
| "epoch": 0.2846902201740911, |
| "grad_norm": 0.7713055610656738, |
| "learning_rate": 4.499157935491222e-06, |
| "loss": 0.6555, |
| "mean_token_accuracy": 0.7991966158151627, |
| "num_tokens": 27277182.0, |
| "step": 417 |
| }, |
| { |
| "epoch": 0.28537293053422086, |
| "grad_norm": 0.7445299029350281, |
| "learning_rate": 4.4955714506586064e-06, |
| "loss": 0.6093, |
| "mean_token_accuracy": 0.8092619180679321, |
| "num_tokens": 27342578.0, |
| "step": 418 |
| }, |
| { |
| "epoch": 0.2860556408943506, |
| "grad_norm": 0.740449845790863, |
| "learning_rate": 4.491973610613343e-06, |
| "loss": 0.631, |
| "mean_token_accuracy": 0.8066043555736542, |
| "num_tokens": 27408114.0, |
| "step": 419 |
| }, |
| { |
| "epoch": 0.2867383512544803, |
| "grad_norm": 0.7795570492744446, |
| "learning_rate": 4.4883644358278815e-06, |
| "loss": 0.6263, |
| "mean_token_accuracy": 0.8064363300800323, |
| "num_tokens": 27473650.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.28742106161461, |
| "grad_norm": 0.7142336964607239, |
| "learning_rate": 4.484743946839169e-06, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.8090627640485764, |
| "num_tokens": 27539146.0, |
| "step": 421 |
| }, |
| { |
| "epoch": 0.28810377197473974, |
| "grad_norm": 0.7614313364028931, |
| "learning_rate": 4.481112164248534e-06, |
| "loss": 0.6241, |
| "mean_token_accuracy": 0.8096743673086166, |
| "num_tokens": 27604682.0, |
| "step": 422 |
| }, |
| { |
| "epoch": 0.2887864823348694, |
| "grad_norm": 0.7186126708984375, |
| "learning_rate": 4.477469108721568e-06, |
| "loss": 0.6347, |
| "mean_token_accuracy": 0.8030977845191956, |
| "num_tokens": 27670067.0, |
| "step": 423 |
| }, |
| { |
| "epoch": 0.28946919269499916, |
| "grad_norm": 0.71300208568573, |
| "learning_rate": 4.473814800988009e-06, |
| "loss": 0.6057, |
| "mean_token_accuracy": 0.8111559152603149, |
| "num_tokens": 27735603.0, |
| "step": 424 |
| }, |
| { |
| "epoch": 0.29015190305512883, |
| "grad_norm": 0.7614585757255554, |
| "learning_rate": 4.4701492618416175e-06, |
| "loss": 0.6431, |
| "mean_token_accuracy": 0.8026790171861649, |
| "num_tokens": 27801139.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.29083461341525857, |
| "grad_norm": 0.8069416880607605, |
| "learning_rate": 4.466472512140069e-06, |
| "loss": 0.6811, |
| "mean_token_accuracy": 0.7914680987596512, |
| "num_tokens": 27866675.0, |
| "step": 426 |
| }, |
| { |
| "epoch": 0.2915173237753883, |
| "grad_norm": 0.7372321486473083, |
| "learning_rate": 4.4627845728048255e-06, |
| "loss": 0.6423, |
| "mean_token_accuracy": 0.8003201633691788, |
| "num_tokens": 27932051.0, |
| "step": 427 |
| }, |
| { |
| "epoch": 0.292200034135518, |
| "grad_norm": 0.7879287004470825, |
| "learning_rate": 4.459085464821024e-06, |
| "loss": 0.6453, |
| "mean_token_accuracy": 0.8002199530601501, |
| "num_tokens": 27997587.0, |
| "step": 428 |
| }, |
| { |
| "epoch": 0.2928827444956477, |
| "grad_norm": 0.7693133354187012, |
| "learning_rate": 4.455375209237346e-06, |
| "loss": 0.6208, |
| "mean_token_accuracy": 0.8095216304063797, |
| "num_tokens": 28063123.0, |
| "step": 429 |
| }, |
| { |
| "epoch": 0.29356545485577745, |
| "grad_norm": 0.7321552038192749, |
| "learning_rate": 4.451653827165915e-06, |
| "loss": 0.6313, |
| "mean_token_accuracy": 0.8039006143808365, |
| "num_tokens": 28128572.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.29424816521590713, |
| "grad_norm": 0.7376238107681274, |
| "learning_rate": 4.4479213397821585e-06, |
| "loss": 0.6408, |
| "mean_token_accuracy": 0.8035496175289154, |
| "num_tokens": 28194108.0, |
| "step": 431 |
| }, |
| { |
| "epoch": 0.29493087557603687, |
| "grad_norm": 0.7184596061706543, |
| "learning_rate": 4.4441777683247e-06, |
| "loss": 0.6243, |
| "mean_token_accuracy": 0.8056421130895615, |
| "num_tokens": 28259644.0, |
| "step": 432 |
| }, |
| { |
| "epoch": 0.2956135859361666, |
| "grad_norm": 0.7423664927482605, |
| "learning_rate": 4.440423134095232e-06, |
| "loss": 0.6035, |
| "mean_token_accuracy": 0.8111253678798676, |
| "num_tokens": 28325180.0, |
| "step": 433 |
| }, |
| { |
| "epoch": 0.2962962962962963, |
| "grad_norm": 0.7306990027427673, |
| "learning_rate": 4.436657458458396e-06, |
| "loss": 0.65, |
| "mean_token_accuracy": 0.8002391308546066, |
| "num_tokens": 28390585.0, |
| "step": 434 |
| }, |
| { |
| "epoch": 0.296979006656426, |
| "grad_norm": 0.7384706139564514, |
| "learning_rate": 4.432880762841665e-06, |
| "loss": 0.602, |
| "mean_token_accuracy": 0.8149714320898056, |
| "num_tokens": 28455904.0, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.29766171701655575, |
| "grad_norm": 0.7444391250610352, |
| "learning_rate": 4.4290930687352105e-06, |
| "loss": 0.6271, |
| "mean_token_accuracy": 0.8050769865512848, |
| "num_tokens": 28521440.0, |
| "step": 436 |
| }, |
| { |
| "epoch": 0.29834442737668543, |
| "grad_norm": 0.7371619939804077, |
| "learning_rate": 4.425294397691796e-06, |
| "loss": 0.6099, |
| "mean_token_accuracy": 0.8098118305206299, |
| "num_tokens": 28586976.0, |
| "step": 437 |
| }, |
| { |
| "epoch": 0.29902713773681516, |
| "grad_norm": 0.7588141560554504, |
| "learning_rate": 4.42148477132664e-06, |
| "loss": 0.6119, |
| "mean_token_accuracy": 0.810527041554451, |
| "num_tokens": 28652489.0, |
| "step": 438 |
| }, |
| { |
| "epoch": 0.2997098480969449, |
| "grad_norm": 0.774115264415741, |
| "learning_rate": 4.417664211317304e-06, |
| "loss": 0.627, |
| "mean_token_accuracy": 0.8073985874652863, |
| "num_tokens": 28718025.0, |
| "step": 439 |
| }, |
| { |
| "epoch": 0.3003925584570746, |
| "grad_norm": 0.7610304951667786, |
| "learning_rate": 4.413832739403558e-06, |
| "loss": 0.6646, |
| "mean_token_accuracy": 0.798673689365387, |
| "num_tokens": 28783437.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.3010752688172043, |
| "grad_norm": 0.7736408114433289, |
| "learning_rate": 4.40999037738727e-06, |
| "loss": 0.6713, |
| "mean_token_accuracy": 0.794889435172081, |
| "num_tokens": 28848973.0, |
| "step": 441 |
| }, |
| { |
| "epoch": 0.301757979177334, |
| "grad_norm": 0.7538256049156189, |
| "learning_rate": 4.406137147132268e-06, |
| "loss": 0.601, |
| "mean_token_accuracy": 0.8116599470376968, |
| "num_tokens": 28914509.0, |
| "step": 442 |
| }, |
| { |
| "epoch": 0.3024406895374637, |
| "grad_norm": 0.7843046188354492, |
| "learning_rate": 4.402273070564228e-06, |
| "loss": 0.6335, |
| "mean_token_accuracy": 0.8031372278928757, |
| "num_tokens": 28980045.0, |
| "step": 443 |
| }, |
| { |
| "epoch": 0.30312339989759346, |
| "grad_norm": 0.7894677519798279, |
| "learning_rate": 4.3983981696705415e-06, |
| "loss": 0.6715, |
| "mean_token_accuracy": 0.7911015450954437, |
| "num_tokens": 29045581.0, |
| "step": 444 |
| }, |
| { |
| "epoch": 0.30380611025772314, |
| "grad_norm": 0.7658438682556152, |
| "learning_rate": 4.3945124665001926e-06, |
| "loss": 0.6393, |
| "mean_token_accuracy": 0.8040514290332794, |
| "num_tokens": 29110647.0, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.3044888206178529, |
| "grad_norm": 0.7538372278213501, |
| "learning_rate": 4.390615983163633e-06, |
| "loss": 0.639, |
| "mean_token_accuracy": 0.8023277074098587, |
| "num_tokens": 29176183.0, |
| "step": 446 |
| }, |
| { |
| "epoch": 0.3051715309779826, |
| "grad_norm": 0.7676253914833069, |
| "learning_rate": 4.386708741832655e-06, |
| "loss": 0.6047, |
| "mean_token_accuracy": 0.8126374632120132, |
| "num_tokens": 29241719.0, |
| "step": 447 |
| }, |
| { |
| "epoch": 0.3058542413381123, |
| "grad_norm": 0.8105066418647766, |
| "learning_rate": 4.382790764740267e-06, |
| "loss": 0.6009, |
| "mean_token_accuracy": 0.8113532811403275, |
| "num_tokens": 29306858.0, |
| "step": 448 |
| }, |
| { |
| "epoch": 0.306536951698242, |
| "grad_norm": 0.7600321769714355, |
| "learning_rate": 4.378862074180566e-06, |
| "loss": 0.5914, |
| "mean_token_accuracy": 0.814546674489975, |
| "num_tokens": 29372394.0, |
| "step": 449 |
| }, |
| { |
| "epoch": 0.30721966205837176, |
| "grad_norm": 0.7705000638961792, |
| "learning_rate": 4.374922692508611e-06, |
| "loss": 0.6188, |
| "mean_token_accuracy": 0.8112035095691681, |
| "num_tokens": 29437883.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.30790237241850144, |
| "grad_norm": 0.780503511428833, |
| "learning_rate": 4.370972642140294e-06, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.8074749559164047, |
| "num_tokens": 29503419.0, |
| "step": 451 |
| }, |
| { |
| "epoch": 0.30858508277863117, |
| "grad_norm": 0.7811777591705322, |
| "learning_rate": 4.367011945552217e-06, |
| "loss": 0.6088, |
| "mean_token_accuracy": 0.8096781969070435, |
| "num_tokens": 29568784.0, |
| "step": 452 |
| }, |
| { |
| "epoch": 0.3092677931387609, |
| "grad_norm": 0.8117642402648926, |
| "learning_rate": 4.363040625281557e-06, |
| "loss": 0.6262, |
| "mean_token_accuracy": 0.8069403767585754, |
| "num_tokens": 29634320.0, |
| "step": 453 |
| }, |
| { |
| "epoch": 0.3099505034988906, |
| "grad_norm": 0.7232481241226196, |
| "learning_rate": 4.359058703925947e-06, |
| "loss": 0.6107, |
| "mean_token_accuracy": 0.8092174828052521, |
| "num_tokens": 29699343.0, |
| "step": 454 |
| }, |
| { |
| "epoch": 0.3106332138590203, |
| "grad_norm": 0.7087581157684326, |
| "learning_rate": 4.355066204143338e-06, |
| "loss": 0.5939, |
| "mean_token_accuracy": 0.8157000690698624, |
| "num_tokens": 29764058.0, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.31131592421915005, |
| "grad_norm": 0.8165489435195923, |
| "learning_rate": 4.351063148651878e-06, |
| "loss": 0.6408, |
| "mean_token_accuracy": 0.8040536493062973, |
| "num_tokens": 29829594.0, |
| "step": 456 |
| }, |
| { |
| "epoch": 0.31199863457927973, |
| "grad_norm": 0.7757338881492615, |
| "learning_rate": 4.347049560229776e-06, |
| "loss": 0.6387, |
| "mean_token_accuracy": 0.8039254248142242, |
| "num_tokens": 29894568.0, |
| "step": 457 |
| }, |
| { |
| "epoch": 0.31268134493940947, |
| "grad_norm": 0.7120219469070435, |
| "learning_rate": 4.343025461715177e-06, |
| "loss": 0.6376, |
| "mean_token_accuracy": 0.8034885227680206, |
| "num_tokens": 29960104.0, |
| "step": 458 |
| }, |
| { |
| "epoch": 0.31336405529953915, |
| "grad_norm": 0.7201838493347168, |
| "learning_rate": 4.338990876006031e-06, |
| "loss": 0.614, |
| "mean_token_accuracy": 0.8114766627550125, |
| "num_tokens": 30025640.0, |
| "step": 459 |
| }, |
| { |
| "epoch": 0.3140467656596689, |
| "grad_norm": 0.8032012581825256, |
| "learning_rate": 4.334945826059961e-06, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8113402128219604, |
| "num_tokens": 30090352.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.3147294760197986, |
| "grad_norm": 0.7260733842849731, |
| "learning_rate": 4.330890334894136e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8130651265382767, |
| "num_tokens": 30155888.0, |
| "step": 461 |
| }, |
| { |
| "epoch": 0.3154121863799283, |
| "grad_norm": 0.6967595219612122, |
| "learning_rate": 4.326824425585135e-06, |
| "loss": 0.6017, |
| "mean_token_accuracy": 0.8125849515199661, |
| "num_tokens": 30221258.0, |
| "step": 462 |
| }, |
| { |
| "epoch": 0.31609489674005803, |
| "grad_norm": 0.7427617907524109, |
| "learning_rate": 4.322748121268821e-06, |
| "loss": 0.5934, |
| "mean_token_accuracy": 0.8141037374734879, |
| "num_tokens": 30286794.0, |
| "step": 463 |
| }, |
| { |
| "epoch": 0.31677760710018776, |
| "grad_norm": 0.7407168745994568, |
| "learning_rate": 4.318661445140202e-06, |
| "loss": 0.6337, |
| "mean_token_accuracy": 0.8028928488492966, |
| "num_tokens": 30352330.0, |
| "step": 464 |
| }, |
| { |
| "epoch": 0.31746031746031744, |
| "grad_norm": 0.714897096157074, |
| "learning_rate": 4.314564420453311e-06, |
| "loss": 0.5786, |
| "mean_token_accuracy": 0.817768931388855, |
| "num_tokens": 30417646.0, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.3181430278204472, |
| "grad_norm": 0.7537848353385925, |
| "learning_rate": 4.31045707052106e-06, |
| "loss": 0.6363, |
| "mean_token_accuracy": 0.8019506186246872, |
| "num_tokens": 30482883.0, |
| "step": 466 |
| }, |
| { |
| "epoch": 0.3188257381805769, |
| "grad_norm": 0.7029747366905212, |
| "learning_rate": 4.306339418715117e-06, |
| "loss": 0.5894, |
| "mean_token_accuracy": 0.8145619481801987, |
| "num_tokens": 30548127.0, |
| "step": 467 |
| }, |
| { |
| "epoch": 0.3195084485407066, |
| "grad_norm": 0.710258424282074, |
| "learning_rate": 4.302211488465769e-06, |
| "loss": 0.6181, |
| "mean_token_accuracy": 0.8077957034111023, |
| "num_tokens": 30613663.0, |
| "step": 468 |
| }, |
| { |
| "epoch": 0.3201911589008363, |
| "grad_norm": 0.7728216052055359, |
| "learning_rate": 4.298073303261791e-06, |
| "loss": 0.6565, |
| "mean_token_accuracy": 0.7991966009140015, |
| "num_tokens": 30679199.0, |
| "step": 469 |
| }, |
| { |
| "epoch": 0.32087386926096606, |
| "grad_norm": 0.7947602868080139, |
| "learning_rate": 4.293924886650306e-06, |
| "loss": 0.6152, |
| "mean_token_accuracy": 0.8107740730047226, |
| "num_tokens": 30744735.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.32155657962109574, |
| "grad_norm": 0.7346971035003662, |
| "learning_rate": 4.289766262236661e-06, |
| "loss": 0.6387, |
| "mean_token_accuracy": 0.8023582696914673, |
| "num_tokens": 30810271.0, |
| "step": 471 |
| }, |
| { |
| "epoch": 0.3222392899812255, |
| "grad_norm": 0.7462263107299805, |
| "learning_rate": 4.285597453684286e-06, |
| "loss": 0.6056, |
| "mean_token_accuracy": 0.8118889033794403, |
| "num_tokens": 30875569.0, |
| "step": 472 |
| }, |
| { |
| "epoch": 0.32292200034135515, |
| "grad_norm": 0.7579359412193298, |
| "learning_rate": 4.2814184847145595e-06, |
| "loss": 0.628, |
| "mean_token_accuracy": 0.805214449763298, |
| "num_tokens": 30941105.0, |
| "step": 473 |
| }, |
| { |
| "epoch": 0.3236047107014849, |
| "grad_norm": 0.7670093178749084, |
| "learning_rate": 4.2772293791066764e-06, |
| "loss": 0.639, |
| "mean_token_accuracy": 0.8046798706054688, |
| "num_tokens": 31006641.0, |
| "step": 474 |
| }, |
| { |
| "epoch": 0.3242874210616146, |
| "grad_norm": 0.8125339150428772, |
| "learning_rate": 4.27303016069751e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8125305473804474, |
| "num_tokens": 31072177.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.3249701314217443, |
| "grad_norm": 0.7287182807922363, |
| "learning_rate": 4.26882085338148e-06, |
| "loss": 0.5951, |
| "mean_token_accuracy": 0.8134927898645401, |
| "num_tokens": 31137713.0, |
| "step": 476 |
| }, |
| { |
| "epoch": 0.32565284178187404, |
| "grad_norm": 0.764122486114502, |
| "learning_rate": 4.2646014811104095e-06, |
| "loss": 0.6298, |
| "mean_token_accuracy": 0.8057503998279572, |
| "num_tokens": 31202855.0, |
| "step": 477 |
| }, |
| { |
| "epoch": 0.32633555214200377, |
| "grad_norm": 0.7873974442481995, |
| "learning_rate": 4.2603720678933965e-06, |
| "loss": 0.6214, |
| "mean_token_accuracy": 0.8090328723192215, |
| "num_tokens": 31268391.0, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.32701826250213345, |
| "grad_norm": 0.7522021532058716, |
| "learning_rate": 4.2561326377966755e-06, |
| "loss": 0.6279, |
| "mean_token_accuracy": 0.8060545027256012, |
| "num_tokens": 31333927.0, |
| "step": 479 |
| }, |
| { |
| "epoch": 0.3277009728622632, |
| "grad_norm": 0.7264294624328613, |
| "learning_rate": 4.2518832149434755e-06, |
| "loss": 0.637, |
| "mean_token_accuracy": 0.8003726750612259, |
| "num_tokens": 31399463.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.3283836832223929, |
| "grad_norm": 0.7738657593727112, |
| "learning_rate": 4.247623823513888e-06, |
| "loss": 0.6039, |
| "mean_token_accuracy": 0.8129608482122421, |
| "num_tokens": 31464886.0, |
| "step": 481 |
| }, |
| { |
| "epoch": 0.3290663935825226, |
| "grad_norm": 0.7558001279830933, |
| "learning_rate": 4.243354487744727e-06, |
| "loss": 0.6228, |
| "mean_token_accuracy": 0.80671127140522, |
| "num_tokens": 31530422.0, |
| "step": 482 |
| }, |
| { |
| "epoch": 0.32974910394265233, |
| "grad_norm": 0.758977472782135, |
| "learning_rate": 4.239075231929394e-06, |
| "loss": 0.6334, |
| "mean_token_accuracy": 0.8040841966867447, |
| "num_tokens": 31595958.0, |
| "step": 483 |
| }, |
| { |
| "epoch": 0.33043181430278207, |
| "grad_norm": 0.7687391042709351, |
| "learning_rate": 4.234786080417735e-06, |
| "loss": 0.6465, |
| "mean_token_accuracy": 0.8003726899623871, |
| "num_tokens": 31661494.0, |
| "step": 484 |
| }, |
| { |
| "epoch": 0.33111452466291175, |
| "grad_norm": 0.7958798408508301, |
| "learning_rate": 4.230487057615906e-06, |
| "loss": 0.6701, |
| "mean_token_accuracy": 0.7937389463186264, |
| "num_tokens": 31726959.0, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.3317972350230415, |
| "grad_norm": 0.812972903251648, |
| "learning_rate": 4.226178187986233e-06, |
| "loss": 0.6752, |
| "mean_token_accuracy": 0.7940188199281693, |
| "num_tokens": 31792495.0, |
| "step": 486 |
| }, |
| { |
| "epoch": 0.3324799453831712, |
| "grad_norm": 0.7672164440155029, |
| "learning_rate": 4.221859496047072e-06, |
| "loss": 0.6325, |
| "mean_token_accuracy": 0.8042980283498764, |
| "num_tokens": 31858031.0, |
| "step": 487 |
| }, |
| { |
| "epoch": 0.3331626557433009, |
| "grad_norm": 0.7787556052207947, |
| "learning_rate": 4.217531006372668e-06, |
| "loss": 0.6196, |
| "mean_token_accuracy": 0.8083626925945282, |
| "num_tokens": 31923483.0, |
| "step": 488 |
| }, |
| { |
| "epoch": 0.33384536610343063, |
| "grad_norm": 0.7686751484870911, |
| "learning_rate": 4.21319274359302e-06, |
| "loss": 0.6394, |
| "mean_token_accuracy": 0.803551197052002, |
| "num_tokens": 31989015.0, |
| "step": 489 |
| }, |
| { |
| "epoch": 0.3345280764635603, |
| "grad_norm": 0.7874401211738586, |
| "learning_rate": 4.208844732393738e-06, |
| "loss": 0.5972, |
| "mean_token_accuracy": 0.8128054738044739, |
| "num_tokens": 32054551.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.33521078682369004, |
| "grad_norm": 0.7712216973304749, |
| "learning_rate": 4.2044869975159e-06, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.8059378415346146, |
| "num_tokens": 32119517.0, |
| "step": 491 |
| }, |
| { |
| "epoch": 0.3358934971838198, |
| "grad_norm": 0.7174788117408752, |
| "learning_rate": 4.200119563755915e-06, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.81402987241745, |
| "num_tokens": 32185039.0, |
| "step": 492 |
| }, |
| { |
| "epoch": 0.33657620754394946, |
| "grad_norm": 0.7734994292259216, |
| "learning_rate": 4.195742455965381e-06, |
| "loss": 0.6263, |
| "mean_token_accuracy": 0.8075665980577469, |
| "num_tokens": 32250575.0, |
| "step": 493 |
| }, |
| { |
| "epoch": 0.3372589179040792, |
| "grad_norm": 0.776196300983429, |
| "learning_rate": 4.191355699050945e-06, |
| "loss": 0.6427, |
| "mean_token_accuracy": 0.8039925545454025, |
| "num_tokens": 32316111.0, |
| "step": 494 |
| }, |
| { |
| "epoch": 0.3379416282642089, |
| "grad_norm": 0.7637385129928589, |
| "learning_rate": 4.186959317974155e-06, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.8097383975982666, |
| "num_tokens": 32381556.0, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.3386243386243386, |
| "grad_norm": 0.7658755779266357, |
| "learning_rate": 4.182553337751326e-06, |
| "loss": 0.6456, |
| "mean_token_accuracy": 0.8000977635383606, |
| "num_tokens": 32447092.0, |
| "step": 496 |
| }, |
| { |
| "epoch": 0.33930704898446834, |
| "grad_norm": 0.7636200189590454, |
| "learning_rate": 4.178137783453393e-06, |
| "loss": 0.6444, |
| "mean_token_accuracy": 0.7969666421413422, |
| "num_tokens": 32512628.0, |
| "step": 497 |
| }, |
| { |
| "epoch": 0.3399897593445981, |
| "grad_norm": 0.7598303556442261, |
| "learning_rate": 4.17371268020577e-06, |
| "loss": 0.6172, |
| "mean_token_accuracy": 0.8083913624286652, |
| "num_tokens": 32578164.0, |
| "step": 498 |
| }, |
| { |
| "epoch": 0.34067246970472775, |
| "grad_norm": 0.7488183379173279, |
| "learning_rate": 4.169278053188206e-06, |
| "loss": 0.6004, |
| "mean_token_accuracy": 0.8118432313203812, |
| "num_tokens": 32643700.0, |
| "step": 499 |
| }, |
| { |
| "epoch": 0.3413551800648575, |
| "grad_norm": 0.7670736312866211, |
| "learning_rate": 4.164833927634641e-06, |
| "loss": 0.6166, |
| "mean_token_accuracy": 0.809506356716156, |
| "num_tokens": 32709236.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3420378904249872, |
| "grad_norm": 0.7257670760154724, |
| "learning_rate": 4.160380328833066e-06, |
| "loss": 0.5888, |
| "mean_token_accuracy": 0.8163184225559235, |
| "num_tokens": 32774772.0, |
| "step": 501 |
| }, |
| { |
| "epoch": 0.3427206007851169, |
| "grad_norm": 0.7861335873603821, |
| "learning_rate": 4.155917282125376e-06, |
| "loss": 0.6194, |
| "mean_token_accuracy": 0.8079789876937866, |
| "num_tokens": 32840308.0, |
| "step": 502 |
| }, |
| { |
| "epoch": 0.34340331114524664, |
| "grad_norm": 0.8057557344436646, |
| "learning_rate": 4.151444812907226e-06, |
| "loss": 0.5966, |
| "mean_token_accuracy": 0.8141276389360428, |
| "num_tokens": 32905558.0, |
| "step": 503 |
| }, |
| { |
| "epoch": 0.34408602150537637, |
| "grad_norm": 0.7587814331054688, |
| "learning_rate": 4.146962946627886e-06, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.8071389347314835, |
| "num_tokens": 32971094.0, |
| "step": 504 |
| }, |
| { |
| "epoch": 0.34476873186550605, |
| "grad_norm": 0.7340732216835022, |
| "learning_rate": 4.1424717087901005e-06, |
| "loss": 0.6343, |
| "mean_token_accuracy": 0.8036870807409286, |
| "num_tokens": 33036630.0, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.3454514422256358, |
| "grad_norm": 0.7753547430038452, |
| "learning_rate": 4.1379711249499355e-06, |
| "loss": 0.6344, |
| "mean_token_accuracy": 0.8057337552309036, |
| "num_tokens": 33102166.0, |
| "step": 506 |
| }, |
| { |
| "epoch": 0.34613415258576546, |
| "grad_norm": 0.7536847591400146, |
| "learning_rate": 4.133461220716642e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.8107985109090805, |
| "num_tokens": 33167684.0, |
| "step": 507 |
| }, |
| { |
| "epoch": 0.3468168629458952, |
| "grad_norm": 0.7741045951843262, |
| "learning_rate": 4.1289420217525035e-06, |
| "loss": 0.6165, |
| "mean_token_accuracy": 0.808781310915947, |
| "num_tokens": 33232862.0, |
| "step": 508 |
| }, |
| { |
| "epoch": 0.34749957330602493, |
| "grad_norm": 0.7509851455688477, |
| "learning_rate": 4.124413553772693e-06, |
| "loss": 0.6507, |
| "mean_token_accuracy": 0.8005150854587555, |
| "num_tokens": 33298347.0, |
| "step": 509 |
| }, |
| { |
| "epoch": 0.3481822836661546, |
| "grad_norm": 0.741675078868866, |
| "learning_rate": 4.119875842545127e-06, |
| "loss": 0.5784, |
| "mean_token_accuracy": 0.8181970864534378, |
| "num_tokens": 33363883.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.34886499402628435, |
| "grad_norm": 0.7717655301094055, |
| "learning_rate": 4.115328913890317e-06, |
| "loss": 0.6133, |
| "mean_token_accuracy": 0.8071389347314835, |
| "num_tokens": 33429419.0, |
| "step": 511 |
| }, |
| { |
| "epoch": 0.3495477043864141, |
| "grad_norm": 0.7376033067703247, |
| "learning_rate": 4.1107727936812216e-06, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.8058039098978043, |
| "num_tokens": 33494724.0, |
| "step": 512 |
| }, |
| { |
| "epoch": 0.35023041474654376, |
| "grad_norm": 0.7243679761886597, |
| "learning_rate": 4.106207507843106e-06, |
| "loss": 0.5884, |
| "mean_token_accuracy": 0.815142348408699, |
| "num_tokens": 33560260.0, |
| "step": 513 |
| }, |
| { |
| "epoch": 0.3509131251066735, |
| "grad_norm": 0.7643143534660339, |
| "learning_rate": 4.1016330823533866e-06, |
| "loss": 0.5928, |
| "mean_token_accuracy": 0.8167460858821869, |
| "num_tokens": 33625796.0, |
| "step": 514 |
| }, |
| { |
| "epoch": 0.35159583546680323, |
| "grad_norm": 0.7645459771156311, |
| "learning_rate": 4.0970495432414854e-06, |
| "loss": 0.589, |
| "mean_token_accuracy": 0.8166697174310684, |
| "num_tokens": 33691332.0, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.3522785458269329, |
| "grad_norm": 0.7253304123878479, |
| "learning_rate": 4.092456916588686e-06, |
| "loss": 0.6383, |
| "mean_token_accuracy": 0.8029126226902008, |
| "num_tokens": 33756568.0, |
| "step": 516 |
| }, |
| { |
| "epoch": 0.35296125618706264, |
| "grad_norm": 0.744203507900238, |
| "learning_rate": 4.0878552285279794e-06, |
| "loss": 0.616, |
| "mean_token_accuracy": 0.8080986589193344, |
| "num_tokens": 33821830.0, |
| "step": 517 |
| }, |
| { |
| "epoch": 0.3536439665471924, |
| "grad_norm": 0.7576060891151428, |
| "learning_rate": 4.083244505243918e-06, |
| "loss": 0.601, |
| "mean_token_accuracy": 0.8153256475925446, |
| "num_tokens": 33887366.0, |
| "step": 518 |
| }, |
| { |
| "epoch": 0.35432667690732206, |
| "grad_norm": 0.7077663540840149, |
| "learning_rate": 4.078624772972467e-06, |
| "loss": 0.5658, |
| "mean_token_accuracy": 0.8255590051412582, |
| "num_tokens": 33952902.0, |
| "step": 519 |
| }, |
| { |
| "epoch": 0.3550093872674518, |
| "grad_norm": 0.7638600468635559, |
| "learning_rate": 4.0739960580008565e-06, |
| "loss": 0.6068, |
| "mean_token_accuracy": 0.8101020306348801, |
| "num_tokens": 34018438.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.35569209762758147, |
| "grad_norm": 0.7264181971549988, |
| "learning_rate": 4.0693583866674255e-06, |
| "loss": 0.6234, |
| "mean_token_accuracy": 0.8054409176111221, |
| "num_tokens": 34083345.0, |
| "step": 521 |
| }, |
| { |
| "epoch": 0.3563748079877112, |
| "grad_norm": 0.7131197452545166, |
| "learning_rate": 4.0647117853614824e-06, |
| "loss": 0.5843, |
| "mean_token_accuracy": 0.8186552971601486, |
| "num_tokens": 34148881.0, |
| "step": 522 |
| }, |
| { |
| "epoch": 0.35705751834784094, |
| "grad_norm": 0.7427366375923157, |
| "learning_rate": 4.060056280523144e-06, |
| "loss": 0.638, |
| "mean_token_accuracy": 0.8032098114490509, |
| "num_tokens": 34213574.0, |
| "step": 523 |
| }, |
| { |
| "epoch": 0.3577402287079706, |
| "grad_norm": 0.7695744037628174, |
| "learning_rate": 4.0553918986431904e-06, |
| "loss": 0.6793, |
| "mean_token_accuracy": 0.7921401709318161, |
| "num_tokens": 34279110.0, |
| "step": 524 |
| }, |
| { |
| "epoch": 0.35842293906810035, |
| "grad_norm": 0.7793335318565369, |
| "learning_rate": 4.050718666262919e-06, |
| "loss": 0.6295, |
| "mean_token_accuracy": 0.8063169717788696, |
| "num_tokens": 34343656.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.3591056494282301, |
| "grad_norm": 0.708317220211029, |
| "learning_rate": 4.046036609973982e-06, |
| "loss": 0.6001, |
| "mean_token_accuracy": 0.8142264485359192, |
| "num_tokens": 34409018.0, |
| "step": 526 |
| }, |
| { |
| "epoch": 0.35978835978835977, |
| "grad_norm": 0.7289883494377136, |
| "learning_rate": 4.0413457564182455e-06, |
| "loss": 0.627, |
| "mean_token_accuracy": 0.8050616979598999, |
| "num_tokens": 34474554.0, |
| "step": 527 |
| }, |
| { |
| "epoch": 0.3604710701484895, |
| "grad_norm": 0.7371439933776855, |
| "learning_rate": 4.036646132287632e-06, |
| "loss": 0.6438, |
| "mean_token_accuracy": 0.8033890724182129, |
| "num_tokens": 34540077.0, |
| "step": 528 |
| }, |
| { |
| "epoch": 0.36115378050861924, |
| "grad_norm": 0.7393643260002136, |
| "learning_rate": 4.03193776432397e-06, |
| "loss": 0.5997, |
| "mean_token_accuracy": 0.8115717619657516, |
| "num_tokens": 34605589.0, |
| "step": 529 |
| }, |
| { |
| "epoch": 0.3618364908687489, |
| "grad_norm": 0.7248691916465759, |
| "learning_rate": 4.027220679318846e-06, |
| "loss": 0.6102, |
| "mean_token_accuracy": 0.8101173043251038, |
| "num_tokens": 34671125.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.36251920122887865, |
| "grad_norm": 0.7262829542160034, |
| "learning_rate": 4.0224949041134425e-06, |
| "loss": 0.5782, |
| "mean_token_accuracy": 0.8194034397602081, |
| "num_tokens": 34736394.0, |
| "step": 531 |
| }, |
| { |
| "epoch": 0.3632019115890084, |
| "grad_norm": 0.7357496619224548, |
| "learning_rate": 4.017760465598395e-06, |
| "loss": 0.6161, |
| "mean_token_accuracy": 0.8090364784002304, |
| "num_tokens": 34801868.0, |
| "step": 532 |
| }, |
| { |
| "epoch": 0.36388462194913807, |
| "grad_norm": 0.7482576966285706, |
| "learning_rate": 4.013017390713635e-06, |
| "loss": 0.6069, |
| "mean_token_accuracy": 0.8091856092214584, |
| "num_tokens": 34867404.0, |
| "step": 533 |
| }, |
| { |
| "epoch": 0.3645673323092678, |
| "grad_norm": 0.7310959100723267, |
| "learning_rate": 4.008265706448234e-06, |
| "loss": 0.6071, |
| "mean_token_accuracy": 0.8111569434404373, |
| "num_tokens": 34932885.0, |
| "step": 534 |
| }, |
| { |
| "epoch": 0.36525004266939753, |
| "grad_norm": 0.7276792526245117, |
| "learning_rate": 4.003505439840255e-06, |
| "loss": 0.6192, |
| "mean_token_accuracy": 0.8077193349599838, |
| "num_tokens": 34998421.0, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.3659327530295272, |
| "grad_norm": 0.7438013553619385, |
| "learning_rate": 3.998736617976596e-06, |
| "loss": 0.6409, |
| "mean_token_accuracy": 0.802770659327507, |
| "num_tokens": 35063957.0, |
| "step": 536 |
| }, |
| { |
| "epoch": 0.36661546338965695, |
| "grad_norm": 0.763702392578125, |
| "learning_rate": 3.993959267992835e-06, |
| "loss": 0.5908, |
| "mean_token_accuracy": 0.8138718605041504, |
| "num_tokens": 35129311.0, |
| "step": 537 |
| }, |
| { |
| "epoch": 0.3672981737497866, |
| "grad_norm": 0.757030189037323, |
| "learning_rate": 3.989173417073078e-06, |
| "loss": 0.6427, |
| "mean_token_accuracy": 0.799578458070755, |
| "num_tokens": 35194847.0, |
| "step": 538 |
| }, |
| { |
| "epoch": 0.36798088410991636, |
| "grad_norm": 0.7634031176567078, |
| "learning_rate": 3.984379092449804e-06, |
| "loss": 0.643, |
| "mean_token_accuracy": 0.8004353195428848, |
| "num_tokens": 35260042.0, |
| "step": 539 |
| }, |
| { |
| "epoch": 0.3686635944700461, |
| "grad_norm": 0.7350435853004456, |
| "learning_rate": 3.979576321403705e-06, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.8084860742092133, |
| "num_tokens": 35325490.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.3693463048301758, |
| "grad_norm": 0.7435086369514465, |
| "learning_rate": 3.974765131263539e-06, |
| "loss": 0.6271, |
| "mean_token_accuracy": 0.8046798706054688, |
| "num_tokens": 35391026.0, |
| "step": 541 |
| }, |
| { |
| "epoch": 0.3700290151903055, |
| "grad_norm": 0.8303893804550171, |
| "learning_rate": 3.96994554940597e-06, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8043476492166519, |
| "num_tokens": 35456464.0, |
| "step": 542 |
| }, |
| { |
| "epoch": 0.37071172555043524, |
| "grad_norm": 0.7540069818496704, |
| "learning_rate": 3.965117603255411e-06, |
| "loss": 0.6357, |
| "mean_token_accuracy": 0.8061919659376144, |
| "num_tokens": 35522000.0, |
| "step": 543 |
| }, |
| { |
| "epoch": 0.3713944359105649, |
| "grad_norm": 0.7430548667907715, |
| "learning_rate": 3.960281320283869e-06, |
| "loss": 0.6124, |
| "mean_token_accuracy": 0.8097812831401825, |
| "num_tokens": 35587536.0, |
| "step": 544 |
| }, |
| { |
| "epoch": 0.37207714627069466, |
| "grad_norm": 0.7310859560966492, |
| "learning_rate": 3.955436728010792e-06, |
| "loss": 0.6242, |
| "mean_token_accuracy": 0.8084578067064285, |
| "num_tokens": 35653036.0, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.3727598566308244, |
| "grad_norm": 0.7654127478599548, |
| "learning_rate": 3.950583854002906e-06, |
| "loss": 0.6393, |
| "mean_token_accuracy": 0.8009989112615585, |
| "num_tokens": 35718572.0, |
| "step": 546 |
| }, |
| { |
| "epoch": 0.3734425669909541, |
| "grad_norm": 0.7217867374420166, |
| "learning_rate": 3.945722725874066e-06, |
| "loss": 0.6247, |
| "mean_token_accuracy": 0.8056681156158447, |
| "num_tokens": 35784039.0, |
| "step": 547 |
| }, |
| { |
| "epoch": 0.3741252773510838, |
| "grad_norm": 0.7655815482139587, |
| "learning_rate": 3.940853371285092e-06, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8104991465806961, |
| "num_tokens": 35849575.0, |
| "step": 548 |
| }, |
| { |
| "epoch": 0.37480798771121354, |
| "grad_norm": 0.7061954140663147, |
| "learning_rate": 3.935975817943613e-06, |
| "loss": 0.5749, |
| "mean_token_accuracy": 0.8176930546760559, |
| "num_tokens": 35915111.0, |
| "step": 549 |
| }, |
| { |
| "epoch": 0.3754906980713432, |
| "grad_norm": 0.7559870481491089, |
| "learning_rate": 3.931090093603916e-06, |
| "loss": 0.634, |
| "mean_token_accuracy": 0.8024040907621384, |
| "num_tokens": 35980647.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.37617340843147296, |
| "grad_norm": 0.7588599920272827, |
| "learning_rate": 3.9261962260667744e-06, |
| "loss": 0.6144, |
| "mean_token_accuracy": 0.8079178929328918, |
| "num_tokens": 36046183.0, |
| "step": 551 |
| }, |
| { |
| "epoch": 0.3768561187916027, |
| "grad_norm": 0.7603986263275146, |
| "learning_rate": 3.9212942431793055e-06, |
| "loss": 0.6403, |
| "mean_token_accuracy": 0.8029824495315552, |
| "num_tokens": 36110803.0, |
| "step": 552 |
| }, |
| { |
| "epoch": 0.37753882915173237, |
| "grad_norm": 0.7612850666046143, |
| "learning_rate": 3.916384172834802e-06, |
| "loss": 0.6011, |
| "mean_token_accuracy": 0.8126069158315659, |
| "num_tokens": 36176339.0, |
| "step": 553 |
| }, |
| { |
| "epoch": 0.3782215395118621, |
| "grad_norm": 0.7219383716583252, |
| "learning_rate": 3.911466042972573e-06, |
| "loss": 0.5851, |
| "mean_token_accuracy": 0.8179527074098587, |
| "num_tokens": 36241875.0, |
| "step": 554 |
| }, |
| { |
| "epoch": 0.3789042498719918, |
| "grad_norm": 0.7435865998268127, |
| "learning_rate": 3.906539881577793e-06, |
| "loss": 0.6009, |
| "mean_token_accuracy": 0.8127596527338028, |
| "num_tokens": 36307411.0, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.3795869602321215, |
| "grad_norm": 0.708329975605011, |
| "learning_rate": 3.9016057166813355e-06, |
| "loss": 0.6139, |
| "mean_token_accuracy": 0.8103158622980118, |
| "num_tokens": 36372947.0, |
| "step": 556 |
| }, |
| { |
| "epoch": 0.38026967059225125, |
| "grad_norm": 0.7403861880302429, |
| "learning_rate": 3.896663576359614e-06, |
| "loss": 0.6063, |
| "mean_token_accuracy": 0.8107587993144989, |
| "num_tokens": 36438483.0, |
| "step": 557 |
| }, |
| { |
| "epoch": 0.38095238095238093, |
| "grad_norm": 0.7762757539749146, |
| "learning_rate": 3.8917134887344235e-06, |
| "loss": 0.6399, |
| "mean_token_accuracy": 0.8010013103485107, |
| "num_tokens": 36503880.0, |
| "step": 558 |
| }, |
| { |
| "epoch": 0.38163509131251067, |
| "grad_norm": 0.8195488452911377, |
| "learning_rate": 3.8867554819727855e-06, |
| "loss": 0.6557, |
| "mean_token_accuracy": 0.7968902885913849, |
| "num_tokens": 36569416.0, |
| "step": 559 |
| }, |
| { |
| "epoch": 0.3823178016726404, |
| "grad_norm": 0.7722012996673584, |
| "learning_rate": 3.881789584286778e-06, |
| "loss": 0.6515, |
| "mean_token_accuracy": 0.7978678047657013, |
| "num_tokens": 36634952.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.3830005120327701, |
| "grad_norm": 0.7423476576805115, |
| "learning_rate": 3.876815823933382e-06, |
| "loss": 0.5918, |
| "mean_token_accuracy": 0.814546674489975, |
| "num_tokens": 36700488.0, |
| "step": 561 |
| }, |
| { |
| "epoch": 0.3836832223928998, |
| "grad_norm": 0.7171317934989929, |
| "learning_rate": 3.87183422921432e-06, |
| "loss": 0.6324, |
| "mean_token_accuracy": 0.8058253973722458, |
| "num_tokens": 36766024.0, |
| "step": 562 |
| }, |
| { |
| "epoch": 0.38436593275302955, |
| "grad_norm": 0.7465117573738098, |
| "learning_rate": 3.866844828475889e-06, |
| "loss": 0.63, |
| "mean_token_accuracy": 0.8047417551279068, |
| "num_tokens": 36831180.0, |
| "step": 563 |
| }, |
| { |
| "epoch": 0.38504864311315923, |
| "grad_norm": 0.7372057437896729, |
| "learning_rate": 3.86184765010881e-06, |
| "loss": 0.6464, |
| "mean_token_accuracy": 0.8016995638608932, |
| "num_tokens": 36896700.0, |
| "step": 564 |
| }, |
| { |
| "epoch": 0.38573135347328896, |
| "grad_norm": 0.7480089068412781, |
| "learning_rate": 3.8568427225480556e-06, |
| "loss": 0.6215, |
| "mean_token_accuracy": 0.8073798418045044, |
| "num_tokens": 36961476.0, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.3864140638334187, |
| "grad_norm": 0.7431397438049316, |
| "learning_rate": 3.851830074272697e-06, |
| "loss": 0.6476, |
| "mean_token_accuracy": 0.8006934374570847, |
| "num_tokens": 37027012.0, |
| "step": 566 |
| }, |
| { |
| "epoch": 0.3870967741935484, |
| "grad_norm": 0.7774817943572998, |
| "learning_rate": 3.846809733805732e-06, |
| "loss": 0.665, |
| "mean_token_accuracy": 0.7960631251335144, |
| "num_tokens": 37091896.0, |
| "step": 567 |
| }, |
| { |
| "epoch": 0.3877794845536781, |
| "grad_norm": 0.7300996780395508, |
| "learning_rate": 3.841781729713935e-06, |
| "loss": 0.6205, |
| "mean_token_accuracy": 0.8074519038200378, |
| "num_tokens": 37157354.0, |
| "step": 568 |
| }, |
| { |
| "epoch": 0.3884621949138078, |
| "grad_norm": 0.7174612879753113, |
| "learning_rate": 3.836746090607683e-06, |
| "loss": 0.565, |
| "mean_token_accuracy": 0.8222929537296295, |
| "num_tokens": 37222875.0, |
| "step": 569 |
| }, |
| { |
| "epoch": 0.3891449052739375, |
| "grad_norm": 0.7718711495399475, |
| "learning_rate": 3.831702845140801e-06, |
| "loss": 0.6529, |
| "mean_token_accuracy": 0.7992597669363022, |
| "num_tokens": 37288236.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.38982761563406726, |
| "grad_norm": 0.7533946633338928, |
| "learning_rate": 3.826652022010396e-06, |
| "loss": 0.6283, |
| "mean_token_accuracy": 0.8052937835454941, |
| "num_tokens": 37353484.0, |
| "step": 571 |
| }, |
| { |
| "epoch": 0.39051032599419694, |
| "grad_norm": 0.7376362085342407, |
| "learning_rate": 3.821593649956688e-06, |
| "loss": 0.6261, |
| "mean_token_accuracy": 0.8082539141178131, |
| "num_tokens": 37419020.0, |
| "step": 572 |
| }, |
| { |
| "epoch": 0.3911930363543267, |
| "grad_norm": 0.7462616562843323, |
| "learning_rate": 3.81652775776286e-06, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8107587993144989, |
| "num_tokens": 37484556.0, |
| "step": 573 |
| }, |
| { |
| "epoch": 0.3918757467144564, |
| "grad_norm": 0.7472273707389832, |
| "learning_rate": 3.8114543742548817e-06, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.8062225133180618, |
| "num_tokens": 37550092.0, |
| "step": 574 |
| }, |
| { |
| "epoch": 0.3925584570745861, |
| "grad_norm": 0.7769070863723755, |
| "learning_rate": 3.8063735283013483e-06, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.8123604208230972, |
| "num_tokens": 37615338.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.3932411674347158, |
| "grad_norm": 0.7517635226249695, |
| "learning_rate": 3.8012852488133212e-06, |
| "loss": 0.5952, |
| "mean_token_accuracy": 0.816242054104805, |
| "num_tokens": 37680874.0, |
| "step": 576 |
| }, |
| { |
| "epoch": 0.39392387779484556, |
| "grad_norm": 0.7434868216514587, |
| "learning_rate": 3.7961895647441595e-06, |
| "loss": 0.5827, |
| "mean_token_accuracy": 0.8170668333768845, |
| "num_tokens": 37746410.0, |
| "step": 577 |
| }, |
| { |
| "epoch": 0.39460658815497524, |
| "grad_norm": 0.7710351943969727, |
| "learning_rate": 3.791086505089354e-06, |
| "loss": 0.647, |
| "mean_token_accuracy": 0.797424852848053, |
| "num_tokens": 37811946.0, |
| "step": 578 |
| }, |
| { |
| "epoch": 0.39528929851510497, |
| "grad_norm": 0.7557309865951538, |
| "learning_rate": 3.7859760988863664e-06, |
| "loss": 0.704, |
| "mean_token_accuracy": 0.7859237641096115, |
| "num_tokens": 37877482.0, |
| "step": 579 |
| }, |
| { |
| "epoch": 0.3959720088752347, |
| "grad_norm": 0.723376452922821, |
| "learning_rate": 3.7808583752144602e-06, |
| "loss": 0.6146, |
| "mean_token_accuracy": 0.8090023249387741, |
| "num_tokens": 37943018.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.3966547192353644, |
| "grad_norm": 0.7643930912017822, |
| "learning_rate": 3.775733363194537e-06, |
| "loss": 0.6384, |
| "mean_token_accuracy": 0.801774263381958, |
| "num_tokens": 38008548.0, |
| "step": 581 |
| }, |
| { |
| "epoch": 0.3973374295954941, |
| "grad_norm": 0.7285202145576477, |
| "learning_rate": 3.7706010919889726e-06, |
| "loss": 0.6471, |
| "mean_token_accuracy": 0.7999582588672638, |
| "num_tokens": 38073998.0, |
| "step": 582 |
| }, |
| { |
| "epoch": 0.39802013995562385, |
| "grad_norm": 0.7560527324676514, |
| "learning_rate": 3.7654615908014456e-06, |
| "loss": 0.6032, |
| "mean_token_accuracy": 0.8126820176839828, |
| "num_tokens": 38139245.0, |
| "step": 583 |
| }, |
| { |
| "epoch": 0.39870285031575353, |
| "grad_norm": 0.7738918662071228, |
| "learning_rate": 3.760314888876777e-06, |
| "loss": 0.6294, |
| "mean_token_accuracy": 0.8050972521305084, |
| "num_tokens": 38204491.0, |
| "step": 584 |
| }, |
| { |
| "epoch": 0.39938556067588327, |
| "grad_norm": 0.8183801770210266, |
| "learning_rate": 3.755161015500762e-06, |
| "loss": 0.5619, |
| "mean_token_accuracy": 0.823215052485466, |
| "num_tokens": 38270002.0, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.40006827103601295, |
| "grad_norm": 0.7287594676017761, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.6482, |
| "mean_token_accuracy": 0.799517348408699, |
| "num_tokens": 38335538.0, |
| "step": 586 |
| }, |
| { |
| "epoch": 0.4007509813961427, |
| "grad_norm": 0.7398510575294495, |
| "learning_rate": 3.7448318717417343e-06, |
| "loss": 0.614, |
| "mean_token_accuracy": 0.8087639510631561, |
| "num_tokens": 38400582.0, |
| "step": 587 |
| }, |
| { |
| "epoch": 0.4014336917562724, |
| "grad_norm": 0.71683269739151, |
| "learning_rate": 3.739656660133678e-06, |
| "loss": 0.5832, |
| "mean_token_accuracy": 0.818490669131279, |
| "num_tokens": 38466075.0, |
| "step": 588 |
| }, |
| { |
| "epoch": 0.4021164021164021, |
| "grad_norm": 0.7904890179634094, |
| "learning_rate": 3.734474394623852e-06, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.812148705124855, |
| "num_tokens": 38531611.0, |
| "step": 589 |
| }, |
| { |
| "epoch": 0.40279911247653183, |
| "grad_norm": 0.762642502784729, |
| "learning_rate": 3.7292851047004143e-06, |
| "loss": 0.6222, |
| "mean_token_accuracy": 0.8086510300636292, |
| "num_tokens": 38597147.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.40348182283666156, |
| "grad_norm": 0.7368521690368652, |
| "learning_rate": 3.7240888198914935e-06, |
| "loss": 0.5977, |
| "mean_token_accuracy": 0.8123778104782104, |
| "num_tokens": 38662683.0, |
| "step": 591 |
| }, |
| { |
| "epoch": 0.40416453319679124, |
| "grad_norm": 0.7672507762908936, |
| "learning_rate": 3.7188855697650212e-06, |
| "loss": 0.5934, |
| "mean_token_accuracy": 0.8154783695936203, |
| "num_tokens": 38728219.0, |
| "step": 592 |
| }, |
| { |
| "epoch": 0.404847243556921, |
| "grad_norm": 0.7662923336029053, |
| "learning_rate": 3.713675383928561e-06, |
| "loss": 0.6617, |
| "mean_token_accuracy": 0.7965695410966873, |
| "num_tokens": 38793755.0, |
| "step": 593 |
| }, |
| { |
| "epoch": 0.4055299539170507, |
| "grad_norm": 0.7628138661384583, |
| "learning_rate": 3.7084582920291456e-06, |
| "loss": 0.6167, |
| "mean_token_accuracy": 0.8086204826831818, |
| "num_tokens": 38859291.0, |
| "step": 594 |
| }, |
| { |
| "epoch": 0.4062126642771804, |
| "grad_norm": 0.7820776104927063, |
| "learning_rate": 3.7032343237531017e-06, |
| "loss": 0.5786, |
| "mean_token_accuracy": 0.8179832547903061, |
| "num_tokens": 38924827.0, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.4068953746373101, |
| "grad_norm": 0.7740116119384766, |
| "learning_rate": 3.6980035088258842e-06, |
| "loss": 0.5694, |
| "mean_token_accuracy": 0.81998410820961, |
| "num_tokens": 38990363.0, |
| "step": 596 |
| }, |
| { |
| "epoch": 0.40757808499743986, |
| "grad_norm": 0.7644342184066772, |
| "learning_rate": 3.692765877011909e-06, |
| "loss": 0.6167, |
| "mean_token_accuracy": 0.805490642786026, |
| "num_tokens": 39055369.0, |
| "step": 597 |
| }, |
| { |
| "epoch": 0.40826079535756954, |
| "grad_norm": 0.7698699831962585, |
| "learning_rate": 3.68752145811438e-06, |
| "loss": 0.616, |
| "mean_token_accuracy": 0.8080400824546814, |
| "num_tokens": 39120905.0, |
| "step": 598 |
| }, |
| { |
| "epoch": 0.4089435057176993, |
| "grad_norm": 0.7477402687072754, |
| "learning_rate": 3.6822702819751195e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.8107282519340515, |
| "num_tokens": 39186441.0, |
| "step": 599 |
| }, |
| { |
| "epoch": 0.409626216077829, |
| "grad_norm": 0.7572869658470154, |
| "learning_rate": 3.6770123784744027e-06, |
| "loss": 0.6225, |
| "mean_token_accuracy": 0.8081317245960236, |
| "num_tokens": 39251977.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4103089264379587, |
| "grad_norm": 0.7412436604499817, |
| "learning_rate": 3.671747777530784e-06, |
| "loss": 0.5927, |
| "mean_token_accuracy": 0.8155547380447388, |
| "num_tokens": 39317513.0, |
| "step": 601 |
| }, |
| { |
| "epoch": 0.4109916367980884, |
| "grad_norm": 0.7406189441680908, |
| "learning_rate": 3.6664765091009267e-06, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.8125419914722443, |
| "num_tokens": 39382777.0, |
| "step": 602 |
| }, |
| { |
| "epoch": 0.4116743471582181, |
| "grad_norm": 0.7247329950332642, |
| "learning_rate": 3.6611986031794345e-06, |
| "loss": 0.618, |
| "mean_token_accuracy": 0.8083761036396027, |
| "num_tokens": 39448313.0, |
| "step": 603 |
| }, |
| { |
| "epoch": 0.41235705751834784, |
| "grad_norm": 0.7504489421844482, |
| "learning_rate": 3.6559140897986777e-06, |
| "loss": 0.6565, |
| "mean_token_accuracy": 0.7968750149011612, |
| "num_tokens": 39513849.0, |
| "step": 604 |
| }, |
| { |
| "epoch": 0.41303976787847757, |
| "grad_norm": 0.7441123723983765, |
| "learning_rate": 3.6506229990286292e-06, |
| "loss": 0.6122, |
| "mean_token_accuracy": 0.8100409358739853, |
| "num_tokens": 39579385.0, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.41372247823860725, |
| "grad_norm": 0.7135398387908936, |
| "learning_rate": 3.6453253609766825e-06, |
| "loss": 0.5994, |
| "mean_token_accuracy": 0.8148216009140015, |
| "num_tokens": 39644921.0, |
| "step": 606 |
| }, |
| { |
| "epoch": 0.414405188598737, |
| "grad_norm": 0.7364180088043213, |
| "learning_rate": 3.6400212057874912e-06, |
| "loss": 0.6371, |
| "mean_token_accuracy": 0.8020986020565033, |
| "num_tokens": 39710457.0, |
| "step": 607 |
| }, |
| { |
| "epoch": 0.4150878989588667, |
| "grad_norm": 0.7383872270584106, |
| "learning_rate": 3.63471056364279e-06, |
| "loss": 0.6188, |
| "mean_token_accuracy": 0.8076735138893127, |
| "num_tokens": 39775993.0, |
| "step": 608 |
| }, |
| { |
| "epoch": 0.4157706093189964, |
| "grad_norm": 0.7400882244110107, |
| "learning_rate": 3.629393464761227e-06, |
| "loss": 0.6417, |
| "mean_token_accuracy": 0.8041910529136658, |
| "num_tokens": 39841183.0, |
| "step": 609 |
| }, |
| { |
| "epoch": 0.41645331967912613, |
| "grad_norm": 0.7408536076545715, |
| "learning_rate": 3.6240699393981915e-06, |
| "loss": 0.6219, |
| "mean_token_accuracy": 0.8063905239105225, |
| "num_tokens": 39906719.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.41713603003925587, |
| "grad_norm": 0.7284941673278809, |
| "learning_rate": 3.618740017845638e-06, |
| "loss": 0.6009, |
| "mean_token_accuracy": 0.8142106533050537, |
| "num_tokens": 39972255.0, |
| "step": 611 |
| }, |
| { |
| "epoch": 0.41781874039938555, |
| "grad_norm": 0.7574661374092102, |
| "learning_rate": 3.613403730431917e-06, |
| "loss": 0.642, |
| "mean_token_accuracy": 0.8028387278318405, |
| "num_tokens": 40037702.0, |
| "step": 612 |
| }, |
| { |
| "epoch": 0.4185014507595153, |
| "grad_norm": 0.742321252822876, |
| "learning_rate": 3.6080611075216053e-06, |
| "loss": 0.6214, |
| "mean_token_accuracy": 0.8056885898113251, |
| "num_tokens": 40103160.0, |
| "step": 613 |
| }, |
| { |
| "epoch": 0.419184161119645, |
| "grad_norm": 0.7570779919624329, |
| "learning_rate": 3.602712179515328e-06, |
| "loss": 0.6328, |
| "mean_token_accuracy": 0.8036107122898102, |
| "num_tokens": 40168696.0, |
| "step": 614 |
| }, |
| { |
| "epoch": 0.4198668714797747, |
| "grad_norm": 0.7323223948478699, |
| "learning_rate": 3.5973569768495858e-06, |
| "loss": 0.6051, |
| "mean_token_accuracy": 0.8121181577444077, |
| "num_tokens": 40234232.0, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.42054958183990443, |
| "grad_norm": 0.7437307834625244, |
| "learning_rate": 3.5919955299965858e-06, |
| "loss": 0.6317, |
| "mean_token_accuracy": 0.802324965596199, |
| "num_tokens": 40298970.0, |
| "step": 616 |
| }, |
| { |
| "epoch": 0.4212322922000341, |
| "grad_norm": 0.7594529390335083, |
| "learning_rate": 3.586627869464065e-06, |
| "loss": 0.6508, |
| "mean_token_accuracy": 0.8005407005548477, |
| "num_tokens": 40364506.0, |
| "step": 617 |
| }, |
| { |
| "epoch": 0.42191500256016384, |
| "grad_norm": 0.7406197786331177, |
| "learning_rate": 3.5812540257951178e-06, |
| "loss": 0.615, |
| "mean_token_accuracy": 0.812698557972908, |
| "num_tokens": 40430042.0, |
| "step": 618 |
| }, |
| { |
| "epoch": 0.4225977129202936, |
| "grad_norm": 0.755382239818573, |
| "learning_rate": 3.575874029568021e-06, |
| "loss": 0.613, |
| "mean_token_accuracy": 0.8104227781295776, |
| "num_tokens": 40495578.0, |
| "step": 619 |
| }, |
| { |
| "epoch": 0.42328042328042326, |
| "grad_norm": 0.7402629256248474, |
| "learning_rate": 3.5704879113960627e-06, |
| "loss": 0.6589, |
| "mean_token_accuracy": 0.7991355061531067, |
| "num_tokens": 40561114.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.423963133640553, |
| "grad_norm": 0.7542107105255127, |
| "learning_rate": 3.5650957019273642e-06, |
| "loss": 0.5923, |
| "mean_token_accuracy": 0.8151881694793701, |
| "num_tokens": 40626650.0, |
| "step": 621 |
| }, |
| { |
| "epoch": 0.4246458440006827, |
| "grad_norm": 0.772157609462738, |
| "learning_rate": 3.5596974318447075e-06, |
| "loss": 0.6473, |
| "mean_token_accuracy": 0.7991202473640442, |
| "num_tokens": 40692186.0, |
| "step": 622 |
| }, |
| { |
| "epoch": 0.4253285543608124, |
| "grad_norm": 0.782319188117981, |
| "learning_rate": 3.5542931318653625e-06, |
| "loss": 0.6498, |
| "mean_token_accuracy": 0.799028605222702, |
| "num_tokens": 40757722.0, |
| "step": 623 |
| }, |
| { |
| "epoch": 0.42601126472094214, |
| "grad_norm": 0.7283681631088257, |
| "learning_rate": 3.5488828327409086e-06, |
| "loss": 0.6361, |
| "mean_token_accuracy": 0.8019764274358749, |
| "num_tokens": 40823258.0, |
| "step": 624 |
| }, |
| { |
| "epoch": 0.4266939750810719, |
| "grad_norm": 0.7392454743385315, |
| "learning_rate": 3.543466565257063e-06, |
| "loss": 0.6063, |
| "mean_token_accuracy": 0.8119959682226181, |
| "num_tokens": 40888794.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.42737668544120155, |
| "grad_norm": 0.7856171131134033, |
| "learning_rate": 3.538044360233503e-06, |
| "loss": 0.6085, |
| "mean_token_accuracy": 0.8111406415700912, |
| "num_tokens": 40954330.0, |
| "step": 626 |
| }, |
| { |
| "epoch": 0.4280593958013313, |
| "grad_norm": 0.7746567130088806, |
| "learning_rate": 3.532616248523692e-06, |
| "loss": 0.6443, |
| "mean_token_accuracy": 0.8002853393554688, |
| "num_tokens": 41019315.0, |
| "step": 627 |
| }, |
| { |
| "epoch": 0.428742106161461, |
| "grad_norm": 0.7823646068572998, |
| "learning_rate": 3.527182261014705e-06, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8089713901281357, |
| "num_tokens": 41084672.0, |
| "step": 628 |
| }, |
| { |
| "epoch": 0.4294248165215907, |
| "grad_norm": 0.7409360408782959, |
| "learning_rate": 3.521742428627049e-06, |
| "loss": 0.6239, |
| "mean_token_accuracy": 0.8043133020401001, |
| "num_tokens": 41150208.0, |
| "step": 629 |
| }, |
| { |
| "epoch": 0.43010752688172044, |
| "grad_norm": 0.7446600198745728, |
| "learning_rate": 3.516296782314491e-06, |
| "loss": 0.6312, |
| "mean_token_accuracy": 0.8042682111263275, |
| "num_tokens": 41215719.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.43079023724185017, |
| "grad_norm": 0.7764535546302795, |
| "learning_rate": 3.5108453530638815e-06, |
| "loss": 0.611, |
| "mean_token_accuracy": 0.8111705332994461, |
| "num_tokens": 41280701.0, |
| "step": 631 |
| }, |
| { |
| "epoch": 0.43147294760197985, |
| "grad_norm": 0.7295299172401428, |
| "learning_rate": 3.5053881718949758e-06, |
| "loss": 0.609, |
| "mean_token_accuracy": 0.8096132725477219, |
| "num_tokens": 41346237.0, |
| "step": 632 |
| }, |
| { |
| "epoch": 0.4321556579621096, |
| "grad_norm": 0.7986657619476318, |
| "learning_rate": 3.499925269860257e-06, |
| "loss": 0.6505, |
| "mean_token_accuracy": 0.799974262714386, |
| "num_tokens": 41411547.0, |
| "step": 633 |
| }, |
| { |
| "epoch": 0.43283836832223926, |
| "grad_norm": 0.7602563500404358, |
| "learning_rate": 3.4944566780447648e-06, |
| "loss": 0.6407, |
| "mean_token_accuracy": 0.802831619977951, |
| "num_tokens": 41477017.0, |
| "step": 634 |
| }, |
| { |
| "epoch": 0.433521078682369, |
| "grad_norm": 0.750855565071106, |
| "learning_rate": 3.4889824275659136e-06, |
| "loss": 0.5869, |
| "mean_token_accuracy": 0.8175708651542664, |
| "num_tokens": 41542553.0, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.43420378904249873, |
| "grad_norm": 0.7245985865592957, |
| "learning_rate": 3.4835025495733143e-06, |
| "loss": 0.6471, |
| "mean_token_accuracy": 0.8033660650253296, |
| "num_tokens": 41607382.0, |
| "step": 636 |
| }, |
| { |
| "epoch": 0.4348864994026284, |
| "grad_norm": 0.7158551812171936, |
| "learning_rate": 3.4780170752486035e-06, |
| "loss": 0.5953, |
| "mean_token_accuracy": 0.8136302530765533, |
| "num_tokens": 41672918.0, |
| "step": 637 |
| }, |
| { |
| "epoch": 0.43556920976275815, |
| "grad_norm": 0.7428006529808044, |
| "learning_rate": 3.4725260358052597e-06, |
| "loss": 0.5941, |
| "mean_token_accuracy": 0.8138288110494614, |
| "num_tokens": 41738454.0, |
| "step": 638 |
| }, |
| { |
| "epoch": 0.4362519201228879, |
| "grad_norm": 0.7119945287704468, |
| "learning_rate": 3.4670294624884275e-06, |
| "loss": 0.617, |
| "mean_token_accuracy": 0.8088801354169846, |
| "num_tokens": 41803990.0, |
| "step": 639 |
| }, |
| { |
| "epoch": 0.43693463048301756, |
| "grad_norm": 0.6839730739593506, |
| "learning_rate": 3.461527386574743e-06, |
| "loss": 0.5456, |
| "mean_token_accuracy": 0.8276820480823517, |
| "num_tokens": 41869526.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.4376173408431473, |
| "grad_norm": 0.7335575222969055, |
| "learning_rate": 3.45601983937215e-06, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8022129386663437, |
| "num_tokens": 41935054.0, |
| "step": 641 |
| }, |
| { |
| "epoch": 0.43830005120327703, |
| "grad_norm": 0.7447086572647095, |
| "learning_rate": 3.4505068522197277e-06, |
| "loss": 0.6568, |
| "mean_token_accuracy": 0.7966000735759735, |
| "num_tokens": 42000590.0, |
| "step": 642 |
| }, |
| { |
| "epoch": 0.4389827615634067, |
| "grad_norm": 0.7332596778869629, |
| "learning_rate": 3.4449884564875086e-06, |
| "loss": 0.6065, |
| "mean_token_accuracy": 0.8104380518198013, |
| "num_tokens": 42066126.0, |
| "step": 643 |
| }, |
| { |
| "epoch": 0.43966547192353644, |
| "grad_norm": 0.7137930989265442, |
| "learning_rate": 3.439464683576301e-06, |
| "loss": 0.6694, |
| "mean_token_accuracy": 0.7953858822584152, |
| "num_tokens": 42131485.0, |
| "step": 644 |
| }, |
| { |
| "epoch": 0.4403481822836662, |
| "grad_norm": 0.7591450214385986, |
| "learning_rate": 3.43393556491751e-06, |
| "loss": 0.649, |
| "mean_token_accuracy": 0.8000366687774658, |
| "num_tokens": 42197021.0, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.44103089264379586, |
| "grad_norm": 0.7286630868911743, |
| "learning_rate": 3.428401131972961e-06, |
| "loss": 0.5651, |
| "mean_token_accuracy": 0.8224584460258484, |
| "num_tokens": 42262557.0, |
| "step": 646 |
| }, |
| { |
| "epoch": 0.4417136030039256, |
| "grad_norm": 0.7138211131095886, |
| "learning_rate": 3.4228614162347167e-06, |
| "loss": 0.5986, |
| "mean_token_accuracy": 0.8115658462047577, |
| "num_tokens": 42328086.0, |
| "step": 647 |
| }, |
| { |
| "epoch": 0.4423963133640553, |
| "grad_norm": 0.691111147403717, |
| "learning_rate": 3.417316449224902e-06, |
| "loss": 0.575, |
| "mean_token_accuracy": 0.8209549337625504, |
| "num_tokens": 42393344.0, |
| "step": 648 |
| }, |
| { |
| "epoch": 0.443079023724185, |
| "grad_norm": 0.7229531407356262, |
| "learning_rate": 3.41176626249552e-06, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8075550347566605, |
| "num_tokens": 42458741.0, |
| "step": 649 |
| }, |
| { |
| "epoch": 0.44376173408431474, |
| "grad_norm": 0.8127530813217163, |
| "learning_rate": 3.4062108876282773e-06, |
| "loss": 0.6424, |
| "mean_token_accuracy": 0.8024535477161407, |
| "num_tokens": 42523740.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 0.7284678220748901, |
| "learning_rate": 3.4006503562344014e-06, |
| "loss": 0.5886, |
| "mean_token_accuracy": 0.81828872859478, |
| "num_tokens": 42589276.0, |
| "step": 651 |
| }, |
| { |
| "epoch": 0.44512715480457415, |
| "grad_norm": 0.7327984571456909, |
| "learning_rate": 3.3950846999544613e-06, |
| "loss": 0.5961, |
| "mean_token_accuracy": 0.8132331371307373, |
| "num_tokens": 42654812.0, |
| "step": 652 |
| }, |
| { |
| "epoch": 0.4458098651647039, |
| "grad_norm": 0.7355505228042603, |
| "learning_rate": 3.389513950458187e-06, |
| "loss": 0.6422, |
| "mean_token_accuracy": 0.800984725356102, |
| "num_tokens": 42720132.0, |
| "step": 653 |
| }, |
| { |
| "epoch": 0.44649257552483357, |
| "grad_norm": 0.7620731592178345, |
| "learning_rate": 3.383938139444293e-06, |
| "loss": 0.6518, |
| "mean_token_accuracy": 0.7987159639596939, |
| "num_tokens": 42785539.0, |
| "step": 654 |
| }, |
| { |
| "epoch": 0.4471752858849633, |
| "grad_norm": 0.7601433992385864, |
| "learning_rate": 3.3783572986402896e-06, |
| "loss": 0.6462, |
| "mean_token_accuracy": 0.8014137893915176, |
| "num_tokens": 42850974.0, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.44785799624509304, |
| "grad_norm": 0.7175619602203369, |
| "learning_rate": 3.372771459802313e-06, |
| "loss": 0.5875, |
| "mean_token_accuracy": 0.8182276338338852, |
| "num_tokens": 42916510.0, |
| "step": 656 |
| }, |
| { |
| "epoch": 0.4485407066052227, |
| "grad_norm": 0.7211723327636719, |
| "learning_rate": 3.3671806547149357e-06, |
| "loss": 0.6365, |
| "mean_token_accuracy": 0.8023043274879456, |
| "num_tokens": 42981896.0, |
| "step": 657 |
| }, |
| { |
| "epoch": 0.44922341696535245, |
| "grad_norm": 0.7135072350502014, |
| "learning_rate": 3.36158491519099e-06, |
| "loss": 0.6132, |
| "mean_token_accuracy": 0.8097965568304062, |
| "num_tokens": 43047432.0, |
| "step": 658 |
| }, |
| { |
| "epoch": 0.4499061273254822, |
| "grad_norm": 0.7017936110496521, |
| "learning_rate": 3.355984273071389e-06, |
| "loss": 0.5707, |
| "mean_token_accuracy": 0.8174466788768768, |
| "num_tokens": 43112715.0, |
| "step": 659 |
| }, |
| { |
| "epoch": 0.45058883768561186, |
| "grad_norm": 0.7288024425506592, |
| "learning_rate": 3.3503787602249366e-06, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8088495880365372, |
| "num_tokens": 43178251.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.4512715480457416, |
| "grad_norm": 0.7068003416061401, |
| "learning_rate": 3.344768408548158e-06, |
| "loss": 0.624, |
| "mean_token_accuracy": 0.8051533550024033, |
| "num_tokens": 43243787.0, |
| "step": 661 |
| }, |
| { |
| "epoch": 0.45195425840587133, |
| "grad_norm": 0.7492927312850952, |
| "learning_rate": 3.339153249965109e-06, |
| "loss": 0.6621, |
| "mean_token_accuracy": 0.7967247366905212, |
| "num_tokens": 43308554.0, |
| "step": 662 |
| }, |
| { |
| "epoch": 0.452636968766001, |
| "grad_norm": 0.7419738173484802, |
| "learning_rate": 3.3335333164272e-06, |
| "loss": 0.6254, |
| "mean_token_accuracy": 0.8067606091499329, |
| "num_tokens": 43373512.0, |
| "step": 663 |
| }, |
| { |
| "epoch": 0.45331967912613075, |
| "grad_norm": 0.7345742583274841, |
| "learning_rate": 3.327908639913009e-06, |
| "loss": 0.572, |
| "mean_token_accuracy": 0.8185178339481354, |
| "num_tokens": 43439048.0, |
| "step": 664 |
| }, |
| { |
| "epoch": 0.4540023894862604, |
| "grad_norm": 0.7236369848251343, |
| "learning_rate": 3.3222792524281045e-06, |
| "loss": 0.5821, |
| "mean_token_accuracy": 0.8172501176595688, |
| "num_tokens": 43504584.0, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.45468509984639016, |
| "grad_norm": 0.778313934803009, |
| "learning_rate": 3.3166451860048615e-06, |
| "loss": 0.6343, |
| "mean_token_accuracy": 0.8037634491920471, |
| "num_tokens": 43570120.0, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.4553678102065199, |
| "grad_norm": 0.7267420887947083, |
| "learning_rate": 3.3110064727022783e-06, |
| "loss": 0.6388, |
| "mean_token_accuracy": 0.8018084168434143, |
| "num_tokens": 43635656.0, |
| "step": 667 |
| }, |
| { |
| "epoch": 0.4560505205666496, |
| "grad_norm": 0.7459083199501038, |
| "learning_rate": 3.3053631446057944e-06, |
| "loss": 0.633, |
| "mean_token_accuracy": 0.803519070148468, |
| "num_tokens": 43701192.0, |
| "step": 668 |
| }, |
| { |
| "epoch": 0.4567332309267793, |
| "grad_norm": 0.7337201833724976, |
| "learning_rate": 3.299715233827111e-06, |
| "loss": 0.6728, |
| "mean_token_accuracy": 0.7914834022521973, |
| "num_tokens": 43766728.0, |
| "step": 669 |
| }, |
| { |
| "epoch": 0.45741594128690904, |
| "grad_norm": 0.75600665807724, |
| "learning_rate": 3.294062772504002e-06, |
| "loss": 0.61, |
| "mean_token_accuracy": 0.8095979988574982, |
| "num_tokens": 43832264.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.4580986516470387, |
| "grad_norm": 0.7811246514320374, |
| "learning_rate": 3.288405792800138e-06, |
| "loss": 0.6318, |
| "mean_token_accuracy": 0.8027248382568359, |
| "num_tokens": 43897800.0, |
| "step": 671 |
| }, |
| { |
| "epoch": 0.45878136200716846, |
| "grad_norm": 0.7400442957878113, |
| "learning_rate": 3.282744326904899e-06, |
| "loss": 0.5896, |
| "mean_token_accuracy": 0.8151118010282516, |
| "num_tokens": 43963336.0, |
| "step": 672 |
| }, |
| { |
| "epoch": 0.4594640723672982, |
| "grad_norm": 0.7188034653663635, |
| "learning_rate": 3.27707840703319e-06, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.8125610947608948, |
| "num_tokens": 44028872.0, |
| "step": 673 |
| }, |
| { |
| "epoch": 0.46014678272742787, |
| "grad_norm": 0.7437601685523987, |
| "learning_rate": 3.2714080654252657e-06, |
| "loss": 0.6495, |
| "mean_token_accuracy": 0.7988808453083038, |
| "num_tokens": 44094131.0, |
| "step": 674 |
| }, |
| { |
| "epoch": 0.4608294930875576, |
| "grad_norm": 0.7074049711227417, |
| "learning_rate": 3.265733334346536e-06, |
| "loss": 0.6018, |
| "mean_token_accuracy": 0.8113830238580704, |
| "num_tokens": 44159596.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.46151220344768734, |
| "grad_norm": 0.7336816787719727, |
| "learning_rate": 3.260054246087389e-06, |
| "loss": 0.5852, |
| "mean_token_accuracy": 0.8168530017137527, |
| "num_tokens": 44225132.0, |
| "step": 676 |
| }, |
| { |
| "epoch": 0.462194913807817, |
| "grad_norm": 0.7625792026519775, |
| "learning_rate": 3.2543708329630085e-06, |
| "loss": 0.6105, |
| "mean_token_accuracy": 0.8130651265382767, |
| "num_tokens": 44290668.0, |
| "step": 677 |
| }, |
| { |
| "epoch": 0.46287762416794676, |
| "grad_norm": 0.7495646476745605, |
| "learning_rate": 3.248683127313186e-06, |
| "loss": 0.6233, |
| "mean_token_accuracy": 0.8054354339838028, |
| "num_tokens": 44355496.0, |
| "step": 678 |
| }, |
| { |
| "epoch": 0.4635603345280765, |
| "grad_norm": 0.7479454278945923, |
| "learning_rate": 3.242991161502136e-06, |
| "loss": 0.6531, |
| "mean_token_accuracy": 0.7986889332532883, |
| "num_tokens": 44420943.0, |
| "step": 679 |
| }, |
| { |
| "epoch": 0.46424304488820617, |
| "grad_norm": 0.7727522850036621, |
| "learning_rate": 3.2372949679183196e-06, |
| "loss": 0.6671, |
| "mean_token_accuracy": 0.7948741465806961, |
| "num_tokens": 44486479.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4649257552483359, |
| "grad_norm": 0.7375922799110413, |
| "learning_rate": 3.23159457897425e-06, |
| "loss": 0.6078, |
| "mean_token_accuracy": 0.8109726309776306, |
| "num_tokens": 44552015.0, |
| "step": 681 |
| }, |
| { |
| "epoch": 0.4656084656084656, |
| "grad_norm": 0.7293082475662231, |
| "learning_rate": 3.2258900271063143e-06, |
| "loss": 0.6135, |
| "mean_token_accuracy": 0.810254767537117, |
| "num_tokens": 44617551.0, |
| "step": 682 |
| }, |
| { |
| "epoch": 0.4662911759685953, |
| "grad_norm": 0.7409456372261047, |
| "learning_rate": 3.2201813447745885e-06, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.810606062412262, |
| "num_tokens": 44683087.0, |
| "step": 683 |
| }, |
| { |
| "epoch": 0.46697388632872505, |
| "grad_norm": 0.6928256154060364, |
| "learning_rate": 3.2144685644626513e-06, |
| "loss": 0.5872, |
| "mean_token_accuracy": 0.8173173069953918, |
| "num_tokens": 44748500.0, |
| "step": 684 |
| }, |
| { |
| "epoch": 0.46765659668885473, |
| "grad_norm": 0.7540627121925354, |
| "learning_rate": 3.2087517186773986e-06, |
| "loss": 0.6253, |
| "mean_token_accuracy": 0.8058706372976303, |
| "num_tokens": 44813885.0, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.46833930704898447, |
| "grad_norm": 0.7306386828422546, |
| "learning_rate": 3.203030839948862e-06, |
| "loss": 0.6371, |
| "mean_token_accuracy": 0.8030995428562164, |
| "num_tokens": 44879386.0, |
| "step": 686 |
| }, |
| { |
| "epoch": 0.4690220174091142, |
| "grad_norm": 0.7555832266807556, |
| "learning_rate": 3.1973059608300185e-06, |
| "loss": 0.6404, |
| "mean_token_accuracy": 0.8015610724687576, |
| "num_tokens": 44944541.0, |
| "step": 687 |
| }, |
| { |
| "epoch": 0.4697047277692439, |
| "grad_norm": 0.7078608870506287, |
| "learning_rate": 3.19157711389661e-06, |
| "loss": 0.5954, |
| "mean_token_accuracy": 0.8156158328056335, |
| "num_tokens": 45010077.0, |
| "step": 688 |
| }, |
| { |
| "epoch": 0.4703874381293736, |
| "grad_norm": 0.7409559488296509, |
| "learning_rate": 3.185844331746957e-06, |
| "loss": 0.6411, |
| "mean_token_accuracy": 0.8022622019052505, |
| "num_tokens": 45074969.0, |
| "step": 689 |
| }, |
| { |
| "epoch": 0.47107014848950335, |
| "grad_norm": 0.7214213013648987, |
| "learning_rate": 3.1801076470017696e-06, |
| "loss": 0.6545, |
| "mean_token_accuracy": 0.7991660535335541, |
| "num_tokens": 45140505.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.471752858849633, |
| "grad_norm": 0.733475387096405, |
| "learning_rate": 3.1743670923039655e-06, |
| "loss": 0.6209, |
| "mean_token_accuracy": 0.8057470470666885, |
| "num_tokens": 45205945.0, |
| "step": 691 |
| }, |
| { |
| "epoch": 0.47243556920976276, |
| "grad_norm": 0.7223991751670837, |
| "learning_rate": 3.168622700318485e-06, |
| "loss": 0.6138, |
| "mean_token_accuracy": 0.8067930787801743, |
| "num_tokens": 45271110.0, |
| "step": 692 |
| }, |
| { |
| "epoch": 0.4731182795698925, |
| "grad_norm": 0.7543123364448547, |
| "learning_rate": 3.1628745037321005e-06, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8020515888929367, |
| "num_tokens": 45336637.0, |
| "step": 693 |
| }, |
| { |
| "epoch": 0.4738009899300222, |
| "grad_norm": 0.7674548625946045, |
| "learning_rate": 3.157122535253235e-06, |
| "loss": 0.6376, |
| "mean_token_accuracy": 0.802969217300415, |
| "num_tokens": 45402173.0, |
| "step": 694 |
| }, |
| { |
| "epoch": 0.4744837002901519, |
| "grad_norm": 0.740697979927063, |
| "learning_rate": 3.1513668276117747e-06, |
| "loss": 0.6336, |
| "mean_token_accuracy": 0.803000196814537, |
| "num_tokens": 45467557.0, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.47516641065028165, |
| "grad_norm": 0.7490605115890503, |
| "learning_rate": 3.1456074135588805e-06, |
| "loss": 0.65, |
| "mean_token_accuracy": 0.7991335093975067, |
| "num_tokens": 45533034.0, |
| "step": 696 |
| }, |
| { |
| "epoch": 0.4758491210104113, |
| "grad_norm": 0.7376020550727844, |
| "learning_rate": 3.1398443258668067e-06, |
| "loss": 0.594, |
| "mean_token_accuracy": 0.8152694553136826, |
| "num_tokens": 45598284.0, |
| "step": 697 |
| }, |
| { |
| "epoch": 0.47653183137054106, |
| "grad_norm": 0.7506970167160034, |
| "learning_rate": 3.134077597328708e-06, |
| "loss": 0.6136, |
| "mean_token_accuracy": 0.8080400824546814, |
| "num_tokens": 45663820.0, |
| "step": 698 |
| }, |
| { |
| "epoch": 0.47721454173067074, |
| "grad_norm": 0.7497142553329468, |
| "learning_rate": 3.1283072607584573e-06, |
| "loss": 0.613, |
| "mean_token_accuracy": 0.8081164509057999, |
| "num_tokens": 45729356.0, |
| "step": 699 |
| }, |
| { |
| "epoch": 0.4778972520908005, |
| "grad_norm": 0.7283821105957031, |
| "learning_rate": 3.1225333489904606e-06, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.8139815479516983, |
| "num_tokens": 45794892.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.4785799624509302, |
| "grad_norm": 0.7512271404266357, |
| "learning_rate": 3.116755894879464e-06, |
| "loss": 0.6081, |
| "mean_token_accuracy": 0.8116752207279205, |
| "num_tokens": 45860428.0, |
| "step": 701 |
| }, |
| { |
| "epoch": 0.4792626728110599, |
| "grad_norm": 0.7062849998474121, |
| "learning_rate": 3.1109749313003708e-06, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8138440847396851, |
| "num_tokens": 45925964.0, |
| "step": 702 |
| }, |
| { |
| "epoch": 0.4799453831711896, |
| "grad_norm": 0.8034562468528748, |
| "learning_rate": 3.1051904911480557e-06, |
| "loss": 0.6304, |
| "mean_token_accuracy": 0.8042827546596527, |
| "num_tokens": 45991500.0, |
| "step": 703 |
| }, |
| { |
| "epoch": 0.48062809353131936, |
| "grad_norm": 0.7626417875289917, |
| "learning_rate": 3.099402607337175e-06, |
| "loss": 0.6265, |
| "mean_token_accuracy": 0.8054893612861633, |
| "num_tokens": 46057036.0, |
| "step": 704 |
| }, |
| { |
| "epoch": 0.48131080389144903, |
| "grad_norm": 0.7254495024681091, |
| "learning_rate": 3.093611312801979e-06, |
| "loss": 0.647, |
| "mean_token_accuracy": 0.8013196587562561, |
| "num_tokens": 46122572.0, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.48199351425157877, |
| "grad_norm": 0.7576347589492798, |
| "learning_rate": 3.087816640496127e-06, |
| "loss": 0.6537, |
| "mean_token_accuracy": 0.7981427162885666, |
| "num_tokens": 46188108.0, |
| "step": 706 |
| }, |
| { |
| "epoch": 0.4826762246117085, |
| "grad_norm": 0.7223957777023315, |
| "learning_rate": 3.0820186233924983e-06, |
| "loss": 0.5885, |
| "mean_token_accuracy": 0.8181630969047546, |
| "num_tokens": 46252922.0, |
| "step": 707 |
| }, |
| { |
| "epoch": 0.4833589349718382, |
| "grad_norm": 0.7446067929267883, |
| "learning_rate": 3.076217294483005e-06, |
| "loss": 0.6751, |
| "mean_token_accuracy": 0.7922012507915497, |
| "num_tokens": 46318458.0, |
| "step": 708 |
| }, |
| { |
| "epoch": 0.4840416453319679, |
| "grad_norm": 0.7198394536972046, |
| "learning_rate": 3.070412686778403e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8034121543169022, |
| "num_tokens": 46383994.0, |
| "step": 709 |
| }, |
| { |
| "epoch": 0.48472435569209765, |
| "grad_norm": 0.7317565679550171, |
| "learning_rate": 3.064604833308105e-06, |
| "loss": 0.6018, |
| "mean_token_accuracy": 0.8128818422555923, |
| "num_tokens": 46449530.0, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.48540706605222733, |
| "grad_norm": 0.7785357236862183, |
| "learning_rate": 3.058793767119996e-06, |
| "loss": 0.6415, |
| "mean_token_accuracy": 0.8014406114816666, |
| "num_tokens": 46514874.0, |
| "step": 711 |
| }, |
| { |
| "epoch": 0.48608977641235707, |
| "grad_norm": 0.7061479687690735, |
| "learning_rate": 3.0529795212802383e-06, |
| "loss": 0.5641, |
| "mean_token_accuracy": 0.8236192464828491, |
| "num_tokens": 46580410.0, |
| "step": 712 |
| }, |
| { |
| "epoch": 0.48677248677248675, |
| "grad_norm": 0.735151469707489, |
| "learning_rate": 3.047162128873089e-06, |
| "loss": 0.6134, |
| "mean_token_accuracy": 0.8087884932756424, |
| "num_tokens": 46645946.0, |
| "step": 713 |
| }, |
| { |
| "epoch": 0.4874551971326165, |
| "grad_norm": 0.7411344051361084, |
| "learning_rate": 3.041341623000708e-06, |
| "loss": 0.6257, |
| "mean_token_accuracy": 0.8060804158449173, |
| "num_tokens": 46711201.0, |
| "step": 714 |
| }, |
| { |
| "epoch": 0.4881379074927462, |
| "grad_norm": 0.7967774271965027, |
| "learning_rate": 3.0355180367829746e-06, |
| "loss": 0.6345, |
| "mean_token_accuracy": 0.8028616309165955, |
| "num_tokens": 46776659.0, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.4888206178528759, |
| "grad_norm": 0.7407638430595398, |
| "learning_rate": 3.029691403357293e-06, |
| "loss": 0.5917, |
| "mean_token_accuracy": 0.815096527338028, |
| "num_tokens": 46842195.0, |
| "step": 716 |
| }, |
| { |
| "epoch": 0.48950332821300563, |
| "grad_norm": 0.7118982076644897, |
| "learning_rate": 3.0238617558784077e-06, |
| "loss": 0.5633, |
| "mean_token_accuracy": 0.8250522017478943, |
| "num_tokens": 46907443.0, |
| "step": 717 |
| }, |
| { |
| "epoch": 0.49018603857313536, |
| "grad_norm": 0.7390048503875732, |
| "learning_rate": 3.0180291275182144e-06, |
| "loss": 0.6331, |
| "mean_token_accuracy": 0.8052017390727997, |
| "num_tokens": 46972873.0, |
| "step": 718 |
| }, |
| { |
| "epoch": 0.49086874893326504, |
| "grad_norm": 0.7418379783630371, |
| "learning_rate": 3.0121935514655697e-06, |
| "loss": 0.5866, |
| "mean_token_accuracy": 0.8170940279960632, |
| "num_tokens": 47038259.0, |
| "step": 719 |
| }, |
| { |
| "epoch": 0.4915514592933948, |
| "grad_norm": 0.7906901240348816, |
| "learning_rate": 3.006355060926103e-06, |
| "loss": 0.6644, |
| "mean_token_accuracy": 0.7941715717315674, |
| "num_tokens": 47103795.0, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.4922341696535245, |
| "grad_norm": 0.7372046709060669, |
| "learning_rate": 3.000513689122029e-06, |
| "loss": 0.6327, |
| "mean_token_accuracy": 0.8058201223611832, |
| "num_tokens": 47168745.0, |
| "step": 721 |
| }, |
| { |
| "epoch": 0.4929168800136542, |
| "grad_norm": 0.7171697616577148, |
| "learning_rate": 2.9946694692919553e-06, |
| "loss": 0.5739, |
| "mean_token_accuracy": 0.820540577173233, |
| "num_tokens": 47233820.0, |
| "step": 722 |
| }, |
| { |
| "epoch": 0.4935995903737839, |
| "grad_norm": 0.7130782604217529, |
| "learning_rate": 2.988822434690699e-06, |
| "loss": 0.6228, |
| "mean_token_accuracy": 0.8077346086502075, |
| "num_tokens": 47299356.0, |
| "step": 723 |
| }, |
| { |
| "epoch": 0.49428230073391366, |
| "grad_norm": 0.7585386633872986, |
| "learning_rate": 2.9829726185890894e-06, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8131654858589172, |
| "num_tokens": 47364281.0, |
| "step": 724 |
| }, |
| { |
| "epoch": 0.49496501109404334, |
| "grad_norm": 0.7572980523109436, |
| "learning_rate": 2.9771200542737856e-06, |
| "loss": 0.6105, |
| "mean_token_accuracy": 0.80910924077034, |
| "num_tokens": 47429817.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.4956477214541731, |
| "grad_norm": 0.748471200466156, |
| "learning_rate": 2.9712647750470853e-06, |
| "loss": 0.6478, |
| "mean_token_accuracy": 0.8009799122810364, |
| "num_tokens": 47495145.0, |
| "step": 726 |
| }, |
| { |
| "epoch": 0.4963304318143028, |
| "grad_norm": 0.7291464805603027, |
| "learning_rate": 2.9654068142267325e-06, |
| "loss": 0.6266, |
| "mean_token_accuracy": 0.8058406710624695, |
| "num_tokens": 47560681.0, |
| "step": 727 |
| }, |
| { |
| "epoch": 0.4970131421744325, |
| "grad_norm": 0.7313188910484314, |
| "learning_rate": 2.9595462051457307e-06, |
| "loss": 0.5765, |
| "mean_token_accuracy": 0.8204728662967682, |
| "num_tokens": 47626217.0, |
| "step": 728 |
| }, |
| { |
| "epoch": 0.4976958525345622, |
| "grad_norm": 0.7730467915534973, |
| "learning_rate": 2.953682981152154e-06, |
| "loss": 0.6191, |
| "mean_token_accuracy": 0.8103005886077881, |
| "num_tokens": 47691753.0, |
| "step": 729 |
| }, |
| { |
| "epoch": 0.4983785628946919, |
| "grad_norm": 0.7231931686401367, |
| "learning_rate": 2.947817175608954e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.8107830137014389, |
| "num_tokens": 47757123.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.49906127325482164, |
| "grad_norm": 0.747162401676178, |
| "learning_rate": 2.9419488218937723e-06, |
| "loss": 0.6292, |
| "mean_token_accuracy": 0.804267480969429, |
| "num_tokens": 47822659.0, |
| "step": 731 |
| }, |
| { |
| "epoch": 0.49974398361495137, |
| "grad_norm": 0.7242068648338318, |
| "learning_rate": 2.9360779533987515e-06, |
| "loss": 0.5868, |
| "mean_token_accuracy": 0.815844938158989, |
| "num_tokens": 47888195.0, |
| "step": 732 |
| }, |
| { |
| "epoch": 0.5004266939750811, |
| "grad_norm": 0.7154374122619629, |
| "learning_rate": 2.9302046035303424e-06, |
| "loss": 0.5967, |
| "mean_token_accuracy": 0.8129582107067108, |
| "num_tokens": 47953731.0, |
| "step": 733 |
| }, |
| { |
| "epoch": 0.5011094043352108, |
| "grad_norm": 0.721357524394989, |
| "learning_rate": 2.9243288057091147e-06, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.813941165804863, |
| "num_tokens": 48018967.0, |
| "step": 734 |
| }, |
| { |
| "epoch": 0.5017921146953405, |
| "grad_norm": 0.7229841947555542, |
| "learning_rate": 2.91845059336957e-06, |
| "loss": 0.6009, |
| "mean_token_accuracy": 0.8107102662324905, |
| "num_tokens": 48084313.0, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.5024748250554703, |
| "grad_norm": 0.7424589991569519, |
| "learning_rate": 2.9125699999599467e-06, |
| "loss": 0.6195, |
| "mean_token_accuracy": 0.8080553561449051, |
| "num_tokens": 48149849.0, |
| "step": 736 |
| }, |
| { |
| "epoch": 0.5031575354155999, |
| "grad_norm": 0.7185443043708801, |
| "learning_rate": 2.9066870589420323e-06, |
| "loss": 0.6142, |
| "mean_token_accuracy": 0.808559387922287, |
| "num_tokens": 48215385.0, |
| "step": 737 |
| }, |
| { |
| "epoch": 0.5038402457757296, |
| "grad_norm": 0.7181544303894043, |
| "learning_rate": 2.9008018037909735e-06, |
| "loss": 0.6381, |
| "mean_token_accuracy": 0.8024193644523621, |
| "num_tokens": 48280921.0, |
| "step": 738 |
| }, |
| { |
| "epoch": 0.5045229561358594, |
| "grad_norm": 0.6995269656181335, |
| "learning_rate": 2.8949142679950848e-06, |
| "loss": 0.5899, |
| "mean_token_accuracy": 0.816242054104805, |
| "num_tokens": 48346457.0, |
| "step": 739 |
| }, |
| { |
| "epoch": 0.5052056664959891, |
| "grad_norm": 0.7697423100471497, |
| "learning_rate": 2.889024485055657e-06, |
| "loss": 0.601, |
| "mean_token_accuracy": 0.813996821641922, |
| "num_tokens": 48411993.0, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5058883768561188, |
| "grad_norm": 0.7361392974853516, |
| "learning_rate": 2.883132488486769e-06, |
| "loss": 0.5846, |
| "mean_token_accuracy": 0.8176256269216537, |
| "num_tokens": 48477020.0, |
| "step": 741 |
| }, |
| { |
| "epoch": 0.5065710872162486, |
| "grad_norm": 0.7436454892158508, |
| "learning_rate": 2.8772383118150946e-06, |
| "loss": 0.6126, |
| "mean_token_accuracy": 0.8058253973722458, |
| "num_tokens": 48542556.0, |
| "step": 742 |
| }, |
| { |
| "epoch": 0.5072537975763782, |
| "grad_norm": 0.7757138013839722, |
| "learning_rate": 2.871341988579714e-06, |
| "loss": 0.6233, |
| "mean_token_accuracy": 0.8082896620035172, |
| "num_tokens": 48607806.0, |
| "step": 743 |
| }, |
| { |
| "epoch": 0.5079365079365079, |
| "grad_norm": 0.7233085632324219, |
| "learning_rate": 2.86544355233192e-06, |
| "loss": 0.6005, |
| "mean_token_accuracy": 0.8126832842826843, |
| "num_tokens": 48673342.0, |
| "step": 744 |
| }, |
| { |
| "epoch": 0.5086192182966377, |
| "grad_norm": 0.722743570804596, |
| "learning_rate": 2.859543036635031e-06, |
| "loss": 0.6606, |
| "mean_token_accuracy": 0.7941257357597351, |
| "num_tokens": 48738878.0, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.5093019286567674, |
| "grad_norm": 0.7590633034706116, |
| "learning_rate": 2.8536404750641963e-06, |
| "loss": 0.6346, |
| "mean_token_accuracy": 0.8029753863811493, |
| "num_tokens": 48804385.0, |
| "step": 746 |
| }, |
| { |
| "epoch": 0.5099846390168971, |
| "grad_norm": 0.7025012373924255, |
| "learning_rate": 2.8477359012062067e-06, |
| "loss": 0.6003, |
| "mean_token_accuracy": 0.8105480968952179, |
| "num_tokens": 48869610.0, |
| "step": 747 |
| }, |
| { |
| "epoch": 0.5106673493770268, |
| "grad_norm": 0.7078215479850769, |
| "learning_rate": 2.8418293486593044e-06, |
| "loss": 0.5977, |
| "mean_token_accuracy": 0.8118890523910522, |
| "num_tokens": 48935146.0, |
| "step": 748 |
| }, |
| { |
| "epoch": 0.5113500597371565, |
| "grad_norm": 0.7257028818130493, |
| "learning_rate": 2.8359208510329913e-06, |
| "loss": 0.6179, |
| "mean_token_accuracy": 0.8075818568468094, |
| "num_tokens": 49000682.0, |
| "step": 749 |
| }, |
| { |
| "epoch": 0.5120327700972862, |
| "grad_norm": 0.7726735472679138, |
| "learning_rate": 2.830010441947834e-06, |
| "loss": 0.6213, |
| "mean_token_accuracy": 0.8048478811979294, |
| "num_tokens": 49066218.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.512715480457416, |
| "grad_norm": 0.7362603545188904, |
| "learning_rate": 2.8240981550352785e-06, |
| "loss": 0.6196, |
| "mean_token_accuracy": 0.8084271550178528, |
| "num_tokens": 49131451.0, |
| "step": 751 |
| }, |
| { |
| "epoch": 0.5133981908175457, |
| "grad_norm": 0.7677423357963562, |
| "learning_rate": 2.818184023937456e-06, |
| "loss": 0.6294, |
| "mean_token_accuracy": 0.8077346086502075, |
| "num_tokens": 49196987.0, |
| "step": 752 |
| }, |
| { |
| "epoch": 0.5140809011776754, |
| "grad_norm": 0.7333040833473206, |
| "learning_rate": 2.81226808230699e-06, |
| "loss": 0.6175, |
| "mean_token_accuracy": 0.8068181872367859, |
| "num_tokens": 49262523.0, |
| "step": 753 |
| }, |
| { |
| "epoch": 0.514763611537805, |
| "grad_norm": 0.7260233163833618, |
| "learning_rate": 2.8063503638068073e-06, |
| "loss": 0.5838, |
| "mean_token_accuracy": 0.8187592774629593, |
| "num_tokens": 49327753.0, |
| "step": 754 |
| }, |
| { |
| "epoch": 0.5154463218979348, |
| "grad_norm": 0.7574732899665833, |
| "learning_rate": 2.8004309021099444e-06, |
| "loss": 0.617, |
| "mean_token_accuracy": 0.807459682226181, |
| "num_tokens": 49393289.0, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.5161290322580645, |
| "grad_norm": 0.7057220935821533, |
| "learning_rate": 2.79450973089936e-06, |
| "loss": 0.596, |
| "mean_token_accuracy": 0.8131316304206848, |
| "num_tokens": 49458588.0, |
| "step": 756 |
| }, |
| { |
| "epoch": 0.5168117426181942, |
| "grad_norm": 0.7108963131904602, |
| "learning_rate": 2.7885868838677364e-06, |
| "loss": 0.5688, |
| "mean_token_accuracy": 0.8220307976007462, |
| "num_tokens": 49524124.0, |
| "step": 757 |
| }, |
| { |
| "epoch": 0.517494452978324, |
| "grad_norm": 0.7996737957000732, |
| "learning_rate": 2.782662394717293e-06, |
| "loss": 0.5885, |
| "mean_token_accuracy": 0.8194495290517807, |
| "num_tokens": 49589660.0, |
| "step": 758 |
| }, |
| { |
| "epoch": 0.5181771633384537, |
| "grad_norm": 0.773198664188385, |
| "learning_rate": 2.7767362971595944e-06, |
| "loss": 0.6388, |
| "mean_token_accuracy": 0.8019306063652039, |
| "num_tokens": 49655196.0, |
| "step": 759 |
| }, |
| { |
| "epoch": 0.5188598736985833, |
| "grad_norm": 0.7262683510780334, |
| "learning_rate": 2.7708086249153565e-06, |
| "loss": 0.5985, |
| "mean_token_accuracy": 0.811354473233223, |
| "num_tokens": 49720732.0, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.5195425840587131, |
| "grad_norm": 0.7870587706565857, |
| "learning_rate": 2.764879411714256e-06, |
| "loss": 0.6536, |
| "mean_token_accuracy": 0.7973643690347672, |
| "num_tokens": 49785745.0, |
| "step": 761 |
| }, |
| { |
| "epoch": 0.5202252944188428, |
| "grad_norm": 0.7811471223831177, |
| "learning_rate": 2.7589486912947354e-06, |
| "loss": 0.6228, |
| "mean_token_accuracy": 0.8065677434206009, |
| "num_tokens": 49851242.0, |
| "step": 762 |
| }, |
| { |
| "epoch": 0.5209080047789725, |
| "grad_norm": 0.7129400372505188, |
| "learning_rate": 2.7530164974038176e-06, |
| "loss": 0.5993, |
| "mean_token_accuracy": 0.813798263669014, |
| "num_tokens": 49916778.0, |
| "step": 763 |
| }, |
| { |
| "epoch": 0.5215907151391023, |
| "grad_norm": 0.7337347269058228, |
| "learning_rate": 2.747082863796907e-06, |
| "loss": 0.6037, |
| "mean_token_accuracy": 0.8119270205497742, |
| "num_tokens": 49982301.0, |
| "step": 764 |
| }, |
| { |
| "epoch": 0.522273425499232, |
| "grad_norm": 0.768386960029602, |
| "learning_rate": 2.741147824237602e-06, |
| "loss": 0.6086, |
| "mean_token_accuracy": 0.809155061841011, |
| "num_tokens": 50047837.0, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.5229561358593616, |
| "grad_norm": 0.7903435230255127, |
| "learning_rate": 2.735211412497499e-06, |
| "loss": 0.6246, |
| "mean_token_accuracy": 0.805413007736206, |
| "num_tokens": 50113373.0, |
| "step": 766 |
| }, |
| { |
| "epoch": 0.5236388462194914, |
| "grad_norm": 0.7200343608856201, |
| "learning_rate": 2.7292736623560044e-06, |
| "loss": 0.582, |
| "mean_token_accuracy": 0.8190562874078751, |
| "num_tokens": 50178582.0, |
| "step": 767 |
| }, |
| { |
| "epoch": 0.5243215565796211, |
| "grad_norm": 0.7194651365280151, |
| "learning_rate": 2.7233346076001403e-06, |
| "loss": 0.593, |
| "mean_token_accuracy": 0.8154478222131729, |
| "num_tokens": 50244118.0, |
| "step": 768 |
| }, |
| { |
| "epoch": 0.5250042669397508, |
| "grad_norm": 0.7576190829277039, |
| "learning_rate": 2.717394282024351e-06, |
| "loss": 0.6074, |
| "mean_token_accuracy": 0.8109115362167358, |
| "num_tokens": 50309654.0, |
| "step": 769 |
| }, |
| { |
| "epoch": 0.5256869772998806, |
| "grad_norm": 0.7116931080818176, |
| "learning_rate": 2.711452719430313e-06, |
| "loss": 0.6229, |
| "mean_token_accuracy": 0.8080746233463287, |
| "num_tokens": 50375028.0, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5263696876600102, |
| "grad_norm": 0.7312449216842651, |
| "learning_rate": 2.705509953626741e-06, |
| "loss": 0.6039, |
| "mean_token_accuracy": 0.8116446733474731, |
| "num_tokens": 50440564.0, |
| "step": 771 |
| }, |
| { |
| "epoch": 0.5270523980201399, |
| "grad_norm": 0.7251054048538208, |
| "learning_rate": 2.6995660184291977e-06, |
| "loss": 0.6074, |
| "mean_token_accuracy": 0.8126221895217896, |
| "num_tokens": 50506100.0, |
| "step": 772 |
| }, |
| { |
| "epoch": 0.5277351083802697, |
| "grad_norm": 0.7374172806739807, |
| "learning_rate": 2.6936209476598977e-06, |
| "loss": 0.639, |
| "mean_token_accuracy": 0.8034860640764236, |
| "num_tokens": 50571534.0, |
| "step": 773 |
| }, |
| { |
| "epoch": 0.5284178187403994, |
| "grad_norm": 0.7465745210647583, |
| "learning_rate": 2.687674775147519e-06, |
| "loss": 0.6428, |
| "mean_token_accuracy": 0.8013654798269272, |
| "num_tokens": 50637070.0, |
| "step": 774 |
| }, |
| { |
| "epoch": 0.5291005291005291, |
| "grad_norm": 0.7335869073867798, |
| "learning_rate": 2.681727534727008e-06, |
| "loss": 0.649, |
| "mean_token_accuracy": 0.7997464686632156, |
| "num_tokens": 50702606.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.5297832394606589, |
| "grad_norm": 0.7344288229942322, |
| "learning_rate": 2.6757792602393885e-06, |
| "loss": 0.6295, |
| "mean_token_accuracy": 0.8041147440671921, |
| "num_tokens": 50768142.0, |
| "step": 776 |
| }, |
| { |
| "epoch": 0.5304659498207885, |
| "grad_norm": 0.7773926854133606, |
| "learning_rate": 2.669829985531566e-06, |
| "loss": 0.637, |
| "mean_token_accuracy": 0.8031677752733231, |
| "num_tokens": 50833678.0, |
| "step": 777 |
| }, |
| { |
| "epoch": 0.5311486601809182, |
| "grad_norm": 0.7365777492523193, |
| "learning_rate": 2.6638797444561415e-06, |
| "loss": 0.5796, |
| "mean_token_accuracy": 0.8170362859964371, |
| "num_tokens": 50899214.0, |
| "step": 778 |
| }, |
| { |
| "epoch": 0.531831370541048, |
| "grad_norm": 0.7678808569908142, |
| "learning_rate": 2.6579285708712103e-06, |
| "loss": 0.6381, |
| "mean_token_accuracy": 0.8041147440671921, |
| "num_tokens": 50964750.0, |
| "step": 779 |
| }, |
| { |
| "epoch": 0.5325140809011777, |
| "grad_norm": 0.7613728642463684, |
| "learning_rate": 2.6519764986401776e-06, |
| "loss": 0.598, |
| "mean_token_accuracy": 0.8142717480659485, |
| "num_tokens": 51030286.0, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.5331967912613074, |
| "grad_norm": 0.6854228973388672, |
| "learning_rate": 2.646023561631559e-06, |
| "loss": 0.5874, |
| "mean_token_accuracy": 0.815096527338028, |
| "num_tokens": 51095822.0, |
| "step": 781 |
| }, |
| { |
| "epoch": 0.5338795016214372, |
| "grad_norm": 0.7385619282722473, |
| "learning_rate": 2.6400697937187946e-06, |
| "loss": 0.6224, |
| "mean_token_accuracy": 0.8047562390565872, |
| "num_tokens": 51161358.0, |
| "step": 782 |
| }, |
| { |
| "epoch": 0.5345622119815668, |
| "grad_norm": 0.7568676471710205, |
| "learning_rate": 2.6341152287800475e-06, |
| "loss": 0.6509, |
| "mean_token_accuracy": 0.7998993992805481, |
| "num_tokens": 51226841.0, |
| "step": 783 |
| }, |
| { |
| "epoch": 0.5352449223416965, |
| "grad_norm": 0.7271560430526733, |
| "learning_rate": 2.628159900698022e-06, |
| "loss": 0.5857, |
| "mean_token_accuracy": 0.817341759800911, |
| "num_tokens": 51292377.0, |
| "step": 784 |
| }, |
| { |
| "epoch": 0.5359276327018262, |
| "grad_norm": 0.7313284277915955, |
| "learning_rate": 2.622203843359759e-06, |
| "loss": 0.5898, |
| "mean_token_accuracy": 0.8143208026885986, |
| "num_tokens": 51357356.0, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.536610343061956, |
| "grad_norm": 0.7274580597877502, |
| "learning_rate": 2.616247090656453e-06, |
| "loss": 0.612, |
| "mean_token_accuracy": 0.8110948204994202, |
| "num_tokens": 51422892.0, |
| "step": 786 |
| }, |
| { |
| "epoch": 0.5372930534220857, |
| "grad_norm": 0.7195629477500916, |
| "learning_rate": 2.610289676483254e-06, |
| "loss": 0.5761, |
| "mean_token_accuracy": 0.8174325376749039, |
| "num_tokens": 51488267.0, |
| "step": 787 |
| }, |
| { |
| "epoch": 0.5379757637822153, |
| "grad_norm": 0.7511928081512451, |
| "learning_rate": 2.6043316347390765e-06, |
| "loss": 0.6126, |
| "mean_token_accuracy": 0.8073833137750626, |
| "num_tokens": 51553803.0, |
| "step": 788 |
| }, |
| { |
| "epoch": 0.5386584741423451, |
| "grad_norm": 0.6966540217399597, |
| "learning_rate": 2.5983729993264033e-06, |
| "loss": 0.604, |
| "mean_token_accuracy": 0.811553031206131, |
| "num_tokens": 51619339.0, |
| "step": 789 |
| }, |
| { |
| "epoch": 0.5393411845024748, |
| "grad_norm": 0.7874122262001038, |
| "learning_rate": 2.5924138041510993e-06, |
| "loss": 0.633, |
| "mean_token_accuracy": 0.8046645969152451, |
| "num_tokens": 51684875.0, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5400238948626045, |
| "grad_norm": 0.7472220659255981, |
| "learning_rate": 2.586454083122212e-06, |
| "loss": 0.6023, |
| "mean_token_accuracy": 0.8128818422555923, |
| "num_tokens": 51750411.0, |
| "step": 791 |
| }, |
| { |
| "epoch": 0.5407066052227343, |
| "grad_norm": 0.747721791267395, |
| "learning_rate": 2.5804938701517825e-06, |
| "loss": 0.6402, |
| "mean_token_accuracy": 0.8023495376110077, |
| "num_tokens": 51815702.0, |
| "step": 792 |
| }, |
| { |
| "epoch": 0.541389315582864, |
| "grad_norm": 0.7737058401107788, |
| "learning_rate": 2.574533199154649e-06, |
| "loss": 0.673, |
| "mean_token_accuracy": 0.7901087552309036, |
| "num_tokens": 51881238.0, |
| "step": 793 |
| }, |
| { |
| "epoch": 0.5420720259429936, |
| "grad_norm": 0.7369558811187744, |
| "learning_rate": 2.5685721040482587e-06, |
| "loss": 0.6308, |
| "mean_token_accuracy": 0.8035927265882492, |
| "num_tokens": 51945928.0, |
| "step": 794 |
| }, |
| { |
| "epoch": 0.5427547363031234, |
| "grad_norm": 0.7183763384819031, |
| "learning_rate": 2.56261061875247e-06, |
| "loss": 0.5752, |
| "mean_token_accuracy": 0.8207653313875198, |
| "num_tokens": 52011454.0, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.5434374466632531, |
| "grad_norm": 0.7445209622383118, |
| "learning_rate": 2.5566487771893627e-06, |
| "loss": 0.6272, |
| "mean_token_accuracy": 0.805764302611351, |
| "num_tokens": 52076990.0, |
| "step": 796 |
| }, |
| { |
| "epoch": 0.5441201570233828, |
| "grad_norm": 0.7469929456710815, |
| "learning_rate": 2.5506866132830433e-06, |
| "loss": 0.6253, |
| "mean_token_accuracy": 0.8070320188999176, |
| "num_tokens": 52142526.0, |
| "step": 797 |
| }, |
| { |
| "epoch": 0.5448028673835126, |
| "grad_norm": 0.7637821435928345, |
| "learning_rate": 2.5447241609594524e-06, |
| "loss": 0.613, |
| "mean_token_accuracy": 0.8110882490873337, |
| "num_tokens": 52207756.0, |
| "step": 798 |
| }, |
| { |
| "epoch": 0.5454855777436423, |
| "grad_norm": 0.722944974899292, |
| "learning_rate": 2.538761454146173e-06, |
| "loss": 0.6254, |
| "mean_token_accuracy": 0.8025415539741516, |
| "num_tokens": 52273292.0, |
| "step": 799 |
| }, |
| { |
| "epoch": 0.5461682881037719, |
| "grad_norm": 0.7083418369293213, |
| "learning_rate": 2.5327985267722337e-06, |
| "loss": 0.5797, |
| "mean_token_accuracy": 0.8187222331762314, |
| "num_tokens": 52338411.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5468509984639017, |
| "grad_norm": 0.6983156204223633, |
| "learning_rate": 2.526835412767921e-06, |
| "loss": 0.6182, |
| "mean_token_accuracy": 0.8081211447715759, |
| "num_tokens": 52403361.0, |
| "step": 801 |
| }, |
| { |
| "epoch": 0.5475337088240314, |
| "grad_norm": 0.7576866149902344, |
| "learning_rate": 2.520872146064582e-06, |
| "loss": 0.6434, |
| "mean_token_accuracy": 0.8006476163864136, |
| "num_tokens": 52468897.0, |
| "step": 802 |
| }, |
| { |
| "epoch": 0.5482164191841611, |
| "grad_norm": 0.7381872534751892, |
| "learning_rate": 2.514908760594431e-06, |
| "loss": 0.6478, |
| "mean_token_accuracy": 0.8003574162721634, |
| "num_tokens": 52534433.0, |
| "step": 803 |
| }, |
| { |
| "epoch": 0.5488991295442909, |
| "grad_norm": 0.6863521337509155, |
| "learning_rate": 2.5089452902903616e-06, |
| "loss": 0.6015, |
| "mean_token_accuracy": 0.8113850206136703, |
| "num_tokens": 52599969.0, |
| "step": 804 |
| }, |
| { |
| "epoch": 0.5495818399044206, |
| "grad_norm": 0.7224480509757996, |
| "learning_rate": 2.502981769085748e-06, |
| "loss": 0.5932, |
| "mean_token_accuracy": 0.815065935254097, |
| "num_tokens": 52665021.0, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.5502645502645502, |
| "grad_norm": 0.7919376492500305, |
| "learning_rate": 2.4970182309142533e-06, |
| "loss": 0.6559, |
| "mean_token_accuracy": 0.7980355471372604, |
| "num_tokens": 52730429.0, |
| "step": 806 |
| }, |
| { |
| "epoch": 0.55094726062468, |
| "grad_norm": 0.7454491853713989, |
| "learning_rate": 2.491054709709639e-06, |
| "loss": 0.6195, |
| "mean_token_accuracy": 0.8066349029541016, |
| "num_tokens": 52795965.0, |
| "step": 807 |
| }, |
| { |
| "epoch": 0.5516299709848097, |
| "grad_norm": 0.7252407073974609, |
| "learning_rate": 2.4850912394055693e-06, |
| "loss": 0.6077, |
| "mean_token_accuracy": 0.8098803013563156, |
| "num_tokens": 52861340.0, |
| "step": 808 |
| }, |
| { |
| "epoch": 0.5523126813449394, |
| "grad_norm": 0.7148882746696472, |
| "learning_rate": 2.479127853935419e-06, |
| "loss": 0.6179, |
| "mean_token_accuracy": 0.8085928410291672, |
| "num_tokens": 52926258.0, |
| "step": 809 |
| }, |
| { |
| "epoch": 0.5529953917050692, |
| "grad_norm": 0.7061576247215271, |
| "learning_rate": 2.473164587232079e-06, |
| "loss": 0.5861, |
| "mean_token_accuracy": 0.8157074749469757, |
| "num_tokens": 52991794.0, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.5536781020651989, |
| "grad_norm": 0.7092587947845459, |
| "learning_rate": 2.4672014732277667e-06, |
| "loss": 0.5682, |
| "mean_token_accuracy": 0.8220918774604797, |
| "num_tokens": 53057330.0, |
| "step": 811 |
| }, |
| { |
| "epoch": 0.5543608124253285, |
| "grad_norm": 0.7507848739624023, |
| "learning_rate": 2.4612385458538276e-06, |
| "loss": 0.6013, |
| "mean_token_accuracy": 0.811751589179039, |
| "num_tokens": 53122866.0, |
| "step": 812 |
| }, |
| { |
| "epoch": 0.5550435227854583, |
| "grad_norm": 0.7575944662094116, |
| "learning_rate": 2.455275839040547e-06, |
| "loss": 0.6226, |
| "mean_token_accuracy": 0.8050311654806137, |
| "num_tokens": 53188402.0, |
| "step": 813 |
| }, |
| { |
| "epoch": 0.555726233145588, |
| "grad_norm": 0.7243123650550842, |
| "learning_rate": 2.4493133867169575e-06, |
| "loss": 0.6513, |
| "mean_token_accuracy": 0.8003964424133301, |
| "num_tokens": 53253425.0, |
| "step": 814 |
| }, |
| { |
| "epoch": 0.5564089435057177, |
| "grad_norm": 0.6894735097885132, |
| "learning_rate": 2.443351222810638e-06, |
| "loss": 0.5822, |
| "mean_token_accuracy": 0.8188735246658325, |
| "num_tokens": 53318734.0, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.5570916538658475, |
| "grad_norm": 0.7273776531219482, |
| "learning_rate": 2.43738938124753e-06, |
| "loss": 0.6333, |
| "mean_token_accuracy": 0.8024957329034805, |
| "num_tokens": 53384270.0, |
| "step": 816 |
| }, |
| { |
| "epoch": 0.5577743642259771, |
| "grad_norm": 0.7011674046516418, |
| "learning_rate": 2.4314278959517425e-06, |
| "loss": 0.6148, |
| "mean_token_accuracy": 0.8078524768352509, |
| "num_tokens": 53448239.0, |
| "step": 817 |
| }, |
| { |
| "epoch": 0.5584570745861068, |
| "grad_norm": 0.7421677708625793, |
| "learning_rate": 2.4254668008453513e-06, |
| "loss": 0.6289, |
| "mean_token_accuracy": 0.806909829378128, |
| "num_tokens": 53513775.0, |
| "step": 818 |
| }, |
| { |
| "epoch": 0.5591397849462365, |
| "grad_norm": 0.7313380241394043, |
| "learning_rate": 2.419506129848218e-06, |
| "loss": 0.5929, |
| "mean_token_accuracy": 0.8127902001142502, |
| "num_tokens": 53579311.0, |
| "step": 819 |
| }, |
| { |
| "epoch": 0.5598224953063663, |
| "grad_norm": 0.7254424691200256, |
| "learning_rate": 2.4135459168777887e-06, |
| "loss": 0.6211, |
| "mean_token_accuracy": 0.8071694821119308, |
| "num_tokens": 53644847.0, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.560505205666496, |
| "grad_norm": 0.7302435636520386, |
| "learning_rate": 2.407586195848901e-06, |
| "loss": 0.5977, |
| "mean_token_accuracy": 0.8155899941921234, |
| "num_tokens": 53710151.0, |
| "step": 821 |
| }, |
| { |
| "epoch": 0.5611879160266257, |
| "grad_norm": 0.7332073450088501, |
| "learning_rate": 2.4016270006735967e-06, |
| "loss": 0.6131, |
| "mean_token_accuracy": 0.8101572394371033, |
| "num_tokens": 53775637.0, |
| "step": 822 |
| }, |
| { |
| "epoch": 0.5618706263867554, |
| "grad_norm": 0.7568655014038086, |
| "learning_rate": 2.395668365260925e-06, |
| "loss": 0.6091, |
| "mean_token_accuracy": 0.8092314302921295, |
| "num_tokens": 53841173.0, |
| "step": 823 |
| }, |
| { |
| "epoch": 0.5625533367468851, |
| "grad_norm": 0.7010401487350464, |
| "learning_rate": 2.3897103235167465e-06, |
| "loss": 0.5522, |
| "mean_token_accuracy": 0.8247800469398499, |
| "num_tokens": 53906709.0, |
| "step": 824 |
| }, |
| { |
| "epoch": 0.5632360471070148, |
| "grad_norm": 0.7365289330482483, |
| "learning_rate": 2.383752909343547e-06, |
| "loss": 0.594, |
| "mean_token_accuracy": 0.8127911686897278, |
| "num_tokens": 53972052.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.5639187574671446, |
| "grad_norm": 0.7445287108421326, |
| "learning_rate": 2.377796156640242e-06, |
| "loss": 0.6276, |
| "mean_token_accuracy": 0.8042827546596527, |
| "num_tokens": 54037588.0, |
| "step": 826 |
| }, |
| { |
| "epoch": 0.5646014678272743, |
| "grad_norm": 0.7611055970191956, |
| "learning_rate": 2.3718400993019793e-06, |
| "loss": 0.6262, |
| "mean_token_accuracy": 0.8061387240886688, |
| "num_tokens": 54102947.0, |
| "step": 827 |
| }, |
| { |
| "epoch": 0.565284178187404, |
| "grad_norm": 0.7284524440765381, |
| "learning_rate": 2.3658847712199524e-06, |
| "loss": 0.6047, |
| "mean_token_accuracy": 0.8119959682226181, |
| "num_tokens": 54168483.0, |
| "step": 828 |
| }, |
| { |
| "epoch": 0.5659668885475337, |
| "grad_norm": 0.7890021800994873, |
| "learning_rate": 2.359930206281207e-06, |
| "loss": 0.5932, |
| "mean_token_accuracy": 0.8162310272455215, |
| "num_tokens": 54233578.0, |
| "step": 829 |
| }, |
| { |
| "epoch": 0.5666495989076634, |
| "grad_norm": 0.7527608871459961, |
| "learning_rate": 2.3539764383684412e-06, |
| "loss": 0.5985, |
| "mean_token_accuracy": 0.8142717480659485, |
| "num_tokens": 54299114.0, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5673323092677931, |
| "grad_norm": 0.7443063855171204, |
| "learning_rate": 2.348023501359823e-06, |
| "loss": 0.6502, |
| "mean_token_accuracy": 0.7986772954463959, |
| "num_tokens": 54364650.0, |
| "step": 831 |
| }, |
| { |
| "epoch": 0.5680150196279229, |
| "grad_norm": 0.6920120716094971, |
| "learning_rate": 2.3420714291287905e-06, |
| "loss": 0.5764, |
| "mean_token_accuracy": 0.8207630664110184, |
| "num_tokens": 54430186.0, |
| "step": 832 |
| }, |
| { |
| "epoch": 0.5686977299880526, |
| "grad_norm": 0.7068764567375183, |
| "learning_rate": 2.3361202555438594e-06, |
| "loss": 0.5915, |
| "mean_token_accuracy": 0.8150812536478043, |
| "num_tokens": 54495722.0, |
| "step": 833 |
| }, |
| { |
| "epoch": 0.5693804403481822, |
| "grad_norm": 0.8468170762062073, |
| "learning_rate": 2.330170014468434e-06, |
| "loss": 0.6343, |
| "mean_token_accuracy": 0.8036412596702576, |
| "num_tokens": 54561258.0, |
| "step": 834 |
| }, |
| { |
| "epoch": 0.570063150708312, |
| "grad_norm": 0.6984657049179077, |
| "learning_rate": 2.3242207397606124e-06, |
| "loss": 0.5634, |
| "mean_token_accuracy": 0.8241946548223495, |
| "num_tokens": 54626564.0, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.5707458610684417, |
| "grad_norm": 0.7546541690826416, |
| "learning_rate": 2.3182724652729922e-06, |
| "loss": 0.6029, |
| "mean_token_accuracy": 0.8130804002285004, |
| "num_tokens": 54692100.0, |
| "step": 836 |
| }, |
| { |
| "epoch": 0.5714285714285714, |
| "grad_norm": 0.7233748435974121, |
| "learning_rate": 2.312325224852481e-06, |
| "loss": 0.5796, |
| "mean_token_accuracy": 0.8184720128774643, |
| "num_tokens": 54757636.0, |
| "step": 837 |
| }, |
| { |
| "epoch": 0.5721112817887012, |
| "grad_norm": 0.7522431015968323, |
| "learning_rate": 2.3063790523401035e-06, |
| "loss": 0.6447, |
| "mean_token_accuracy": 0.7997826039791107, |
| "num_tokens": 54822970.0, |
| "step": 838 |
| }, |
| { |
| "epoch": 0.5727939921488309, |
| "grad_norm": 0.7364733815193176, |
| "learning_rate": 2.300433981570803e-06, |
| "loss": 0.5975, |
| "mean_token_accuracy": 0.8118279576301575, |
| "num_tokens": 54888506.0, |
| "step": 839 |
| }, |
| { |
| "epoch": 0.5734767025089605, |
| "grad_norm": 0.7409289479255676, |
| "learning_rate": 2.2944900463732594e-06, |
| "loss": 0.5998, |
| "mean_token_accuracy": 0.8123625367879868, |
| "num_tokens": 54954042.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.5741594128690903, |
| "grad_norm": 0.7288930416107178, |
| "learning_rate": 2.2885472805696883e-06, |
| "loss": 0.5874, |
| "mean_token_accuracy": 0.8148674219846725, |
| "num_tokens": 55019578.0, |
| "step": 841 |
| }, |
| { |
| "epoch": 0.57484212322922, |
| "grad_norm": 0.7401166558265686, |
| "learning_rate": 2.28260571797565e-06, |
| "loss": 0.6255, |
| "mean_token_accuracy": 0.8048631548881531, |
| "num_tokens": 55085114.0, |
| "step": 842 |
| }, |
| { |
| "epoch": 0.5755248335893497, |
| "grad_norm": 0.7475305795669556, |
| "learning_rate": 2.2766653923998605e-06, |
| "loss": 0.6149, |
| "mean_token_accuracy": 0.8096132725477219, |
| "num_tokens": 55150650.0, |
| "step": 843 |
| }, |
| { |
| "epoch": 0.5762075439494795, |
| "grad_norm": 0.6970760226249695, |
| "learning_rate": 2.270726337643997e-06, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8047256916761398, |
| "num_tokens": 55216186.0, |
| "step": 844 |
| }, |
| { |
| "epoch": 0.5768902543096092, |
| "grad_norm": 0.7151328325271606, |
| "learning_rate": 2.264788587502502e-06, |
| "loss": 0.5999, |
| "mean_token_accuracy": 0.8139662742614746, |
| "num_tokens": 55281722.0, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.5775729646697388, |
| "grad_norm": 0.6835965514183044, |
| "learning_rate": 2.258852175762399e-06, |
| "loss": 0.6157, |
| "mean_token_accuracy": 0.8080445528030396, |
| "num_tokens": 55347183.0, |
| "step": 846 |
| }, |
| { |
| "epoch": 0.5782556750298686, |
| "grad_norm": 0.7275755405426025, |
| "learning_rate": 2.2529171362030943e-06, |
| "loss": 0.6363, |
| "mean_token_accuracy": 0.799471452832222, |
| "num_tokens": 55412714.0, |
| "step": 847 |
| }, |
| { |
| "epoch": 0.5789383853899983, |
| "grad_norm": 0.7379560470581055, |
| "learning_rate": 2.246983502596183e-06, |
| "loss": 0.6503, |
| "mean_token_accuracy": 0.7994582206010818, |
| "num_tokens": 55478058.0, |
| "step": 848 |
| }, |
| { |
| "epoch": 0.579621095750128, |
| "grad_norm": 0.7486806511878967, |
| "learning_rate": 2.241051308705265e-06, |
| "loss": 0.6229, |
| "mean_token_accuracy": 0.8074291348457336, |
| "num_tokens": 55543594.0, |
| "step": 849 |
| }, |
| { |
| "epoch": 0.5803038061102577, |
| "grad_norm": 0.6972367763519287, |
| "learning_rate": 2.235120588285746e-06, |
| "loss": 0.5471, |
| "mean_token_accuracy": 0.8273751586675644, |
| "num_tokens": 55608322.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.5809865164703875, |
| "grad_norm": 0.7290424704551697, |
| "learning_rate": 2.229191375084644e-06, |
| "loss": 0.5765, |
| "mean_token_accuracy": 0.8169446438550949, |
| "num_tokens": 55673858.0, |
| "step": 851 |
| }, |
| { |
| "epoch": 0.5816692268305171, |
| "grad_norm": 0.7283189296722412, |
| "learning_rate": 2.223263702840406e-06, |
| "loss": 0.5646, |
| "mean_token_accuracy": 0.8223196864128113, |
| "num_tokens": 55739371.0, |
| "step": 852 |
| }, |
| { |
| "epoch": 0.5823519371906468, |
| "grad_norm": 0.7880806922912598, |
| "learning_rate": 2.2173376052827077e-06, |
| "loss": 0.6166, |
| "mean_token_accuracy": 0.807247519493103, |
| "num_tokens": 55804876.0, |
| "step": 853 |
| }, |
| { |
| "epoch": 0.5830346475507766, |
| "grad_norm": 0.7274592518806458, |
| "learning_rate": 2.2114131161322645e-06, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.8126132190227509, |
| "num_tokens": 55870223.0, |
| "step": 854 |
| }, |
| { |
| "epoch": 0.5837173579109063, |
| "grad_norm": 0.7590665221214294, |
| "learning_rate": 2.2054902691006407e-06, |
| "loss": 0.6013, |
| "mean_token_accuracy": 0.8119831830263138, |
| "num_tokens": 55935728.0, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.584400068271036, |
| "grad_norm": 0.7268674373626709, |
| "learning_rate": 2.199569097890055e-06, |
| "loss": 0.5879, |
| "mean_token_accuracy": 0.8171060085296631, |
| "num_tokens": 56000505.0, |
| "step": 856 |
| }, |
| { |
| "epoch": 0.5850827786311658, |
| "grad_norm": 0.7419127821922302, |
| "learning_rate": 2.1936496361931935e-06, |
| "loss": 0.662, |
| "mean_token_accuracy": 0.7967375367879868, |
| "num_tokens": 56066041.0, |
| "step": 857 |
| }, |
| { |
| "epoch": 0.5857654889912954, |
| "grad_norm": 0.7568214535713196, |
| "learning_rate": 2.187731917693011e-06, |
| "loss": 0.5984, |
| "mean_token_accuracy": 0.8127596527338028, |
| "num_tokens": 56131577.0, |
| "step": 858 |
| }, |
| { |
| "epoch": 0.5864481993514251, |
| "grad_norm": 0.6971532702445984, |
| "learning_rate": 2.1818159760625444e-06, |
| "loss": 0.5897, |
| "mean_token_accuracy": 0.8156934976577759, |
| "num_tokens": 56197064.0, |
| "step": 859 |
| }, |
| { |
| "epoch": 0.5871309097115549, |
| "grad_norm": 0.7272664308547974, |
| "learning_rate": 2.1759018449647224e-06, |
| "loss": 0.5836, |
| "mean_token_accuracy": 0.8184851855039597, |
| "num_tokens": 56262485.0, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.5878136200716846, |
| "grad_norm": 0.725836992263794, |
| "learning_rate": 2.1699895580521666e-06, |
| "loss": 0.6278, |
| "mean_token_accuracy": 0.804779976606369, |
| "num_tokens": 56327459.0, |
| "step": 861 |
| }, |
| { |
| "epoch": 0.5884963304318143, |
| "grad_norm": 0.7364003658294678, |
| "learning_rate": 2.164079148967009e-06, |
| "loss": 0.6447, |
| "mean_token_accuracy": 0.8013349324464798, |
| "num_tokens": 56392995.0, |
| "step": 862 |
| }, |
| { |
| "epoch": 0.589179040791944, |
| "grad_norm": 0.732744574546814, |
| "learning_rate": 2.158170651340696e-06, |
| "loss": 0.598, |
| "mean_token_accuracy": 0.8127138316631317, |
| "num_tokens": 56458531.0, |
| "step": 863 |
| }, |
| { |
| "epoch": 0.5898617511520737, |
| "grad_norm": 0.7075684070587158, |
| "learning_rate": 2.1522640987937937e-06, |
| "loss": 0.5823, |
| "mean_token_accuracy": 0.8167233616113663, |
| "num_tokens": 56523016.0, |
| "step": 864 |
| }, |
| { |
| "epoch": 0.5905444615122034, |
| "grad_norm": 0.7445662021636963, |
| "learning_rate": 2.146359524935804e-06, |
| "loss": 0.6338, |
| "mean_token_accuracy": 0.8040997236967087, |
| "num_tokens": 56588299.0, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.5912271718723332, |
| "grad_norm": 0.7430678009986877, |
| "learning_rate": 2.1404569633649703e-06, |
| "loss": 0.58, |
| "mean_token_accuracy": 0.8178457915782928, |
| "num_tokens": 56653835.0, |
| "step": 866 |
| }, |
| { |
| "epoch": 0.5919098822324629, |
| "grad_norm": 0.7150656580924988, |
| "learning_rate": 2.1345564476680807e-06, |
| "loss": 0.5607, |
| "mean_token_accuracy": 0.8213276714086533, |
| "num_tokens": 56718903.0, |
| "step": 867 |
| }, |
| { |
| "epoch": 0.5925925925925926, |
| "grad_norm": 0.7253732085227966, |
| "learning_rate": 2.1286580114202866e-06, |
| "loss": 0.5935, |
| "mean_token_accuracy": 0.815096527338028, |
| "num_tokens": 56784439.0, |
| "step": 868 |
| }, |
| { |
| "epoch": 0.5932753029527223, |
| "grad_norm": 0.7164980173110962, |
| "learning_rate": 2.122761688184906e-06, |
| "loss": 0.5905, |
| "mean_token_accuracy": 0.815692201256752, |
| "num_tokens": 56849975.0, |
| "step": 869 |
| }, |
| { |
| "epoch": 0.593958013312852, |
| "grad_norm": 0.7268425822257996, |
| "learning_rate": 2.1168675115132317e-06, |
| "loss": 0.6602, |
| "mean_token_accuracy": 0.796783372759819, |
| "num_tokens": 56915511.0, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.5946407236729817, |
| "grad_norm": 0.7295462489128113, |
| "learning_rate": 2.1109755149443436e-06, |
| "loss": 0.5928, |
| "mean_token_accuracy": 0.8162573277950287, |
| "num_tokens": 56981047.0, |
| "step": 871 |
| }, |
| { |
| "epoch": 0.5953234340331115, |
| "grad_norm": 0.7488856315612793, |
| "learning_rate": 2.1050857320049165e-06, |
| "loss": 0.6352, |
| "mean_token_accuracy": 0.8048478811979294, |
| "num_tokens": 57046583.0, |
| "step": 872 |
| }, |
| { |
| "epoch": 0.5960061443932412, |
| "grad_norm": 0.7265530824661255, |
| "learning_rate": 2.099198196209027e-06, |
| "loss": 0.627, |
| "mean_token_accuracy": 0.8041452914476395, |
| "num_tokens": 57112119.0, |
| "step": 873 |
| }, |
| { |
| "epoch": 0.5966888547533709, |
| "grad_norm": 0.7198655009269714, |
| "learning_rate": 2.093312941057968e-06, |
| "loss": 0.6357, |
| "mean_token_accuracy": 0.8020598292350769, |
| "num_tokens": 57177223.0, |
| "step": 874 |
| }, |
| { |
| "epoch": 0.5973715651135006, |
| "grad_norm": 0.7067784070968628, |
| "learning_rate": 2.0874300000400546e-06, |
| "loss": 0.6042, |
| "mean_token_accuracy": 0.8123319894075394, |
| "num_tokens": 57242759.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.5980542754736303, |
| "grad_norm": 0.7130993604660034, |
| "learning_rate": 2.0815494066304307e-06, |
| "loss": 0.5653, |
| "mean_token_accuracy": 0.8243982046842575, |
| "num_tokens": 57308295.0, |
| "step": 876 |
| }, |
| { |
| "epoch": 0.59873698583376, |
| "grad_norm": 0.7170623540878296, |
| "learning_rate": 2.0756711942908853e-06, |
| "loss": 0.5961, |
| "mean_token_accuracy": 0.8153766393661499, |
| "num_tokens": 57373249.0, |
| "step": 877 |
| }, |
| { |
| "epoch": 0.5994196961938898, |
| "grad_norm": 0.744687020778656, |
| "learning_rate": 2.069795396469659e-06, |
| "loss": 0.5967, |
| "mean_token_accuracy": 0.8129734843969345, |
| "num_tokens": 57438785.0, |
| "step": 878 |
| }, |
| { |
| "epoch": 0.6001024065540195, |
| "grad_norm": 0.7068538665771484, |
| "learning_rate": 2.063922046601249e-06, |
| "loss": 0.5869, |
| "mean_token_accuracy": 0.8159824013710022, |
| "num_tokens": 57504321.0, |
| "step": 879 |
| }, |
| { |
| "epoch": 0.6007851169141492, |
| "grad_norm": 0.6912579536437988, |
| "learning_rate": 2.058051178106228e-06, |
| "loss": 0.5619, |
| "mean_token_accuracy": 0.8215725719928741, |
| "num_tokens": 57569857.0, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.6014678272742788, |
| "grad_norm": 0.7329758405685425, |
| "learning_rate": 2.0521828243910476e-06, |
| "loss": 0.5746, |
| "mean_token_accuracy": 0.8185483813285828, |
| "num_tokens": 57635393.0, |
| "step": 881 |
| }, |
| { |
| "epoch": 0.6021505376344086, |
| "grad_norm": 0.7120118141174316, |
| "learning_rate": 2.0463170188478473e-06, |
| "loss": 0.5934, |
| "mean_token_accuracy": 0.8147895932197571, |
| "num_tokens": 57700789.0, |
| "step": 882 |
| }, |
| { |
| "epoch": 0.6028332479945383, |
| "grad_norm": 0.7233956456184387, |
| "learning_rate": 2.0404537948542697e-06, |
| "loss": 0.5776, |
| "mean_token_accuracy": 0.8192051500082016, |
| "num_tokens": 57766325.0, |
| "step": 883 |
| }, |
| { |
| "epoch": 0.603515958354668, |
| "grad_norm": 0.7008931040763855, |
| "learning_rate": 2.0345931857732688e-06, |
| "loss": 0.6225, |
| "mean_token_accuracy": 0.8070472925901413, |
| "num_tokens": 57831861.0, |
| "step": 884 |
| }, |
| { |
| "epoch": 0.6041986687147978, |
| "grad_norm": 0.7301531434059143, |
| "learning_rate": 2.0287352249529155e-06, |
| "loss": 0.6238, |
| "mean_token_accuracy": 0.8049700707197189, |
| "num_tokens": 57897397.0, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.6048813790749274, |
| "grad_norm": 0.7681100964546204, |
| "learning_rate": 2.0228799457262144e-06, |
| "loss": 0.6269, |
| "mean_token_accuracy": 0.8048326075077057, |
| "num_tokens": 57962933.0, |
| "step": 886 |
| }, |
| { |
| "epoch": 0.6055640894350571, |
| "grad_norm": 0.7019287347793579, |
| "learning_rate": 2.017027381410912e-06, |
| "loss": 0.5653, |
| "mean_token_accuracy": 0.8218242228031158, |
| "num_tokens": 58028420.0, |
| "step": 887 |
| }, |
| { |
| "epoch": 0.6062467997951869, |
| "grad_norm": 0.6995770931243896, |
| "learning_rate": 2.011177565309302e-06, |
| "loss": 0.6069, |
| "mean_token_accuracy": 0.8123671561479568, |
| "num_tokens": 58093581.0, |
| "step": 888 |
| }, |
| { |
| "epoch": 0.6069295101553166, |
| "grad_norm": 0.7418209314346313, |
| "learning_rate": 2.0053305307080447e-06, |
| "loss": 0.6059, |
| "mean_token_accuracy": 0.8123618960380554, |
| "num_tokens": 58159091.0, |
| "step": 889 |
| }, |
| { |
| "epoch": 0.6076122205154463, |
| "grad_norm": 0.8020544052124023, |
| "learning_rate": 1.9994863108779723e-06, |
| "loss": 0.6168, |
| "mean_token_accuracy": 0.8074444085359573, |
| "num_tokens": 58224627.0, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6082949308755761, |
| "grad_norm": 0.7087436318397522, |
| "learning_rate": 1.9936449390738976e-06, |
| "loss": 0.6054, |
| "mean_token_accuracy": 0.8124083578586578, |
| "num_tokens": 58290163.0, |
| "step": 891 |
| }, |
| { |
| "epoch": 0.6089776412357057, |
| "grad_norm": 0.7432106733322144, |
| "learning_rate": 1.9878064485344307e-06, |
| "loss": 0.6097, |
| "mean_token_accuracy": 0.811003178358078, |
| "num_tokens": 58355699.0, |
| "step": 892 |
| }, |
| { |
| "epoch": 0.6096603515958354, |
| "grad_norm": 0.7401469349861145, |
| "learning_rate": 1.981970872481787e-06, |
| "loss": 0.6295, |
| "mean_token_accuracy": 0.803007110953331, |
| "num_tokens": 58421022.0, |
| "step": 893 |
| }, |
| { |
| "epoch": 0.6103430619559652, |
| "grad_norm": 0.704863429069519, |
| "learning_rate": 1.9761382441215927e-06, |
| "loss": 0.593, |
| "mean_token_accuracy": 0.8138948082923889, |
| "num_tokens": 58485866.0, |
| "step": 894 |
| }, |
| { |
| "epoch": 0.6110257723160949, |
| "grad_norm": 0.678066611289978, |
| "learning_rate": 1.9703085966427077e-06, |
| "loss": 0.5684, |
| "mean_token_accuracy": 0.821878045797348, |
| "num_tokens": 58551402.0, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.6117084826762246, |
| "grad_norm": 0.7411065697669983, |
| "learning_rate": 1.9644819632170267e-06, |
| "loss": 0.6253, |
| "mean_token_accuracy": 0.8053397685289383, |
| "num_tokens": 58616897.0, |
| "step": 896 |
| }, |
| { |
| "epoch": 0.6123911930363544, |
| "grad_norm": 0.7031525373458862, |
| "learning_rate": 1.9586583769992924e-06, |
| "loss": 0.5742, |
| "mean_token_accuracy": 0.8187011182308197, |
| "num_tokens": 58682433.0, |
| "step": 897 |
| }, |
| { |
| "epoch": 0.613073903396484, |
| "grad_norm": 0.749374508857727, |
| "learning_rate": 1.952837871126912e-06, |
| "loss": 0.6343, |
| "mean_token_accuracy": 0.8025873750448227, |
| "num_tokens": 58747969.0, |
| "step": 898 |
| }, |
| { |
| "epoch": 0.6137566137566137, |
| "grad_norm": 0.7202054262161255, |
| "learning_rate": 1.947020478719763e-06, |
| "loss": 0.6213, |
| "mean_token_accuracy": 0.8076076060533524, |
| "num_tokens": 58812802.0, |
| "step": 899 |
| }, |
| { |
| "epoch": 0.6144393241167435, |
| "grad_norm": 0.7742454409599304, |
| "learning_rate": 1.9412062328800044e-06, |
| "loss": 0.6569, |
| "mean_token_accuracy": 0.7975776046514511, |
| "num_tokens": 58878338.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6151220344768732, |
| "grad_norm": 0.7440277338027954, |
| "learning_rate": 1.9353951666918957e-06, |
| "loss": 0.6077, |
| "mean_token_accuracy": 0.808162271976471, |
| "num_tokens": 58943874.0, |
| "step": 901 |
| }, |
| { |
| "epoch": 0.6158047448370029, |
| "grad_norm": 0.7421770095825195, |
| "learning_rate": 1.929587313221599e-06, |
| "loss": 0.603, |
| "mean_token_accuracy": 0.8112628310918808, |
| "num_tokens": 59009410.0, |
| "step": 902 |
| }, |
| { |
| "epoch": 0.6164874551971327, |
| "grad_norm": 0.7264229655265808, |
| "learning_rate": 1.9237827055169963e-06, |
| "loss": 0.6386, |
| "mean_token_accuracy": 0.8024193644523621, |
| "num_tokens": 59074946.0, |
| "step": 903 |
| }, |
| { |
| "epoch": 0.6171701655572623, |
| "grad_norm": 0.7455774545669556, |
| "learning_rate": 1.917981376607502e-06, |
| "loss": 0.6415, |
| "mean_token_accuracy": 0.8024804592132568, |
| "num_tokens": 59140482.0, |
| "step": 904 |
| }, |
| { |
| "epoch": 0.617852875917392, |
| "grad_norm": 0.7394263744354248, |
| "learning_rate": 1.912183359503873e-06, |
| "loss": 0.5846, |
| "mean_token_accuracy": 0.8150625228881836, |
| "num_tokens": 59205527.0, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.6185355862775218, |
| "grad_norm": 0.7652600407600403, |
| "learning_rate": 1.9063886871980215e-06, |
| "loss": 0.6415, |
| "mean_token_accuracy": 0.8018143624067307, |
| "num_tokens": 59270685.0, |
| "step": 906 |
| }, |
| { |
| "epoch": 0.6192182966376515, |
| "grad_norm": 0.7579291462898254, |
| "learning_rate": 1.9005973926628256e-06, |
| "loss": 0.6439, |
| "mean_token_accuracy": 0.8011413216590881, |
| "num_tokens": 59336075.0, |
| "step": 907 |
| }, |
| { |
| "epoch": 0.6199010069977812, |
| "grad_norm": 0.6949485540390015, |
| "learning_rate": 1.894809508851944e-06, |
| "loss": 0.5629, |
| "mean_token_accuracy": 0.823924720287323, |
| "num_tokens": 59401611.0, |
| "step": 908 |
| }, |
| { |
| "epoch": 0.620583717357911, |
| "grad_norm": 0.7305551767349243, |
| "learning_rate": 1.8890250686996299e-06, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.806849792599678, |
| "num_tokens": 59466961.0, |
| "step": 909 |
| }, |
| { |
| "epoch": 0.6212664277180406, |
| "grad_norm": 0.7133076190948486, |
| "learning_rate": 1.8832441051205366e-06, |
| "loss": 0.6064, |
| "mean_token_accuracy": 0.8111100941896439, |
| "num_tokens": 59532497.0, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6219491380781703, |
| "grad_norm": 0.7591602802276611, |
| "learning_rate": 1.8774666510095394e-06, |
| "loss": 0.6046, |
| "mean_token_accuracy": 0.8109726309776306, |
| "num_tokens": 59598033.0, |
| "step": 911 |
| }, |
| { |
| "epoch": 0.6226318484383001, |
| "grad_norm": 0.7252746820449829, |
| "learning_rate": 1.871692739241543e-06, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8006934374570847, |
| "num_tokens": 59663569.0, |
| "step": 912 |
| }, |
| { |
| "epoch": 0.6233145587984298, |
| "grad_norm": 0.7542442083358765, |
| "learning_rate": 1.8659224026712926e-06, |
| "loss": 0.6207, |
| "mean_token_accuracy": 0.8062988817691803, |
| "num_tokens": 59729105.0, |
| "step": 913 |
| }, |
| { |
| "epoch": 0.6239972691585595, |
| "grad_norm": 0.6936948299407959, |
| "learning_rate": 1.860155674133194e-06, |
| "loss": 0.5966, |
| "mean_token_accuracy": 0.815292090177536, |
| "num_tokens": 59794521.0, |
| "step": 914 |
| }, |
| { |
| "epoch": 0.6246799795186891, |
| "grad_norm": 0.7454006671905518, |
| "learning_rate": 1.8543925864411203e-06, |
| "loss": 0.6305, |
| "mean_token_accuracy": 0.8039772808551788, |
| "num_tokens": 59860057.0, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.6253626898788189, |
| "grad_norm": 0.7165136933326721, |
| "learning_rate": 1.8486331723882261e-06, |
| "loss": 0.5786, |
| "mean_token_accuracy": 0.8195000439882278, |
| "num_tokens": 59925480.0, |
| "step": 916 |
| }, |
| { |
| "epoch": 0.6260454002389486, |
| "grad_norm": 0.7344151139259338, |
| "learning_rate": 1.8428774647467654e-06, |
| "loss": 0.6229, |
| "mean_token_accuracy": 0.804817333817482, |
| "num_tokens": 59991016.0, |
| "step": 917 |
| }, |
| { |
| "epoch": 0.6267281105990783, |
| "grad_norm": 0.7066066861152649, |
| "learning_rate": 1.8371254962679008e-06, |
| "loss": 0.6018, |
| "mean_token_accuracy": 0.8117974102497101, |
| "num_tokens": 60056552.0, |
| "step": 918 |
| }, |
| { |
| "epoch": 0.6274108209592081, |
| "grad_norm": 0.7304503321647644, |
| "learning_rate": 1.831377299681516e-06, |
| "loss": 0.599, |
| "mean_token_accuracy": 0.8132178634405136, |
| "num_tokens": 60122088.0, |
| "step": 919 |
| }, |
| { |
| "epoch": 0.6280935313193378, |
| "grad_norm": 0.7016748785972595, |
| "learning_rate": 1.8256329076960345e-06, |
| "loss": 0.5527, |
| "mean_token_accuracy": 0.8235010355710983, |
| "num_tokens": 60187603.0, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.6287762416794674, |
| "grad_norm": 0.7145335674285889, |
| "learning_rate": 1.8198923529982314e-06, |
| "loss": 0.5926, |
| "mean_token_accuracy": 0.8130956739187241, |
| "num_tokens": 60253139.0, |
| "step": 921 |
| }, |
| { |
| "epoch": 0.6294589520395972, |
| "grad_norm": 0.7018191814422607, |
| "learning_rate": 1.8141556682530437e-06, |
| "loss": 0.5884, |
| "mean_token_accuracy": 0.8140273690223694, |
| "num_tokens": 60318675.0, |
| "step": 922 |
| }, |
| { |
| "epoch": 0.6301416623997269, |
| "grad_norm": 0.7363874316215515, |
| "learning_rate": 1.8084228861033898e-06, |
| "loss": 0.5798, |
| "mean_token_accuracy": 0.8173789381980896, |
| "num_tokens": 60383787.0, |
| "step": 923 |
| }, |
| { |
| "epoch": 0.6308243727598566, |
| "grad_norm": 0.7196556925773621, |
| "learning_rate": 1.8026940391699826e-06, |
| "loss": 0.6073, |
| "mean_token_accuracy": 0.8098729252815247, |
| "num_tokens": 60449323.0, |
| "step": 924 |
| }, |
| { |
| "epoch": 0.6315070831199864, |
| "grad_norm": 0.7288717031478882, |
| "learning_rate": 1.796969160051139e-06, |
| "loss": 0.6005, |
| "mean_token_accuracy": 0.811950147151947, |
| "num_tokens": 60514859.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.6321897934801161, |
| "grad_norm": 0.686795711517334, |
| "learning_rate": 1.7912482813226018e-06, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.8124847114086151, |
| "num_tokens": 60580395.0, |
| "step": 926 |
| }, |
| { |
| "epoch": 0.6328725038402457, |
| "grad_norm": 0.7237222194671631, |
| "learning_rate": 1.78553143553735e-06, |
| "loss": 0.5973, |
| "mean_token_accuracy": 0.8137066215276718, |
| "num_tokens": 60645931.0, |
| "step": 927 |
| }, |
| { |
| "epoch": 0.6335552142003755, |
| "grad_norm": 0.7238149642944336, |
| "learning_rate": 1.779818655225412e-06, |
| "loss": 0.5694, |
| "mean_token_accuracy": 0.8201062977313995, |
| "num_tokens": 60711467.0, |
| "step": 928 |
| }, |
| { |
| "epoch": 0.6342379245605052, |
| "grad_norm": 0.7108457088470459, |
| "learning_rate": 1.774109972893686e-06, |
| "loss": 0.6072, |
| "mean_token_accuracy": 0.8110642731189728, |
| "num_tokens": 60777003.0, |
| "step": 929 |
| }, |
| { |
| "epoch": 0.6349206349206349, |
| "grad_norm": 0.7974938750267029, |
| "learning_rate": 1.7684054210257517e-06, |
| "loss": 0.6173, |
| "mean_token_accuracy": 0.806986540555954, |
| "num_tokens": 60842344.0, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6356033452807647, |
| "grad_norm": 0.7198942303657532, |
| "learning_rate": 1.7627050320816814e-06, |
| "loss": 0.5821, |
| "mean_token_accuracy": 0.8179832547903061, |
| "num_tokens": 60907880.0, |
| "step": 931 |
| }, |
| { |
| "epoch": 0.6362860556408944, |
| "grad_norm": 0.7319303154945374, |
| "learning_rate": 1.7570088384978639e-06, |
| "loss": 0.6081, |
| "mean_token_accuracy": 0.8093383461236954, |
| "num_tokens": 60973416.0, |
| "step": 932 |
| }, |
| { |
| "epoch": 0.636968766001024, |
| "grad_norm": 0.7228114008903503, |
| "learning_rate": 1.7513168726868157e-06, |
| "loss": 0.6258, |
| "mean_token_accuracy": 0.808112844824791, |
| "num_tokens": 61038123.0, |
| "step": 933 |
| }, |
| { |
| "epoch": 0.6376514763611538, |
| "grad_norm": 0.7491247057914734, |
| "learning_rate": 1.7456291670369917e-06, |
| "loss": 0.5901, |
| "mean_token_accuracy": 0.8173221349716187, |
| "num_tokens": 61103617.0, |
| "step": 934 |
| }, |
| { |
| "epoch": 0.6383341867212835, |
| "grad_norm": 0.7460877299308777, |
| "learning_rate": 1.7399457539126114e-06, |
| "loss": 0.6724, |
| "mean_token_accuracy": 0.7928316593170166, |
| "num_tokens": 61168856.0, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.6390168970814132, |
| "grad_norm": 0.7103235125541687, |
| "learning_rate": 1.7342666656534658e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8120417892932892, |
| "num_tokens": 61234392.0, |
| "step": 936 |
| }, |
| { |
| "epoch": 0.639699607441543, |
| "grad_norm": 0.6912817358970642, |
| "learning_rate": 1.7285919345747352e-06, |
| "loss": 0.5863, |
| "mean_token_accuracy": 0.8152818083763123, |
| "num_tokens": 61299918.0, |
| "step": 937 |
| }, |
| { |
| "epoch": 0.6403823178016727, |
| "grad_norm": 0.7359888553619385, |
| "learning_rate": 1.7229215929668098e-06, |
| "loss": 0.6297, |
| "mean_token_accuracy": 0.8056879341602325, |
| "num_tokens": 61365454.0, |
| "step": 938 |
| }, |
| { |
| "epoch": 0.6410650281618023, |
| "grad_norm": 0.7430459260940552, |
| "learning_rate": 1.7172556730951028e-06, |
| "loss": 0.6426, |
| "mean_token_accuracy": 0.7991660684347153, |
| "num_tokens": 61430990.0, |
| "step": 939 |
| }, |
| { |
| "epoch": 0.6417477385219321, |
| "grad_norm": 0.7271755337715149, |
| "learning_rate": 1.7115942071998628e-06, |
| "loss": 0.6198, |
| "mean_token_accuracy": 0.807865634560585, |
| "num_tokens": 61496420.0, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6424304488820618, |
| "grad_norm": 0.73670893907547, |
| "learning_rate": 1.7059372274959984e-06, |
| "loss": 0.6409, |
| "mean_token_accuracy": 0.8009095340967178, |
| "num_tokens": 61561952.0, |
| "step": 941 |
| }, |
| { |
| "epoch": 0.6431131592421915, |
| "grad_norm": 0.6874427199363708, |
| "learning_rate": 1.7002847661728905e-06, |
| "loss": 0.5671, |
| "mean_token_accuracy": 0.8194037079811096, |
| "num_tokens": 61627488.0, |
| "step": 942 |
| }, |
| { |
| "epoch": 0.6437958696023213, |
| "grad_norm": 0.7189149260520935, |
| "learning_rate": 1.6946368553942062e-06, |
| "loss": 0.5826, |
| "mean_token_accuracy": 0.8184414654970169, |
| "num_tokens": 61693024.0, |
| "step": 943 |
| }, |
| { |
| "epoch": 0.644478579962451, |
| "grad_norm": 0.7161331176757812, |
| "learning_rate": 1.6889935272977225e-06, |
| "loss": 0.601, |
| "mean_token_accuracy": 0.8121945261955261, |
| "num_tokens": 61758560.0, |
| "step": 944 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 0.7162284851074219, |
| "learning_rate": 1.6833548139951398e-06, |
| "loss": 0.6452, |
| "mean_token_accuracy": 0.7995644211769104, |
| "num_tokens": 61823760.0, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.6458440006827103, |
| "grad_norm": 0.7503809928894043, |
| "learning_rate": 1.6777207475718961e-06, |
| "loss": 0.5762, |
| "mean_token_accuracy": 0.8186247497797012, |
| "num_tokens": 61889296.0, |
| "step": 946 |
| }, |
| { |
| "epoch": 0.6465267110428401, |
| "grad_norm": 0.7211571335792542, |
| "learning_rate": 1.6720913600869914e-06, |
| "loss": 0.6211, |
| "mean_token_accuracy": 0.8063294291496277, |
| "num_tokens": 61954832.0, |
| "step": 947 |
| }, |
| { |
| "epoch": 0.6472094214029698, |
| "grad_norm": 0.7424342632293701, |
| "learning_rate": 1.6664666835728014e-06, |
| "loss": 0.5955, |
| "mean_token_accuracy": 0.8143786638975143, |
| "num_tokens": 62020368.0, |
| "step": 948 |
| }, |
| { |
| "epoch": 0.6478921317630995, |
| "grad_norm": 0.7574367523193359, |
| "learning_rate": 1.6608467500348912e-06, |
| "loss": 0.581, |
| "mean_token_accuracy": 0.8168988227844238, |
| "num_tokens": 62085904.0, |
| "step": 949 |
| }, |
| { |
| "epoch": 0.6485748421232292, |
| "grad_norm": 0.7286249399185181, |
| "learning_rate": 1.655231591451843e-06, |
| "loss": 0.6212, |
| "mean_token_accuracy": 0.8064210712909698, |
| "num_tokens": 62151440.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6492575524833589, |
| "grad_norm": 0.7372191548347473, |
| "learning_rate": 1.6496212397750643e-06, |
| "loss": 0.6129, |
| "mean_token_accuracy": 0.8075648546218872, |
| "num_tokens": 62216902.0, |
| "step": 951 |
| }, |
| { |
| "epoch": 0.6499402628434886, |
| "grad_norm": 0.6884051561355591, |
| "learning_rate": 1.6440157269286123e-06, |
| "loss": 0.5711, |
| "mean_token_accuracy": 0.8208394348621368, |
| "num_tokens": 62282438.0, |
| "step": 952 |
| }, |
| { |
| "epoch": 0.6506229732036184, |
| "grad_norm": 0.6881023645401001, |
| "learning_rate": 1.6384150848090102e-06, |
| "loss": 0.5868, |
| "mean_token_accuracy": 0.8153561800718307, |
| "num_tokens": 62347974.0, |
| "step": 953 |
| }, |
| { |
| "epoch": 0.6513056835637481, |
| "grad_norm": 0.7258963584899902, |
| "learning_rate": 1.632819345285065e-06, |
| "loss": 0.6304, |
| "mean_token_accuracy": 0.8034427016973495, |
| "num_tokens": 62413510.0, |
| "step": 954 |
| }, |
| { |
| "epoch": 0.6519883939238778, |
| "grad_norm": 0.7467467188835144, |
| "learning_rate": 1.6272285401976878e-06, |
| "loss": 0.5642, |
| "mean_token_accuracy": 0.8224889934062958, |
| "num_tokens": 62479046.0, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.6526711042840075, |
| "grad_norm": 0.7235984206199646, |
| "learning_rate": 1.6216427013597108e-06, |
| "loss": 0.6214, |
| "mean_token_accuracy": 0.8049242496490479, |
| "num_tokens": 62544582.0, |
| "step": 956 |
| }, |
| { |
| "epoch": 0.6533538146441372, |
| "grad_norm": 0.7176678776741028, |
| "learning_rate": 1.6160618605557076e-06, |
| "loss": 0.594, |
| "mean_token_accuracy": 0.8146688640117645, |
| "num_tokens": 62610118.0, |
| "step": 957 |
| }, |
| { |
| "epoch": 0.6540365250042669, |
| "grad_norm": 0.6972813606262207, |
| "learning_rate": 1.6104860495418135e-06, |
| "loss": 0.6246, |
| "mean_token_accuracy": 0.806063175201416, |
| "num_tokens": 62674835.0, |
| "step": 958 |
| }, |
| { |
| "epoch": 0.6547192353643967, |
| "grad_norm": 0.7483172416687012, |
| "learning_rate": 1.6049153000455396e-06, |
| "loss": 0.5686, |
| "mean_token_accuracy": 0.8201521188020706, |
| "num_tokens": 62740371.0, |
| "step": 959 |
| }, |
| { |
| "epoch": 0.6554019457245264, |
| "grad_norm": 0.7193106412887573, |
| "learning_rate": 1.5993496437655988e-06, |
| "loss": 0.5832, |
| "mean_token_accuracy": 0.8177759349346161, |
| "num_tokens": 62805898.0, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.656084656084656, |
| "grad_norm": 0.7257005572319031, |
| "learning_rate": 1.5937891123717236e-06, |
| "loss": 0.6143, |
| "mean_token_accuracy": 0.8085288405418396, |
| "num_tokens": 62871434.0, |
| "step": 961 |
| }, |
| { |
| "epoch": 0.6567673664447858, |
| "grad_norm": 0.7278813719749451, |
| "learning_rate": 1.5882337375044803e-06, |
| "loss": 0.6099, |
| "mean_token_accuracy": 0.8087999373674393, |
| "num_tokens": 62936156.0, |
| "step": 962 |
| }, |
| { |
| "epoch": 0.6574500768049155, |
| "grad_norm": 0.6961346864700317, |
| "learning_rate": 1.5826835507750984e-06, |
| "loss": 0.6202, |
| "mean_token_accuracy": 0.8076887875795364, |
| "num_tokens": 63001692.0, |
| "step": 963 |
| }, |
| { |
| "epoch": 0.6581327871650452, |
| "grad_norm": 0.7237011790275574, |
| "learning_rate": 1.5771385837652839e-06, |
| "loss": 0.6107, |
| "mean_token_accuracy": 0.8089922368526459, |
| "num_tokens": 63066839.0, |
| "step": 964 |
| }, |
| { |
| "epoch": 0.658815497525175, |
| "grad_norm": 0.7264211177825928, |
| "learning_rate": 1.5715988680270394e-06, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.8053977340459824, |
| "num_tokens": 63132375.0, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.6594982078853047, |
| "grad_norm": 0.7212889194488525, |
| "learning_rate": 1.56606443508249e-06, |
| "loss": 0.5914, |
| "mean_token_accuracy": 0.8170210123062134, |
| "num_tokens": 63197911.0, |
| "step": 966 |
| }, |
| { |
| "epoch": 0.6601809182454343, |
| "grad_norm": 0.7629103660583496, |
| "learning_rate": 1.5605353164237002e-06, |
| "loss": 0.6252, |
| "mean_token_accuracy": 0.8060850501060486, |
| "num_tokens": 63263447.0, |
| "step": 967 |
| }, |
| { |
| "epoch": 0.6608636286055641, |
| "grad_norm": 0.7188578844070435, |
| "learning_rate": 1.5550115435124922e-06, |
| "loss": 0.5945, |
| "mean_token_accuracy": 0.8150165379047394, |
| "num_tokens": 63327993.0, |
| "step": 968 |
| }, |
| { |
| "epoch": 0.6615463389656938, |
| "grad_norm": 0.7305589318275452, |
| "learning_rate": 1.5494931477802725e-06, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8119195997714996, |
| "num_tokens": 63393529.0, |
| "step": 969 |
| }, |
| { |
| "epoch": 0.6622290493258235, |
| "grad_norm": 0.720365047454834, |
| "learning_rate": 1.5439801606278509e-06, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.810316875576973, |
| "num_tokens": 63458319.0, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6629117596859533, |
| "grad_norm": 0.713546097278595, |
| "learning_rate": 1.5384726134252578e-06, |
| "loss": 0.5899, |
| "mean_token_accuracy": 0.8163031488656998, |
| "num_tokens": 63523855.0, |
| "step": 971 |
| }, |
| { |
| "epoch": 0.663594470046083, |
| "grad_norm": 0.6926174759864807, |
| "learning_rate": 1.5329705375115727e-06, |
| "loss": 0.5968, |
| "mean_token_accuracy": 0.8157380223274231, |
| "num_tokens": 63589391.0, |
| "step": 972 |
| }, |
| { |
| "epoch": 0.6642771804062126, |
| "grad_norm": 0.7352408766746521, |
| "learning_rate": 1.5274739641947418e-06, |
| "loss": 0.6154, |
| "mean_token_accuracy": 0.8105907887220383, |
| "num_tokens": 63654927.0, |
| "step": 973 |
| }, |
| { |
| "epoch": 0.6649598907663424, |
| "grad_norm": 0.6985450983047485, |
| "learning_rate": 1.521982924751397e-06, |
| "loss": 0.6176, |
| "mean_token_accuracy": 0.8080248087644577, |
| "num_tokens": 63720463.0, |
| "step": 974 |
| }, |
| { |
| "epoch": 0.6656426011264721, |
| "grad_norm": 0.6925053000450134, |
| "learning_rate": 1.5164974504266861e-06, |
| "loss": 0.6056, |
| "mean_token_accuracy": 0.8100867569446564, |
| "num_tokens": 63785999.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.6663253114866018, |
| "grad_norm": 0.7355689406394958, |
| "learning_rate": 1.511017572434088e-06, |
| "loss": 0.6187, |
| "mean_token_accuracy": 0.8089259564876556, |
| "num_tokens": 63851535.0, |
| "step": 976 |
| }, |
| { |
| "epoch": 0.6670080218467315, |
| "grad_norm": 0.7681476473808289, |
| "learning_rate": 1.5055433219552356e-06, |
| "loss": 0.6592, |
| "mean_token_accuracy": 0.7974248677492142, |
| "num_tokens": 63917071.0, |
| "step": 977 |
| }, |
| { |
| "epoch": 0.6676907322068613, |
| "grad_norm": 0.7094239592552185, |
| "learning_rate": 1.5000747301397434e-06, |
| "loss": 0.5903, |
| "mean_token_accuracy": 0.8155722469091415, |
| "num_tokens": 63981986.0, |
| "step": 978 |
| }, |
| { |
| "epoch": 0.6683734425669909, |
| "grad_norm": 0.7052043676376343, |
| "learning_rate": 1.494611828105026e-06, |
| "loss": 0.5933, |
| "mean_token_accuracy": 0.8148674219846725, |
| "num_tokens": 64047522.0, |
| "step": 979 |
| }, |
| { |
| "epoch": 0.6690561529271206, |
| "grad_norm": 0.7070150375366211, |
| "learning_rate": 1.489154646936119e-06, |
| "loss": 0.6201, |
| "mean_token_accuracy": 0.8075665831565857, |
| "num_tokens": 64113058.0, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.6697388632872504, |
| "grad_norm": 0.7390539050102234, |
| "learning_rate": 1.483703217685509e-06, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.8068029135465622, |
| "num_tokens": 64178594.0, |
| "step": 981 |
| }, |
| { |
| "epoch": 0.6704215736473801, |
| "grad_norm": 0.7140552997589111, |
| "learning_rate": 1.4782575713729522e-06, |
| "loss": 0.6125, |
| "mean_token_accuracy": 0.8070817142724991, |
| "num_tokens": 64243854.0, |
| "step": 982 |
| }, |
| { |
| "epoch": 0.6711042840075098, |
| "grad_norm": 0.6927697658538818, |
| "learning_rate": 1.472817738985296e-06, |
| "loss": 0.5814, |
| "mean_token_accuracy": 0.8188707530498505, |
| "num_tokens": 64308929.0, |
| "step": 983 |
| }, |
| { |
| "epoch": 0.6717869943676396, |
| "grad_norm": 0.7148230075836182, |
| "learning_rate": 1.4673837514763082e-06, |
| "loss": 0.6, |
| "mean_token_accuracy": 0.8128818422555923, |
| "num_tokens": 64374465.0, |
| "step": 984 |
| }, |
| { |
| "epoch": 0.6724697047277692, |
| "grad_norm": 0.7072917819023132, |
| "learning_rate": 1.461955639766498e-06, |
| "loss": 0.5879, |
| "mean_token_accuracy": 0.8144537210464478, |
| "num_tokens": 64439849.0, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.6731524150878989, |
| "grad_norm": 0.7203112840652466, |
| "learning_rate": 1.4565334347429378e-06, |
| "loss": 0.5921, |
| "mean_token_accuracy": 0.8115013837814331, |
| "num_tokens": 64505313.0, |
| "step": 986 |
| }, |
| { |
| "epoch": 0.6738351254480287, |
| "grad_norm": 0.7090385556221008, |
| "learning_rate": 1.4511171672590924e-06, |
| "loss": 0.5915, |
| "mean_token_accuracy": 0.8125305473804474, |
| "num_tokens": 64570849.0, |
| "step": 987 |
| }, |
| { |
| "epoch": 0.6745178358081584, |
| "grad_norm": 0.73131263256073, |
| "learning_rate": 1.4457068681346388e-06, |
| "loss": 0.61, |
| "mean_token_accuracy": 0.8091856092214584, |
| "num_tokens": 64636385.0, |
| "step": 988 |
| }, |
| { |
| "epoch": 0.6752005461682881, |
| "grad_norm": 0.7277146577835083, |
| "learning_rate": 1.4403025681552937e-06, |
| "loss": 0.6139, |
| "mean_token_accuracy": 0.8073527663946152, |
| "num_tokens": 64701921.0, |
| "step": 989 |
| }, |
| { |
| "epoch": 0.6758832565284179, |
| "grad_norm": 0.7310377955436707, |
| "learning_rate": 1.4349042980726364e-06, |
| "loss": 0.6019, |
| "mean_token_accuracy": 0.811369001865387, |
| "num_tokens": 64767230.0, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.6765659668885475, |
| "grad_norm": 0.7303698658943176, |
| "learning_rate": 1.4295120886039388e-06, |
| "loss": 0.5969, |
| "mean_token_accuracy": 0.8156616538763046, |
| "num_tokens": 64832766.0, |
| "step": 991 |
| }, |
| { |
| "epoch": 0.6772486772486772, |
| "grad_norm": 0.7154099941253662, |
| "learning_rate": 1.4241259704319792e-06, |
| "loss": 0.5901, |
| "mean_token_accuracy": 0.8163703680038452, |
| "num_tokens": 64898166.0, |
| "step": 992 |
| }, |
| { |
| "epoch": 0.677931387608807, |
| "grad_norm": 0.7075607180595398, |
| "learning_rate": 1.418745974204883e-06, |
| "loss": 0.5843, |
| "mean_token_accuracy": 0.8173753768205643, |
| "num_tokens": 64963348.0, |
| "step": 993 |
| }, |
| { |
| "epoch": 0.6786140979689367, |
| "grad_norm": 0.7342114448547363, |
| "learning_rate": 1.413372130535936e-06, |
| "loss": 0.5549, |
| "mean_token_accuracy": 0.8245428204536438, |
| "num_tokens": 65028831.0, |
| "step": 994 |
| }, |
| { |
| "epoch": 0.6792968083290664, |
| "grad_norm": 0.7262970805168152, |
| "learning_rate": 1.4080044700034157e-06, |
| "loss": 0.6546, |
| "mean_token_accuracy": 0.7953476309776306, |
| "num_tokens": 65094367.0, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.6799795186891961, |
| "grad_norm": 0.7255319952964783, |
| "learning_rate": 1.4026430231504149e-06, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.8116804212331772, |
| "num_tokens": 65159591.0, |
| "step": 996 |
| }, |
| { |
| "epoch": 0.6806622290493258, |
| "grad_norm": 0.7310954928398132, |
| "learning_rate": 1.3972878204846738e-06, |
| "loss": 0.6267, |
| "mean_token_accuracy": 0.8056421130895615, |
| "num_tokens": 65225127.0, |
| "step": 997 |
| }, |
| { |
| "epoch": 0.6813449394094555, |
| "grad_norm": 0.7071923017501831, |
| "learning_rate": 1.391938892478395e-06, |
| "loss": 0.5951, |
| "mean_token_accuracy": 0.8134410679340363, |
| "num_tokens": 65290355.0, |
| "step": 998 |
| }, |
| { |
| "epoch": 0.6820276497695853, |
| "grad_norm": 0.710976779460907, |
| "learning_rate": 1.3865962695680837e-06, |
| "loss": 0.5996, |
| "mean_token_accuracy": 0.8111506998538971, |
| "num_tokens": 65355655.0, |
| "step": 999 |
| }, |
| { |
| "epoch": 0.682710360129715, |
| "grad_norm": 0.7076782584190369, |
| "learning_rate": 1.3812599821543638e-06, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8128324896097183, |
| "num_tokens": 65420897.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.6833930704898447, |
| "grad_norm": 0.7270994186401367, |
| "learning_rate": 1.37593006060181e-06, |
| "loss": 0.6098, |
| "mean_token_accuracy": 0.8112322837114334, |
| "num_tokens": 65486433.0, |
| "step": 1001 |
| }, |
| { |
| "epoch": 0.6840757808499744, |
| "grad_norm": 0.6897610425949097, |
| "learning_rate": 1.3706065352387726e-06, |
| "loss": 0.5808, |
| "mean_token_accuracy": 0.8184567391872406, |
| "num_tokens": 65551969.0, |
| "step": 1002 |
| }, |
| { |
| "epoch": 0.6847584912101041, |
| "grad_norm": 0.7167555689811707, |
| "learning_rate": 1.3652894363572102e-06, |
| "loss": 0.6175, |
| "mean_token_accuracy": 0.8073680400848389, |
| "num_tokens": 65617505.0, |
| "step": 1003 |
| }, |
| { |
| "epoch": 0.6854412015702338, |
| "grad_norm": 0.7112638354301453, |
| "learning_rate": 1.3599787942125092e-06, |
| "loss": 0.5994, |
| "mean_token_accuracy": 0.8103158622980118, |
| "num_tokens": 65683041.0, |
| "step": 1004 |
| }, |
| { |
| "epoch": 0.6861239119303636, |
| "grad_norm": 0.7199926972389221, |
| "learning_rate": 1.354674639023318e-06, |
| "loss": 0.6189, |
| "mean_token_accuracy": 0.8093230724334717, |
| "num_tokens": 65748577.0, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.6868066222904933, |
| "grad_norm": 0.7374444007873535, |
| "learning_rate": 1.3493770009713708e-06, |
| "loss": 0.5905, |
| "mean_token_accuracy": 0.8165017068386078, |
| "num_tokens": 65814113.0, |
| "step": 1006 |
| }, |
| { |
| "epoch": 0.687489332650623, |
| "grad_norm": 0.7460854053497314, |
| "learning_rate": 1.3440859102013227e-06, |
| "loss": 0.6313, |
| "mean_token_accuracy": 0.8055046498775482, |
| "num_tokens": 65879649.0, |
| "step": 1007 |
| }, |
| { |
| "epoch": 0.6881720430107527, |
| "grad_norm": 0.746315598487854, |
| "learning_rate": 1.338801396820566e-06, |
| "loss": 0.6245, |
| "mean_token_accuracy": 0.8058559447526932, |
| "num_tokens": 65945185.0, |
| "step": 1008 |
| }, |
| { |
| "epoch": 0.6888547533708824, |
| "grad_norm": 0.6928518414497375, |
| "learning_rate": 1.3335234908990735e-06, |
| "loss": 0.5857, |
| "mean_token_accuracy": 0.8161656856536865, |
| "num_tokens": 66010721.0, |
| "step": 1009 |
| }, |
| { |
| "epoch": 0.6895374637310121, |
| "grad_norm": 0.7147393822669983, |
| "learning_rate": 1.3282522224692162e-06, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8116415292024612, |
| "num_tokens": 66076050.0, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.6902201740911418, |
| "grad_norm": 0.697171688079834, |
| "learning_rate": 1.3229876215255977e-06, |
| "loss": 0.5854, |
| "mean_token_accuracy": 0.8165714740753174, |
| "num_tokens": 66141217.0, |
| "step": 1011 |
| }, |
| { |
| "epoch": 0.6909028844512716, |
| "grad_norm": 0.7116130590438843, |
| "learning_rate": 1.3177297180248804e-06, |
| "loss": 0.6027, |
| "mean_token_accuracy": 0.8118819445371628, |
| "num_tokens": 66206629.0, |
| "step": 1012 |
| }, |
| { |
| "epoch": 0.6915855948114012, |
| "grad_norm": 0.7741715908050537, |
| "learning_rate": 1.3124785418856216e-06, |
| "loss": 0.6525, |
| "mean_token_accuracy": 0.7982190996408463, |
| "num_tokens": 66272165.0, |
| "step": 1013 |
| }, |
| { |
| "epoch": 0.6922683051715309, |
| "grad_norm": 0.7506465911865234, |
| "learning_rate": 1.3072341229880909e-06, |
| "loss": 0.6395, |
| "mean_token_accuracy": 0.8002963215112686, |
| "num_tokens": 66337701.0, |
| "step": 1014 |
| }, |
| { |
| "epoch": 0.6929510155316607, |
| "grad_norm": 0.7145585417747498, |
| "learning_rate": 1.3019964911741157e-06, |
| "loss": 0.5892, |
| "mean_token_accuracy": 0.8156403005123138, |
| "num_tokens": 66402662.0, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.6936337258917904, |
| "grad_norm": 0.6993339657783508, |
| "learning_rate": 1.296765676246899e-06, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8119682371616364, |
| "num_tokens": 66467955.0, |
| "step": 1016 |
| }, |
| { |
| "epoch": 0.6943164362519201, |
| "grad_norm": 0.7100650668144226, |
| "learning_rate": 1.291541707970855e-06, |
| "loss": 0.6129, |
| "mean_token_accuracy": 0.8092161566019058, |
| "num_tokens": 66533491.0, |
| "step": 1017 |
| }, |
| { |
| "epoch": 0.6949991466120499, |
| "grad_norm": 0.7469538450241089, |
| "learning_rate": 1.2863246160714394e-06, |
| "loss": 0.6017, |
| "mean_token_accuracy": 0.8118737787008286, |
| "num_tokens": 66599027.0, |
| "step": 1018 |
| }, |
| { |
| "epoch": 0.6956818569721795, |
| "grad_norm": 0.706939160823822, |
| "learning_rate": 1.28111443023498e-06, |
| "loss": 0.5585, |
| "mean_token_accuracy": 0.8248105943202972, |
| "num_tokens": 66664563.0, |
| "step": 1019 |
| }, |
| { |
| "epoch": 0.6963645673323092, |
| "grad_norm": 0.7209792137145996, |
| "learning_rate": 1.2759111801085067e-06, |
| "loss": 0.6063, |
| "mean_token_accuracy": 0.8109879046678543, |
| "num_tokens": 66730099.0, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.697047277692439, |
| "grad_norm": 0.7196431756019592, |
| "learning_rate": 1.270714895299586e-06, |
| "loss": 0.5876, |
| "mean_token_accuracy": 0.8137677162885666, |
| "num_tokens": 66795635.0, |
| "step": 1021 |
| }, |
| { |
| "epoch": 0.6977299880525687, |
| "grad_norm": 0.7060737013816833, |
| "learning_rate": 1.2655256053761483e-06, |
| "loss": 0.5657, |
| "mean_token_accuracy": 0.8216947615146637, |
| "num_tokens": 66861171.0, |
| "step": 1022 |
| }, |
| { |
| "epoch": 0.6984126984126984, |
| "grad_norm": 0.7312152981758118, |
| "learning_rate": 1.2603433398663222e-06, |
| "loss": 0.6333, |
| "mean_token_accuracy": 0.8031066805124283, |
| "num_tokens": 66926707.0, |
| "step": 1023 |
| }, |
| { |
| "epoch": 0.6990954087728282, |
| "grad_norm": 0.7349741458892822, |
| "learning_rate": 1.2551681282582663e-06, |
| "loss": 0.6243, |
| "mean_token_accuracy": 0.8045498430728912, |
| "num_tokens": 66991463.0, |
| "step": 1024 |
| }, |
| { |
| "epoch": 0.6997781191329578, |
| "grad_norm": 0.7243534326553345, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.6324, |
| "mean_token_accuracy": 0.8045996129512787, |
| "num_tokens": 67056104.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.7004608294930875, |
| "grad_norm": 0.6944549679756165, |
| "learning_rate": 1.2448389844992392e-06, |
| "loss": 0.5886, |
| "mean_token_accuracy": 0.8170895129442215, |
| "num_tokens": 67121350.0, |
| "step": 1026 |
| }, |
| { |
| "epoch": 0.7011435398532173, |
| "grad_norm": 0.7031959295272827, |
| "learning_rate": 1.239685111123223e-06, |
| "loss": 0.5969, |
| "mean_token_accuracy": 0.8143786638975143, |
| "num_tokens": 67186886.0, |
| "step": 1027 |
| }, |
| { |
| "epoch": 0.701826250213347, |
| "grad_norm": 0.7678669691085815, |
| "learning_rate": 1.234538409198555e-06, |
| "loss": 0.6496, |
| "mean_token_accuracy": 0.7978525310754776, |
| "num_tokens": 67252422.0, |
| "step": 1028 |
| }, |
| { |
| "epoch": 0.7025089605734767, |
| "grad_norm": 0.7110222578048706, |
| "learning_rate": 1.2293989080110283e-06, |
| "loss": 0.5936, |
| "mean_token_accuracy": 0.815337523818016, |
| "num_tokens": 67317916.0, |
| "step": 1029 |
| }, |
| { |
| "epoch": 0.7031916709336065, |
| "grad_norm": 0.7030754089355469, |
| "learning_rate": 1.2242666368054635e-06, |
| "loss": 0.5808, |
| "mean_token_accuracy": 0.8198858350515366, |
| "num_tokens": 67383424.0, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.7038743812937361, |
| "grad_norm": 0.748938798904419, |
| "learning_rate": 1.2191416247855408e-06, |
| "loss": 0.6279, |
| "mean_token_accuracy": 0.8053995966911316, |
| "num_tokens": 67448892.0, |
| "step": 1031 |
| }, |
| { |
| "epoch": 0.7045570916538658, |
| "grad_norm": 0.7475568652153015, |
| "learning_rate": 1.214023901113635e-06, |
| "loss": 0.6515, |
| "mean_token_accuracy": 0.7992729842662811, |
| "num_tokens": 67514428.0, |
| "step": 1032 |
| }, |
| { |
| "epoch": 0.7052398020139956, |
| "grad_norm": 0.714144229888916, |
| "learning_rate": 1.2089134949106462e-06, |
| "loss": 0.6015, |
| "mean_token_accuracy": 0.8113239258527756, |
| "num_tokens": 67579964.0, |
| "step": 1033 |
| }, |
| { |
| "epoch": 0.7059225123741253, |
| "grad_norm": 0.7368439435958862, |
| "learning_rate": 1.203810435255842e-06, |
| "loss": 0.6169, |
| "mean_token_accuracy": 0.8093841671943665, |
| "num_tokens": 67645500.0, |
| "step": 1034 |
| }, |
| { |
| "epoch": 0.706605222734255, |
| "grad_norm": 0.7233380079269409, |
| "learning_rate": 1.198714751186679e-06, |
| "loss": 0.587, |
| "mean_token_accuracy": 0.8154478222131729, |
| "num_tokens": 67711036.0, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.7072879330943848, |
| "grad_norm": 0.7264472842216492, |
| "learning_rate": 1.1936264716986523e-06, |
| "loss": 0.623, |
| "mean_token_accuracy": 0.8043133020401001, |
| "num_tokens": 67776572.0, |
| "step": 1036 |
| }, |
| { |
| "epoch": 0.7079706434545144, |
| "grad_norm": 0.7293205261230469, |
| "learning_rate": 1.1885456257451194e-06, |
| "loss": 0.6148, |
| "mean_token_accuracy": 0.8091706782579422, |
| "num_tokens": 67842091.0, |
| "step": 1037 |
| }, |
| { |
| "epoch": 0.7086533538146441, |
| "grad_norm": 0.7417516708374023, |
| "learning_rate": 1.1834722422371405e-06, |
| "loss": 0.5954, |
| "mean_token_accuracy": 0.8143175691366196, |
| "num_tokens": 67907627.0, |
| "step": 1038 |
| }, |
| { |
| "epoch": 0.7093360641747739, |
| "grad_norm": 0.7399863004684448, |
| "learning_rate": 1.1784063500433117e-06, |
| "loss": 0.625, |
| "mean_token_accuracy": 0.8072239905595779, |
| "num_tokens": 67971992.0, |
| "step": 1039 |
| }, |
| { |
| "epoch": 0.7100187745349036, |
| "grad_norm": 0.736422598361969, |
| "learning_rate": 1.1733479779896065e-06, |
| "loss": 0.6255, |
| "mean_token_accuracy": 0.803932249546051, |
| "num_tokens": 68037343.0, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7107014848950333, |
| "grad_norm": 0.7014506459236145, |
| "learning_rate": 1.1682971548591995e-06, |
| "loss": 0.6101, |
| "mean_token_accuracy": 0.8105144202709198, |
| "num_tokens": 68102879.0, |
| "step": 1041 |
| }, |
| { |
| "epoch": 0.7113841952551629, |
| "grad_norm": 0.7273626923561096, |
| "learning_rate": 1.163253909392318e-06, |
| "loss": 0.5935, |
| "mean_token_accuracy": 0.815775141119957, |
| "num_tokens": 68167984.0, |
| "step": 1042 |
| }, |
| { |
| "epoch": 0.7120669056152927, |
| "grad_norm": 0.7792891263961792, |
| "learning_rate": 1.1582182702860667e-06, |
| "loss": 0.6134, |
| "mean_token_accuracy": 0.808910682797432, |
| "num_tokens": 68233520.0, |
| "step": 1043 |
| }, |
| { |
| "epoch": 0.7127496159754224, |
| "grad_norm": 0.7218680381774902, |
| "learning_rate": 1.1531902661942692e-06, |
| "loss": 0.5552, |
| "mean_token_accuracy": 0.8247953355312347, |
| "num_tokens": 68299056.0, |
| "step": 1044 |
| }, |
| { |
| "epoch": 0.7134323263355521, |
| "grad_norm": 0.763369619846344, |
| "learning_rate": 1.1481699257273041e-06, |
| "loss": 0.6369, |
| "mean_token_accuracy": 0.8021597117185593, |
| "num_tokens": 68364592.0, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.7141150366956819, |
| "grad_norm": 0.712095320224762, |
| "learning_rate": 1.1431572774519457e-06, |
| "loss": 0.5888, |
| "mean_token_accuracy": 0.8145788908004761, |
| "num_tokens": 68429133.0, |
| "step": 1046 |
| }, |
| { |
| "epoch": 0.7147977470558116, |
| "grad_norm": 0.6819875240325928, |
| "learning_rate": 1.1381523498911907e-06, |
| "loss": 0.5616, |
| "mean_token_accuracy": 0.8223264962434769, |
| "num_tokens": 68494642.0, |
| "step": 1047 |
| }, |
| { |
| "epoch": 0.7154804574159412, |
| "grad_norm": 0.720863938331604, |
| "learning_rate": 1.1331551715241115e-06, |
| "loss": 0.6048, |
| "mean_token_accuracy": 0.8117210417985916, |
| "num_tokens": 68560178.0, |
| "step": 1048 |
| }, |
| { |
| "epoch": 0.716163167776071, |
| "grad_norm": 0.7568435072898865, |
| "learning_rate": 1.1281657707856817e-06, |
| "loss": 0.6984, |
| "mean_token_accuracy": 0.7855392396450043, |
| "num_tokens": 68625232.0, |
| "step": 1049 |
| }, |
| { |
| "epoch": 0.7168458781362007, |
| "grad_norm": 0.7198119163513184, |
| "learning_rate": 1.1231841760666188e-06, |
| "loss": 0.5967, |
| "mean_token_accuracy": 0.8142653554677963, |
| "num_tokens": 68690063.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7175285884963304, |
| "grad_norm": 0.7616945505142212, |
| "learning_rate": 1.118210415713222e-06, |
| "loss": 0.6256, |
| "mean_token_accuracy": 0.8061919659376144, |
| "num_tokens": 68755599.0, |
| "step": 1051 |
| }, |
| { |
| "epoch": 0.7182112988564602, |
| "grad_norm": 0.7210955023765564, |
| "learning_rate": 1.1132445180272147e-06, |
| "loss": 0.5825, |
| "mean_token_accuracy": 0.8181665390729904, |
| "num_tokens": 68821135.0, |
| "step": 1052 |
| }, |
| { |
| "epoch": 0.7188940092165899, |
| "grad_norm": 0.7416355013847351, |
| "learning_rate": 1.1082865112655767e-06, |
| "loss": 0.5861, |
| "mean_token_accuracy": 0.8164656460285187, |
| "num_tokens": 68885990.0, |
| "step": 1053 |
| }, |
| { |
| "epoch": 0.7195767195767195, |
| "grad_norm": 0.6842755675315857, |
| "learning_rate": 1.1033364236403874e-06, |
| "loss": 0.5815, |
| "mean_token_accuracy": 0.8173264861106873, |
| "num_tokens": 68951526.0, |
| "step": 1054 |
| }, |
| { |
| "epoch": 0.7202594299368493, |
| "grad_norm": 0.712204098701477, |
| "learning_rate": 1.0983942833186644e-06, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.8113086521625519, |
| "num_tokens": 69017062.0, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.720942140296979, |
| "grad_norm": 0.7125012278556824, |
| "learning_rate": 1.0934601184222073e-06, |
| "loss": 0.6239, |
| "mean_token_accuracy": 0.8067265450954437, |
| "num_tokens": 69082598.0, |
| "step": 1056 |
| }, |
| { |
| "epoch": 0.7216248506571087, |
| "grad_norm": 0.6971497535705566, |
| "learning_rate": 1.0885339570274268e-06, |
| "loss": 0.6034, |
| "mean_token_accuracy": 0.8114766627550125, |
| "num_tokens": 69148134.0, |
| "step": 1057 |
| }, |
| { |
| "epoch": 0.7223075610172385, |
| "grad_norm": 0.7023418545722961, |
| "learning_rate": 1.083615827165199e-06, |
| "loss": 0.5973, |
| "mean_token_accuracy": 0.811950147151947, |
| "num_tokens": 69213670.0, |
| "step": 1058 |
| }, |
| { |
| "epoch": 0.7229902713773682, |
| "grad_norm": 0.7571163177490234, |
| "learning_rate": 1.0787057568206945e-06, |
| "loss": 0.636, |
| "mean_token_accuracy": 0.8033449351787567, |
| "num_tokens": 69279108.0, |
| "step": 1059 |
| }, |
| { |
| "epoch": 0.7236729817374978, |
| "grad_norm": 0.7366777658462524, |
| "learning_rate": 1.073803773933226e-06, |
| "loss": 0.5972, |
| "mean_token_accuracy": 0.8136913478374481, |
| "num_tokens": 69344644.0, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7243556920976276, |
| "grad_norm": 0.727080762386322, |
| "learning_rate": 1.0689099063960844e-06, |
| "loss": 0.6161, |
| "mean_token_accuracy": 0.8062903136014938, |
| "num_tokens": 69409956.0, |
| "step": 1061 |
| }, |
| { |
| "epoch": 0.7250384024577573, |
| "grad_norm": 0.7016240954399109, |
| "learning_rate": 1.0640241820563871e-06, |
| "loss": 0.5584, |
| "mean_token_accuracy": 0.8244874775409698, |
| "num_tokens": 69475444.0, |
| "step": 1062 |
| }, |
| { |
| "epoch": 0.725721112817887, |
| "grad_norm": 0.7436378598213196, |
| "learning_rate": 1.0591466287149082e-06, |
| "loss": 0.6115, |
| "mean_token_accuracy": 0.8092294335365295, |
| "num_tokens": 69540509.0, |
| "step": 1063 |
| }, |
| { |
| "epoch": 0.7264038231780168, |
| "grad_norm": 0.7381221652030945, |
| "learning_rate": 1.054277274125934e-06, |
| "loss": 0.5855, |
| "mean_token_accuracy": 0.8153375536203384, |
| "num_tokens": 69605192.0, |
| "step": 1064 |
| }, |
| { |
| "epoch": 0.7270865335381465, |
| "grad_norm": 0.7373767495155334, |
| "learning_rate": 1.0494161459970942e-06, |
| "loss": 0.6438, |
| "mean_token_accuracy": 0.8019611537456512, |
| "num_tokens": 69670728.0, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.7277692438982761, |
| "grad_norm": 0.6937271356582642, |
| "learning_rate": 1.0445632719892093e-06, |
| "loss": 0.5947, |
| "mean_token_accuracy": 0.8145161271095276, |
| "num_tokens": 69736264.0, |
| "step": 1066 |
| }, |
| { |
| "epoch": 0.7284519542584059, |
| "grad_norm": 0.7547881603240967, |
| "learning_rate": 1.039718679716132e-06, |
| "loss": 0.6021, |
| "mean_token_accuracy": 0.8118002116680145, |
| "num_tokens": 69801715.0, |
| "step": 1067 |
| }, |
| { |
| "epoch": 0.7291346646185356, |
| "grad_norm": 0.743807852268219, |
| "learning_rate": 1.0348823967445904e-06, |
| "loss": 0.6157, |
| "mean_token_accuracy": 0.8069556504487991, |
| "num_tokens": 69867251.0, |
| "step": 1068 |
| }, |
| { |
| "epoch": 0.7298173749786653, |
| "grad_norm": 0.734443724155426, |
| "learning_rate": 1.03005445059403e-06, |
| "loss": 0.5992, |
| "mean_token_accuracy": 0.8127291053533554, |
| "num_tokens": 69932787.0, |
| "step": 1069 |
| }, |
| { |
| "epoch": 0.7305000853387951, |
| "grad_norm": 0.7220298051834106, |
| "learning_rate": 1.0252348687364608e-06, |
| "loss": 0.6139, |
| "mean_token_accuracy": 0.8077517449855804, |
| "num_tokens": 69998191.0, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7311827956989247, |
| "grad_norm": 0.6889765858650208, |
| "learning_rate": 1.0204236785962954e-06, |
| "loss": 0.5883, |
| "mean_token_accuracy": 0.8164100646972656, |
| "num_tokens": 70063727.0, |
| "step": 1071 |
| }, |
| { |
| "epoch": 0.7318655060590544, |
| "grad_norm": 0.7063385248184204, |
| "learning_rate": 1.0156209075501972e-06, |
| "loss": 0.608, |
| "mean_token_accuracy": 0.8102242201566696, |
| "num_tokens": 70129263.0, |
| "step": 1072 |
| }, |
| { |
| "epoch": 0.7325482164191841, |
| "grad_norm": 0.7358285188674927, |
| "learning_rate": 1.0108265829269223e-06, |
| "loss": 0.6236, |
| "mean_token_accuracy": 0.807369664311409, |
| "num_tokens": 70193946.0, |
| "step": 1073 |
| }, |
| { |
| "epoch": 0.7332309267793139, |
| "grad_norm": 0.7315536141395569, |
| "learning_rate": 1.0060407320071658e-06, |
| "loss": 0.6119, |
| "mean_token_accuracy": 0.8106366097927094, |
| "num_tokens": 70259482.0, |
| "step": 1074 |
| }, |
| { |
| "epoch": 0.7339136371394436, |
| "grad_norm": 0.7272475361824036, |
| "learning_rate": 1.0012633820234052e-06, |
| "loss": 0.6137, |
| "mean_token_accuracy": 0.8086815774440765, |
| "num_tokens": 70325018.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.7345963474995733, |
| "grad_norm": 0.6909750699996948, |
| "learning_rate": 9.964945601597454e-07, |
| "loss": 0.5922, |
| "mean_token_accuracy": 0.8150201588869095, |
| "num_tokens": 70390554.0, |
| "step": 1076 |
| }, |
| { |
| "epoch": 0.735279057859703, |
| "grad_norm": 0.6908272504806519, |
| "learning_rate": 9.917342935517665e-07, |
| "loss": 0.5954, |
| "mean_token_accuracy": 0.8137371689081192, |
| "num_tokens": 70456090.0, |
| "step": 1077 |
| }, |
| { |
| "epoch": 0.7359617682198327, |
| "grad_norm": 0.7175625562667847, |
| "learning_rate": 9.869826092863661e-07, |
| "loss": 0.6074, |
| "mean_token_accuracy": 0.8106231540441513, |
| "num_tokens": 70521316.0, |
| "step": 1078 |
| }, |
| { |
| "epoch": 0.7366444785799624, |
| "grad_norm": 0.7069457769393921, |
| "learning_rate": 9.822395344016054e-07, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.8105449676513672, |
| "num_tokens": 70586852.0, |
| "step": 1079 |
| }, |
| { |
| "epoch": 0.7373271889400922, |
| "grad_norm": 0.7554196715354919, |
| "learning_rate": 9.775050958865584e-07, |
| "loss": 0.5844, |
| "mean_token_accuracy": 0.8151825070381165, |
| "num_tokens": 70652261.0, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.7380098993002219, |
| "grad_norm": 0.749987006187439, |
| "learning_rate": 9.727793206811553e-07, |
| "loss": 0.639, |
| "mean_token_accuracy": 0.8026099056005478, |
| "num_tokens": 70717628.0, |
| "step": 1081 |
| }, |
| { |
| "epoch": 0.7386926096603516, |
| "grad_norm": 0.7476643323898315, |
| "learning_rate": 9.680622356760297e-07, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8092154115438461, |
| "num_tokens": 70782933.0, |
| "step": 1082 |
| }, |
| { |
| "epoch": 0.7393753200204813, |
| "grad_norm": 0.707511305809021, |
| "learning_rate": 9.633538677123697e-07, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.812897115945816, |
| "num_tokens": 70848469.0, |
| "step": 1083 |
| }, |
| { |
| "epoch": 0.740058030380611, |
| "grad_norm": 0.722371518611908, |
| "learning_rate": 9.586542435817553e-07, |
| "loss": 0.6375, |
| "mean_token_accuracy": 0.8035801649093628, |
| "num_tokens": 70914005.0, |
| "step": 1084 |
| }, |
| { |
| "epoch": 0.7407407407407407, |
| "grad_norm": 0.7201296091079712, |
| "learning_rate": 9.539633900260186e-07, |
| "loss": 0.6109, |
| "mean_token_accuracy": 0.8105449676513672, |
| "num_tokens": 70979541.0, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.7414234511008705, |
| "grad_norm": 0.7072751522064209, |
| "learning_rate": 9.49281333737082e-07, |
| "loss": 0.6166, |
| "mean_token_accuracy": 0.8087597638368607, |
| "num_tokens": 71045050.0, |
| "step": 1086 |
| }, |
| { |
| "epoch": 0.7421061614610002, |
| "grad_norm": 0.7147124409675598, |
| "learning_rate": 9.446081013568101e-07, |
| "loss": 0.5877, |
| "mean_token_accuracy": 0.8150329142808914, |
| "num_tokens": 71110580.0, |
| "step": 1087 |
| }, |
| { |
| "epoch": 0.7427888718211298, |
| "grad_norm": 0.7225379347801208, |
| "learning_rate": 9.399437194768571e-07, |
| "loss": 0.5851, |
| "mean_token_accuracy": 0.8183732032775879, |
| "num_tokens": 71176002.0, |
| "step": 1088 |
| }, |
| { |
| "epoch": 0.7434715821812596, |
| "grad_norm": 0.7688735723495483, |
| "learning_rate": 9.352882146385193e-07, |
| "loss": 0.6727, |
| "mean_token_accuracy": 0.7923766225576401, |
| "num_tokens": 71241504.0, |
| "step": 1089 |
| }, |
| { |
| "epoch": 0.7441542925413893, |
| "grad_norm": 0.7088306546211243, |
| "learning_rate": 9.306416133325747e-07, |
| "loss": 0.5761, |
| "mean_token_accuracy": 0.8190524131059647, |
| "num_tokens": 71307040.0, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.744837002901519, |
| "grad_norm": 0.7349225878715515, |
| "learning_rate": 9.260039419991448e-07, |
| "loss": 0.6314, |
| "mean_token_accuracy": 0.8064516186714172, |
| "num_tokens": 71372576.0, |
| "step": 1091 |
| }, |
| { |
| "epoch": 0.7455197132616488, |
| "grad_norm": 0.7324482202529907, |
| "learning_rate": 9.213752270275339e-07, |
| "loss": 0.6315, |
| "mean_token_accuracy": 0.8049242496490479, |
| "num_tokens": 71438112.0, |
| "step": 1092 |
| }, |
| { |
| "epoch": 0.7462024236217785, |
| "grad_norm": 0.7754062414169312, |
| "learning_rate": 9.167554947560836e-07, |
| "loss": 0.587, |
| "mean_token_accuracy": 0.8168682754039764, |
| "num_tokens": 71503648.0, |
| "step": 1093 |
| }, |
| { |
| "epoch": 0.7468851339819081, |
| "grad_norm": 0.7258090376853943, |
| "learning_rate": 9.121447714720214e-07, |
| "loss": 0.5716, |
| "mean_token_accuracy": 0.8204730749130249, |
| "num_tokens": 71569058.0, |
| "step": 1094 |
| }, |
| { |
| "epoch": 0.7475678443420379, |
| "grad_norm": 0.7216829657554626, |
| "learning_rate": 9.075430834113153e-07, |
| "loss": 0.6208, |
| "mean_token_accuracy": 0.8046814501285553, |
| "num_tokens": 71634344.0, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.7482505547021676, |
| "grad_norm": 0.7033687829971313, |
| "learning_rate": 9.029504567585149e-07, |
| "loss": 0.5581, |
| "mean_token_accuracy": 0.8244858235120773, |
| "num_tokens": 71699830.0, |
| "step": 1096 |
| }, |
| { |
| "epoch": 0.7489332650622973, |
| "grad_norm": 0.7144305109977722, |
| "learning_rate": 8.983669176466143e-07, |
| "loss": 0.5762, |
| "mean_token_accuracy": 0.818400427699089, |
| "num_tokens": 71764790.0, |
| "step": 1097 |
| }, |
| { |
| "epoch": 0.7496159754224271, |
| "grad_norm": 0.7208042740821838, |
| "learning_rate": 8.937924921568946e-07, |
| "loss": 0.5997, |
| "mean_token_accuracy": 0.8123532086610794, |
| "num_tokens": 71829855.0, |
| "step": 1098 |
| }, |
| { |
| "epoch": 0.7502986857825568, |
| "grad_norm": 0.7652632594108582, |
| "learning_rate": 8.892272063187793e-07, |
| "loss": 0.634, |
| "mean_token_accuracy": 0.8046972900629044, |
| "num_tokens": 71894984.0, |
| "step": 1099 |
| }, |
| { |
| "epoch": 0.7509813961426864, |
| "grad_norm": 0.7493232488632202, |
| "learning_rate": 8.846710861096841e-07, |
| "loss": 0.6421, |
| "mean_token_accuracy": 0.8010455518960953, |
| "num_tokens": 71960164.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7516641065028162, |
| "grad_norm": 0.7200855016708374, |
| "learning_rate": 8.801241574548735e-07, |
| "loss": 0.5745, |
| "mean_token_accuracy": 0.8203964978456497, |
| "num_tokens": 72025700.0, |
| "step": 1101 |
| }, |
| { |
| "epoch": 0.7523468168629459, |
| "grad_norm": 0.7226145267486572, |
| "learning_rate": 8.755864462273072e-07, |
| "loss": 0.5948, |
| "mean_token_accuracy": 0.8125305473804474, |
| "num_tokens": 72091236.0, |
| "step": 1102 |
| }, |
| { |
| "epoch": 0.7530295272230756, |
| "grad_norm": 0.7232508659362793, |
| "learning_rate": 8.710579782474974e-07, |
| "loss": 0.5999, |
| "mean_token_accuracy": 0.813049852848053, |
| "num_tokens": 72156772.0, |
| "step": 1103 |
| }, |
| { |
| "epoch": 0.7537122375832054, |
| "grad_norm": 0.6991187930107117, |
| "learning_rate": 8.665387792833582e-07, |
| "loss": 0.5508, |
| "mean_token_accuracy": 0.8265517950057983, |
| "num_tokens": 72222308.0, |
| "step": 1104 |
| }, |
| { |
| "epoch": 0.7543949479433351, |
| "grad_norm": 0.7070404887199402, |
| "learning_rate": 8.620288750500658e-07, |
| "loss": 0.6137, |
| "mean_token_accuracy": 0.8111559152603149, |
| "num_tokens": 72287844.0, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.7550776583034647, |
| "grad_norm": 0.7090805768966675, |
| "learning_rate": 8.575282912099003e-07, |
| "loss": 0.6246, |
| "mean_token_accuracy": 0.8064394891262054, |
| "num_tokens": 72353148.0, |
| "step": 1106 |
| }, |
| { |
| "epoch": 0.7557603686635944, |
| "grad_norm": 0.7003458142280579, |
| "learning_rate": 8.530370533721144e-07, |
| "loss": 0.597, |
| "mean_token_accuracy": 0.8132027834653854, |
| "num_tokens": 72418651.0, |
| "step": 1107 |
| }, |
| { |
| "epoch": 0.7564430790237242, |
| "grad_norm": 0.7059638500213623, |
| "learning_rate": 8.485551870927747e-07, |
| "loss": 0.602, |
| "mean_token_accuracy": 0.8118747770786285, |
| "num_tokens": 72483869.0, |
| "step": 1108 |
| }, |
| { |
| "epoch": 0.7571257893838539, |
| "grad_norm": 0.7228066921234131, |
| "learning_rate": 8.440827178746244e-07, |
| "loss": 0.5637, |
| "mean_token_accuracy": 0.8214890360832214, |
| "num_tokens": 72549379.0, |
| "step": 1109 |
| }, |
| { |
| "epoch": 0.7578084997439836, |
| "grad_norm": 0.6997213959693909, |
| "learning_rate": 8.396196711669335e-07, |
| "loss": 0.5883, |
| "mean_token_accuracy": 0.8160992115736008, |
| "num_tokens": 72614862.0, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7584912101041134, |
| "grad_norm": 0.7410682439804077, |
| "learning_rate": 8.351660723653599e-07, |
| "loss": 0.6115, |
| "mean_token_accuracy": 0.8090939670801163, |
| "num_tokens": 72680398.0, |
| "step": 1111 |
| }, |
| { |
| "epoch": 0.759173920464243, |
| "grad_norm": 0.7432024478912354, |
| "learning_rate": 8.307219468117947e-07, |
| "loss": 0.6055, |
| "mean_token_accuracy": 0.8117057681083679, |
| "num_tokens": 72745934.0, |
| "step": 1112 |
| }, |
| { |
| "epoch": 0.7598566308243727, |
| "grad_norm": 0.7230374217033386, |
| "learning_rate": 8.262873197942303e-07, |
| "loss": 0.571, |
| "mean_token_accuracy": 0.820182666182518, |
| "num_tokens": 72811470.0, |
| "step": 1113 |
| }, |
| { |
| "epoch": 0.7605393411845025, |
| "grad_norm": 0.7165111899375916, |
| "learning_rate": 8.21862216546607e-07, |
| "loss": 0.5683, |
| "mean_token_accuracy": 0.8194189816713333, |
| "num_tokens": 72877006.0, |
| "step": 1114 |
| }, |
| { |
| "epoch": 0.7612220515446322, |
| "grad_norm": 0.7650198340415955, |
| "learning_rate": 8.174466622486743e-07, |
| "loss": 0.6179, |
| "mean_token_accuracy": 0.8073527663946152, |
| "num_tokens": 72942542.0, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.7619047619047619, |
| "grad_norm": 0.7104616165161133, |
| "learning_rate": 8.130406820258455e-07, |
| "loss": 0.6032, |
| "mean_token_accuracy": 0.8136285096406937, |
| "num_tokens": 73007960.0, |
| "step": 1116 |
| }, |
| { |
| "epoch": 0.7625874722648917, |
| "grad_norm": 0.7432583570480347, |
| "learning_rate": 8.086443009490558e-07, |
| "loss": 0.6147, |
| "mean_token_accuracy": 0.8103925734758377, |
| "num_tokens": 73073463.0, |
| "step": 1117 |
| }, |
| { |
| "epoch": 0.7632701826250213, |
| "grad_norm": 0.7258316874504089, |
| "learning_rate": 8.042575440346185e-07, |
| "loss": 0.6106, |
| "mean_token_accuracy": 0.806356206536293, |
| "num_tokens": 73138807.0, |
| "step": 1118 |
| }, |
| { |
| "epoch": 0.763952892985151, |
| "grad_norm": 0.7336967587471008, |
| "learning_rate": 7.998804362440854e-07, |
| "loss": 0.628, |
| "mean_token_accuracy": 0.8075513243675232, |
| "num_tokens": 73204343.0, |
| "step": 1119 |
| }, |
| { |
| "epoch": 0.7646356033452808, |
| "grad_norm": 0.7090906500816345, |
| "learning_rate": 7.955130024841009e-07, |
| "loss": 0.5771, |
| "mean_token_accuracy": 0.8172195702791214, |
| "num_tokens": 73269879.0, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.7653183137054105, |
| "grad_norm": 0.6913905739784241, |
| "learning_rate": 7.911552676062629e-07, |
| "loss": 0.5969, |
| "mean_token_accuracy": 0.8146230429410934, |
| "num_tokens": 73335415.0, |
| "step": 1121 |
| }, |
| { |
| "epoch": 0.7660010240655402, |
| "grad_norm": 0.7096146941184998, |
| "learning_rate": 7.868072564069807e-07, |
| "loss": 0.5892, |
| "mean_token_accuracy": 0.8174639493227005, |
| "num_tokens": 73400951.0, |
| "step": 1122 |
| }, |
| { |
| "epoch": 0.76668373442567, |
| "grad_norm": 0.7153047323226929, |
| "learning_rate": 7.82468993627333e-07, |
| "loss": 0.6304, |
| "mean_token_accuracy": 0.8055810183286667, |
| "num_tokens": 73466487.0, |
| "step": 1123 |
| }, |
| { |
| "epoch": 0.7673664447857996, |
| "grad_norm": 0.7417140603065491, |
| "learning_rate": 7.781405039529296e-07, |
| "loss": 0.5973, |
| "mean_token_accuracy": 0.8134699463844299, |
| "num_tokens": 73531758.0, |
| "step": 1124 |
| }, |
| { |
| "epoch": 0.7680491551459293, |
| "grad_norm": 0.7037280797958374, |
| "learning_rate": 7.738218120137672e-07, |
| "loss": 0.5728, |
| "mean_token_accuracy": 0.8211449086666107, |
| "num_tokens": 73597294.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.7687318655060591, |
| "grad_norm": 0.7011156678199768, |
| "learning_rate": 7.695129423840944e-07, |
| "loss": 0.6029, |
| "mean_token_accuracy": 0.8113086521625519, |
| "num_tokens": 73662830.0, |
| "step": 1126 |
| }, |
| { |
| "epoch": 0.7694145758661888, |
| "grad_norm": 0.7726342678070068, |
| "learning_rate": 7.652139195822658e-07, |
| "loss": 0.6001, |
| "mean_token_accuracy": 0.8128054738044739, |
| "num_tokens": 73728366.0, |
| "step": 1127 |
| }, |
| { |
| "epoch": 0.7700972862263185, |
| "grad_norm": 0.7018040418624878, |
| "learning_rate": 7.609247680706072e-07, |
| "loss": 0.5608, |
| "mean_token_accuracy": 0.8241385519504547, |
| "num_tokens": 73793902.0, |
| "step": 1128 |
| }, |
| { |
| "epoch": 0.7707799965864482, |
| "grad_norm": 0.7123659253120422, |
| "learning_rate": 7.566455122552744e-07, |
| "loss": 0.5959, |
| "mean_token_accuracy": 0.8140884637832642, |
| "num_tokens": 73859438.0, |
| "step": 1129 |
| }, |
| { |
| "epoch": 0.7714627069465779, |
| "grad_norm": 0.6594699025154114, |
| "learning_rate": 7.523761764861138e-07, |
| "loss": 0.5628, |
| "mean_token_accuracy": 0.8215267658233643, |
| "num_tokens": 73924974.0, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.7721454173067076, |
| "grad_norm": 0.7053015232086182, |
| "learning_rate": 7.481167850565255e-07, |
| "loss": 0.5569, |
| "mean_token_accuracy": 0.8225195407867432, |
| "num_tokens": 73990510.0, |
| "step": 1131 |
| }, |
| { |
| "epoch": 0.7728281276668374, |
| "grad_norm": 0.7094056606292725, |
| "learning_rate": 7.43867362203326e-07, |
| "loss": 0.5928, |
| "mean_token_accuracy": 0.8135844320058823, |
| "num_tokens": 74056046.0, |
| "step": 1132 |
| }, |
| { |
| "epoch": 0.7735108380269671, |
| "grad_norm": 0.7524219751358032, |
| "learning_rate": 7.396279321066035e-07, |
| "loss": 0.6431, |
| "mean_token_accuracy": 0.8012891113758087, |
| "num_tokens": 74121582.0, |
| "step": 1133 |
| }, |
| { |
| "epoch": 0.7741935483870968, |
| "grad_norm": 0.7120317816734314, |
| "learning_rate": 7.353985188895915e-07, |
| "loss": 0.6297, |
| "mean_token_accuracy": 0.8051542043685913, |
| "num_tokens": 74185990.0, |
| "step": 1134 |
| }, |
| { |
| "epoch": 0.7748762587472265, |
| "grad_norm": 0.7252651453018188, |
| "learning_rate": 7.311791466185214e-07, |
| "loss": 0.6045, |
| "mean_token_accuracy": 0.8110489994287491, |
| "num_tokens": 74251526.0, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.7755589691073562, |
| "grad_norm": 0.7242283225059509, |
| "learning_rate": 7.269698393024904e-07, |
| "loss": 0.6094, |
| "mean_token_accuracy": 0.8091860562562943, |
| "num_tokens": 74316251.0, |
| "step": 1136 |
| }, |
| { |
| "epoch": 0.7762416794674859, |
| "grad_norm": 0.7422311305999756, |
| "learning_rate": 7.227706208933239e-07, |
| "loss": 0.6407, |
| "mean_token_accuracy": 0.8013196587562561, |
| "num_tokens": 74381787.0, |
| "step": 1137 |
| }, |
| { |
| "epoch": 0.7769243898276156, |
| "grad_norm": 0.6992833018302917, |
| "learning_rate": 7.185815152854417e-07, |
| "loss": 0.5788, |
| "mean_token_accuracy": 0.8192576766014099, |
| "num_tokens": 74447315.0, |
| "step": 1138 |
| }, |
| { |
| "epoch": 0.7776071001877454, |
| "grad_norm": 0.7244083881378174, |
| "learning_rate": 7.144025463157147e-07, |
| "loss": 0.5659, |
| "mean_token_accuracy": 0.8207778781652451, |
| "num_tokens": 74512781.0, |
| "step": 1139 |
| }, |
| { |
| "epoch": 0.778289810547875, |
| "grad_norm": 0.7246714234352112, |
| "learning_rate": 7.102337377633395e-07, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.8115072101354599, |
| "num_tokens": 74578317.0, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.7789725209080047, |
| "grad_norm": 0.6905441284179688, |
| "learning_rate": 7.060751133496948e-07, |
| "loss": 0.561, |
| "mean_token_accuracy": 0.8240596801042557, |
| "num_tokens": 74643608.0, |
| "step": 1141 |
| }, |
| { |
| "epoch": 0.7796552312681345, |
| "grad_norm": 0.7125344276428223, |
| "learning_rate": 7.019266967382105e-07, |
| "loss": 0.6195, |
| "mean_token_accuracy": 0.8087732195854187, |
| "num_tokens": 74709144.0, |
| "step": 1142 |
| }, |
| { |
| "epoch": 0.7803379416282642, |
| "grad_norm": 0.7289828658103943, |
| "learning_rate": 6.977885115342306e-07, |
| "loss": 0.5832, |
| "mean_token_accuracy": 0.8175250440835953, |
| "num_tokens": 74774680.0, |
| "step": 1143 |
| }, |
| { |
| "epoch": 0.7810206519883939, |
| "grad_norm": 0.752156674861908, |
| "learning_rate": 6.936605812848837e-07, |
| "loss": 0.6223, |
| "mean_token_accuracy": 0.8056115657091141, |
| "num_tokens": 74840216.0, |
| "step": 1144 |
| }, |
| { |
| "epoch": 0.7817033623485237, |
| "grad_norm": 0.7085697650909424, |
| "learning_rate": 6.895429294789402e-07, |
| "loss": 0.6034, |
| "mean_token_accuracy": 0.8111406415700912, |
| "num_tokens": 74905752.0, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.7823860727086533, |
| "grad_norm": 0.7335535287857056, |
| "learning_rate": 6.854355795466897e-07, |
| "loss": 0.5903, |
| "mean_token_accuracy": 0.8162474036216736, |
| "num_tokens": 74971229.0, |
| "step": 1146 |
| }, |
| { |
| "epoch": 0.783068783068783, |
| "grad_norm": 0.7267248034477234, |
| "learning_rate": 6.813385548597976e-07, |
| "loss": 0.6087, |
| "mean_token_accuracy": 0.8100256621837616, |
| "num_tokens": 75036765.0, |
| "step": 1147 |
| }, |
| { |
| "epoch": 0.7837514934289128, |
| "grad_norm": 0.7313956618309021, |
| "learning_rate": 6.772518787311804e-07, |
| "loss": 0.6383, |
| "mean_token_accuracy": 0.8019611537456512, |
| "num_tokens": 75102301.0, |
| "step": 1148 |
| }, |
| { |
| "epoch": 0.7844342037890425, |
| "grad_norm": 0.7354364395141602, |
| "learning_rate": 6.731755744148652e-07, |
| "loss": 0.582, |
| "mean_token_accuracy": 0.8179615139961243, |
| "num_tokens": 75167672.0, |
| "step": 1149 |
| }, |
| { |
| "epoch": 0.7851169141491722, |
| "grad_norm": 0.7073311805725098, |
| "learning_rate": 6.691096651058643e-07, |
| "loss": 0.6127, |
| "mean_token_accuracy": 0.8103054463863373, |
| "num_tokens": 75233122.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.785799624509302, |
| "grad_norm": 0.748970091342926, |
| "learning_rate": 6.650541739400393e-07, |
| "loss": 0.6182, |
| "mean_token_accuracy": 0.8087337911128998, |
| "num_tokens": 75298544.0, |
| "step": 1151 |
| }, |
| { |
| "epoch": 0.7864823348694316, |
| "grad_norm": 0.7270997166633606, |
| "learning_rate": 6.610091239939704e-07, |
| "loss": 0.5972, |
| "mean_token_accuracy": 0.8138899058103561, |
| "num_tokens": 75364080.0, |
| "step": 1152 |
| }, |
| { |
| "epoch": 0.7871650452295613, |
| "grad_norm": 0.7352398633956909, |
| "learning_rate": 6.569745382848236e-07, |
| "loss": 0.6171, |
| "mean_token_accuracy": 0.8078258484601974, |
| "num_tokens": 75429577.0, |
| "step": 1153 |
| }, |
| { |
| "epoch": 0.7878477555896911, |
| "grad_norm": 0.7277079224586487, |
| "learning_rate": 6.529504397702255e-07, |
| "loss": 0.5929, |
| "mean_token_accuracy": 0.8138288110494614, |
| "num_tokens": 75495113.0, |
| "step": 1154 |
| }, |
| { |
| "epoch": 0.7885304659498208, |
| "grad_norm": 0.7266401648521423, |
| "learning_rate": 6.489368513481228e-07, |
| "loss": 0.618, |
| "mean_token_accuracy": 0.8080567568540573, |
| "num_tokens": 75560634.0, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.7892131763099505, |
| "grad_norm": 0.7263786196708679, |
| "learning_rate": 6.449337958566623e-07, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.8064727634191513, |
| "num_tokens": 75626081.0, |
| "step": 1156 |
| }, |
| { |
| "epoch": 0.7898958866700803, |
| "grad_norm": 0.7240126132965088, |
| "learning_rate": 6.40941296074054e-07, |
| "loss": 0.6134, |
| "mean_token_accuracy": 0.8087421804666519, |
| "num_tokens": 75691529.0, |
| "step": 1157 |
| }, |
| { |
| "epoch": 0.7905785970302099, |
| "grad_norm": 0.6938052773475647, |
| "learning_rate": 6.369593747184438e-07, |
| "loss": 0.5664, |
| "mean_token_accuracy": 0.8221682459115982, |
| "num_tokens": 75757065.0, |
| "step": 1158 |
| }, |
| { |
| "epoch": 0.7912613073903396, |
| "grad_norm": 0.7303853631019592, |
| "learning_rate": 6.32988054447784e-07, |
| "loss": 0.5906, |
| "mean_token_accuracy": 0.8149492591619492, |
| "num_tokens": 75822541.0, |
| "step": 1159 |
| }, |
| { |
| "epoch": 0.7919440177504694, |
| "grad_norm": 0.6966801285743713, |
| "learning_rate": 6.29027357859707e-07, |
| "loss": 0.5667, |
| "mean_token_accuracy": 0.8210838139057159, |
| "num_tokens": 75888077.0, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.7926267281105991, |
| "grad_norm": 0.6900127530097961, |
| "learning_rate": 6.250773074913897e-07, |
| "loss": 0.5997, |
| "mean_token_accuracy": 0.8122250735759735, |
| "num_tokens": 75953613.0, |
| "step": 1161 |
| }, |
| { |
| "epoch": 0.7933094384707288, |
| "grad_norm": 0.7056220173835754, |
| "learning_rate": 6.211379258194342e-07, |
| "loss": 0.6128, |
| "mean_token_accuracy": 0.8070778399705887, |
| "num_tokens": 76019149.0, |
| "step": 1162 |
| }, |
| { |
| "epoch": 0.7939921488308586, |
| "grad_norm": 0.7348355650901794, |
| "learning_rate": 6.172092352597334e-07, |
| "loss": 0.6386, |
| "mean_token_accuracy": 0.8025385290384293, |
| "num_tokens": 76084565.0, |
| "step": 1163 |
| }, |
| { |
| "epoch": 0.7946748591909882, |
| "grad_norm": 0.7304251790046692, |
| "learning_rate": 6.132912581673456e-07, |
| "loss": 0.6003, |
| "mean_token_accuracy": 0.8130903542041779, |
| "num_tokens": 76149571.0, |
| "step": 1164 |
| }, |
| { |
| "epoch": 0.7953575695511179, |
| "grad_norm": 0.7153202295303345, |
| "learning_rate": 6.093840168363679e-07, |
| "loss": 0.6151, |
| "mean_token_accuracy": 0.8080400824546814, |
| "num_tokens": 76215107.0, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.7960402799112477, |
| "grad_norm": 0.724839985370636, |
| "learning_rate": 6.054875334998084e-07, |
| "loss": 0.6474, |
| "mean_token_accuracy": 0.799018919467926, |
| "num_tokens": 76279104.0, |
| "step": 1166 |
| }, |
| { |
| "epoch": 0.7967229902713774, |
| "grad_norm": 0.716566801071167, |
| "learning_rate": 6.016018303294588e-07, |
| "loss": 0.591, |
| "mean_token_accuracy": 0.814546674489975, |
| "num_tokens": 76344640.0, |
| "step": 1167 |
| }, |
| { |
| "epoch": 0.7974057006315071, |
| "grad_norm": 0.759793758392334, |
| "learning_rate": 5.977269294357724e-07, |
| "loss": 0.6073, |
| "mean_token_accuracy": 0.8116446733474731, |
| "num_tokens": 76410176.0, |
| "step": 1168 |
| }, |
| { |
| "epoch": 0.7980884109916367, |
| "grad_norm": 0.7540044188499451, |
| "learning_rate": 5.938628528677326e-07, |
| "loss": 0.569, |
| "mean_token_accuracy": 0.821023091673851, |
| "num_tokens": 76475010.0, |
| "step": 1169 |
| }, |
| { |
| "epoch": 0.7987711213517665, |
| "grad_norm": 0.6950900554656982, |
| "learning_rate": 5.900096226127314e-07, |
| "loss": 0.597, |
| "mean_token_accuracy": 0.8137371689081192, |
| "num_tokens": 76540546.0, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.7994538317118962, |
| "grad_norm": 0.681171178817749, |
| "learning_rate": 5.861672605964422e-07, |
| "loss": 0.5647, |
| "mean_token_accuracy": 0.8229624778032303, |
| "num_tokens": 76606082.0, |
| "step": 1171 |
| }, |
| { |
| "epoch": 0.8001365420720259, |
| "grad_norm": 0.6856206655502319, |
| "learning_rate": 5.823357886826972e-07, |
| "loss": 0.5742, |
| "mean_token_accuracy": 0.819632813334465, |
| "num_tokens": 76671618.0, |
| "step": 1172 |
| }, |
| { |
| "epoch": 0.8008192524321557, |
| "grad_norm": 0.7125565409660339, |
| "learning_rate": 5.7851522867336e-07, |
| "loss": 0.6056, |
| "mean_token_accuracy": 0.8092008829116821, |
| "num_tokens": 76737154.0, |
| "step": 1173 |
| }, |
| { |
| "epoch": 0.8015019627922854, |
| "grad_norm": 0.7257224917411804, |
| "learning_rate": 5.747056023082042e-07, |
| "loss": 0.6366, |
| "mean_token_accuracy": 0.8041041642427444, |
| "num_tokens": 76802088.0, |
| "step": 1174 |
| }, |
| { |
| "epoch": 0.802184673152415, |
| "grad_norm": 0.6963748931884766, |
| "learning_rate": 5.709069312647894e-07, |
| "loss": 0.6104, |
| "mean_token_accuracy": 0.8100867569446564, |
| "num_tokens": 76867624.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.8028673835125448, |
| "grad_norm": 0.764582633972168, |
| "learning_rate": 5.671192371583361e-07, |
| "loss": 0.6299, |
| "mean_token_accuracy": 0.8033950179815292, |
| "num_tokens": 76932121.0, |
| "step": 1176 |
| }, |
| { |
| "epoch": 0.8035500938726745, |
| "grad_norm": 0.7009971141815186, |
| "learning_rate": 5.633425415416038e-07, |
| "loss": 0.6341, |
| "mean_token_accuracy": 0.8024957329034805, |
| "num_tokens": 76997657.0, |
| "step": 1177 |
| }, |
| { |
| "epoch": 0.8042328042328042, |
| "grad_norm": 0.7152801156044006, |
| "learning_rate": 5.595768659047688e-07, |
| "loss": 0.619, |
| "mean_token_accuracy": 0.8068641424179077, |
| "num_tokens": 77062948.0, |
| "step": 1178 |
| }, |
| { |
| "epoch": 0.804915514592934, |
| "grad_norm": 0.7051480412483215, |
| "learning_rate": 5.558222316753009e-07, |
| "loss": 0.6329, |
| "mean_token_accuracy": 0.8018084168434143, |
| "num_tokens": 77128484.0, |
| "step": 1179 |
| }, |
| { |
| "epoch": 0.8055982249530637, |
| "grad_norm": 0.7400657534599304, |
| "learning_rate": 5.520786602178418e-07, |
| "loss": 0.6173, |
| "mean_token_accuracy": 0.8071723580360413, |
| "num_tokens": 77193405.0, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8062809353131933, |
| "grad_norm": 0.7040783762931824, |
| "learning_rate": 5.483461728340867e-07, |
| "loss": 0.5981, |
| "mean_token_accuracy": 0.8125204145908356, |
| "num_tokens": 77258112.0, |
| "step": 1181 |
| }, |
| { |
| "epoch": 0.8069636456733231, |
| "grad_norm": 0.6948963403701782, |
| "learning_rate": 5.446247907626543e-07, |
| "loss": 0.6115, |
| "mean_token_accuracy": 0.8102853149175644, |
| "num_tokens": 77323648.0, |
| "step": 1182 |
| }, |
| { |
| "epoch": 0.8076463560334528, |
| "grad_norm": 0.7158141732215881, |
| "learning_rate": 5.409145351789777e-07, |
| "loss": 0.6102, |
| "mean_token_accuracy": 0.8111711889505386, |
| "num_tokens": 77389184.0, |
| "step": 1183 |
| }, |
| { |
| "epoch": 0.8083290663935825, |
| "grad_norm": 0.7212420701980591, |
| "learning_rate": 5.372154271951746e-07, |
| "loss": 0.6006, |
| "mean_token_accuracy": 0.8151035904884338, |
| "num_tokens": 77454660.0, |
| "step": 1184 |
| }, |
| { |
| "epoch": 0.8090117767537123, |
| "grad_norm": 0.6939152479171753, |
| "learning_rate": 5.335274878599317e-07, |
| "loss": 0.5746, |
| "mean_token_accuracy": 0.8195730149745941, |
| "num_tokens": 77520141.0, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.809694487113842, |
| "grad_norm": 0.7088807225227356, |
| "learning_rate": 5.298507381583826e-07, |
| "loss": 0.6292, |
| "mean_token_accuracy": 0.805413007736206, |
| "num_tokens": 77585677.0, |
| "step": 1186 |
| }, |
| { |
| "epoch": 0.8103771974739716, |
| "grad_norm": 0.728660523891449, |
| "learning_rate": 5.261851990119926e-07, |
| "loss": 0.6153, |
| "mean_token_accuracy": 0.8086510300636292, |
| "num_tokens": 77651213.0, |
| "step": 1187 |
| }, |
| { |
| "epoch": 0.8110599078341014, |
| "grad_norm": 0.6933480501174927, |
| "learning_rate": 5.225308912784321e-07, |
| "loss": 0.5616, |
| "mean_token_accuracy": 0.8231625705957413, |
| "num_tokens": 77716478.0, |
| "step": 1188 |
| }, |
| { |
| "epoch": 0.8117426181942311, |
| "grad_norm": 0.7177363634109497, |
| "learning_rate": 5.188878357514668e-07, |
| "loss": 0.6168, |
| "mean_token_accuracy": 0.8091703355312347, |
| "num_tokens": 77782014.0, |
| "step": 1189 |
| }, |
| { |
| "epoch": 0.8124253285543608, |
| "grad_norm": 0.7240033149719238, |
| "learning_rate": 5.152560531608325e-07, |
| "loss": 0.608, |
| "mean_token_accuracy": 0.8108504414558411, |
| "num_tokens": 77847550.0, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.8131080389144906, |
| "grad_norm": 0.7297149896621704, |
| "learning_rate": 5.116355641721202e-07, |
| "loss": 0.5873, |
| "mean_token_accuracy": 0.8166238963603973, |
| "num_tokens": 77913086.0, |
| "step": 1191 |
| }, |
| { |
| "epoch": 0.8137907492746203, |
| "grad_norm": 0.7229415774345398, |
| "learning_rate": 5.080263893866572e-07, |
| "loss": 0.587, |
| "mean_token_accuracy": 0.8156934976577759, |
| "num_tokens": 77978469.0, |
| "step": 1192 |
| }, |
| { |
| "epoch": 0.8144734596347499, |
| "grad_norm": 0.6919569373130798, |
| "learning_rate": 5.044285493413942e-07, |
| "loss": 0.5878, |
| "mean_token_accuracy": 0.8175217360258102, |
| "num_tokens": 78043943.0, |
| "step": 1193 |
| }, |
| { |
| "epoch": 0.8151561699948797, |
| "grad_norm": 0.7613787055015564, |
| "learning_rate": 5.008420645087785e-07, |
| "loss": 0.6068, |
| "mean_token_accuracy": 0.8115917444229126, |
| "num_tokens": 78109340.0, |
| "step": 1194 |
| }, |
| { |
| "epoch": 0.8158388803550094, |
| "grad_norm": 0.7041038870811462, |
| "learning_rate": 4.972669552966508e-07, |
| "loss": 0.5735, |
| "mean_token_accuracy": 0.8211907297372818, |
| "num_tokens": 78174876.0, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.8165215907151391, |
| "grad_norm": 0.7106341123580933, |
| "learning_rate": 4.937032420481169e-07, |
| "loss": 0.5736, |
| "mean_token_accuracy": 0.8204770088195801, |
| "num_tokens": 78240208.0, |
| "step": 1196 |
| }, |
| { |
| "epoch": 0.8172043010752689, |
| "grad_norm": 0.7360470294952393, |
| "learning_rate": 4.901509450414418e-07, |
| "loss": 0.5837, |
| "mean_token_accuracy": 0.8159824013710022, |
| "num_tokens": 78305744.0, |
| "step": 1197 |
| }, |
| { |
| "epoch": 0.8178870114353985, |
| "grad_norm": 0.6911918520927429, |
| "learning_rate": 4.86610084489924e-07, |
| "loss": 0.6177, |
| "mean_token_accuracy": 0.8082218617200851, |
| "num_tokens": 78371143.0, |
| "step": 1198 |
| }, |
| { |
| "epoch": 0.8185697217955282, |
| "grad_norm": 0.7414296269416809, |
| "learning_rate": 4.83080680541792e-07, |
| "loss": 0.5975, |
| "mean_token_accuracy": 0.8125152736902237, |
| "num_tokens": 78436679.0, |
| "step": 1199 |
| }, |
| { |
| "epoch": 0.819252432155658, |
| "grad_norm": 0.7289243936538696, |
| "learning_rate": 4.795627532800806e-07, |
| "loss": 0.6296, |
| "mean_token_accuracy": 0.8031218498945236, |
| "num_tokens": 78502009.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8199351425157877, |
| "grad_norm": 0.7345913052558899, |
| "learning_rate": 4.760563227225204e-07, |
| "loss": 0.6461, |
| "mean_token_accuracy": 0.7988996803760529, |
| "num_tokens": 78567526.0, |
| "step": 1201 |
| }, |
| { |
| "epoch": 0.8206178528759174, |
| "grad_norm": 0.7304643988609314, |
| "learning_rate": 4.725614088214231e-07, |
| "loss": 0.6372, |
| "mean_token_accuracy": 0.802532970905304, |
| "num_tokens": 78632602.0, |
| "step": 1202 |
| }, |
| { |
| "epoch": 0.821300563236047, |
| "grad_norm": 0.711274266242981, |
| "learning_rate": 4.690780314635704e-07, |
| "loss": 0.598, |
| "mean_token_accuracy": 0.8129887580871582, |
| "num_tokens": 78698138.0, |
| "step": 1203 |
| }, |
| { |
| "epoch": 0.8219832735961768, |
| "grad_norm": 0.7121052145957947, |
| "learning_rate": 4.656062104700951e-07, |
| "loss": 0.5784, |
| "mean_token_accuracy": 0.8166697174310684, |
| "num_tokens": 78763674.0, |
| "step": 1204 |
| }, |
| { |
| "epoch": 0.8226659839563065, |
| "grad_norm": 0.7592654228210449, |
| "learning_rate": 4.621459655963753e-07, |
| "loss": 0.6325, |
| "mean_token_accuracy": 0.8049637675285339, |
| "num_tokens": 78828816.0, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.8233486943164362, |
| "grad_norm": 0.7198703289031982, |
| "learning_rate": 4.586973165319164e-07, |
| "loss": 0.5935, |
| "mean_token_accuracy": 0.8148216009140015, |
| "num_tokens": 78894352.0, |
| "step": 1206 |
| }, |
| { |
| "epoch": 0.824031404676566, |
| "grad_norm": 0.7096201181411743, |
| "learning_rate": 4.5526028290024253e-07, |
| "loss": 0.5818, |
| "mean_token_accuracy": 0.8182827532291412, |
| "num_tokens": 78959669.0, |
| "step": 1207 |
| }, |
| { |
| "epoch": 0.8247141150366957, |
| "grad_norm": 0.7286295890808105, |
| "learning_rate": 4.518348842587822e-07, |
| "loss": 0.5924, |
| "mean_token_accuracy": 0.8151398599147797, |
| "num_tokens": 79024817.0, |
| "step": 1208 |
| }, |
| { |
| "epoch": 0.8253968253968254, |
| "grad_norm": 0.7021476030349731, |
| "learning_rate": 4.484211400987612e-07, |
| "loss": 0.5974, |
| "mean_token_accuracy": 0.812148705124855, |
| "num_tokens": 79090353.0, |
| "step": 1209 |
| }, |
| { |
| "epoch": 0.8260795357569551, |
| "grad_norm": 0.7140706181526184, |
| "learning_rate": 4.450190698450843e-07, |
| "loss": 0.567, |
| "mean_token_accuracy": 0.8210151493549347, |
| "num_tokens": 79155623.0, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8267622461170848, |
| "grad_norm": 0.736964762210846, |
| "learning_rate": 4.416286928562344e-07, |
| "loss": 0.6204, |
| "mean_token_accuracy": 0.8086510300636292, |
| "num_tokens": 79221159.0, |
| "step": 1211 |
| }, |
| { |
| "epoch": 0.8274449564772145, |
| "grad_norm": 0.6984681487083435, |
| "learning_rate": 4.3825002842415423e-07, |
| "loss": 0.6231, |
| "mean_token_accuracy": 0.8068502992391586, |
| "num_tokens": 79286680.0, |
| "step": 1212 |
| }, |
| { |
| "epoch": 0.8281276668373443, |
| "grad_norm": 0.7289581298828125, |
| "learning_rate": 4.3488309577414014e-07, |
| "loss": 0.6099, |
| "mean_token_accuracy": 0.8104861825704575, |
| "num_tokens": 79352167.0, |
| "step": 1213 |
| }, |
| { |
| "epoch": 0.828810377197474, |
| "grad_norm": 0.7081075310707092, |
| "learning_rate": 4.31527914064733e-07, |
| "loss": 0.568, |
| "mean_token_accuracy": 0.8197198361158371, |
| "num_tokens": 79417627.0, |
| "step": 1214 |
| }, |
| { |
| "epoch": 0.8294930875576036, |
| "grad_norm": 0.6993555426597595, |
| "learning_rate": 4.2818450238760745e-07, |
| "loss": 0.5891, |
| "mean_token_accuracy": 0.8168835490942001, |
| "num_tokens": 79483163.0, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.8301757979177334, |
| "grad_norm": 0.725541889667511, |
| "learning_rate": 4.248528797674645e-07, |
| "loss": 0.5958, |
| "mean_token_accuracy": 0.8150507062673569, |
| "num_tokens": 79548699.0, |
| "step": 1216 |
| }, |
| { |
| "epoch": 0.8308585082778631, |
| "grad_norm": 0.7068691849708557, |
| "learning_rate": 4.21533065161924e-07, |
| "loss": 0.5988, |
| "mean_token_accuracy": 0.8137481957674026, |
| "num_tokens": 79613306.0, |
| "step": 1217 |
| }, |
| { |
| "epoch": 0.8315412186379928, |
| "grad_norm": 0.709589958190918, |
| "learning_rate": 4.182250774614144e-07, |
| "loss": 0.587, |
| "mean_token_accuracy": 0.815237820148468, |
| "num_tokens": 79678725.0, |
| "step": 1218 |
| }, |
| { |
| "epoch": 0.8322239289981226, |
| "grad_norm": 0.7335296273231506, |
| "learning_rate": 4.149289354890676e-07, |
| "loss": 0.6533, |
| "mean_token_accuracy": 0.7973637729883194, |
| "num_tokens": 79744261.0, |
| "step": 1219 |
| }, |
| { |
| "epoch": 0.8329066393582523, |
| "grad_norm": 0.713219165802002, |
| "learning_rate": 4.1164465800060977e-07, |
| "loss": 0.6041, |
| "mean_token_accuracy": 0.8116752207279205, |
| "num_tokens": 79809797.0, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.833589349718382, |
| "grad_norm": 0.6817585825920105, |
| "learning_rate": 4.0837226368425714e-07, |
| "loss": 0.5986, |
| "mean_token_accuracy": 0.813984677195549, |
| "num_tokens": 79874634.0, |
| "step": 1221 |
| }, |
| { |
| "epoch": 0.8342720600785117, |
| "grad_norm": 0.7115651369094849, |
| "learning_rate": 4.0511177116060674e-07, |
| "loss": 0.6543, |
| "mean_token_accuracy": 0.7982038259506226, |
| "num_tokens": 79940170.0, |
| "step": 1222 |
| }, |
| { |
| "epoch": 0.8349547704386414, |
| "grad_norm": 0.7281413674354553, |
| "learning_rate": 4.018631989825328e-07, |
| "loss": 0.6427, |
| "mean_token_accuracy": 0.7999297529459, |
| "num_tokens": 80005706.0, |
| "step": 1223 |
| }, |
| { |
| "epoch": 0.8356374807987711, |
| "grad_norm": 0.7227478623390198, |
| "learning_rate": 3.98626565635081e-07, |
| "loss": 0.5918, |
| "mean_token_accuracy": 0.8124083578586578, |
| "num_tokens": 80071242.0, |
| "step": 1224 |
| }, |
| { |
| "epoch": 0.8363201911589009, |
| "grad_norm": 0.6979718804359436, |
| "learning_rate": 3.954018895353615e-07, |
| "loss": 0.5896, |
| "mean_token_accuracy": 0.8178916126489639, |
| "num_tokens": 80136778.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.8370029015190306, |
| "grad_norm": 0.6697226166725159, |
| "learning_rate": 3.92189189032445e-07, |
| "loss": 0.5544, |
| "mean_token_accuracy": 0.8262005001306534, |
| "num_tokens": 80202314.0, |
| "step": 1226 |
| }, |
| { |
| "epoch": 0.8376856118791602, |
| "grad_norm": 0.6812226176261902, |
| "learning_rate": 3.889884824072601e-07, |
| "loss": 0.568, |
| "mean_token_accuracy": 0.8224816024303436, |
| "num_tokens": 80267756.0, |
| "step": 1227 |
| }, |
| { |
| "epoch": 0.83836832223929, |
| "grad_norm": 0.7076123356819153, |
| "learning_rate": 3.857997878724862e-07, |
| "loss": 0.593, |
| "mean_token_accuracy": 0.8150422126054764, |
| "num_tokens": 80332455.0, |
| "step": 1228 |
| }, |
| { |
| "epoch": 0.8390510325994197, |
| "grad_norm": 0.7348125576972961, |
| "learning_rate": 3.8262312357245173e-07, |
| "loss": 0.6193, |
| "mean_token_accuracy": 0.8065432608127594, |
| "num_tokens": 80397991.0, |
| "step": 1229 |
| }, |
| { |
| "epoch": 0.8397337429595494, |
| "grad_norm": 0.7341455817222595, |
| "learning_rate": 3.7945850758303286e-07, |
| "loss": 0.6064, |
| "mean_token_accuracy": 0.8103394955396652, |
| "num_tokens": 80463054.0, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.8404164533196792, |
| "grad_norm": 0.7043530344963074, |
| "learning_rate": 3.7630595791154383e-07, |
| "loss": 0.5764, |
| "mean_token_accuracy": 0.8203535825014114, |
| "num_tokens": 80528495.0, |
| "step": 1231 |
| }, |
| { |
| "epoch": 0.8410991636798089, |
| "grad_norm": 0.7355886697769165, |
| "learning_rate": 3.7316549249664353e-07, |
| "loss": 0.5983, |
| "mean_token_accuracy": 0.811448335647583, |
| "num_tokens": 80593906.0, |
| "step": 1232 |
| }, |
| { |
| "epoch": 0.8417818740399385, |
| "grad_norm": 0.70097815990448, |
| "learning_rate": 3.7003712920822665e-07, |
| "loss": 0.6176, |
| "mean_token_accuracy": 0.8075818717479706, |
| "num_tokens": 80659442.0, |
| "step": 1233 |
| }, |
| { |
| "epoch": 0.8424645844000682, |
| "grad_norm": 0.7031533718109131, |
| "learning_rate": 3.6692088584732474e-07, |
| "loss": 0.5966, |
| "mean_token_accuracy": 0.8145619481801987, |
| "num_tokens": 80724978.0, |
| "step": 1234 |
| }, |
| { |
| "epoch": 0.843147294760198, |
| "grad_norm": 0.7055192589759827, |
| "learning_rate": 3.638167801460041e-07, |
| "loss": 0.6101, |
| "mean_token_accuracy": 0.810369685292244, |
| "num_tokens": 80790332.0, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.8438300051203277, |
| "grad_norm": 0.7216753363609314, |
| "learning_rate": 3.607248297672669e-07, |
| "loss": 0.6273, |
| "mean_token_accuracy": 0.8048436045646667, |
| "num_tokens": 80855669.0, |
| "step": 1236 |
| }, |
| { |
| "epoch": 0.8445127154804574, |
| "grad_norm": 0.7097287178039551, |
| "learning_rate": 3.576450523049457e-07, |
| "loss": 0.5449, |
| "mean_token_accuracy": 0.8278195112943649, |
| "num_tokens": 80921205.0, |
| "step": 1237 |
| }, |
| { |
| "epoch": 0.8451954258405872, |
| "grad_norm": 0.7152010202407837, |
| "learning_rate": 3.5457746528361035e-07, |
| "loss": 0.6267, |
| "mean_token_accuracy": 0.8040536344051361, |
| "num_tokens": 80986741.0, |
| "step": 1238 |
| }, |
| { |
| "epoch": 0.8458781362007168, |
| "grad_norm": 0.715424120426178, |
| "learning_rate": 3.5152208615846256e-07, |
| "loss": 0.6069, |
| "mean_token_accuracy": 0.8103158622980118, |
| "num_tokens": 81052277.0, |
| "step": 1239 |
| }, |
| { |
| "epoch": 0.8465608465608465, |
| "grad_norm": 0.7013223171234131, |
| "learning_rate": 3.4847893231523877e-07, |
| "loss": 0.5815, |
| "mean_token_accuracy": 0.8173014968633652, |
| "num_tokens": 81117665.0, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.8472435569209763, |
| "grad_norm": 0.7214623689651489, |
| "learning_rate": 3.454480210701108e-07, |
| "loss": 0.5866, |
| "mean_token_accuracy": 0.8163795173168182, |
| "num_tokens": 81183201.0, |
| "step": 1241 |
| }, |
| { |
| "epoch": 0.847926267281106, |
| "grad_norm": 0.7023662328720093, |
| "learning_rate": 3.424293696695896e-07, |
| "loss": 0.6043, |
| "mean_token_accuracy": 0.8114422112703323, |
| "num_tokens": 81248226.0, |
| "step": 1242 |
| }, |
| { |
| "epoch": 0.8486089776412357, |
| "grad_norm": 0.6930263042449951, |
| "learning_rate": 3.3942299529042157e-07, |
| "loss": 0.6071, |
| "mean_token_accuracy": 0.8101478517055511, |
| "num_tokens": 81313762.0, |
| "step": 1243 |
| }, |
| { |
| "epoch": 0.8492916880013655, |
| "grad_norm": 0.7055590152740479, |
| "learning_rate": 3.3642891503949724e-07, |
| "loss": 0.5873, |
| "mean_token_accuracy": 0.8138440847396851, |
| "num_tokens": 81379298.0, |
| "step": 1244 |
| }, |
| { |
| "epoch": 0.8499743983614951, |
| "grad_norm": 0.7280580997467041, |
| "learning_rate": 3.334471459537497e-07, |
| "loss": 0.6059, |
| "mean_token_accuracy": 0.8118687719106674, |
| "num_tokens": 81444583.0, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.8506571087216248, |
| "grad_norm": 0.6961165070533752, |
| "learning_rate": 3.304777050000604e-07, |
| "loss": 0.6349, |
| "mean_token_accuracy": 0.8040469288825989, |
| "num_tokens": 81510106.0, |
| "step": 1246 |
| }, |
| { |
| "epoch": 0.8513398190817546, |
| "grad_norm": 0.7137936949729919, |
| "learning_rate": 3.2752060907515813e-07, |
| "loss": 0.5937, |
| "mean_token_accuracy": 0.8142594546079636, |
| "num_tokens": 81575560.0, |
| "step": 1247 |
| }, |
| { |
| "epoch": 0.8520225294418843, |
| "grad_norm": 0.6947458982467651, |
| "learning_rate": 3.2457587500552946e-07, |
| "loss": 0.5696, |
| "mean_token_accuracy": 0.8206253200769424, |
| "num_tokens": 81640613.0, |
| "step": 1248 |
| }, |
| { |
| "epoch": 0.852705239802014, |
| "grad_norm": 0.7190272808074951, |
| "learning_rate": 3.216435195473175e-07, |
| "loss": 0.599, |
| "mean_token_accuracy": 0.8117821365594864, |
| "num_tokens": 81706149.0, |
| "step": 1249 |
| }, |
| { |
| "epoch": 0.8533879501621437, |
| "grad_norm": 0.7290658354759216, |
| "learning_rate": 3.187235593862284e-07, |
| "loss": 0.6001, |
| "mean_token_accuracy": 0.8131567686796188, |
| "num_tokens": 81771685.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8540706605222734, |
| "grad_norm": 0.7395808696746826, |
| "learning_rate": 3.15816011137437e-07, |
| "loss": 0.6373, |
| "mean_token_accuracy": 0.8025502562522888, |
| "num_tokens": 81837208.0, |
| "step": 1251 |
| }, |
| { |
| "epoch": 0.8547533708824031, |
| "grad_norm": 0.7159065008163452, |
| "learning_rate": 3.129208913454931e-07, |
| "loss": 0.6143, |
| "mean_token_accuracy": 0.8089412301778793, |
| "num_tokens": 81902744.0, |
| "step": 1252 |
| }, |
| { |
| "epoch": 0.8554360812425329, |
| "grad_norm": 0.6992605328559875, |
| "learning_rate": 3.1003821648422277e-07, |
| "loss": 0.5936, |
| "mean_token_accuracy": 0.8155394643545151, |
| "num_tokens": 81968280.0, |
| "step": 1253 |
| }, |
| { |
| "epoch": 0.8561187916026626, |
| "grad_norm": 0.7298917770385742, |
| "learning_rate": 3.071680029566415e-07, |
| "loss": 0.5907, |
| "mean_token_accuracy": 0.8136913478374481, |
| "num_tokens": 82033816.0, |
| "step": 1254 |
| }, |
| { |
| "epoch": 0.8568015019627923, |
| "grad_norm": 0.7134488224983215, |
| "learning_rate": 3.043102670948545e-07, |
| "loss": 0.6234, |
| "mean_token_accuracy": 0.8073708415031433, |
| "num_tokens": 82099329.0, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.857484212322922, |
| "grad_norm": 0.7187513113021851, |
| "learning_rate": 3.0146502515996796e-07, |
| "loss": 0.6533, |
| "mean_token_accuracy": 0.7979899793863297, |
| "num_tokens": 82164865.0, |
| "step": 1256 |
| }, |
| { |
| "epoch": 0.8581669226830517, |
| "grad_norm": 0.6844106316566467, |
| "learning_rate": 2.9863229334199413e-07, |
| "loss": 0.5732, |
| "mean_token_accuracy": 0.8199816644191742, |
| "num_tokens": 82230373.0, |
| "step": 1257 |
| }, |
| { |
| "epoch": 0.8588496330431814, |
| "grad_norm": 0.7100004553794861, |
| "learning_rate": 2.958120877597617e-07, |
| "loss": 0.6157, |
| "mean_token_accuracy": 0.8082807064056396, |
| "num_tokens": 82295451.0, |
| "step": 1258 |
| }, |
| { |
| "epoch": 0.8595323434033112, |
| "grad_norm": 0.7535333633422852, |
| "learning_rate": 2.930044244608199e-07, |
| "loss": 0.6065, |
| "mean_token_accuracy": 0.8120112419128418, |
| "num_tokens": 82360987.0, |
| "step": 1259 |
| }, |
| { |
| "epoch": 0.8602150537634409, |
| "grad_norm": 0.7360159158706665, |
| "learning_rate": 2.902093194213526e-07, |
| "loss": 0.6027, |
| "mean_token_accuracy": 0.8140120953321457, |
| "num_tokens": 82426523.0, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.8608977641235706, |
| "grad_norm": 0.7043842673301697, |
| "learning_rate": 2.874267885460827e-07, |
| "loss": 0.6022, |
| "mean_token_accuracy": 0.812301442027092, |
| "num_tokens": 82492059.0, |
| "step": 1261 |
| }, |
| { |
| "epoch": 0.8615804744837003, |
| "grad_norm": 0.692662239074707, |
| "learning_rate": 2.8465684766818406e-07, |
| "loss": 0.5733, |
| "mean_token_accuracy": 0.8192416429519653, |
| "num_tokens": 82556988.0, |
| "step": 1262 |
| }, |
| { |
| "epoch": 0.86226318484383, |
| "grad_norm": 0.6765474081039429, |
| "learning_rate": 2.8189951254919105e-07, |
| "loss": 0.5842, |
| "mean_token_accuracy": 0.8190467804670334, |
| "num_tokens": 82622241.0, |
| "step": 1263 |
| }, |
| { |
| "epoch": 0.8629458952039597, |
| "grad_norm": 0.7088792324066162, |
| "learning_rate": 2.791547988789087e-07, |
| "loss": 0.6155, |
| "mean_token_accuracy": 0.8090481460094452, |
| "num_tokens": 82687777.0, |
| "step": 1264 |
| }, |
| { |
| "epoch": 0.8636286055640894, |
| "grad_norm": 0.6913022398948669, |
| "learning_rate": 2.7642272227532214e-07, |
| "loss": 0.5895, |
| "mean_token_accuracy": 0.8141342848539352, |
| "num_tokens": 82753313.0, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.8643113159242192, |
| "grad_norm": 0.7497997283935547, |
| "learning_rate": 2.737032982845114e-07, |
| "loss": 0.6165, |
| "mean_token_accuracy": 0.8086052089929581, |
| "num_tokens": 82818849.0, |
| "step": 1266 |
| }, |
| { |
| "epoch": 0.8649940262843488, |
| "grad_norm": 0.7286543250083923, |
| "learning_rate": 2.7099654238055886e-07, |
| "loss": 0.6268, |
| "mean_token_accuracy": 0.8044813126325607, |
| "num_tokens": 82884385.0, |
| "step": 1267 |
| }, |
| { |
| "epoch": 0.8656767366444785, |
| "grad_norm": 0.7021951079368591, |
| "learning_rate": 2.683024699654629e-07, |
| "loss": 0.5923, |
| "mean_token_accuracy": 0.8144855797290802, |
| "num_tokens": 82949921.0, |
| "step": 1268 |
| }, |
| { |
| "epoch": 0.8663594470046083, |
| "grad_norm": 0.7155722975730896, |
| "learning_rate": 2.6562109636905085e-07, |
| "loss": 0.6121, |
| "mean_token_accuracy": 0.8079178929328918, |
| "num_tokens": 83015457.0, |
| "step": 1269 |
| }, |
| { |
| "epoch": 0.867042157364738, |
| "grad_norm": 0.7293785810470581, |
| "learning_rate": 2.629524368488906e-07, |
| "loss": 0.6081, |
| "mean_token_accuracy": 0.8101783990859985, |
| "num_tokens": 83080993.0, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.8677248677248677, |
| "grad_norm": 0.7298372983932495, |
| "learning_rate": 2.6029650659020467e-07, |
| "loss": 0.6082, |
| "mean_token_accuracy": 0.8083854168653488, |
| "num_tokens": 83146351.0, |
| "step": 1271 |
| }, |
| { |
| "epoch": 0.8684075780849975, |
| "grad_norm": 0.7006279826164246, |
| "learning_rate": 2.5765332070578296e-07, |
| "loss": 0.6112, |
| "mean_token_accuracy": 0.8127480298280716, |
| "num_tokens": 83211735.0, |
| "step": 1272 |
| }, |
| { |
| "epoch": 0.8690902884451271, |
| "grad_norm": 0.7038744688034058, |
| "learning_rate": 2.5502289423589844e-07, |
| "loss": 0.591, |
| "mean_token_accuracy": 0.8158559948205948, |
| "num_tokens": 83277014.0, |
| "step": 1273 |
| }, |
| { |
| "epoch": 0.8697729988052568, |
| "grad_norm": 0.7346209287643433, |
| "learning_rate": 2.52405242148219e-07, |
| "loss": 0.6247, |
| "mean_token_accuracy": 0.8049071878194809, |
| "num_tokens": 83342354.0, |
| "step": 1274 |
| }, |
| { |
| "epoch": 0.8704557091653866, |
| "grad_norm": 0.7262876629829407, |
| "learning_rate": 2.4980037933772487e-07, |
| "loss": 0.5764, |
| "mean_token_accuracy": 0.8189149498939514, |
| "num_tokens": 83407890.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.8711384195255163, |
| "grad_norm": 0.7252150177955627, |
| "learning_rate": 2.472083206266215e-07, |
| "loss": 0.6541, |
| "mean_token_accuracy": 0.7969246655702591, |
| "num_tokens": 83473156.0, |
| "step": 1276 |
| }, |
| { |
| "epoch": 0.871821129885646, |
| "grad_norm": 0.7076960206031799, |
| "learning_rate": 2.4462908076425706e-07, |
| "loss": 0.5983, |
| "mean_token_accuracy": 0.8119195997714996, |
| "num_tokens": 83538692.0, |
| "step": 1277 |
| }, |
| { |
| "epoch": 0.8725038402457758, |
| "grad_norm": 0.741290807723999, |
| "learning_rate": 2.4206267442703743e-07, |
| "loss": 0.5495, |
| "mean_token_accuracy": 0.8270227015018463, |
| "num_tokens": 83604067.0, |
| "step": 1278 |
| }, |
| { |
| "epoch": 0.8731865506059054, |
| "grad_norm": 0.6800980567932129, |
| "learning_rate": 2.3950911621834437e-07, |
| "loss": 0.5726, |
| "mean_token_accuracy": 0.8190676867961884, |
| "num_tokens": 83669603.0, |
| "step": 1279 |
| }, |
| { |
| "epoch": 0.8738692609660351, |
| "grad_norm": 0.6879988312721252, |
| "learning_rate": 2.3696842066844862e-07, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.8156311064958572, |
| "num_tokens": 83735139.0, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.8745519713261649, |
| "grad_norm": 0.7209987044334412, |
| "learning_rate": 2.3444060223443226e-07, |
| "loss": 0.5755, |
| "mean_token_accuracy": 0.818487286567688, |
| "num_tokens": 83800675.0, |
| "step": 1281 |
| }, |
| { |
| "epoch": 0.8752346816862946, |
| "grad_norm": 0.7542129755020142, |
| "learning_rate": 2.3192567530010313e-07, |
| "loss": 0.6229, |
| "mean_token_accuracy": 0.8044636845588684, |
| "num_tokens": 83866196.0, |
| "step": 1282 |
| }, |
| { |
| "epoch": 0.8759173920464243, |
| "grad_norm": 0.7246004939079285, |
| "learning_rate": 2.2942365417591288e-07, |
| "loss": 0.6183, |
| "mean_token_accuracy": 0.8091397881507874, |
| "num_tokens": 83931732.0, |
| "step": 1283 |
| }, |
| { |
| "epoch": 0.8766001024065541, |
| "grad_norm": 0.6989543437957764, |
| "learning_rate": 2.2693455309887702e-07, |
| "loss": 0.5791, |
| "mean_token_accuracy": 0.8160740435123444, |
| "num_tokens": 83997268.0, |
| "step": 1284 |
| }, |
| { |
| "epoch": 0.8772828127666837, |
| "grad_norm": 0.7213975787162781, |
| "learning_rate": 2.2445838623249478e-07, |
| "loss": 0.5927, |
| "mean_token_accuracy": 0.8150552809238434, |
| "num_tokens": 84062653.0, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.8779655231268134, |
| "grad_norm": 0.7489351630210876, |
| "learning_rate": 2.2199516766666373e-07, |
| "loss": 0.6014, |
| "mean_token_accuracy": 0.812301442027092, |
| "num_tokens": 84128189.0, |
| "step": 1286 |
| }, |
| { |
| "epoch": 0.8786482334869432, |
| "grad_norm": 0.7406776547431946, |
| "learning_rate": 2.1954491141760653e-07, |
| "loss": 0.5933, |
| "mean_token_accuracy": 0.8135691583156586, |
| "num_tokens": 84193725.0, |
| "step": 1287 |
| }, |
| { |
| "epoch": 0.8793309438470729, |
| "grad_norm": 0.7083876729011536, |
| "learning_rate": 2.1710763142778562e-07, |
| "loss": 0.5823, |
| "mean_token_accuracy": 0.8167155385017395, |
| "num_tokens": 84259261.0, |
| "step": 1288 |
| }, |
| { |
| "epoch": 0.8800136542072026, |
| "grad_norm": 0.7141042947769165, |
| "learning_rate": 2.1468334156582588e-07, |
| "loss": 0.6013, |
| "mean_token_accuracy": 0.8128054738044739, |
| "num_tokens": 84324797.0, |
| "step": 1289 |
| }, |
| { |
| "epoch": 0.8806963645673324, |
| "grad_norm": 0.6838583946228027, |
| "learning_rate": 2.122720556264357e-07, |
| "loss": 0.5893, |
| "mean_token_accuracy": 0.811652198433876, |
| "num_tokens": 84390053.0, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.881379074927462, |
| "grad_norm": 0.7378459572792053, |
| "learning_rate": 2.0987378733033053e-07, |
| "loss": 0.6419, |
| "mean_token_accuracy": 0.8003421276807785, |
| "num_tokens": 84455589.0, |
| "step": 1291 |
| }, |
| { |
| "epoch": 0.8820617852875917, |
| "grad_norm": 0.7026259899139404, |
| "learning_rate": 2.074885503241486e-07, |
| "loss": 0.5948, |
| "mean_token_accuracy": 0.8153750449419022, |
| "num_tokens": 84521076.0, |
| "step": 1292 |
| }, |
| { |
| "epoch": 0.8827444956477215, |
| "grad_norm": 0.7418661713600159, |
| "learning_rate": 2.0511635818038167e-07, |
| "loss": 0.5942, |
| "mean_token_accuracy": 0.8132331371307373, |
| "num_tokens": 84586612.0, |
| "step": 1293 |
| }, |
| { |
| "epoch": 0.8834272060078512, |
| "grad_norm": 0.7378196120262146, |
| "learning_rate": 2.0275722439729084e-07, |
| "loss": 0.6204, |
| "mean_token_accuracy": 0.8063240945339203, |
| "num_tokens": 84651365.0, |
| "step": 1294 |
| }, |
| { |
| "epoch": 0.8841099163679809, |
| "grad_norm": 0.7377257943153381, |
| "learning_rate": 2.0041116239883418e-07, |
| "loss": 0.6248, |
| "mean_token_accuracy": 0.8058953881263733, |
| "num_tokens": 84716615.0, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.8847926267281107, |
| "grad_norm": 0.7164666056632996, |
| "learning_rate": 1.9807818553458647e-07, |
| "loss": 0.6007, |
| "mean_token_accuracy": 0.8099205642938614, |
| "num_tokens": 84781744.0, |
| "step": 1296 |
| }, |
| { |
| "epoch": 0.8854753370882403, |
| "grad_norm": 0.6761177182197571, |
| "learning_rate": 1.9575830707966787e-07, |
| "loss": 0.5384, |
| "mean_token_accuracy": 0.8291252255439758, |
| "num_tokens": 84846213.0, |
| "step": 1297 |
| }, |
| { |
| "epoch": 0.88615804744837, |
| "grad_norm": 0.6863158941268921, |
| "learning_rate": 1.93451540234664e-07, |
| "loss": 0.5811, |
| "mean_token_accuracy": 0.8185331076383591, |
| "num_tokens": 84911749.0, |
| "step": 1298 |
| }, |
| { |
| "epoch": 0.8868407578084997, |
| "grad_norm": 0.7062719464302063, |
| "learning_rate": 1.9115789812555379e-07, |
| "loss": 0.5811, |
| "mean_token_accuracy": 0.8158602118492126, |
| "num_tokens": 84977285.0, |
| "step": 1299 |
| }, |
| { |
| "epoch": 0.8875234681686295, |
| "grad_norm": 0.7036576867103577, |
| "learning_rate": 1.8887739380363286e-07, |
| "loss": 0.5893, |
| "mean_token_accuracy": 0.8168071806430817, |
| "num_tokens": 85042821.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.8882061785287592, |
| "grad_norm": 0.6974568963050842, |
| "learning_rate": 1.8661004024544155e-07, |
| "loss": 0.584, |
| "mean_token_accuracy": 0.8170604258775711, |
| "num_tokens": 85107913.0, |
| "step": 1301 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.7532427310943604, |
| "learning_rate": 1.843558503526871e-07, |
| "loss": 0.6289, |
| "mean_token_accuracy": 0.803327277302742, |
| "num_tokens": 85173223.0, |
| "step": 1302 |
| }, |
| { |
| "epoch": 0.8895715992490186, |
| "grad_norm": 0.693387508392334, |
| "learning_rate": 1.8211483695217607e-07, |
| "loss": 0.5964, |
| "mean_token_accuracy": 0.8128963261842728, |
| "num_tokens": 85238459.0, |
| "step": 1303 |
| }, |
| { |
| "epoch": 0.8902543096091483, |
| "grad_norm": 0.722933828830719, |
| "learning_rate": 1.7988701279573527e-07, |
| "loss": 0.6283, |
| "mean_token_accuracy": 0.8051991760730743, |
| "num_tokens": 85303995.0, |
| "step": 1304 |
| }, |
| { |
| "epoch": 0.890937019969278, |
| "grad_norm": 0.6916481852531433, |
| "learning_rate": 1.776723905601438e-07, |
| "loss": 0.5839, |
| "mean_token_accuracy": 0.8173570334911346, |
| "num_tokens": 85369531.0, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.8916197303294078, |
| "grad_norm": 0.7146339416503906, |
| "learning_rate": 1.7547098284705715e-07, |
| "loss": 0.5917, |
| "mean_token_accuracy": 0.813369631767273, |
| "num_tokens": 85434804.0, |
| "step": 1306 |
| }, |
| { |
| "epoch": 0.8923024406895375, |
| "grad_norm": 0.7200230956077576, |
| "learning_rate": 1.732828021829408e-07, |
| "loss": 0.5957, |
| "mean_token_accuracy": 0.8136608004570007, |
| "num_tokens": 85500340.0, |
| "step": 1307 |
| }, |
| { |
| "epoch": 0.8929851510496671, |
| "grad_norm": 0.6903286576271057, |
| "learning_rate": 1.711078610189912e-07, |
| "loss": 0.6013, |
| "mean_token_accuracy": 0.8124754130840302, |
| "num_tokens": 85565806.0, |
| "step": 1308 |
| }, |
| { |
| "epoch": 0.8936678614097969, |
| "grad_norm": 0.7025361061096191, |
| "learning_rate": 1.6894617173107336e-07, |
| "loss": 0.6147, |
| "mean_token_accuracy": 0.8069481402635574, |
| "num_tokens": 85630679.0, |
| "step": 1309 |
| }, |
| { |
| "epoch": 0.8943505717699266, |
| "grad_norm": 0.6796531081199646, |
| "learning_rate": 1.6679774661964382e-07, |
| "loss": 0.5591, |
| "mean_token_accuracy": 0.825024425983429, |
| "num_tokens": 85696215.0, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.8950332821300563, |
| "grad_norm": 0.7060521841049194, |
| "learning_rate": 1.6466259790968415e-07, |
| "loss": 0.6564, |
| "mean_token_accuracy": 0.7979288846254349, |
| "num_tokens": 85761751.0, |
| "step": 1311 |
| }, |
| { |
| "epoch": 0.8957159924901861, |
| "grad_norm": 0.7059783935546875, |
| "learning_rate": 1.6254073775063078e-07, |
| "loss": 0.571, |
| "mean_token_accuracy": 0.8190450817346573, |
| "num_tokens": 85827263.0, |
| "step": 1312 |
| }, |
| { |
| "epoch": 0.8963987028503158, |
| "grad_norm": 0.7131505012512207, |
| "learning_rate": 1.604321782163043e-07, |
| "loss": 0.5895, |
| "mean_token_accuracy": 0.8140321224927902, |
| "num_tokens": 85891330.0, |
| "step": 1313 |
| }, |
| { |
| "epoch": 0.8970814132104454, |
| "grad_norm": 0.701805591583252, |
| "learning_rate": 1.5833693130484328e-07, |
| "loss": 0.5899, |
| "mean_token_accuracy": 0.8150812536478043, |
| "num_tokens": 85956866.0, |
| "step": 1314 |
| }, |
| { |
| "epoch": 0.8977641235705752, |
| "grad_norm": 0.7065866589546204, |
| "learning_rate": 1.5625500893863445e-07, |
| "loss": 0.6296, |
| "mean_token_accuracy": 0.8030914068222046, |
| "num_tokens": 86022402.0, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.8984468339307049, |
| "grad_norm": 0.7108662724494934, |
| "learning_rate": 1.541864229642448e-07, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8082844614982605, |
| "num_tokens": 86087938.0, |
| "step": 1316 |
| }, |
| { |
| "epoch": 0.8991295442908346, |
| "grad_norm": 0.7196572422981262, |
| "learning_rate": 1.5213118515235493e-07, |
| "loss": 0.5941, |
| "mean_token_accuracy": 0.8129324316978455, |
| "num_tokens": 86153367.0, |
| "step": 1317 |
| }, |
| { |
| "epoch": 0.8998122546509644, |
| "grad_norm": 0.7143006920814514, |
| "learning_rate": 1.5008930719769084e-07, |
| "loss": 0.5996, |
| "mean_token_accuracy": 0.8113086521625519, |
| "num_tokens": 86218903.0, |
| "step": 1318 |
| }, |
| { |
| "epoch": 0.900494965011094, |
| "grad_norm": 0.6986265778541565, |
| "learning_rate": 1.4806080071895906e-07, |
| "loss": 0.5964, |
| "mean_token_accuracy": 0.8124541789293289, |
| "num_tokens": 86284439.0, |
| "step": 1319 |
| }, |
| { |
| "epoch": 0.9011776753712237, |
| "grad_norm": 0.6960071325302124, |
| "learning_rate": 1.4604567725877926e-07, |
| "loss": 0.6196, |
| "mean_token_accuracy": 0.8103516399860382, |
| "num_tokens": 86349442.0, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.9018603857313535, |
| "grad_norm": 0.6868788599967957, |
| "learning_rate": 1.4404394828361896e-07, |
| "loss": 0.5626, |
| "mean_token_accuracy": 0.8217864036560059, |
| "num_tokens": 86414978.0, |
| "step": 1321 |
| }, |
| { |
| "epoch": 0.9025430960914832, |
| "grad_norm": 0.6804670095443726, |
| "learning_rate": 1.4205562518372851e-07, |
| "loss": 0.5924, |
| "mean_token_accuracy": 0.8156616538763046, |
| "num_tokens": 86480514.0, |
| "step": 1322 |
| }, |
| { |
| "epoch": 0.9032258064516129, |
| "grad_norm": 0.7235873937606812, |
| "learning_rate": 1.4008071927307605e-07, |
| "loss": 0.6339, |
| "mean_token_accuracy": 0.8052700459957123, |
| "num_tokens": 86545580.0, |
| "step": 1323 |
| }, |
| { |
| "epoch": 0.9039085168117427, |
| "grad_norm": 0.6959832310676575, |
| "learning_rate": 1.3811924178928292e-07, |
| "loss": 0.5681, |
| "mean_token_accuracy": 0.8211601823568344, |
| "num_tokens": 86611116.0, |
| "step": 1324 |
| }, |
| { |
| "epoch": 0.9045912271718723, |
| "grad_norm": 0.6909657120704651, |
| "learning_rate": 1.3617120389356002e-07, |
| "loss": 0.6008, |
| "mean_token_accuracy": 0.8118375539779663, |
| "num_tokens": 86676025.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.905273937532002, |
| "grad_norm": 0.7252708077430725, |
| "learning_rate": 1.3423661667064463e-07, |
| "loss": 0.6057, |
| "mean_token_accuracy": 0.8103173822164536, |
| "num_tokens": 86741105.0, |
| "step": 1326 |
| }, |
| { |
| "epoch": 0.9059566478921318, |
| "grad_norm": 0.6941865086555481, |
| "learning_rate": 1.323154911287358e-07, |
| "loss": 0.5833, |
| "mean_token_accuracy": 0.8163434863090515, |
| "num_tokens": 86806449.0, |
| "step": 1327 |
| }, |
| { |
| "epoch": 0.9066393582522615, |
| "grad_norm": 0.6878736019134521, |
| "learning_rate": 1.3040783819943476e-07, |
| "loss": 0.5342, |
| "mean_token_accuracy": 0.8290414214134216, |
| "num_tokens": 86871985.0, |
| "step": 1328 |
| }, |
| { |
| "epoch": 0.9073220686123912, |
| "grad_norm": 0.7258753776550293, |
| "learning_rate": 1.285136687376784e-07, |
| "loss": 0.5901, |
| "mean_token_accuracy": 0.812301442027092, |
| "num_tokens": 86937521.0, |
| "step": 1329 |
| }, |
| { |
| "epoch": 0.9080047789725209, |
| "grad_norm": 0.7099855542182922, |
| "learning_rate": 1.26632993521682e-07, |
| "loss": 0.5998, |
| "mean_token_accuracy": 0.8137090355157852, |
| "num_tokens": 87002994.0, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.9086874893326506, |
| "grad_norm": 0.6997334957122803, |
| "learning_rate": 1.247658232528748e-07, |
| "loss": 0.5838, |
| "mean_token_accuracy": 0.8170560598373413, |
| "num_tokens": 87068266.0, |
| "step": 1331 |
| }, |
| { |
| "epoch": 0.9093701996927803, |
| "grad_norm": 0.7097135186195374, |
| "learning_rate": 1.2291216855584016e-07, |
| "loss": 0.6261, |
| "mean_token_accuracy": 0.804741770029068, |
| "num_tokens": 87132857.0, |
| "step": 1332 |
| }, |
| { |
| "epoch": 0.91005291005291, |
| "grad_norm": 0.730303168296814, |
| "learning_rate": 1.21072039978255e-07, |
| "loss": 0.5975, |
| "mean_token_accuracy": 0.8127291053533554, |
| "num_tokens": 87198393.0, |
| "step": 1333 |
| }, |
| { |
| "epoch": 0.9107356204130398, |
| "grad_norm": 0.7141602635383606, |
| "learning_rate": 1.1924544799083089e-07, |
| "loss": 0.6183, |
| "mean_token_accuracy": 0.8092756867408752, |
| "num_tokens": 87263900.0, |
| "step": 1334 |
| }, |
| { |
| "epoch": 0.9114183307731695, |
| "grad_norm": 0.7135557532310486, |
| "learning_rate": 1.1743240298725117e-07, |
| "loss": 0.6232, |
| "mean_token_accuracy": 0.8066501766443253, |
| "num_tokens": 87329436.0, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.9121010411332992, |
| "grad_norm": 0.6904719471931458, |
| "learning_rate": 1.1563291528411653e-07, |
| "loss": 0.5862, |
| "mean_token_accuracy": 0.8155700117349625, |
| "num_tokens": 87394972.0, |
| "step": 1336 |
| }, |
| { |
| "epoch": 0.9127837514934289, |
| "grad_norm": 0.6796468496322632, |
| "learning_rate": 1.1384699512088204e-07, |
| "loss": 0.5445, |
| "mean_token_accuracy": 0.8271016478538513, |
| "num_tokens": 87460508.0, |
| "step": 1337 |
| }, |
| { |
| "epoch": 0.9134664618535586, |
| "grad_norm": 0.7061638236045837, |
| "learning_rate": 1.1207465265980183e-07, |
| "loss": 0.6197, |
| "mean_token_accuracy": 0.8061693012714386, |
| "num_tokens": 87525967.0, |
| "step": 1338 |
| }, |
| { |
| "epoch": 0.9141491722136883, |
| "grad_norm": 0.7025821208953857, |
| "learning_rate": 1.1031589798586873e-07, |
| "loss": 0.6177, |
| "mean_token_accuracy": 0.8084349930286407, |
| "num_tokens": 87591261.0, |
| "step": 1339 |
| }, |
| { |
| "epoch": 0.9148318825738181, |
| "grad_norm": 0.7310721278190613, |
| "learning_rate": 1.0857074110676075e-07, |
| "loss": 0.6389, |
| "mean_token_accuracy": 0.8023429960012436, |
| "num_tokens": 87656797.0, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9155145929339478, |
| "grad_norm": 0.7138649821281433, |
| "learning_rate": 1.0683919195277809e-07, |
| "loss": 0.5841, |
| "mean_token_accuracy": 0.8164864331483841, |
| "num_tokens": 87722333.0, |
| "step": 1341 |
| }, |
| { |
| "epoch": 0.9161973032940774, |
| "grad_norm": 0.7323006391525269, |
| "learning_rate": 1.0512126037679371e-07, |
| "loss": 0.6125, |
| "mean_token_accuracy": 0.8062072396278381, |
| "num_tokens": 87787869.0, |
| "step": 1342 |
| }, |
| { |
| "epoch": 0.9168800136542072, |
| "grad_norm": 0.7108177542686462, |
| "learning_rate": 1.0341695615419089e-07, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.8172561675310135, |
| "num_tokens": 87853258.0, |
| "step": 1343 |
| }, |
| { |
| "epoch": 0.9175627240143369, |
| "grad_norm": 0.6755034923553467, |
| "learning_rate": 1.0172628898281329e-07, |
| "loss": 0.5654, |
| "mean_token_accuracy": 0.8198619186878204, |
| "num_tokens": 87918794.0, |
| "step": 1344 |
| }, |
| { |
| "epoch": 0.9182454343744666, |
| "grad_norm": 0.7065688967704773, |
| "learning_rate": 1.0004926848290409e-07, |
| "loss": 0.5673, |
| "mean_token_accuracy": 0.8215725719928741, |
| "num_tokens": 87984330.0, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.9189281447345964, |
| "grad_norm": 0.7121526598930359, |
| "learning_rate": 9.838590419705585e-08, |
| "loss": 0.556, |
| "mean_token_accuracy": 0.8245990574359894, |
| "num_tokens": 88049588.0, |
| "step": 1346 |
| }, |
| { |
| "epoch": 0.9196108550947261, |
| "grad_norm": 0.7254539132118225, |
| "learning_rate": 9.673620559015411e-08, |
| "loss": 0.638, |
| "mean_token_accuracy": 0.8036533892154694, |
| "num_tokens": 88114794.0, |
| "step": 1347 |
| }, |
| { |
| "epoch": 0.9202935654548557, |
| "grad_norm": 0.7252166271209717, |
| "learning_rate": 9.510018204932386e-08, |
| "loss": 0.5902, |
| "mean_token_accuracy": 0.8169325292110443, |
| "num_tokens": 88179712.0, |
| "step": 1348 |
| }, |
| { |
| "epoch": 0.9209762758149855, |
| "grad_norm": 0.7186472415924072, |
| "learning_rate": 9.347784288387534e-08, |
| "loss": 0.6058, |
| "mean_token_accuracy": 0.812102884054184, |
| "num_tokens": 88245248.0, |
| "step": 1349 |
| }, |
| { |
| "epoch": 0.9216589861751152, |
| "grad_norm": 0.7491932511329651, |
| "learning_rate": 9.18691973252539e-08, |
| "loss": 0.6458, |
| "mean_token_accuracy": 0.8007392585277557, |
| "num_tokens": 88310784.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9223416965352449, |
| "grad_norm": 0.6888880729675293, |
| "learning_rate": 9.027425452698302e-08, |
| "loss": 0.5626, |
| "mean_token_accuracy": 0.8191746026277542, |
| "num_tokens": 88376320.0, |
| "step": 1351 |
| }, |
| { |
| "epoch": 0.9230244068953747, |
| "grad_norm": 0.739963948726654, |
| "learning_rate": 8.869302356461634e-08, |
| "loss": 0.6184, |
| "mean_token_accuracy": 0.8079484403133392, |
| "num_tokens": 88441856.0, |
| "step": 1352 |
| }, |
| { |
| "epoch": 0.9237071172555044, |
| "grad_norm": 0.7058925032615662, |
| "learning_rate": 8.712551343568354e-08, |
| "loss": 0.6479, |
| "mean_token_accuracy": 0.8010905534029007, |
| "num_tokens": 88507392.0, |
| "step": 1353 |
| }, |
| { |
| "epoch": 0.924389827615634, |
| "grad_norm": 0.7416124939918518, |
| "learning_rate": 8.557173305964034e-08, |
| "loss": 0.6333, |
| "mean_token_accuracy": 0.8034361302852631, |
| "num_tokens": 88572667.0, |
| "step": 1354 |
| }, |
| { |
| "epoch": 0.9250725379757638, |
| "grad_norm": 0.6951413750648499, |
| "learning_rate": 8.40316912778169e-08, |
| "loss": 0.5833, |
| "mean_token_accuracy": 0.8171948045492172, |
| "num_tokens": 88638102.0, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.9257552483358935, |
| "grad_norm": 0.7167767882347107, |
| "learning_rate": 8.2505396853369e-08, |
| "loss": 0.6298, |
| "mean_token_accuracy": 0.8051463514566422, |
| "num_tokens": 88703485.0, |
| "step": 1356 |
| }, |
| { |
| "epoch": 0.9264379586960232, |
| "grad_norm": 0.6876281499862671, |
| "learning_rate": 8.099285847122496e-08, |
| "loss": 0.6061, |
| "mean_token_accuracy": 0.8101783990859985, |
| "num_tokens": 88769021.0, |
| "step": 1357 |
| }, |
| { |
| "epoch": 0.927120669056153, |
| "grad_norm": 0.7173566222190857, |
| "learning_rate": 7.949408473804099e-08, |
| "loss": 0.573, |
| "mean_token_accuracy": 0.8186247497797012, |
| "num_tokens": 88834557.0, |
| "step": 1358 |
| }, |
| { |
| "epoch": 0.9278033794162827, |
| "grad_norm": 0.73436439037323, |
| "learning_rate": 7.800908418214792e-08, |
| "loss": 0.6062, |
| "mean_token_accuracy": 0.8138167709112167, |
| "num_tokens": 88900047.0, |
| "step": 1359 |
| }, |
| { |
| "epoch": 0.9284860897764123, |
| "grad_norm": 0.7449289560317993, |
| "learning_rate": 7.653786525350482e-08, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.8100714832544327, |
| "num_tokens": 88965583.0, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.929168800136542, |
| "grad_norm": 0.709634006023407, |
| "learning_rate": 7.508043632365042e-08, |
| "loss": 0.5799, |
| "mean_token_accuracy": 0.8167552798986435, |
| "num_tokens": 89030784.0, |
| "step": 1361 |
| }, |
| { |
| "epoch": 0.9298515104966718, |
| "grad_norm": 0.7063040137290955, |
| "learning_rate": 7.363680568565568e-08, |
| "loss": 0.6098, |
| "mean_token_accuracy": 0.809704914689064, |
| "num_tokens": 89096320.0, |
| "step": 1362 |
| }, |
| { |
| "epoch": 0.9305342208568015, |
| "grad_norm": 0.7060918211936951, |
| "learning_rate": 7.220698155407602e-08, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.8185582906007767, |
| "num_tokens": 89161825.0, |
| "step": 1363 |
| }, |
| { |
| "epoch": 0.9312169312169312, |
| "grad_norm": 0.6989777088165283, |
| "learning_rate": 7.079097206490581e-08, |
| "loss": 0.6244, |
| "mean_token_accuracy": 0.8075470179319382, |
| "num_tokens": 89226535.0, |
| "step": 1364 |
| }, |
| { |
| "epoch": 0.931899641577061, |
| "grad_norm": 0.6618038415908813, |
| "learning_rate": 6.938878527553067e-08, |
| "loss": 0.5753, |
| "mean_token_accuracy": 0.8200604766607285, |
| "num_tokens": 89292071.0, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.9325823519371906, |
| "grad_norm": 0.6828494668006897, |
| "learning_rate": 6.800042916468186e-08, |
| "loss": 0.5724, |
| "mean_token_accuracy": 0.8199077546596527, |
| "num_tokens": 89357607.0, |
| "step": 1366 |
| }, |
| { |
| "epoch": 0.9332650622973203, |
| "grad_norm": 0.7094539999961853, |
| "learning_rate": 6.662591163239224e-08, |
| "loss": 0.6011, |
| "mean_token_accuracy": 0.8109215497970581, |
| "num_tokens": 89423043.0, |
| "step": 1367 |
| }, |
| { |
| "epoch": 0.9339477726574501, |
| "grad_norm": 0.6817854046821594, |
| "learning_rate": 6.526524049994904e-08, |
| "loss": 0.5704, |
| "mean_token_accuracy": 0.8201979398727417, |
| "num_tokens": 89488579.0, |
| "step": 1368 |
| }, |
| { |
| "epoch": 0.9346304830175798, |
| "grad_norm": 0.7198058366775513, |
| "learning_rate": 6.391842350985195e-08, |
| "loss": 0.6235, |
| "mean_token_accuracy": 0.8059934079647064, |
| "num_tokens": 89554115.0, |
| "step": 1369 |
| }, |
| { |
| "epoch": 0.9353131933777095, |
| "grad_norm": 0.7299594283103943, |
| "learning_rate": 6.258546832576651e-08, |
| "loss": 0.6566, |
| "mean_token_accuracy": 0.7970735728740692, |
| "num_tokens": 89619651.0, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.9359959037378393, |
| "grad_norm": 0.689508318901062, |
| "learning_rate": 6.126638253248273e-08, |
| "loss": 0.5713, |
| "mean_token_accuracy": 0.8205289244651794, |
| "num_tokens": 89684778.0, |
| "step": 1371 |
| }, |
| { |
| "epoch": 0.9366786140979689, |
| "grad_norm": 0.6871770024299622, |
| "learning_rate": 5.996117363587045e-08, |
| "loss": 0.5902, |
| "mean_token_accuracy": 0.8142588883638382, |
| "num_tokens": 89750254.0, |
| "step": 1372 |
| }, |
| { |
| "epoch": 0.9373613244580986, |
| "grad_norm": 0.6985852122306824, |
| "learning_rate": 5.866984906283707e-08, |
| "loss": 0.6062, |
| "mean_token_accuracy": 0.8118279576301575, |
| "num_tokens": 89815790.0, |
| "step": 1373 |
| }, |
| { |
| "epoch": 0.9380440348182284, |
| "grad_norm": 0.7541195154190063, |
| "learning_rate": 5.739241616128544e-08, |
| "loss": 0.618, |
| "mean_token_accuracy": 0.8069403767585754, |
| "num_tokens": 89881326.0, |
| "step": 1374 |
| }, |
| { |
| "epoch": 0.9387267451783581, |
| "grad_norm": 0.7442590594291687, |
| "learning_rate": 5.6128882200071897e-08, |
| "loss": 0.5782, |
| "mean_token_accuracy": 0.8197620958089828, |
| "num_tokens": 89946482.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.9394094555384878, |
| "grad_norm": 0.6985146403312683, |
| "learning_rate": 5.4879254368964964e-08, |
| "loss": 0.6205, |
| "mean_token_accuracy": 0.8052755445241928, |
| "num_tokens": 90012018.0, |
| "step": 1376 |
| }, |
| { |
| "epoch": 0.9400921658986175, |
| "grad_norm": 0.7548423409461975, |
| "learning_rate": 5.3643539778605036e-08, |
| "loss": 0.6038, |
| "mean_token_accuracy": 0.8120417892932892, |
| "num_tokens": 90077554.0, |
| "step": 1377 |
| }, |
| { |
| "epoch": 0.9407748762587472, |
| "grad_norm": 0.6929084658622742, |
| "learning_rate": 5.2421745460461416e-08, |
| "loss": 0.6033, |
| "mean_token_accuracy": 0.8120417892932892, |
| "num_tokens": 90143090.0, |
| "step": 1378 |
| }, |
| { |
| "epoch": 0.9414575866188769, |
| "grad_norm": 0.7029266357421875, |
| "learning_rate": 5.121387836679676e-08, |
| "loss": 0.6018, |
| "mean_token_accuracy": 0.8125683218240738, |
| "num_tokens": 90208547.0, |
| "step": 1379 |
| }, |
| { |
| "epoch": 0.9421402969790067, |
| "grad_norm": 0.7251154780387878, |
| "learning_rate": 5.001994537062266e-08, |
| "loss": 0.6186, |
| "mean_token_accuracy": 0.8107893466949463, |
| "num_tokens": 90274083.0, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.9428230073391364, |
| "grad_norm": 0.7269313931465149, |
| "learning_rate": 4.8839953265664705e-08, |
| "loss": 0.6383, |
| "mean_token_accuracy": 0.8016709536314011, |
| "num_tokens": 90339619.0, |
| "step": 1381 |
| }, |
| { |
| "epoch": 0.943505717699266, |
| "grad_norm": 0.7003495693206787, |
| "learning_rate": 4.7673908766319996e-08, |
| "loss": 0.5779, |
| "mean_token_accuracy": 0.8185178339481354, |
| "num_tokens": 90405155.0, |
| "step": 1382 |
| }, |
| { |
| "epoch": 0.9441884280593958, |
| "grad_norm": 0.7337906956672668, |
| "learning_rate": 4.652181850762327e-08, |
| "loss": 0.6347, |
| "mean_token_accuracy": 0.8032441437244415, |
| "num_tokens": 90470691.0, |
| "step": 1383 |
| }, |
| { |
| "epoch": 0.9448711384195255, |
| "grad_norm": 0.7205424308776855, |
| "learning_rate": 4.5383689045204184e-08, |
| "loss": 0.595, |
| "mean_token_accuracy": 0.8148368746042252, |
| "num_tokens": 90536227.0, |
| "step": 1384 |
| }, |
| { |
| "epoch": 0.9455538487796552, |
| "grad_norm": 0.7154881954193115, |
| "learning_rate": 4.425952685525453e-08, |
| "loss": 0.5951, |
| "mean_token_accuracy": 0.8150924146175385, |
| "num_tokens": 90600758.0, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.946236559139785, |
| "grad_norm": 0.7424868941307068, |
| "learning_rate": 4.3149338334488864e-08, |
| "loss": 0.617, |
| "mean_token_accuracy": 0.8103158473968506, |
| "num_tokens": 90666294.0, |
| "step": 1386 |
| }, |
| { |
| "epoch": 0.9469192694999147, |
| "grad_norm": 0.7165055274963379, |
| "learning_rate": 4.2053129800108114e-08, |
| "loss": 0.5895, |
| "mean_token_accuracy": 0.8167934268712997, |
| "num_tokens": 90731240.0, |
| "step": 1387 |
| }, |
| { |
| "epoch": 0.9476019798600444, |
| "grad_norm": 0.7434617877006531, |
| "learning_rate": 4.0970907489764625e-08, |
| "loss": 0.6281, |
| "mean_token_accuracy": 0.8033971935510635, |
| "num_tokens": 90796695.0, |
| "step": 1388 |
| }, |
| { |
| "epoch": 0.9482846902201741, |
| "grad_norm": 0.7286208868026733, |
| "learning_rate": 3.990267756152688e-08, |
| "loss": 0.6078, |
| "mean_token_accuracy": 0.8082691878080368, |
| "num_tokens": 90862231.0, |
| "step": 1389 |
| }, |
| { |
| "epoch": 0.9489674005803038, |
| "grad_norm": 0.7145051956176758, |
| "learning_rate": 3.8848446093842365e-08, |
| "loss": 0.6077, |
| "mean_token_accuracy": 0.8082844614982605, |
| "num_tokens": 90927767.0, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.9496501109404335, |
| "grad_norm": 0.7143110036849976, |
| "learning_rate": 3.780821908550614e-08, |
| "loss": 0.6146, |
| "mean_token_accuracy": 0.8080993145704269, |
| "num_tokens": 90993192.0, |
| "step": 1391 |
| }, |
| { |
| "epoch": 0.9503328213005633, |
| "grad_norm": 0.7215762138366699, |
| "learning_rate": 3.6782002455623686e-08, |
| "loss": 0.5857, |
| "mean_token_accuracy": 0.8155962228775024, |
| "num_tokens": 91058644.0, |
| "step": 1392 |
| }, |
| { |
| "epoch": 0.951015531660693, |
| "grad_norm": 0.7443726062774658, |
| "learning_rate": 3.5769802043579546e-08, |
| "loss": 0.6452, |
| "mean_token_accuracy": 0.7994347214698792, |
| "num_tokens": 91123763.0, |
| "step": 1393 |
| }, |
| { |
| "epoch": 0.9516982420208226, |
| "grad_norm": 0.71949702501297, |
| "learning_rate": 3.477162360900177e-08, |
| "loss": 0.601, |
| "mean_token_accuracy": 0.8104380518198013, |
| "num_tokens": 91189299.0, |
| "step": 1394 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.7069823741912842, |
| "learning_rate": 3.3787472831732225e-08, |
| "loss": 0.5817, |
| "mean_token_accuracy": 0.8176014125347137, |
| "num_tokens": 91254835.0, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.9530636627410821, |
| "grad_norm": 0.7261773943901062, |
| "learning_rate": 3.281735531179053e-08, |
| "loss": 0.6213, |
| "mean_token_accuracy": 0.8069199919700623, |
| "num_tokens": 91320310.0, |
| "step": 1396 |
| }, |
| { |
| "epoch": 0.9537463731012118, |
| "grad_norm": 0.6818092465400696, |
| "learning_rate": 3.186127656934629e-08, |
| "loss": 0.5901, |
| "mean_token_accuracy": 0.8161265105009079, |
| "num_tokens": 91385722.0, |
| "step": 1397 |
| }, |
| { |
| "epoch": 0.9544290834613415, |
| "grad_norm": 0.7226613759994507, |
| "learning_rate": 3.0919242044683554e-08, |
| "loss": 0.6151, |
| "mean_token_accuracy": 0.8083223253488541, |
| "num_tokens": 91451157.0, |
| "step": 1398 |
| }, |
| { |
| "epoch": 0.9551117938214713, |
| "grad_norm": 0.7244173884391785, |
| "learning_rate": 2.999125709817363e-08, |
| "loss": 0.6046, |
| "mean_token_accuracy": 0.8112988919019699, |
| "num_tokens": 91516667.0, |
| "step": 1399 |
| }, |
| { |
| "epoch": 0.955794504181601, |
| "grad_norm": 0.722210705280304, |
| "learning_rate": 2.9077327010241242e-08, |
| "loss": 0.6025, |
| "mean_token_accuracy": 0.8122861683368683, |
| "num_tokens": 91582203.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.9564772145417306, |
| "grad_norm": 0.7271984815597534, |
| "learning_rate": 2.817745698133728e-08, |
| "loss": 0.6074, |
| "mean_token_accuracy": 0.80910924077034, |
| "num_tokens": 91647739.0, |
| "step": 1401 |
| }, |
| { |
| "epoch": 0.9571599249018604, |
| "grad_norm": 0.7084903120994568, |
| "learning_rate": 2.7291652131908043e-08, |
| "loss": 0.6065, |
| "mean_token_accuracy": 0.808757945895195, |
| "num_tokens": 91713275.0, |
| "step": 1402 |
| }, |
| { |
| "epoch": 0.9578426352619901, |
| "grad_norm": 0.7087424993515015, |
| "learning_rate": 2.6419917502364667e-08, |
| "loss": 0.5937, |
| "mean_token_accuracy": 0.8132331371307373, |
| "num_tokens": 91778811.0, |
| "step": 1403 |
| }, |
| { |
| "epoch": 0.9585253456221198, |
| "grad_norm": 0.7017878293991089, |
| "learning_rate": 2.5562258053057343e-08, |
| "loss": 0.61, |
| "mean_token_accuracy": 0.8106268793344498, |
| "num_tokens": 91844236.0, |
| "step": 1404 |
| }, |
| { |
| "epoch": 0.9592080559822496, |
| "grad_norm": 0.7056769132614136, |
| "learning_rate": 2.4718678664245323e-08, |
| "loss": 0.5793, |
| "mean_token_accuracy": 0.8190218657255173, |
| "num_tokens": 91909772.0, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.9598907663423792, |
| "grad_norm": 0.6610713005065918, |
| "learning_rate": 2.3889184136068334e-08, |
| "loss": 0.5798, |
| "mean_token_accuracy": 0.8191746026277542, |
| "num_tokens": 91975308.0, |
| "step": 1406 |
| }, |
| { |
| "epoch": 0.9605734767025089, |
| "grad_norm": 0.7304586172103882, |
| "learning_rate": 2.3073779188521606e-08, |
| "loss": 0.6338, |
| "mean_token_accuracy": 0.8037023395299911, |
| "num_tokens": 92040844.0, |
| "step": 1407 |
| }, |
| { |
| "epoch": 0.9612561870626387, |
| "grad_norm": 0.7191253304481506, |
| "learning_rate": 2.2272468461427276e-08, |
| "loss": 0.6016, |
| "mean_token_accuracy": 0.8119097203016281, |
| "num_tokens": 92106227.0, |
| "step": 1408 |
| }, |
| { |
| "epoch": 0.9619388974227684, |
| "grad_norm": 0.7064103484153748, |
| "learning_rate": 2.1485256514408025e-08, |
| "loss": 0.5488, |
| "mean_token_accuracy": 0.8251618891954422, |
| "num_tokens": 92171763.0, |
| "step": 1409 |
| }, |
| { |
| "epoch": 0.9626216077828981, |
| "grad_norm": 0.7105903029441833, |
| "learning_rate": 2.071214782686265e-08, |
| "loss": 0.5767, |
| "mean_token_accuracy": 0.81931933760643, |
| "num_tokens": 92236780.0, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.9633043181430279, |
| "grad_norm": 0.71761554479599, |
| "learning_rate": 1.9953146797938306e-08, |
| "loss": 0.6162, |
| "mean_token_accuracy": 0.8084098994731903, |
| "num_tokens": 92302202.0, |
| "step": 1411 |
| }, |
| { |
| "epoch": 0.9639870285031575, |
| "grad_norm": 0.7019249200820923, |
| "learning_rate": 1.9208257746507476e-08, |
| "loss": 0.5907, |
| "mean_token_accuracy": 0.8163642436265945, |
| "num_tokens": 92367738.0, |
| "step": 1412 |
| }, |
| { |
| "epoch": 0.9646697388632872, |
| "grad_norm": 0.7232638597488403, |
| "learning_rate": 1.847748491114215e-08, |
| "loss": 0.6128, |
| "mean_token_accuracy": 0.8088495880365372, |
| "num_tokens": 92433274.0, |
| "step": 1413 |
| }, |
| { |
| "epoch": 0.965352449223417, |
| "grad_norm": 0.7209507822990417, |
| "learning_rate": 1.7760832450090526e-08, |
| "loss": 0.6016, |
| "mean_token_accuracy": 0.8138899058103561, |
| "num_tokens": 92498810.0, |
| "step": 1414 |
| }, |
| { |
| "epoch": 0.9660351595835467, |
| "grad_norm": 0.6921422481536865, |
| "learning_rate": 1.705830444125256e-08, |
| "loss": 0.6244, |
| "mean_token_accuracy": 0.8077894598245621, |
| "num_tokens": 92564344.0, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.9667178699436764, |
| "grad_norm": 0.7221968770027161, |
| "learning_rate": 1.6369904882157507e-08, |
| "loss": 0.6278, |
| "mean_token_accuracy": 0.804711103439331, |
| "num_tokens": 92629875.0, |
| "step": 1416 |
| }, |
| { |
| "epoch": 0.9674005803038062, |
| "grad_norm": 0.6897109150886536, |
| "learning_rate": 1.569563768994031e-08, |
| "loss": 0.5864, |
| "mean_token_accuracy": 0.8165169805288315, |
| "num_tokens": 92695411.0, |
| "step": 1417 |
| }, |
| { |
| "epoch": 0.9680832906639358, |
| "grad_norm": 0.7170175909996033, |
| "learning_rate": 1.5035506701320812e-08, |
| "loss": 0.5601, |
| "mean_token_accuracy": 0.8227028250694275, |
| "num_tokens": 92760947.0, |
| "step": 1418 |
| }, |
| { |
| "epoch": 0.9687660010240655, |
| "grad_norm": 0.6924940943717957, |
| "learning_rate": 1.4389515672579568e-08, |
| "loss": 0.5804, |
| "mean_token_accuracy": 0.8154478222131729, |
| "num_tokens": 92826483.0, |
| "step": 1419 |
| }, |
| { |
| "epoch": 0.9694487113841953, |
| "grad_norm": 0.6951374411582947, |
| "learning_rate": 1.3757668279539282e-08, |
| "loss": 0.5984, |
| "mean_token_accuracy": 0.811620831489563, |
| "num_tokens": 92892016.0, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.970131421744325, |
| "grad_norm": 0.7141850590705872, |
| "learning_rate": 1.313996811754148e-08, |
| "loss": 0.5796, |
| "mean_token_accuracy": 0.8167347013950348, |
| "num_tokens": 92957477.0, |
| "step": 1421 |
| }, |
| { |
| "epoch": 0.9708141321044547, |
| "grad_norm": 0.7237462401390076, |
| "learning_rate": 1.2536418701427078e-08, |
| "loss": 0.6088, |
| "mean_token_accuracy": 0.8100256621837616, |
| "num_tokens": 93023013.0, |
| "step": 1422 |
| }, |
| { |
| "epoch": 0.9714968424645845, |
| "grad_norm": 0.7390242218971252, |
| "learning_rate": 1.1947023465517238e-08, |
| "loss": 0.6607, |
| "mean_token_accuracy": 0.7965389937162399, |
| "num_tokens": 93088549.0, |
| "step": 1423 |
| }, |
| { |
| "epoch": 0.9721795528247141, |
| "grad_norm": 0.693906307220459, |
| "learning_rate": 1.1371785763591714e-08, |
| "loss": 0.5722, |
| "mean_token_accuracy": 0.8215420246124268, |
| "num_tokens": 93154085.0, |
| "step": 1424 |
| }, |
| { |
| "epoch": 0.9728622631848438, |
| "grad_norm": 0.7362616062164307, |
| "learning_rate": 1.0810708868871645e-08, |
| "loss": 0.6348, |
| "mean_token_accuracy": 0.802773505449295, |
| "num_tokens": 93219447.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.9735449735449735, |
| "grad_norm": 0.6923553347587585, |
| "learning_rate": 1.0263795974000401e-08, |
| "loss": 0.5766, |
| "mean_token_accuracy": 0.8195104598999023, |
| "num_tokens": 93284829.0, |
| "step": 1426 |
| }, |
| { |
| "epoch": 0.9742276839051033, |
| "grad_norm": 0.7165243625640869, |
| "learning_rate": 9.731050191024716e-09, |
| "loss": 0.6419, |
| "mean_token_accuracy": 0.8003726899623871, |
| "num_tokens": 93350365.0, |
| "step": 1427 |
| }, |
| { |
| "epoch": 0.974910394265233, |
| "grad_norm": 0.6946384906768799, |
| "learning_rate": 9.212474551378025e-09, |
| "loss": 0.5609, |
| "mean_token_accuracy": 0.8229713141918182, |
| "num_tokens": 93415789.0, |
| "step": 1428 |
| }, |
| { |
| "epoch": 0.9755931046253626, |
| "grad_norm": 0.7266075611114502, |
| "learning_rate": 8.708072005862433e-09, |
| "loss": 0.6061, |
| "mean_token_accuracy": 0.8118890523910522, |
| "num_tokens": 93481325.0, |
| "step": 1429 |
| }, |
| { |
| "epoch": 0.9762758149854924, |
| "grad_norm": 0.7116613984107971, |
| "learning_rate": 8.217845424632332e-09, |
| "loss": 0.6266, |
| "mean_token_accuracy": 0.8050356209278107, |
| "num_tokens": 93546853.0, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.9769585253456221, |
| "grad_norm": 0.7236589789390564, |
| "learning_rate": 7.741797597178024e-09, |
| "loss": 0.5874, |
| "mean_token_accuracy": 0.8152187168598175, |
| "num_tokens": 93612389.0, |
| "step": 1431 |
| }, |
| { |
| "epoch": 0.9776412357057518, |
| "grad_norm": 0.7012879252433777, |
| "learning_rate": 7.279931232309911e-09, |
| "loss": 0.5846, |
| "mean_token_accuracy": 0.8153409063816071, |
| "num_tokens": 93677925.0, |
| "step": 1432 |
| }, |
| { |
| "epoch": 0.9783239460658816, |
| "grad_norm": 0.7387993335723877, |
| "learning_rate": 6.832248958142107e-09, |
| "loss": 0.5972, |
| "mean_token_accuracy": 0.811590164899826, |
| "num_tokens": 93742181.0, |
| "step": 1433 |
| }, |
| { |
| "epoch": 0.9790066564260113, |
| "grad_norm": 0.8164238929748535, |
| "learning_rate": 6.398753322079676e-09, |
| "loss": 0.6047, |
| "mean_token_accuracy": 0.8127580732107162, |
| "num_tokens": 93807621.0, |
| "step": 1434 |
| }, |
| { |
| "epoch": 0.9796893667861409, |
| "grad_norm": 0.7341460585594177, |
| "learning_rate": 5.979446790801979e-09, |
| "loss": 0.657, |
| "mean_token_accuracy": 0.7979288995265961, |
| "num_tokens": 93873157.0, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.9803720771462707, |
| "grad_norm": 0.728808581829071, |
| "learning_rate": 5.574331750249074e-09, |
| "loss": 0.6026, |
| "mean_token_accuracy": 0.8145161271095276, |
| "num_tokens": 93938693.0, |
| "step": 1436 |
| }, |
| { |
| "epoch": 0.9810547875064004, |
| "grad_norm": 0.7273023724555969, |
| "learning_rate": 5.183410505609498e-09, |
| "loss": 0.6069, |
| "mean_token_accuracy": 0.8126724660396576, |
| "num_tokens": 94003911.0, |
| "step": 1437 |
| }, |
| { |
| "epoch": 0.9817374978665301, |
| "grad_norm": 0.7281927466392517, |
| "learning_rate": 4.806685281305568e-09, |
| "loss": 0.5996, |
| "mean_token_accuracy": 0.8104685992002487, |
| "num_tokens": 94069447.0, |
| "step": 1438 |
| }, |
| { |
| "epoch": 0.9824202082266599, |
| "grad_norm": 0.7687538862228394, |
| "learning_rate": 4.444158220981154e-09, |
| "loss": 0.6093, |
| "mean_token_accuracy": 0.8106977045536041, |
| "num_tokens": 94134983.0, |
| "step": 1439 |
| }, |
| { |
| "epoch": 0.9831029185867896, |
| "grad_norm": 0.7035177946090698, |
| "learning_rate": 4.095831387490312e-09, |
| "loss": 0.6002, |
| "mean_token_accuracy": 0.810405820608139, |
| "num_tokens": 94200496.0, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.9837856289469192, |
| "grad_norm": 0.7233470678329468, |
| "learning_rate": 3.761706762884232e-09, |
| "loss": 0.6151, |
| "mean_token_accuracy": 0.8110263049602509, |
| "num_tokens": 94265839.0, |
| "step": 1441 |
| }, |
| { |
| "epoch": 0.984468339307049, |
| "grad_norm": 0.7216858267784119, |
| "learning_rate": 3.4417862484006914e-09, |
| "loss": 0.6047, |
| "mean_token_accuracy": 0.8115992546081543, |
| "num_tokens": 94331298.0, |
| "step": 1442 |
| }, |
| { |
| "epoch": 0.9851510496671787, |
| "grad_norm": 0.7692837715148926, |
| "learning_rate": 3.136071664453788e-09, |
| "loss": 0.607, |
| "mean_token_accuracy": 0.8103876262903214, |
| "num_tokens": 94396631.0, |
| "step": 1443 |
| }, |
| { |
| "epoch": 0.9858337600273084, |
| "grad_norm": 0.7010142803192139, |
| "learning_rate": 2.8445647506220032e-09, |
| "loss": 0.6288, |
| "mean_token_accuracy": 0.8041263669729233, |
| "num_tokens": 94462025.0, |
| "step": 1444 |
| }, |
| { |
| "epoch": 0.9865164703874382, |
| "grad_norm": 0.7143568396568298, |
| "learning_rate": 2.5672671656401526e-09, |
| "loss": 0.5985, |
| "mean_token_accuracy": 0.814523309469223, |
| "num_tokens": 94527487.0, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.9871991807475679, |
| "grad_norm": 0.6615064144134521, |
| "learning_rate": 2.3041804873882857e-09, |
| "loss": 0.5596, |
| "mean_token_accuracy": 0.8252646028995514, |
| "num_tokens": 94592979.0, |
| "step": 1446 |
| }, |
| { |
| "epoch": 0.9878818911076975, |
| "grad_norm": 0.7245396971702576, |
| "learning_rate": 2.0553062128839117e-09, |
| "loss": 0.5973, |
| "mean_token_accuracy": 0.8139510005712509, |
| "num_tokens": 94658515.0, |
| "step": 1447 |
| }, |
| { |
| "epoch": 0.9885646014678273, |
| "grad_norm": 0.7064764499664307, |
| "learning_rate": 1.8206457582728432e-09, |
| "loss": 0.6182, |
| "mean_token_accuracy": 0.8098118305206299, |
| "num_tokens": 94724051.0, |
| "step": 1448 |
| }, |
| { |
| "epoch": 0.989247311827957, |
| "grad_norm": 0.7237589955329895, |
| "learning_rate": 1.600200458821699e-09, |
| "loss": 0.619, |
| "mean_token_accuracy": 0.8067723661661148, |
| "num_tokens": 94789587.0, |
| "step": 1449 |
| }, |
| { |
| "epoch": 0.9899300221880867, |
| "grad_norm": 0.7340627312660217, |
| "learning_rate": 1.3939715689093025e-09, |
| "loss": 0.5694, |
| "mean_token_accuracy": 0.8215572983026505, |
| "num_tokens": 94855123.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.9906127325482165, |
| "grad_norm": 0.7079196572303772, |
| "learning_rate": 1.201960262020574e-09, |
| "loss": 0.5989, |
| "mean_token_accuracy": 0.815142348408699, |
| "num_tokens": 94920659.0, |
| "step": 1451 |
| }, |
| { |
| "epoch": 0.9912954429083461, |
| "grad_norm": 0.7087104916572571, |
| "learning_rate": 1.0241676307398696e-09, |
| "loss": 0.559, |
| "mean_token_accuracy": 0.8240730315446854, |
| "num_tokens": 94985980.0, |
| "step": 1452 |
| }, |
| { |
| "epoch": 0.9919781532684758, |
| "grad_norm": 0.7006968855857849, |
| "learning_rate": 8.605946867432103e-10, |
| "loss": 0.6028, |
| "mean_token_accuracy": 0.8109879046678543, |
| "num_tokens": 95051516.0, |
| "step": 1453 |
| }, |
| { |
| "epoch": 0.9926608636286056, |
| "grad_norm": 0.7328653931617737, |
| "learning_rate": 7.112423607946728e-10, |
| "loss": 0.6228, |
| "mean_token_accuracy": 0.8052449971437454, |
| "num_tokens": 95117052.0, |
| "step": 1454 |
| }, |
| { |
| "epoch": 0.9933435739887353, |
| "grad_norm": 0.7530898451805115, |
| "learning_rate": 5.76111502739729e-10, |
| "loss": 0.631, |
| "mean_token_accuracy": 0.8047104179859161, |
| "num_tokens": 95182588.0, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.994026284348865, |
| "grad_norm": 0.7755634784698486, |
| "learning_rate": 4.552028815008047e-10, |
| "loss": 0.647, |
| "mean_token_accuracy": 0.7990311533212662, |
| "num_tokens": 95247829.0, |
| "step": 1456 |
| }, |
| { |
| "epoch": 0.9947089947089947, |
| "grad_norm": 0.6960027813911438, |
| "learning_rate": 3.4851718507311617e-10, |
| "loss": 0.5842, |
| "mean_token_accuracy": 0.8175555914640427, |
| "num_tokens": 95313365.0, |
| "step": 1457 |
| }, |
| { |
| "epoch": 0.9953917050691244, |
| "grad_norm": 0.6915958523750305, |
| "learning_rate": 2.5605502051967435e-10, |
| "loss": 0.5701, |
| "mean_token_accuracy": 0.8211907297372818, |
| "num_tokens": 95378901.0, |
| "step": 1458 |
| }, |
| { |
| "epoch": 0.9960744154292541, |
| "grad_norm": 0.7065199017524719, |
| "learning_rate": 1.7781691396961952e-10, |
| "loss": 0.6341, |
| "mean_token_accuracy": 0.8043896704912186, |
| "num_tokens": 95444437.0, |
| "step": 1459 |
| }, |
| { |
| "epoch": 0.9967571257893838, |
| "grad_norm": 0.6968325972557068, |
| "learning_rate": 1.1380331061405791e-10, |
| "loss": 0.5858, |
| "mean_token_accuracy": 0.8169599175453186, |
| "num_tokens": 95509973.0, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.9974398361495136, |
| "grad_norm": 0.697854220867157, |
| "learning_rate": 6.401457470300853e-11, |
| "loss": 0.5955, |
| "mean_token_accuracy": 0.8130066990852356, |
| "num_tokens": 95575408.0, |
| "step": 1461 |
| }, |
| { |
| "epoch": 0.9981225465096433, |
| "grad_norm": 0.7216105461120605, |
| "learning_rate": 2.8450989545125706e-11, |
| "loss": 0.6084, |
| "mean_token_accuracy": 0.8100103884935379, |
| "num_tokens": 95640944.0, |
| "step": 1462 |
| }, |
| { |
| "epoch": 0.998805256869773, |
| "grad_norm": 0.6907703876495361, |
| "learning_rate": 7.112757504645907e-12, |
| "loss": 0.5858, |
| "mean_token_accuracy": 0.8143328428268433, |
| "num_tokens": 95706480.0, |
| "step": 1463 |
| }, |
| { |
| "epoch": 0.9994879672299027, |
| "grad_norm": 0.7113831043243408, |
| "learning_rate": 0.0, |
| "loss": 0.5998, |
| "mean_token_accuracy": 0.811400294303894, |
| "num_tokens": 95772016.0, |
| "step": 1464 |
| }, |
| { |
| "epoch": 0.9994879672299027, |
| "step": 1464, |
| "total_flos": 83397779128320.0, |
| "train_loss": 0.6384449617458823, |
| "train_runtime": 24106.2969, |
| "train_samples_per_second": 3.888, |
| "train_steps_per_second": 0.061 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1464, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 83397779128320.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|